def get_all(self):
    """Fetch the environment listing via a GET request to ``self.url``.

    Returns:
        The value returned by ``self.request`` on success, or an empty
        dict on any failure (best-effort: errors are logged, not raised).
    """
    try:
        return self.request(method="GET", url=self.url)
    except Exception as e:
        # Log failures at error level — this is a fault, not info chatter.
        logger.error("Error while retrieving env: {}".format(str(e)))
        return {}
def login_with_token(token):
    """Authenticate with a pasted token and persist the resulting credentials."""
    code = get_basic_token(token)
    authenticated_user = AuthClient().get_user(code)
    AuthConfigManager.set_access_token(
        AccessToken(username=authenticated_user.username, token=code))
    russell_logger.info("Login Successful as " + authenticated_user.username)
def print_favs(data_sources):
    """Print a table describing each data source.

    The headers previously advertised fields (CATEGORY, URL, SOURCE, TAG)
    that did not match the values actually rendered; they now describe the
    attributes read from each data source (id, created_pretty, state,
    size_pretty, name, version).
    """
    headers = ["ID", "CREATED", "STATE", "SIZE", "NAME", "VERSION"]
    data_list = []
    for data_source in data_sources:
        data_list.append([data_source.id,
                          data_source.created_pretty,
                          data_source.state,
                          data_source.size_pretty,
                          data_source.name,
                          str(data_source.version)])
    cl_logger.info(tabulate(data_list, headers=headers))
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    data_url = "{}/files/data/{}/".format(ch.CODINGHUB_HOST, id)
    if not url:
        cl_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
    else:
        # --url mode: just print the link instead of opening it.
        cl_logger.info(data_url)
def delete(id, yes):
    """
    Delete data set.
    """
    data_source = DataClient().get(id)
    # Ask for confirmation unless the caller passed --yes; abort on "no".
    if not yes:
        click.confirm('Delete Data: {}?'.format(data_source.name),
                      abort=True, default=False)
    deleted = DataClient().delete(id)
    if deleted:
        cl_logger.info("Data deleted")
    else:
        cl_logger.error("Failed to delete data")
def login(token, username, password):
    """
    Log into Russell via Auth0.
    """
    # --token flag: prompt for a pasted token directly.
    if token:
        pasted = str(click.prompt('Please copy and paste the token here',
                                  type=str, hide_input=True))
        login_with_token(pasted)
        return
    # --username flag: password login, prompting for the password if absent.
    if username:
        if not password:
            password = str(click.prompt("Password", type=str, hide_input=True))
        login_with_username_and_password(username, password)
        return
    # Default: open the token page in the browser, then paste the token.
    if click.confirm('Authentication token page will now open in your browser. Continue?',
                     default=True):
        webbrowser.open(ch.CODINGHUB_WEB_HOST + "/welcome")
        pasted = str(click.prompt('Please copy and paste the token here',
                                  type=str, hide_input=True))
        if pasted:
            login_with_token(pasted)
        else:
            russell_logger.info("Empty token received. Make sure your shell is handling the token appropriately.")
            russell_logger.info("See FAQ for help: http://docs.russellcloud.cn/")
        return
    # Fallback: interactive username/password prompts.
    russell_logger.info("Login with your russell username/email and password. "
                        "If you don't have a Russell account, "
                        "head over to http://russellcloud.com to create one.")
    username = str(click.prompt("Username/Email", type=str))
    password = str(click.prompt("Password", type=str, hide_input=True))
    if not username or not password:
        russell_logger.info("Please make sure username and password are both provided.")
    else:
        login_with_username_and_password(username, password)
def init(id, name):
    """
    Initialize new project at the current dir.
    russell init --name test_name
    or
    russell init --id 151af60026cd462792fa5d77ef79be4d
    """
    # The docstring above is printed verbatim in this warning, so it is
    # kept identical to the original.
    if not (id or name):
        logger.warning("Neither id or name offered\n{}".format(init.__doc__))
        return
    RussellIgnoreManager.init()
    try:
        project_client = ProjectClient()
    except Exception as e:
        logger.error(str(e))
        return
    access_token = AuthConfigManager.get_access_token()
    project_info = {}
    try:
        if id:
            project_info = project_client.get_project_info_by_id(id=id)
        elif name:
            project_info = project_client.get_project_info_by_name(
                access_token.username, name)
    except Exception as e:
        logger.error(str(e))
        return
    else:
        # Only the project owner may initialize it locally.
        current_uid = AuthClient().get_user(access_token.token).uid
        if current_uid != project_info.get('owner_id'):
            logger.info("You can create a project then run 'russell init'")
            return
    project_id = project_info.get('id')
    name = project_info.get('name', '')
    if not project_id:
        logger.error(
            "Project \"{}\" initialization failed in current directory".
            format(name))
        return
    ExperimentConfigManager.set_config(dict(name=name, project_id=project_id))
    logger.info(
        "Project \"{}\" initialized in current directory".format(name))
def socket_upload_tar(self, file_type, filename, access_token, file_id,
                      user_name, data_name, temp_dir="./temp", is_compress=True):
    """Archive a directory with tarfile and stream it over a websocket.

    Args:
        file_type: Category segment of the upload websocket URL.
        filename: Directory whose contents are archived and uploaded.
        access_token: Auth token sent in the websocket header.
        file_id: Server-side id; also names the temporary archive file.
        user_name: Uploader identity header.
        data_name: Dataset name header.
        temp_dir: Directory where the temporary archive is written.
        is_compress: When True, gzip the tar stream ("w:gz" mode).
    """
    self.module_id = file_id
    # Compress the folder into <temp_dir>/<file_id>.
    russell_logger.info('compressing files...')
    self.temp_dir = temp_dir
    archive_path = os.path.join(temp_dir, file_id)
    # (The previous `except Exception as e: raise e` wrapper was a no-op
    # and has been removed — exceptions still propagate unchanged.)
    with tarfile.open(archive_path, "w:gz" if is_compress else "w") as tar:
        ignore_list, whitelist = RussellIgnoreManager.get_list()
        # Expand each ignore entry so directory contents match recursively.
        ignore_list_expanded = ignore_list + ["{}/**".format(item) for item in ignore_list]
        ignore = shutil.ignore_patterns(*ignore_list_expanded)
        names = os.listdir(filename)
        ignored_names = ignore(filename, names) if ignore is not None else set()
        # A set gives O(1) membership tests inside the tar filter below.
        exclude_files = {os.path.join(filename, n) for n in ignored_names}
        tar.add(filename, filter=lambda x: None if x.name in exclude_files else x)
    self.FILE_NAME = archive_path
    # md5 + size headers let the server verify payload integrity.
    hash_code = get_md5_checksum(self.FILE_NAME)
    compressed_size = os.path.getsize(self.FILE_NAME)
    russell_logger.info("compressed size: {} Bytes".format(compressed_size))
    # Set up the connection; the actual upload starts in the on_open callback.
    # websocket.enableTrace(True)
    web_socket = websocket.WebSocketApp(
        url=self.ws_url + "/{}/{}/".format(file_type, file_id),
        header={
            'access_token': access_token,
            'size': str(compressed_size),
            'hash_code': hash_code,
            'user_name': user_name,
            'data_name': data_name,
            'is_compress': str(is_compress)
        },
        on_message=self.on_message,
        on_error=self.on_error,
        on_close=self.on_close
    )
    web_socket.on_open = self.on_open
    web_socket.run_forever()
def socket_upload(self, file_type, filename, access_token, file_id, user_name,
                  data_name, temp_dir="./temp", is_compress=True, is_zip=False,
                  is_direct=False):
    """Upload a file or directory over a websocket.

    Args:
        file_type: Category segment of the upload websocket URL.
        filename: File (is_direct=True) or directory to upload.
        access_token: Auth token sent in the websocket header.
        file_id: Server-side id; also names the temporary archive.
        user_name: Uploader identity header.
        data_name: Dataset name header.
        temp_dir: Directory where the temporary archive is written.
        is_compress: When True, build a gzip tar instead of a plain tar.
        is_zip: Forwarded to the server as the 'is_zip' header.
        is_direct: When True, upload ``filename`` as-is without archiving.
    """
    self.module_id = file_id
    if is_direct:
        self.FILE_NAME = filename
    else:
        # Compress the folder first; make_archive appends the extension.
        # (The previous `except Exception as e: raise e` wrapper was a
        # no-op and has been removed — exceptions still propagate.)
        russell_logger.info('compressing files...')
        self.temp_dir = temp_dir
        self.FILE_NAME = shutil.make_archive(
            base_name=os.path.join(temp_dir, file_id),
            format='gztar' if is_compress else 'tar',
            root_dir=filename,
            owner=None,
            group=None,
            logger=russell_logger)
    # md5 + size headers let the server verify payload integrity.
    hash_code = get_md5_checksum(self.FILE_NAME)
    compressed_size = os.path.getsize(self.FILE_NAME)
    russell_logger.info("compressed size: {} Bytes".format(compressed_size))
    # Set up the connection; the actual upload starts in the on_open callback.
    # websocket.enableTrace(True)
    web_socket = websocket.WebSocketApp(
        url=self.ws_url + "/{}/{}/".format(file_type, file_id),
        header={
            'access_token': access_token,
            'size': str(compressed_size),
            'hash_code': hash_code,
            'user_name': user_name,
            'data_name': data_name,
            'is_compress': str(is_compress),
            'is_zip': str(is_zip)
        },
        on_message=self.on_message,
        on_error=self.on_error,
        on_close=self.on_close
    )
    web_socket.on_open = self.on_open
    web_socket.run_forever()
def download_compressed(self, url, compression='tar', uncompress=True,
                        delete_after_uncompress=False, dir=None, api_version=1):
    """Download an archive from ``url`` and optionally unpack it.

    Args:
        url: Remote location of the archive.
        compression: 'tar' or 'zip'; selects the extraction method.
        uncompress: Extract the archive after downloading.
        delete_after_uncompress: Remove the archive once extracted.
        dir: If given, create it (it must not exist) and chdir into it first.
        api_version: Forwarded to ``self.download``.

    Returns:
        The downloaded file name, or False on a connection error.

    Raises:
        ExistedException: If ``dir`` already exists.
    """
    if dir:
        if os.path.exists(dir):
            raise ExistedException
        os.mkdir(dir)
        os.chdir(dir)
    try:
        logger.info("Downloading the tar file to the current directory ...")
        filename = self.download(url=url, filename='output',
                                 api_version=api_version)
        if filename and os.path.isfile(filename) and uncompress:
            logger.info("Uncompressing the contents of the file ...")
            # Context managers close the archive even if extractall raises.
            if compression == 'tar':
                with tarfile.open(filename) as tar:
                    tar.extractall()
            elif compression == 'zip':
                with zipfile.ZipFile(filename) as archive:
                    archive.extractall()
            if delete_after_uncompress:
                logger.info("Cleaning up the compressed file ...")
                os.remove(filename)
        return filename
    except requests.exceptions.ConnectionError as e:
        logger.error("Download ERROR! {}".format(e))
        return False
def on_message(self, ws, message):
    """Websocket message handler driving the upload state machine.

    The server replies with a JSON payload containing a ``code``:
    200 acknowledges either the handshake (start uploading) or the
    completed transfer (close the socket); 522 maps to
    OverPermissionException; anything else to ServiceBusyException.
    """
    russell_logger.debug(ws.header)
    russell_logger.debug(message)

    def start_sending(*args):
        # Stream self.FILE_NAME to the server in 1 MiB binary frames,
        # updating a progress bar sized from the 'size' header.
        with open(self.FILE_NAME, 'rb') as f:
            # with progressbar.ProgressBar(maxval=int(ws.header.get('size', 0))) as bar:
            bar = progressbar.ProgressBar(maxval=int(ws.header.get('size', 0))).start()
            try:
                total_uploaded_size = 0
                block_size = 1024 * 1024
                msg = f.read(block_size)
                while msg:
                    total_uploaded_size += len(msg)
                    ws.sock.send_binary(msg)
                    msg = f.read(block_size)
                    bar.update(total_uploaded_size)
            except:
                # NOTE(review): bare except silently swallows any read/send
                # failure; presumably the server-side hash check catches a
                # truncated upload — confirm before relying on this.
                pass
            finally:
                pass

    russell_logger.debug('received {}'.format(message))
    resp_json = json.loads(message)
    code = resp_json.get('code')
    if code == 200:
        # to be modified
        if self.STATE == SOCKET_STATE.INIT:
            # First 200: handshake accepted — start the upload thread.
            self.STATE = SOCKET_STATE.UPLOADING
            russell_logger.info('Start uploading...')
            _thread.start_new_thread(start_sending, ())
        else:
            # Subsequent 200: server confirmed receipt — transfer finished.
            self.STATE = SOCKET_STATE.FINISH
            ws.close()
    elif code == 522:
        self.STATE = SOCKET_STATE.FAILED
        raise OverPermissionException()
    else:
        self.STATE = SOCKET_STATE.FAILED
        raise ServiceBusyException()
def run(resubmit, command, env, jupyter, tensorboard, data, version, message,
        os, cputype, cpunum, gputype, gpunum, memtype, memnum, eager, value,
        earliest, deadline, duration):
    """Create a job specification and submit a bid for it.

    :param resubmit: re-submit a previously created job instead of creating one
    :param command: command tokens to run remotely (joined with spaces)
    :param env: deep-learning framework; falls back to the project default
    :param jupyter: launch a Jupyter notebook for the job
    :param tensorboard: launch TensorBoard for the job
    :param data: data sets to mount (currently unused — mounting is disabled)
    :param version: code version (shown in the summary table)
    :param message: free-form description, at most 1024 characters
    :param os: requested OS (shadows the ``os`` module here; the spec
        currently hard-codes "ubuntu:16")
    :param cputype: CPU type request
    :param cpunum: CPU count request
    :param gputype: GPU type request
    :param gpunum: GPU count request
    :param memtype: memory type request
    :param memnum: memory amount request
    :param eager: currently unused
    :param value: bid value for the job
    :param earliest: earliest start of the bidding time window
    :param deadline: end of the bidding time window
    :param duration: requested run duration
    :return: None
    """
    # Initialize the experiment client.
    try:
        ec = ExperimentClient()
    except Exception as e:
        logger.error(str(e))
        return

    if resubmit is True:
        # Only the bidding parameters matter on resubmit.
        jobSpec = {}
        # TODO: load the previous job specification from the local config or
        # the server. jobSpec is empty here, so jobSpec["id"] raises KeyError
        # — the resubmit path is an unfinished stub.
        jobId = jobSpec["id"]
        jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest,
                        job_id=jobId, value=value,
                        resources=jobSpec["resources"])
        resp = ec.submit(jobId, jobReq)
        if resp["accepted"] == False:
            logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
            return
        # NOTE(review): an *accepted* resubmit falls through and creates a
        # brand-new specification below — confirm this is intended.

    # Reject overly long descriptions early.
    if message and len(message) > 1024:
        logger.error("Message body length over limit")
        return

    # Fetch the auth token and the local project/experiment configuration.
    access_token = AuthConfigManager.get_access_token()
    experiment_config = ExperimentConfigManager.get_config()

    # Join the command tokens into a single command string.
    command_str = ' '.join(command)

    # Resolve the framework: fall back to the project's default when unset.
    if not env:
        env = ProjectClient().get_project_info_by_id(
            experiment_config["project_id"]).get('default_env')

    # Validate that the requested resource combination is legal.
    if not validate_resource_list(env, jupyter, tensorboard, os, cputype,
                                  cpunum, gputype, gpunum):
        return

    # NOTE: syncing/uploading the local code to the file server is currently
    # disabled (see VCS history); the spec is created with an empty code_id
    # and no mounted data sets.
    jobSpecification = JobSpecification(message=message,
                                        code_id="",
                                        data_ids=[],
                                        command=command_str,
                                        project_id=experiment_config["project_id"],
                                        framework=env,
                                        enable_jupyter=jupyter,
                                        enable_tensorboard=tensorboard,
                                        os="ubuntu:16",
                                        gpunum=gpunum,
                                        gputype=gputype,
                                        cpunum=cpunum,
                                        cputype=cputype,
                                        memnum=memnum,
                                        memtype=memtype)

    # Persist the specification server-side.
    jobId = ec.create(jobSpecification)
    logger.debug("Created job specification : {}".format(jobId))

    # Print a short summary table for the new job.
    experiment_name = "{}/{}:{}".format(access_token.username,
                                        experiment_config["project_id"],
                                        version)
    table_output = [["JOB ID", "NAME", "VERSION"],
                    [jobId, experiment_name, version]]
    logger.info(tabulate(table_output, headers="firstrow"))
    logger.info("")

    # Submit the bid for the job.
    jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest,
                    job_id=jobId, value=value,
                    resources=jobSpecification.resources)
    resp = ec.submit(jobId, jobReq)
    if resp["accepted"] == False:
        logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
        return

    # After a successful submission, surface the jupyter/tensorboard URLs.
    task_url = {}
    if jupyter is True:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = ec.get(jobId)
                if experiment.state != "waiting" and experiment.task_instances:
                    break
            except Exception as e:
                # (A duplicated debug line here was removed.)
                logger.debug("Experiment not available yet: {}".format(jobId))
            sleep(1)
            continue
        task_url = ec.get_task_url(jobId)
        jupyter_url = task_url["jupyter_url"]
        print("Setting up your instance and waiting for Jupyter notebook to become available ...")
        if wait_for_url(jupyter_url, sleep_duration_seconds=2, iterations=900):
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            webbrowser.open(jupyter_url)
        else:
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            logger.info(
                "Notebook is still loading or can not be connected now. View logs to track progress")

    if tensorboard is True:
        if not task_url.get("tensorboard_url"):
            task_url = ec.get_task_url(jobId)
        tensorboard_url = task_url["tensorboard_url"]
        logger.info("\nPath to tensorboard: {}".format(tensorboard_url))

    logger.info("""
    To view logs enter:
        ch logs {}
            """.format(jobId))