def on_error(self, ws, error):
    """
    Record a failed transfer, log the error and close the socket.

    :param ws: socket object; ``close()`` is called on it
    :param error: the error being reported
    :raises ClickException: re-raised (after the socket is closed) when
        ``error`` is a ``ClickException``; other errors are only logged
    """
    self.STATE = SOCKET_STATE.FAILED
    russell_logger.debug(str(error))
    ws.close()
    if not isinstance(error, ClickException):
        return
    # raised from on_message
    raise error
def init(cls):
    """
    Create the ignore file with the default ignore list if it is missing.

    An existing file at ``cls.CONFIG_FILE_PATH`` is left untouched.
    """
    ignore_path = cls.CONFIG_FILE_PATH
    if not os.path.isfile(ignore_path):
        logger.debug("Setting default ch ignore in the file {}".format(ignore_path))
        with open(ignore_path, "w") as ignore_file:
            ignore_file.write(DEFAULT_FILE_IGNORE_LIST)
    else:
        logger.debug("cl ignore file already present at {}".format(ignore_path))
def check_response_status(self, response):
    """
    Check if response is successful. Else raise Exception.

    Responses whose Content-Type is neither JSON nor HTML, or that carry no
    Content-Type header at all, are returned untouched without any status
    check.

    :param response: response object exposing ``headers``, ``status_code``,
        ``content`` and ``json()``
    :return: the decoded JSON body, or ``response`` itself for other
        content types
    :raises AuthenticationException: HTTP status or payload ``code`` is 401
    :raises NotFoundException: HTTP status or payload ``code`` is 404
    :raises BadRequestException: payload ``code`` is 400
    :raises InvalidResponseException: any other non-2xx status or payload
        ``code``, or the body cannot be decoded as JSON
    """
    # Streamed / non-JSON responses are handed back as-is.
    # A missing header yields None; normalise it so the substring test
    # below cannot raise TypeError.
    content_type = response.headers.get('Content-Type') or ''
    if not any(ct in content_type for ct in ('application/json', 'text/html')):
        logger.debug("Content-Type is {}".format(response.headers.get('Content-Type')))
        return response

    logger.debug("Http status code: {}".format(response.status_code))

    # Standard HTTP error codes.
    if not (200 <= response.status_code < 300):
        if response.status_code == 401:
            raise AuthenticationException()
        elif response.status_code == 404:
            raise NotFoundException()
        else:
            raise InvalidResponseException()

    try:
        resp_json = response.json()
    except Exception as e:
        logger.debug(str(e))
        raise InvalidResponseException()

    # Application-level error code carried in the payload; a payload
    # without "code" is treated as 500.
    code = resp_json.get("code", 500)
    if not (200 <= code < 300):
        message = resp_json.get("data")
        logger.debug("Error received : status_code: {}, message: {}"
                     .format(code, message or response.content))
        if code == 404:
            raise NotFoundException()
        elif code == 401:
            raise AuthenticationException()
        elif code == 400:
            raise BadRequestException()
        else:
            raise InvalidResponseException()

    return resp_json
def download(self, url, filename, timeout=10, api_version=1):
    """
    Fetch ``url`` as a streamed GET request and write the body to ``filename``.

    :param url: path passed to ``self.request``
    :param filename: local path the content is written to (binary mode)
    :param timeout: timeout forwarded to ``self.request``
    :param api_version: API version forwarded to ``self.request``
    :return: ``filename``
    Exits the process via ``sys.exit`` on a connection error.
    """
    logger.debug("Downloading file from url: {}".format(url))
    try:
        resp = self.request(method='GET',
                            url=url,
                            stream=True,
                            timeout=timeout,
                            api_version=api_version)
        self.check_response_status(resp)
        with open(filename, 'wb') as out_file:
            # Empty chunks are skipped.
            for piece in filter(None, resp.iter_content(chunk_size=1024)):
                out_file.write(piece)
        return filename
    except requests.exceptions.ConnectionError as exception:
        logger.debug("Exception: {}".format(exception))
        sys.exit("Cannot connect to the Russell server. Check your internet connection.")
def on_message(self, ws, message):
    """
    Handle a message received on the upload socket.

    ``message`` is parsed as JSON and its ``code`` field decides what happens:

    * ``200`` while ``self.STATE`` is ``SOCKET_STATE.INIT``: switch to
      ``UPLOADING`` and send ``self.FILE_NAME`` from a new thread.
    * ``200`` in any other state: switch to ``FINISH`` and close the socket.
    * ``522``: switch to ``FAILED`` and raise ``OverPermissionException``.
    * anything else: switch to ``FAILED`` and raise ``ServiceBusyException``.

    :param ws: socket object; ``header``, ``sock.send_binary`` and ``close``
        are used
    :param message: JSON text
    """
    russell_logger.debug(ws.header)
    russell_logger.debug(message)

    def start_sending(*args):
        """Send the file as 1 MiB binary frames, updating a progress bar."""
        with open(self.FILE_NAME, 'rb') as f:
            bar = progressbar.ProgressBar(maxval=int(ws.header.get('size', 0))).start()
            try:
                total_uploaded_size = 0
                block_size = 1024 * 1024
                msg = f.read(block_size)
                while msg:
                    total_uploaded_size += len(msg)
                    ws.sock.send_binary(msg)
                    msg = f.read(block_size)
                    bar.update(total_uploaded_size)
            except Exception as exc:
                # Still best-effort (this runs in its own thread, so there is
                # no caller to propagate to), but record why sending stopped
                # instead of discarding the error silently.
                russell_logger.debug("Upload interrupted: {}".format(exc))

    russell_logger.debug('received {}'.format(message))
    resp_json = json.loads(message)
    code = resp_json.get('code')
    if code == 200:  # to be modified
        if self.STATE == SOCKET_STATE.INIT:
            self.STATE = SOCKET_STATE.UPLOADING
            russell_logger.info('Start uploading...')
            _thread.start_new_thread(start_sending, ())
        else:
            self.STATE = SOCKET_STATE.FINISH
            ws.close()
    elif code == 522:
        self.STATE = SOCKET_STATE.FAILED
        raise OverPermissionException()
    else:
        self.STATE = SOCKET_STATE.FAILED
        raise ServiceBusyException()
def request(self, method, url, params=None, data=None, json=None, files=None,
            access_token=None, auth=None, timeout=5, stream=False, api_version=1):
    """
    Execute the request using requests library

    The URL is ``self.base_url`` formatted with ``api_version`` plus ``url``.
    A ``Basic`` Authorization header is built from ``access_token`` when
    given; otherwise, unless ``auth`` is supplied, from
    ``self.access_token.token``.

    :return: whatever ``self.check_response_status`` returns for the response
    Exits the process via ``sys.exit`` on a connection error or when the
    response check raises.
    """
    request_url = self.base_url.format(api_version) + url
    logger.debug("Starting request to url: {} with params: {}, data: {}".format(request_url, params, data))

    headers = {}
    if access_token or not auth:
        if access_token:
            token = access_token
        else:
            token = self.access_token.token if self.access_token else None
        headers = {"Authorization": "Basic {}".format(token)}

    try:
        response = requests.request(method=method,
                                    url=request_url,
                                    params=params,
                                    headers=headers,
                                    data=data,
                                    json=json,
                                    files=files,
                                    timeout=timeout,
                                    stream=stream,
                                    auth=auth)
    except requests.exceptions.ConnectionError:
        sys.exit("Cannot connect to the Russell server. Check your internet connection.")

    try:
        if stream:
            logger.debug('HTTP Stream Request/Response...')
        else:
            logger.debug("Response Content: {}, Headers: {}".format(response.content, response.headers))
        return self.check_response_status(response)
    except Exception as e:
        sys.exit(str(e))
def set_config(cls, experiment_config):
    """
    Write the experiment configuration as JSON to ``cls.CONFIG_FILE_PATH``.

    :param experiment_config: object exposing ``to_dict()``; the returned
        dictionary is what gets serialised
    """
    config_dict = experiment_config.to_dict()
    russell_logger.debug("Setting {} in the file {}".format(config_dict,
                                                            cls.CONFIG_FILE_PATH))
    # Serialise before opening the file: opening with "w" truncates it, so a
    # serialisation error must not be able to wipe an existing config.
    serialized = json.dumps(config_dict)
    with open(cls.CONFIG_FILE_PATH, "w") as config_file:
        config_file.write(serialized)
def set_config(cls, data_config):
    """
    Write the data configuration as JSON to ``cls.CONFIG_FILE_PATH``.

    :param data_config: object exposing ``to_dict()``
    """
    target_path = cls.CONFIG_FILE_PATH
    logger.debug("Setting {} in the file {}".format(data_config.to_dict(), target_path))
    with open(target_path, "w") as handle:
        payload = json.dumps(data_config.to_dict())
        handle.write(payload)
def run(resubmit, command, env, jupyter, tensorboard, data, version, message, os, cputype, cpunum, gputype, gpunum,
        memtype, memnum, eager, value, earliest, deadline, duration):
    '''
    Create a job specification through ``ExperimentClient`` and submit a job
    request for it, then optionally wait for Jupyter / report tensorboard.

    :param resubmit: when ``True``, a resubmission request is attempted first
    :param command: iterable of command parts; joined with single spaces
    :param env: framework; when falsy, the project's ``default_env`` is used
    :param jupyter: when ``True``, wait for the job and open its Jupyter URL
    :param tensorboard: when ``True``, log the job's tensorboard URL
    :param data: not used by the active code (dataset handling is commented out)
    :param version: only used for display in the job name / summary table
    :param message: note stored with the job; rejected when longer than 1024
    :param os: passed to ``validate_resource_list`` only; the specification
        itself is created with ``os="ubuntu:16"``
    :param cputype: validated and forwarded to ``JobSpecification``
    :param cpunum: validated and forwarded to ``JobSpecification``
    :param gputype: validated and forwarded to ``JobSpecification``
    :param gpunum: validated and forwarded to ``JobSpecification``
    :param memtype: forwarded to ``JobSpecification``
    :param memnum: forwarded to ``JobSpecification``
    :param eager: not used by the active code
    :param value: forwarded to ``JobReq`` as ``value``
    :param earliest: forwarded to ``JobReq`` as ``tw_start``
    :param deadline: forwarded to ``JobReq`` as ``tw_end``
    :param duration: forwarded to ``JobReq`` as ``duration``
    :return: ``None`` on every path
    '''
    # NOTE(review): the string literal below is a stray no-op statement, not a docstring.
    """
    """
    # Initialise the client
    try:
        ec = ExperimentClient()
    except Exception as e:
        logger.error(str(e))
        return
    if resubmit is True:
        # Only the bidding-related parameters matter here
        # TODO: load the details of the previously failed bid (a previously
        # successful one from the local config would also do) from the local
        # config file or from the server.
        # NOTE(review): jobSpec is still an empty placeholder, so the
        # jobSpec["id"] lookup below raises KeyError.
        jobSpec = {}
        jobId = jobSpec["id"]
        # Submit the job request
        jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest, job_id=jobId, value=value,
                        resources=jobSpec["resources"])
        resp = ec.submit(jobId, jobReq)
        if resp["accepted"] == False:
            logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
            return
        # NOTE(review): an accepted resubmission appears to continue into the
        # new-submission flow below — confirm this is intended.
    # Check the length of the message
    if message and len(message) > 1024:
        logger.error("Message body length over limit")
        return
    # Fetch the authentication token
    access_token = AuthConfigManager.get_access_token()
    # Read the local job configuration
    experiment_config = ExperimentConfigManager.get_config()
    # Join the command parts into one space-separated string
    command_str = ' '.join(command)
    # # Handle the datasets to mount
    # success, data_ids = process_data_ids(data)
    # if not success:
    #     return
    # Resolve the deep-learning framework
    if not env:
        # Not specified: use the default framework of the project this job belongs to
        env = ProjectClient().get_project_info_by_id(experiment_config["project_id"]).get('default_env')
    # Check that the combination of all resources is valid
    if not validate_resource_list(env, jupyter, tensorboard, os, cputype, cpunum, gputype, gpunum):
        return
    # Upload the code to the cloud, or reference code that is already there.
    # The whole step is currently disabled; the specification below is
    # created with code_id="" and data_ids=[].
    # # If a code version was specified
    # if version:
    #     module_resp = ModuleClient().get_by_entity_id_version(experiment_config.project_id, version)
    #     if not module_resp:
    #         logger.error("Remote project does not existed")
    #         return
    #     module_id = module_resp.get('id')
    # else:
    #     # Gen temp dir
    #     try:
    #         # upload_files, total_file_size_fmt, total_file_size = get_files_in_directory('.', 'code')
    #         # save_dir(upload_files, _TEMP_DIR)
    #         file_count, size = get_files_in_current_directory('code')
    #         if size > 100 * 1024 * 1024:
    #             sys.exit("Total size: {}. "
    #                      "Code size too large to sync, please keep it under 100MB."
    #                      "If you have data files in the current directory, please upload them "
    #                      "separately using \"russell data\" command and remove them from here.\n".format(
    #                 sizeof_fmt(size)))
    #         copy_files('.', _TEMP_DIR)
    #     except OSError:
    #         sys.exit("Directory contains too many files to upload. Add unused directories to .russellignore file.")
    #         # logger.info("Creating project run. Total upload size: {}".format(total_file_size_fmt))
    #         # logger.debug("Creating module. Uploading: {} files".format(len(upload_files)))
    #
    #     hash_code = dirhash(_TEMP_DIR)
    #     logger.debug("Checking MD5 ...")
    #     module_resp = ModuleClient().get_by_codehash_entity_id(hash_code, experiment_config.project_id)
    #     if module_resp:  # if code same with older version, use existed, don`t need upload
    #         module_id = module_resp.get('id')
    #         version = module_resp.get('version')
    #         logger.info("Use older version-{}.".format(version))
    #     else:
    #         version = experiment_config.version
    #         # Create module
    #         module = Module(name=experiment_config.name,
    #                         description=message,
    #                         family_id=experiment_config.family_id,
    #                         version=version,
    #                         module_type="code",
    #                         entity_id=experiment_config.project_id
    #                         )
    #         module_resp = mc.create(module)
    #         if not module_resp:
    #             logger.error("Remote project does not existed")
    #             return
    #         version = module_resp.get('version')
    #         experiment_config.set_version(version=version)
    #         ExperimentConfigManager.set_config(experiment_config)
    #
    #         module_id = module_resp.get('id')
    #         project_id = module_resp.get('entity_id')
    #         if not project_id == experiment_config.project_id:
    #             logger.error("Project conflict")
    #
    #         logger.debug("Created module with id : {}".format(module_id))
    #
    #         # Upload code to fs
    #         logger.info("Syncing code ...")
    #         fc = FsClient()
    #         try:
    #             fc.socket_upload(file_type="code",
    #                              filename=_TEMP_DIR,
    #                              access_token=access_token.token,
    #                              file_id=module_id,
    #                              user_name=access_token.username,
    #                              data_name=experiment_config.name)
    #         except Exception as e:
    #             shutil.rmtree(_TEMP_DIR)
    #             logger.error("Upload failed: {}".format(str(e)))
    #             return
    #         else:
    #             ### check socket state, some errors like file-server down, cannot be catched by `except`
    #             state = fc.get_state()
    #             if state == SOCKET_STATE.FAILED:
    #                 logger.error("Upload failed, please try after a while...")
    #                 return
    #         finally:
    #             try:
    #                 shutil.rmtree(fc.temp_dir)
    #             except FileNotFoundError:
    #                 pass
    #
    #         ModuleClient().update_codehash(module_id, hash_code)
    #         logger.info("\nUpload finished")
    #
    #     # rm temp dir
    #     shutil.rmtree(_TEMP_DIR)
    #     logger.debug("Created code with id : {}".format(module_id))
    # Build the job specification
    jobSpecification = JobSpecification(message=message, code_id="", data_ids=[], command=command_str,
                                        project_id=experiment_config["project_id"], framework=env,
                                        enable_jupyter=jupyter, enable_tensorboard=tensorboard, os="ubuntu:16",
                                        gpunum=gpunum, gputype=gputype, cpunum=cpunum, cputype=cputype,
                                        memnum=memnum, memtype=memtype)
    # Submit the specification; the server stores it
    jobId = ec.create(jobSpecification)
    logger.debug("Created job specification : {}".format(jobId))
    # # Update the local job configuration
    # experiment_config.set_experiment_predecessor(experiment_id)
    # ExperimentConfigManager.set_config(experiment_config)
    # Print the job summary
    experiment_name = "{}/{}:{}".format(access_token.username, experiment_config["project_id"], version)
    table_output = [["JOB ID", "NAME", "VERSION"], [jobId, experiment_name, version]]
    logger.info(tabulate(table_output, headers="firstrow"))
    logger.info("")
    # Submit the job request
    jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest, job_id=jobId, value=value,
                    resources=jobSpecification.resources)
    resp = ec.submit(jobId, jobReq)
    if resp["accepted"] == False:
        logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
        return
    # After a successful submission, handle jupyter/tensorboard
    task_url = {}
    if jupyter is True:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = ec.get(jobId)
                if experiment.state != "waiting" and experiment.task_instances:
                    break
            except Exception as e:
                logger.debug("Experiment not available yet: {}".format(jobId))
            logger.debug("Experiment not available yet: {}".format(jobId))
            sleep(1)
            continue
        task_url = ec.get_task_url(jobId)
        jupyter_url = task_url["jupyter_url"]
        print("Setting up your instance and waiting for Jupyter notebook to become available ...")
        if wait_for_url(jupyter_url, sleep_duration_seconds=2, iterations=900):
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            webbrowser.open(jupyter_url)
        else:
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            logger.info(
                "Notebook is still loading or can not be connected now. View logs to track progress")
    if tensorboard is True:
        # Reuse the URLs fetched for jupyter when they already include tensorboard
        if not task_url.get("tensorboard_url"):
            task_url = ec.get_task_url(jobId)
        tensorboard_url = task_url["tensorboard_url"]
        logger.info("\nPath to tensorboard: {}".format(tensorboard_url))
    logger.info("""
    To view logs enter:
        ch logs {}
            """.format(jobId))
def on_open(self, ws):
    """Log that the connection to the server has been set up."""
    opened_message = 'setup connection to server'
    russell_logger.debug(opened_message)
def on_close(self, ws):
    """Call ``self.clear_archive()`` and then log that the connection closed."""
    self.clear_archive()
    closed_message = 'close connection to server'
    russell_logger.debug(closed_message)
def set_access_token(cls, access_token):
    """
    Write the access token as JSON to ``cls.CONFIG_FILE_PATH``.

    :param access_token: object exposing ``to_dict()``
    """
    target_path = cls.CONFIG_FILE_PATH
    russell_logger.debug("Setting {} in the file {}".format(access_token.to_dict(), target_path))
    with open(target_path, "w") as handle:
        payload = json.dumps(access_token.to_dict())
        handle.write(payload)