def import_offline_feature():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    try:
        if not request_data.get("jobId"):
            return get_json_result(status=2, msg="no job id")
        job_id = request_data.get("jobId")
        job_data = query_job_by_id(job_id=job_id)
        if not job_data:
            return get_json_result(status=3, msg="can not find this job id: %s" % request_data.get("jobId", ""))
        response = GetFeature.import_data(request_data, json.loads(job_data[0]["config"]))
        if response.get("status", 1) == 0:
            update_job_by_id(job_id=job_id, update_data={"status": "success", "end_date": datetime.datetime.now()})
            return get_json_result()
        else:
            return get_json_result(status=1, msg="request offline feature error: %s" % response.get("msg", ""))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="request offline feature error: %s" % e)
def do_load_model():
    request_data = request.json
    try:
        request_data["servings"] = server_conf.get("servers", {}).get("servings", [])
        publish_model.load_model(config_data=request_data)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
def submit_job():
    _data = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, _data))
    try:
        push_into_job_queue(job_id=_job_id, config=_data)
        return get_json_result(0, "success, job_id {}".format(_job_id))
    except Exception as e:
        return get_json_result(1, "failed, error: {}".format(e))
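
# Every handler in this module returns through get_json_result(). The helper is defined
# elsewhere in the project; the sketch below is only an assumption of its shape (numeric
# status, message, optional payload wrapped in a Flask JSON response), kept here as a
# commented-out reference so it does not shadow the real import. It matches the call
# sites above and below: get_json_result(), get_json_result(0, "success, ..."),
# get_json_result(status=1, msg=...).
#
#     from flask import jsonify
#
#     def get_json_result(status=0, msg="success", data=None):
#         return jsonify({"status": status, "msg": msg, "data": data})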
def import_id():
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info", table_name_space,
                                        partition=10,
                                        create_if_missing=True,
                                        error_if_exist=False)
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id, table_name_space,
                                   partition=50,
                                   create_if_missing=True,
                                   error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get("total") \
                and (request_data.get("total") - request_data.get("rangeEnd") == 1):
            # this request carries the last range
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(data_id,
                                    json.dumps({"salt": request_data.get("salt"),
                                                "saltMethod": request_data.get("saltMethod")}))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                logger.info("import id success, dtable name is {}, namespace is {}".format(data_id, table_name_space))

                # TODO: destroying the old DTable should be protected by a lock
                old_data_table = eggroll.table(old_data_id, table_name_space,
                                               partition=50,
                                               create_if_missing=True,
                                               error_if_exist=False)
                old_data_table.destroy()
                id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
def query_model_version_history():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        history = version_history(data_table_namespace=config.get("namespace"))
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="query model version history error: %s" % e)
def download_data(data_func):
    _data = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, _data))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    _download_module = os.path.join(file_utils.get_project_base_directory(), "arch/api/utils/download.py")
    _upload_module = os.path.join(file_utils.get_project_base_directory(), "arch/api/utils/upload.py")
    if data_func == "download":
        _module = _download_module
    else:
        _module = _upload_module
    try:
        if data_func == "download":
            progs = ["python3", _module,
                     "-j", _job_id,
                     "-c", os.path.abspath(_data.get("config_path"))]
        else:
            progs = ["python3", _module,
                     "-c", os.path.abspath(_data.get("config_path"))]
        logger.info('Starting progs: {}'.format(progs))

        std_log = open(os.path.join(_job_dir, 'std.log'), 'w')
        task_pid_path = os.path.join(_job_dir, 'pids')
        if os.name == 'nt':
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
        else:
            startupinfo = None
        p = subprocess.Popen(progs, stdout=std_log, stderr=std_log, startupinfo=startupinfo)
        os.makedirs(task_pid_path, exist_ok=True)
        with open(os.path.join(task_pid_path, data_func + ".pid"), 'w') as f:
            f.truncate()
            f.write(str(p.pid) + "\n")
            f.flush()
        return get_json_result(0, "success, job_id {}".format(_job_id))
    except Exception as e:
        logger.exception(e)
        return get_json_result(-104, "failed, job_id {}".format(_job_id))
def publish_model_online():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        if not config.get('servings'):
            # fall back to all of this party's serving servers
            config['servings'] = SERVINGS
        publish_model.publish_online(config_data=config)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="publish model error: %s" % e)
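
# Note: publish_model_online() falls back to the module-level SERVINGS list. It is
# assumed here to come from the same server_conf lookup that do_load_model() performs
# inline; the one-liner below is only a sketch of that assumption, the real definition
# lives elsewhere in the project.
#
#     SERVINGS = server_conf.get("servers", {}).get("servings", [])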
def stop_workflow(job_id):
    _job_dir = get_job_directory(job_id)
    task_pid_path = os.path.join(_job_dir, 'pids')
    if os.path.isdir(task_pid_path):
        for pid_file in os.listdir(task_pid_path):
            try:
                if not pid_file.endswith('.pid'):
                    continue
                with open(os.path.join(task_pid_path, pid_file), 'r') as f:
                    pids = f.read().split('\n')
                    for pid in pids:
                        try:
                            if len(pid) == 0:
                                continue
                            logger.debug("terminating process pid:{} {}".format(pid, pid_file))
                            p = psutil.Process(int(pid))
                            for child in p.children(recursive=True):
                                child.kill()
                            p.kill()
                        except psutil.NoSuchProcess:
                            continue
            except Exception as e:
                logger.exception(e)
                continue
    update_job_by_id(job_id=job_id, update_data={"status": "failed", "set_status": "failed"})
    pop_from_job_queue(job_id=job_id)
    return get_json_result()
def run_job(self, job_id, config):
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    logger.info('job_id {} parameters overridden, runtime conf generated under {}'.format(job_id, _job_dir))
    channel, stub = get_proxy_data_channel()
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _method = 'POST'
        _module = runtime_conf['module']
        _url = '/workflow/{}/{}/{}'.format(job_id, _module, _role)
        _packet = wrap_grpc_packet(runtime_conf, _method, _url, _party_id, job_id)
        logger.info('Starting workflow job_id:{} party_id:{} role:{} method:{} url:{}'.format(
            job_id, _party_id, _role, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(
                job_id, _party_id, _role, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')
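
# For reference: the per-party runtime_conf.json files that run_job() walks are produced
# by ParameterOverride above. The literal below sketches only the fields read by
# run_job() and start_workflow(); the concrete values (role, module name, code path) are
# illustrative assumptions, not something the manager itself consumes.
_EXAMPLE_RUNTIME_CONF = {
    "local": {"role": "guest", "party_id": 10000},   # read as runtime_conf['local']['role'] / ['party_id']
    "module": "HeteroLR",                            # becomes part of the /workflow/<job_id>/<module>/<role> url
    "WorkFlowParam": {"method": "train"},            # read by start_workflow() to build the conf directory
    "CodePath": "workflow/hetero_lr_workflow.py"     # script launched by start_workflow() via subprocess
}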
def update_job(job_id):
    request_data = request.json
    update_job_by_id(job_id=job_id, update_data={"status": request_data.get("status")})
    update_job_queue(job_id=job_id, update_data={"status": request_data.get("status")})
    if request_data.get("status") in ["failed", "deleted"]:
        stop_job(job_id=job_id)
    if request_data.get("status") in ["failed", "deleted", "success"]:
        pop_from_job_queue(job_id=job_id)
    return get_json_result()
def request_offline_feature():
    request_data = request.json
    try:
        job_id = uuid.uuid1().hex
        response = GetFeature.request(job_id, request_data)
        if response.get("status", 1) == 0:
            job_data = dict()
            job_data.update(request_data)
            job_data["begin_date"] = datetime.datetime.now()
            job_data["status"] = "running"
            job_data["config"] = json.dumps(request_data)
            save_job_info(job_id=job_id, **job_data)
            return get_json_result()
        else:
            return get_json_result(status=1, msg="request offline feature error: %s" % response.get("msg", ""))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="request offline feature error: %s" % e)
def stop_job(job_id):
    _job_dir = get_job_directory(job_id)
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _url = '/workflow/{}'.format(job_id)
        _method = 'DELETE'
        _packet = wrap_grpc_packet({}, _method, _url, _party_id, job_id)
        channel, stub = get_proxy_data_channel()
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to stop workflow'.format(
                job_id, _party_id, _role, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall stop to remote manager failed')
    return get_json_result()
def load_model():
    config = file_utils.load_json_conf(request.json.get("config_path"))
    _job_id = generate_job_id()
    channel, stub = get_proxy_data_channel()
    for _party_id in config.get("party_ids"):
        config['my_party_id'] = _party_id
        _method = 'POST'
        _url = '/model/load/do'
        _packet = wrap_grpc_packet(config, _method, _url, _party_id, _job_id)
        logger.info('Starting load model job_id:{} party_id:{} method:{} url:{}'.format(
            _job_id, _party_id, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} method:{} url:{} Failed to start load model'.format(
                _job_id, _party_id, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    return get_json_result()
def start_workflow(job_id, module, role):
    _data = request.json
    _job_dir = get_job_directory(job_id)
    _party_id = str(_data['local']['party_id'])
    _method = _data['WorkFlowParam']['method']
    conf_path_dir = os.path.join(_job_dir, _method, module, role, _party_id)
    os.makedirs(conf_path_dir, exist_ok=True)
    conf_file_path = os.path.join(conf_path_dir, 'runtime_conf.json')
    with open(conf_file_path, 'w+') as f:
        f.truncate()
        f.write(json.dumps(_data, indent=4))
        f.flush()
    if os.name == 'nt':
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
    else:
        startupinfo = None
    task_pid_path = os.path.join(_job_dir, 'pids')
    std_log = open(os.path.join(_job_dir, role + '.std.log'), 'w')
    progs = ["python3",
             os.path.join(file_utils.get_project_base_directory(), _data['CodePath']),
             "-j", job_id,
             "-c", os.path.abspath(conf_file_path)]
    logger.info('Starting progs: {}'.format(" ".join(progs)))
    p = subprocess.Popen(progs, stdout=std_log, stderr=std_log, startupinfo=startupinfo)
    os.makedirs(task_pid_path, exist_ok=True)
    with open(os.path.join(task_pid_path, role + ".pid"), 'w') as f:
        f.truncate()
        f.write(str(p.pid) + "\n")
        f.flush()
    job_data = dict()
    job_data["begin_date"] = datetime.datetime.now()
    job_data["status"] = "ready"
    with open(conf_file_path) as fr:
        config = json.load(fr)
    job_data.update(config)
    job_data["my_role"] = config.get("local", {}).get("role")
    save_job_info(job_id=job_id, **job_data)
    update_job_queue(job_id=job_id, update_data={"status": "ready"})
    return get_json_result(msg="success, pid is %s" % p.pid)
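
# The pid files written by start_workflow() and download_data() ("<pid>\n" per process,
# stored under <job_dir>/pids/<name>.pid) are what stop_workflow() walks when it
# terminates a job. A minimal reader sketch of that same layout; _read_task_pids is a
# hypothetical helper added for illustration only, not part of the manager's API.
def _read_task_pids(task_pid_path, name):
    pid_file = os.path.join(task_pid_path, "{}.pid".format(name))
    if not os.path.exists(pid_file):
        return []
    with open(pid_file, 'r') as f:
        # one pid per line; drop the empty string left by the trailing newline
        return [int(pid) for pid in f.read().split('\n') if pid]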