def clean_job(job_id):
    """Best-effort cleanup: drop every non-persistent eggroll table in the job's namespace.

    Any failure is logged and swallowed so callers can treat cleanup as fire-and-forget.
    """
    try:
        logger.info('ready clean job {}'.format(job_id))
        eggroll.cleanup('*', namespace=job_id, persistent=False)
        logger.info('send clean job {}'.format(job_id))
    except Exception as e:
        # cleanup is not allowed to propagate errors to the caller
        logger.exception(e)
def run_job(self, job_id, config):
    """Expand the submitted job config into per-party runtime confs and start
    each party's workflow through the proxy channel.

    :param job_id: unique id of the job being launched.
    :param config: submitted job configuration dict.
    :return: True when every workflow was submitted; on the first gRPC failure
             the error result from get_json_result is returned (unchanged from
             the original failure behavior).
    """
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    # bug fix: the first placeholder is the job id, but the whole config dict
    # was being formatted into it
    logger.info('job_id {} parameters overrode {}'.format(job_id, _job_dir))
    channel, stub = get_proxy_data_channel()
    try:
        for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
            runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
            _role = runtime_conf['local']['role']
            _party_id = runtime_conf['local']['party_id']
            _method = 'POST'
            _module = runtime_conf['module']
            _url = '/workflow/{}/{}/{}'.format(job_id, _module, _role)
            _packet = wrap_grpc_packet(runtime_conf, _method, _url, _party_id, job_id)
            logger.info(
                'Starting workflow job_id:{} party_id:{} role:{} method:{} url:{}'.format(
                    job_id, _party_id, _role, _method, _url))
            try:
                _return = stub.unaryCall(_packet)
                logger.info("Grpc unary response: {}".format(_return))
            except grpc.RpcError as e:
                msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(
                    job_id, _party_id, _role, _method, _url)
                logger.exception(msg)
                # NOTE(review): this error object is truthy, so callers that
                # truth-test the return treat it as success — confirm intent
                return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    finally:
        # bug fix: the grpc channel was never closed (resource leak)
        channel.close()
    # bug fix: callers (run_do) truth-test the return value as a success flag,
    # but the original implicitly returned None (falsy) on the success path
    return True
def run_do(self):
    """Launch at most one waiting job when below the concurrency cap, then run
    the periodic check over running jobs. All errors are logged, not raised."""
    logger.info("{} job are running.".format(running_job_amount()))
    try:
        if running_job_amount() < MAX_CONCURRENT_JOB_RUN:
            queued = get_job_from_queue(status="waiting", limit=1)
            if queued:
                job_entry = queued[0]
                job_id_to_run = job_entry.job_id
                try:
                    launched = self.run_job(job_id=job_id_to_run,
                                            config=json.loads(job_entry.config))
                except Exception as e:
                    launched = False
                    logger.exception(e)
                if launched:
                    update_job_queue(job_id=job_id_to_run,
                                     role=job_entry.role,
                                     party_id=job_entry.party_id,
                                     save_data={"status": "ready"})
                else:
                    # a job that failed to launch is dropped from the queue
                    pop_from_job_queue(job_id=job_id_to_run)
        logger.info("check waiting jobs done.")
        self.check_job()
    except Exception as e:
        logger.exception(e)
def stop_workflow(job_id, role, party_id):
    """Kill every recorded task process of the job, mark the job failed, drop
    it from the queue, and clean up its eggroll tables."""
    _job_dir = get_job_directory(job_id)
    pids_dir = os.path.join(_job_dir, 'pids')
    if os.path.isdir(pids_dir):
        for pid_file in os.listdir(pids_dir):
            try:
                if not pid_file.endswith('.pid'):
                    continue
                with open(os.path.join(pids_dir, pid_file), 'r') as f:
                    recorded_pids = f.read().split('\n')
                for pid_text in recorded_pids:
                    try:
                        if len(pid_text) == 0:
                            continue
                        logger.debug("terminating process pid:{} {}".format(pid_text, pid_file))
                        proc = psutil.Process(int(pid_text))
                        # kill children first so none are orphaned
                        for child in proc.children(recursive=True):
                            child.kill()
                        proc.kill()
                    except NoSuchProcess:
                        # already gone — nothing to do
                        continue
            except Exception:
                logger.exception("error")
                continue
    set_job_failed(job_id=job_id, role=role, party_id=party_id)
    pop_from_job_queue(job_id=job_id)
    clean_job(job_id=job_id)
    return get_json_result(job_id=job_id)
def import_offline_feature():
    """Handle an offline-feature import callback: look up the job by id, hand
    the payload to GetFeature, and mark the job successful on a zero status."""
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    try:
        job_id = request_data.get("jobId")
        if not job_id:
            return get_json_result(status=2, msg="no job id")
        job_data = query_job_by_id(job_id=job_id)
        if not job_data:
            return get_json_result(status=3,
                                   msg="can not found this job id: %s" % request_data.get("jobId", ""))
        response = GetFeature.import_data(request_data, json.loads(job_data[0]["config"]))
        if response.get("status", 1) != 0:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" % response.get("msg", ""))
        update_job_by_id(job_id=job_id,
                         update_data={"status": "success",
                                      "end_date": datetime.datetime.now()})
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="request offline feature error: %s" % e)
def do_load_model():
    """Inject the configured serving endpoints into the request and load the model."""
    request_data = request.json
    try:
        servings = server_conf.get("servers", {}).get("servings", [])
        request_data["servings"] = servings
        publish_model.load_model(config_data=request_data)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
def import_id():
    """Import one batch of ids into a temporary DTable in the id_library
    namespace. On the final batch, verify the total count, store the salt
    metadata, switch ``use_data_id`` to the new table and destroy the old one.

    :return: a JSON result; status 2 when the final count mismatches, 1 on error.
    """
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info", table_name_space, partition=10,
                                        create_if_missing=True, error_if_exist=False)
        # bug fix: dicts and DTables have no .request() method — use .get()
        if request_data.get("rangeStart") == 0:
            # first batch: allocate a fresh table id and remember it
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id, table_name_space, partition=50,
                                   create_if_missing=True, error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get("total") and (
                request_data.get("total") - request_data.get("rangeEnd") == 1):
            # end: all batches received — validate before switching over
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({
                        "salt": request_data.get("salt"),
                        "saltMethod": request_data.get("saltMethod")
                    }))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                # bug fix: std logging does not substitute {} placeholders from
                # positional args — format the message explicitly
                logger.info(
                    "import id success, dtable name is {}, namespace is {}".format(
                        data_id, table_name_space))
                # TODO: destroy DTable, should be use a lock
                if old_data_id is not None:
                    # bug fix: skip destruction when no previous table was in use
                    old_data_table = eggroll.table(old_data_id, table_name_space, partition=50,
                                                   create_if_missing=True, error_if_exist=False)
                    old_data_table.destroy()
                    id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(
                    2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
def query_model_version_history():
    """Return the version history of the model table named in the config file."""
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        namespace = config.get("namespace")
        history = version_history(data_table_namespace=namespace)
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
def publish_model_online():
    """Publish a model online, defaulting to this party's serving endpoints."""
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        if not config.get('servings'):
            # get my party all servings
            config['servings'] = SERVINGS
        publish_model.publish_online(config_data=config)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="publish model error: %s" % e)
def federated_api(job_id, method, url, party_id, json_body=None, overall_timeout=DEFAULT_GRPC_OVERALL_TIMEOUT):
    """Send one federated request to a remote party through the proxy.

    :param job_id: job this request belongs to.
    :param method: HTTP-style method embedded in the packet (e.g. 'POST').
    :param url: target endpoint path.
    :param party_id: destination party.
    :param json_body: JSON-serializable payload; defaults to an empty dict.
    :param overall_timeout: gRPC overall timeout.
    :return: (0, response body) on success, (101, 'rpc error') on a gRPC
             failure, (102, str(e)) on any other error.
    """
    # bug fix: a mutable default argument ({}) is shared across calls
    if json_body is None:
        json_body = {}
    _packet = wrap_grpc_packet(json_body, method, url, party_id, job_id,
                               overall_timeout=overall_timeout)
    channel = None
    try:
        channel, stub = get_proxy_data_channel()
        _return = stub.unaryCall(_packet)
        logger.info("grpc unary response: {}".format(_return))
        return 0, _return.body.value
    except grpc.RpcError as e:
        logger.exception(e)
        return 101, 'rpc error'
    except Exception as e:
        logger.exception(e)
        return 102, str(e)
    finally:
        # bug fix: the channel leaked whenever the call raised
        if channel is not None:
            channel.close()
def request_offline_feature():
    """Kick off an offline-feature request and persist a running job record."""
    request_data = request.json
    try:
        job_id = uuid.uuid1().hex
        response = GetFeature.request(job_id, request_data)
        if response.get("status", 1) != 0:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" % response.get("msg", ""))
        job_data = dict(request_data)
        job_data["begin_date"] = datetime.datetime.now()
        job_data["status"] = "running"
        # keep the raw request around for later replays/inspection
        job_data["config"] = json.dumps(request_data)
        save_job_info(job_id=job_id, **job_data)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="request offline feature error: %s" % e)
def stop_job(job_id):
    """Ask every party with a runtime conf under the job directory to stop the
    workflow via a DELETE request through the proxy.

    :param job_id: id of the job to stop.
    :return: get_json_result() when all calls succeed; an error result as soon
             as one unaryCall fails.
    """
    _job_dir = get_job_directory(job_id)
    # bug fix: a fresh channel was opened on every loop iteration and never
    # closed — open one channel and release it when done
    channel, stub = get_proxy_data_channel()
    try:
        for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
            runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
            _role = runtime_conf['local']['role']
            _party_id = runtime_conf['local']['party_id']
            _url = '/workflow/{}'.format(job_id)
            _method = 'DELETE'
            _packet = wrap_grpc_packet({}, _method, _url, _party_id, job_id)
            try:
                _return = stub.unaryCall(_packet)
                logger.info("Grpc unary response: {}".format(_return))
            except grpc.RpcError as e:
                # bug fix: the message wrongly said "start" in a stop routine
                msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to stop workflow'.format(
                    job_id, _party_id, _role, _method, _url)
                logger.exception(msg)
                return get_json_result(-101, 'UnaryCall stop to remote manager failed')
    finally:
        channel.close()
    return get_json_result()
def load_model():
    """Fan a model-load request out to every configured party via the proxy.

    :return: get_json_result() when all parties were reached; an error result
             as soon as one unaryCall fails.
    """
    config = file_utils.load_json_conf(request.json.get("config_path"))
    _job_id = generate_job_id()
    channel, stub = get_proxy_data_channel()
    try:
        for _party_id in config.get("party_ids"):
            # each party receives the config stamped with its own id
            config['my_party_id'] = _party_id
            _method = 'POST'
            _url = '/model/load/do'
            _packet = wrap_grpc_packet(config, _method, _url, _party_id, _job_id)
            logger.info(
                'Starting load model job_id:{} party_id:{} method:{} url:{}'.format(
                    _job_id, _party_id, _method, _url))
            try:
                _return = stub.unaryCall(_packet)
                logger.info("Grpc unary response: {}".format(_return))
            except grpc.RpcError as e:
                msg = 'job_id:{} party_id:{} method:{} url:{} Failed to start load model'.format(
                    _job_id, _party_id, _method, _url)
                logger.exception(msg)
                return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    finally:
        # bug fix: the grpc channel was never closed (resource leak)
        channel.close()
    return get_json_result()
def stop_workflow(job_id, role, party_id):
    """Kill the job's recorded task processes, report the job as failed to the
    job-status endpoint, and clean up the job's eggroll tables."""
    _job_dir = get_job_directory(job_id)
    task_pid_path = os.path.join(_job_dir, 'pids')
    if os.path.isdir(task_pid_path):
        for pid_file in os.listdir(task_pid_path):
            try:
                if not pid_file.endswith('.pid'):
                    continue
                with open(os.path.join(task_pid_path, pid_file), 'r') as f:
                    pid_lines = f.read().split('\n')
                for raw_pid in pid_lines:
                    try:
                        if len(raw_pid) == 0:
                            continue
                        logger.debug("terminating process pid:{} {}".format(raw_pid, pid_file))
                        process = psutil.Process(int(raw_pid))
                        # terminate descendants before the parent
                        for child in process.children(recursive=True):
                            child.kill()
                        process.kill()
                    except NoSuchProcess:
                        continue
            except Exception:
                logger.exception("error")
                continue
    # tell the job-status endpoint this job is failed and being stopped
    federated_api(job_id=job_id,
                  method='POST',
                  url='/job/jobStatus/{}/{}/{}'.format(job_id, role, party_id),
                  party_id=party_id,
                  json_body={'status': 'failed', 'stopJob': True})
    clean_job(job_id=job_id)
    return get_json_result(job_id=job_id)
def download_upload(data_func):
    """Launch an upload or download job in a subprocess and report its pid and
    target table; data_func selects the module ('upload' or 'download')."""
    request_config = request.json
    _job_id = generate_job_id()
    logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    if module == "upload" and not os.path.isabs(request_config.get("file", "")):
        # relative upload paths are resolved against the project base directory
        request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                              request_config["file"])
    try:
        request_config["work_mode"] = request_config.get('work_mode', WORK_MODE)
        table_name, namespace = dtable_utils.get_table_info(config=request_config,
                                                            create=(module == 'upload'))
        if not table_name or not namespace:
            return get_json_result(status=102, msg='no table name and namespace')
        request_config['table_name'] = table_name
        request_config['namespace'] = namespace
        local_conf = request_config.get('local', {})
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=local_conf.get("role"),
                                          party_id=local_conf.get("party_id", PARTY_ID))
        file_utils.dump_json_conf(request_config, conf_file_path)
        module_path = os.path.join(file_utils.get_project_base_directory(),
                                   JOB_MODULE_CONF[module]["module_path"])
        progs = ["python3", module_path]
        if module == "download":
            # the download runner additionally needs the job id
            progs += ["-j", _job_id]
        progs += ["-c", conf_file_path]
        p = run_subprocess(job_dir=_job_dir, job_role=data_func, progs=progs)
        return get_json_result(job_id=_job_id,
                               data={'pid': p.pid,
                                     'table_name': request_config['table_name'],
                                     'namespace': request_config['namespace']})
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=-104, msg="failed", job_id=_job_id)
def internal_server_error(e):
    """Error handler: log the unhandled exception and wrap it in a JSON result."""
    logger.exception(e)
    error_text = str(e)
    return get_json_result(status=100, msg=error_text)