def rerun_job(job_id, role, party_id):
    DAGScheduler.set_job_rerun(job_id=job_id,
                               initiator_role=role,
                               initiator_party_id=party_id,
                               component_name=request.json.get("component_name"),
                               force=request.json.get("force", False),
                               auto=False)
    # TODO: check the job status before allowing a rerun
    return get_json_result(retcode=0, retmsg='success')
def submit_job():
    submit_result = DAGScheduler.submit(JobConfigurationBase(**request.json))
    return get_json_result(retcode=submit_result["code"],
                           retmsg=submit_result["message"],
                           job_id=submit_result["job_id"],
                           data=submit_result if submit_result["code"] == RetCode.SUCCESS else None)
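# A minimal client-side sketch for exercising the submit handler above. The route
# path ("/v1/job/submit"), the HTTP port (9380) and the exact field names consumed
# by JobConfigurationBase (here "dsl" / "runtime_conf") are assumptions about a
# typical FATE Flow deployment, not facts taken from this code.
import requests

job_conf = {
    "dsl": {"components": {}},  # hypothetical; normally produced by pipeline tooling
    "runtime_conf": {"initiator": {"role": "guest", "party_id": 10000}},
}
resp = requests.post("http://127.0.0.1:9380/v1/job/submit", json=job_conf)
print(resp.json())  # expected keys: retcode, retmsg, job_id, data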
def download_upload(access_module):
    job_id = job_utils.generate_job_id()
    if access_module == "upload" and UPLOAD_DATA_FROM_CLIENT and not (request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        filename = os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp', file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            shutil.rmtree(os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        job_config = request.args.to_dict()
        if "namespace" in job_config and "table_name" in job_config:
            pass
        else:
            # higher than version 1.5.1, support eggroll run parameters
            job_config = json_loads(list(job_config.keys())[0])
        job_config['file'] = filename
    else:
        job_config = request.json
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('can not support this operation: {}'.format(access_module))
    detect_utils.check_config(job_config, required_arguments=required_arguments)
    data = {}
    # compatibility
    if "table_name" in job_config:
        job_config["name"] = job_config["table_name"]
    if "backend" not in job_config:
        job_config["backend"] = 0
    for _ in ["work_mode", "backend", "head", "partition", "drop"]:
        if _ in job_config:
            job_config[_] = int(job_config[_])
    if access_module == "upload":
        if job_config.get('drop', 0) == 1:
            job_config["destroy"] = True
        else:
            job_config["destroy"] = False
        data['table_name'] = job_config["table_name"]
        data['namespace'] = job_config["namespace"]
        data_table_meta = storage.StorageTableMeta(name=job_config["table_name"], namespace=job_config["namespace"])
        if data_table_meta and not job_config["destroy"]:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter -drop: '
                                          '0 means do not delete and continue uploading, '
                                          '1 means delete the table and upload again.')
    job_dsl, job_runtime_conf = gen_data_access_job_config(job_config, access_module)
    submit_result = DAGScheduler.submit({'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}, job_id=job_id)
    data.update(submit_result)
    return get_json_result(job_id=job_id, data=data)
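# A hedged sketch of how a client might hit the upload branch above when
# UPLOAD_DATA_FROM_CLIENT is enabled: the file travels as multipart form data and
# the job config rides in the query string. The route ("/v1/data/upload"), port
# and the example table/file names are assumptions, not taken from this code.
import requests

params = {
    "work_mode": 0, "namespace": "experiment", "table_name": "breast_hetero_guest",
    "head": 1, "partition": 4, "drop": 1,
}
with open("breast_hetero_guest.csv", "rb") as f:  # hypothetical local data file
    resp = requests.post("http://127.0.0.1:9380/v1/data/upload",
                         params=params, files={"file": f})
print(resp.json())  # contains the generated job_id plus the scheduler's submit result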
def submit_job():
    work_mode = JobRuntimeConfigAdapter(request.json.get('job_runtime_conf', {})).get_job_work_mode()
    detect_utils.check_config({'work_mode': work_mode},
                              required_arguments=[('work_mode', (WorkMode.CLUSTER, WorkMode.STANDALONE))])
    submit_result = DAGScheduler.submit(request.json)
    return get_json_result(retcode=0, retmsg='success',
                           job_id=submit_result.get("job_id"),
                           data=submit_result)
def update_parameters():
    job_info = request.json
    component_parameters = job_info.pop("component_parameters", None)
    job_parameters = job_info.pop("job_parameters", None)
    job_info["is_initiator"] = True
    jobs = JobSaver.query_job(**job_info)
    if not jobs:
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg=log_utils.failed_log(f"query job by {job_info}"))
    else:
        retcode, retdata = DAGScheduler.update_parameters(jobs[0], job_parameters, component_parameters)
        return get_json_result(retcode=retcode, data=retdata)
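# A sketch of the payload shape this handler appears to expect, inferred only from
# the keys it pops above: everything other than component_parameters and
# job_parameters is forwarded to JobSaver.query_job. The endpoint path, job id and
# parameter names below are purely illustrative assumptions.
import requests

payload = {
    "job_id": "202201010000000000000",  # hypothetical job id
    "role": "guest",
    "party_id": 10000,
    "component_parameters": {"common": {"hetero_lr_0": {"max_iter": 50}}},
    "job_parameters": {"common": {"auto_retries": 2}},
}
resp = requests.post("http://127.0.0.1:9380/v1/job/parameter/update", json=payload)
print(resp.json())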
def operate_model(model_operation):
    request_config = request.json or request.form.to_dict()
    job_id = job_utils.generate_job_id()
    if model_operation not in [ModelOperation.STORE, ModelOperation.RESTORE,
                               ModelOperation.EXPORT, ModelOperation.IMPORT]:
        raise Exception('Can not support this operation now: {}'.format(model_operation))
    required_arguments = ["model_id", "model_version", "role", "party_id"]
    check_config(request_config, required_arguments=required_arguments)
    request_config["model_id"] = gen_party_model_id(model_id=request_config["model_id"],
                                                    role=request_config["role"],
                                                    party_id=request_config["party_id"])
    if model_operation in [ModelOperation.EXPORT, ModelOperation.IMPORT]:
        if model_operation == ModelOperation.IMPORT:
            try:
                file = request.files.get('file')
                file_path = os.path.join(TEMP_DIRECTORY, file.filename)
                # if not os.path.exists(file_path):
                #     raise Exception('The file is obtained from the fate flow client machine, but it does not exist, '
                #                     'please check the path: {}'.format(file_path))
                try:
                    os.makedirs(os.path.dirname(file_path), exist_ok=True)
                    file.save(file_path)
                except Exception as e:
                    shutil.rmtree(file_path)
                    raise e
                request_config['file'] = file_path
                model = pipelined_model.PipelinedModel(model_id=request_config["model_id"],
                                                       model_version=request_config["model_version"])
                model.unpack_model(file_path)
                pipeline = model.read_component_model('pipeline', 'pipeline')['Pipeline']
                train_runtime_conf = json_loads(pipeline.train_runtime_conf)
                permitted_party_id = []
                for key, value in train_runtime_conf.get('role', {}).items():
                    for v in value:
                        permitted_party_id.extend([v, str(v)])
                if request_config["party_id"] not in permitted_party_id:
                    shutil.rmtree(model.model_path)
                    raise Exception("party id {} is not in model roles, please check if the party id is valid.".format(
                        request_config["party_id"]))
                try:
                    adapter = JobRuntimeConfigAdapter(train_runtime_conf)
                    job_parameters = adapter.get_common_parameters().to_dict()
                    with DB.connection_context():
                        db_model = MLModel.get_or_none(MLModel.f_job_id == job_parameters.get("model_version"),
                                                       MLModel.f_role == request_config["role"])
                    if not db_model:
                        model_info = model_utils.gather_model_info_data(model)
                        model_info['imported'] = 1
                        model_info['job_id'] = model_info['f_model_version']
                        model_info['size'] = model.calculate_model_file_size()
                        model_info['role'] = request_config["model_id"].split('#')[0]
                        model_info['party_id'] = request_config["model_id"].split('#')[1]
                        if model_utils.compare_version(model_info['f_fate_version'], '1.5.1') == 'lt':
                            model_info['roles'] = model_info.get('f_train_runtime_conf', {}).get('role', {})
                            model_info['initiator_role'] = model_info.get('f_train_runtime_conf', {}).get('initiator', {}).get('role')
                            model_info['initiator_party_id'] = model_info.get('f_train_runtime_conf', {}).get('initiator', {}).get('party_id')
                            model_info['work_mode'] = adapter.get_job_work_mode()
                            model_info['parent'] = False if model_info.get('f_inference_dsl') else True
                        model_utils.save_model_info(model_info)
                    else:
                        stat_logger.info(f'job id: {job_parameters.get("model_version")}, '
                                         f'role: {request_config["role"]} model info already existed in database.')
                except peewee.IntegrityError as e:
                    stat_logger.exception(e)
                operation_record(request_config, "import", "success")
                return get_json_result()
            except Exception:
                operation_record(request_config, "import", "failed")
                raise
        else:
            try:
                model = pipelined_model.PipelinedModel(model_id=request_config["model_id"],
                                                       model_version=request_config["model_version"])
                if model.exists():
                    archive_file_path = model.packaging_model()
                    operation_record(request_config, "export", "success")
                    return send_file(archive_file_path,
                                     attachment_filename=os.path.basename(archive_file_path),
                                     as_attachment=True)
                else:
                    operation_record(request_config, "export", "failed")
                    res = error_response(response_code=210,
                                         retmsg="Model {} {} does not exist.".format(request_config.get("model_id"),
                                                                                     request_config.get("model_version")))
                    return res
            except Exception as e:
                operation_record(request_config, "export", "failed")
                stat_logger.exception(e)
                return error_response(response_code=210, retmsg=str(e))
    else:
        data = {}
        job_dsl, job_runtime_conf = gen_model_operation_job_config(request_config, model_operation)
        submit_result = DAGScheduler.submit({'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}, job_id=job_id)
        data.update(submit_result)
        operation_record(data=job_runtime_conf, oper_type=model_operation, oper_status='')
        return get_json_result(job_id=job_id, data=data)
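# A hedged sketch of driving the export branch above from a client: the handler
# streams the packaged archive back via send_file, so the client just writes the
# raw response body to disk. The route ("/v1/model/export"), port, model_id and
# model_version values are assumptions for illustration only.
import requests

body = {
    "model_id": "arbiter-10000#guest-9999#host-10000#model",  # hypothetical model id
    "model_version": "202201010000000000000",                 # hypothetical version
    "role": "guest",
    "party_id": "9999",
}
resp = requests.post("http://127.0.0.1:9380/v1/model/export", json=body, stream=True)
if resp.status_code == 200:
    with open("exported_model.zip", "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)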
def stop_job(job_id, role, party_id, stop_status):
    retcode, retmsg = DAGScheduler.stop_job(job_id=job_id, role=role, party_id=party_id, stop_status=stop_status)
    return get_json_result(retcode=retcode, retmsg=retmsg)
def rerun_job(job_id, role, party_id):
    DAGScheduler.rerun_job(job_id=job_id,
                           initiator_role=role,
                           initiator_party_id=party_id,
                           component_name=request.json.get("component_name"))
    return get_json_result(retcode=0, retmsg='success')
init_arch_db()
# init runtime config
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--standalone_node', default=False, action='store_true',
                    help="run in standalone node mode or not")
args = parser.parse_args()
RuntimeConfig.init_env()
RuntimeConfig.set_process_role(ProcessRole.DRIVER)
PrivilegeAuth.init()
ServiceUtils.register()
ResourceManager.initialize()
Detector(interval=5 * 1000).start()
DAGScheduler(interval=2 * 1000).start()
thread_pool_executor = ThreadPoolExecutor(max_workers=GRPC_SERVER_MAX_WORKERS)
stat_logger.info(f"start grpc server thread pool with {thread_pool_executor._max_workers} max workers")
server = grpc.server(thread_pool=thread_pool_executor,
                     options=[(cygrpc.ChannelArgKey.max_send_message_length, -1),
                              (cygrpc.ChannelArgKey.max_receive_message_length, -1)])
proxy_pb2_grpc.add_DataTransferServiceServicer_to_server(UnaryService(), server)
server.add_insecure_port("{}:{}".format(IP, GRPC_PORT))
server.start()
stat_logger.info("FATE Flow grpc server started successfully")
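# Note that server.start() returns immediately, so something else has to keep the
# process alive (in the full FATE Flow server that is the HTTP service started
# elsewhere). A minimal standalone sketch, assuming grpcio >= 1.24 for
# wait_for_termination, would block on the gRPC server itself and stop it cleanly:
import signal

def _graceful_exit(signum, frame):
    # give in-flight RPCs up to 5 seconds to finish before stopping
    server.stop(grace=5)

signal.signal(signal.SIGTERM, _graceful_exit)
server.wait_for_termination()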