def bind_model_service():
    """Bind a model to a serving service id and report the result as JSON.

    If the request body carries a ``job_id``, the guest-side model info for
    that id is queried and used to fill ``initiator``, ``job_parameters`` and
    ``role`` in the request (party ids are converted to strings in place),
    after which ``job_id`` is removed from the request. If the request has no
    ``servings``, they are read from ``RuntimeConfig.SERVICE_DB``.

    Returns:
        The ``get_json_result`` response: retcode 101 when the model cannot be
        found or no service id was given, otherwise the status returned by
        ``publish_model.bind_model_service``.
    """
    bind_request = request.json

    if bind_request.get('job_id', None):
        _retcode, _retmsg, found_models = model_utils.query_model_info(
            bind_request['job_id'], 'guest')
        if not found_models:
            return get_json_result(
                retcode=101,
                retmsg="model {} can not be found in database. "
                       "Please check if the model version is valid.".format(
                           bind_request.get('job_id')))

        model_info = found_models[0]
        bind_request['initiator'] = {
            'party_id': str(model_info.get('f_initiator_party_id')),
            'role': model_info.get('f_initiator_role'),
        }

        # Prefer the job runtime conf; fall back to the training runtime conf.
        runtime_conf = (model_info.get('f_runtime_conf', {})
                        or model_info.get('f_train_runtime_conf', {}))
        common_parameters = JobRuntimeConfigAdapter(
            runtime_conf).get_common_parameters().to_dict()
        bind_request['job_parameters'] = (
            common_parameters
            or model_info.get('f_train_runtime_conf', {}).get('job_parameters'))
        bind_request['role'] = (
            runtime_conf.get('role')
            or model_info.get('f_train_runtime_conf', {}).get('role'))

        # Party ids are normalised to strings inside the existing lists.
        for party_ids in bind_request['role'].values():
            for index, party_id in enumerate(party_ids):
                party_ids[index] = str(party_id)

        bind_request.pop('job_id')

    if not bind_request.get('servings'):
        # get my party all servings
        bind_request['servings'] = RuntimeConfig.SERVICE_DB.get_urls('servings')

    service_id = bind_request.get('service_id')
    if not service_id:
        return get_json_result(retcode=101, retmsg='no service id')

    detect_utils.check_config(bind_request,
                              ['initiator', 'role', 'job_parameters'])
    bind_status, retmsg = publish_model.bind_model_service(bind_request)
    # A falsy bind status means the bind succeeded.
    operation_record(bind_request, "bind",
                     "failed" if bind_status else "success")
    return get_json_result(
        retcode=bind_status,
        retmsg=retmsg if retmsg else 'service id is {}'.format(service_id))
def component_output_model():
    """Return the output model (and its meta data) of one job component.

    Reads ``job_id``, ``role``, ``party_id`` and ``component_name`` from the
    JSON request body. The model id/version are taken from the party runtime
    conf of the job; when they cannot be read from there, the stored model
    configuration is consulted instead.

    Returns:
        A ``get_json_result`` response whose ``data`` is the ``*Param`` buffer
        of the component output model converted to a dict and whose ``meta``
        holds the ``*Meta`` buffer plus the component definition, or a
        ``'no data'`` response when no ``*Param`` buffer yields data.

    Raises:
        Exception: if neither the job configuration nor the model
            configuration provides the model info.
    """
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = \
        job_utils.get_job_configuration(job_id=request_data['job_id'],
                                        role=request_data['role'],
                                        party_id=request_data['party_id'])
    try:
        model_id = runtime_conf_on_party['job_parameters']['model_id']
        model_version = runtime_conf_on_party['job_parameters']['model_version']
    except Exception as e:
        # The job conf did not carry the model info; fall back to the
        # configuration saved with the model.
        job_dsl, job_runtime_conf, train_runtime_conf = \
            job_utils.get_model_configuration(job_id=request_data['job_id'],
                                              role=request_data['role'],
                                              party_id=request_data['party_id'])
        if not any([job_dsl, job_runtime_conf, train_runtime_conf]):
            stat_logger.exception(e)
            message = (
                f"Can not find model info by filters: job id: {request_data.get('job_id')}, "
                f"role: {request_data.get('role')}, party id: {request_data.get('party_id')}")
            stat_logger.error(message)
            raise Exception(message) from e
        # Fix: ``to_dict`` must be called; the original accessed ``.get`` on
        # the bound method for ``model_version`` and raised AttributeError.
        common_parameters = JobRuntimeConfigAdapter(
            job_runtime_conf).get_common_parameters().to_dict()
        model_id = common_parameters.get('model_id')
        model_version = common_parameters.get('model_version')

    tracker = Tracker(job_id=request_data['job_id'],
                      component_name=request_data['component_name'],
                      role=request_data['role'],
                      party_id=request_data['party_id'],
                      model_id=model_id,
                      model_version=model_version)
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])

    # There is only one model output at the current dsl version.
    component_output = component.get_output()
    model_alias = (component_output['model'][0]
                   if component_output.get('model') else 'default')
    output_model = tracker.get_output_model(model_alias)

    output_model_json = {}
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Param'):
            output_model_json = json_format.MessageToDict(
                buffer_object, including_default_value_fields=True)

    if not output_model_json:
        return get_json_result(retcode=0, retmsg='no data', data={})

    component_define = tracker.get_component_define()
    this_component_model_meta = {}
    for buffer_name, buffer_object in output_model.items():
        if buffer_name.endswith('Meta'):
            this_component_model_meta['meta_data'] = json_format.MessageToDict(
                buffer_object, including_default_value_fields=True)
    this_component_model_meta.update(component_define)
    return get_json_result(retcode=0, retmsg='success',
                           data=output_model_json,
                           meta=this_component_model_meta)
def job_config():
    """Return the DSL, runtime confs and model info of the first matching job.

    The JSON request body is passed as keyword filters to
    ``JobSaver.query_job``.

    Returns:
        A ``get_json_result`` response with retcode 101 when no job matches,
        otherwise retcode 0 and the job configuration as ``data``.
    """
    matched_jobs = JobSaver.query_job(**request.json)
    if not matched_jobs:
        return get_json_result(retcode=101, retmsg='find job failed')

    job = matched_jobs[0]
    response_data = {
        'job_id': job.f_job_id,
        'dsl': job.f_dsl,
        'runtime_conf': job.f_runtime_conf,
        'train_runtime_conf': job.f_train_runtime_conf,
    }
    common_parameters = JobRuntimeConfigAdapter(
        job.f_runtime_conf).get_common_parameters().to_dict()
    response_data['model_info'] = {
        'model_id': common_parameters.get('model_id'),
        'model_version': common_parameters.get('model_version'),
    }
    return get_json_result(retcode=0, retmsg='success', data=response_data)
def submit_job():
    """Validate the requested work mode and submit the job to the scheduler.

    The work mode is read from ``job_runtime_conf`` in the request body and
    must be one of ``WorkMode.CLUSTER`` / ``WorkMode.STANDALONE``.

    Returns:
        A ``get_json_result`` response carrying the submit result as ``data``
        and its ``job_id``.
    """
    runtime_conf = request.json.get('job_runtime_conf', {})
    work_mode = JobRuntimeConfigAdapter(runtime_conf).get_job_work_mode()

    allowed_work_modes = (WorkMode.CLUSTER, WorkMode.STANDALONE)
    detect_utils.check_config(
        {'work_mode': work_mode},
        required_arguments=[('work_mode', allowed_work_modes)])

    submit_result = DAGScheduler.submit(request.json)
    return get_json_result(retcode=0,
                           retmsg='success',
                           job_id=submit_result.get("job_id"),
                           data=submit_result)
def get_job_dsl_parser(dsl=None, runtime_conf=None, pipeline_dsl=None, train_runtime_conf=None):
    """Build and run the DSL parser matching the conf's ``dsl_version``.

    Args:
        dsl: job DSL handed to the parser.
        runtime_conf: job runtime conf; its ``dsl_version`` (default ``'1'``)
            selects the parser and its job type is used as parser mode.
        pipeline_dsl: pipeline DSL forwarded to the parser.
        train_runtime_conf: forwarded as ``pipeline_runtime_conf``.

    Returns:
        The parser instance after ``run`` has been called on it.
    """
    version = str(runtime_conf.get('dsl_version', '1'))
    parser = get_dsl_parser_by_version(version)

    default_conf_prefix = os.path.join(file_utils.get_python_base_directory(),
                                       'federatedml', 'conf',
                                       'default_runtime_conf')
    job_type = JobRuntimeConfigAdapter(runtime_conf).get_job_type()
    setting_conf_prefix = file_utils.get_federatedml_setting_conf_directory()

    parser.run(dsl=dsl,
               runtime_conf=runtime_conf,
               pipeline_dsl=pipeline_dsl,
               pipeline_runtime_conf=train_runtime_conf,
               default_runtime_conf_prefix=default_conf_prefix,
               setting_conf_prefix=setting_conf_prefix,
               mode=job_type)
    return parser
def get_job_dsl_parser(dsl=None, runtime_conf=None, pipeline_dsl=None, train_runtime_conf=None):
    """Build and run a DSL parser, upgrading version-1 inputs to version 2.

    Args:
        dsl: job DSL.
        runtime_conf: job runtime conf; its conf version selects the parser
            and its job type is used as parser mode.
        pipeline_dsl: pipeline DSL; converted together with
            ``train_runtime_conf`` only when both are truthy.
        train_runtime_conf: forwarded as ``pipeline_runtime_conf``.

    Returns:
        The parser instance after ``run`` has been called on it.
    """
    version = get_conf_version(runtime_conf)
    if version == 1:
        # Version-1 DSL/conf pairs are converted and parsed with the v2 parser.
        dsl, runtime_conf = convert_dsl_and_conf_v1_to_v2(dsl, runtime_conf)
        if pipeline_dsl and train_runtime_conf:
            pipeline_dsl, train_runtime_conf = convert_dsl_and_conf_v1_to_v2(
                pipeline_dsl, train_runtime_conf)
        version = 2

    parser = get_dsl_parser_by_version(version)
    job_type = JobRuntimeConfigAdapter(runtime_conf).get_job_type()
    parser.run(dsl=dsl,
               runtime_conf=runtime_conf,
               pipeline_dsl=pipeline_dsl,
               pipeline_runtime_conf=train_runtime_conf,
               mode=job_type)
    return parser
def deploy():
    """Distribute a federated deploy task for a trained model to its parties.

    Requires ``model_id`` and ``model_version`` in the request body. The model
    files are scanned for the initiator's copy of the model (fate version not
    lower than 1.5.0); a new child model version is generated and a
    ``/model/deploy/do`` request is sent to every arbiter/host/guest party.

    Returns:
        A ``get_json_result`` response with retcode 0 when every party
        answered with a falsy retcode, else 101; ``data`` holds the per-party
        status, the model id and the new model version.

    Raises:
        Exception: if no model is found, or no model of the initiator role
            with a new enough fate version is found.
    """
    request_data = request.json
    check_config(request_data, ['model_id', 'model_version'])
    model_id = request_data.get("model_id")
    model_version = request_data.get("model_version")

    _retcode, _retmsg, model_info = model_utils.query_model_info_from_file(
        model_id=model_id, model_version=model_version, to_dict=True)
    if not model_info:
        raise Exception(
            f'Deploy model failed, no model {model_id} {model_version} found.')

    for model_key, model_record in model_info.items():
        if model_utils.compare_version(model_record.get('f_fate_version'),
                                       '1.5.0') == 'lt':
            continue

        # The second-to-last path segment of the key is "<role>#<party_id>".
        role_and_party = model_key.split('/')[-2].split('#')
        init_role = role_and_party[0]
        init_party_id = role_and_party[1]

        train_initiator = lambda: model_record.get(
            'f_train_runtime_conf', {}).get('initiator', {})
        model_init_role = (model_record.get('f_initiator_role')
                           or train_initiator().get('role', ''))
        # NOTE(review): other code reads 'f_initiator_party_id'; confirm that
        # 'f_initiator_role_party_id' is the intended key here.
        model_init_party_id = (model_record.get('f_initiator_role_party_id')
                               or train_initiator().get('party_id', ''))

        if init_role == model_init_role and init_party_id == str(model_init_party_id):
            break
    else:
        raise Exception(
            "Deploy model failed, can not found model of initiator role or the fate version of model is older than 1.5.0"
        )

    # distribute federated deploy task
    _job_id = job_utils.generate_job_id()
    request_data['child_model_version'] = _job_id

    initiator_party_id = model_init_party_id
    initiator_role = model_init_role
    request_data['initiator'] = {
        'role': initiator_role,
        'party_id': initiator_party_id,
    }

    deploy_status = True
    deploy_status_msg = 'success'
    deploy_status_info = {'detail': {}}

    train_roles = model_record.get("f_train_runtime_conf", {}).get('role', {})
    for role_name, role_partys in train_roles.items():
        if role_name not in ['arbiter', 'host', 'guest']:
            continue
        deploy_status_info[role_name] = deploy_status_info.get(role_name, {})
        deploy_status_info['detail'][role_name] = {}

        adapter = JobRuntimeConfigAdapter(
            model_record.get("f_train_runtime_conf", {}))
        work_mode = adapter.get_job_work_mode()

        for _party_id in role_partys:
            request_data['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/model/deploy/do',
                    src_party_id=initiator_party_id,
                    dest_party_id=_party_id,
                    src_role=initiator_role,
                    json_body=request_data,
                    federated_mode=(FederatedMode.MULTIPLE
                                    if work_mode else FederatedMode.SINGLE))
                deploy_status_info[role_name][_party_id] = response['retcode']
                deploy_status_info['detail'][role_name].update({
                    _party_id: {
                        'retcode': response['retcode'],
                        'retmsg': response['retmsg'],
                    }
                })
                if response['retcode']:
                    deploy_status = False
                    deploy_status_msg = 'failed'
            except Exception as e:
                stat_logger.exception(e)
                deploy_status = False
                deploy_status_msg = 'failed'
                deploy_status_info[role_name][_party_id] = 100

    deploy_status_info['model_id'] = request_data['model_id']
    deploy_status_info['model_version'] = _job_id
    return get_json_result(retcode=(0 if deploy_status else 101),
                           retmsg=deploy_status_msg,
                           data=deploy_status_info)
def load_model():
    """Ask every non-arbiter party of a model to load it.

    If the request body carries a ``job_id``, the guest-side model info for
    that version is queried and used to fill ``initiator``,
    ``job_parameters`` and ``role`` (party ids are converted to strings in
    place), after which ``job_id`` is removed. A ``/model/load/do`` request is
    then sent to each party of each role except ``arbiter``.

    Returns:
        A ``get_json_result`` response with retcode 101 when the model cannot
        be found or any party failed, otherwise 0; ``data`` holds the
        per-party status.
    """
    request_config = request.json

    if request_config.get('job_id', None):
        _retcode, _retmsg, found_models = model_utils.query_model_info(
            model_version=request_config['job_id'], role='guest')
        if not found_models:
            return get_json_result(
                retcode=101,
                retmsg="model with version {} can not be found in database. "
                       "Please check if the model version is valid.".format(
                           request_config.get('job_id')))

        model_info = found_models[0]
        request_config['initiator'] = {
            'party_id': str(model_info.get('f_initiator_party_id')),
            'role': model_info.get('f_initiator_role'),
        }

        # Prefer the job runtime conf; fall back to the training runtime conf.
        runtime_conf = (model_info.get('f_runtime_conf', {})
                        or model_info.get('f_train_runtime_conf', {}))
        common_parameters = JobRuntimeConfigAdapter(
            runtime_conf).get_common_parameters().to_dict()
        request_config['job_parameters'] = (
            common_parameters
            or model_info.get('f_train_runtime_conf', {}).get('job_parameters'))
        request_config['role'] = (
            runtime_conf.get('role')
            or model_info.get('f_train_runtime_conf', {}).get('role'))

        # Party ids are normalised to strings inside the existing lists.
        for party_ids in request_config['role'].values():
            for index, party_id in enumerate(party_ids):
                party_ids[index] = str(party_id)

        request_config.pop('job_id')

    _job_id = job_utils.generate_job_id()
    initiator_party_id = request_config['initiator']['party_id']
    initiator_role = request_config['initiator']['role']
    publish_model.generate_publish_model_info(request_config)

    load_status = True
    load_status_msg = 'success'
    load_status_info = {'detail': {}}

    # Derive the federated mode from the work mode when it was not given.
    job_parameters = request_config['job_parameters']
    if "federated_mode" not in job_parameters:
        work_mode = job_parameters["work_mode"]
        if work_mode == WorkMode.STANDALONE:
            job_parameters["federated_mode"] = FederatedMode.SINGLE
        elif work_mode == WorkMode.CLUSTER:
            job_parameters["federated_mode"] = FederatedMode.MULTIPLE

    for role_name, role_partys in request_config.get("role").items():
        if role_name == 'arbiter':
            continue
        load_status_info[role_name] = load_status_info.get(role_name, {})
        load_status_info['detail'][role_name] = {}

        for _party_id in role_partys:
            request_config['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                # The federated_mode lookup stays inside the try so a missing
                # key is reported as a failed party like any other error.
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/model/load/do',
                    src_party_id=initiator_party_id,
                    dest_party_id=_party_id,
                    src_role=initiator_role,
                    json_body=request_config,
                    federated_mode=request_config['job_parameters']['federated_mode'])
                load_status_info[role_name][_party_id] = response['retcode']
                load_status_info['detail'][role_name].update({
                    _party_id: {
                        'retcode': response['retcode'],
                        'retmsg': response['retmsg'],
                    }
                })
                if response['retcode']:
                    load_status = False
                    load_status_msg = 'failed'
            except Exception as e:
                stat_logger.exception(e)
                load_status = False
                load_status_msg = 'failed'
                load_status_info[role_name][_party_id] = 100

    return get_json_result(job_id=_job_id,
                           retcode=(0 if load_status else 101),
                           retmsg=load_status_msg,
                           data=load_status_info)
def operate_model(model_operation):
    """Handle a store / restore / export / import request for a party model.

    The request (JSON body or form data) must contain ``model_id``,
    ``model_version``, ``role`` and ``party_id``; ``model_id`` is replaced by
    the party model id before the operation runs.

    * IMPORT saves the uploaded ``file`` under ``TEMP_DIRECTORY``, unpacks it
      into the model directory, checks that the requesting party id appears in
      the model's roles and registers the model info in the database when it
      is not there yet.
    * EXPORT packages the model and sends the archive as an attachment.
    * STORE / RESTORE submit a model operation job to the scheduler.

    Args:
        model_operation: one of the ``ModelOperation`` values listed above.

    Returns:
        The response of the operation (JSON result, file attachment, or an
        ``error_response`` with code 210 when an export fails).

    Raises:
        Exception: for an unsupported operation, or when an import fails
            (the failure is recorded before the exception is re-raised).
    """
    request_config = request.json or request.form.to_dict()
    job_id = job_utils.generate_job_id()
    supported_operations = [
        ModelOperation.STORE, ModelOperation.RESTORE,
        ModelOperation.EXPORT, ModelOperation.IMPORT,
    ]
    if model_operation not in supported_operations:
        raise Exception(
            'Can not support this operating now: {}'.format(model_operation))
    required_arguments = ["model_id", "model_version", "role", "party_id"]
    check_config(request_config, required_arguments=required_arguments)
    request_config["model_id"] = gen_party_model_id(
        model_id=request_config["model_id"],
        role=request_config["role"],
        party_id=request_config["party_id"])

    if model_operation == ModelOperation.IMPORT:
        try:
            upload = request.files.get('file')
            if upload is None:
                # Fail with a clear message instead of an AttributeError on
                # ``None.filename``.
                raise Exception(
                    "No model file found in the 'file' field of the request.")
            # NOTE(review): the client-supplied filename is joined to
            # TEMP_DIRECTORY unsanitised; confirm that path components such as
            # '..' cannot reach this point.
            file_path = os.path.join(TEMP_DIRECTORY, upload.filename)
            try:
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                upload.save(file_path)
            except Exception:
                # file_path is a regular file: shutil.rmtree on it raises and
                # would hide the real error, so remove the partial file.
                if os.path.isfile(file_path):
                    os.remove(file_path)
                raise
            request_config['file'] = file_path

            model = pipelined_model.PipelinedModel(
                model_id=request_config["model_id"],
                model_version=request_config["model_version"])
            model.unpack_model(file_path)

            pipeline = model.read_component_model('pipeline', 'pipeline')['Pipeline']
            train_runtime_conf = json_loads(pipeline.train_runtime_conf)

            # Accept the party id whether it was sent as int or as str.
            permitted_party_id = []
            for party_ids in train_runtime_conf.get('role', {}).values():
                for party in party_ids:
                    permitted_party_id.extend([party, str(party)])
            if request_config["party_id"] not in permitted_party_id:
                shutil.rmtree(model.model_path)
                # Fix: the placeholder was never filled in the original.
                raise Exception(
                    "party id {} is not in model roles, please check if the party id is valid."
                    .format(request_config["party_id"]))

            try:
                adapter = JobRuntimeConfigAdapter(train_runtime_conf)
                job_parameters = adapter.get_common_parameters().to_dict()
                with DB.connection_context():
                    db_model = MLModel.get_or_none(
                        MLModel.f_job_id == job_parameters.get("model_version"),
                        MLModel.f_role == request_config["role"])
                if not db_model:
                    model_info = model_utils.gather_model_info_data(model)
                    model_info['imported'] = 1
                    model_info['job_id'] = model_info['f_model_version']
                    model_info['size'] = model.calculate_model_file_size()
                    role_and_party = request_config["model_id"].split('#')
                    model_info['role'] = role_and_party[0]
                    model_info['party_id'] = role_and_party[1]
                    if model_utils.compare_version(
                            model_info['f_fate_version'], '1.5.1') == 'lt':
                        # Fill these fields from the training conf for models
                        # whose fate version is lower than 1.5.1.
                        train_conf = model_info.get('f_train_runtime_conf', {})
                        model_info['roles'] = train_conf.get('role', {})
                        model_info['initiator_role'] = train_conf.get(
                            'initiator', {}).get('role')
                        model_info['initiator_party_id'] = train_conf.get(
                            'initiator', {}).get('party_id')
                        model_info['work_mode'] = adapter.get_job_work_mode()
                        model_info['parent'] = not model_info.get('f_inference_dsl')
                    model_utils.save_model_info(model_info)
                else:
                    stat_logger.info(
                        f'job id: {job_parameters.get("model_version")}, '
                        f'role: {request_config["role"]} model info already existed in database.'
                    )
            except peewee.IntegrityError as e:
                # Logged only; the import is still recorded as a success below.
                stat_logger.exception(e)
            operation_record(request_config, "import", "success")
            return get_json_result()
        except Exception:
            operation_record(request_config, "import", "failed")
            raise

    if model_operation == ModelOperation.EXPORT:
        try:
            model = pipelined_model.PipelinedModel(
                model_id=request_config["model_id"],
                model_version=request_config["model_version"])
            if model.exists():
                archive_file_path = model.packaging_model()
                operation_record(request_config, "export", "success")
                return send_file(
                    archive_file_path,
                    attachment_filename=os.path.basename(archive_file_path),
                    as_attachment=True)
            operation_record(request_config, "export", "failed")
            return error_response(
                response_code=210,
                retmsg="Model {} {} is not exist.".format(
                    request_config.get("model_id"),
                    request_config.get("model_version")))
        except Exception as e:
            operation_record(request_config, "export", "failed")
            stat_logger.exception(e)
            return error_response(response_code=210, retmsg=str(e))

    # STORE / RESTORE run as a scheduled job.
    data = {}
    job_dsl, job_runtime_conf = gen_model_operation_job_config(
        request_config, model_operation)
    submit_result = DAGScheduler.submit(
        {'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf},
        job_id=job_id)
    data.update(submit_result)
    operation_record(data=job_runtime_conf,
                     oper_type=model_operation,
                     oper_status='')
    return get_json_result(job_id=job_id, data=data)
def migration(config_data: dict):
    """Create a migrated copy of a local model for new roles/party ids.

    The local model is copied to a new model directory (new party model id,
    version ``unify_model_version``), its pipeline and component buffers are
    rewritten for the migrated roles, the result is packaged into an archive
    and the temporary model directory is removed.

    Args:
        config_data: migration request; the keys read here are ``model_id``,
            ``model_version``, ``unify_model_version``, ``local`` (``role``,
            ``party_id``, ``migrate_party_id``), ``role``, ``migrate_role``
            and ``migrate_initiator``.

    Returns:
        A ``(retcode, message, data)`` tuple: ``0`` with the new model id,
        version and archive path on success, or ``100`` with the error text
        and an empty dict on any failure (exceptions are not propagated).
    """
    try:
        party_model_id = model_utils.gen_party_model_id(
            model_id=config_data["model_id"],
            role=config_data["local"]["role"],
            party_id=config_data["local"]["party_id"])
        model = pipelined_model.PipelinedModel(
            model_id=party_model_id,
            model_version=config_data["model_version"])
        if not model.exists():
            raise Exception("Can not found {} {} model local cache".format(
                config_data["model_id"], config_data["model_version"]))

        with DB.connection_context():
            if MLModel.get_or_none(
                    MLModel.f_model_version == config_data["unify_model_version"]):
                raise Exception(
                    "Unify model version {} has been occupied in database. "
                    "Please choose another unify model version and try again."
                    .format(config_data["unify_model_version"]))

        model_data = model.collect_models(in_bytes=True)
        if "pipeline.pipeline:Pipeline" not in model_data:
            raise Exception("Can not found pipeline file in model.")

        migrate_model = pipelined_model.PipelinedModel(
            model_id=model_utils.gen_party_model_id(
                model_id=model_utils.gen_model_id(config_data["migrate_role"]),
                role=config_data["local"]["role"],
                party_id=config_data["local"]["migrate_party_id"]),
            model_version=config_data["unify_model_version"])

        shutil.copytree(src=model.model_path, dst=migrate_model.model_path)

        pipeline = migrate_model.read_component_model('pipeline', 'pipeline')['Pipeline']

        # Utilize Pipeline_model collect model data. And modify related inner
        # information of model
        train_runtime_conf = json_loads(pipeline.train_runtime_conf)
        train_runtime_conf["role"] = config_data["migrate_role"]
        train_runtime_conf["initiator"] = config_data["migrate_initiator"]

        adapter = JobRuntimeConfigAdapter(train_runtime_conf)
        train_runtime_conf = adapter.update_model_id_version(
            model_id=model_utils.gen_model_id(train_runtime_conf["role"]),
            model_version=migrate_model.model_version)

        # Read the updated common parameters once, as a dict. Fixes the
        # original ``to_dict.get(...)`` (method never called) and the
        # subscripting of the parameters object in the success message.
        common_parameters = adapter.get_common_parameters().to_dict()

        # update pipeline.pb file
        pipeline.train_runtime_conf = json_dumps(train_runtime_conf, byte=True)
        pipeline.model_id = bytes(common_parameters.get("model_id"), "utf-8")
        pipeline.model_version = bytes(
            common_parameters.get("model_version"), "utf-8")

        # save updated pipeline.pb file
        migrate_model.save_pipeline(pipeline)
        shutil.copyfile(
            os.path.join(migrate_model.model_path, "pipeline.pb"),
            os.path.join(migrate_model.model_path, "variables", "data",
                         "pipeline", "pipeline", "Pipeline"))

        # modify proto
        define_meta_path = os.path.join(migrate_model.model_path, 'define',
                                        'define_meta.yaml')
        with open(define_meta_path, 'r') as fin:
            define_yaml = yaml.safe_load(fin)

        for component_name, model_aliases in define_yaml['model_proto'].items():
            if component_name == 'pipeline':
                continue
            for model_alias in model_aliases.keys():
                buffer_obj = migrate_model.read_component_model(
                    component_name, model_alias)
                module_name = define_yaml['component_define'].get(
                    component_name, {}).get('module_name')
                modified_buffer = model_migration(
                    model_contents=buffer_obj,
                    module_name=module_name,
                    old_guest_list=config_data['role']['guest'],
                    new_guest_list=config_data['migrate_role']['guest'],
                    old_host_list=config_data['role']['host'],
                    new_host_list=config_data['migrate_role']['host'],
                    old_arbiter_list=config_data.get('role', {}).get('arbiter', None),
                    new_arbiter_list=config_data.get('migrate_role', {}).get('arbiter', None))
                migrate_model.save_component_model(
                    component_name=component_name,
                    component_module_name=module_name,
                    model_alias=model_alias,
                    model_buffers=modified_buffer)

        archive_path = migrate_model.packaging_model()
        # Only the archive is kept; the working copy is removed.
        shutil.rmtree(os.path.abspath(migrate_model.model_path))

        archive_abspath = os.path.abspath(archive_path)
        message = (
            "Migrating model successfully. "
            "The configuration of model has been modified automatically. "
            "New model id is: {}, model version is: {}. "
            "Model files can be found at '{}'.".format(
                common_parameters["model_id"],
                migrate_model.model_version,
                archive_abspath))
        return (0, message, {"model_id": migrate_model.model_id,
                             "model_version": migrate_model.model_version,
                             "path": archive_abspath})

    except Exception as e:
        return 100, str(e), {}
def submit(cls, job_data, job_id=None):
    """Create a job on the initiator and on all participants.

    Args:
        job_data: dict with ``job_dsl`` and ``job_runtime_conf``.
        job_id: id to use for the job; a new one is generated when falsy.

    Returns:
        dict with ``job_id``, ``model_info``, ``logs_directory`` and
        ``board_url``, updated with the paths returned by
        ``job_utils.save_job_conf``.

    Raises:
        Exception: if a predict job refers to a model that has not been
            deployed, if the initiator party id is not listed under the
            initiator role, or if creating the job on the parties fails.
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info(
        'submit job, job_id {}, body {}'.format(job_id, job_data))

    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    authentication_utils.check_constraint(job_runtime_conf, job_dsl)

    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()

    if common_job_parameters.job_type == 'predict':
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(),
                                  ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id,
                          role=job_initiator['role'],
                          party_id=job_initiator['party_id'],
                          model_id=common_job_parameters.model_id,
                          model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        train_runtime_conf = json_loads(
            pipeline_model['Pipeline'].train_runtime_conf)
        deployed = model_utils.check_if_deployed(
            role=job_initiator['role'],
            party_id=job_initiator['party_id'],
            model_id=common_job_parameters.model_id,
            model_version=common_job_parameters.model_version)
        if not deployed:
            raise Exception(
                f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
            )
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
    else:
        # generate job model info
        common_job_parameters.model_id = model_utils.gen_model_id(
            job_runtime_conf['role'])
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}

    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_role = job_initiator['role']
    job.f_party_id = job_initiator['party_id']

    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        role=job.f_initiator_role,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        job_runtime_conf_on_party={},
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)

    initiator_role_parties = job_runtime_conf['role'][job.f_initiator_role]
    if job.f_initiator_party_id not in initiator_role_parties:
        schedule_logger(job_id).info(
            "initiator party id error:{}".format(job.f_initiator_party_id))
        raise Exception(
            "initiator party id error {}".format(job.f_initiator_party_id))

    # create common parameters on initiator
    JobController.backend_compatibility(job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(role=job.f_initiator_role,
                                       job_parameters=common_job_parameters,
                                       create_initiator_baseline=True)

    job.f_runtime_conf = conf_adapter.update_common_parameters(
        common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf,
        train_runtime_conf=job.f_train_runtime_conf)

    # initiator runtime conf as template
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()

    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator
        # for scheduling
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                is_initiator = (role == job.f_initiator_role
                                and party_id == job.f_initiator_party_id)
                if is_initiator:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job.f_initiator_role,
                                               job.f_initiator_party_id,
                                               common_job_parameters,
                                               dsl_parser)

    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        # Mark the job failed on every party before reporting the error.
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)

    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(
            job.f_job_id, common_job_parameters.model_id))

    logs_directory = job_utils.get_job_log_directory(job_id)
    submit_result = {
        "job_id": job_id,
        "model_info": {
            "model_id": common_job_parameters.model_id,
            "model_version": common_job_parameters.model_version,
        },
        "logs_directory": logs_directory,
        "board_url": job_utils.get_board_url(job_id,
                                             job_initiator['role'],
                                             job_initiator['party_id']),
    }
    submit_result.update(path_dict)
    return submit_result
def deploy(config_data):
    """Create a deployable child model from a local parent model.

    The parent model directory is copied to a new model version
    (``child_model_version``), the pipeline is rewritten with the new version
    and the predict DSL, and the child model info is saved.

    Args:
        config_data: deploy request; the keys read here are ``model_id``,
            ``model_version``, ``local`` (``role``, ``party_id``),
            ``child_model_version`` and optionally ``dsl`` / ``predict_dsl`` /
            ``cpn_list`` (``cpn_list`` is popped from the dict when used).

    Returns:
        ``(0, message)`` on success, ``(100, message)`` when any step raised;
        the exception is logged, not propagated.
    """
    model_id = config_data.get('model_id')
    model_version = config_data.get('model_version')
    local_info = config_data.get('local')
    local_role = local_info.get('role')
    local_party_id = config_data.get('local').get('party_id')
    child_model_version = config_data.get('child_model_version')

    try:
        party_model_id = model_utils.gen_party_model_id(
            model_id=model_id, role=local_role, party_id=local_party_id)
        model = PipelinedModel(model_id=party_model_id,
                               model_version=model_version)
        model_data = model.collect_models(in_bytes=True)
        if "pipeline.pipeline:Pipeline" not in model_data:
            raise Exception("Can not found pipeline file in model.")

        # check if the model could be executed the deploy process (parent/child)
        if not check_before_deploy(model):
            raise Exception('Child model could not be deployed.')

        # copy proto content from parent model and generate a child model
        deploy_model = PipelinedModel(model_id=party_model_id,
                                      model_version=child_model_version)
        shutil.copytree(src=model.model_path, dst=deploy_model.model_path)
        pipeline = deploy_model.read_component_model('pipeline', 'pipeline')['Pipeline']

        # modify two pipeline files (model version/ train_runtime_conf)
        train_runtime_conf = json_loads(pipeline.train_runtime_conf)
        adapter = JobRuntimeConfigAdapter(train_runtime_conf)
        train_runtime_conf = adapter.update_model_id_version(
            model_version=deploy_model.model_version)
        pipeline.model_version = child_model_version
        pipeline.train_runtime_conf = json_dumps(train_runtime_conf, byte=True)

        parser = get_dsl_parser_by_version(
            train_runtime_conf.get('dsl_version', '1'))
        train_dsl = json_loads(pipeline.train_dsl)
        parent_predict_dsl = json_loads(pipeline.inference_dsl)

        if str(train_runtime_conf.get('dsl_version', '1')) == '1':
            predict_dsl = json_loads(pipeline.inference_dsl)
        elif config_data.get('dsl') or config_data.get('predict_dsl'):
            # A predict DSL supplied by the caller wins; 'dsl' has priority.
            predict_dsl = config_data.get('dsl') or config_data.get('predict_dsl')
            if not isinstance(predict_dsl, dict):
                predict_dsl = json_loads(predict_dsl)
        else:
            if config_data.get('cpn_list', None):
                cpn_list = config_data.pop('cpn_list')
            else:
                cpn_list = list(train_dsl.get('components', {}).keys())
            parser_version = train_runtime_conf.get('dsl_version', '1')
            # NOTE(review): this branch is only entered when the dsl version
            # is not '1', so the first case below looks unreachable — confirm.
            if str(parser_version) == '1':
                predict_dsl = parent_predict_dsl
            else:
                parser = schedule_utils.get_dsl_parser_by_version(parser_version)
                predict_dsl = parser.deploy_component(cpn_list, train_dsl)

        # save predict dsl into child model file
        parser.verify_dsl(predict_dsl, "predict")
        inference_dsl = parser.get_predict_dsl(
            role=local_role,
            predict_dsl=predict_dsl,
            setting_conf_prefix=file_utils.get_federatedml_setting_conf_directory())
        pipeline.inference_dsl = json_dumps(inference_dsl, byte=True)

        if model_utils.compare_version(pipeline.fate_version, '1.5.0') == 'gt':
            parent_info = {
                'parent_model_id': model_id,
                'parent_model_version': model_version,
            }
            pipeline.parent_info = json_dumps(parent_info, byte=True)
            pipeline.parent = False
            runtime_conf_on_party = json_loads(pipeline.runtime_conf_on_party)
            runtime_conf_on_party['job_parameters']['model_version'] = child_model_version
            pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party,
                                                        byte=True)

        # save model file
        deploy_model.save_pipeline(pipeline)
        shutil.copyfile(
            os.path.join(deploy_model.model_path, "pipeline.pb"),
            os.path.join(deploy_model.model_path, "variables", "data",
                         "pipeline", "pipeline", "Pipeline"))

        model_info = model_utils.gather_model_info_data(deploy_model)
        model_info['job_id'] = model_info['f_model_version']
        model_info['size'] = deploy_model.calculate_model_file_size()
        model_info['role'] = local_role
        model_info['party_id'] = local_party_id
        model_info['work_mode'] = adapter.get_job_work_mode()
        model_info['parent'] = not model_info.get('f_inference_dsl')

        if model_utils.compare_version(model_info['f_fate_version'], '1.5.0') == 'eq':
            model_info['roles'] = model_info.get(
                'f_train_runtime_conf', {}).get('role', {})
            model_info['initiator_role'] = model_info.get(
                'f_train_runtime_conf', {}).get('initiator', {}).get('role')
            model_info['initiator_party_id'] = model_info.get(
                'f_train_runtime_conf', {}).get('initiator', {}).get('party_id')

        model_utils.save_model_info(model_info)

    except Exception as e:
        stat_logger.exception(e)
        return 100, f"deploy model of role {local_role} {local_party_id} failed, details: {str(e)}"
    else:
        return 0, f"deploy model of role {local_role} {local_party_id} success"
def deploy(config_data):
    """Create a deployable child model from an existing parent model.

    Keys read from ``config_data``:
        model_id, model_version: identify the parent model.
        local: mapping holding this party's ``role`` and ``party_id``.
        child_model_version: version assigned to the generated child model.
        dsl / predict_dsl: optional inference DSL (dict or JSON string);
            ``dsl`` takes precedence when both are given.
        cpn_list: optional component list used when no DSL is supplied;
            it is popped from ``config_data`` when present and non-empty.
        components_checkpoint: optional mapping of component name to a
            mapping with ``step_index`` and/or ``step_name``.

    Returns:
        ``(0, message)`` on success, where the message carries a warning
        suffix if the v1-to-v2 DSL conversion reported one, or
        ``(100, message)`` when any exception is raised inside the deploy
        procedure (the exception is logged through ``stat_logger``).

    Note:
        ``config_data['local']`` is dereferenced before the ``try`` block,
        so a missing ``local`` entry raises instead of returning ``100``.
    """
    model_id = config_data.get('model_id')
    model_version = config_data.get('model_version')
    local_conf = config_data.get('local')
    local_role = local_conf.get('role')
    local_party_id = local_conf.get('party_id')
    child_model_version = config_data.get('child_model_version')
    components_checkpoint = config_data.get('components_checkpoint', {})
    warning_msg = ""

    try:
        party_model_id = gen_party_model_id(
            model_id=model_id, role=local_role, party_id=local_party_id)
        parent_model = PipelinedModel(
            model_id=party_model_id, model_version=model_version)
        if "pipeline.pipeline:Pipeline" not in parent_model.collect_models(
                in_bytes=True):
            raise Exception("Can not found pipeline file in model.")

        # Only models accepted by check_before_deploy may be deployed.
        if not check_before_deploy(parent_model):
            raise Exception('Child model could not be deployed.')

        def _skip_parent_checkpoints(src, names):
            # Leave out the 'checkpoint' entry of the parent's top-level
            # directory only; nested directories are copied untouched.
            return {'checkpoint'} if src == parent_model.model_path else {}

        # Clone the parent's files into the child model location.
        deploy_model = PipelinedModel(
            model_id=party_model_id, model_version=child_model_version)
        shutil.copytree(src=parent_model.model_path,
                        dst=deploy_model.model_path,
                        ignore=_skip_parent_checkpoints)
        pipeline_model = deploy_model.read_pipeline_model()

        train_runtime_conf = json_loads(pipeline_model.train_runtime_conf)
        runtime_conf_on_party = json_loads(
            pipeline_model.runtime_conf_on_party)
        dsl_version = train_runtime_conf.get("dsl_version", "1")
        parser = get_dsl_parser_by_version(dsl_version)
        train_dsl = json_loads(pipeline_model.train_dsl)
        parent_predict_dsl = json_loads(pipeline_model.inference_dsl)

        custom_dsl = config_data.get('dsl') or config_data.get('predict_dsl')
        if custom_dsl:
            # A caller-supplied DSL may arrive as a dict or a JSON string.
            inference_dsl = (custom_dsl if isinstance(custom_dsl, dict)
                             else json_loads(custom_dsl))
        else:
            cpn_list = (config_data.pop('cpn_list')
                        if config_data.get('cpn_list', None)
                        else list(train_dsl.get('components', {}).keys()))
            if int(dsl_version) == 1:
                # convert v1 dsl to v2 dsl
                inference_dsl, warning_msg = parser.convert_dsl_v1_to_v2(
                    parent_predict_dsl)
            else:
                parser = get_dsl_parser_by_version(dsl_version)
                inference_dsl = parser.deploy_component(cpn_list, train_dsl)

        # convert v1 conf to v2 conf
        if int(dsl_version) == 1:
            components = parser.get_components_light_weight(inference_dsl)

            from fate_flow.db.component_registry import ComponentRegistry
            job_providers = parser.get_job_providers(
                dsl=inference_dsl, provider_detail=ComponentRegistry.REGISTRY)
            cpn_role_parameters = {}
            for cpn in components:
                cpn_name = cpn.get_name()
                provider = job_providers[cpn_name]["provider"]
                cpn_role_parameters[cpn_name] = (
                    parser.parse_component_role_parameters(
                        component=cpn_name,
                        dsl=inference_dsl,
                        runtime_conf=train_runtime_conf,
                        provider_detail=ComponentRegistry.REGISTRY,
                        provider_name=provider["name"],
                        provider_version=provider["version"]))
            train_runtime_conf = parser.convert_conf_v1_to_v2(
                train_runtime_conf, cpn_role_parameters)

        train_runtime_conf = JobRuntimeConfigAdapter(
            train_runtime_conf).update_model_id_version(
                model_version=deploy_model.model_version)
        pipeline_model.model_version = child_model_version
        pipeline_model.train_runtime_conf = json_dumps(train_runtime_conf,
                                                       byte=True)

        # save inference dsl into child model file
        parser = get_dsl_parser_by_version(2)
        parser.verify_dsl(inference_dsl, "predict")
        inference_dsl = JobSaver.fill_job_inference_dsl(
            job_id=model_version,
            role=local_role,
            party_id=local_party_id,
            dsl_parser=parser,
            origin_inference_dsl=inference_dsl)
        pipeline_model.inference_dsl = json_dumps(inference_dsl, byte=True)

        if compare_version(pipeline_model.fate_version, '1.5.0') == 'gt':
            parent_info = {
                'parent_model_id': model_id,
                'parent_model_version': model_version,
            }
            pipeline_model.parent_info = json_dumps(parent_info, byte=True)
            pipeline_model.parent = False
            runtime_conf_on_party['job_parameters'][
                'model_version'] = child_model_version
            pipeline_model.runtime_conf_on_party = json_dumps(
                runtime_conf_on_party, byte=True)

        # save model file
        deploy_model.save_pipeline(pipeline_model)
        pipeline_pb = os.path.join(deploy_model.model_path, "pipeline.pb")
        pipeline_copy = os.path.join(deploy_model.model_path, "variables",
                                     "data", "pipeline", "pipeline",
                                     "Pipeline")
        shutil.copyfile(pipeline_pb, pipeline_copy)

        model_info = gather_model_info_data(deploy_model)
        model_info['job_id'] = model_info['f_model_version']
        model_info['size'] = deploy_model.calculate_model_file_size()
        model_info['role'] = local_role
        model_info['party_id'] = local_party_id
        model_info['parent'] = not model_info.get('f_inference_dsl')
        if compare_version(model_info['f_fate_version'], '1.5.0') == 'eq':
            recorded_train_conf = model_info.get('f_train_runtime_conf', {})
            model_info['roles'] = recorded_train_conf.get('role', {})
            model_info['initiator_role'] = recorded_train_conf.get(
                'initiator', {}).get('role')
            model_info['initiator_party_id'] = recorded_train_conf.get(
                'initiator', {}).get('party_id')
        save_model_info(model_info)

        # Deploy the requested checkpoint of every component that has one
        # selected; an explicit step_index wins over step_name.
        for component_name, component in train_dsl.get('components',
                                                       {}).items():
            selection = components_checkpoint.get(component_name, {})
            step_index = selection.get('step_index')
            step_name = selection.get('step_name')
            if step_index is None:
                if step_name is None:
                    continue
            else:
                step_index = int(step_index)
                step_name = None

            checkpoint_manager = CheckpointManager(
                role=local_role,
                party_id=local_party_id,
                model_id=model_id,
                model_version=model_version,
                component_name=component_name,
                mkdir=False,
            )
            checkpoint_manager.load_checkpoints_from_disk()
            if checkpoint_manager.latest_checkpoint is not None:
                output_models = component.get('output', {}).get('model')
                model_alias = output_models[0] if output_models else 'default'
                checkpoint_manager.deploy(
                    child_model_version,
                    model_alias,
                    step_index,
                    step_name,
                )
    except Exception as e:
        stat_logger.exception(e)
        return 100, f"deploy model of role {local_role} {local_party_id} failed, details: {str(e)}"

    msg = f"deploy model of role {local_role} {local_party_id} success"
    if warning_msg:
        msg += f", warning: {warning_msg}"
    return 0, msg
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
    """Submit a job from the initiator side and report the outcome.

    A job id is generated when ``job_id`` is empty. For a job whose type is
    not ``"predict"``, a v2 runtime conf is required, a model id is generated
    from the roles and the job id is used as model version. For a predict
    job, ``model_id`` and ``model_version`` must be present, the model must
    already be deployed, and the inference DSL stored in the pipeline model
    replaces the submitted DSL.

    The job conf is saved, the job is created through
    ``FederatedScheduler.create_job`` and finally synced with status
    ``JobStatus.WAITING``.

    Args:
        submit_job_conf: carries the ``dsl`` and ``runtime_conf`` of the job.
        job_id: optional pre-assigned job id.

    Returns:
        A dict that always contains ``job_id``. On success it also holds
        ``code`` (``RetCode.SUCCESS``), ``message``, ``model_info``,
        ``logs_directory``, ``board_url`` and the entries returned by
        ``job_utils.save_job_conf``. On any exception it holds ``code``
        (``RetCode.OPERATING_ERROR``) and the trace string as ``message``;
        the exception is logged and not re-raised.
    """
    job_id = job_id or job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(
        f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        # Work on a copy so the submitted runtime conf stays untouched.
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type == "predict":
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ["model_id", "model_version"])
            model_lookup = dict(
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            # get inference dsl from pipeline model as job dsl
            pipeline_model = Tracker(job_id=job_id,
                                     **model_lookup).get_pipeline_model()
            train_runtime_conf = json_loads(pipeline_model.train_runtime_conf)
            if not model_utils.check_if_deployed(**model_lookup):
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            dsl = json_loads(pipeline_model.inference_dsl)
        else:
            # generate job model info
            if schedule_utils.get_conf_version(runtime_conf) != 2:
                raise Exception(
                    "only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(
                runtime_conf["role"])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        # The submitting party is both the initiator and the local party.
        job.f_initiator_role = job.f_role = job_initiator["role"]
        job.f_initiator_party_id = job.f_party_id = job_initiator["party_id"]

        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            party_id=job.f_initiator_party_id,
            dsl=dsl,
            runtime_conf=runtime_conf,
            runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        initiator_role_parties = runtime_conf["role"][job.f_initiator_role]
        if job.f_initiator_party_id not in initiator_role_parties:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)

        # create common parameters on initiator
        JobController.create_common_job_parameters(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()

        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        if common_job_parameters.inheritance_info:
            job.f_inheritance_status = JobInheritanceStatus.WAITING
        else:
            job.f_inheritance_status = JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            inherited_job_id = job.f_inheritance_info.get("job_id")
            inheritance_jobs = JobSaver.query_job(
                job_id=inherited_job_id,
                role=job_initiator["role"],
                party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(
                job_id=inherited_job_id,
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                only_latest=True)
            job_utils.check_job_inheritance_parameters(
                job, inheritance_jobs, inheritance_tasks)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            # Mark the job as failed and sync that status before giving up.
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        # Per role and party, the names of components flagged need_run.
        need_run_components = {
            role: {
                party: [
                    name
                    for name, value in
                    party_response["data"]["components"].items()
                    if value["need_run"] is True
                ]
                for party, party_response in response[role].items()
            }
            for role in response
        }
        if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
            # create the task holder in db to record information of all
            # participants in the initiator for scheduling
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    if (role == job.f_initiator_role
                            and party_id == job.f_initiator_party_id):
                        continue
                    party_components = need_run_components[role][party_id]
                    if party_components:
                        JobController.initialize_tasks(
                            job_id=job_id,
                            role=role,
                            party_id=party_id,
                            run_on_this_party=False,
                            initiator_role=job.f_initiator_role,
                            initiator_party_id=job.f_initiator_party_id,
                            job_parameters=common_job_parameters,
                            dsl_parser=dsl_parser,
                            components=party_components)
        job.f_status = JobStatus.WAITING
        status_code, response = FederatedScheduler.sync_job_status(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("set job to waiting status failed")

        schedule_logger(job_id).info(
            f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
        )
        logs_directory = job_utils.get_job_log_directory(job_id)
        model_info = {
            "model_id": common_job_parameters.model_id,
            "model_version": common_job_parameters.model_version,
        }
        board_url = job_utils.get_board_url(job_id, job_initiator["role"],
                                            job_initiator["party_id"])
        # The removed-parameter check runs on the conf exactly as submitted.
        warn_parameter = JobRuntimeConfigAdapter(
            submit_job_conf.runtime_conf).check_removed_parameter()
        if warn_parameter:
            message = f"[WARN]{warn_parameter} is removed,it does not take effect!"
        else:
            message = "success"
        submit_result.update({
            "code": RetCode.SUCCESS,
            "message": message,
            "model_info": model_info,
            "logs_directory": logs_directory,
            "board_url": board_url,
        })
        submit_result.update(path_dict)
    except Exception as e:
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result