def finish_job(job_id, job_runtime_conf):
    """Tell every participating party to save its pipeline model and clean up job resources.

    :param job_id: identifier of the finished job
    :param job_runtime_conf: runtime conf dict with 'job_parameters', 'initiator' and 'role'
    """
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    model_id_base64 = base64_encode(job_parameters['model_id'])
    model_version_base64 = base64_encode(job_parameters['model_version'])
    src_party_id = job_initiator['party_id']
    work_mode = job_parameters['work_mode']
    for role, party_ids in job_runtime_conf['role'].items():
        for party_id in party_ids:
            # ask the party to persist its pipeline model
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/{}/{}/save/pipeline'.format(
                              API_VERSION, job_id, role, party_id,
                              model_id_base64, model_version_base64),
                          src_party_id=src_party_id,
                          dest_party_id=party_id,
                          json_body={},
                          work_mode=work_mode)
            # ask the party to clean its job runtime resources
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/clean'.format(
                              API_VERSION, job_id, role, party_id),
                          src_party_id=src_party_id,
                          dest_party_id=party_id,
                          json_body={},
                          work_mode=work_mode)
def sync_job_status(job_id, roles, work_mode, initiator_party_id, initiator_role, job_info, sync_failed=False):
    """Push job_info (with f_role / f_party_id filled in per destination) to every party.

    :param job_id: job identifier
    :param roles: mapping of role name -> list of party ids
    :param work_mode: work mode forwarded to federated_api
    :param initiator_party_id: source party id of the status push
    :param initiator_role: source role of the status push
    :param job_info: dict sent as the request body; mutated in place per destination
    :param sync_failed: when True, per-party errors are ignored (best-effort sync)
    :raises Exception: the original federated_api error, when sync_failed is False
    """
    for role, party_ids in roles.items():
        job_info['f_role'] = role
        for party_id in party_ids:
            job_info['f_party_id'] = party_id
            try:
                federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/schedule/{}/{}/{}/status'.format(
                                  API_VERSION, job_id, role, party_id),
                              src_party_id=initiator_party_id,
                              dest_party_id=party_id,
                              src_role=initiator_role,
                              json_body=job_info,
                              work_mode=work_mode)
            except Exception:
                # Fix: re-raise the original exception instead of wrapping it in a
                # bare Exception(e), which discarded the type and traceback.
                if not sync_failed:
                    raise
def run_do(self):
    """Detect running tasks whose worker process has died, then kill and finish those jobs."""
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        for task in running_tasks:
            try:
                # a vanished worker pid means the task crashed without reporting
                if not job_utils.check_job_process(int(task.f_run_pid)):
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'.format(
                            task.f_job_id, task.f_component_name, task.f_role,
                            task.f_party_id, task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if not jobs:
                continue
            initiator_party_id = jobs[0].f_initiator_party_id
            job_work_mode = jobs[0].f_work_mode
            if len(jobs) > 1:
                # i am initiator
                my_party_id = initiator_party_id
            else:
                my_party_id = jobs[0].f_party_id
                initiator_party_id = jobs[0].f_initiator_party_id
            # route the kill command through the initiator party
            api_utils.federated_api(job_id=job_id,
                                    method='POST',
                                    endpoint='/{}/job/stop'.format(API_VERSION),
                                    src_party_id=my_party_id,
                                    dest_party_id=initiator_party_id,
                                    src_role=None,
                                    json_body={'job_id': job_id, 'operate': 'kill'},
                                    work_mode=job_work_mode)
            TaskScheduler.finish_job(job_id=job_id,
                                     job_runtime_conf=json_loads(jobs[0].f_runtime_conf),
                                     stop=True)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def sync_job_status(job_id, roles, work_mode, initiator_party_id, job_info):
    """Broadcast job_info to every (role, party) in the job, tagging it per destination."""
    for role, party_ids in roles.items():
        job_info['f_role'] = role
        for party_id in party_ids:
            # job_info is reused and re-tagged for each destination party
            job_info['f_party_id'] = party_id
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/status'.format(
                              API_VERSION, job_id, role, party_id),
                          src_party_id=initiator_party_id,
                          dest_party_id=party_id,
                          json_body=job_info,
                          work_mode=work_mode)
def sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id, task_info):
    """Report task_info to the local party and the initiator (set-deduplicated)."""
    # the set collapses the two targets into one when this party IS the initiator
    for dest_party_id in {party_id, initiator_party_id}:
        if party_id != initiator_party_id and dest_party_id == initiator_party_id:
            # do not pass the process id to the initiator
            task_info['f_run_ip'] = ''
        federated_api(job_id=job_id,
                      method='POST',
                      endpoint='/{}/schedule/{}/{}/{}/{}/{}/status'.format(
                          API_VERSION, job_id, component_name, task_id, role, party_id),
                      src_party_id=party_id,
                      dest_party_id=dest_party_id,
                      json_body=task_info,
                      work_mode=RuntimeConfig.WORK_MODE)
def start_stop(job_id, operate=None):
    """Forward a stop (or other operate) command for job_id to the initiator's stop endpoint.

    :raises Exception: when no job with job_id can be found at all
    """
    schedule_logger(job_id).info('get {} job {} command'.format('stop', job_id))
    # prefer the initiator's record; fall back to any party's record
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    if not jobs:
        jobs = job_utils.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(
            'send {} job stop command failed, no find this job'.format(job_id))
        raise Exception('can not found job: {}'.format(job_id))
    job_info = {'job_id': job_id}
    if operate:
        job_info['operate'] = operate
    job_work_mode = jobs[0].f_work_mode
    initiator_party_id = jobs[0].f_initiator_party_id
    # the initiator party itself performs the actual stop
    return federated_api(job_id=job_id,
                         method='POST',
                         endpoint='/{}/job/stop/do'.format(API_VERSION),
                         src_party_id=initiator_party_id,
                         dest_party_id=initiator_party_id,
                         src_role=None,
                         json_body=job_info,
                         work_mode=job_work_mode)
def check_job(job_id, roles, work_mode, initiator_party_id, initiator_role, job_info, way='check'):
    """Ask every (role, party) to run the given check; return False on the first refusal.

    :param way: schedule sub-endpoint to call, defaults to 'check'
    :return: True when every party answers with a retcode other than 101,
             False when any party returns 101 or sends a malformed response
    """
    for role, party_ids in roles.items():
        job_info['f_role'] = role
        for party_id in party_ids:
            job_info['f_party_id'] = party_id
            response = federated_api(job_id=job_id,
                                     method='POST',
                                     endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                                         API_VERSION, job_id, role, party_id, way),
                                     src_party_id=initiator_party_id,
                                     dest_party_id=party_id,
                                     src_role=initiator_role,
                                     json_body=job_info,
                                     work_mode=work_mode)
            try:
                if response['retcode'] == 101:
                    return False
            except (KeyError, TypeError):
                # Fix: was a bare `except:` that also swallowed KeyboardInterrupt /
                # SystemExit; only a missing key or non-dict response means "failed".
                return False
    return True
def start_proxy(role):
    """Forward the incoming request to another party (or the marketplace proxy) for `role`."""
    request_config = request.json or request.form.to_dict()
    _job_id = f"{role}_forward"
    if role in ['marketplace']:
        # marketplace traffic goes through the dedicated proxy API
        response = proxy_api(role, _job_id, request_config)
        return jsonify(response)
    headers = request.headers
    json_body = {}
    if request_config.get('header') and request_config.get("body"):
        # caller already supplied a header/body envelope: forward it as-is,
        # merging the transport headers into its header section
        src_party_id = request_config.get('header').get('src_party_id')
        dest_party_id = request_config.get('header').get('dest_party_id')
        json_body = request_config
        if headers:
            json_body['header'].update(headers)
    else:
        # build the envelope ourselves from the transport headers
        src_party_id = headers.get('src_party_id')
        dest_party_id = headers.get('dest_party_id')
        json_body["header"] = request.headers
        json_body["body"] = request_config
    response = federated_api(job_id=_job_id,
                             method='POST',
                             endpoint='/forward/{}/do'.format(role),
                             src_party_id=src_party_id,
                             dest_party_id=dest_party_id,
                             src_role=None,
                             json_body=json_body,
                             federated_mode=FederatedMode.MULTIPLE)
    return jsonify(response)
def report_task_to_initiator(cls, task: Task):
    """Report this party's task state to the initiator, retrying a fixed number of times.

    :param task: task record to report
    :return: True when the initiator acknowledged the report, False otherwise
             (including when this party IS the initiator and no report is needed)
    """
    if task.f_role == task.f_initiator_role or task.f_party_id == task.f_initiator_party_id:
        # nothing to report when we are (or share a party id with) the initiator
        return False
    exception = None
    for _attempt in range(DEFAULT_FEDERATED_COMMAND_TRYS):
        try:
            response = federated_api(
                job_id=task.f_job_id,
                method='POST',
                endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                    task.f_job_id, task.f_component_name, task.f_task_id,
                    task.f_task_version, task.f_role, task.f_party_id),
                src_party_id=task.f_party_id,
                dest_party_id=task.f_initiator_party_id,
                src_role=task.f_role,
                json_body=task.to_human_model_dict(
                    only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                federated_mode=task.f_federated_mode)
        except Exception as e:
            exception = e
            continue
        if response["retcode"] != RetCode.SUCCESS:
            exception = Exception(response["retmsg"])
        else:
            return True
    else:
        # every attempt either raised or returned a non-success retcode
        schedule_logger(job_id=task.f_job_id).error(
            f"report task to initiator error: {exception}")
        return False
def load_model():
    """Distribute a model-load command to every non-arbiter party and aggregate results."""
    request_config = request.json
    _job_id = generate_job_id()
    initiator_party_id = request_config['initiator']['party_id']
    initiator_role = request_config['initiator']['role']
    publish_model.generate_publish_model_info(request_config)
    load_status = True
    load_status_info = {}
    load_status_msg = 'success'
    for role_name, role_partys in request_config.get("role").items():
        if role_name == 'arbiter':
            # the arbiter holds no model to load
            continue
        load_status_info[role_name] = load_status_info.get(role_name, {})
        for _party_id in role_partys:
            request_config['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/{}/model/load/do'.format(API_VERSION),
                    src_party_id=initiator_party_id,
                    dest_party_id=_party_id,
                    src_role=initiator_role,
                    json_body=request_config,
                    work_mode=request_config['job_parameters']['work_mode'])
                load_status_info[role_name][_party_id] = response['retcode']
            except Exception as e:
                stat_logger.exception(e)
                load_status = False
                load_status_msg = 'failed'
                # 100 marks a transport-level failure for this party
                load_status_info[role_name][_party_id] = 100
    return get_json_result(job_id=_job_id,
                           retcode=(0 if load_status else 101),
                           retmsg=load_status_msg,
                           data=load_status_info)
def sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id, initiator_role, task_info, update=False):
    """Sync task_info to the local party and the initiator; on failure, retry once
    with the status forced to FAILED (update=True), then raise if that also fails.

    :raises Exception: when the update=True retry itself cannot reach a party
    """
    sync_success = True
    for dest_party_id in {party_id, initiator_party_id}:
        if party_id != initiator_party_id and dest_party_id == initiator_party_id:
            # do not pass the process id to the initiator
            task_info['f_run_ip'] = ''
        response = federated_api(job_id=job_id,
                                 method='POST',
                                 endpoint='/{}/schedule/{}/{}/{}/{}/{}/status'.format(
                                     API_VERSION, job_id, component_name,
                                     task_id, role, party_id),
                                 src_party_id=party_id,
                                 dest_party_id=dest_party_id,
                                 src_role=role,
                                 json_body=task_info,
                                 work_mode=RuntimeConfig.WORK_MODE)
        if response['retcode']:
            sync_success = False
            schedule_logger().exception(
                'job {} role {} party {} synchronize task status failed'.format(
                    job_id, role, party_id))
            break
    if not sync_success and not update:
        # best-effort second pass: mark the task failed and push that status
        task_info['f_status'] = TaskStatus.FAILED
        TaskExecutor.sync_task_status(job_id, component_name, task_id, role,
                                      party_id, initiator_party_id,
                                      initiator_role, task_info, update=True)
    if update:
        raise Exception(
            'job {} role {} party {} synchronize task status failed'.format(
                job_id, role, party_id))
def distribute_job(job, roles, job_initiator):
    """Create the job on every (role, party); on any failure, mark the job FAILED,
    sync that status to all parties, and abort.

    :raises Exception: when any party rejects the create request
    """
    for role, party_ids in roles.items():
        job.f_role = role
        for party_id in party_ids:
            job.f_party_id = party_id
            is_initiator = (role == job_initiator['role']
                            and party_id == job_initiator['party_id'])
            job.f_is_initiator = 1 if is_initiator else 0
            response_json = federated_api(job_id=job.f_job_id,
                                          method='POST',
                                          endpoint='/{}/schedule/{}/{}/{}/create'.format(
                                              API_VERSION, job.f_job_id, role, party_id),
                                          src_party_id=job_initiator['party_id'],
                                          dest_party_id=party_id,
                                          src_role=job_initiator['role'],
                                          json_body=job.to_json(),
                                          work_mode=job.f_work_mode)
            if not response_json["retcode"]:
                continue
            # a single party failing to create aborts the whole distribution
            job.f_status = JobStatus.FAILED
            TaskScheduler.sync_job_status(job_id=job.f_job_id,
                                          roles=roles,
                                          work_mode=job.f_work_mode,
                                          initiator_party_id=job_initiator['party_id'],
                                          initiator_role=job_initiator['role'],
                                          job_info=job.to_json())
            raise Exception(
                "an error occurred while creating the job: role {} party_id {}".format(
                    role, party_id) + "\n" + str(response_json["retmsg"]))
def distribute_job(job, roles, job_initiator):
    """Send a job-create request to every (role, party), tagging the initiator record."""
    for role, party_ids in roles.items():
        job.f_role = role
        for party_id in party_ids:
            job.f_party_id = party_id
            is_initiator = (role == job_initiator['role']
                            and party_id == job_initiator['party_id'])
            job.f_is_initiator = 1 if is_initiator else 0
            federated_api(job_id=job.f_job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/create'.format(
                              API_VERSION, job.f_job_id, role, party_id),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=party_id,
                          json_body=job.to_json(),
                          work_mode=job.f_work_mode)
def stop(job_id, end_status=JobStatus.FAILED, component_name=''):
    """Stop (kill) or cancel a job as the initiator, broadcasting to all parties.

    :param end_status: JobStatus.CANCELED selects cancel semantics; anything else kills
    :param component_name: optional single component to stop; empty means whole job
    :return: for cancel, whether any party accepted the cancel; otherwise None
    :raises Exception: when this party is not the initiator, or the job is unknown
    """
    is_cancel = (end_status == JobStatus.CANCELED)
    schedule_logger(job_id).info('get {} job {} {} command'.format(
        "cancel" if end_status == JobStatus.CANCELED else "stop", job_id, component_name))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    cancel_success = False
    if jobs:
        initiator_job = jobs[0]
        job_info = {'f_job_id': job_id, 'f_status': end_status}
        roles = json_loads(initiator_job.f_roles)
        job_work_mode = initiator_job.f_work_mode
        initiator_party_id = initiator_job.f_party_id
        # set status first
        if not component_name:
            TaskScheduler.sync_job_status(job_id=job_id,
                                          roles=roles,
                                          initiator_party_id=initiator_party_id,
                                          initiator_role=initiator_job.f_role,
                                          work_mode=job_work_mode,
                                          job_info=job_info)
        for role, party_ids in roles.items():
            for party_id in party_ids:
                action = "cancel" if is_cancel else "kill"
                response = federated_api(
                    job_id=job_id,
                    method='POST',
                    endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                        API_VERSION, job_id, role, party_id, action),
                    src_party_id=initiator_party_id,
                    dest_party_id=party_id,
                    src_role=initiator_job.f_role,
                    json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                                 'role': initiator_job.f_role},
                               'timeout': end_status == JobStatus.TIMEOUT,
                               'component_name': component_name},
                    work_mode=job_work_mode)
                if response['retcode'] == 0:
                    cancel_success = True
                    schedule_logger(job_id).info(
                        'send {} {} {} job {} {} command successfully'.format(
                            role, party_id, action, job_id, component_name))
                    # one accepted cancel is enough for this role's party list
                    if is_cancel:
                        break
                else:
                    schedule_logger(job_id).info(
                        'send {} {} {} job {} {} command failed: {}'.format(
                            role, party_id, action, job_id,
                            component_name, response['retmsg']))
        if is_cancel:
            return cancel_success
    else:
        jobs = job_utils.query_job(job_id=job_id)
        if jobs:
            raise Exception('Current role is not this job initiator')
        schedule_logger(job_id).info('send {} job {} {} command failed'.format(
            "cancel" if is_cancel else "kill", job_id, component_name))
        raise Exception('can not found job: {}'.format(job_id))
def task_command(cls, job, task, command, command_body=None, need_user=False):
    """Send a task-level command to every party that runs this task's component.

    :param job: job record providing runtime conf, initiator and federated mode
    :param task: task record the command targets
    :param command: command name appended to the /party endpoint
    :param command_body: optional JSON body; a user_id is injected per party when need_user
    :param need_user: when True, look up each destination's user id from job.f_user
    :return: aggregated federated response via cls.return_federated_response
    """
    federated_response = {}
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    # Fix: command_body defaults to None, but need_user=True assigned into it
    # (command_body["user_id"] = ...) and raised TypeError; normalize it once here.
    if command_body is None:
        command_body = {}
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    component = dsl_parser.get_component_info(component_name=task.f_component_name)
    component_parameters = component.get_role_parameters()
    for dest_role, parameters_on_partys in component_parameters.items():
        federated_response[dest_role] = {}
        for parameters_on_party in parameters_on_partys:
            dest_party_id = parameters_on_party.get('local', {}).get('party_id')
            try:
                if need_user:
                    command_body["user_id"] = job.f_user.get(
                        dest_role, {}).get(str(dest_party_id), "")
                    schedule_logger(job_id=job.f_job_id).info(
                        f'user:{job.f_user}, dest_role:{dest_role}, dest_party_id:{dest_party_id}')
                    schedule_logger(job_id=job.f_job_id).info(
                        f'command_body: {command_body}')
                response = federated_api(
                    job_id=task.f_job_id,
                    method='POST',
                    endpoint='/party/{}/{}/{}/{}/{}/{}/{}'.format(
                        task.f_job_id, task.f_component_name, task.f_task_id,
                        task.f_task_version, dest_role, dest_party_id, command),
                    src_party_id=job.f_initiator_party_id,
                    dest_party_id=dest_party_id,
                    src_role=job.f_initiator_role,
                    json_body=command_body if command_body else {},
                    federated_mode=job_parameters["federated_mode"])
                federated_response[dest_role][dest_party_id] = response
            except Exception as e:
                federated_response[dest_role][dest_party_id] = {
                    "retcode": RetCode.FEDERATED_ERROR,
                    "retmsg": "Federated schedule error, {}".format(str(e))
                }
            if federated_response[dest_role][dest_party_id]["retcode"]:
                schedule_logger(job_id=job.f_job_id).warning(
                    "an error occurred while {} the task to role {} party {}: \n{}".format(
                        command, dest_role, dest_party_id,
                        federated_response[dest_role][dest_party_id]["retmsg"]))
    return cls.return_federated_response(federated_response=federated_response)
def finish_job(job_id, job_runtime_conf, stop=False):
    """Finalize a job on every party: save pipelines (unless stopped) and clean resources.

    :param stop: when True the job was killed, so pipeline saving is skipped
    """
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    model_id_base64 = base64_encode(job_parameters['model_id'])
    model_version_base64 = base64_encode(job_parameters['model_version'])
    # flattened role/party lists passed along to the clean endpoint
    roles = ','.join(job_runtime_conf['role'].keys())
    party_ids = ','.join([','.join([str(j) for j in i])
                          for i in job_runtime_conf['role'].values()])
    src_party_id = job_initiator['party_id']
    src_role = job_initiator['role']
    work_mode = job_parameters['work_mode']
    for role, partys in job_runtime_conf['role'].items():
        for party_id in partys:
            if not stop:
                # only a normally-finished job persists its pipeline
                federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/schedule/{}/{}/{}/{}/{}/save/pipeline'.format(
                                  API_VERSION, job_id, role, party_id,
                                  model_id_base64, model_version_base64),
                              src_party_id=src_party_id,
                              dest_party_id=party_id,
                              src_role=src_role,
                              json_body={},
                              work_mode=work_mode)
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/schedule/{}/{}/{}/{}/{}/clean'.format(
                              API_VERSION, job_id, role, party_id, roles, party_ids),
                          src_party_id=src_party_id,
                          dest_party_id=party_id,
                          src_role=src_role,
                          json_body={},
                          work_mode=work_mode)
    # drop this job's dedicated logger
    schedule_logger(job_id, delete=True)
def job_command(cls, job, command, command_body=None, dest_only_initiator=False, specific_dest=None, order_federated=False):
    """Send a job-level command to the selected destination parties.

    :param dest_only_initiator: address only the initiator (via the /initiator API)
    :param specific_dest: explicit {role: [party_ids]} mapping overriding job roles
    :param order_federated: reorder destinations so the scheduler party goes first
    :return: aggregated federated response via cls.return_federated_response
    """
    federated_response = {}
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    if dest_only_initiator:
        dest_partys = [(job.f_initiator_role, [job.f_initiator_party_id])]
        api_type = "initiator"
    elif specific_dest:
        dest_partys = specific_dest.items()
        api_type = "party"
    else:
        dest_partys = job.f_roles.items()
        api_type = "party"
    if order_federated:
        dest_partys = schedule_utils.federated_order_reset(
            dest_partys,
            scheduler_partys_info=[(job.f_initiator_role, job.f_initiator_party_id)])
    for dest_role, dest_party_ids in dest_partys:
        federated_response[dest_role] = {}
        for dest_party_id in dest_party_ids:
            try:
                response = federated_api(
                    job_id=job.f_job_id,
                    method='POST',
                    endpoint='/{}/{}/{}/{}/{}'.format(
                        api_type, job.f_job_id, dest_role, dest_party_id, command),
                    src_party_id=job.f_initiator_party_id,
                    dest_party_id=dest_party_id,
                    src_role=job.f_initiator_role,
                    json_body=command_body if command_body else {},
                    federated_mode=job_parameters["federated_mode"])
                federated_response[dest_role][dest_party_id] = response
            except Exception as e:
                schedule_logger(job_id=job.f_job_id).exception(e)
                federated_response[dest_role][dest_party_id] = {
                    "retcode": RetCode.FEDERATED_ERROR,
                    "retmsg": "Federated schedule error, {}".format(e)
                }
            if federated_response[dest_role][dest_party_id]["retcode"]:
                schedule_logger(job_id=job.f_job_id).warning(
                    "an error occurred while {} the job to role {} party {}: \n{}".format(
                        command, dest_role, dest_party_id,
                        federated_response[dest_role][dest_party_id]["retmsg"]))
    return cls.return_federated_response(federated_response=federated_response)
def align_task_parameters(job_id, job_parameters, job_initiator, job_args, component, task_id):
    """Query each party's input-data partition count and keep the minimum, so the
    task can run with a partition count every party supports.

    :return: dict with 'input_data_partition' (0 means no partition info gathered)
    :raises Exception: when a party fails to answer the input-args query
    """
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    extra_task_parameters = {'input_data_partition': 0}  # Large integers are not used
    for role, partys_parameters in parameters.items():
        for party_index, party_parameters in enumerate(partys_parameters):
            party_job_args = (job_args[role][party_index]['args']
                              if role in job_args else {})
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            if not job_parameters.get('align_task_input_data_partition',
                                      ALIGN_TASK_INPUT_DATA_PARTITION_SWITCH):
                continue
            response = federated_api(
                job_id=job_id,
                method='POST',
                endpoint='/{}/schedule/{}/{}/{}/{}/{}/input/args'.format(
                    API_VERSION, job_id, component_name, task_id, role, dest_party_id),
                src_party_id=job_initiator['party_id'],
                dest_party_id=dest_party_id,
                src_role=job_initiator['role'],
                json_body={'job_parameters': job_parameters,
                           'job_args': party_job_args,
                           'input': component.get_input()},
                work_mode=job_parameters['work_mode'])
            if response['retcode'] != 0:
                raise Exception(
                    'job {} component {} align task parameters failed on {} {}'.format(
                        job_id, component_name, role, dest_party_id))
            for input_data in response.get('data', {}).get('data', {}).values():
                for data_table_info in input_data.values():
                    if not data_table_info:
                        continue
                    partitions = data_table_info['partitions']
                    current = extra_task_parameters['input_data_partition']
                    # track the smallest non-zero partition count seen so far
                    if current == 0 or partitions < current:
                        extra_task_parameters['input_data_partition'] = partitions
    return extra_task_parameters
def tracker_command(cls, job, request_data, command, json_body=None):
    """Forward a tracker command to the party named in request_data and return its response."""
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    endpoint = '/tracker/{}/{}/{}/{}/{}'.format(
        request_data['job_id'], request_data['component_name'],
        request_data['role'], request_data['party_id'], command)
    return federated_api(job_id=str(request_data['job_id']),
                         method='POST',
                         endpoint=endpoint,
                         src_party_id=job.f_party_id,
                         dest_party_id=request_data['party_id'],
                         src_role=job.f_role,
                         json_body=json_body if json_body else {},
                         federated_mode=job_parameters["federated_mode"])
def start_proxy(role):
    """Forward the incoming request to another party (or the marketplace proxy) for `role`.

    :param role: forwarding target role; 'marketplace' uses the dedicated proxy API
    :return: Flask JSON response wrapping the downstream answer
    """
    request_config = request.json or request.form.to_dict()
    _job_id = job_utils.generate_job_id()
    if role in ['marketplace']:
        response = proxy_api(role, _job_id, request_config)
    else:
        # Fix: request_config.get('header') returned None when the caller omitted
        # the header section, so .get(...) raised AttributeError; default to {}.
        header = request_config.get('header') or {}
        response = federated_api(job_id=_job_id,
                                 method='POST',
                                 endpoint='/forward/{}/do'.format(role),
                                 src_party_id=header.get('src_party_id'),
                                 dest_party_id=header.get('dest_party_id'),
                                 src_role=None,
                                 json_body=request_config,
                                 federated_mode=FederatedMode.MULTIPLE)
    return jsonify(response)
def stop_job(job_id):
    """As initiator, mark the job FAILED on all parties then broadcast a kill command.

    :raises Exception: when this party holds no initiator record for job_id
    """
    schedule_logger.info('get stop job {} command'.format(job_id))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    if not jobs:
        schedule_logger.info('send stop job {} command failed'.format(job_id))
        raise Exception('can not found job: {}'.format(job_id))
    initiator_job = jobs[0]
    job_info = {'f_job_id': job_id, 'f_status': JobStatus.FAILED}
    roles = json_loads(initiator_job.f_roles)
    job_work_mode = initiator_job.f_work_mode
    initiator_party_id = initiator_job.f_party_id
    # set status first
    TaskScheduler.sync_job_status(job_id=job_id,
                                  roles=roles,
                                  initiator_party_id=initiator_party_id,
                                  work_mode=job_work_mode,
                                  job_info=job_info)
    for role, party_ids in roles.items():
        for party_id in party_ids:
            response = federated_api(
                job_id=job_id,
                method='POST',
                endpoint='/{}/job/{}/{}/{}/kill'.format(
                    API_VERSION, job_id, role, party_id),
                src_party_id=initiator_party_id,
                dest_party_id=party_id,
                json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                             'role': initiator_job.f_role}},
                work_mode=job_work_mode)
            if response['retcode'] == 0:
                schedule_logger.info(
                    'send {} {} kill job {} command successfully'.format(
                        role, party_id, job_id))
            else:
                schedule_logger.info(
                    'send {} {} kill job {} command failed: {}'.format(
                        role, party_id, job_id, response['retmsg']))
def federated_command(cls, job_id, src_role, src_party_id, dest_role, dest_party_id, endpoint, body, federated_mode, federated_response):
    """Send one federated command, record the per-party response, and log timing.

    Errors never propagate: transport failures are converted into a
    FEDERATED_ERROR response stored in federated_response[dest_role][dest_party_id].
    """
    st = base_utils.current_timestamp()
    log_msg = f"sending {endpoint} federated command"
    schedule_logger(job_id).info(start_log(msg=log_msg))
    try:
        response = federated_api(job_id=job_id,
                                 method='POST',
                                 endpoint=endpoint,
                                 src_role=src_role,
                                 src_party_id=src_party_id,
                                 dest_party_id=dest_party_id,
                                 json_body=body if body else {},
                                 federated_mode=federated_mode)
    except Exception as e:
        schedule_logger(job_id=job_id).exception(e)
        response = {"retcode": RetCode.FEDERATED_ERROR,
                    "retmsg": "Federated schedule error, {}".format(e)}
    if response["retcode"] != RetCode.SUCCESS:
        # NOT_EFFECTIVE / RUNNING are expected transient states, log softer
        if response["retcode"] in [RetCode.NOT_EFFECTIVE, RetCode.RUNNING]:
            schedule_logger(job_id).warning(
                warning_log(msg=log_msg, role=dest_role, party_id=dest_party_id))
        else:
            schedule_logger(job_id).error(
                failed_log(msg=log_msg, role=dest_role,
                           party_id=dest_party_id, detail=response["retmsg"]))
    federated_response[dest_role][dest_party_id] = response
    et = base_utils.current_timestamp()
    schedule_logger(job_id).info(f"{log_msg} use {et - st} ms")
def run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component):
    """Run one DAG component on every party, sync progress, then recurse into the
    next components whose dependencies are satisfied.

    :return: True when this component and all downstream components succeeded,
             False on any failure (the job is stopped with FAILED/TIMEOUT status)
    :raises Exception: when a party refuses the run request as 'not authorized'
    """
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    module_name = component.get_module()
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component_name)
    schedule_logger(job_id).info('job {} run component {}'.format(job_id, component_name))
    for role, partys_parameters in parameters.items():
        for party_index in range(len(partys_parameters)):
            party_parameters = partys_parameters[party_index]
            if role in job_args:
                party_job_args = job_args[role][party_index]['args']
            else:
                party_job_args = {}
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            response = federated_api(
                job_id=job_id,
                method='POST',
                endpoint='/{}/schedule/{}/{}/{}/{}/{}/run'.format(
                    API_VERSION, job_id, component_name, task_id, role, dest_party_id),
                src_party_id=job_initiator['party_id'],
                dest_party_id=dest_party_id,
                src_role=job_initiator['role'],
                json_body={'job_parameters': job_parameters,
                           'job_initiator': job_initiator,
                           'job_args': party_job_args,
                           'parameters': party_parameters,
                           'module_name': module_name,
                           'input': component.get_input(),
                           'output': component.get_output(),
                           'job_server': {'ip': get_lan_ip(),
                                          'http_port': RuntimeConfig.HTTP_PORT}},
                work_mode=job_parameters['work_mode'])
            if response['retcode']:
                if 'not authorized' in response['retmsg']:
                    raise Exception('run component {} not authorized'.format(component_name))
    component_task_status = TaskScheduler.check_task_status(job_id=job_id, component=component)
    job_status = TaskScheduler.check_job_status(job_id)
    # Fix: was `component_task_status == None` / if-else assignment; use `is None`
    # and a direct boolean expression.
    task_success = bool(component_task_status and job_status)
    schedule_logger(job_id).info(
        'job {} component {} run {}'.format(job_id, component_name,
                                            'success' if task_success else 'failed'))
    # update progress
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        initiator_role=job_initiator['role'],
        job_info=job_utils.update_job_progress(job_id=job_id, dag=dag,
                                               current_task_id=task_id).to_json())
    TaskScheduler.stop(job_id=job_id, component_name=component_name)
    if task_success:
        next_components = dag.get_next_components(component_name)
        schedule_logger(job_id).info(
            'job {} component {} next components is {}'.format(
                job_id, component_name,
                [next_component.get_name() for next_component in next_components]))
        for next_component in next_components:
            try:
                schedule_logger(job_id).info(
                    'job {} check component {} dependencies status'.format(
                        job_id, next_component.get_name()))
                dependencies_status = TaskScheduler.check_dependencies(
                    job_id=job_id, dag=dag, component=next_component)
                job_status = TaskScheduler.check_job_status(job_id)
                schedule_logger(job_id).info(
                    'job {} component {} dependencies status is {}, job status is {}'.format(
                        job_id, next_component.get_name(), dependencies_status, job_status))
                if dependencies_status and job_status:
                    run_status = TaskScheduler.run_component(
                        job_id, job_runtime_conf, job_parameters,
                        job_initiator, job_args, dag, next_component)
                else:
                    run_status = False
            except Exception as e:
                schedule_logger(job_id).exception(e)
                run_status = False
            if not run_status:
                return False
        return True
    else:
        # None means the status check timed out rather than reporting failure
        if component_task_status is None:
            end_status = JobStatus.TIMEOUT
        else:
            end_status = JobStatus.FAILED
        TaskScheduler.stop(job_id=job_id, end_status=end_status)
        return False
def deploy():
    """Validate the requested model, then distribute a federated deploy command to
    every train-time party and aggregate the per-party results.

    :raises Exception: when the model is unknown, or no initiator entry with
        FATE version >= 1.5.0 can be found in the stored model info
    """
    request_data = request.json
    check_config(request_data, ['model_id', 'model_version'])
    model_id = request_data.get("model_id")
    model_version = request_data.get("model_version")
    retcode, retmsg, model_info = model_utils.query_model_info_from_file(
        model_id=model_id, model_version=model_version, to_dict=True)
    if not model_info:
        raise Exception(
            f'Deploy model failed, no model {model_id} {model_version} found.')
    # locate the initiator's entry among the stored per-party model records
    for key, value in model_info.items():
        version_check = model_utils.compare_version(value.get('f_fate_version'), '1.5.0')
        if version_check == 'lt':
            # models older than 1.5.0 cannot be deployed
            continue
        init_role = key.split('/')[-2].split('#')[0]
        init_party_id = key.split('/')[-2].split('#')[1]
        model_init_role = value.get('f_initiator_role') if value.get(
            'f_initiator_role') else value.get(
            'f_train_runtime_conf', {}).get('initiator', {}).get('role', '')
        model_init_party_id = value.get('f_initiator_role_party_id') if value.get(
            'f_initiator_role_party_id') else value.get(
            'f_train_runtime_conf', {}).get('initiator', {}).get('party_id', '')
        if (init_role == model_init_role) and (init_party_id == str(model_init_party_id)):
            break
    else:
        raise Exception(
            "Deploy model failed, can not found model of initiator role or the fate version of model is older than 1.5.0"
        )
    # distribute federated deploy task
    _job_id = job_utils.generate_job_id()
    request_data['child_model_version'] = _job_id
    initiator_party_id = model_init_party_id
    initiator_role = model_init_role
    request_data['initiator'] = {'role': initiator_role, 'party_id': initiator_party_id}
    deploy_status = True
    deploy_status_info = {'detail': {}}
    deploy_status_msg = 'success'
    for role_name, role_partys in value.get("f_train_runtime_conf", {}).get('role', {}).items():
        if role_name not in ['arbiter', 'host', 'guest']:
            continue
        deploy_status_info[role_name] = deploy_status_info.get(role_name, {})
        deploy_status_info['detail'][role_name] = {}
        adapter = JobRuntimeConfigAdapter(value.get("f_train_runtime_conf", {}))
        work_mode = adapter.get_job_work_mode()
        for _party_id in role_partys:
            request_data['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/model/deploy/do',
                    src_party_id=initiator_party_id,
                    dest_party_id=_party_id,
                    src_role=initiator_role,
                    json_body=request_data,
                    federated_mode=FederatedMode.MULTIPLE if work_mode else FederatedMode.SINGLE)
                deploy_status_info[role_name][_party_id] = response['retcode']
                detail = {_party_id: {'retcode': response['retcode'],
                                      'retmsg': response['retmsg']}}
                deploy_status_info['detail'][role_name].update(detail)
                if response['retcode']:
                    deploy_status = False
                    deploy_status_msg = 'failed'
            except Exception as e:
                stat_logger.exception(e)
                deploy_status = False
                deploy_status_msg = 'failed'
                # 100 marks a transport-level failure for this party
                deploy_status_info[role_name][_party_id] = 100
    deploy_status_info['model_id'] = request_data['model_id']
    deploy_status_info['model_version'] = _job_id
    return get_json_result(retcode=(0 if deploy_status else 101),
                           retmsg=deploy_status_msg,
                           data=deploy_status_info)
def load_model():
    """Resolve model info (optionally from a job_id), pick a federated mode, and
    distribute a model-load command to every non-arbiter party.

    :return: JSON result aggregating each party's load retcode/retmsg
    """
    request_config = request.json
    if request_config.get('job_id', None):
        # resolve initiator/roles/parameters from the stored model of that job
        retcode, retmsg, res_data = model_utils.query_model_info(
            model_version=request_config['job_id'], role='guest')
        if not res_data:
            return get_json_result(
                retcode=101,
                retmsg="model with version {} can not be found in database. "
                       "Please check if the model version is valid.".format(
                           request_config.get('job_id')))
        model_info = res_data[0]
        request_config['initiator'] = {}
        request_config['initiator']['party_id'] = str(
            model_info.get('f_initiator_party_id'))
        request_config['initiator']['role'] = model_info.get('f_initiator_role')
        runtime_conf = model_info.get('f_runtime_conf', {}) if model_info.get(
            'f_runtime_conf', {}) else model_info.get('f_train_runtime_conf', {})
        adapter = JobRuntimeConfigAdapter(runtime_conf)
        job_parameters = adapter.get_common_parameters().to_dict()
        request_config['job_parameters'] = job_parameters if job_parameters \
            else model_info.get('f_train_runtime_conf', {}).get('job_parameters')
        roles = runtime_conf.get('role')
        request_config['role'] = roles if roles \
            else model_info.get('f_train_runtime_conf', {}).get('role')
        # party ids must be strings in the distributed request
        for key, value in request_config['role'].items():
            for i, v in enumerate(value):
                value[i] = str(v)
        request_config.pop('job_id')
    _job_id = job_utils.generate_job_id()
    initiator_party_id = request_config['initiator']['party_id']
    initiator_role = request_config['initiator']['role']
    publish_model.generate_publish_model_info(request_config)
    load_status = True
    load_status_info = {'detail': {}}
    load_status_msg = 'success'
    if "federated_mode" not in request_config['job_parameters']:
        # derive the federated mode from the legacy work_mode field
        if request_config["job_parameters"]["work_mode"] == WorkMode.STANDALONE:
            request_config['job_parameters']["federated_mode"] = FederatedMode.SINGLE
        elif request_config["job_parameters"]["work_mode"] == WorkMode.CLUSTER:
            request_config['job_parameters']["federated_mode"] = FederatedMode.MULTIPLE
    for role_name, role_partys in request_config.get("role").items():
        if role_name == 'arbiter':
            # the arbiter holds no model to load
            continue
        load_status_info[role_name] = load_status_info.get(role_name, {})
        load_status_info['detail'][role_name] = {}
        for _party_id in role_partys:
            request_config['local'] = {'role': role_name, 'party_id': _party_id}
            try:
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/model/load/do',
                    src_party_id=initiator_party_id,
                    dest_party_id=_party_id,
                    src_role=initiator_role,
                    json_body=request_config,
                    federated_mode=request_config['job_parameters']['federated_mode'])
                load_status_info[role_name][_party_id] = response['retcode']
                detail = {_party_id: {'retcode': response['retcode'],
                                      'retmsg': response['retmsg']}}
                load_status_info['detail'][role_name].update(detail)
                if response['retcode']:
                    load_status = False
                    load_status_msg = 'failed'
            except Exception as e:
                stat_logger.exception(e)
                load_status = False
                load_status_msg = 'failed'
                # 100 marks a transport-level failure for this party
                load_status_info[role_name][_party_id] = 100
    return get_json_result(job_id=_job_id,
                           retcode=(0 if load_status else 101),
                           retmsg=load_status_msg,
                           data=load_status_info)
def migrate_model_process():
    """Handle a model-migration request: map old party ids to new ones and fan out /model/migrate/do.

    Reads the request body from ``request.json`` (presumably the Flask request
    proxy — import not visible in this chunk; TODO confirm).

    Flow:
      1. Generate a job id; default ``unify_model_version`` to it when absent.
      2. Validate required config keys via ``check_config``.
      3. Abort (retcode 100) if migrate roles equal the previous roles — there
         would be nothing to migrate. ``compare_roles`` may also raise; any
         exception is returned as retcode 100 with its message.
      4. Build, per role/party, a mapping from the original party_id to the
         migrate party_id (positional: offset i in ``role`` pairs with offset i
         in ``migrate_role``).
      5. POST ``/model/migrate/do`` to every party listed in ``execute_party``
         and aggregate per-party retcodes.

    Returns:
        A JSON response (``get_json_result``) with retcode 0 if every call
        succeeded, else 101.
    """
    request_config = request.json
    _job_id = job_utils.generate_job_id()
    initiator_party_id = request_config['migrate_initiator']['party_id']
    initiator_role = request_config['migrate_initiator']['role']
    # All migrated parties share one target model version; default to the job id.
    if not request_config.get("unify_model_version"):
        request_config["unify_model_version"] = _job_id
    # Aggregated status across all executing parties.
    migrate_status = True
    migrate_status_info = {}
    migrate_status_msg = 'success'
    migrate_status_info['detail'] = {}
    require_arguments = [
        "migrate_initiator", "role", "migrate_role", "model_id",
        "model_version", "execute_party", "job_parameters"
    ]
    check_config(request_config, require_arguments)
    try:
        # Identical role/party layout means migration is a no-op; refuse it.
        if compare_roles(request_config.get("migrate_role"),
                         request_config.get("role")):
            return get_json_result(
                retcode=100,
                retmsg=
                "The config of previous roles is the same with that of migrate roles. "
                "There is no need to migrate model. Migration process aborting."
            )
    except Exception as e:
        return get_json_result(retcode=100, retmsg=str(e))
    local_template = {"role": "", "party_id": "", "migrate_party_id": ""}
    # res_dict[role][old_party_id] -> {"role", "party_id" (old), "migrate_party_id" (new)}
    res_dict = {}
    for role_name, role_partys in request_config.get("migrate_role").items():
        for offset, party_id in enumerate(role_partys):
            local_res = deepcopy(local_template)
            local_res["role"] = role_name
            # Old and new party lists are paired by position (offset).
            local_res["party_id"] = request_config.get("role").get(
                role_name)[offset]
            local_res["migrate_party_id"] = party_id
            if not res_dict.get(role_name):
                res_dict[role_name] = {}
            res_dict[role_name][local_res["party_id"]] = local_res
    for role_name, role_partys in request_config.get("execute_party").items():
        migrate_status_info[role_name] = migrate_status_info.get(role_name, {})
        migrate_status_info['detail'][role_name] = {}
        for party_id in role_partys:
            # NOTE: request_config is shared and mutated per iteration; 'local'
            # carries this party's old->new id mapping to the remote side.
            request_config["local"] = res_dict.get(role_name).get(party_id)
            try:
                response = federated_api(
                    job_id=_job_id,
                    method='POST',
                    endpoint='/model/migrate/do',
                    src_party_id=initiator_party_id,
                    dest_party_id=party_id,
                    src_role=initiator_role,
                    json_body=request_config,
                    federated_mode=request_config['job_parameters']
                    ['federated_mode'])
                migrate_status_info[role_name][party_id] = response['retcode']
                detail = {party_id: {}}
                detail[party_id]['retcode'] = response['retcode']
                detail[party_id]['retmsg'] = response['retmsg']
                migrate_status_info['detail'][role_name].update(detail)
            except Exception as e:
                # Transport/remote error: log it, mark this party with code 100.
                stat_logger.exception(e)
                migrate_status = False
                migrate_status_msg = 'failed'
                migrate_status_info[role_name][party_id] = 100
    return get_json_result(job_id=_job_id,
                           retcode=(0 if migrate_status else 101),
                           retmsg=migrate_status_msg,
                           data=migrate_status_info)
def run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                  job_args, dag, component):
    """Run one DAG component on every participating party, then recurse into its successors.

    For each role/party in the component's role parameters, POSTs a
    ``.../run`` task request via ``federated_api``, waits on
    ``TaskScheduler.check_task_status``, syncs job progress to all parties,
    and — on success — recursively runs each next component whose
    dependencies are satisfied.

    Args:
        job_id: Job identifier.
        job_runtime_conf: Full runtime conf; its ``role`` map is used for
            progress sync.
        job_parameters: Job parameters dict; ``work_mode`` is read here.
        job_initiator: Initiator dict with ``party_id`` and ``role``
            (assumed present, mirroring the conf's ``initiator`` section —
            TODO confirm against callers).
        job_args: Per-role args; indexed by the party's position.
        dag: DAG object providing ``get_next_components``.
        component: Component object (parameters, name, module, I/O).

    Returns:
        True if this component and all downstream components succeeded,
        False otherwise.
    """
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    module_name = component.get_module()
    task_id = job_utils.generate_task_id(job_id=job_id,
                                         component_name=component_name)
    schedule_logger.info('job {} run component {}'.format(
        job_id, component_name))
    # Dispatch the task to every party of every role.
    for role, partys_parameters in parameters.items():
        for party_index in range(len(partys_parameters)):
            party_parameters = partys_parameters[party_index]
            if role in job_args:
                party_job_args = job_args[role][party_index]['args']
            else:
                party_job_args = {}
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/job/{}/{}/{}/{}/{}/run'.format(
                              API_VERSION, job_id, component_name, task_id,
                              role, dest_party_id),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=dest_party_id,
                          json_body={
                              'job_parameters': job_parameters,
                              'job_initiator': job_initiator,
                              'job_args': party_job_args,
                              'parameters': party_parameters,
                              'module_name': module_name,
                              'input': component.get_input(),
                              'output': component.get_output()
                          },
                          work_mode=job_parameters['work_mode'])
    task_success = bool(
        TaskScheduler.check_task_status(job_id=job_id, component=component))
    schedule_logger.info('job {} component {} run {}'.format(
        job_id, component_name, 'success' if task_success else 'failed'))
    # update progress
    # BUG FIX: sync_job_status requires initiator_role (no default in its
    # signature); the previous call omitted it and would raise TypeError.
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        initiator_role=job_initiator['role'],
        job_info=job_utils.update_job_progress(
            job_id=job_id, dag=dag, current_task_id=task_id).to_json())
    if not task_success:
        return False
    next_components = dag.get_next_components(component_name)
    schedule_logger.info('job {} component {} next components is {}'.format(
        job_id, component_name,
        [next_component.get_name() for next_component in next_components]))
    for next_component in next_components:
        try:
            schedule_logger.info(
                'job {} check component {} dependencies status'.format(
                    job_id, next_component.get_name()))
            dependencies_status = TaskScheduler.check_dependencies(
                job_id=job_id, dag=dag, component=next_component)
            schedule_logger.info(
                'job {} component {} dependencies status is {}'.format(
                    job_id, next_component.get_name(), dependencies_status))
            if dependencies_status:
                run_status = TaskScheduler.run_component(
                    job_id, job_runtime_conf, job_parameters, job_initiator,
                    job_args, dag, next_component)
            else:
                run_status = False
        except Exception as e:
            # Log with traceback (was schedule_logger.info(e), which drops it),
            # consistent with the *.exception calls elsewhere in this file.
            schedule_logger.exception(e)
            run_status = False
        # Any failed downstream component fails the whole branch immediately.
        if not run_status:
            return False
    return True