def update_job_progress(job_id, dag, current_task_id):
    """Build a ``Job`` object describing the current progress of a job.

    Progress is the number of successful tasks (``success_task_count``)
    divided by the number of entries in the DAG dependency's
    ``component_list``, expressed as a percentage.

    :param job_id: id of the job whose successful tasks are counted
    :param dag: object exposing ``get_dependency()``
    :param current_task_id: id stored (as a one-element JSON list) in
        ``f_current_tasks``
    :return: a new ``Job`` instance; this function does not save it
    """
    total = len(dag.get_dependency()['component_list'])
    finished = success_task_count(job_id=job_id)

    progress_record = Job()
    percentage = float(finished) / total * 100
    progress_record.f_progress = percentage
    progress_record.f_update_time = current_timestamp()
    progress_record.f_current_tasks = json_dumps([current_task_id])
    return progress_record
def save_job_info(self, role, party_id, job_info, create=False):
    """Insert or update this job's ``Job`` row for one role/party.

    The row is looked up by ``self.job_id``, ``role`` and ``party_id``.
    Nothing is written when:

    * the row exists and its status is ``JobStatus.TIMEOUT``;
    * the row does not exist and ``create`` is false;
    * ``job_info`` carries ``f_status`` while the row is already
      ``COMPLETE`` or ``FAILED``;
    * ``job_info`` reports ``FAILED``/``TIMEOUT`` for a row that has
      neither an end time nor a start time.

    ``job_info`` is modified in place: end time, elapsed time, update time
    and the ``job_end`` tag may be added to it.

    :param role: role of the party the row belongs to
    :param party_id: id of the party the row belongs to
    :param job_info: mapping of ``Job`` field names to new values
    :param create: insert a new row when none exists yet
    :return: always ``None``
    """
    with DB.connection_context():
        schedule_logger(self.job_id).info('save {} {} job: {}'.format(
            role, party_id, job_info))
        jobs = Job.select().where(Job.f_job_id == self.job_id,
                                  Job.f_role == role,
                                  Job.f_party_id == party_id)
        is_insert = True
        if jobs:
            job = jobs[0]
            is_insert = False
            if job.f_status == JobStatus.TIMEOUT:
                return None
        elif create:
            job = Job()
            job.f_create_time = current_timestamp()
        else:
            return None
        job.f_job_id = self.job_id
        job.f_role = role
        job.f_party_id = party_id
        if 'f_status' in job_info:
            if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]:
                # Termination status cannot be updated
                # TODO:
                return
            if (job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT]) \
                    and (not job.f_end_time):
                if not job.f_start_time:
                    return
                job_info['f_end_time'] = current_timestamp()
                job_info['f_elapsed'] = job_info['f_end_time'] - job.f_start_time
                job_info['f_update_time'] = current_timestamp()
            if job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT,
                                        JobStatus.CANCELED, JobStatus.COMPLETE]:
                job_info['f_tag'] = 'job_end'
        update_fields = []
        for k, v in job_info.items():
            try:
                if k in ['f_job_id', 'f_role', 'f_party_id'] \
                        or v == getattr(Job, k).default:
                    continue
                setattr(job, k, v)
                update_fields.append(getattr(Job, k))
            except Exception as e:
                # Still best-effort (e.g. keys that are not Job fields are
                # skipped), but no longer a bare ``except`` so that
                # KeyboardInterrupt/SystemExit propagate, and the skipped key
                # leaves a trace in the job log.
                schedule_logger(self.job_id).debug(
                    'skip job field {}: {}'.format(k, e))
        if is_insert:
            job.save(force_insert=True)
        else:
            # Only write the columns that were actually touched above.
            job.save(only=update_fields)
def submit_job(job_data):
    """Create a job from a submitted DSL/runtime conf and queue it.

    For non-``predict`` jobs a model id/version is generated and stored in
    the job parameters. For ``predict`` jobs the DSL and the training
    runtime conf are read from the ``pipeline`` output model identified by
    the submitted ``model_id``/``model_version``.

    :param job_data: mapping with optional ``job_dsl`` and
        ``job_runtime_conf`` entries
    :return: tuple ``(job_id, job_dsl_path, job_runtime_conf_path,
        model_info, board_url)`` where ``model_info`` is a dict with
        ``model_id`` and ``model_version``
    """
    job_id = generate_job_id()
    schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))

    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    parameters = job_runtime_conf['job_parameters']
    initiator = job_runtime_conf['initiator']

    if parameters.get('job_type', '') == 'predict':
        detect_utils.check_config(parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracking(job_id=job_id,
                           role=initiator['role'],
                           party_id=initiator['party_id'],
                           model_id=parameters['model_id'],
                           model_version=parameters['model_version'])
        pipeline = tracker.get_output_model('pipeline')['Pipeline']
        job_dsl = json_loads(pipeline.inference_dsl)
        train_runtime_conf = json_loads(pipeline.train_runtime_conf)
    else:
        # generate job model info
        party_key = dtable_utils.all_party_key(job_runtime_conf['role'])
        parameters['model_id'] = '#'.join([party_key, 'model'])
        parameters['model_version'] = job_id
        train_runtime_conf = {}

    job_dsl_path, job_runtime_conf_path = save_job_conf(
        job_id=job_id, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf)

    job_record = Job()
    job_record.f_job_id = job_id
    job_record.f_roles = json_dumps(job_runtime_conf['role'])
    job_record.f_work_mode = parameters['work_mode']
    job_record.f_initiator_party_id = initiator['party_id']
    job_record.f_dsl = json_dumps(job_dsl)
    job_record.f_runtime_conf = json_dumps(job_runtime_conf)
    job_record.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job_record.f_run_ip = ''
    job_record.f_status = JobStatus.WAITING
    job_record.f_progress = 0
    job_record.f_create_time = current_timestamp()

    # save job info
    TaskScheduler.distribute_job(job=job_record,
                                 roles=job_runtime_conf['role'],
                                 job_initiator=initiator)

    # push into queue
    RuntimeConfig.JOB_QUEUE.put_event({
        'job_id': job_id,
        "initiator_role": initiator['role'],
        "initiator_party_id": initiator['party_id'],
    })

    success_message = 'submit job successfully, job id is {}, model id is {}'.format(
        job_record.f_job_id, parameters['model_id'])
    schedule_logger.info(success_message)

    board_url = BOARD_DASHBOARD_URL.format(job_id, initiator['role'], initiator['party_id'])
    model_info = {'model_id': parameters['model_id'],
                  'model_version': parameters['model_version']}
    return job_id, job_dsl_path, job_runtime_conf_path, model_info, board_url
def save_job_info(self, role, party_id, job_info, create=False):
    """Insert or update this job's ``Job`` row for one role/party.

    The row is looked up by ``self.job_id``, ``role`` and ``party_id``.
    Nothing is written when the existing row has status
    ``JobStatus.TIMEOUT``, or when no row exists and ``create`` is false.

    When ``job_info`` reports ``FAILED`` for a row without an end time, the
    end time and update time are stamped and, if a start time is known, the
    elapsed time is computed. Every other entry of ``job_info`` (except the
    identifying fields and values equal to the field default) is then copied
    onto the row.

    :param role: role of the party the row belongs to
    :param party_id: id of the party the row belongs to
    :param job_info: mapping of ``Job`` field names to new values
    :param create: insert a new row when none exists yet
    :return: always ``None``
    """
    with DB.connection_context():
        stat_logger.info('save {} {} job: {}'.format(
            role, party_id, job_info))
        jobs = Job.select().where(Job.f_job_id == self.job_id,
                                  Job.f_role == role,
                                  Job.f_party_id == party_id)
        is_insert = True
        if jobs:
            job = jobs[0]
            is_insert = False
            if job.f_status == JobStatus.TIMEOUT:
                return None
        elif create:
            job = Job()
            job.f_create_time = current_timestamp()
        else:
            return None
        job.f_job_id = self.job_id
        job.f_role = role
        job.f_party_id = party_id
        if 'f_status' in job_info:
            if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]:
                # Termination status cannot be updated
                # TODO: not enforced yet, the update below still goes through
                pass
            if job_info['f_status'] == JobStatus.FAILED and not job.f_end_time:
                job.f_end_time = current_timestamp()
                # A job can fail before it ever started; subtracting a missing
                # start time would raise TypeError and lose the status update.
                if job.f_start_time is not None:
                    job.f_elapsed = job.f_end_time - job.f_start_time
                job.f_update_time = current_timestamp()
        for k, v in job_info.items():
            try:
                if k in ['f_job_id', 'f_role', 'f_party_id'] \
                        or v == getattr(Job, k).default:
                    continue
                setattr(job, k, v)
            except Exception as e:
                # Still best-effort (e.g. keys that are not Job fields are
                # skipped), but no longer a bare ``except`` so that
                # KeyboardInterrupt/SystemExit propagate, and the skipped key
                # leaves a trace in the log.
                stat_logger.debug('skip job field {}: {}'.format(k, e))
        if is_insert:
            job.save(force_insert=True)
        else:
            job.save()
def save_job_info(self, role, party_id, job_info, create=False):
    """Insert or update this job's ``Job`` row for one role/party.

    The row is looked up by ``self.job_id``, ``role`` and ``party_id``. If
    it does not exist it is created only when ``create`` is true; otherwise
    nothing is written. Entries of ``job_info`` are copied onto the row,
    skipping the identifying fields and values equal to the field default.

    :param role: role of the party the row belongs to
    :param party_id: id of the party the row belongs to
    :param job_info: mapping of ``Job`` field names to new values
    :param create: insert a new row when none exists yet
    :return: always ``None``
    """
    identity_fields = ('f_job_id', 'f_role', 'f_party_id')

    with DB.connection_context():
        stat_logger.info('save {} {} job: {}'.format(role, party_id, job_info))
        matches = Job.select().where(Job.f_job_id == self.job_id,
                                     Job.f_role == role,
                                     Job.f_party_id == party_id)
        if matches:
            record = matches[0]
            needs_insert = False
        elif create:
            record = Job()
            record.f_create_time = current_timestamp()
            needs_insert = True
        else:
            return None

        record.f_job_id = self.job_id
        record.f_role = role
        record.f_party_id = party_id

        if 'f_status' in job_info and record.f_status in [
                JobStatus.SUCCESS, JobStatus.FAILED,
                JobStatus.PARTIAL, JobStatus.DELETED]:
            # Termination status cannot be updated
            # TODO: currently a no-op, the update below is still applied
            pass

        for field_name, new_value in job_info.items():
            if field_name in identity_fields:
                continue
            if new_value == getattr(Job, field_name).default:
                continue
            setattr(record, field_name, new_value)

        if needs_insert:
            record.save(force_insert=True)
        else:
            record.save()
def run_job(job_id, initiator_role, initiator_party_id):
    """Run the root components of a job and publish the resulting status.

    The job configuration is loaded for the initiator, the job is reported
    as ``RUNNING`` to all roles, and each root component of the DAG is run
    through ``TaskScheduler.run_component`` until one of them fails. The
    final status is ``PARTIAL`` when both successes and failures were seen,
    ``SUCCESS`` when only successes were seen and ``FAILED`` otherwise. The
    status is then synchronised and the job is finished.

    :param job_id: id of the job to run
    :param initiator_role: role used to load the job configuration
    :param initiator_party_id: party id used to load the job configuration
    :return: ``False`` when the runtime conf has no initiator, else ``None``
    """
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl,
                             runtime_conf=job_runtime_conf,
                             train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False
    storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)

    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        job_info=job.to_json())

    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger.info('job {} root components is {}'.format(
        job.f_job_id, [component.get_name() for component in components]))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(
                job_id, job_runtime_conf, job_parameters, job_initiator,
                job_args, dag, component)
        except Exception as e:
            # Log with traceback (was an info-level message only) so the
            # cause of a failed component can be diagnosed from the log.
            schedule_logger.exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        if not run_status:
            # Stop at the first failed root component.
            break

    if len(top_level_task_status) == 2:
        job.f_status = JobStatus.PARTIAL
    elif True in top_level_task_status:
        job.f_status = JobStatus.SUCCESS
    else:
        job.f_status = JobStatus.FAILED
    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.SUCCESS:
        job.f_progress = 100
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        job_info=job.to_json())
    TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    schedule_logger.info('job {} finished, status is {}'.format(
        job.f_job_id, job.f_status))
def run_job(job_id, initiator_role, initiator_party_id):
    """Run the root components of a job under a timeout timer.

    A ``Timer`` calling ``TaskScheduler.job_handler`` is started with the
    timeout obtained from ``job_utils.get_timeout`` and cancelled at the end
    of a normal run. The job is reported as ``RUNNING``, its root components
    are run until one fails, and the final status (``COMPLETE`` only when
    every executed component succeeded, ``FAILED`` otherwise) is
    synchronised after ``TaskScheduler.finish_job``. A failure in
    ``finish_job`` also marks the job ``FAILED``; failed jobs are stopped
    through ``TaskScheduler.stop``.

    :param job_id: id of the job to run
    :param initiator_role: role used to load the job configuration
    :param initiator_party_id: party id used to load the job configuration
    :return: ``False`` when the runtime conf has no initiator, else ``None``
    """
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl,
                             runtime_conf=job_runtime_conf,
                             train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False

    timeout = job_utils.get_timeout(job_id, job_parameters.get("timeout", None),
                                    job_runtime_conf, job_dsl)
    watchdog = Timer(timeout, TaskScheduler.job_handler, [job_id])
    watchdog.start()

    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()

    def publish_status():
        # Push the current state of ``job`` to every role of the job.
        TaskScheduler.sync_job_status(job_id=job_id,
                                      roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job.to_json())

    publish_status()

    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger(job_id).info(
        'job {} root components is {}'.format(
            job.f_job_id, [component.get_name() for component in components], None))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(
                job_id, job_runtime_conf, job_parameters, job_initiator,
                job_args, dag, component)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        if not run_status:
            break

    # Two distinct outcomes mean at least one failure; otherwise the run is
    # complete only if a success was recorded.
    mixed_outcome = len(top_level_task_status) == 2
    if not mixed_outcome and True in top_level_task_status:
        job.f_status = JobStatus.COMPLETE
    else:
        job.f_status = JobStatus.FAILED

    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.COMPLETE:
        job.f_progress = 100
    job.f_update_time = current_timestamp()

    try:
        TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    except Exception as e:
        schedule_logger(job_id).exception(e)
        job.f_status = JobStatus.FAILED

    if job.f_status == JobStatus.FAILED:
        TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)

    try:
        publish_status()
    except Exception as e:
        schedule_logger(job_id).exception(e)
        schedule_logger(job_id).warning('job {} sync status failed'.format(job.f_job_id))

    schedule_logger(job_id).info(
        'job {} finished, status is {}'.format(job.f_job_id, job.f_status))
    watchdog.cancel()
def submit_job(job_data):
    """Create a job from a submitted DSL/runtime conf and queue it.

    For non-``predict`` jobs a model id/version is generated and stored in
    the job parameters. For ``predict`` jobs the DSL and the training
    runtime conf are read from the ``pipeline`` output model identified by
    the submitted ``model_id``/``model_version``. The configuration is saved,
    the initiator is checked against the declared roles, the DSL is parsed,
    and the job is distributed and pushed into the job queue.

    :param job_data: mapping with optional ``job_dsl`` and
        ``job_runtime_conf`` entries
    :return: tuple ``(job_id, job_dsl_path, job_runtime_conf_path,
        logs_directory, model_info, board_url)``
    :raises Exception: when the initiator party id is not listed under the
        initiator role, or when the job cannot be pushed into the queue
    """
    job_id = generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        job_parameters['model_id'] = '#'.join(
            [dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        job_tracker = Tracking(job_id=job_id,
                               role=job_initiator['role'],
                               party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = save_job_conf(job_id=job_id,
                              job_dsl=job_dsl,
                              job_runtime_conf=job_runtime_conf,
                              train_runtime_conf=train_runtime_conf,
                              pipeline_dsl=None)

    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()

    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))

    # Parsing is done for its validation effect only; the parser is not kept.
    get_job_dsl_parser(dsl=job_dsl,
                       runtime_conf=job_runtime_conf,
                       train_runtime_conf=train_runtime_conf)

    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

    # push into queue
    job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
    try:
        RuntimeConfig.JOB_QUEUE.put_event(job_event)
    except Exception as e:
        # Chain the original error so the root cause stays attached to the
        # exception seen by the caller.
        raise Exception('push job into queue failed') from e

    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']}, board_url
def submit(cls, job_data, job_id=None):
    """Submit a job: validate it, create it on all parties and report paths.

    For non-``predict`` jobs a model id/version is generated. For
    ``predict`` jobs the training runtime conf and inference DSL are read
    from the ``pipeline`` output model, which must already be deployed.

    :param job_data: mapping with optional ``job_dsl`` and
        ``job_runtime_conf`` entries
    :param job_id: id to use; one is generated when it is falsy
    :return: dict with ``job_id``, ``model_info``, ``logs_directory`` and
        ``board_url``, updated with the mapping returned by
        ``job_utils.save_job_conf``
    :raises Exception: when the predict model is not deployed, when the
        initiator party id is not listed under the initiator role, or when
        ``FederatedScheduler.create_job`` does not succeed
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info(
        'submit job, job_id {}, body {}'.format(job_id, job_data))

    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_job_runtime_conf(job_runtime_conf)
    authentication_utils.check_constraint(job_runtime_conf, job_dsl)

    job_initiator = job_runtime_conf['initiator']
    conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
    common_job_parameters = conf_adapter.get_common_parameters()

    if common_job_parameters.job_type == 'predict':
        # check predict job parameters
        detect_utils.check_config(common_job_parameters.to_dict(),
                                  ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id,
                          role=job_initiator['role'],
                          party_id=job_initiator['party_id'],
                          model_id=common_job_parameters.model_id,
                          model_version=common_job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        train_runtime_conf = json_loads(
            pipeline_model['Pipeline'].train_runtime_conf)
        deployed = model_utils.check_if_deployed(
            role=job_initiator['role'],
            party_id=job_initiator['party_id'],
            model_id=common_job_parameters.model_id,
            model_version=common_job_parameters.model_version)
        if not deployed:
            raise Exception(
                f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
            )
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
    else:
        # generate job model info
        common_job_parameters.model_id = model_utils.gen_model_id(
            job_runtime_conf['role'])
        common_job_parameters.model_version = job_id
        train_runtime_conf = {}

    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = common_job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_role = job_initiator['role']
    job.f_party_id = job_initiator['party_id']

    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        role=job.f_initiator_role,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        job_runtime_conf_on_party={},
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)

    parties_of_initiator_role = job_runtime_conf['role'][job.f_initiator_role]
    if job.f_initiator_party_id not in parties_of_initiator_role:
        schedule_logger(job_id).info(
            "initiator party id error:{}".format(job.f_initiator_party_id))
        raise Exception(
            "initiator party id error {}".format(job.f_initiator_party_id))

    # create common parameters on initiator
    JobController.backend_compatibility(job_parameters=common_job_parameters)
    JobController.adapt_job_parameters(role=job.f_initiator_role,
                                       job_parameters=common_job_parameters,
                                       create_initiator_baseline=True)

    job.f_runtime_conf = conf_adapter.update_common_parameters(
        common_parameters=common_job_parameters)
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf,
        train_runtime_conf=job.f_train_runtime_conf)

    # initiator runtime conf as template
    job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
    job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()

    if common_job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job.f_roles.items():
            for party_id in party_ids:
                is_initiator = (role == job.f_initiator_role
                                and party_id == job.f_initiator_party_id)
                if is_initiator:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job.f_initiator_role,
                                               job.f_initiator_party_id,
                                               common_job_parameters,
                                               dsl_parser)

    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        job.f_status = JobStatus.FAILED
        job.f_tag = "submit_failed"
        FederatedScheduler.sync_job_status(job=job)
        raise Exception("create job failed", response)

    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(
            job.f_job_id, common_job_parameters.model_id))

    logs_directory = job_utils.get_job_log_directory(job_id)
    model_info = {
        "model_id": common_job_parameters.model_id,
        "model_version": common_job_parameters.model_version,
    }
    board_url = job_utils.get_board_url(job_id, job_initiator['role'],
                                        job_initiator['party_id'])
    submit_result = {
        "job_id": job_id,
        "model_info": model_info,
        "logs_directory": logs_directory,
        "board_url": board_url,
    }
    submit_result.update(path_dict)
    return submit_result
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
    """Submit a job and return a result dict instead of raising.

    Any exception raised while preparing or creating the job is caught: the
    result then carries ``RetCode.OPERATING_ERROR`` and the trace string as
    ``message``. On success the result carries ``RetCode.SUCCESS``, the model
    info, the logs directory, the board url and the entries returned by
    ``job_utils.save_job_conf``.

    :param submit_job_conf: configuration exposing ``dsl``, ``runtime_conf``
        and ``to_dict()``
    :param job_id: id to use; one is generated when it is falsy
    :return: dict that always contains ``job_id``
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(
        f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        # Work on a copy; the submitted runtime conf is read again below.
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type == "predict":
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ["model_id", "model_version"])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(
                job_id=job_id,
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_pipeline_model()
            train_runtime_conf = json_loads(pipeline_model.train_runtime_conf)
            deployed = model_utils.check_if_deployed(
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            if not deployed:
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            dsl = json_loads(pipeline_model.inference_dsl)
        else:
            # generate job model info
            conf_version = schedule_utils.get_conf_version(runtime_conf)
            if conf_version != 2:
                raise Exception("only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(
                runtime_conf["role"])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        job.f_initiator_role = job_initiator["role"]
        job.f_initiator_party_id = job_initiator["party_id"]
        job.f_role = job_initiator["role"]
        job.f_party_id = job_initiator["party_id"]

        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            party_id=job.f_initiator_party_id,
            dsl=dsl,
            runtime_conf=runtime_conf,
            runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        if job.f_initiator_party_id not in runtime_conf["role"][job.f_initiator_role]:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)

        # create common parameters on initiator
        JobController.create_common_job_parameters(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()

        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        if common_job_parameters.inheritance_info:
            job.f_inheritance_status = JobInheritanceStatus.WAITING
        else:
            job.f_inheritance_status = JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            inheritance_jobs = JobSaver.query_job(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(
                job_id=job.f_inheritance_info.get("job_id"),
                role=job_initiator["role"],
                party_id=job_initiator["party_id"],
                only_latest=True)
            job_utils.check_job_inheritance_parameters(
                job, inheritance_jobs, inheritance_tasks)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        # Per role and party, the names of the components flagged need_run
        # in the create_job response.
        need_run_components = {
            role: {
                party: [
                    name
                    for name, value in reply["data"]["components"].items()
                    if value["need_run"] is True
                ]
                for party, reply in response[role].items()
            }
            for role in response
        }

        if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
            # create the task holder in db to record information of all participants in the initiator for scheduling
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    is_initiator = (role == job.f_initiator_role
                                    and party_id == job.f_initiator_party_id)
                    if is_initiator:
                        continue
                    party_components = need_run_components[role][party_id]
                    if not party_components:
                        continue
                    JobController.initialize_tasks(
                        job_id=job_id,
                        role=role,
                        party_id=party_id,
                        run_on_this_party=False,
                        initiator_role=job.f_initiator_role,
                        initiator_party_id=job.f_initiator_party_id,
                        job_parameters=common_job_parameters,
                        dsl_parser=dsl_parser,
                        components=need_run_components[role][party_id])

        job.f_status = JobStatus.WAITING
        status_code, response = FederatedScheduler.sync_job_status(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("set job to waiting status failed")

        schedule_logger(job_id).info(
            f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}"
        )
        logs_directory = job_utils.get_job_log_directory(job_id)
        result = {
            "code": RetCode.SUCCESS,
            "message": "success",
            "model_info": {
                "model_id": common_job_parameters.model_id,
                "model_version": common_job_parameters.model_version,
            },
            "logs_directory": logs_directory,
            "board_url": job_utils.get_board_url(job_id,
                                                 job_initiator["role"],
                                                 job_initiator["party_id"]),
        }
        # The removed-parameter check runs on the conf as submitted, not on
        # the working copy.
        original_adapter = JobRuntimeConfigAdapter(submit_job_conf.runtime_conf)
        warn_parameter = original_adapter.check_removed_parameter()
        if warn_parameter:
            result["message"] = f"[WARN]{warn_parameter} is removed,it does not take effect!"
        submit_result.update(result)
        submit_result.update(path_dict)
    except Exception as e:
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result
def submit(cls, job_data, job_id=None):
    """Submit a job: create it on all parties and push it into the job queue.

    For non-``predict`` jobs a model id/version is generated. For
    ``predict`` jobs the training runtime conf is read from the ``pipeline``
    output model, and the inference DSL is taken from it only when no DSL
    was submitted.

    :param job_data: mapping with optional ``job_dsl`` and
        ``job_runtime_conf`` entries
    :param job_id: id to use; one is generated when it is falsy
    :return: tuple ``(job_id, job_dsl_path, job_runtime_conf_path,
        logs_directory, model_info, board_url)``
    :raises Exception: when the initiator party id is not listed under the
        initiator role, when ``FederatedScheduler.create_job`` does not
        succeed, or when the job cannot be pushed into the queue
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_initiator = job_runtime_conf['initiator']
    job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
    cls.backend_compatibility(job_parameters=job_parameters)

    job_utils.check_job_runtime_conf(job_runtime_conf)
    if job_parameters.job_type != 'predict':
        # generate job model info
        job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id,
                          role=job_initiator['role'],
                          party_id=job_initiator['party_id'],
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        if not job_dsl:
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)

    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']

    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))

    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                   runtime_conf=job_runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)

    cls.adapt_job_parameters(job_parameters=job_parameters)

    # update runtime conf
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf

    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception("create job failed: {}".format(response))

    if job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job_runtime_conf["role"].items():
            for party_id in party_ids:
                if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job_initiator, job_parameters, dsl_parser)

    # push into queue
    try:
        JobQueue.create_event(job_id=job_id,
                              initiator_role=initiator_role,
                              initiator_party_id=initiator_party_id)
    except Exception as e:
        # Chain the original error so the root cause (and its traceback)
        # stays attached to the exception seen by the caller.
        raise Exception(f'push job into queue failed:\n{e}') from e

    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id))
    # The endpoint template is first prefixed with host/port, then filled
    # with the job coordinates by the second format call.
    board_url = "http://{}:{}{}".format(
        ServiceUtils.get_item("fateboard", "host"),
        ServiceUtils.get_item("fateboard", "port"),
        FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = job_utils.get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url