def detect_running_task(cls):
    """Detect RUNNING tasks whose executor process has died and stop their jobs.

    Scans all RUNNING tasks (all versions), asks the computing engine whether
    each executor is still alive, re-checks the task status once after a short
    delay to avoid racing with a concurrent stop, and finally requests a FAILED
    stop for every job that still has a dead RUNNING task.
    """
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        running_tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING, only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            # Skip tasks that are not handled by this server.
            # NOTE(review): with this `and` chain, a task with no engine_conf
            # but a matching run_ip is NOT skipped, and the later
            # task.f_engine_conf.get(...) would raise on None (caught by the
            # inner except) — confirm whether `or` was intended.
            if not task.f_engine_conf and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST and not task.f_run_on_this_party:
                continue
            count += 1
            try:
                # Ask the task's computing engine whether the executor process still exists.
                process_exist = build_engine(task.f_engine_conf.get("computing_engine")).is_alive(task)
                if not process_exist:
                    msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                    detect_logger(job_id=task.f_job_id).info(
                        f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
                    # Wait and re-read the task so a concurrent, legitimate
                    # stop (task_controller.stop_task) is not double-handled.
                    time.sleep(3)
                    _tasks = JobSaver.query_task(task_id=task.f_task_id,
                                                 task_version=task.f_task_version,
                                                 role=task.f_role,
                                                 party_id=task.f_party_id)
                    if _tasks:
                        if _tasks[0].f_party_status == TaskStatus.RUNNING:
                            # Still RUNNING after the second check: executor truly died.
                            stop_job_ids.add(task.f_job_id)
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has been checked twice, try to stop job")
                        else:
                            detect_logger(task.f_job_id).info(
                                f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again")
                    else:
                        detect_logger(task.f_job_id).warning(f"{msg} can not found on db")
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))
            stop_jobs = set()
            for job_id in stop_job_ids:
                jobs = JobSaver.query_job(job_id=job_id)
                if jobs:
                    stop_jobs.add(jobs[0])
            # Dead executor process => mark the whole job FAILED.
            cls.request_stop_jobs(jobs=stop_jobs, stop_msg="task executor process abort", stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def upload_history():
    """Return info about successful `upload_0` tasks run on this party.

    Optional JSON body fields: "job_id" to restrict to one job, and a
    positive "limit" capping the number of (newest-first) entries returned.
    """
    req = request.json
    query_kwargs = {
        'component_name': 'upload_0',
        'status': StatusSet.SUCCESS,
        'run_on_this_party': True,
    }
    if req.get('job_id'):
        query_kwargs['job_id'] = req.get('job_id')
    tasks = JobSaver.query_task(**query_kwargs)
    limit = req.get('limit')
    # Newest first; keep only `limit` entries when a truthy limit is given.
    if limit:
        tasks = tasks[-1:-limit - 1:-1]
    else:
        tasks = tasks[-1::-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    return get_json_result(retcode=0, retmsg='success',
                           data=get_upload_info(jobs_run_conf=jobs_run_conf))
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    """Pull the task's party statuses from every participant and persist them.

    Used by the initiator to refresh status instead of waiting for pushes.
    Parties that cannot be reached are optionally forced to ``set_status``.
    """
    tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id,
                                             task_version=initiator_task.f_task_version)
    tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
    # Skip collection when all recorded statuses already agree on a single
    # value and none of them is RUNNING — nothing to refresh.
    if not len(tasks_status_on_all) > 1 and not TaskStatus.RUNNING in tasks_status_on_all:
        return
    status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(
            f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
    for _role in federated_response.keys():
        for _party_id, party_response in federated_response[_role].items():
            if party_response["retcode"] == RetCode.SUCCESS:
                # Party answered: persist its reported status and task info.
                JobSaver.update_task_status(task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
            elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                # Party unreachable: write RUNNING first, then set_status.
                # NOTE(review): presumably the intermediate RUNNING write makes
                # the transition to set_status a valid status change — confirm
                # against JobSaver.update_task_status's transition rules.
                tmp_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING
                }
                JobSaver.update_task_status(task_info=tmp_task_info)
                tmp_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=tmp_task_info)
def query_task():
    """Query tasks by the arbitrary filter fields given in the JSON body."""
    matched = JobSaver.query_task(**request.json)
    if matched:
        payload = [one.to_json() for one in matched]
        return get_json_result(retcode=0, retmsg='success', data=payload)
    return get_json_result(retcode=101, retmsg='find task failed')
def pipeline_dag_dependency(job_info):
    """Build the DAG dependency view for a job (or for a raw dsl/conf pair).

    :param job_info: dict with required "party_id" and "role", plus either
        "job_id" (load the stored job) or raw "job_dsl" /
        "job_runtime_conf" / "job_train_runtime_conf"
    :return: dependency dict from the dsl parser, extended with
        "component_need_run" (component name -> bool; empty when no job_id)
    :raises Exception: when the referenced job cannot be found
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                           runtime_conf=job.f_runtime_conf_on_party,
                                                           train_runtime_conf=job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"],
                                        party_id=job_info["party_id"],
                                        role=job_info["role"],
                                        only_latest=True)
            for task in tasks:
                # "need_run" defaults to True when the parameter is absent.
                need_run = task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                component_need_run[task.f_component_name] = need_run
        else:
            # No job id: parse the DAG directly from the submitted configs.
            dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                           runtime_conf=job_info.get('job_runtime_conf', {}),
                                                           train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
def component_output_data_download():
    """Stream a component's output data tables as a .tar.gz download.

    JSON body: job_id, component_name, role, party_id (required);
    optional "limit" (-1 = unlimited, 0 rejected) and "head" (include
    header line, default True).
    """
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(
            f'no found task, please check if the parameters are correct:{request_data}'
        )
    # Make the task's component provider importable before reading its output.
    import_component_output_depend(tasks[0].f_provider_info)
    try:
        output_tables_meta = get_component_output_tables_meta(task_data=request_data)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
    limit = request_data.get('limit', -1)
    if not output_tables_meta:
        return error_response(response_code=210, retmsg='no data')
    if limit == 0:
        return error_response(response_code=210, retmsg='limit is 0')
    tar_file_name = 'job_{}_{}_{}_{}_output_data.tar.gz'.format(
        request_data['job_id'], request_data['component_name'],
        request_data['role'], request_data['party_id'])
    return TableStorage.send_table(output_tables_meta, tar_file_name, limit=limit,
                                   need_head=request_data.get("head", True))
def report_task_to_initiator(cls, task_info):
    """Push this party's task status to the initiator when collect type is PUSH.

    :param task_info: dict with "task_id", "task_version", "role", "party_id"
    :raises RuntimeError: if no matching task record exists (previously an
        opaque IndexError from the unguarded ``tasks[0]``)
    """
    tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                task_version=task_info["task_version"],
                                role=task_info["role"],
                                party_id=task_info["party_id"])
    # Fail with a clear message instead of IndexError when the record is missing.
    if not tasks:
        raise RuntimeError(f"can not found task by {task_info}")
    if tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
        FederatedScheduler.report_task_to_initiator(task=tasks[0])
def load_tasks(cls, component_list, job_id, role, party_id):
    """Map component names to their latest task for the given job/role/party.

    Components without a matching task are omitted. When several tasks share
    a component name, the last one returned by the query wins (same as the
    original nested-loop behavior).

    :param component_list: component names to look up
    :return: dict of component name -> task, in component_list order
    """
    tasks = JobSaver.query_task(job_id=job_id, role=role, party_id=party_id, only_latest=True)
    # Index tasks by component name once instead of scanning the whole task
    # list for every component (was O(len(component_list) * len(tasks))).
    tasks_by_component = {task.f_component_name: task for task in tasks}
    return {cpn: tasks_by_component[cpn]
            for cpn in component_list if cpn in tasks_by_component}
def component_output_data():
    """Return a page of a component's output data plus headers and row totals.

    JSON body: job_id, component_name, role, party_id. Responds with one
    entry per output table; "total" is None for tables whose meta is falsy.
    :raises ValueError: when no matching task exists
    """
    request_data = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=request_data['job_id'],
                                component_name=request_data['component_name'],
                                role=request_data['role'],
                                party_id=request_data['party_id'])
    if not tasks:
        raise ValueError(
            f'no found task, please check if the parameters are correct:{request_data}'
        )
    # Make the task's component provider importable before reading its output.
    import_component_output_depend(tasks[0].f_provider_info)
    output_tables_meta = get_component_output_tables_meta(task_data=request_data)
    if not output_tables_meta:
        return get_json_result(retcode=0, retmsg='no data', data=[])
    output_data_list = []
    headers = []
    totals = []
    data_names = []
    for output_name, output_table_meta in output_tables_meta.items():
        output_data = []
        is_str = False
        extend_header = []
        # Initialize per-table: previously `total` was unbound when the first
        # meta was falsy (UnboundLocalError) and stale for later falsy metas.
        total = None
        if output_table_meta:
            for k, v in output_table_meta.get_part_of_data():
                data_line, is_str, extend_header = feature_utils.get_component_output_data_line(
                    src_key=k, src_value=v, schema=output_table_meta.get_schema())
                output_data.append(data_line)
            total = output_table_meta.get_count()
        output_data_list.append(output_data)
        data_names.append(output_name)
        totals.append(total)
        if output_data:
            header = get_component_output_data_schema(output_table_meta=output_table_meta,
                                                      is_str=is_str,
                                                      extend_header=extend_header)
            headers.append(header)
        else:
            headers.append(None)
    if len(output_data_list) == 1 and not output_data_list[0]:
        return get_json_result(retcode=0, retmsg='no data', data=[])
    return get_json_result(retcode=0, retmsg='success', data=output_data_list,
                           meta={'header': headers,
                                 'total': totals,
                                 'names': data_names})
def component_parameters():
    """Return the latest task's component parameters: module plus *Param entries."""
    request_data = request.json
    check_request_parameters(request_data)
    tasks = JobSaver.query_task(only_latest=True, **request_data)
    if not tasks:
        return get_json_result(retcode=101, retmsg='can not found this task')
    parameters = tasks[0].f_component_parameters
    # Keep only the module name and the per-component *Param sections.
    output_parameters = {'module': parameters.get('module', '')}
    output_parameters.update(
        {key: value for key, value in parameters.items() if key.endswith('Param')})
    return get_json_result(retcode=0, retmsg='success', data=output_parameters)
def collect_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    """Return the matching task as a human-readable dict, or None if absent."""
    matched = JobSaver.query_task(job_id=job_id,
                                  component_name=component_name,
                                  task_id=task_id,
                                  task_version=task_version,
                                  role=role,
                                  party_id=party_id)
    if not matched:
        return None
    return matched[0].to_human_model_dict(only_primary_with=cls.INITIATOR_COLLECT_FIELDS)
def start_clean_job(cls, **kwargs):
    """Clean up sessions, data tables and metric data for tasks matching kwargs.

    Each cleanup step is best-effort: a failure in one step is logged and the
    remaining steps / tasks still run.

    :raises Exception: if no task matches the query filters
    """
    tasks = JobSaver.query_task(**kwargs)
    if not tasks:
        raise Exception('no found task')
    for task in tasks:
        try:
            # clean session
            stat_logger.info('start {} {} {} {} session stop'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            start_session_stop(task)
            stat_logger.info('stop {} {} {} {} session success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            # Best-effort, but log instead of silently swallowing (was bare
            # `pass`), consistent with the other cleanup steps below.
            stat_logger.info('stop {} {} {} {} session failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
        try:
            # clean data table
            JobClean.clean_table(job_id=task.f_job_id,
                                 role=task.f_role,
                                 party_id=task.f_party_id,
                                 component_name=task.f_component_name)
        except Exception as e:
            stat_logger.info('delete {} {} {} {} data table failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
        try:
            # clean metric data
            stat_logger.info('start delete {} {} {} {} metric data'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            delete_metric_data({'job_id': task.f_job_id,
                                'role': task.f_role,
                                'party_id': task.f_party_id,
                                'component_name': task.f_component_name})
            stat_logger.info('delete {} {} {} {} metric data success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            stat_logger.info('delete {} {} {} {} metric data failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
def stop_task(job_id, component_name, task_id, task_version, role, party_id, stop_status):
    """Stop all matching task records on this party and report overall success."""
    matched_tasks = JobSaver.query_task(job_id=job_id,
                                        task_id=task_id,
                                        task_version=task_version,
                                        role=role,
                                        party_id=int(party_id))
    overall_killed = True
    for one_task in matched_tasks:
        # Accumulate with bitwise-and, exactly like the original fold.
        overall_killed &= TaskController.stop_task(task=one_task, stop_status=stop_status)
    if overall_killed:
        return get_json_result(retcode=RetCode.SUCCESS, retmsg='success')
    return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg='failed')
def task_command(cls, job: Job, task: Task, command, command_body=None, parallel=False, need_user=False):
    """Send a task command to every party holding a version of this task.

    :param job: the job the task belongs to
    :param task: reference task; its task_id selects the per-party tasks
    :param command: federated command name, appended to the endpoint path
    :param command_body: optional request payload dict
    :param parallel: when True, issue per-party requests in threads
    :param need_user: when True, inject the per-party user id into the body
    :return: (status_code, per-party federated response)
    """
    msg = f"execute federated task {task.f_component_name} command({command})"
    federated_response = {}
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    # Fix: injecting user_id into the default None body raised TypeError.
    if need_user and command_body is None:
        command_body = {}
    tasks = JobSaver.query_task(task_id=task.f_task_id, only_latest=True)
    threads = []
    for party_task in tasks:
        dest_role, dest_party_id = party_task.f_role, party_task.f_party_id
        federated_response[dest_role] = federated_response.get(dest_role, {})
        endpoint = f"/party/{party_task.f_job_id}/{party_task.f_component_name}/{party_task.f_task_id}/{party_task.f_task_version}/{dest_role}/{dest_party_id}/{command}"
        if need_user:
            command_body["user_id"] = job.f_user.get(dest_role, {}).get(str(dest_party_id), "")
            schedule_logger(job.f_job_id).info(
                f'user:{job.f_user}, dest_role:{dest_role}, dest_party_id:{dest_party_id}')
            schedule_logger(job.f_job_id).info(f'command_body: {command_body}')
        args = (job.f_job_id, job.f_role, job.f_party_id, dest_role, dest_party_id,
                endpoint, command_body, job_parameters["federated_mode"], federated_response)
        if parallel:
            t = threading.Thread(target=cls.federated_command, args=args)
            threads.append(t)
            t.start()
        else:
            cls.federated_command(*args)
    # Wait for all parallel per-party requests before aggregating.
    for thread in threads:
        thread.join()
    status_code, response = cls.return_federated_response(federated_response=federated_response)
    # Log at a severity matching the aggregated federated outcome.
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job.f_job_id).info(successful_log(msg))
    elif status_code == FederatedSchedulingStatusCode.NOT_EFFECTIVE:
        schedule_logger(job.f_job_id).warning(warning_log(msg))
    elif status_code == FederatedSchedulingStatusCode.ERROR:
        schedule_logger(job.f_job_id).critical(failed_log(msg, detail=response))
    else:
        schedule_logger(job.f_job_id).error(failed_log(msg, detail=response))
    return status_code, response
def stop_job(cls, job, stop_status):
    """Stop every task of a job (newest first); mark the job only if all stopped."""
    job_tasks = JobSaver.query_task(job_id=job.f_job_id,
                                    role=job.f_role,
                                    party_id=job.f_party_id,
                                    reverse=True)
    all_killed = True
    details = {}
    for job_task in job_tasks:
        killed = TaskController.stop_task(task=job_task, stop_status=stop_status)
        details[job_task.f_task_id] = 'success' if killed else 'failed'
        all_killed = all_killed & killed
    if all_killed:
        # Only update the job status once every task has been stopped.
        status_info = job.to_human_model_dict(only_primary_with=["status"])
        status_info["status"] = stop_status
        JobController.update_job_status(status_info)
    return all_killed, details
def detect_running_task(cls):
    """Find RUNNING tasks launched by this server whose executor pid is gone
    and request cancellation of their jobs."""
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        candidates = JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                         run_on_this_party=True,
                                         run_ip=RuntimeConfig.JOB_SERVER_HOST,
                                         only_latest=False)
        dead_job_ids = set()
        for running_task in candidates:
            count += 1
            try:
                # A missing executor pid means the task process aborted.
                if not job_utils.check_job_process(int(running_task.f_run_pid)):
                    detect_logger(job_id=running_task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'.format(
                            running_task.f_job_id,
                            running_task.f_task_id,
                            running_task.f_task_version,
                            running_task.f_role,
                            running_task.f_party_id,
                            running_task.f_run_pid))
                    dead_job_ids.add(running_task.f_job_id)
            except Exception as e:
                detect_logger(job_id=running_task.f_job_id).exception(e)
        if dead_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(dead_job_ids))
            jobs_to_stop = set()
            for dead_job_id in dead_job_ids:
                found = JobSaver.query_job(job_id=dead_job_id)
                if found:
                    jobs_to_stop.add(found[0])
            cls.request_stop_jobs(jobs=jobs_to_stop,
                                  stop_msg="task executor process abort",
                                  stop_status=JobStatus.CANCELED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def federated_task_status(cls, job_id, task_id, task_version):
    """Aggregate a task's party statuses across all participants.

    idmapping-role statuses are excluded from the calculation, but only
    when every non-idmapping role has already reached SUCCESS.
    """
    tasks_on_all_party = JobSaver.query_task(task_id=task_id, task_version=task_version)
    # idmapping role status can only be ignored if all non-idmapping roles success
    non_idmapping_all_success = all(
        task.f_party_status == TaskStatus.SUCCESS
        for task in tasks_on_all_party
        if 'idmapping' not in task.f_role)
    if non_idmapping_all_success:
        tasks_party_status = [task.f_party_status
                              for task in tasks_on_all_party
                              if 'idmapping' not in task.f_role]
    else:
        tasks_party_status = [task.f_party_status for task in tasks_on_all_party]
    status = cls.calculate_multi_party_task_status(tasks_party_status)
    schedule_logger(job_id=job_id).info(
        "job {} task {} {} status is {}, calculate by task party status list: {}".format(
            job_id, task_id, task_version, status, tasks_party_status))
    return status
def get_rerun_component(cls, component_name, job, dsl_parser, force):
    """Expand the requested rerun target with components whose dependence check failed.

    :param component_name: a name, a list of names, or falsy / the pipeline
        sentinel meaning "rerun the whole pipeline" (passed through unchanged)
    :param force: incoming force flag; turned on when any party reports
        failed dependence components
    :return: (possibly-expanded component_name, possibly-updated force)
    """
    if not component_name or component_name == job_utils.job_pipeline_component_name():
        # Whole-pipeline rerun requested: nothing to expand.
        pass
    else:
        # Ask every party which components fail the rerun dependence check.
        # NOTE(review): `response` is iterated without checking
        # dependence_status_code — presumably check_component always returns
        # a per-role mapping even on failure; confirm.
        dependence_status_code, response = FederatedScheduler.check_component(job=job, check_type="rerun")
        success_task_list = [task.f_component_name
                             for task in JobSaver.query_task(job_id=job.f_job_id,
                                                             party_id=job.f_party_id,
                                                             role=job.f_role,
                                                             status=TaskStatus.SUCCESS,
                                                             only_latest=True)]
        component_set = set()
        for dest_role in response.keys():
            for party_id in response[dest_role].keys():
                component_set = component_set.union(set(response[dest_role][party_id].get("data")))
        schedule_logger(job.f_job_id).info(
            f"success task list: {success_task_list}, check failed component list: {list(component_set)}")
        need_rerun = [cpn.name for cpn in dsl_parser.get_need_revisit_nodes(success_task_list, list(component_set))]
        schedule_logger(job.f_job_id).info(f"need rerun success component: {need_rerun}")
        if component_set:
            # A failed dependence forces rerun even of successful tasks.
            force = True
        if isinstance(component_name, str):
            component_name = set(need_rerun).union({component_name})
        else:
            component_name = set(need_rerun).union(set(component_name))
    return component_name, force
def component_check(cls, job, check_type="inheritance"):
    """Check which components' output data tables are missing from storage.

    :param check_type: "rerun" -> inspect this job's own SUCCESS tasks and
        return the components with missing outputs; "inheritance" -> inspect
        the inherited job's tasks and return its component list with the
        missing ones removed. Any other value returns None implicitly.
    """
    if check_type == "rerun":
        task_list = JobSaver.query_task(job_id=job.f_job_id,
                                        party_id=job.f_party_id,
                                        role=job.f_role,
                                        status=TaskStatus.SUCCESS,
                                        only_latest=True)
        tasks = {}
        for task in task_list:
            tasks[task.f_component_name] = task
    else:
        tasks = JobController.load_tasks(component_list=job.f_inheritance_info.get("component_list", []),
                                         job_id=job.f_inheritance_info.get("job_id"),
                                         role=job.f_role,
                                         party_id=job.f_party_id)
    tracker_dict = JobController.load_task_tracker(tasks)
    missing_dependence_component_list = []
    # data dependence
    for tracker in tracker_dict.values():
        table_infos = tracker.get_output_data_info()
        for table in table_infos:
            # NOTE(review): relies on StorageTableMeta being falsy when the
            # underlying table does not exist — confirm that contract.
            table_meta = storage.StorageTableMeta(name=table.f_table_name,
                                                  namespace=table.f_table_namespace)
            if not table_meta:
                missing_dependence_component_list.append(tracker.component_name)
                continue
    if check_type == "rerun":
        return missing_dependence_component_list
    elif check_type == "inheritance":
        # reload component list
        return list(set(job.f_inheritance_info.get("component_list", [])) -
                    set(missing_dependence_component_list))
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Rerun a job on the initiator: create new versions of non-WAITING /
    non-SUCCESS tasks, then set the rerun signal.

    :param component_name: a specific component to rerun, or the virtual
        component name to consider every task of the job
    :raises RuntimeError: when the job is not found on the initiator
    """
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(
            f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role,
                                    party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        # Virtual component means: consider every task of the job.
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role,
                                    party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            # WAITING tasks will be picked up by the scheduler on the rerun
            # signal; SUCCESS tasks are kept as-is.
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                for _party_id in _party_ids:
                    # The initiator's own record was created above.
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_initiator_role,
                                                   job.f_initiator_party_id,
                                                   RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
    else:
        # Nothing to rerun: just resync the job status across parties.
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def save_pipelined_model(cls, job_id, role, party_id):
    """Assemble and persist the pipeline model for a finished (non-predict) job.

    Skips assistant roles and predict jobs; fills the inference dsl from the
    job's component parameters and stores a Pipeline protobuf via the Tracker.
    """
    schedule_logger(job_id).info(f"start to save pipeline model on {role} {party_id}")
    job_configuration = job_utils.get_job_configuration(job_id=job_id, role=role, party_id=party_id)
    runtime_conf_on_party = job_configuration.runtime_conf_on_party
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    # Assistant roles do not hold model output.
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    # Predict jobs reuse an existing model; nothing new to save.
    if job_type == 'predict':
        return
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_configuration.dsl,
                                                   runtime_conf=job_configuration.runtime_conf,
                                                   train_runtime_conf=job_configuration.train_runtime_conf)
    # Collect the effective per-component parameters of the latest tasks.
    components_parameters = {}
    tasks = JobSaver.query_task(job_id=job_id, role=role, party_id=party_id, only_latest=True)
    for task in tasks:
        components_parameters[task.f_component_name] = task.f_component_parameters
    predict_dsl = schedule_utils.fill_inference_dsl(dsl_parser,
                                                    origin_inference_dsl=job_configuration.dsl,
                                                    components_parameters=components_parameters)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_configuration.dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_configuration.runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version,
                      job_parameters=RunParameters(**job_parameters))
    tracker.save_pipeline_model(pipeline_buffer_object=pipeline)
    # NOTE(review): 'local' role presumably has no model registry entry to
    # record — confirm why it is excluded here.
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info(f"save pipeline on {role} {party_id} successfully")
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Start task, update status and party status

    Launches the task executor subprocess via the configured computing
    engine, then records the outcome: party status is set to RUNNING, and
    additionally flipped to FAILED when the launch raised.

    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    # Verify the requesting party is allowed to run this component.
    PrivilegeAuth.authentication_component(job_dsl,
                                           src_party_id=kwargs.get('src_party_id'),
                                           src_role=kwargs.get('src_role'),
                                           party_id=party_id,
                                           component_name=component_name)
    schedule_logger(job_id).info(
        f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess")
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    is_failed = False
    try:
        task = JobSaver.query_task(task_id=task_id, task_version=task_version,
                                   role=role, party_id=party_id)[0]
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        run_parameters_dict["src_user"] = kwargs.get("src_user")
        run_parameters = RunParameters(**run_parameters_dict)
        config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                  component_name, task_id, task_version)
        os.makedirs(config_dir, exist_ok=True)
        # Persist the effective parameters so the executor subprocess can load them.
        run_parameters_path = os.path.join(config_dir, 'task_parameters.json')
        with open(run_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))
        schedule_logger(job_id).info(f"use computing engine {run_parameters.computing_engine}")
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}
        backend_engine = build_engine(run_parameters.computing_engine)
        run_info = backend_engine.run(task=task,
                                      run_parameters=run_parameters,
                                      run_parameters_path=run_parameters_path,
                                      config_dir=config_dir,
                                      log_dir=job_utils.get_job_log_directory(job_id, role, party_id, component_name),
                                      cwd_dir=job_utils.get_job_directory(job_id, role, party_id, component_name),
                                      user_name=kwargs.get("user_id"))
        # run_info carries the subprocess details (e.g. pid) into the task record.
        task_info.update(run_info)
        task_info["start_time"] = current_timestamp()
        task_executor_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        is_failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            # Mark RUNNING first; on failure flip to FAILED afterwards.
            # NOTE(review): presumably RUNNING -> FAILED is the transition the
            # status machine accepts here — confirm against update_task_status.
            task_info["party_status"] = TaskStatus.RUNNING
            cls.update_task_status(task_info=task_info)
            if is_failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            "task {} {} on {} {} executor subprocess start {}".format(
                task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))
def submit(cls, submit_job_conf: JobConfigurationBase, job_id: str = None):
    """Create a job from a submitted configuration and move it to WAITING.

    Validates the runtime conf, generates model info (train jobs) or loads
    the deployed pipeline model (predict jobs), persists the job on all
    parties through the FederatedScheduler, and returns a result dict with
    model info, log directory and board url. Failures are reported inside
    the returned dict (code/message) rather than raised.

    :param submit_job_conf: dsl + runtime conf of the job to create
    :param job_id: optional; generated when not supplied
    :return: submit result dict, always containing "job_id"
    """
    if not job_id:
        job_id = job_utils.generate_job_id()
    submit_result = {"job_id": job_id}
    schedule_logger(job_id).info(f"submit job, body {submit_job_conf.to_dict()}")
    try:
        dsl = submit_job_conf.dsl
        # Deep-copy so in-place adapter updates don't mutate the caller's conf.
        runtime_conf = deepcopy(submit_job_conf.runtime_conf)
        job_utils.check_job_runtime_conf(runtime_conf)
        authentication_utils.check_constraint(runtime_conf, dsl)
        job_initiator = runtime_conf["initiator"]
        conf_adapter = JobRuntimeConfigAdapter(runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()
        if common_job_parameters.job_type != "predict":
            # generate job model info
            conf_version = schedule_utils.get_conf_version(runtime_conf)
            if conf_version != 2:
                raise Exception("only the v2 version runtime conf is supported")
            common_job_parameters.model_id = model_utils.gen_model_id(runtime_conf["role"])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ["model_id", "model_version"])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(job_id=job_id,
                              role=job_initiator["role"],
                              party_id=job_initiator["party_id"],
                              model_id=common_job_parameters.model_id,
                              model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_pipeline_model()
            train_runtime_conf = json_loads(pipeline_model.train_runtime_conf)
            if not model_utils.check_if_deployed(role=job_initiator["role"],
                                                 party_id=job_initiator["party_id"],
                                                 model_id=common_job_parameters.model_id,
                                                 model_version=common_job_parameters.model_version):
                raise Exception(
                    f"Model {common_job_parameters.model_id} {common_job_parameters.model_version} has not been deployed yet."
                )
            dsl = json_loads(pipeline_model.inference_dsl)
        # dsl = ProviderManager.fill_fate_flow_provider(dsl)
        job = Job()
        job.f_job_id = job_id
        job.f_dsl = dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = runtime_conf["role"]
        job.f_initiator_role = job_initiator["role"]
        job.f_initiator_party_id = job_initiator["party_id"]
        job.f_role = job_initiator["role"]
        job.f_party_id = job_initiator["party_id"]
        path_dict = job_utils.save_job_conf(job_id=job_id,
                                            role=job.f_initiator_role,
                                            party_id=job.f_initiator_party_id,
                                            dsl=dsl,
                                            runtime_conf=runtime_conf,
                                            runtime_conf_on_party={},
                                            train_runtime_conf=train_runtime_conf,
                                            pipeline_dsl=None)
        # The initiator must itself participate in the job.
        if job.f_initiator_party_id not in runtime_conf["role"][job.f_initiator_role]:
            msg = f"initiator party id {job.f_initiator_party_id} not in roles {runtime_conf['role']}"
            schedule_logger(job_id).info(msg)
            raise Exception(msg)
        # create common parameters on initiator
        JobController.create_common_job_parameters(job_id=job.f_job_id,
                                                   initiator_role=job.f_initiator_role,
                                                   common_job_parameters=common_job_parameters)
        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                       runtime_conf=job.f_runtime_conf,
                                                       train_runtime_conf=job.f_train_runtime_conf)
        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party["job_parameters"] = common_job_parameters.to_dict()
        # inherit job
        job.f_inheritance_info = common_job_parameters.inheritance_info
        job.f_inheritance_status = JobInheritanceStatus.WAITING if common_job_parameters.inheritance_info else JobInheritanceStatus.PASS
        if job.f_inheritance_info:
            # Validate the inherited job/tasks before accepting the submission.
            inheritance_jobs = JobSaver.query_job(job_id=job.f_inheritance_info.get("job_id"),
                                                  role=job_initiator["role"],
                                                  party_id=job_initiator["party_id"])
            inheritance_tasks = JobSaver.query_task(job_id=job.f_inheritance_info.get("job_id"),
                                                    role=job_initiator["role"],
                                                    party_id=job_initiator["party_id"],
                                                    only_latest=True)
            job_utils.check_job_inheritance_parameters(job, inheritance_jobs, inheritance_tasks)
        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)
        else:
            # Collect, per role/party, the components each party will run.
            need_run_components = {}
            for role in response:
                need_run_components[role] = {}
                for party, res in response[role].items():
                    need_run_components[role][party] = [
                        name for name, value in response[role][party]["data"]["components"].items()
                        if value["need_run"] is True
                    ]
            if common_job_parameters.federated_mode == FederatedMode.MULTIPLE:
                # create the task holder in db to record information of all participants in the initiator for scheduling
                for role, party_ids in job.f_roles.items():
                    for party_id in party_ids:
                        if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                            continue
                        if not need_run_components[role][party_id]:
                            continue
                        JobController.initialize_tasks(job_id=job_id,
                                                       role=role,
                                                       party_id=party_id,
                                                       run_on_this_party=False,
                                                       initiator_role=job.f_initiator_role,
                                                       initiator_party_id=job.f_initiator_party_id,
                                                       job_parameters=common_job_parameters,
                                                       dsl_parser=dsl_parser,
                                                       components=need_run_components[role][party_id])
            job.f_status = JobStatus.WAITING
            status_code, response = FederatedScheduler.sync_job_status(job=job)
            if status_code != FederatedSchedulingStatusCode.SUCCESS:
                raise Exception("set job to waiting status failed")
        schedule_logger(job_id).info(
            f"submit job successfully, job id is {job.f_job_id}, model id is {common_job_parameters.model_id}")
        logs_directory = job_utils.get_job_log_directory(job_id)
        result = {
            "code": RetCode.SUCCESS,
            "message": "success",
            "model_info": {"model_id": common_job_parameters.model_id,
                           "model_version": common_job_parameters.model_version},
            "logs_directory": logs_directory,
            "board_url": job_utils.get_board_url(job_id,
                                                 job_initiator["role"],
                                                 job_initiator["party_id"])
        }
        # Warn (but do not fail) about removed parameters in the original conf.
        warn_parameter = JobRuntimeConfigAdapter(
            submit_job_conf.runtime_conf).check_removed_parameter()
        if warn_parameter:
            result["message"] = f"[WARN]{warn_parameter} is removed,it does not take effect!"
        submit_result.update(result)
        submit_result.update(path_dict)
    except Exception as e:
        submit_result["code"] = RetCode.OPERATING_ERROR
        submit_result["message"] = exception_to_trace_string(e)
        schedule_logger(job_id).exception(e)
    return submit_result
def set_job_rerun(cls, job_id, initiator_role, initiator_party_id, auto,
                  force=False,
                  tasks: typing.List[Task] = None,
                  component_name: typing.Union[str, list] = None):
    """Prepare tasks of a job for rerun and set the rerun signal.

    :param auto: whether this rerun was triggered automatically
    :param force: rerun even successful tasks; may be turned on by the
        component dependence check in get_rerun_component
    :param tasks: explicit task list to rerun; otherwise derived from
        component_name plus all of its downstream components
    :return: True if at least one task was prepared for rerun, else False
    :raises RuntimeError: when the job is not found on the initiator
    """
    schedule_logger(job_id).info(
        f"try to rerun job on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if not jobs:
        raise RuntimeError(
            f"can not found job on initiator {initiator_role} {initiator_party_id}")
    job = jobs[0]
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    # May expand component_name with dependence-failed components and flip force.
    component_name, force = cls.get_rerun_component(component_name, job, dsl_parser, force)
    schedule_logger(job_id).info(f"rerun component: {component_name}")
    if tasks:
        schedule_logger(job_id).info(
            f"require {[task.f_component_name for task in tasks]} to rerun")
    else:
        task_query = {
            'job_id': job_id,
            'role': initiator_role,
            'party_id': initiator_party_id,
        }
        if not component_name or component_name == job_utils.job_pipeline_component_name():
            # rerun all tasks
            schedule_logger(job_id).info("require all component of pipeline to rerun")
        else:
            # Expand the requested components with everything downstream of them.
            _require_reruns = {component_name} if isinstance(component_name, str) else set(component_name)
            _should_reruns = _require_reruns.copy()
            for _cpn in _require_reruns:
                _components = dsl_parser.get_downstream_dependent_components(_cpn)
                for _c in _components:
                    _should_reruns.add(_c.get_name())
            schedule_logger(job_id).info(
                f"require {_require_reruns} to rerun, "
                f"and then found {_should_reruns} need be to rerun")
            task_query['component_name'] = _should_reruns
        tasks = JobSaver.query_task(**task_query)
    job_can_rerun = any([
        TaskScheduler.prepare_rerun_task(job=job,
                                         task=task,
                                         dsl_parser=dsl_parser,
                                         auto=auto,
                                         force=force,
                                         ) for task in tasks
    ])
    if not job_can_rerun:
        # Nothing to rerun: resync the job status across parties instead.
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id).info("job no task to rerun")
        return False
    schedule_logger(job_id).info("job set rerun signal")
    status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    schedule_logger(job_id).info(
        f"job set rerun signal {'successfully' if status else 'failed'}")
    return True