def collect_task_of_all_party(cls, job, task):
    """Collect a task from all parties and persist what each party reported.

    Calls ``FederatedScheduler.collect_task``. When the returned status is
    not ``FederatedSchedulingStatusCode.SUCCESS`` a warning is logged and
    nothing is saved. Otherwise the ``"data"`` entry of every party response
    is passed to ``JobSaver.update_task_status`` and ``JobSaver.update_task``.
    """
    status, federated_response = FederatedScheduler.collect_task(job=job, task=task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(f"collect task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} failed")
        return
    # The response is nested as role -> party_id -> party response.
    for responses_by_party in federated_response.values():
        for party_response in responses_by_party.values():
            reported_task = party_response["data"]
            JobSaver.update_task_status(task_info=reported_task)
            JobSaver.update_task(task_info=reported_task)
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Prepare a job (or one of its components) to be run again.

    The job is looked up on the initiator. Tasks are selected for the given
    ``component_name``, or for the whole job when ``component_name`` equals
    ``job_utils.job_virtual_component_name()``.

    Tasks whose status is WAITING or COMPLETE are left untouched (a WAITING
    task still marks the job as rerunnable). Every other task is stopped
    with status CANCELED, has its metrics cleaned, and is recreated with
    ``f_task_version`` incremented by one.

    If at least one task can be rerun and the job is in an end status, the
    job is reset to WAITING, synced to the parties, and a new queue event is
    created.

    :param job_id: id of the job to rerun
    :param initiator_role: role of the initiator party
    :param initiator_party_id: party id of the initiator
    :param component_name: component to rerun, or the virtual component name
        for all tasks of the job
    :raises RuntimeError: when the job cannot be found on the initiator
    """
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
    # Restrict the query to one component unless the virtual "whole job" name was given.
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl, runtime_conf=job.f_runtime_conf, train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
            # Nothing to recreate; a task that is still waiting is enough to make the job rerunnable.
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf["role"].items():
                for _party_id in _party_ids:
                    # Skip the initiator's own role/party pair.
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"],
                                                   RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        if EndStatus.contains(job.f_status):
            # The job already ended: reset its bookkeeping fields and queue it again.
            job.f_status = JobStatus.WAITING
            job.f_end_time = None
            job.f_elapsed = None
            job.f_progress = 0
            schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
            status, response = FederatedScheduler.sync_job_status(job=job)
            if status == FederatedSchedulingStatusCode.SUCCESS:
                FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
            else:
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
        else:
            # status updates may be delayed, and in a very small probability they will be executed after the rerun command
            schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
    else:
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def create_job(cls, job_id, role, party_id, job_info):
    """Create a job on this party from the submitted ``job_info``.

    Reads the DSL and runtime configurations from ``job_info``, optionally
    runs ``authentication_check``, fills in this party's fields on
    ``job_info``, validates the job parameters, then saves the job and its
    configuration and initializes its tasks and tracker.

    :param job_id: id of the job being created
    :param role: role of this party
    :param party_id: id of this party
    :param job_info: job description; updated in place with this party's
        status, role, party id, initiator flag and progress
    """
    # parse job configuration
    dsl, runtime_conf, train_runtime_conf = job_info['dsl'], job_info['runtime_conf'], job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(
            src_role=job_info.get('src_role', None),
            src_party_id=job_info.get('src_party_id', None),
            dsl=dsl,
            runtime_conf=runtime_conf,
            role=role,
            party_id=party_id,
        )
    job_parameters = RunParameters(**runtime_conf['job_parameters'])
    job_initiator = runtime_conf['initiator']
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=dsl,
        runtime_conf=runtime_conf,
        train_runtime_conf=train_runtime_conf,
    )

    # save new job into db
    is_initiator = role == job_initiator['role'] and party_id == job_initiator['party_id']
    job_info["status"] = JobStatus.WAITING
    # Lookup kept from the original flow: a missing 'roles' entry fails here with KeyError.
    roles = job_info['roles']
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0

    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.special_role_parameters(role=role, job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, engines_info=engines_info)
    # Store the adjusted parameters back into the runtime configuration before saving.
    runtime_conf["job_parameters"] = job_parameters.to_dict()

    JobSaver.create_job(job_info=job_info)
    job_utils.save_job_conf(
        job_id=job_id,
        job_dsl=dsl,
        job_runtime_conf=runtime_conf,
        train_runtime_conf=train_runtime_conf,
        pipeline_dsl=None,
    )
    cls.initialize_tasks(job_id, role, party_id, True, job_initiator, job_parameters, dsl_parser)
    cls.initialize_job_tracker(
        job_id=job_id,
        role=role,
        party_id=party_id,
        job_info=job_info,
        is_initiator=is_initiator,
        dsl_parser=dsl_parser,
    )
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    """Collect a task from all parties when the locally stored statuses call for it.

    Nothing is collected when the stored tasks share at most one status and
    none of them is RUNNING. Otherwise ``FederatedScheduler.collect_task`` is
    called; a failed collection is logged as a warning and the per-party
    responses are still processed:

    * ``RetCode.SUCCESS``: the reported ``"data"`` is saved.
    * ``RetCode.FEDERATED_ERROR`` with a truthy ``set_status``: the party
      status is first written as RUNNING and then as ``set_status``.

    :param job: job the task belongs to
    :param initiator_task: the initiator's task record
    :param set_status: party status to record for parties that answered with
        a federated error; ``None`` leaves such parties untouched
    """
    stored_tasks = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
    stored_statuses = {stored.f_status for stored in stored_tasks}
    if len(stored_statuses) <= 1 and TaskStatus.RUNNING not in stored_statuses:
        return
    status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
    for _role, responses_by_party in federated_response.items():
        for _party_id, party_response in responses_by_party.items():
            retcode = party_response["retcode"]
            if retcode == RetCode.SUCCESS:
                JobSaver.update_task_status(task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
                continue
            if retcode == RetCode.FEDERATED_ERROR and set_status:
                # Two consecutive writes: RUNNING first, then the requested status.
                party_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING
                }
                JobSaver.update_task_status(task_info=party_task_info)
                party_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=party_task_info)
def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
    """Copy selected fields of the initiator's job onto every stored job record with the same id.

    :param initiator_job: job whose ``f_<field>`` attributes are the source values
    :param update_fields: field names (without the ``f_`` prefix) to copy
    :raises Exception: when no job with the initiator job's id is stored
    """
    stored_jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
    if not stored_jobs:
        raise Exception("Failed to update job status on initiator")
    job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
    job_info.update({name: getattr(initiator_job, f"f_{name}") for name in update_fields})
    for stored in stored_jobs:
        # Re-target the same payload at each stored role/party record.
        job_info["role"], job_info["party_id"] = stored.f_role, stored.f_party_id
        JobSaver.update_job_status(job_info=job_info)
        JobSaver.update_job(job_info=job_info)
def create_task(cls, role, party_id, run_on_this_party, task_info):
    """Fill in the defaults of a new task record and save it.

    ``task_info`` is updated in place: role, party id, WAITING status and
    party status, creation time and the ``run_on_this_party`` flag are always
    set; ``task_id`` and ``task_version`` are only filled in when absent.
    """
    task_info.update(
        role=role,
        party_id=party_id,
        status=TaskStatus.WAITING,
        party_status=TaskStatus.WAITING,
    )
    task_info["create_time"] = base_utils.current_timestamp()
    task_info["run_on_this_party"] = run_on_this_party
    # Generate an id only when the caller did not supply one.
    if "task_id" not in task_info:
        task_info["task_id"] = job_utils.generate_task_id(job_id=task_info["job_id"], component_name=task_info["component_name"])
    task_info.setdefault("task_version", 0)
    JobSaver.create_task(task_info=task_info)
def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
    """Stop every stored job record matching ``job_id`` (optionally one role/party).

    :return: ``(kill_status, kill_details)`` where ``kill_status`` is the AND
        of all per-job results and ``kill_details`` maps ``job_id`` to the
        details of the last job processed
    """
    query = {"job_id": job_id}
    # The role/party filter is only applied when both values are given.
    if role and party_id:
        query.update(role=role, party_id=party_id)
    jobs = JobSaver.query_job(**query)
    kill_status, kill_details = True, {}
    for job in jobs:
        job_stopped, job_details = cls.stop_job(job=job, stop_status=stop_status)
        kill_status &= job_stopped
        kill_details[job_id] = job_details
    return kill_status, kill_details
def schedule(cls, job, dsl_parser, canceled=False):
    """Run one scheduling round over the initiator's tasks of ``job``.

    First pass: for every initiator task, optionally pull the task from all
    parties (PULL collect type), recompute the federated task status, sync
    it when it changed, and either remember the task as waiting or stop it
    on all parties when it just reached an end status.

    Second pass (skipped when ``canceled`` is true): start each waiting task
    whose upstream dependent components all have status SUCCESS.

    :param job: the initiator's job record
    :param dsl_parser: parser used to look up upstream dependent components
    :param canceled: when true, no waiting task is started
    :return: ``(scheduling_status_code, initiator_tasks_group.values())``
    """
    schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    for initiator_task in initiator_tasks_group.values():
        # collect all party task party status
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
            # Only pull when the stored statuses disagree or some party is still running.
            if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                cls.collect_task_of_all_party(job=job, task=initiator_task)
        new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
        task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            # The task just moved into an end status: stop it with that status.
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[
                    JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                      role=job.f_role,
                                      party_id=job.f_party_id
                                      )
                ]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # can not start task
                    break
            else:
                # all upstream dependent tasks have been successful, can start this task
                # (this branch runs only when the inner loop finished without `break`)
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # wait for the next round of scheduling
                    schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    break
    else:
        schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, initiator_tasks_group.values()
def report_task(job_id, component_name, task_id, task_version, role, party_id):
    """Store a task report received in the request body.

    The request JSON is merged with the identifying arguments (which take
    precedence), saved with ``JobSaver.update_task`` and, when it carries a
    truthy ``party_status``, also passed to ``JobSaver.update_status``.
    """
    task_info = dict(request.json)
    # Identifying fields from the call always override the request payload.
    task_info["job_id"] = job_id
    task_info["task_id"] = task_id
    task_info["task_version"] = task_version
    task_info["role"] = role
    task_info["party_id"] = party_id
    JobSaver.update_task(task_info=task_info)
    if task_info.get("party_status"):
        JobSaver.update_status(Task, task_info)
    return get_json_result(retcode=0, retmsg='success')
def upload_history():
    """Return upload information for successful ``upload_0`` tasks run on this party.

    The request may carry ``job_id`` to restrict the query and ``limit`` to
    cap the number of tasks; tasks are taken from the end of the query result
    in reverse order.
    """
    request_data = request.json
    query = dict(component_name='upload_0', status=StatusSet.SUCCESS, run_on_this_party=True)
    if request_data.get('job_id'):
        query['job_id'] = request_data.get('job_id')
    tasks = JobSaver.query_task(**query)
    limit = request_data.get('limit')
    # Reverse the result; with a limit only the last `limit` entries are kept.
    tasks = tasks[-1:-limit - 1:-1] if limit else tasks[-1::-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    upload_info = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=upload_info)
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    """Prepare a job (or one of its components) to be run again and raise the rerun signal.

    The job is looked up on the initiator. Tasks are selected for the given
    ``component_name``, or for the whole job when ``component_name`` equals
    ``job_utils.job_virtual_component_name()``.

    Tasks whose status is WAITING or SUCCESS are left untouched (a WAITING
    task still marks the job as rerunnable). Every other task is stopped
    with status CANCELED, has its metrics cleaned, and is recreated with
    ``f_task_version`` incremented by one.

    When something can be rerun, ``cls.rerun_signal`` is set; otherwise the
    job status is synced to the parties.

    :param job_id: id of the job to rerun
    :param initiator_role: role of the initiator party
    :param initiator_party_id: party id of the initiator
    :param component_name: component to rerun, or the virtual component name
        for all tasks of the job
    :raises RuntimeError: when the job cannot be found on the initiator
    """
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
    # Restrict the query to one component unless the virtual "whole job" name was given.
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl, runtime_conf=job.f_runtime_conf_on_party, train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            # Nothing to recreate; a task that is still waiting is enough to make the job rerunnable.
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # Save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                for _party_id in _party_ids:
                    # Skip the initiator's own role/party pair.
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id,
                                                   RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
    else:
        # Nothing to rerun: only sync the job status to the parties.
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def clean_task(cls, job_id, task_id, task_version, role, party_id, content_type):
    """Clean the metrics or the tables of one task version.

    :param content_type: ``"metrics"`` or ``"table"``; any other value cleans nothing
    :return: ``True`` only when a clean operation ran and reported ``True``;
        ``False`` otherwise (including an unknown ``content_type`` or a
        missing job for ``"table"``)
    """
    outcomes = set()
    if content_type == "metrics":
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id, task_id=task_id, task_version=task_version)
        outcomes.add(tracker.clean_metrics())
    elif content_type == "table":
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            # Table cleaning needs the job parameters stored for this party.
            job_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(
                job_id=job_id,
                role=role,
                party_id=party_id,
                task_id=task_id,
                task_version=task_version,
                job_parameters=job_parameters,
            )
            outcomes.add(tracker.clean_task(job.f_runtime_conf_on_party))
    return len(outcomes) == 1 and True in outcomes
def update_job_status(cls, job_info):
    """Save the job status and return the job's resources once it has ended.

    :param job_info: dict with at least ``job_id``, ``role`` and ``party_id``
        when the status is an end status
    :return: the result of ``JobSaver.update_job_status``
    """
    update_status = JobSaver.update_job_status(job_info=job_info)
    # Resources are returned only after a successful update into an end status.
    if not update_status or not EndStatus.contains(job_info.get("status")):
        return update_status
    ResourceManager.return_job_resource(
        job_id=job_info["job_id"],
        role=job_info["role"],
        party_id=job_info["party_id"],
    )
    return update_status
def pipeline_dag_dependency(job_info):
    """Return the DSL dependency for a party, from a stored job or from inline configuration.

    When ``job_info`` has a truthy ``job_id`` the DSL and configurations come
    from the stored job; otherwise they are read from the ``job_dsl``,
    ``job_runtime_conf`` and ``job_train_runtime_conf`` entries of
    ``job_info``. Any exception is logged with ``stat_logger`` and re-raised.
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            parser_source = dict(
                dsl=job.f_dsl,
                runtime_conf=job.f_runtime_conf,
                train_runtime_conf=job.f_train_runtime_conf,
            )
        else:
            parser_source = dict(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}),
            )
        job_dsl_parser = schedule_utils.get_job_dsl_parser(**parser_source)
        return job_dsl_parser.get_dependency(role=job_info["role"], party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
def query_task():
    """Query tasks with the request JSON as filters; retcode 101 when nothing matches."""
    tasks = JobSaver.query_task(**request.json)
    if tasks:
        serialized = [task.to_json() for task in tasks]
        return get_json_result(retcode=0, retmsg='success', data=serialized)
    return get_json_result(retcode=101, retmsg='find task failed')
def cancel_job(cls, job_id, role, party_id):
    """Cancel a waiting job by marking its queue event as CANCELED.

    The job is looked up by ``job_id`` only; ``role`` and ``party_id`` are
    used for log and error messages.

    :param job_id: id of the job to cancel
    :param role: role of the party that received the cancel command
    :param party_id: id of the party that received the cancel command
    :return: ``True`` when the queue event was updated, ``False`` when the
        update reported failure or raised an error
    :raises Exception: when no job with ``job_id`` is stored
    """
    schedule_logger(job_id).info(
        '{} {} get cancel waiting job {} command'.format(role, party_id, job_id))
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        not_found_message = 'role {} party id {} cancel job failed, no find jod {}'.format(
            role, party_id, job_id)
        schedule_logger(job_id).warning(not_found_message)
        raise Exception(not_found_message)
    job = jobs[0]
    try:
        # You cannot delete an event directly, otherwise the status might not be updated
        status = JobQueue.update_event(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            initiator_party_id=job.f_initiator_party_id,
            job_status=JobStatus.CANCELED)
    except Exception:
        # Previously a bare `except:` hid the cause and also trapped
        # KeyboardInterrupt/SystemExit; keep the False result but record why.
        schedule_logger(job_id).exception(
            'cancel job {} failed while updating queue event'.format(job_id))
        return False
    if not status:
        return False
    schedule_logger(job_id).info(
        'cancel {} job successfully, job id is {}'.format(job.f_status, job.f_job_id))
    return True
def stop_job():
    """Stop a job on this party, then request the stop on all parties.

    Reads ``job_id`` and ``stop_status`` (default ``"canceled"``) from the
    request JSON. Responds with ``RetCode.DATA_ERROR`` when the job is
    unknown, ``RetCode.SUCCESS`` when the federated stop request succeeds and
    ``RetCode.OPERATING_ERROR`` otherwise.
    """
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR, retmsg="can not found job")

    job = jobs[0]
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, _kill_details = JobController.stop_jobs(job_id=job_id, stop_status=stop_status)
    schedule_logger(job_id).info(f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(f"request stop job {job} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=job,
        stop_status=stop_status,
        command_body=job.to_json(),
    )
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        failure_message = "stop job on this party {};\n" "stop job failed:\n{}".format(
            kill_status, json_dumps(response, indent=4))
        return get_json_result(retcode=RetCode.OPERATING_ERROR, retmsg=failure_message)
    success_message = (f"stop job on this party {kill_status};\n"
                       f"stop job on all party success")
    return get_json_result(retcode=RetCode.SUCCESS, retmsg=success_message)
def update_job(cls, job_info):
    """Persist ``job_info`` to the local database.

    :param job_info: job fields to save
    :return: whatever ``JobSaver.update_job`` returns
    """
    saved = JobSaver.update_job(job_info=job_info)
    return saved
def stop_job(cls, job_id, role, party_id, stop_status):
    """Stop a job from its initiator on all parties.

    For ``JobStatus.CANCELED`` the cancel signal is set first. The job record
    gets ``stop_status`` assigned and ``FederatedScheduler.stop_job`` is
    called with it.

    :return: ``(RetCode.SUCCESS, "success")`` on success,
        ``(RetCode.FEDERATED_ERROR, <json response>)`` when the federated stop
        fails, and ``(RetCode.SUCCESS, "can not found job")`` when no
        initiator job matches
    """
    schedule_logger(job_id=job_id).info(f"request stop job {job_id} with {stop_status}")
    jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id, is_initiator=True)
    if len(jobs) == 0:
        return RetCode.SUCCESS, "can not found job"

    if stop_status == JobStatus.CANCELED:
        schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
        set_cancel_status = cls.cancel_signal(job_id=job_id, set_or_reset=True)
        schedule_logger(job_id=job_id).info(f"set job {job_id} cancel signal {set_cancel_status}")

    target_job = jobs[0]
    target_job.f_status = stop_status
    schedule_logger(job_id=job_id).info(f"request stop job {job_id} with {stop_status} to all party")
    status_code, response = FederatedScheduler.stop_job(job=target_job, stop_status=stop_status)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job_id).info(f"stop job {job_id} with {stop_status} failed, {response}")
        return RetCode.FEDERATED_ERROR, json_dumps(response)
    schedule_logger(job_id=job_id).info(f"stop job {job_id} with {stop_status} successfully")
    return RetCode.SUCCESS, "success"
def start_job(cls, job_id, initiator_role, initiator_party_id):
    """Start a job on all parties from its initiator.

    Looks up the initiator's job record and hands it to
    ``FederatedScheduler.start_job``. A missing job is logged as an error;
    nothing is raised.

    :param job_id: id of the job to start
    :param initiator_role: role of the initiator party
    :param initiator_party_id: party id of the initiator
    """
    # The previous implementation also assembled a `job_info` dict (status,
    # start time, tag) that was never read or passed on; that dead local has
    # been removed.
    schedule_logger(job_id=job_id).info(
        "Try to start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if not jobs:
        schedule_logger(job_id=job_id).error(
            "can not found job {} on initiator {} {}".format(
                job_id, initiator_role, initiator_party_id))
        return
    FederatedScheduler.start_job(job=jobs[0])
    schedule_logger(job_id=job_id).info(
        "start job {} on initiator {} {}".format(
            job_id, initiator_role, initiator_party_id))
def update_job():
    """Update a job's description from the ``notes`` field of the request JSON.

    Responds with retcode 101 when no job matches the given ``job_id``,
    ``party_id`` and ``role``.
    """
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'], party_id=job_info['party_id'], role=job_info['role'])
    if jobs:
        # Only the description changes; the other keys identify the record.
        description_update = {
            'description': job_info.get('notes', ''),
            'job_id': job_info['job_id'],
            'role': job_info['role'],
            'party_id': job_info['party_id']
        }
        JobSaver.update_job(job_info=description_update)
        return get_json_result(retcode=0, retmsg='success')
    return get_json_result(retcode=101, retmsg='find job failed')
def report_task_to_initiator(cls, task_info):
    """Report a task to the initiator when its status collect type is PUSH.

    :param task_info: dict with ``task_id``, ``task_version``, ``role`` and
        ``party_id`` identifying the task
    """
    matches = JobSaver.query_task(
        task_id=task_info["task_id"],
        task_version=task_info["task_version"],
        role=task_info["role"],
        party_id=task_info["party_id"],
    )
    # The first match is used; an empty result raises IndexError here.
    task = matches[0]
    if task.f_federated_status_collect_type == FederatedCommunicationType.PUSH:
        FederatedScheduler.report_task_to_initiator(task=task)
def clean_queue():
    """Request cancellation of every WAITING job this party initiated.

    The response data maps each job id to the status code returned by
    ``FederatedScheduler.request_stop_job``.
    """
    waiting_jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for waiting_job in waiting_jobs:
        # Only the status code is reported; the response body is discarded.
        status_code, _response = FederatedScheduler.request_stop_job(job=waiting_job, stop_status=JobStatus.CANCELED)
        clean_status[waiting_job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
def query_job():
    """Query jobs with the request JSON as filters; an empty match is still retcode 0."""
    jobs = JobSaver.query_job(**request.json)
    if jobs:
        serialized = [job.to_json() for job in jobs]
        return get_json_result(retcode=0, retmsg='success', data=serialized)
    return get_json_result(retcode=0, retmsg='no job could be found', data=[])
def component_output_data_table():
    """Forward an ``output/table`` tracker command for a component of a stored job.

    Requires ``job_id``, ``role``, ``party_id`` and ``component_name`` in the
    request JSON; responds with retcode 100 when the job is not found.
    """
    request_data = request.json
    detect_utils.check_config(config=request_data, required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    tracker_response = FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table')
    return jsonify(tracker_response)
def stop_job(cls, job, stop_status):
    """Stop every task of ``job`` on this party.

    Tasks are queried with ``reverse=True`` and stopped one by one.

    :return: ``(kill_status, kill_details)`` where ``kill_status`` is the AND
        of all task results and ``kill_details`` maps task id to
        ``'success'`` or ``'failed'``
    """
    tasks = JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True)
    kill_status, kill_details = True, {}
    for task in tasks:
        task_stopped = TaskController.stop_task(task=task, stop_status=stop_status)
        kill_details[task.f_task_id] = 'success' if task_stopped else 'failed'
        kill_status &= task_stopped
    return kill_status, kill_details
def create_job(cls, job_id, role, party_id, job_info):
    """Create a job on this party from the submitted ``job_info``.

    Reads the DSL and runtime configurations, optionally runs
    ``authentication_check``, derives this party's job parameters from the
    DSL parser, fills in this party's fields on ``job_info``, validates the
    parameters, saves the job configuration, initializes tasks and the job
    tracker, and finally stores the job record.

    :param job_id: id of the job being created
    :param role: role of this party
    :param party_id: id of this party
    :param job_info: job description; updated in place
    """
    # parse job configuration
    dsl, runtime_conf, train_runtime_conf = job_info['dsl'], job_info['runtime_conf'], job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(
            src_role=job_info.get('src_role', None),
            src_party_id=job_info.get('src_party_id', None),
            dsl=dsl,
            runtime_conf=runtime_conf,
            role=role,
            party_id=party_id,
        )
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=dsl,
        runtime_conf=runtime_conf,
        train_runtime_conf=train_runtime_conf,
    )
    # Parameters are looked up per role, then per party; missing levels fall back to {}.
    party_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
    schedule_logger(job_id).info('job parameters:{}'.format(party_parameters))
    job_parameters = RunParameters(**party_parameters)

    # save new job into db
    is_initiator = role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]
    job_info["status"] = JobStatus.WAITING
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0

    cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    job_utils.save_job_conf(
        job_id=job_id,
        role=role,
        job_dsl=dsl,
        job_runtime_conf=runtime_conf,
        job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
        train_runtime_conf=train_runtime_conf,
        pipeline_dsl=None,
    )
    cls.initialize_tasks(
        job_id=job_id,
        role=role,
        party_id=party_id,
        run_on_this_party=True,
        initiator_role=job_info["initiator_role"],
        initiator_party_id=job_info["initiator_party_id"],
        job_parameters=job_parameters,
        dsl_parser=dsl_parser,
    )
    # The tracker receives the dict form stored on job_info, not the RunParameters object.
    stored_job_parameters = job_info['runtime_conf_on_party']['job_parameters']
    roles = job_info['roles']
    cls.initialize_job_tracker(
        job_id=job_id,
        role=role,
        party_id=party_id,
        job_parameters=stored_job_parameters,
        roles=roles,
        is_initiator=is_initiator,
        dsl_parser=dsl_parser,
    )
    JobSaver.create_job(job_info=job_info)
def get_job_table_list():
    """Return all tables of the job selected by the request JSON.

    Requires ``job_id``, ``role`` and ``party_id``; responds with retcode 101
    when no job matches.
    """
    detect_utils.check_config(config=request.json, required_arguments=['job_id', 'role', 'party_id'])
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(jobs[0])
    return get_json_result(data=tables)
def schedule_waiting_jobs(cls, event):
    """Try to move one waiting job from the queue into running.

    The queue event is first switched to READY; if that update fails the job
    is assumed to be handled elsewhere. Resources are then applied for on all
    parties. On success the job is started and the event marked RUNNING. On
    failure the resources of the parties that did grant them are returned,
    and the job is either stopped as FAILED (apply status ERROR) or its event
    is put back to WAITING.

    :param event: queue event providing ``f_job_id``, ``f_initiator_role``
        and ``f_initiator_party_id``
    """
    job_id = event.f_job_id
    initiator_role = event.f_initiator_role
    initiator_party_id = event.f_initiator_party_id

    def _format_parties(parties):
        # Renders {role: [party_id, ...]} as "role:party,role:party".
        return ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in parties.items()])

    update_status = JobQueue.update_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id, job_status=JobStatus.READY)
    if not update_status:
        schedule_logger(job_id).info(f"job {job_id} may be handled by another scheduler")
        return
    # apply resource on all party
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if not jobs:
        JobQueue.delete_event(job_id=job_id)
        return
    job = jobs[0]
    apply_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.APPLY)
    if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
        cls.start_job(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
        JobQueue.update_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id, job_status=JobStatus.RUNNING)
        return

    # rollback resource
    rollback_party = {}
    failed_party = {}
    for dest_role, responses_by_party in federated_response.items():
        for dest_party_id, party_response in responses_by_party.items():
            # retcode 0 means the party granted resources that now must be returned.
            bucket = rollback_party if party_response["retcode"] == 0 else failed_party
            bucket.setdefault(dest_role, []).append(dest_party_id)
    schedule_logger(job_id).info("job {} apply resource failed on {}, rollback {}".format(
        job_id,
        _format_parties(failed_party),
        _format_parties(rollback_party),
    ))
    if rollback_party:
        return_status_code, return_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.RETURN, specific_dest=rollback_party)
        if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
            schedule_logger(job_id).info(f"job {job_id} return resource failed:\n{return_response}")
    else:
        schedule_logger(job_id).info(f"job {job_id} no party should be rollback resource")

    if apply_status_code == FederatedSchedulingStatusCode.ERROR:
        cls.stop_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id, stop_status=JobStatus.FAILED)
        schedule_logger(job_id).info(f"apply resource error, stop job {job_id}")
    else:
        update_status = JobQueue.update_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id, job_status=JobStatus.WAITING)
        schedule_logger(job_id).info(f"update job {job_id} status to waiting {update_status}")
def federated_task_status(cls, job_id, task_id, task_version):
    """Compute the overall status of a task version from the party statuses of all its records.

    :return: the value of ``cls.calculate_multi_party_task_status`` applied
        to the list of ``f_party_status`` values
    """
    party_statuses = [
        party_task.f_party_status
        for party_task in JobSaver.query_task(task_id=task_id, task_version=task_version)
    ]
    overall_status = cls.calculate_multi_party_task_status(party_statuses)
    message = "job {} task {} {} status is {}, calculate by task party status list: {}".format(
        job_id, task_id, task_version, overall_status, party_statuses)
    schedule_logger(job_id=job_id).info(message)
    return overall_status