@classmethod
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"cannot find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
            # waiting tasks will be picked up by the scheduler; completed tasks need no rerun
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, skip rerun")
        else:
            # stop the old version of the task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create a new version of the task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # save the status information of all participants on the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_runtime_conf["initiator"],
                                                   RunParameters(**job.f_runtime_conf["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        if EndStatus.contains(job.f_status):
            # the job has already finished: reset it to WAITING so it re-enters the queue
            job.f_status = JobStatus.WAITING
            job.f_end_time = None
            job.f_elapsed = None
            job.f_progress = 0
            schedule_logger(job_id=job_id).info(f"job {job_id} has finished, set to waiting for rerun")
            status, response = FederatedScheduler.sync_job_status(job=job)
            if status == FederatedSchedulingStatusCode.SUCCESS:
                FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                schedule_logger(job_id=job_id).info(f"job {job_id} set to waiting for rerun successfully")
            else:
                schedule_logger(job_id=job_id).info(f"job {job_id} set to waiting for rerun failed")
        else:
            # status updates may be delayed; with a very small probability they arrive after the rerun command
            schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, the new waiting task version will be scheduled")
    else:
        schedule_logger(job_id=job_id).info(f"job {job_id} has no task to rerun")
def upload_history():
    request_data = request.json
    if request_data.get('job_id'):
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS,
                                    job_id=request_data.get('job_id'), run_on_this_party=True)
    else:
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS,
                                    run_on_this_party=True)
    limit = request_data.get('limit')
    if not limit:
        tasks = tasks[-1::-1]                # all tasks, newest first
    else:
        tasks = tasks[-1:-limit - 1:-1]      # the last `limit` tasks, newest first
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    data = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=data)
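# The reversed slices in upload_history are easy to misread; a minimal,
# self-contained illustration (a plain list stands in for the query result):
tasks = [1, 2, 3, 4, 5]                   # oldest ... newest
assert tasks[-1::-1] == [5, 4, 3, 2, 1]   # no limit: everything, newest first
limit = 2
assert tasks[-1:-limit - 1:-1] == [5, 4]  # with limit: last `limit` items, newest first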
@classmethod
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"cannot find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            # waiting tasks will be picked up by the scheduler; successful tasks need no rerun
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, skip rerun")
        else:
            # stop the old version of the task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create a new version of the task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # save the status information of all participants on the initiator for scheduling
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_initiator_role, job.f_initiator_party_id,
                                                   RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
    else:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} has no task to rerun")
def query_task():
    tasks = JobSaver.query_task(**request.json)
    if not tasks:
        return get_json_result(retcode=101, retmsg='find task failed')
    return get_json_result(retcode=0, retmsg='success', data=[task.to_json() for task in tasks])
@classmethod
def report_task_to_initiator(cls, task_info):
    tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                task_version=task_info["task_version"],
                                role=task_info["role"],
                                party_id=task_info["party_id"])
    # guard against an empty query result before indexing; only push-mode tasks report
    if tasks and tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
        FederatedScheduler.report_task_to_initiator(task=tasks[0])
@classmethod
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
    tasks_status_on_all = {task.f_status for task in tasks_on_all_party}
    # nothing to collect if all parties agree on a status and none of them is running
    if len(tasks_status_on_all) <= 1 and TaskStatus.RUNNING not in tasks_status_on_all:
        return
    status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
    for _role in federated_response.keys():
        for _party_id, party_response in federated_response[_role].items():
            if party_response["retcode"] == RetCode.SUCCESS:
                JobSaver.update_task_status(task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
            elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                tmp_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING
                }
                # first mark the party RUNNING, then force it to the requested status
                JobSaver.update_task_status(task_info=tmp_task_info)
                tmp_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=tmp_task_info)
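# For orientation: the nested loop in collect_task_of_all_party assumes a
# federated_response shaped roughly as sketched below (role -> party_id ->
# per-party reply). The shape is inferred from the access pattern above, and
# the retcode values here are placeholders, not the library's real constants.
SUCCESS, FEDERATED_ERROR = 0, 104  # placeholder retcodes for illustration only
federated_response = {
    "guest": {9999: {"retcode": SUCCESS, "data": {"party_status": "success"}}},
    "host": {10000: {"retcode": FEDERATED_ERROR, "retmsg": "connection lost"}},
}
for _role, party_responses in federated_response.items():
    for _party_id, party_response in party_responses.items():
        print(_role, _party_id, party_response["retcode"])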
@classmethod
def stop_job(cls, job, stop_status):
    tasks = JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True)
    kill_status = True
    kill_details = {}
    for task in tasks:
        kill_task_status = TaskController.stop_task(task=task, stop_status=stop_status)
        kill_status = kill_status and kill_task_status
        kill_details[task.f_task_id] = 'success' if kill_task_status else 'failed'
    return kill_status, kill_details
@classmethod
def federated_task_status(cls, job_id, task_id, task_version):
    tasks_on_all_party = JobSaver.query_task(task_id=task_id, task_version=task_version)
    tasks_party_status = [task.f_party_status for task in tasks_on_all_party]
    status = cls.calculate_multi_party_task_status(tasks_party_status)
    schedule_logger(job_id=job_id).info(
        "job {} task {} {} status is {}, calculated from task party status list: {}".format(
            job_id, task_id, task_version, status, tasks_party_status))
    return status
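# calculate_multi_party_task_status is defined elsewhere; the sketch below is
# only a plausible precedence-based rollup to show what such an aggregation
# looks like, not the actual implementation.
def calculate_multi_party_task_status_sketch(statuses):
    status_set = set(statuses)
    if len(status_set) == 1:
        return status_set.pop()                       # all parties agree
    for terminal in ("canceled", "timeout", "failed"):
        if terminal in status_set:                    # any terminal failure wins
            return terminal
    if "running" in status_set:
        return "running"                              # someone is still working
    return "waiting"                                  # mixed non-terminal states

assert calculate_multi_party_task_status_sketch(["success", "running"]) == "running"
assert calculate_multi_party_task_status_sketch(["success", "failed"]) == "failed"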
@classmethod
def collect_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    tasks = JobSaver.query_task(job_id=job_id, component_name=component_name, task_id=task_id,
                                task_version=task_version, role=role, party_id=party_id)
    if tasks:
        return tasks[0].to_human_model_dict(only_primary_with=cls.INITIATOR_COLLECT_FIELDS)
    else:
        return None
@classmethod
def stop_job(cls, job, stop_status):
    tasks = JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True)
    kill_status = True
    kill_details = {}
    for task in tasks:
        kill_task_status = TaskController.stop_task(task=task, stop_status=stop_status)
        kill_status = kill_status and kill_task_status
        kill_details[task.f_task_id] = 'success' if kill_task_status else 'failed'
    if kill_status:
        # all tasks were stopped successfully, so the whole job can take the stop status
        job_info = job.to_human_model_dict(only_primary_with=["status"])
        job_info["status"] = stop_status
        JobController.update_job_status(job_info)
    return kill_status, kill_details
@classmethod
def schedule(cls, job, dsl_parser, canceled=False):
    schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    for initiator_task in initiator_tasks_group.values():
        # in pull mode, actively collect the task party status from all parties
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            tasks_status_on_all = {task.f_status for task in tasks_on_all_party}
            if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                cls.collect_task_of_all_party(job=job, initiator_task=initiator_task)
        new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id,
                                                    task_id=initiator_task.f_task_id,
                                                    task_version=initiator_task.f_task_version)
        task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[
                    JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                      role=job.f_role,
                                      party_id=job.f_party_id)
                ]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # an upstream task has not succeeded, so this task cannot start yet
                    break
            else:
                # all upstream dependent tasks have succeeded, so this task can start
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # resources could not be applied; wait for the next round of scheduling
                    schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} cannot apply resource, wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    break
    else:
        schedule_logger(job_id=job.f_job_id).info("cancel signal received, skip starting job {} tasks".format(job.f_job_id))
    schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
    return scheduling_status_code, initiator_tasks_group.values()
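# The dependency check in schedule relies on Python's for/else: the else
# branch runs only if the loop completes without break, i.e. every upstream
# task is SUCCESS. A standalone illustration of the same control flow:
def all_dependencies_ready(statuses):
    for status in statuses:
        if status != "success":
            break            # a dependency is not done; skip this task
    else:
        return True          # loop finished: every dependency succeeded
    return False

assert all_dependencies_ready(["success", "success"]) is True
assert all_dependencies_ready(["success", "running"]) is False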
@classmethod
def start_clean_job(cls, **kwargs):
    tasks = JobSaver.query_task(**kwargs)
    if not tasks:
        raise Exception('no task found')
    for task in tasks:
        try:
            # clean session
            stat_logger.info('start {} {} {} {} session stop'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            start_session_stop(task)
            stat_logger.info('stop {} {} {} {} session success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception:
            # session stop failures are deliberately ignored
            pass
        try:
            # clean data table
            JobClean.clean_table(job_id=task.f_job_id, role=task.f_role,
                                 party_id=task.f_party_id, component_name=task.f_component_name)
        except Exception as e:
            stat_logger.info('delete {} {} {} {} data table failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
        try:
            # clean metric data
            stat_logger.info('start delete {} {} {} {} metric data'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            delete_metric_data({'job_id': task.f_job_id,
                                'role': task.f_role,
                                'party_id': task.f_party_id,
                                'component_name': task.f_component_name})
            stat_logger.info('delete {} {} {} {} metric data success'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
        except Exception as e:
            stat_logger.info('delete {} {} {} {} metric data failed'.format(
                task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            stat_logger.exception(e)
def stop_task(job_id, component_name, task_id, task_version, role, party_id, stop_status):
    tasks = JobSaver.query_task(job_id=job_id, task_id=task_id, task_version=task_version,
                                role=role, party_id=int(party_id))
    kill_status = True
    for task in tasks:
        # compute the per-task result first so every task is stopped even after a failure
        kill_task_status = TaskController.stop_task(task=task, stop_status=stop_status)
        kill_status = kill_status and kill_task_status
    return get_json_result(retcode=RetCode.SUCCESS if kill_status else RetCode.EXCEPTION_ERROR,
                           retmsg='success' if kill_status else 'failed')
@classmethod
def federated_task_status(cls, job_id, task_id, task_version):
    tasks_on_all_party = JobSaver.query_task(task_id=task_id, task_version=task_version)
    # an idmapping role's status can only be ignored if all non-idmapping roles succeeded
    ignore_idmapping = all(
        task.f_party_status == TaskStatus.SUCCESS
        for task in tasks_on_all_party if 'idmapping' not in task.f_role
    )
    if ignore_idmapping:
        tasks_party_status = [task.f_party_status for task in tasks_on_all_party if 'idmapping' not in task.f_role]
    else:
        tasks_party_status = [task.f_party_status for task in tasks_on_all_party]
    status = cls.calculate_multi_party_task_status(tasks_party_status)
    schedule_logger(job_id=job_id).info(
        "job {} task {} {} status is {}, calculated from task party status list: {}".format(
            job_id, task_id, task_version, status, tasks_party_status))
    return status
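# A small worked example of the idmapping filter above, using made-up role
# names and plain string statuses: when every non-idmapping party succeeded,
# the idmapping party's status is dropped before aggregation; otherwise all
# party statuses count.
parties = [("guest", "success"), ("host", "success"), ("idmapping_host", "running")]
non_idmapping_ok = all(s == "success" for r, s in parties if "idmapping" not in r)
if non_idmapping_ok:
    statuses = [s for r, s in parties if "idmapping" not in r]
else:
    statuses = [s for r, s in parties]
assert statuses == ["success", "success"]  # "running" from idmapping is ignored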
@classmethod
def detect_running_task(cls):
    detect_logger().info('start to detect running task...')
    count = 0
    try:
        running_tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                            run_on_this_party=True,
                                            run_ip=RuntimeConfig.JOB_SERVER_HOST,
                                            only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            count += 1
            try:
                # the task is considered dead if its executor process no longer exists
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger(job_id=task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'.format(
                            task.f_job_id, task.f_task_id, task.f_task_version,
                            task.f_role, task.f_party_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs, stop_msg="task executor process abort", stop_status=JobStatus.CANCELED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detecting {count} running tasks")