Example #1
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     """Create new task versions for a job on the initiator and re-queue the job.

     The job is looked up by ``job_id`` / ``initiator_role`` / ``initiator_party_id``.
     When ``component_name`` equals ``job_utils.job_virtual_component_name()`` every
     task of the job on the initiator is considered, otherwise only the tasks of
     that component.

     Tasks whose status is WAITING or COMPLETE are left untouched. Every other
     task is stopped with status CANCELED, its "metrics" content is cleaned, and
     a task with an incremented version is created.

     If at least one task is WAITING or was re-created, and the job is already in
     an end status, the job is reset to WAITING, synced, and a new queue event is
     created for it.

     Args:
         job_id: id of the job to rerun.
         initiator_role: role of the initiator party.
         initiator_party_id: party id of the initiator.
         component_name: component whose tasks should be rerun, or the virtual
             job component name to consider all tasks.

     Raises:
         RuntimeError: if no job matches the given id / role / party id.
     """
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         # only the first matching record is used
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     # the virtual component name stands for "the whole job": do not filter by component
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     # becomes True as soon as one task is WAITING or has been given a new version
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             # WAITING tasks will still be scheduled, so the job is rerunnable;
             # COMPLETE tasks are skipped without affecting the flag
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task: same task object, bumped version, run location cleared
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     # the initiator's own record is skipped here
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             # the job already ended: reset its status and timing fields before re-queueing
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 # only sync the reset fields and create the queue event once the status sync succeeded
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed, and in a very small probability they will be executed after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Example #2
0
def upload_history():
    """Return upload information for successful ``upload_0`` tasks of this party.

    The JSON request body may carry ``job_id`` (restrict the query to one job)
    and ``limit`` (keep only the last ``limit`` tasks). The tasks are handed on
    in reverse of the order returned by the query.
    """
    payload = request.json
    query_filters = {
        'component_name': 'upload_0',
        'status': StatusSet.SUCCESS,
        'run_on_this_party': True,
    }
    job_id = payload.get('job_id')
    if job_id:
        query_filters['job_id'] = job_id
    tasks = JobSaver.query_task(**query_filters)

    limit = payload.get('limit')
    # walk the result backwards; with a limit, stop after `limit` entries
    tasks = tasks[-1:-limit - 1:-1] if limit else tasks[-1::-1]

    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=get_upload_info(jobs_run_conf=jobs_run_conf))
Example #3
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     """Create new task versions for a job on the initiator and raise its rerun signal.

     The job is looked up by ``job_id`` / ``initiator_role`` / ``initiator_party_id``.
     When ``component_name`` equals ``job_utils.job_virtual_component_name()`` every
     task of the job on the initiator is considered, otherwise only the tasks of
     that component.

     Tasks whose status is WAITING or SUCCESS are left untouched. Every other
     task is stopped with status CANCELED, its "metrics" content is cleaned, and
     a task with an incremented version is created.

     If at least one task is WAITING or was re-created, the rerun signal is set
     through ``cls.rerun_signal``; otherwise the job status is synced and nothing
     else happens.

     Args:
         job_id: id of the job to rerun.
         initiator_role: role of the initiator party.
         initiator_party_id: party id of the initiator.
         component_name: component whose tasks should be rerun, or the virtual
             job component name to consider all tasks.

     Raises:
         RuntimeError: if no job matches the given id / role / party id.
     """
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         # only the first matching record is used
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     # the virtual component name stands for "the whole job": do not filter by component
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     # becomes True as soon as one task is WAITING or has been given a new version
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf_on_party,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             # WAITING tasks will still be scheduled, so the job is rerunnable;
             # SUCCESS tasks are skipped without affecting the flag
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task: same task object, bumped version, run location cleared
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     # the initiator's own record is skipped here
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
         # the outcome of setting the signal is only logged, it is not returned or raised
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
     else:
         # nothing to rerun: just sync the current job status
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Example #4
0
def query_task():
    """Query tasks with the filters given in the JSON request body.

    Responds with retcode 101 when nothing matches, otherwise with retcode 0
    and the JSON form of every matching task.
    """
    found = JobSaver.query_task(**request.json)
    if found:
        serialized = [item.to_json() for item in found]
        return get_json_result(retcode=0, retmsg='success', data=serialized)
    return get_json_result(retcode=101, retmsg='find task failed')
Example #5
0
 def report_task_to_initiator(cls, task_info):
     """Report a task to the initiator when its status collect type is PUSH.

     The task is looked up by the ``task_id``, ``task_version``, ``role`` and
     ``party_id`` entries of ``task_info``; the first match is used.
     """
     lookup_keys = ("task_id", "task_version", "role", "party_id")
     matched = JobSaver.query_task(**{key: task_info[key] for key in lookup_keys})
     task = matched[0]
     if task.f_federated_status_collect_type != FederatedCommunicationType.PUSH:
         # other collect types are not pushed from here
         return
     FederatedScheduler.report_task_to_initiator(task=task)
Example #6
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     """Collect a task's state from all parties and save it through ``JobSaver``.

     Nothing is collected when all locally stored records of the task share one
     status and that status is not RUNNING. For parties answering with
     FEDERATED_ERROR, and only when ``set_status`` is given, the party status is
     first written as RUNNING and then as ``set_status``.
     """
     party_tasks = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
     distinct_statuses = {party_task.f_status for party_task in party_tasks}
     if len(distinct_statuses) <= 1 and TaskStatus.RUNNING not in distinct_statuses:
         return

     status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         # a failed collect is only logged; whatever responses came back are still processed
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")

     for _role, role_responses in federated_response.items():
         for _party_id, party_response in role_responses.items():
             retcode = party_response["retcode"]
             if retcode == RetCode.SUCCESS:
                 collected = party_response["data"]
                 JobSaver.update_task_status(task_info=collected)
                 JobSaver.update_task(task_info=collected)
             elif retcode == RetCode.FEDERATED_ERROR and set_status:
                 forced_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                 }
                 # two consecutive writes on the same dict: RUNNING first, then the requested status
                 for forced_status in (TaskStatus.RUNNING, set_status):
                     forced_info["party_status"] = forced_status
                     JobSaver.update_task_status(task_info=forced_info)
Example #7
0
 def stop_job(cls, job, stop_status):
     """Stop every task of ``job`` on this role/party with ``stop_status``.

     Returns a tuple ``(kill_status, kill_details)``: the combined result of all
     stop attempts, and a dict mapping each task id to 'success' or 'failed'.
     """
     kill_details = {}
     kill_status = True
     # every task gets a stop attempt, even after an earlier one failed
     for task in JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True):
         stopped = TaskController.stop_task(task=task, stop_status=stop_status)
         kill_status &= stopped
         kill_details[task.f_task_id] = 'success' if stopped else 'failed'
     return kill_status, kill_details
Example #8
0
 def federated_task_status(cls, job_id, task_id, task_version):
     """Derive one status for a task version from the party status of all its records.

     The list of party statuses is passed to
     ``cls.calculate_multi_party_task_status``; the result is logged and returned.
     """
     tasks_party_status = []
     for party_task in JobSaver.query_task(task_id=task_id, task_version=task_version):
         tasks_party_status.append(party_task.f_party_status)

     status = cls.calculate_multi_party_task_status(tasks_party_status)

     job_logger = schedule_logger(job_id=job_id)
     job_logger.info(
         "job {} task {} {} status is {}, calculate by task party status list: {}"
         .format(job_id, task_id, task_version, status, tasks_party_status))
     return status
Example #9
0
 def collect_task(cls, job_id, component_name, task_id, task_version, role,
                  party_id):
     """Return the collectable fields of one task, or ``None`` if it is not found.

     The first task matching all given identifiers is converted with
     ``to_human_model_dict`` restricted to ``cls.INITIATOR_COLLECT_FIELDS``.
     """
     matches = JobSaver.query_task(
         job_id=job_id,
         component_name=component_name,
         task_id=task_id,
         task_version=task_version,
         role=role,
         party_id=party_id,
     )
     if not matches:
         return None
     first_match = matches[0]
     return first_match.to_human_model_dict(only_primary_with=cls.INITIATOR_COLLECT_FIELDS)
Example #10
0
 def stop_job(cls, job, stop_status):
     """Stop every task of ``job`` on this role/party, then mark the job itself.

     When all stop attempts succeed, the job status is updated to ``stop_status``
     through ``JobController.update_job_status``.

     Returns a tuple ``(kill_status, kill_details)``: the combined result of all
     stop attempts, and a dict mapping each task id to 'success' or 'failed'.
     """
     kill_details = {}
     kill_status = True
     # every task gets a stop attempt, even after an earlier one failed
     for task in JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True):
         stopped = TaskController.stop_task(task=task, stop_status=stop_status)
         kill_status &= stopped
         kill_details[task.f_task_id] = 'success' if stopped else 'failed'

     if not kill_status:
         return kill_status, kill_details

     job_info = job.to_human_model_dict(only_primary_with=["status"])
     job_info["status"] = stop_status
     JobController.update_job_status(job_info)
     return kill_status, kill_details
Example #11
0
    def schedule(cls, job, dsl_parser, canceled=False):
        """Run one scheduling round over the initiator's tasks of ``job``.

        First pass: for every initiator task the federated status is recomputed
        (after pulling party statuses when the job uses the PULL collect type);
        a changed status is stored on the task and synced, WAITING tasks are
        remembered, and tasks that just reached an end status are stopped.

        Second pass (skipped when ``canceled`` is true): each WAITING task whose
        upstream dependent components all have status SUCCESS is started.

        Args:
            job: the job being scheduled; its role / party id select the tasks.
            dsl_parser: used to look up the upstream dependent components of a task.
            canceled: when true, no waiting task is started in this round.

        Returns:
            A tuple ``(scheduling_status_code, tasks)`` where ``tasks`` is the
            ``values()`` view of the initiator task group. The code is NO_NEXT if
            no task was startable, HAVE_NEXT if at least one start was attempted,
            and FAILED if a start attempt returned FAILED.
        """
        schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
        # mapping of task key -> task; it is indexed by JobSaver.task_key(...) further down
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
        waiting_tasks = []
        for initiator_task in initiator_tasks_group.values():
            # collect all party task party status
            if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
                tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
                tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
                # only pull when the stored records disagree or something is still running
                if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                    cls.collect_task_of_all_party(job=job, task=initiator_task)
            new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                FederatedScheduler.sync_task_status(job=job, task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
                # the task reached an end status in this round: stop it with that status
                FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[
                        JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                          role=job.f_role,
                                          party_id=job.f_party_id
                                          )
                    ]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        # (this break leaves the inner loop only, skipping its else clause;
                        # the next waiting task is examined)
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    # the breaks below sit in the else clause, so they end the outer
                    # loop over waiting tasks
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        break
        else:
            schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
        schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
        return scheduling_status_code, initiator_tasks_group.values()
Example #12
0
 def start_clean_job(cls, **kwargs):
     """Clean session, data tables and metric data of every task matching ``kwargs``.

     The three cleanup steps are independent and best-effort: a failure in one
     step is logged (message plus traceback) and the remaining steps and tasks
     are still processed.

     Args:
         **kwargs: filters forwarded unchanged to ``JobSaver.query_task``.

     Raises:
         Exception: if no task matches the filters.
     """
     tasks = JobSaver.query_task(**kwargs)
     if not tasks:
         raise Exception('no found task')
     for task in tasks:
         # identifying fields used in every log message below
         task_desc = (task.f_job_id, task.f_role, task.f_party_id,
                      task.f_component_name)
         try:
             # clean session
             stat_logger.info(
                 'start {} {} {} {} session stop'.format(*task_desc))
             start_session_stop(task)
             stat_logger.info(
                 'stop {} {} {} {} session success'.format(*task_desc))
         except Exception as e:
             # still best-effort, but the failure is now logged like the
             # other two cleanup steps instead of being silently dropped
             stat_logger.info(
                 'stop {} {} {} {} session failed'.format(*task_desc))
             stat_logger.exception(e)
         try:
             # clean data table
             JobClean.clean_table(job_id=task.f_job_id,
                                  role=task.f_role,
                                  party_id=task.f_party_id,
                                  component_name=task.f_component_name)
         except Exception as e:
             stat_logger.info(
                 'delete {} {} {} {} data table failed'.format(*task_desc))
             stat_logger.exception(e)
         try:
             # clean metric data
             stat_logger.info(
                 'start delete {} {} {} {} metric data'.format(*task_desc))
             delete_metric_data({
                 'job_id': task.f_job_id,
                 'role': task.f_role,
                 'party_id': task.f_party_id,
                 'component_name': task.f_component_name
             })
             stat_logger.info(
                 'delete {} {} {} {} metric data success'.format(*task_desc))
         except Exception as e:
             stat_logger.info(
                 'delete {} {} {} {} metric data failed'.format(*task_desc))
             stat_logger.exception(e)
Example #13
0
def stop_task(job_id, component_name, task_id, task_version, role, party_id,
              stop_status):
    """Stop all matching tasks and report the combined outcome as a JSON result.

    ``party_id`` is converted to ``int`` for the query; ``component_name`` is
    accepted but not used as a filter. Every matching task gets a stop attempt,
    and the result is successful only if all attempts succeeded.
    """
    matched = JobSaver.query_task(
        job_id=job_id,
        task_id=task_id,
        task_version=task_version,
        role=role,
        party_id=int(party_id),
    )
    kill_status = True
    for task in matched:
        kill_status &= TaskController.stop_task(task=task, stop_status=stop_status)

    if kill_status:
        return get_json_result(retcode=RetCode.SUCCESS, retmsg='success')
    return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg='failed')
Example #14
0
 def federated_task_status(cls, job_id, task_id, task_version):
     """Derive one status for a task version from the party status of its records.

     Records whose role contains 'idmapping' are left out of the calculation
     only when every other record has party status SUCCESS; otherwise all
     records are taken into account. The result is logged and returned.
     """
     tasks_on_all_party = JobSaver.query_task(task_id=task_id, task_version=task_version)
     # idmapping role status can only be ignored if all non-idmapping roles success
     regular_role_not_success = any(
         'idmapping' not in party_task.f_role and party_task.f_party_status != TaskStatus.SUCCESS
         for party_task in tasks_on_all_party
     )
     if regular_role_not_success:
         tasks_party_status = [party_task.f_party_status for party_task in tasks_on_all_party]
     else:
         tasks_party_status = [
             party_task.f_party_status
             for party_task in tasks_on_all_party
             if 'idmapping' not in party_task.f_role
         ]
     status = cls.calculate_multi_party_task_status(tasks_party_status)
     job_logger = schedule_logger(job_id=job_id)
     job_logger.info("job {} task {} {} status is {}, calculate by task party status list: {}".format(job_id, task_id, task_version, status, tasks_party_status))
     return status
Example #15
0
 def detect_running_task(cls):
     """Find RUNNING tasks on this server whose process is gone and stop their jobs.

     Each task returned by the query has its ``f_run_pid`` checked with
     ``job_utils.check_job_process``. Jobs owning a task without a live process
     are passed to ``cls.request_stop_jobs`` with status CANCELED. All errors are
     logged rather than raised, and the number of inspected tasks is logged at
     the end in every case.
     """
     detect_logger().info('start to detect running task..')
     count = 0
     try:
         running_tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                             run_on_this_party=True,
                                             run_ip=RuntimeConfig.JOB_SERVER_HOST,
                                             only_latest=False)
         stop_job_ids = set()
         for count, task in enumerate(running_tasks, start=1):
             try:
                 if job_utils.check_job_process(int(task.f_run_pid)):
                     continue
                 detect_logger(job_id=task.f_job_id).info(
                     'job {} task {} {} on {} {} process {} does not exist'
                     .format(task.f_job_id, task.f_task_id,
                             task.f_task_version, task.f_role,
                             task.f_party_id, task.f_run_pid))
                 stop_job_ids.add(task.f_job_id)
             except Exception as e:
                 # a problem with one task must not abort the scan of the others
                 detect_logger(job_id=task.f_job_id).exception(e)

         if stop_job_ids:
             detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))

         stop_jobs = set()
         for job_id in stop_job_ids:
             matched_jobs = JobSaver.query_job(job_id=job_id)
             if matched_jobs:
                 stop_jobs.add(matched_jobs[0])
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.CANCELED)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info(f"finish detect {count} running task")