Exemple #1
0
 def start_job(cls, job_id, initiator_role, initiator_party_id):
     schedule_logger(job_id=job_id).info(
         "Try to start job {} on initiator {} {}".format(
             job_id, initiator_role, initiator_party_id))
     job_info = {}
     job_info["job_id"] = job_id
     job_info["role"] = initiator_role
     job_info["party_id"] = initiator_party_id
     job_info["status"] = JobStatus.RUNNING
     job_info["party_status"] = JobStatus.RUNNING
     job_info["start_time"] = current_timestamp()
     job_info["tag"] = 'end_waiting'
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
         FederatedScheduler.start_job(job=job)
         schedule_logger(job_id=job_id).info(
             "start job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
     else:
         schedule_logger(job_id=job_id).error(
             "can not found job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
Exemple #2
0
 def report_task_to_initiator(cls, task_info):
     tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                 task_version=task_info["task_version"],
                                 role=task_info["role"],
                                 party_id=task_info["party_id"])
     if tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
         FederatedScheduler.report_task_to_initiator(task=tasks[0])
Exemple #3
0
 def schedule_rerun_job(cls, job):
     if EndStatus.contains(job.f_status):
         job.f_status = JobStatus.WAITING
         job.f_ready_signal = False
         job.f_ready_time = None
         job.f_rerun_signal = False
         job.f_progress = 0
         job.f_end_time = None
         job.f_elapsed = None
         schedule_logger(job_id=job.f_job_id).info(
             f"job {job.f_job_id} has been finished, set waiting to rerun")
         status, response = FederatedScheduler.sync_job_status(job=job)
         if status == FederatedSchedulingStatusCode.SUCCESS:
             cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
             FederatedScheduler.sync_job(job=job,
                                         update_fields=[
                                             "ready_signal", "ready_time",
                                             "rerun_signal", "progress",
                                             "end_time", "elapsed"
                                         ])
             schedule_logger(job_id=job.f_job_id).info(
                 f"job {job.f_job_id} set waiting to rerun successfully")
         else:
             schedule_logger(job_id=job.f_job_id).info(
                 f"job {job.f_job_id} set waiting to rerun failed")
     else:
         cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
         cls.schedule_running_job(job)
Exemple #4
0
 def schedule_waiting_jobs(cls, event):
     job_id, initiator_role, initiator_party_id, = event.f_job_id, event.f_initiator_role, event.f_initiator_party_id,
     update_status = JobQueue.update_event(job_id=job_id,
                                           initiator_role=initiator_role,
                                           initiator_party_id=initiator_party_id,
                                           job_status=JobStatus.READY)
     if not update_status:
         schedule_logger(job_id).info(f"job {job_id} may be handled by another scheduler")
         return
     # apply resource on all party
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if not jobs:
         JobQueue.delete_event(job_id=job_id)
         return
     job = jobs[0]
     apply_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.APPLY)
     if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
         cls.start_job(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
         JobQueue.update_event(job_id=job_id,
                               initiator_role=initiator_role,
                               initiator_party_id=initiator_party_id,
                               job_status=JobStatus.RUNNING)
     else:
         # rollback resource
         rollback_party = {}
         failed_party = {}
         for dest_role in federated_response.keys():
             for dest_party_id in federated_response[dest_role].keys():
                 retcode = federated_response[dest_role][dest_party_id]["retcode"]
                 if retcode == 0:
                     rollback_party[dest_role] = rollback_party.get(dest_role, [])
                     rollback_party[dest_role].append(dest_party_id)
                 else:
                     failed_party[dest_role] = failed_party.get(dest_role, [])
                     failed_party[dest_role].append(dest_party_id)
         schedule_logger(job_id).info("job {} apply resource failed on {}, rollback {}".format(
             job_id,
             ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in failed_party.items()]),
             ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in rollback_party.items()]),
         ))
         if rollback_party:
             return_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.RETURN, specific_dest=rollback_party)
             if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
                 schedule_logger(job_id).info(f"job {job_id} return resource failed:\n{federated_response}")
         else:
             schedule_logger(job_id).info(f"job {job_id} no party should be rollback resource")
         if apply_status_code == FederatedSchedulingStatusCode.ERROR:
             cls.stop_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id, stop_status=JobStatus.FAILED)
             schedule_logger(job_id).info(f"apply resource error, stop job {job_id}")
         else:
             update_status = JobQueue.update_event(job_id=job_id,
                                                   initiator_role=initiator_role,
                                                   initiator_party_id=initiator_party_id,
                                                   job_status=JobStatus.WAITING)
             schedule_logger(job_id).info(f"update job {job_id} status to waiting {update_status}")
Exemple #5
0
 def finish(cls, job, end_status):
     schedule_logger(job_id=job.f_job_id).info(
         "Job {} finished with {}, do something...".format(
             job.f_job_id, end_status))
     cls.stop_job(job_id=job.f_job_id,
                  role=job.f_initiator_role,
                  party_id=job.f_initiator_party_id,
                  stop_status=end_status)
     FederatedScheduler.clean_job(job=job)
     schedule_logger(job_id=job.f_job_id).info(
         "Job {} finished with {}, done".format(job.f_job_id, end_status))
Exemple #6
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed, and in a very small probability they will be executed after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Exemple #7
0
    def schedule(cls, job, dsl_parser, canceled=False):
        schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
        waiting_tasks = []
        for initiator_task in initiator_tasks_group.values():
            # collect all party task party status
            if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
                tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
                tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
                if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                    cls.collect_task_of_all_party(job=job, task=initiator_task)
            new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                FederatedScheduler.sync_task_status(job=job, task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
                FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[
                        JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                          role=job.f_role,
                                          party_id=job.f_party_id
                                          )
                    ]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        break
        else:
            schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
        schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
        return scheduling_status_code, initiator_tasks_group.values()
Exemple #8
0
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info(f"stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(
            job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(
            f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(
            f"request stop job {jobs[0]} to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0],
            stop_status=stop_status,
            command_body=jobs[0].to_json())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(
                retcode=RetCode.SUCCESS,
                retmsg=f"stop job on this party {kill_status};\n"
                f"stop job on all party success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="stop job on this party {};\n"
                                   "stop job failed:\n{}".format(
                                       kill_status,
                                       json_dumps(response, indent=4)))
    else:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
Exemple #9
0
 def request_stop_jobs(cls, jobs: [Job], stop_msg, stop_status):
     if not len(jobs):
         return
     detect_logger().info(
         f"have {len(jobs)} should be stopped, because of {stop_msg}")
     for job in jobs:
         try:
             detect_logger(job_id=job.f_job_id).info(
                 f"detector request start to stop job {job.f_job_id}, because of {stop_msg}"
             )
             FederatedScheduler.request_stop_job(job=job,
                                                 stop_status=stop_status)
             detect_logger(job_id=job.f_job_id).info(
                 f"detector request stop job {job.f_job_id} successfully")
         except Exception as e:
             detect_logger(job_id=job.f_job_id).exception(e)
Exemple #10
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
     tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
     if not len(tasks_status_on_all) > 1 and not TaskStatus.RUNNING in tasks_status_on_all:
         return
     status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             if party_response["retcode"] == RetCode.SUCCESS:
                 JobSaver.update_task_status(task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
             elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
Exemple #11
0
 def stop_job(cls, job_id, role, party_id, stop_status):
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status}")
     jobs = JobSaver.query_job(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               is_initiator=True)
     if len(jobs) > 0:
         if stop_status == JobStatus.CANCELED:
             schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
             set_cancel_status = cls.cancel_signal(job_id=job_id,
                                                   set_or_reset=True)
             schedule_logger(job_id=job_id).info(
                 f"set job {job_id} cancel signal {set_cancel_status}")
         job = jobs[0]
         job.f_status = stop_status
         schedule_logger(job_id=job_id).info(
             f"request stop job {job_id} with {stop_status} to all party")
         status_code, response = FederatedScheduler.stop_job(
             job=jobs[0], stop_status=stop_status)
         if status_code == FederatedSchedulingStatusCode.SUCCESS:
             schedule_logger(job_id=job_id).info(
                 f"stop job {job_id} with {stop_status} successfully")
             return RetCode.SUCCESS, "success"
         else:
             schedule_logger(job_id=job_id).info(
                 f"stop job {job_id} with {stop_status} failed, {response}")
             return RetCode.FEDERATED_ERROR, json_dumps(response)
     else:
         return RetCode.SUCCESS, "can not found job"
Exemple #12
0
def clean_queue():
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for job in jobs:
        status_code, response = FederatedScheduler.request_stop_job(
            job=job, stop_status=JobStatus.CANCELED)
        clean_status[job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
Exemple #13
0
 def schedule_waiting_jobs(cls, job):
     job_id, initiator_role, initiator_party_id, = job.f_job_id, job.f_initiator_role, job.f_initiator_party_id,
     if not cls.ready_signal(job_id=job_id, set_or_reset=True):
         schedule_logger(job_id).info(f"job {job_id} may be handled by another scheduler")
         return
     try:
         if job.f_cancel_signal:
             job.f_status = JobStatus.CANCELED
             FederatedScheduler.sync_job_status(job=job)
             schedule_logger(job_id).info(f"job {job_id} have cancel signal")
             return
         apply_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.APPLY)
         if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
             cls.start_job(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
         else:
             # rollback resource
             rollback_party = {}
             failed_party = {}
             for dest_role in federated_response.keys():
                 for dest_party_id in federated_response[dest_role].keys():
                     retcode = federated_response[dest_role][dest_party_id]["retcode"]
                     if retcode == 0:
                         rollback_party[dest_role] = rollback_party.get(dest_role, [])
                         rollback_party[dest_role].append(dest_party_id)
                     else:
                         failed_party[dest_role] = failed_party.get(dest_role, [])
                         failed_party[dest_role].append(dest_party_id)
             schedule_logger(job_id).info("job {} apply resource failed on {}, rollback {}".format(
                 job_id,
                 ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in failed_party.items()]),
                 ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in rollback_party.items()]),
             ))
             if rollback_party:
                 return_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.RETURN, specific_dest=rollback_party)
                 if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
                     schedule_logger(job_id).info(f"job {job_id} return resource failed:\n{federated_response}")
             else:
                 schedule_logger(job_id).info(f"job {job_id} no party should be rollback resource")
             if apply_status_code == FederatedSchedulingStatusCode.ERROR:
                 cls.stop_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id, stop_status=JobStatus.FAILED)
                 schedule_logger(job_id).info(f"apply resource error, stop job {job_id}")
     except Exception as e:
         raise e
     finally:
         update_status = cls.ready_signal(job_id=job_id, set_or_reset=False)
         schedule_logger(job_id).info(f"reset job {job_id} ready signal {update_status}")
Exemple #14
0
 def collect_task_of_all_party(cls, job, task):
     status, federated_response = FederatedScheduler.collect_task(job=job, task=task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} failed")
         return
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             JobSaver.update_task_status(task_info=party_response["data"])
             JobSaver.update_task(task_info=party_response["data"])
Exemple #15
0
def component_output_data_table():
    request_data = request.json
    detect_utils.check_config(config=request_data, required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if jobs:
        job = jobs[0]
        return jsonify(FederatedScheduler.tracker_command(job, request_data, 'output/table'))
    else:
        return get_json_result(retcode=100, retmsg='No found job')
Exemple #16
0
    def schedule_running_job(cls, job):
        schedule_logger(job_id=job.f_job_id).info("scheduling job {}".format(job.f_job_id))

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                       runtime_conf=job.f_runtime_conf_on_party,
                                                       train_runtime_conf=job.f_train_runtime_conf)
        task_scheduling_status_code, tasks = TaskScheduler.schedule(job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
        tasks_status = [task.f_status for task in tasks]
        new_job_status = cls.calculate_job_status(task_scheduling_status_code=task_scheduling_status_code, tasks_status=tasks_status)
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED
        total, finished_count = cls.calculate_job_progress(tasks_status=tasks_status)
        new_progress = float(finished_count) / total * 100
        schedule_logger(job_id=job.f_job_id).info("Job {} status is {}, calculate by task status list: {}".format(job.f_job_id, new_job_status, tasks_status))
        if new_job_status != job.f_status or new_progress != job.f_progress:
            # Make sure to update separately, because these two fields update with anti-weight logic
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(job=job, update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        schedule_logger(job_id=job.f_job_id).info("finish scheduling job {}".format(job.f_job_id))
Exemple #17
0
 def start_task(cls, job, task):
     schedule_logger(job_id=task.f_job_id).info("try to start job {} task {} {} on {} {}".format(task.f_job_id, task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
     apply_status = ResourceManager.apply_for_task_resource(task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not apply_status:
         return SchedulingStatusCode.NO_RESOURCE
     task.f_status = TaskStatus.RUNNING
     update_status = JobSaver.update_task_status(task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not update_status:
         # Another scheduler scheduling the task
         schedule_logger(job_id=task.f_job_id).info("job {} task {} {} start on another scheduler".format(task.f_job_id, task.f_task_id, task.f_task_version))
         # Rollback
         task.f_status = TaskStatus.WAITING
         ResourceManager.return_task_resource(task_info=task.to_human_model_dict(only_primary_with=["status"]))
         return SchedulingStatusCode.PASS
     schedule_logger(job_id=task.f_job_id).info("start job {} task {} {} on {} {}".format(task.f_job_id, task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
     FederatedScheduler.sync_task_status(job=job, task=task)
     task_parameters = {}
     status_code, response = FederatedScheduler.start_task(job=job, task=task, task_parameters=task_parameters)
     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         return SchedulingStatusCode.SUCCESS
     else:
         return SchedulingStatusCode.FAILED
Exemple #18
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf_on_party,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Exemple #19
0
def rerun_job():
    job_id = request.json.get("job_id")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        status_code, response = FederatedScheduler.request_rerun_job(
            job=jobs[0], command_body=request.json)
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(retcode=RetCode.SUCCESS,
                                   retmsg="rerun job success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="rerun job failed:\n{}".format(
                                       json_dumps(response)))
    else:
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
Exemple #20
0
 def align_input_data_partitions(cls, job: Job, dsl_parser):
     job_args = dsl_parser.get_args_input()
     dataset = JobController.get_dataset(is_initiator=True,
                                         role=job.f_initiator_role,
                                         party_id=job.f_initiator_party_id,
                                         roles=job.f_roles, job_args=job_args)
     input_data_aligned_partitions = 0  # Large integers are not used
     status, partys_response = FederatedScheduler.align_args(job=job, command_body=dataset)
     schedule_logger(job_id=job.f_job_id).info("align partition partys response: {}".format(partys_response))
     for role, party_args in partys_response.items():
         for party_id, response in party_args.items():
             if not response["retcode"]:
                 partitions = response.get('data').get('min_input_data_partition')
                 if partitions:
                     if input_data_aligned_partitions == 0 or partitions < input_data_aligned_partitions:
                         input_data_aligned_partitions = partitions
     return input_data_aligned_partitions
Exemple #21
0
 def stop_job(cls, job_id, role, party_id, stop_status):
     schedule_logger(job_id=job_id).info(f"request stop job {job_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id, is_initiator=True)
     if len(jobs) > 0:
         schedule_logger(job_id=job_id).info(f"initiator cancel job {job_id}")
         JobController.cancel_job(job_id=job_id, role=role, party_id=party_id)
         job = jobs[0]
         job.f_status = stop_status
         schedule_logger(job_id=job_id).info(f"request cancel job {job_id} to all party")
         status_code, response = FederatedScheduler.stop_job(job=jobs[0], stop_status=stop_status)
         if status_code == FederatedSchedulingStatusCode.SUCCESS:
             schedule_logger(job_id=job_id).info(f"cancel job {job_id} successfully")
             return RetCode.SUCCESS, "success"
         else:
             schedule_logger(job_id=job_id).info(f"cancel job {job_id} failed, {response}")
             return RetCode.FEDERATED_ERROR, json_dumps(response)
     else:
         schedule_logger(job_id=job_id).info(f"can not found job {job_id} to stop, delete event on {role} {party_id}")
         JobQueue.delete_event(job_id=job_id)
         return RetCode.SUCCESS, "can not found job, delete job waiting event"
Exemple #22
0
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        stat_logger.info(f"request stop job {jobs[0]} to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0],
            stop_status=stop_status,
            command_body=jobs[0].to_json())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(retcode=RetCode.SUCCESS,
                                   retmsg="stop job success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="stop job failed:\n{}".format(
                                       json_dumps(response, indent=4)))
    else:
        stat_logger.info(f"can not found job {jobs[0]} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
Exemple #23
0
    def submit(cls, job_data, job_id=None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(
            job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_job_runtime_conf(job_runtime_conf)

        job_initiator = job_runtime_conf['initiator']
        conf_adapter = JobRuntimeConfigAdapter(job_runtime_conf)
        common_job_parameters = conf_adapter.get_common_parameters()

        if common_job_parameters.job_type != 'predict':
            # generate job model info
            common_job_parameters.model_id = model_utils.gen_model_id(
                job_runtime_conf['role'])
            common_job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            # check predict job parameters
            detect_utils.check_config(common_job_parameters.to_dict(),
                                      ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(
                job_id=job_id,
                role=job_initiator['role'],
                party_id=job_initiator['party_id'],
                model_id=common_job_parameters.model_id,
                model_version=common_job_parameters.model_version)
            pipeline_model = tracker.get_output_model('pipeline')
            if not job_dsl:
                job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(
                pipeline_model['Pipeline'].train_runtime_conf)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = job_dsl
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = job_runtime_conf['role']
        job.f_work_mode = common_job_parameters.work_mode
        job.f_initiator_role = job_initiator['role']
        job.f_initiator_party_id = job_initiator['party_id']

        path_dict = job_utils.save_job_conf(
            job_id=job_id,
            role=job.f_initiator_role,
            job_dsl=job_dsl,
            job_runtime_conf=job_runtime_conf,
            job_runtime_conf_on_party={},
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        if job.f_initiator_party_id not in job_runtime_conf['role'][
                job.f_initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(
                job.f_initiator_party_id))
            raise Exception("initiator party id error {}".format(
                job.f_initiator_party_id))

        # create common parameters on initiator
        JobController.backend_compatibility(
            job_parameters=common_job_parameters)
        JobController.adapt_job_parameters(
            role=job.f_initiator_role,
            job_parameters=common_job_parameters,
            create_initiator_baseline=True)

        job.f_runtime_conf = conf_adapter.update_common_parameters(
            common_parameters=common_job_parameters)
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)

        # initiator runtime conf as template
        job.f_runtime_conf_on_party = job.f_runtime_conf.copy()
        job.f_runtime_conf_on_party[
            "job_parameters"] = common_job_parameters.to_dict()

        if common_job_parameters.work_mode == WorkMode.CLUSTER:
            # Save the status information of all participants in the initiator for scheduling
            for role, party_ids in job.f_roles.items():
                for party_id in party_ids:
                    if role == job.f_initiator_role and party_id == job.f_initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, role, party_id,
                                                   False, job.f_initiator_role,
                                                   job.f_initiator_party_id,
                                                   common_job_parameters,
                                                   dsl_parser)

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            job.f_status = JobStatus.FAILED
            job.f_tag = "submit_failed"
            FederatedScheduler.sync_job_status(job=job)
            raise Exception("create job failed", response)

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(
                job.f_job_id, common_job_parameters.model_id))
        board_url = "http://{}:{}{}".format(
            ServiceUtils.get_item("fateboard", "host"),
            ServiceUtils.get_item("fateboard", "port"),
            FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id,
                                                  job_initiator['role'],
                                                  job_initiator['party_id'])
        logs_directory = job_utils.get_job_log_directory(job_id)
        submit_result = {
            "job_id": job_id,
            "model_info": {
                "model_id": common_job_parameters.model_id,
                "model_version": common_job_parameters.model_version
            },
            "logs_directory": logs_directory,
            "board_url": board_url
        }
        submit_result.update(path_dict)
        return submit_result
Exemple #24
0
    def submit(cls, job_data, job_id=None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_initiator = job_runtime_conf['initiator']
        job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
        cls.backend_compatibility(job_parameters=job_parameters)

        job_utils.check_job_runtime_conf(job_runtime_conf)
        if job_parameters.job_type != 'predict':
            # generate job model info
            job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
            job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                              model_id=job_parameters.model_id, model_version=job_parameters.model_version)
            pipeline_model = tracker.get_output_model('pipeline')
            if not job_dsl:
                job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)

        path_dict = job_utils.save_job_conf(job_id=job_id,
                                            job_dsl=job_dsl,
                                            job_runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf,
                                            pipeline_dsl=None)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = job_dsl
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = job_runtime_conf['role']
        job.f_work_mode = job_parameters.work_mode
        job.f_initiator_role = job_initiator['role']
        job.f_initiator_party_id = job_initiator['party_id']

        initiator_role = job_initiator['role']
        initiator_party_id = job_initiator['party_id']
        if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
            raise Exception("initiator party id error {}".format(initiator_party_id))

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=train_runtime_conf)

        cls.adapt_job_parameters(job_parameters=job_parameters)

        # update runtime conf
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("create job failed: {}".format(response))

        if job_parameters.work_mode == WorkMode.CLUSTER:
            # Save the status information of all participants in the initiator for scheduling
            for role, party_ids in job_runtime_conf["role"].items():
                for party_id in party_ids:
                    if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                        continue
                    JobController.initialize_tasks(job_id, role, party_id, False, job_initiator, job_parameters, dsl_parser)

        # push into queue
        try:
            JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
        except Exception as e:
            raise Exception(f'push job into queue failed:\n{e}')

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id))
        board_url = "http://{}:{}{}".format(
            ServiceUtils.get_item("fateboard", "host"),
            ServiceUtils.get_item("fateboard", "port"),
            FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
        logs_directory = job_utils.get_job_log_directory(job_id)
        return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
               {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url
Exemple #25
0
 def finish(cls, job, end_status):
     schedule_logger(job_id=job.f_job_id).info("Job {} finished with {}, do something...".format(job.f_job_id, end_status))
     FederatedScheduler.stop_job(job=job, stop_status=end_status)
     FederatedScheduler.clean_job(job=job)
     JobQueue.delete_event(job_id=job.f_job_id)
     schedule_logger(job_id=job.f_job_id).info("Job {} finished with {}, done".format(job.f_job_id, end_status))