Beispiel #1
0
 def collect_task_of_all_party(cls, job, task):
     status, federated_response = FederatedScheduler.collect_task(job=job, task=task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} failed")
         return
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             JobSaver.update_task_status(task_info=party_response["data"])
             JobSaver.update_task(task_info=party_response["data"])
Beispiel #2
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed, and in a very small probability they will be executed after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Beispiel #3
0
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get(
                                     'src_party_id', None),
                                 dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 role=role,
                                 party_id=party_id)
        job_parameters = RunParameters(**runtime_conf['job_parameters'])
        job_initiator = runtime_conf['initiator']

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=dsl,
            runtime_conf=runtime_conf,
            train_runtime_conf=train_runtime_conf)

        # save new job into db
        if role == job_initiator['role'] and party_id == job_initiator[
                'party_id']:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.WAITING
        roles = job_info['roles']
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        engines_info = cls.get_job_engines_address(
            job_parameters=job_parameters)
        cls.special_role_parameters(role=role, job_parameters=job_parameters)
        cls.check_parameters(job_parameters=job_parameters,
                             engines_info=engines_info)
        runtime_conf["job_parameters"] = job_parameters.to_dict()

        JobSaver.create_job(job_info=job_info)
        job_utils.save_job_conf(job_id=job_id,
                                job_dsl=dsl,
                                job_runtime_conf=runtime_conf,
                                train_runtime_conf=train_runtime_conf,
                                pipeline_dsl=None)

        cls.initialize_tasks(job_id, role, party_id, True, job_initiator,
                             job_parameters, dsl_parser)
        cls.initialize_job_tracker(job_id=job_id,
                                   role=role,
                                   party_id=party_id,
                                   job_info=job_info,
                                   is_initiator=is_initiator,
                                   dsl_parser=dsl_parser)
Beispiel #4
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
     tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
     if not len(tasks_status_on_all) > 1 and not TaskStatus.RUNNING in tasks_status_on_all:
         return
     status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             if party_response["retcode"] == RetCode.SUCCESS:
                 JobSaver.update_task_status(task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
             elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
Beispiel #5
0
 def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
     jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
     if not jobs:
         raise Exception("Failed to update job status on initiator")
     job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
     for field in update_fields:
         job_info[field] = getattr(initiator_job, "f_%s" % field)
     for job in jobs:
         job_info["role"] = job.f_role
         job_info["party_id"] = job.f_party_id
         JobSaver.update_job_status(job_info=job_info)
         JobSaver.update_job(job_info=job_info)
Beispiel #6
0
 def create_task(cls, role, party_id, run_on_this_party, task_info):
     task_info["role"] = role
     task_info["party_id"] = party_id
     task_info["status"] = TaskStatus.WAITING
     task_info["party_status"] = TaskStatus.WAITING
     task_info["create_time"] = base_utils.current_timestamp()
     task_info["run_on_this_party"] = run_on_this_party
     if "task_id" not in task_info:
         task_info["task_id"] = job_utils.generate_task_id(job_id=task_info["job_id"], component_name=task_info["component_name"])
     if "task_version" not in task_info:
         task_info["task_version"] = 0
     JobSaver.create_task(task_info=task_info)
Beispiel #7
0
 def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
     if role and party_id:
         jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
     else:
         jobs = JobSaver.query_job(job_id=job_id)
     kill_status = True
     kill_details = {}
     for job in jobs:
         kill_job_status, kill_job_details = cls.stop_job(job=job, stop_status=stop_status)
         kill_status = kill_status & kill_job_status
         kill_details[job_id] = kill_job_details
     return kill_status, kill_details
Beispiel #8
0
    def schedule(cls, job, dsl_parser, canceled=False):
        schedule_logger(job_id=job.f_job_id).info("scheduling job {} tasks".format(job.f_job_id))
        initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
        waiting_tasks = []
        for initiator_task in initiator_tasks_group.values():
            # collect all party task party status
            if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
                tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
                tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
                if len(tasks_status_on_all) > 1 or TaskStatus.RUNNING in tasks_status_on_all:
                    cls.collect_task_of_all_party(job=job, task=initiator_task)
            new_task_status = cls.federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
            task_status_have_update = False
            if new_task_status != initiator_task.f_status:
                task_status_have_update = True
                initiator_task.f_status = new_task_status
                FederatedScheduler.sync_task_status(job=job, task=initiator_task)

            if initiator_task.f_status == TaskStatus.WAITING:
                waiting_tasks.append(initiator_task)
            elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
                FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)

        scheduling_status_code = SchedulingStatusCode.NO_NEXT
        if not canceled:
            for waiting_task in waiting_tasks:
                for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                    dependent_task = initiator_tasks_group[
                        JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()),
                                          role=job.f_role,
                                          party_id=job.f_party_id
                                          )
                    ]
                    if dependent_task.f_status != TaskStatus.SUCCESS:
                        # can not start task
                        break
                else:
                    # all upstream dependent tasks have been successful, can start this task
                    scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                    status_code = cls.start_task(job=job, task=waiting_task)
                    if status_code == SchedulingStatusCode.NO_RESOURCE:
                        # wait for the next round of scheduling
                        schedule_logger(job_id=job.f_job_id).info(f"job {waiting_task.f_job_id} task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                        break
                    elif status_code == SchedulingStatusCode.FAILED:
                        scheduling_status_code = SchedulingStatusCode.FAILED
                        break
        else:
            schedule_logger(job_id=job.f_job_id).info("have cancel signal, pass start job {} tasks".format(job.f_job_id))
        schedule_logger(job_id=job.f_job_id).info("finish scheduling job {} tasks".format(job.f_job_id))
        return scheduling_status_code, initiator_tasks_group.values()
Beispiel #9
0
def report_task(job_id, component_name, task_id, task_version, role, party_id):
    task_info = {}
    task_info.update(request.json)
    task_info.update({
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    })
    JobSaver.update_task(task_info=task_info)
    if task_info.get("party_status"):
        JobSaver.update_status(Task, task_info)
    return get_json_result(retcode=0, retmsg='success')
Beispiel #10
0
def upload_history():
    request_data = request.json
    if request_data.get('job_id'):
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS, job_id=request_data.get('job_id'), run_on_this_party=True)
    else:
        tasks = JobSaver.query_task(component_name='upload_0', status=StatusSet.SUCCESS, run_on_this_party=True)
    limit = request_data.get('limit')
    if not limit:
        tasks = tasks[-1::-1]
    else:
        tasks = tasks[-1:-limit - 1:-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, tasks)
    data = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=data)
Beispiel #11
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf_on_party,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Beispiel #12
0
 def clean_task(cls, job_id, task_id, task_version, role, party_id,
                content_type):
     status = set()
     if content_type == "metrics":
         tracker = Tracker(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           task_id=task_id,
                           task_version=task_version)
         status.add(tracker.clean_metrics())
     elif content_type == "table":
         jobs = JobSaver.query_job(job_id=job_id,
                                   role=role,
                                   party_id=party_id)
         if jobs:
             job = jobs[0]
             job_parameters = RunParameters(
                 **job.f_runtime_conf_on_party["job_parameters"])
             tracker = Tracker(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               task_id=task_id,
                               task_version=task_version,
                               job_parameters=job_parameters)
             status.add(tracker.clean_task(job.f_runtime_conf_on_party))
     if len(status) == 1 and True in status:
         return True
     else:
         return False
Beispiel #13
0
 def update_job_status(cls, job_info):
     update_status = JobSaver.update_job_status(job_info=job_info)
     if update_status and EndStatus.contains(job_info.get("status")):
         ResourceManager.return_job_resource(job_id=job_info["job_id"],
                                             role=job_info["role"],
                                             party_id=job_info["party_id"])
     return update_status
Beispiel #14
0
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info,
                                  required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = JobSaver.query_job(job_id=job_info["job_id"],
                                      party_id=job_info["party_id"],
                                      role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(
                    job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job.f_dsl,
                runtime_conf=job.f_runtime_conf,
                train_runtime_conf=job.f_train_runtime_conf)
        else:
            job_dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"],
                                             party_id=int(
                                                 job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
Beispiel #15
0
def query_task():
    tasks = JobSaver.query_task(**request.json)
    if not tasks:
        return get_json_result(retcode=101, retmsg='find task failed')
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=[task.to_json() for task in tasks])
Beispiel #16
0
 def cancel_job(cls, job_id, role, party_id):
     schedule_logger(job_id).info(
         '{} {} get cancel waiting job {} command'.format(
             role, party_id, job_id))
     jobs = JobSaver.query_job(job_id=job_id)
     if jobs:
         job = jobs[0]
         try:
             # You cannot delete an event directly, otherwise the status might not be updated
             status = JobQueue.update_event(
                 job_id=job.f_job_id,
                 initiator_role=job.f_initiator_role,
                 initiator_party_id=job.f_initiator_party_id,
                 job_status=JobStatus.CANCELED)
             if not status:
                 return False
         except:
             return False
         schedule_logger(job_id).info(
             'cancel {} job successfully, job id is {}'.format(
                 job.f_status, job.f_job_id))
         return True
     else:
         schedule_logger(job_id).warning(
             'role {} party id {} cancel job failed, no find jod {}'.format(
                 role, party_id, job_id))
         raise Exception(
             'role {} party id {} cancel job failed, no find jod {}'.format(
                 role, party_id, job_id))
Beispiel #17
0
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info(f"stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(
            job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(
            f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(
            f"request stop job {jobs[0]} to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0],
            stop_status=stop_status,
            command_body=jobs[0].to_json())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(
                retcode=RetCode.SUCCESS,
                retmsg=f"stop job on this party {kill_status};\n"
                f"stop job on all party success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="stop job on this party {};\n"
                                   "stop job failed:\n{}".format(
                                       kill_status,
                                       json_dumps(response, indent=4)))
    else:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
Beispiel #18
0
 def update_job(cls, job_info):
     """
     Save to local database
     :param job_info:
     :return:
     """
     return JobSaver.update_job(job_info=job_info)
Beispiel #19
0
 def stop_job(cls, job_id, role, party_id, stop_status):
     schedule_logger(job_id=job_id).info(
         f"request stop job {job_id} with {stop_status}")
     jobs = JobSaver.query_job(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               is_initiator=True)
     if len(jobs) > 0:
         if stop_status == JobStatus.CANCELED:
             schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
             set_cancel_status = cls.cancel_signal(job_id=job_id,
                                                   set_or_reset=True)
             schedule_logger(job_id=job_id).info(
                 f"set job {job_id} cancel signal {set_cancel_status}")
         job = jobs[0]
         job.f_status = stop_status
         schedule_logger(job_id=job_id).info(
             f"request stop job {job_id} with {stop_status} to all party")
         status_code, response = FederatedScheduler.stop_job(
             job=jobs[0], stop_status=stop_status)
         if status_code == FederatedSchedulingStatusCode.SUCCESS:
             schedule_logger(job_id=job_id).info(
                 f"stop job {job_id} with {stop_status} successfully")
             return RetCode.SUCCESS, "success"
         else:
             schedule_logger(job_id=job_id).info(
                 f"stop job {job_id} with {stop_status} failed, {response}")
             return RetCode.FEDERATED_ERROR, json_dumps(response)
     else:
         return RetCode.SUCCESS, "can not found job"
Beispiel #20
0
 def start_job(cls, job_id, initiator_role, initiator_party_id):
     schedule_logger(job_id=job_id).info(
         "Try to start job {} on initiator {} {}".format(
             job_id, initiator_role, initiator_party_id))
     job_info = {}
     job_info["job_id"] = job_id
     job_info["role"] = initiator_role
     job_info["party_id"] = initiator_party_id
     job_info["status"] = JobStatus.RUNNING
     job_info["party_status"] = JobStatus.RUNNING
     job_info["start_time"] = current_timestamp()
     job_info["tag"] = 'end_waiting'
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
         FederatedScheduler.start_job(job=job)
         schedule_logger(job_id=job_id).info(
             "start job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
     else:
         schedule_logger(job_id=job_id).error(
             "can not found job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
Beispiel #21
0
def update_job():
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'],
                              party_id=job_info['party_id'],
                              role=job_info['role'])
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    else:
        JobSaver.update_job(
            job_info={
                'description': job_info.get('notes', ''),
                'job_id': job_info['job_id'],
                'role': job_info['role'],
                'party_id': job_info['party_id']
            })
        return get_json_result(retcode=0, retmsg='success')
Beispiel #22
0
 def report_task_to_initiator(cls, task_info):
     tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                 task_version=task_info["task_version"],
                                 role=task_info["role"],
                                 party_id=task_info["party_id"])
     if tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
         FederatedScheduler.report_task_to_initiator(task=tasks[0])
Beispiel #23
0
def clean_queue():
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    clean_status = {}
    for job in jobs:
        status_code, response = FederatedScheduler.request_stop_job(
            job=job, stop_status=JobStatus.CANCELED)
        clean_status[job.f_job_id] = status_code
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
Beispiel #24
0
def query_job():
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=0,
                               retmsg='no job could be found',
                               data=[])
    return get_json_result(retcode=0,
                           retmsg='success',
                           data=[job.to_json() for job in jobs])
Beispiel #25
0
def component_output_data_table():
    request_data = request.json
    detect_utils.check_config(config=request_data, required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if jobs:
        job = jobs[0]
        return jsonify(FederatedScheduler.tracker_command(job, request_data, 'output/table'))
    else:
        return get_json_result(retcode=100, retmsg='No found job')
Beispiel #26
0
 def stop_job(cls, job, stop_status):
     tasks = JobSaver.query_task(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id, reverse=True)
     kill_status = True
     kill_details = {}
     for task in tasks:
         kill_task_status = TaskController.stop_task(task=task, stop_status=stop_status)
         kill_status = kill_status & kill_task_status
         kill_details[task.f_task_id] = 'success' if kill_task_status else 'failed'
     return kill_status, kill_details
Beispiel #27
0
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None), src_party_id=job_info.get('src_party_id', None),
                                 dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                       runtime_conf=runtime_conf,
                                                       train_runtime_conf=train_runtime_conf)
        job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
        schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
        job_parameters = RunParameters(**job_parameters)

        # save new job into db
        if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.WAITING
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
        engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
        cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
        job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
        job_utils.save_job_conf(job_id=job_id,
                                role=role,
                                job_dsl=dsl,
                                job_runtime_conf=runtime_conf,
                                job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                                train_runtime_conf=train_runtime_conf,
                                pipeline_dsl=None)

        cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id, run_on_this_party=True, initiator_role=job_info["initiator_role"], initiator_party_id=job_info["initiator_party_id"], job_parameters=job_parameters, dsl_parser=dsl_parser)
        job_parameters = job_info['runtime_conf_on_party']['job_parameters']
        roles = job_info['roles']
        cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id, job_parameters=job_parameters, roles=roles, is_initiator=is_initiator, dsl_parser=dsl_parser)
        JobSaver.create_job(job_info=job_info)
Beispiel #28
0
def get_job_table_list():
    detect_utils.check_config(
        config=request.json, required_arguments=['job_id', 'role', 'party_id'])
    jobs = JobSaver.query_job(**request.json)
    if jobs:
        job = jobs[0]
        tables = get_job_all_table(job)
        return get_json_result(data=tables)
    else:
        return get_json_result(retcode=101, retmsg='no find job')
Beispiel #29
0
 def schedule_waiting_jobs(cls, event):
     job_id, initiator_role, initiator_party_id, = event.f_job_id, event.f_initiator_role, event.f_initiator_party_id,
     update_status = JobQueue.update_event(job_id=job_id,
                                           initiator_role=initiator_role,
                                           initiator_party_id=initiator_party_id,
                                           job_status=JobStatus.READY)
     if not update_status:
         schedule_logger(job_id).info(f"job {job_id} may be handled by another scheduler")
         return
     # apply resource on all party
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if not jobs:
         JobQueue.delete_event(job_id=job_id)
         return
     job = jobs[0]
     apply_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.APPLY)
     if apply_status_code == FederatedSchedulingStatusCode.SUCCESS:
         cls.start_job(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
         JobQueue.update_event(job_id=job_id,
                               initiator_role=initiator_role,
                               initiator_party_id=initiator_party_id,
                               job_status=JobStatus.RUNNING)
     else:
         # rollback resource
         rollback_party = {}
         failed_party = {}
         for dest_role in federated_response.keys():
             for dest_party_id in federated_response[dest_role].keys():
                 retcode = federated_response[dest_role][dest_party_id]["retcode"]
                 if retcode == 0:
                     rollback_party[dest_role] = rollback_party.get(dest_role, [])
                     rollback_party[dest_role].append(dest_party_id)
                 else:
                     failed_party[dest_role] = failed_party.get(dest_role, [])
                     failed_party[dest_role].append(dest_party_id)
         schedule_logger(job_id).info("job {} apply resource failed on {}, rollback {}".format(
             job_id,
             ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in failed_party.items()]),
             ",".join([",".join([f"{_r}:{_p}" for _p in _ps]) for _r, _ps in rollback_party.items()]),
         ))
         if rollback_party:
             return_status_code, federated_response = FederatedScheduler.resource_for_job(job=job, operation_type=ResourceOperation.RETURN, specific_dest=rollback_party)
             if return_status_code != FederatedSchedulingStatusCode.SUCCESS:
                 schedule_logger(job_id).info(f"job {job_id} return resource failed:\n{federated_response}")
         else:
             schedule_logger(job_id).info(f"job {job_id} no party should be rollback resource")
         if apply_status_code == FederatedSchedulingStatusCode.ERROR:
             cls.stop_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id, stop_status=JobStatus.FAILED)
             schedule_logger(job_id).info(f"apply resource error, stop job {job_id}")
         else:
             update_status = JobQueue.update_event(job_id=job_id,
                                                   initiator_role=initiator_role,
                                                   initiator_party_id=initiator_party_id,
                                                   job_status=JobStatus.WAITING)
             schedule_logger(job_id).info(f"update job {job_id} status to waiting {update_status}")
Beispiel #30
0
 def federated_task_status(cls, job_id, task_id, task_version):
     tasks_on_all_party = JobSaver.query_task(task_id=task_id,
                                              task_version=task_version)
     tasks_party_status = [
         task.f_party_status for task in tasks_on_all_party
     ]
     status = cls.calculate_multi_party_task_status(tasks_party_status)
     schedule_logger(job_id=job_id).info(
         "job {} task {} {} status is {}, calculate by task party status list: {}"
         .format(job_id, task_id, task_version, status, tasks_party_status))
     return status