Example #1
0
 def create_new_version_task(cls, job, task, dsl_parser, auto):
     """Retire the current version of ``task`` and create its next version.

     The current version is stopped with ``TaskStatus.CANCELED`` and its
     metrics are cleaned. The task's version is then incremented, its run
     pid/ip are reset, and the new version is created through
     ``FederatedScheduler.create_task``. Finally a task holder is
     initialized (with ``run_on_this_party=False``) for every role/party
     listed in the create response, except the job's initiator.

     :param job: job record the task belongs to.
     :param task: task record to re-version; it is mutated in place.
     :param dsl_parser: DSL parser forwarded to
         ``JobController.initialize_tasks``.
     :param auto: when truthy, one unit is consumed from
         ``task.f_auto_retries``.
     :raises Exception: if ``create_task`` does not report
         ``FederatedSchedulingStatusCode.SUCCESS``.
     """
     # Retire the old version: cancel it, then drop its metrics.
     FederatedScheduler.stop_task(
         job=job, task=task, stop_status=TaskStatus.CANCELED)
     FederatedScheduler.clean_task(
         job=job, task=task, content_type=TaskCleanResourceType.METRICS)

     # Move the task record on to its next version.
     task.f_task_version += 1
     if auto:
         task.f_auto_retries -= 1
     task.f_run_pid = task.f_run_ip = None

     # todo: FederatedScheduler.create_task and JobController.initialize_tasks will create task twice
     create_result = FederatedScheduler.create_task(job=job, task=task)
     status_code, response = create_result
     if status_code != FederatedSchedulingStatusCode.SUCCESS:
         raise Exception(f"create {task.f_task_id} new version failed")

     # Record a task holder in the initiator's db for every other
     # participant, so the initiator can schedule them.
     for role, party_ids in response.items():
         for party_id in party_ids:
             on_initiator = (role == job.f_initiator_role
                             and party_id == job.f_initiator_party_id)
             if on_initiator:
                 continue
             # A fresh RunParameters object is built for every participant.
             party_job_parameters = RunParameters(
                 **job.f_runtime_conf_on_party["job_parameters"])
             JobController.initialize_tasks(
                 job_id=job.f_job_id,
                 role=role,
                 party_id=party_id,
                 run_on_this_party=False,
                 initiator_role=job.f_initiator_role,
                 initiator_party_id=job.f_initiator_party_id,
                 job_parameters=party_job_parameters,
                 dsl_parser=dsl_parser,
                 components=[task.f_component_name],
                 task_version=task.f_task_version,
                 auto_retries=task.f_auto_retries)

     schedule_logger(job.f_job_id).info(
         f"create task {task.f_task_id} new version {task.f_task_version} successfully"
     )
Example #2
0
 def rerun_job(cls, job_id, initiator_role, initiator_party_id,
               component_name):
     """Prepare a job for rerun on the initiator and raise the rerun signal.

     The job is looked up on the initiator, then its tasks are queried:
     only those of ``component_name``, or all of them when
     ``component_name`` equals ``job_utils.job_virtual_component_name()``.
     Tasks in ``WAITING`` or ``SUCCESS`` status are left untouched (a
     ``WAITING`` task still makes the job eligible for rerun). Every other
     task is stopped with ``TaskStatus.CANCELED``, has its metrics cleaned,
     and is recreated with an incremented version; task holders for all
     non-initiator participants are initialized as well.

     If at least one task can rerun, the rerun signal is set through
     ``cls.rerun_signal``; otherwise the job status is synced and nothing
     is rerun.

     :param job_id: id of the job to rerun.
     :param initiator_role: role of the initiator party.
     :param initiator_party_id: party id of the initiator.
     :param component_name: component whose tasks should be rerun, or the
         virtual component name to consider every task of the job.
     :raises RuntimeError: if the job cannot be found on the initiator.
     """
     schedule_logger(job_id=job_id).info(
         f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
     )

     found_jobs = JobSaver.query_job(
         job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if not found_jobs:
         raise RuntimeError(
             f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
         )
     job = found_jobs[0]

     # Narrow the task query to one component unless the virtual
     # (whole-job) component was requested.
     task_filters = {
         "job_id": job_id,
         "role": initiator_role,
         "party_id": initiator_party_id,
     }
     if component_name != job_utils.job_virtual_component_name():
         task_filters["component_name"] = component_name
     tasks = JobSaver.query_task(**task_filters)

     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(
         dsl=job.f_dsl,
         runtime_conf=job.f_runtime_conf_on_party,
         train_runtime_conf=job.f_train_runtime_conf)

     for task in tasks:
         if task.f_status in (TaskStatus.WAITING, TaskStatus.SUCCESS):
             # Nothing to recreate; a waiting task can still be scheduled,
             # so it keeps the job eligible for rerun.
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(
                 f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
             )
             continue

         # Retire the old version: cancel it, then drop its metrics.
         FederatedScheduler.stop_task(
             job=job, task=task, stop_status=TaskStatus.CANCELED)
         FederatedScheduler.clean_task(
             job=job, task=task, content_type="metrics")

         # Move the task record on to its next version and recreate it.
         task.f_task_version += 1
         task.f_run_pid = task.f_run_ip = None
         # NOTE(review): the result of create_task is not inspected here.
         FederatedScheduler.create_task(job=job, task=task)
         schedule_logger(job_id=job_id).info(
             f"create task {task.f_task_id} new version {task.f_task_version}"
         )

         # Save the status information of every other participant in the
         # initiator for scheduling.
         participants = job.f_runtime_conf_on_party["role"]
         for role, party_ids in participants.items():
             for party_id in party_ids:
                 if role != initiator_role or party_id != initiator_party_id:
                     JobController.initialize_tasks(
                         job_id,
                         role,
                         party_id,
                         False,
                         job.f_initiator_role,
                         job.f_initiator_party_id,
                         RunParameters(
                             **job.f_runtime_conf_on_party["job_parameters"]),
                         dsl_parser,
                         component_name=task.f_component_name,
                         task_version=task.f_task_version)
         schedule_logger(job_id=job_id).info(
             f"create task {task.f_task_id} new version {task.f_task_version} successfully"
         )
         job_can_rerun = True

     if not job_can_rerun:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(
             job_id=job_id).info(f"job {job_id} no task to rerun")
         return

     schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
     if cls.rerun_signal(job_id=job_id, set_or_reset=True):
         schedule_logger(job_id=job_id).info(
             f"job {job_id} set rerun signal successfully")
     else:
         schedule_logger(job_id=job_id).info(
             f"job {job_id} set rerun signal failed")