def stop_job(cls, job_id, role, party_id, stop_status):
    """Request that a job be stopped on all parties with ``stop_status``.

    The job is looked up through ``JobSaver.query_job`` with
    ``is_initiator=True``. When ``stop_status`` equals
    ``JobStatus.CANCELED`` the cancel signal is set first via
    ``cls.cancel_signal``. The stop request is then sent through
    ``FederatedScheduler.stop_job``; if that does not report success, every
    task returned by ``JobSaver.get_tasks_asc`` is passed to
    ``TaskScheduler.collect_task_of_all_party`` with ``set_status`` set to
    ``stop_status``.

    :param job_id: id of the job to stop.
    :param role: role used to look the job up.
    :param party_id: party id used to look the job up.
    :param stop_status: status to stop the job with.
    :return: a ``(ret_code, message)`` tuple:
        ``(RetCode.SUCCESS, "success")`` when the federated stop succeeded,
        ``(RetCode.FEDERATED_ERROR, <json dump of the response>)`` when it
        did not, and ``(RetCode.SUCCESS, "can not found job")`` when the
        query returned nothing.
    """
    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status}")
    found_jobs = JobSaver.query_job(
        job_id=job_id, role=role, party_id=party_id, is_initiator=True)

    # NOTE(review): a missing job is reported with the SUCCESS code, not an
    # error code - confirm callers rely on this.
    if len(found_jobs) == 0:
        return RetCode.SUCCESS, "can not found job"

    if stop_status == JobStatus.CANCELED:
        schedule_logger(job_id=job_id).info(f"cancel job {job_id}")
        cancel_signal_result = cls.cancel_signal(
            job_id=job_id, set_or_reset=True)
        schedule_logger(job_id=job_id).info(
            f"set job {job_id} cancel signal {cancel_signal_result}")

    # Only the in-memory object is changed here; nothing in this method
    # saves it directly.
    target_job = found_jobs[0]
    target_job.f_status = stop_status

    schedule_logger(job_id=job_id).info(
        f"request stop job {job_id} with {stop_status} to all party")
    status_code, response = FederatedScheduler.stop_job(
        job=found_jobs[0], stop_status=stop_status)

    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job_id).info(
            f"stop job {job_id} with {stop_status} successfully")
        return RetCode.SUCCESS, "success"

    # The federated stop did not succeed: go through the initiator's tasks
    # one by one and collect each of them from all parties with the stop
    # status.
    tasks_by_key = JobSaver.get_tasks_asc(
        job_id=target_job.f_job_id,
        role=target_job.f_role,
        party_id=target_job.f_party_id)
    for initiator_task in tasks_by_key.values():
        TaskScheduler.collect_task_of_all_party(
            target_job, initiator_task=initiator_task, set_status=stop_status)
    schedule_logger(job_id=job_id).info(
        f"stop job {job_id} with {stop_status} failed, {response}")
    return RetCode.FEDERATED_ERROR, json_dumps(response)
def schedule_running_job(cls, job: Job, force_sync_status=False):
    """Run one scheduling pass over a running job.

    Schedules the job's tasks through ``TaskScheduler.schedule``, derives a
    new job status and a progress percentage from the task statuses, and
    pushes whichever of the two changed to the other parties and to the
    initiator record. When the new status is an end status the pipelined
    model is saved and ``cls.finish`` is called. Tasks reported for
    automatic rerun are handed to ``cls.set_job_rerun``.

    :param job: the job to schedule; its ``f_progress`` and ``f_status``
        attributes may be overwritten in place.
    :param force_sync_status: when true, ``FederatedScheduler.sync_job_status``
        is called at the end even if the status did not change.
    """
    schedule_logger(job.f_job_id).info(f"scheduling running job")

    parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf,
    )
    scheduling_code, auto_rerun_tasks, scheduled_tasks = TaskScheduler.schedule(
        job=job, dsl_parser=parser, canceled=job.f_cancel_signal)

    # component name -> task status
    status_by_component = {
        task.f_component_name: task.f_status for task in scheduled_tasks
    }

    new_job_status = cls.calculate_job_status(
        task_scheduling_status_code=scheduling_code,
        tasks_status=status_by_component.values(),
    )
    # A job that would still be waiting but carries a cancel signal is
    # treated as canceled.
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED

    total, finished_count = cls.calculate_job_progress(
        tasks_status=status_by_component)
    # NOTE(review): raises ZeroDivisionError if total is 0 - confirm
    # calculate_job_progress never reports an empty task set here.
    new_progress = float(finished_count) / total * 100

    schedule_logger(job.f_job_id).info(
        f"job status is {new_job_status}, calculate by task status list: {status_by_component}"
    )

    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Progress and status are deliberately pushed as two separate
        # updates, because these two fields update with anti-weight logic.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(
                initiator_job=job, update_fields=["progress"])

        if new_job_status != job.f_status:
            job.f_status = new_job_status
            # The pipelined model is saved before the end status is
            # synced to the other parties.
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(
                initiator_job=job, update_fields=["status"])

    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)

    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job have auto rerun tasks")
        cls.set_job_rerun(
            job_id=job.f_job_id,
            initiator_role=job.f_initiator_role,
            initiator_party_id=job.f_initiator_party_id,
            tasks=auto_rerun_tasks,
            auto=True,
        )

    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)

    schedule_logger(job.f_job_id).info("finish scheduling running job")
def schedule_running_job(cls, job, force_sync_status=False):
    """Run one scheduling pass over a running job.

    Schedules the job's tasks through ``TaskScheduler.schedule``, derives a
    new job status and a progress percentage from the list of task
    statuses, and pushes whichever of the two changed to the other parties
    and to the initiator record. When the new status is an end status the
    pipelined model is saved and ``cls.finish`` is called.

    :param job: the job to schedule; its ``f_progress`` and ``f_status``
        attributes may be overwritten in place.
    :param force_sync_status: when true, ``FederatedScheduler.sync_job_status``
        is called at the end even if the status did not change.
    """
    schedule_logger(job_id=job.f_job_id).info(
        "scheduling job {}".format(job.f_job_id))

    parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf,
    )
    scheduling_code, scheduled_tasks = TaskScheduler.schedule(
        job=job, dsl_parser=parser, canceled=job.f_cancel_signal)
    task_statuses = [task.f_status for task in scheduled_tasks]

    new_job_status = cls.calculate_job_status(
        task_scheduling_status_code=scheduling_code,
        tasks_status=task_statuses,
    )
    # A job that would still be waiting but carries a cancel signal is
    # treated as canceled.
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED

    total, finished_count = cls.calculate_job_progress(
        tasks_status=task_statuses)
    # NOTE(review): raises ZeroDivisionError if total is 0 - confirm
    # calculate_job_progress never reports an empty task set here.
    new_progress = float(finished_count) / total * 100

    schedule_logger(job_id=job.f_job_id).info(
        "Job {} status is {}, calculate by task status list: {}".format(
            job.f_job_id, new_job_status, task_statuses))

    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Progress and status are deliberately pushed as two separate
        # updates, because these two fields update with anti-weight logic.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(
                initiator_job=job, update_fields=["progress"])

        if new_job_status != job.f_status:
            job.f_status = new_job_status
            # The pipelined model is saved before the end status is
            # synced to the other parties.
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(
                initiator_job=job, update_fields=["status"])

    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)

    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)

    schedule_logger(job_id=job.f_job_id).info(
        "finish scheduling job {}".format(job.f_job_id))
def set_job_rerun(cls, job_id, initiator_role, initiator_party_id, auto,
                  force=False,
                  tasks: typing.List[Task] = None,
                  component_name: typing.Union[str, list] = None):
    """Prepare tasks of a job for rerun and raise the job's rerun signal.

    When ``tasks`` is empty or not given, the tasks are queried from
    ``JobSaver.query_task``: either all tasks of the job, or the tasks of
    the requested components plus every component that depends on them
    downstream according to the DSL parser. Each task is then passed to
    ``TaskScheduler.prepare_rerun_task``.

    :param job_id: id of the job to rerun.
    :param initiator_role: role used to look the job up.
    :param initiator_party_id: party id used to look the job up.
    :param auto: forwarded to ``TaskScheduler.prepare_rerun_task``.
    :param force: forwarded to ``cls.get_rerun_component``, whose returned
        value is the one passed on to ``TaskScheduler.prepare_rerun_task``.
    :param tasks: explicit tasks to rerun; when truthy no task query is made.
    :param component_name: a component name or a collection of names to
        rerun; it is first passed through ``cls.get_rerun_component``.
    :return: ``False`` when no ``prepare_rerun_task`` call returned a truthy
        value; ``True`` otherwise, regardless of whether setting the rerun
        signal was reported as successful.
    :raises RuntimeError: when the job is not found on the initiator.
    """
    schedule_logger(job_id).info(
        f"try to rerun job on initiator {initiator_role} {initiator_party_id}"
    )
    found_jobs = JobSaver.query_job(
        job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if not found_jobs:
        raise RuntimeError(
            f"can not found job on initiator {initiator_role} {initiator_party_id}"
        )
    job = found_jobs[0]

    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf,
    )
    component_name, force = cls.get_rerun_component(
        component_name, job, dsl_parser, force)
    schedule_logger(job_id).info(f"rerun component: {component_name}")

    if not tasks:
        query_filter = {
            'job_id': job_id,
            'role': initiator_role,
            'party_id': initiator_party_id,
        }
        rerun_whole_pipeline = (
            not component_name
            or component_name == job_utils.job_pipeline_component_name()
        )
        if rerun_whole_pipeline:
            # rerun all tasks: no component filter is added to the query
            schedule_logger(job_id).info(
                "require all component of pipeline to rerun")
        else:
            if isinstance(component_name, str):
                requested = {component_name}
            else:
                requested = set(component_name)
            # Extend the requested components with everything that depends
            # on them downstream.
            to_rerun = requested.copy()
            for requested_name in requested:
                to_rerun.update(
                    downstream.get_name()
                    for downstream in
                    dsl_parser.get_downstream_dependent_components(
                        requested_name)
                )
            schedule_logger(job_id).info(
                f"require {requested} to rerun, "
                f"and then found {to_rerun} need be to rerun")
            query_filter['component_name'] = to_rerun
        tasks = JobSaver.query_task(**query_filter)
    else:
        schedule_logger(job_id).info(
            f"require {[task.f_component_name for task in tasks]} to rerun"
        )

    # Build the full list before calling any(): every task must go through
    # prepare_rerun_task, so the evaluation must not short-circuit.
    prepare_results = [
        TaskScheduler.prepare_rerun_task(
            job=job,
            task=task,
            dsl_parser=dsl_parser,
            auto=auto,
            force=force,
        )
        for task in tasks
    ]
    job_can_rerun = any(prepare_results)

    if not job_can_rerun:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id).info("job no task to rerun")
        return False

    schedule_logger(job_id).info("job set rerun signal")
    status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
    schedule_logger(job_id).info(
        f"job set rerun signal {'successfully' if status else 'failed'}")
    return True