def schedule_running_job(cls, job: Job, force_sync_status=False):
    """Drive one scheduling round for a RUNNING job.

    Schedules the job's tasks, recomputes the job status and progress from the
    resulting task statuses, and propagates any change to all parties and to
    the initiator's record. Finally triggers auto-rerun for retryable tasks.

    :param job: the initiator-side Job record being scheduled
    :param force_sync_status: if True, sync the job status to all parties even
        when nothing changed this round (used for end-status re-scheduling)
    """
    schedule_logger(job.f_job_id).info(f"scheduling running job")
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl, runtime_conf=job.f_runtime_conf_on_party, train_runtime_conf=job.f_train_runtime_conf)
    # A pending cancel signal is passed through so task scheduling does not start new tasks.
    task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
    tasks_status = dict([(task.f_component_name, task.f_status) for task in tasks])
    new_job_status = cls.calculate_job_status(task_scheduling_status_code=task_scheduling_status_code, tasks_status=tasks_status.values())
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        # A canceled job must not fall back to WAITING; mark it CANCELED instead.
        new_job_status = JobStatus.CANCELED
    total, finished_count = cls.calculate_job_progress(tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job.f_job_id).info(f"job status is {new_job_status}, calculate by task status list: {tasks_status}")
    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update with anti-weight logic
        # NOTE(review): progress is only pushed when the integer part strictly increased,
        # so sub-percent changes are deliberately skipped — confirm against the
        # anti-weight update rule on the receiving side.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                # Persist the pipelined model before broadcasting the terminal status.
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)
    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job have auto rerun tasks")
        cls.set_job_rerun(job_id=job.f_job_id, initiator_role=job.f_initiator_role, initiator_party_id=job.f_initiator_party_id, tasks=auto_rerun_tasks, auto=True)
    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job.f_job_id).info("finish scheduling running job")
def update_job_status(cls, job_info):
    """Persist a job status change; release job resources once it reaches an end status.

    :param job_info: dict carrying at least job_id/role/party_id and optionally status
    :return: whether the stored status was actually updated
    """
    updated = JobSaver.update_job_status(job_info=job_info)
    if not updated:
        return updated
    if EndStatus.contains(job_info.get("status")):
        # Terminal state on this party: give the job's resources back.
        ResourceManager.return_job_resource(job_id=job_info["job_id"],
                                            role=job_info["role"],
                                            party_id=job_info["party_id"])
    return updated
def schedule_rerun_job(cls, job):
    """Handle a job flagged for rerun.

    A finished job is reset to WAITING (run-state fields wiped and synced to
    all parties); a job still in flight just has its rerun flag cleared and
    continues normal running-job scheduling.
    """
    if not EndStatus.contains(job.f_status):
        # Still in flight: drop the rerun flag and keep scheduling as usual.
        cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
        cls.schedule_running_job(job)
        return
    # The job already ended; wipe run-state fields so it can be queued again.
    job.f_status = JobStatus.WAITING
    job.f_ready_signal = False
    job.f_ready_time = None
    job.f_rerun_signal = False
    job.f_progress = 0
    job.f_end_time = None
    job.f_elapsed = None
    schedule_logger(job.f_job_id).info(f"job has been finished, set waiting to rerun")
    status, response = FederatedScheduler.sync_job_status(job=job)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job.f_job_id).info(f"job set waiting to rerun failed")
        return
    # Only clear the rerun flag after every party accepted the WAITING status.
    cls.rerun_signal(job_id=job.f_job_id, set_or_reset=False)
    FederatedScheduler.sync_job(job=job,
                                update_fields=["ready_signal", "ready_time", "rerun_signal",
                                               "progress", "end_time", "elapsed"])
    schedule_logger(job.f_job_id).info(f"job set waiting to rerun successfully")
def update_status(cls, entity_model: DataBaseModel, entity_info: dict):
    """Update status fields of a Job/Task record under state-transition rules.

    Builds a primary-key filter from ``entity_info``, checks each status field
    against the model's StateTransitionRule, and only writes transitions that
    are allowed. Illegal transitions are silently dropped from the update.

    :param entity_model: peewee model class (Job or Task)
    :param entity_info: field values keyed without the ``f_`` prefix
    :return: result of ``cls.execute_update`` (whether the row was updated)
    :raises Exception: if no record matches the primary keys
    """
    query_filters = []
    primary_keys = entity_model.get_primary_keys_name()
    for p_k in primary_keys:
        # NOTE(review): lstrip strips character sets, not a prefix — safe for the
        # current f_* key names but fragile for keys whose body starts with 'f' or '_'.
        query_filters.append(operator.attrgetter(p_k)(entity_model) == entity_info[p_k.lstrip("f").lstrip("_")])
    objs = entity_model.select().where(*query_filters)
    if objs:
        obj = objs[0]
    else:
        raise Exception(f"can not found the {entity_model.__name__} record to update")
    # Update with the same PK filters plus optimistic-concurrency guards added below.
    update_filters = query_filters[:]
    update_info = {"job_id": entity_info["job_id"]}
    for status_field in cls.STATUS_FIELDS:
        if entity_info.get(status_field) and hasattr(entity_model, f"f_{status_field}"):
            if status_field in ["status", "party_status"]:
                update_info[status_field] = entity_info[status_field]
                old_status = getattr(obj, f"f_{status_field}")
                new_status = update_info[status_field]
                if_pass = False
                if isinstance(obj, Task):
                    if TaskStatus.StateTransitionRule.if_pass(src_status=old_status, dest_status=new_status):
                        if_pass = True
                elif isinstance(obj, Job):
                    if JobStatus.StateTransitionRule.if_pass(src_status=old_status, dest_status=new_status):
                        if_pass = True
                    if EndStatus.contains(new_status) and new_status not in {JobStatus.SUCCESS, JobStatus.CANCELED}:
                        # A failure-like end status must not overwrite a job already flagged for rerun.
                        update_filters.append(Job.f_rerun_signal == False)
                if if_pass:
                    # Guard against concurrent writers: only update if the status is still old_status.
                    update_filters.append(operator.attrgetter(f"f_{status_field}")(type(obj)) == old_status)
                else:
                    # not allow update status: drop the illegal transition from the write set
                    update_info.pop(status_field)
    return cls.execute_update(old_obj=obj, model=entity_model, update_info=update_info, update_filters=update_filters)
def calculate_job_progress(cls, tasks_status):
    """Count the job's tasks and how many of them have finished.

    :param tasks_status: mapping of component name -> task status
    :return: tuple (total task count, count of tasks in an end status)
    """
    statuses = list(tasks_status.values())
    total = len(statuses)
    finished_count = sum(1 for s in statuses if EndStatus.contains(s))
    return total, finished_count
def update_task_status(cls, task_info):
    """Persist a task status change; on an end status free resources and clean tables.

    The latest party status is always reported back to the job initiator,
    regardless of whether the local update took effect.

    :param task_info: dict with job_id/task_id/task_version/role/party_id and status
    :return: whether the stored status was actually updated
    """
    updated = JobSaver.update_task_status(task_info=task_info)
    if updated and EndStatus.contains(task_info.get("status")):
        # Terminal task state: release its resources and drop intermediate tables.
        ResourceManager.return_task_resource(task_info=task_info)
        cls.clean_task(job_id=task_info["job_id"],
                       task_id=task_info["task_id"],
                       task_version=task_info["task_version"],
                       role=task_info["role"],
                       party_id=task_info["party_id"],
                       content_type=TaskCleanResourceType.TABLE)
    cls.report_task_to_initiator(task_info=task_info)
    return updated
def update_job_status(cls, job_info):
    """Update a job's status through the state-transition guard; tag it on end status.

    :param job_info: dict with job_id/role/party_id and the new status
    :return: whether the status update took effect
    """
    job_id = job_info["job_id"]
    schedule_logger(job_id).info("try to update job status to {}".format(job_info.get("status")))
    updated = cls.update_status(Job, job_info)
    if not updated:
        schedule_logger(job_id).warning("update job status does not take effect")
        return updated
    schedule_logger(job_id).info("update job status successfully")
    if EndStatus.contains(job_info.get("status")):
        # only update tag: write back just the identifying keys plus the tag field
        end_info = {k: job_info[k] for k in ("job_id", "role", "party_id", "tag") if k in job_info}
        if not end_info.get("tag"):
            end_info["tag"] = "job_end"
        cls.update_entity_table(Job, end_info)
    return updated
def detect_resource_record(cls):
    """Recycle resources of jobs that have been holding them for too long.

    Jobs stuck in WAITING past the grace period are requested to stop with
    TIMEOUT; jobs already past WAITING simply get their resources returned.
    Failures are logged per job so one bad record cannot abort the sweep.
    """
    detect_logger().info('start detect resource recycle')
    try:
        recyclable_status = EndStatus.status_list()
        recyclable_status.append(JobStatus.WAITING)
        # Jobs still holding resources more than 10 minutes after applying for them.
        candidates = Job.select().where(
            Job.f_resource_in_use == True,
            current_timestamp() - Job.f_apply_resource_time > 10 * 60 * 1000,
            Job.f_status << recyclable_status)
        timeout_jobs = set()
        for job in candidates:
            if job.f_status == JobStatus.WAITING:
                # Never actually started: stop it instead of only reclaiming resources.
                timeout_jobs.add(job)
                continue
            try:
                detect_logger(job_id=job.f_job_id).info(
                    f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource")
                returned = ResourceManager.return_job_resource(
                    job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
                if returned:
                    detect_logger(job_id=job.f_job_id).info(
                        f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully")
                else:
                    detect_logger(job_id=job.f_job_id).info(
                        f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed")
            except Exception as e:
                detect_logger(job_id=job.f_job_id).exception(e)
        cls.request_stop_jobs(jobs=timeout_jobs, stop_msg="start timeout", stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect resource recycle')
def test_tracking(self):
    """End-to-end tracking test: submit a job, wait for SUCCESS, then probe every tracking endpoint."""
    base_dir = get_fate_flow_python_directory()
    with open(os.path.join(base_dir, self.dsl_path), 'r') as f:
        dsl_data = json.load(f)
    with open(os.path.join(base_dir, self.config_path), 'r') as f:
        config_data = json.load(f)
    # Point the runtime conf at the parties provisioned for this test run.
    config_data["initiator"]["party_id"] = self.guest_party_id
    config_data["role"] = {
        "guest": [self.guest_party_id],
        "host": [self.host_party_id],
        "arbiter": [self.host_party_id]
    }
    response = requests.post("/".join([self.server_url, 'job', 'submit']),
                             json={'job_dsl': dsl_data, 'job_runtime_conf': config_data})
    self.assertTrue(response.status_code in [200, 201])
    self.assertTrue(int(response.json()['retcode']) == 0)
    job_id = response.json()['jobId']
    job_info = {'f_status': 'running'}
    # Poll the job until it reaches an end status (bounded number of attempts).
    for i in range(60):
        response = requests.post("/".join([self.server_url, 'job', 'query']),
                                 json={'job_id': job_id, 'role': 'guest'})
        self.assertTrue(response.status_code in [200, 201])
        job_info = response.json()['data'][0]
        if EndStatus.contains(job_info['f_status']):
            break
        time.sleep(self.sleep_time)
        print('waiting job run success, the job has been running for {}s'.
              format((i + 1) * self.sleep_time))
    self.assertTrue(job_info['f_status'] == JobStatus.SUCCESS)
    os.makedirs(self.success_job_dir, exist_ok=True)
    with open(os.path.join(self.success_job_dir, job_id), 'w') as fw:
        json.dump(job_info, fw)
    self.assertTrue(os.path.exists(os.path.join(self.success_job_dir, job_id)))
    # Exercise each tracking endpoint against the finished job, in order.
    for endpoint in ('component/parameters',
                     'component/metric/all',
                     'component/metrics',
                     'component/output/model',
                     'component/output/data',
                     'component/output/data/download',
                     'job/data_view'):
        test_component(self, endpoint)
def run_do(self):
    """One full pass of the dispatching scheduler.

    In order: start at most one WAITING job (FIFO), advance all RUNNING jobs,
    re-check jobs stuck with a ready signal, process rerun-flagged jobs, and
    finally re-drive recently ended jobs whose status may not have synced.
    Every job is scheduled inside its own try/except so one failure cannot
    stall the whole round.
    """
    schedule_logger().info("start schedule waiting jobs")
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} waiting jobs")
    if len(jobs):
        # FIFO: only the oldest waiting job is started per round.
        job = jobs[0]
        schedule_logger().info(f"schedule waiting job {job.f_job_id}")
        try:
            self.schedule_waiting_jobs(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule waiting job failed")
    schedule_logger().info("schedule waiting jobs finished")
    schedule_logger().info("start schedule running jobs")
    jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.RUNNING, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} running jobs")
    for job in jobs:
        schedule_logger().info(f"schedule running job {job.f_job_id}")
        try:
            self.schedule_running_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job failed")
    schedule_logger().info("schedule running jobs finished")
    # some ready job exit before start
    schedule_logger().info("start schedule ready jobs")
    jobs = JobSaver.query_job(is_initiator=True, ready_signal=True, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} ready jobs")
    for job in jobs:
        schedule_logger().info(f"schedule ready job {job.f_job_id}")
        try:
            self.schedule_ready_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule ready job failed:\n{e}")
    schedule_logger().info("schedule ready jobs finished")
    schedule_logger().info("start schedule rerun jobs")
    jobs = JobSaver.query_job(is_initiator=True, rerun_signal=True, order_by="create_time", reverse=False)
    schedule_logger().info(f"have {len(jobs)} rerun jobs")
    for job in jobs:
        schedule_logger().info(f"schedule rerun job {job.f_job_id}")
        try:
            self.schedule_rerun_job(job=job)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job failed")
    schedule_logger().info("schedule rerun jobs finished")
    schedule_logger().info("start schedule end status jobs to update status")
    # Only revisit jobs that ended within the configured time window.
    jobs = JobSaver.query_job(is_initiator=True, status=set(EndStatus.status_list()), end_time=[current_timestamp() - JobDefaultConfig.end_status_job_scheduling_time_limit, current_timestamp()])
    schedule_logger().info(f"have {len(jobs)} end status jobs")
    for job in jobs:
        schedule_logger().info(f"schedule end status job {job.f_job_id}")
        try:
            # end_scheduling_updates enforces a cap on how many times an ended job is re-driven.
            update_status = self.end_scheduling_updates(job_id=job.f_job_id)
            if update_status:
                schedule_logger(job.f_job_id).info(f"try update status by scheduling like running job")
            else:
                schedule_logger(job.f_job_id).info(f"the number of updates has been exceeded")
                continue
            self.schedule_running_job(job=job, force_sync_status=True)
        except Exception as e:
            schedule_logger(job.f_job_id).exception(e)
            schedule_logger(job.f_job_id).error(f"schedule job failed")
    schedule_logger().info("schedule end status jobs finished")
def schedule(cls, job, dsl_parser, canceled=False):
    """Schedule one round of a job's tasks on the initiator.

    Refreshes every task's federated status, stops tasks that just reached an
    end status, collects auto-rerun candidates, and starts any WAITING task
    whose upstream dependencies have all succeeded (unless canceled).

    :param job: initiator-side Job record
    :param dsl_parser: parsed job DSL, used for upstream dependency lookups
    :param canceled: when True, no new tasks are started this round
    :return: (scheduling status code, tasks to auto-rerun, all initiator tasks)
    """
    schedule_logger(job.f_job_id).info("scheduling job tasks")
    initiator_tasks_group = JobSaver.get_tasks_asc(job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
    waiting_tasks = []
    auto_rerun_tasks = []
    for initiator_task in initiator_tasks_group.values():
        if job.f_runtime_conf_on_party["job_parameters"]["federated_status_collect_type"] == FederatedCommunicationType.PULL:
            # collect all parties task party status and store it in the database now
            cls.collect_task_of_all_party(job=job, initiator_task=initiator_task)
        else:
            # all parties report task party status and store it in the initiator database when federated_status_collect_type is push
            pass
        # get all parties party task status and calculate
        new_task_status = cls.get_federated_task_status(job_id=initiator_task.f_job_id, task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
        task_status_have_update = False
        if new_task_status != initiator_task.f_status:
            task_status_have_update = True
            initiator_task.f_status = new_task_status
            FederatedScheduler.sync_task_status(job=job, task=initiator_task)
        if initiator_task.f_status == TaskStatus.WAITING:
            waiting_tasks.append(initiator_task)
        elif task_status_have_update and EndStatus.contains(initiator_task.f_status):
            # The task just reached a terminal state: tell all parties to stop it.
            FederatedScheduler.stop_task(job=job, task=initiator_task, stop_status=initiator_task.f_status)
            if not canceled and AutoRerunStatus.contains(initiator_task.f_status):
                if initiator_task.f_auto_retries > 0:
                    auto_rerun_tasks.append(initiator_task)
                    schedule_logger(job.f_job_id).info(f"task {initiator_task.f_task_id} {initiator_task.f_status} will be retried")
                else:
                    schedule_logger(job.f_job_id).info(f"task {initiator_task.f_task_id} {initiator_task.f_status} has no retry count")
    scheduling_status_code = SchedulingStatusCode.NO_NEXT
    if not canceled:
        for waiting_task in waiting_tasks:
            for component in dsl_parser.get_upstream_dependent_components(component_name=waiting_task.f_component_name):
                dependent_task = initiator_tasks_group[JobSaver.task_key(task_id=job_utils.generate_task_id(job_id=job.f_job_id, component_name=component.get_name()), role=job.f_role, party_id=job.f_party_id)]
                if dependent_task.f_status != TaskStatus.SUCCESS:
                    # can not start task
                    break
            else:
                # all upstream dependent tasks have been successful, can start this task
                # (for/else: reached only when the dependency loop did not break)
                scheduling_status_code = SchedulingStatusCode.HAVE_NEXT
                status_code = cls.start_task(job=job, task=waiting_task)
                if status_code == SchedulingStatusCode.NO_RESOURCE:
                    # wait for the next round of scheduling
                    schedule_logger(job.f_job_id).info(f"task {waiting_task.f_task_id} can not apply resource, wait for the next round of scheduling")
                    break
                elif status_code == SchedulingStatusCode.FAILED:
                    scheduling_status_code = SchedulingStatusCode.FAILED
                    waiting_task.f_status = StatusSet.FAILED
                    FederatedScheduler.sync_task_status(job, waiting_task)
                    break
    else:
        schedule_logger(job.f_job_id).info("have cancel signal, pass start job tasks")
    schedule_logger(job.f_job_id).info("finish scheduling job tasks")
    return scheduling_status_code, auto_rerun_tasks, initiator_tasks_group.values()