def update_job_progress(job_id, dag, current_task_id):
    component_count = len(dag.get_dependency()['component_list'])
    success_count = success_task_count(job_id=job_id)
    job = Job()
    job.f_progress = float(success_count) / component_count * 100
    job.f_update_time = current_timestamp()
    job.f_current_tasks = json_dumps([current_task_id])
    return job
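# The progress formula above divides by the component count unconditionally.
# A minimal standalone sketch of the same calculation, with a hypothetical
# guard (not in the original) against an empty component list:
def calculate_progress(success_count, component_count):
    if component_count <= 0:  # assumption: treat an empty DAG as 0% progress
        return 0.0
    return float(success_count) / component_count * 100

assert calculate_progress(3, 4) == 75.0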
def submit_job(job_data):
    job_id = generate_job_id()
    schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id, job_dsl=job_dsl,
                                                        job_runtime_conf=job_runtime_conf)
    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()
    # save job info
    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)
    # push into queue
    RuntimeConfig.JOB_QUEUE.put_event({
        'job_id': job_id,
        'initiator_role': job_initiator['role'],
        'initiator_party_id': job_initiator['party_id']
    })
    schedule_logger.info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    return (job_id, job_dsl_path, job_runtime_conf_path,
            {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']},
            board_url)
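# Sketch of the model info generated above for a non-predict job. The
# all-party key below is illustrative only; the real string produced by
# dtable_utils.all_party_key is FATE-internal and may differ.
def make_model_info(all_party_key, job_id):
    return {'model_id': '#'.join([all_party_key, 'model']),
            'model_version': job_id}  # the job id doubles as the model version

assert make_model_info('guest-9999#host-10000', '20190802') == {
    'model_id': 'guest-9999#host-10000#model', 'model_version': '20190802'}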
def schedule_running_job(cls, job: Job, force_sync_status=False):
    schedule_logger(job.f_job_id).info("scheduling running job")
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(job=job,
                                                                                  dsl_parser=dsl_parser,
                                                                                  canceled=job.f_cancel_signal)
    tasks_status = dict([(task.f_component_name, task.f_status) for task in tasks])
    new_job_status = cls.calculate_job_status(task_scheduling_status_code=task_scheduling_status_code,
                                              tasks_status=tasks_status.values())
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED
    total, finished_count = cls.calculate_job_progress(tasks_status=tasks_status)
    new_progress = float(finished_count) / total * 100
    schedule_logger(job.f_job_id).info(
        f"job status is {new_job_status}, calculated from task status list: {tasks_status}")
    if new_job_status != job.f_status or new_progress != job.f_progress:
        # update progress and status separately: each field only ever moves forward
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job has auto rerun tasks")
        cls.set_job_rerun(job_id=job.f_job_id, initiator_role=job.f_initiator_role,
                          initiator_party_id=job.f_initiator_party_id,
                          tasks=auto_rerun_tasks, auto=True)
    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job.f_job_id).info("finish scheduling running job")
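# Minimal sketch of the forward-only progress update guard used above: a
# stale scheduling pass proposing a lower value cannot move progress
# backwards.
def next_progress(current, proposed):
    return proposed if int(proposed) - current > 0 else current

assert next_progress(50, 75.0) == 75.0
assert next_progress(75, 50.0) == 75  # backwards update rejected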
def run_job(job_id, initiator_role, initiator_party_id):
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                    role=initiator_role,
                                                                                    party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False
    storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)
    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  job_info=job.to_json())
    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger.info('job {} root components are {}'.format(
        job.f_job_id, [component.get_name() for component in components]))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters,
                                                     job_initiator, job_args, dag, component)
        except Exception as e:
            schedule_logger.exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        if not run_status:
            break
    if len(top_level_task_status) == 2:
        job.f_status = JobStatus.PARTIAL
    elif True in top_level_task_status:
        job.f_status = JobStatus.SUCCESS
    else:
        job.f_status = JobStatus.FAILED
    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.SUCCESS:
        job.f_progress = 100
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  job_info=job.to_json())
    TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    schedule_logger.info('job {} finished, status is {}'.format(job.f_job_id, job.f_status))
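# Standalone sketch of how the set of top-level task results folds into a
# final status in this version: a mix of success and failure yields PARTIAL,
# all-success yields SUCCESS, otherwise FAILED. String names stand in for the
# JobStatus constants.
def fold_status(results):
    if len(results) == 2:  # both True and False were observed
        return 'PARTIAL'
    return 'SUCCESS' if True in results else 'FAILED'

assert fold_status({True}) == 'SUCCESS'
assert fold_status({True, False}) == 'PARTIAL'
assert fold_status({False}) == 'FAILED'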
def run_job(job_id, initiator_role, initiator_party_id):
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                    role=initiator_role,
                                                                                    party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False
    timeout = job_utils.get_timeout(job_id, job_parameters.get("timeout", None), job_runtime_conf, job_dsl)
    t = Timer(timeout, TaskScheduler.job_handler, [job_id])
    t.start()
    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  initiator_role=job_initiator['role'],
                                  job_info=job.to_json())
    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger(job_id).info('job {} root components are {}'.format(
        job.f_job_id, [component.get_name() for component in components]))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters,
                                                     job_initiator, job_args, dag, component)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        if not run_status:
            break
    if len(top_level_task_status) == 2:
        job.f_status = JobStatus.FAILED
    elif True in top_level_task_status:
        job.f_status = JobStatus.COMPLETE
    else:
        job.f_status = JobStatus.FAILED
    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.COMPLETE:
        job.f_progress = 100
    job.f_update_time = current_timestamp()
    try:
        TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    except Exception as e:
        schedule_logger(job_id).exception(e)
        job.f_status = JobStatus.FAILED
    if job.f_status == JobStatus.FAILED:
        TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)
    try:
        TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job.to_json())
    except Exception as e:
        schedule_logger(job_id).exception(e)
        schedule_logger(job_id).warning('job {} sync status failed'.format(job.f_job_id))
    schedule_logger(job_id).info('job {} finished, status is {}'.format(job.f_job_id, job.f_status))
    t.cancel()
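# Minimal sketch of the watchdog pattern above: a threading.Timer fires the
# handler only if the job outlives its timeout, and is disarmed on normal
# exit. on_timeout is a hypothetical stand-in for TaskScheduler.job_handler.
from threading import Timer

def on_timeout(job_id):
    print('job {} timed out'.format(job_id))

t = Timer(30.0, on_timeout, ['job-123'])  # interval in seconds, callback, args
t.start()
try:
    pass  # ... run the job's components here ...
finally:
    t.cancel()  # normal completion: the handler never fires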
def submit_job(job_data):
    job_id = generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = save_job_conf(job_id=job_id, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf,
                              train_runtime_conf=train_runtime_conf, pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info('initiator party id error: {}'.format(initiator_party_id))
        raise Exception('initiator party id error {}'.format(initiator_party_id))
    get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf)
    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)
    # push into queue
    job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
    try:
        RuntimeConfig.JOB_QUEUE.put_event(job_event)
    except Exception as e:
        raise Exception('push job into queue failed') from e
    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = get_job_log_directory(job_id)
    return (job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory,
            {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']},
            board_url)
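# Sketch of the initiator sanity check performed above: the declared
# initiator party id must be listed under its own role in the job's role map.
def check_initiator(roles, initiator_role, initiator_party_id):
    if initiator_party_id not in roles.get(initiator_role, []):
        raise ValueError('initiator party id error {}'.format(initiator_party_id))

check_initiator({'guest': [9999], 'host': [10000]}, 'guest', 9999)  # passes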