def update_job_progress(job_id, dag, current_task_id):
    """Build a ``Job`` record describing the current progress of a job.

    Progress is the share of the DAG's components whose tasks have
    succeeded so far, expressed as a percentage.

    Args:
        job_id: identifier forwarded to ``success_task_count``.
        dag: parsed job DSL; ``dag.get_dependency()['component_list']`` is
            taken as the full list of components.
        current_task_id: stored in ``f_current_tasks`` as a JSON-encoded
            one-element list.

    Returns:
        A new ``Job`` object (not saved here) with ``f_progress``,
        ``f_update_time`` and ``f_current_tasks`` set.
    """
    component_count = len(dag.get_dependency()['component_list'])
    success_count = success_task_count(job_id=job_id)
    job = Job()
    if component_count:
        job.f_progress = float(success_count) / component_count * 100
    else:
        # A DAG without components used to raise ZeroDivisionError here;
        # report "no progress" instead of crashing the caller.
        job.f_progress = 0.0
    job.f_update_time = current_timestamp()
    job.f_current_tasks = json_dumps([current_task_id])
    return job
def schedule_running_job(cls, job: Job, force_sync_status=False):
    """Run one scheduling pass over a running job and propagate the outcome.

    The pass schedules the job's tasks, derives a new job status and a new
    progress value from the task statuses, syncs whichever of the two
    changed, finishes the job if it is in an end status and registers any
    tasks flagged for automatic rerun.

    Args:
        job: job record being scheduled; ``f_progress`` and ``f_status``
            are updated in place when they change.
        force_sync_status: when truthy, ``FederatedScheduler.sync_job_status``
            is called once more at the end of the pass.
    """
    schedule_logger(job.f_job_id).info(f"scheduling running job")
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job.f_dsl,
        runtime_conf=job.f_runtime_conf_on_party,
        train_runtime_conf=job.f_train_runtime_conf)
    task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(
        job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
    # component name -> task status
    tasks_status = dict([(task.f_component_name, task.f_status) for task in tasks])
    new_job_status = cls.calculate_job_status(
        task_scheduling_status_code=task_scheduling_status_code,
        tasks_status=tasks_status.values())
    # A job that would still be waiting but carries a cancel signal is
    # reported as canceled instead.
    if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
        new_job_status = JobStatus.CANCELED
    total, finished_count = cls.calculate_job_progress(
        tasks_status=tasks_status)
    # NOTE(review): raises ZeroDivisionError if ``total`` is 0 — confirm
    # calculate_job_progress never reports an empty task set.
    new_progress = float(finished_count) / total * 100
    schedule_logger(job.f_job_id).info(
        f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
    )
    if new_job_status != job.f_status or new_progress != job.f_progress:
        # Make sure to update separately, because these two fields update with anti-weight logic
        # Progress is only ever pushed forward: the truncated new value
        # must exceed the stored one.
        if int(new_progress) - job.f_progress > 0:
            job.f_progress = new_progress
            FederatedScheduler.sync_job(job=job, update_fields=["progress"])
            cls.update_job_on_initiator(initiator_job=job, update_fields=["progress"])
        if new_job_status != job.f_status:
            job.f_status = new_job_status
            # The pipelined model is saved before the end status is synced.
            if EndStatus.contains(job.f_status):
                FederatedScheduler.save_pipelined_model(job=job)
            FederatedScheduler.sync_job_status(job=job)
            cls.update_job_on_initiator(initiator_job=job, update_fields=["status"])
    # Checked on every pass, not only when the status just changed.
    if EndStatus.contains(job.f_status):
        cls.finish(job=job, end_status=job.f_status)
    if auto_rerun_tasks:
        schedule_logger(job.f_job_id).info("job have auto rerun tasks")
        cls.set_job_rerun(job_id=job.f_job_id,
                          initiator_role=job.f_initiator_role,
                          initiator_party_id=job.f_initiator_party_id,
                          tasks=auto_rerun_tasks,
                          auto=True)
    if force_sync_status:
        FederatedScheduler.sync_job_status(job=job)
    schedule_logger(job.f_job_id).info("finish scheduling running job")
def resource_for_task(cls, task_info, operation_type):
    """Apply ``operation_type`` to the job-level resource record for a task.

    The task's cores/memory come from ``cls.calculate_task_resource``. When
    both are falsy nothing is written and the operation counts as
    successful; otherwise the ``Job`` row matching the task's job id, role
    and party id (and currently marked as holding resources) is updated.

    Args:
        task_info: mapping with at least ``job_id``, ``role``, ``party_id``,
            ``task_id`` and ``task_version``.
        operation_type: forwarded to ``cls.update_resource_sql`` and logged.

    Returns:
        True when no update was needed or at least one row was updated.
    """
    cores, memory = cls.calculate_task_resource(task_info=task_info)

    succeeded = True
    if cores or memory:
        conditions, changes = cls.update_resource_sql(
            resource_model=Job,
            cores=cores,
            memory=memory,
            operation_type=operation_type,
        )
        conditions.extend([
            Job.f_job_id == task_info["job_id"],
            Job.f_role == task_info["role"],
            Job.f_party_id == task_info["party_id"],
            Job.f_resource_in_use == True,  # noqa: E712 - query expression
        ])
        affected_rows = Job.update(changes).where(*conditions).execute()
        succeeded = affected_rows > 0

    logger = schedule_logger(job_id=task_info["job_id"])
    log_args = (task_info["task_id"], task_info["task_version"], operation_type)
    if succeeded:
        logger.info("task {} {} {} resource successfully".format(*log_args))
    else:
        logger.warning("task {} {} {} resource failed".format(*log_args))
    return succeeded
def get_job_parameters(job_id, role, party_id):
    """Return the ``job_parameters`` section of a job's party runtime conf.

    Returns:
        The value stored under ``"job_parameters"`` in the first matching
        job's ``f_runtime_conf_on_party`` (``None`` if that key is absent),
        or an empty dict when no job matches.
    """
    matches = Job.select(Job.f_runtime_conf_on_party).where(
        Job.f_job_id == job_id,
        Job.f_role == role,
        Job.f_party_id == party_id,
    )
    if not matches:
        return {}
    return matches[0].f_runtime_conf_on_party.get("job_parameters")
def get_job_configuration(job_id, role, party_id) -> JobConfiguration:
    """Load the configuration of one job on one party.

    Only the DSL and the three runtime-conf columns are selected; the first
    matching row is turned into a ``JobConfiguration`` from its
    ``to_human_model_dict()`` output.

    Returns:
        The ``JobConfiguration``, or ``None`` when no job matches (despite
        the return annotation).
    """
    matches = Job.select(
        Job.f_dsl,
        Job.f_runtime_conf,
        Job.f_train_runtime_conf,
        Job.f_runtime_conf_on_party,
    ).where(
        Job.f_job_id == job_id,
        Job.f_role == role,
        Job.f_party_id == party_id,
    )
    if not matches:
        return None
    return JobConfiguration(**matches[0].to_human_model_dict())
def resource_for_job(cls, job_id, role, party_id, operation_type):
    """Apply or return the computing resources of a job in one transaction.

    Two updates run inside ``DB.atomic()``: first the ``Job`` row is marked
    (resource figures plus the in-use flag), then the matching
    ``EngineRegistry`` row is adjusted via ``cls.update_resource_sql``. If
    either update touches no rows a ``RuntimeError`` is raised inside the
    block and caught below, so the caller only ever sees a boolean.

    Args:
        job_id, role, party_id: identify the ``Job`` row.
        operation_type: ``ResourceOperation.APPLY`` or
            ``ResourceOperation.RETURN``; other values add no in-use
            filter or flag change to the ``Job`` update.

    Returns:
        True when both updates affected at least one row, else False.
    """
    operate_status = False
    engine_name, cores, memory = cls.calculate_job_resource(job_id=job_id, role=role, party_id=party_id)
    try:
        with DB.atomic():
            updates = {
                Job.f_engine_type: EngineType.COMPUTING,
                Job.f_engine_name: engine_name,
                Job.f_cores: cores,
                Job.f_memory: memory,
            }
            filters = [
                Job.f_job_id == job_id,
                Job.f_role == role,
                Job.f_party_id == party_id,
            ]
            if operation_type == ResourceOperation.APPLY:
                updates[Job.f_remaining_cores] = cores
                updates[Job.f_remaining_memory] = memory
                updates[Job.f_resource_in_use] = True
                updates[Job.f_apply_resource_time] = base_utils.current_timestamp()
                # Only a job that does not hold resources yet may apply.
                filters.append(Job.f_resource_in_use == False)
            elif operation_type == ResourceOperation.RETURN:
                updates[Job.f_resource_in_use] = False
                updates[Job.f_return_resource_time] = base_utils.current_timestamp()
                # Only a job that currently holds resources may return them.
                filters.append(Job.f_resource_in_use == True)
            operate = Job.update(updates).where(*filters)
            record_status = operate.execute() > 0
            if not record_status:
                raise RuntimeError(f"record job {job_id} resource {operation_type} failed on {role} {party_id}")
            # Second step: adjust the engine-level record.
            filters, updates = cls.update_resource_sql(resource_model=EngineRegistry,
                                                       cores=cores,
                                                       memory=memory,
                                                       operation_type=operation_type,
                                                       )
            filters.append(EngineRegistry.f_engine_type == EngineType.COMPUTING)
            filters.append(EngineRegistry.f_engine_name == engine_name)
            operate = EngineRegistry.update(updates).where(*filters)
            apply_status = operate.execute() > 0
            if not apply_status:
                raise RuntimeError(
                    f"{operation_type} resource from engine {engine_name} for job {job_id} resource {operation_type} failed on {role} {party_id}")
        operate_status = True
    except Exception as e:
        schedule_logger(job_id=job_id).warning(e)
        schedule_logger(job_id=job_id).warning(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} failed")
        operate_status = False
    finally:
        # Always report what is left on the engine, success or failure.
        remaining_cores, remaining_memory = cls.get_remaining_resource(EngineRegistry, [
            EngineRegistry.f_engine_type == EngineType.COMPUTING,
            EngineRegistry.f_engine_name == engine_name])
        operate_msg = "successfully" if operate_status else "failed"
        schedule_logger(job_id=job_id).info(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} {operate_msg}, remaining cores: {remaining_cores} remaining memory: {remaining_memory}")
        # NOTE(review): returning from ``finally`` also discards any
        # in-flight exception not caught above (e.g. KeyboardInterrupt) —
        # confirm this is intended.
        return operate_status
def end_scheduling_updates(cls, job_id):
    """Increment a job's end-scheduling update counter while under the cap.

    The increment is applied only to rows whose counter is still below
    ``JobDefaultConfig.end_status_job_scheduling_updates``.

    Returns:
        True when at least one row was updated, otherwise False.
    """
    increment = {Job.f_end_scheduling_updates: Job.f_end_scheduling_updates + 1}
    query = Job.update(increment).where(
        Job.f_job_id == job_id,
        Job.f_end_scheduling_updates < JobDefaultConfig.end_status_job_scheduling_updates,
    )
    return query.execute() > 0
def rerun_signal(cls, job_id, set_or_reset: bool):
    """Set or clear the rerun signal on every row of a job.

    Args:
        job_id: job whose rows are updated.
        set_or_reset: must be exactly ``True`` (set) or ``False`` (reset);
            truthy/falsy stand-ins such as ``1`` or ``None`` are rejected.

    Returns:
        True when at least one row was updated, otherwise False.

    Raises:
        RuntimeError: if ``set_or_reset`` is neither ``True`` nor ``False``.
    """
    if set_or_reset is True:
        # Setting the signal also clears the cancel signal and restarts
        # the end-scheduling update counter.
        update_fields = {
            Job.f_rerun_signal: True,
            Job.f_cancel_signal: False,
            Job.f_end_scheduling_updates: 0,
        }
    elif set_or_reset is False:
        update_fields = {Job.f_rerun_signal: False}
    else:
        # Message previously misspelled "rerun" as "rereun".
        raise RuntimeError(f"can not support rerun signal {set_or_reset}")
    update_status = Job.update(update_fields).where(Job.f_job_id == job_id).execute() > 0
    return update_status
def get_job_dsl(job_id, role, party_id):
    """Return the stored DSL of a job on one party.

    Returns:
        ``f_dsl`` of the first matching job, or an empty dict when no job
        matches.
    """
    matches = Job.select(Job.f_dsl).where(
        Job.f_job_id == job_id,
        Job.f_role == role,
        Job.f_party_id == party_id,
    )
    if not matches:
        return {}
    return matches[0].f_dsl
def end_scheduling_updates(cls, job_id):
    """Increment a job's end-scheduling update counter while under the cap.

    Rows are only touched while their counter is below
    ``END_STATUS_JOB_SCHEDULING_UPDATES``.

    Returns:
        True when at least one row was updated, otherwise False.
    """
    counter = Job.f_end_scheduling_updates
    query = Job.update({counter: counter + 1}).where(
        Job.f_job_id == job_id,
        counter < END_STATUS_JOB_SCHEDULING_UPDATES,
    )
    return query.execute() > 0
def get_job_configuration(job_id, role, party_id, tasks=None):
    """Fetch job configuration, in one of two shapes depending on ``tasks``.

    With a non-empty ``tasks`` the other arguments are ignored and, for
    each task, the ``upload_0`` parameters of its job are collected with
    the job description added under ``"notes"``. A task whose job has no
    row raises ``IndexError``.

    Without ``tasks`` the single job identified by ``job_id``, ``role`` and
    ``party_id`` is looked up.

    Returns:
        With ``tasks``: dict mapping job id to its ``upload_0`` parameters.
        Otherwise: tuple ``(dsl, runtime_conf, runtime_conf_on_party,
        train_runtime_conf)``, or four empty dicts when no job matches.
    """
    if not tasks:
        matches = Job.select(
            Job.f_dsl,
            Job.f_runtime_conf,
            Job.f_train_runtime_conf,
            Job.f_runtime_conf_on_party,
        ).where(
            Job.f_job_id == job_id,
            Job.f_role == role,
            Job.f_party_id == party_id,
        )
        if not matches:
            return {}, {}, {}, {}
        record = matches[0]
        return (record.f_dsl, record.f_runtime_conf,
                record.f_runtime_conf_on_party, record.f_train_runtime_conf)

    upload_confs = {}
    for task in tasks:
        rows = Job.select(
            Job.f_job_id, Job.f_runtime_conf_on_party, Job.f_description,
        ).where(Job.f_job_id == task.f_job_id)
        record = rows[0]
        component_params = record.f_runtime_conf_on_party["component_parameters"]
        upload_conf = component_params["role"]["local"]["0"]["upload_0"]
        upload_confs[record.f_job_id] = upload_conf
        # Written into the stored conf object itself, not a copy.
        upload_conf["notes"] = record.f_description
    return upload_confs
def get_upload_job_configuration_summary(upload_tasks: typing.List[Task]):
    """Collect the ``upload_0`` parameters of the jobs behind upload tasks.

    For each task the first ``Job`` row with the task's job id is read; a
    task whose job has no row raises ``IndexError``.

    Returns:
        Dict mapping job id to that job's ``upload_0`` parameters, with
        the job description added under ``"notes"``.
    """
    summary = {}
    for task in upload_tasks:
        rows = Job.select(
            Job.f_job_id,
            Job.f_runtime_conf_on_party,
            Job.f_description,
        ).where(Job.f_job_id == task.f_job_id)
        record = rows[0]
        component_params = record.f_runtime_conf_on_party["component_parameters"]
        upload_conf = component_params["role"]["local"]["0"]["upload_0"]
        summary[record.f_job_id] = upload_conf
        # Written into the stored conf object itself, not a copy.
        upload_conf["notes"] = record.f_description
    return summary
def get_job_configuration(job_id, role, party_id):
    """Load and JSON-decode a job's DSL and runtime configurations.

    Returns:
        Tuple ``(dsl, runtime_conf, train_runtime_conf)``, each decoded
        with ``json_loads``; three empty dicts when no job matches.
    """
    with DB.connection_context():
        matches = Job.select(
            Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf,
        ).where(
            Job.f_job_id == job_id,
            Job.f_role == role,
            Job.f_party_id == party_id,
        )
        if not matches:
            return {}, {}, {}
        record = matches[0]
        return (
            json_loads(record.f_dsl),
            json_loads(record.f_runtime_conf),
            json_loads(record.f_train_runtime_conf),
        )
def get_job_dsl_parser_by_job_id(job_id):
    """Build a DSL parser from the first stored row of a job.

    Returns:
        The result of ``get_job_dsl_parser`` for that row's DSL and
        runtime configurations, or ``None`` when the job has no rows.
    """
    matches = Job.select(
        Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf,
    ).where(Job.f_job_id == job_id)
    if not matches:
        return None
    record = matches[0]
    return get_job_dsl_parser(
        dsl=record.f_dsl,
        runtime_conf=record.f_runtime_conf,
        train_runtime_conf=record.f_train_runtime_conf,
    )
def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
    """Copy selected fields of ``initiator_job`` onto every stored job row.

    All rows sharing the initiator job's id are fetched through
    ``JobSaver.query_job``. For each of them the same info dict is re-used
    with that row's role and party id, and passed to both
    ``JobSaver.update_job_status`` and ``JobSaver.update_job``.

    Args:
        initiator_job: source of the field values.
        update_fields: field names without the ``f_`` prefix.

    Raises:
        Exception: when no job row exists for the initiator job's id.
    """
    stored_jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
    if not stored_jobs:
        raise Exception("Failed to update job status on initiator")

    job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
    job_info.update(
        (name, getattr(initiator_job, "f_%s" % name)) for name in update_fields
    )
    for stored_job in stored_jobs:
        job_info["role"] = stored_job.f_role
        job_info["party_id"] = stored_job.f_party_id
        JobSaver.update_job_status(job_info=job_info)
        JobSaver.update_job(job_info=job_info)
def ready_signal(cls, job_id, set_or_reset: bool, ready_timeout_ttl=None):
    """Flip the ready signal of a job, only if it is in the opposite state.

    Args:
        job_id: job whose rows are updated.
        set_or_reset: truthy sets the signal and stamps ``f_ready_time``;
            falsy clears both.
        ready_timeout_ttl: only used when resetting; when truthy, the reset
            applies only to rows whose ready time is older than this value.

    Returns:
        True when at least one row was updated, otherwise False.
    """
    conditions = [Job.f_job_id == job_id]
    if not set_or_reset:
        changes = {Job.f_ready_signal: False, Job.f_ready_time: None}
        conditions.append(Job.f_ready_signal == True)  # noqa: E712 - query expression
        if ready_timeout_ttl:
            conditions.append(
                current_timestamp() - Job.f_ready_time > ready_timeout_ttl)
    else:
        changes = {Job.f_ready_signal: True, Job.f_ready_time: current_timestamp()}
        conditions.append(Job.f_ready_signal == False)  # noqa: E712 - query expression
    return Job.update(changes).where(*conditions).execute() > 0
def check_request_parameters(request_data):
    """Fill in ``role`` and ``party_id`` from the job's initiator if absent.

    Nothing happens when the request already has either key, or when no
    initiator row exists for ``request_data['job_id']``. Otherwise both
    keys are written into ``request_data`` in place, defaulting to ``''``
    and ``0`` when the stored conf lacks them.
    """
    if 'role' in request_data or 'party_id' in request_data:
        return
    matches = Job.select(Job.f_runtime_conf_on_party).where(
        Job.f_job_id == request_data.get('job_id', ''),
        Job.f_is_initiator == True,  # noqa: E712 - query expression
    )
    if not matches:
        return
    initiator = matches[0].f_runtime_conf_on_party.get('initiator', {})
    initiator_role = initiator.get('role', '')
    initiator_party_id = initiator.get('party_id', 0)
    request_data['role'] = initiator_role
    request_data['party_id'] = initiator_party_id
def query_job(**kwargs):
    """Query jobs by equality on any ``Job`` field.

    Each keyword ``name=value`` becomes the condition
    ``Job.f_<name> == value``; keywords with no matching ``Job`` attribute
    are ignored.

    Returns:
        List of matching jobs, or an empty list when no keyword produced a
        condition.
    """
    with DB.connection_context():
        conditions = []
        for name, value in kwargs.items():
            attr_name = 'f_%s' % name
            if not hasattr(Job, attr_name):
                continue
            conditions.append(getattr(Job, attr_name) == value)
        if not conditions:
            # not allow query all job
            return []
        return list(Job.select().where(*conditions))
def check_request_parameters(request_data):
    """Fill in ``role`` and ``party_id`` from the job's initiator if absent.

    Nothing happens when the request already has either key, or when no
    initiator row exists for ``request_data['job_id']``. Otherwise the
    row's JSON runtime conf is decoded and both keys are written into
    ``request_data`` in place, defaulting to ``''`` and ``0``.
    """
    with DB.connection_context():
        if 'role' in request_data or 'party_id' in request_data:
            return
        matches = Job.select(Job.f_runtime_conf).where(
            Job.f_job_id == request_data.get('job_id', ''),
            Job.f_is_initiator == 1,
        )
        if not matches:
            return
        runtime_conf = json_loads(matches[0].f_runtime_conf)
        initiator = runtime_conf.get('initiator', {})
        initiator_role = initiator.get('role', '')
        initiator_party_id = initiator.get('party_id', 0)
        request_data['role'] = initiator_role
        request_data['party_id'] = initiator_party_id
def get_job_dsl_parser_by_job_id(job_id):
    """Build a DSL parser from the first stored row of a job.

    The DSL and both runtime configurations are decoded with
    ``json_loads`` before being passed to ``get_job_dsl_parser``.

    Returns:
        The parser, or ``None`` when the job has no rows.
    """
    with DB.connection_context():
        matches = Job.select(
            Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf,
        ).where(Job.f_job_id == job_id)
        if not matches:
            return None
        record = matches[0]
        return get_job_dsl_parser(
            dsl=json_loads(record.f_dsl),
            runtime_conf=json_loads(record.f_runtime_conf),
            train_runtime_conf=json_loads(record.f_train_runtime_conf),
        )
def get_job_configuration(job_id, role, party_id, tasks=None):
    """Fetch job configuration, in one of two shapes depending on ``tasks``.

    With a non-empty ``tasks`` the other arguments are ignored and, for
    each task, the ``upload_0`` role parameters of its job are decoded
    from the JSON runtime conf, with the job description added under
    ``"notes"``. A task whose job has no row raises ``IndexError``.

    Returns:
        With ``tasks``: dict mapping job id to its ``upload_0`` parameters.
        Otherwise: tuple ``(dsl, runtime_conf, train_runtime_conf)``
        decoded with ``json_loads``, or three empty dicts when no job
        matches.
    """
    with DB.connection_context():
        if not tasks:
            matches = Job.select(
                Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf,
            ).where(
                Job.f_job_id == job_id,
                Job.f_role == role,
                Job.f_party_id == party_id,
            )
            if not matches:
                return {}, {}, {}
            record = matches[0]
            return (
                json_loads(record.f_dsl),
                json_loads(record.f_runtime_conf),
                json_loads(record.f_train_runtime_conf),
            )

        upload_confs = {}
        for task in tasks:
            rows = Job.select(
                Job.f_job_id, Job.f_runtime_conf, Job.f_description,
            ).where(Job.f_job_id == task.f_job_id)
            record = rows[0]
            runtime_conf = json_loads(record.f_runtime_conf)
            upload_conf = runtime_conf["role_parameters"]["local"]["upload_0"]
            upload_confs[record.f_job_id] = upload_conf
            upload_conf["notes"] = record.f_description
        return upload_confs
def save_machine_learning_model_info(self):
    """Record this model's metadata unless a matching record already exists.

    An ``MLModel`` row counts as existing when its model version equals
    this object's job id and its role, model id and party id match. If
    none exists, the job row and the pipeline model are read and the
    combined info is passed to ``model_utils.save_model_info``. Any
    exception is logged and swallowed.
    """
    try:
        existing = MLModel.get_or_none(
            MLModel.f_model_version == self.job_id,
            MLModel.f_role == self.role,
            MLModel.f_model_id == self.model_id,
            MLModel.f_party_id == self.party_id,
        )
        if existing:
            schedule_logger(self.job_id).info(
                'model {} info has already existed in database.'.format(self.job_id))
            return

        job = Job.get_or_none(Job.f_job_id == self.job_id)
        # The pipeline is read before checking that the job exists.
        pipeline = self.pipelined_model.read_pipeline_model()
        if not job:
            schedule_logger(self.job_id).info(
                'save {} model info failed, no job found in db. '
                'model id: {}, model version: {}.'.format(
                    self.job_id, self.model_id, self.model_version))
            return

        job_data = job.to_dict()
        model_info = {
            'job_id': job_data.get("f_job_id"),
            'role': self.role,
            'party_id': self.party_id,
            'roles': job_data.get("f_roles"),
            'model_id': self.model_id,
            'model_version': self.model_version,
            'initiator_role': job_data.get('f_initiator_role'),
            'initiator_party_id': job_data.get('f_initiator_party_id'),
            'runtime_conf': job_data.get('f_runtime_conf'),
            'work_mode': job_data.get('f_work_mode'),
            'train_dsl': job_data.get('f_dsl'),
            'train_runtime_conf': job_data.get('f_train_runtime_conf'),
            'size': self.get_model_size(),
            'job_status': job_data.get('f_status'),
            'parent': pipeline.parent,
            'fate_version': pipeline.fate_version,
            'runtime_conf_on_party': json_loads(pipeline.runtime_conf_on_party),
            'parent_info': json_loads(pipeline.parent_info),
            'inference_dsl': json_loads(pipeline.inference_dsl)
        }
        model_utils.save_model_info(model_info)
        schedule_logger(self.job_id).info(
            'save {} model info done. model id: {}, model version: {}.'.format(
                self.job_id, self.model_id, self.model_version))
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
def query_job(cls, reverse=None, order_by=None, **kwargs):
    """Query jobs by equality on ``Job`` fields, optionally sorted.

    Args:
        reverse: ``True`` sorts descending, ``False`` ascending; ``None``
            (or any other value) leaves the result unsorted.
        order_by: field name without the ``f_`` prefix; falls back to
            ``create_time`` when empty or not a ``Job`` attribute.
        **kwargs: ``name=value`` pairs turned into ``Job.f_<name> == value``;
            names with no matching attribute are ignored.

    Returns:
        List of matching jobs, or an empty list when no keyword produced a
        condition.
    """
    conditions = []
    for name, value in kwargs.items():
        attr_name = 'f_%s' % name
        if not hasattr(Job, attr_name):
            continue
        conditions.append(getattr(Job, attr_name) == value)
    if not conditions:
        # not allow query all job
        return []

    query = Job.select().where(*conditions)
    if reverse is not None:
        if not order_by or not hasattr(Job, f"f_{order_by}"):
            order_by = "create_time"
        if reverse is True:
            query = query.order_by(getattr(Job, f"f_{order_by}").desc())
        elif reverse is False:
            query = query.order_by(getattr(Job, f"f_{order_by}").asc())
    return list(query)
def save_machine_learning_model_info(self):
    """Create an ``MLModel`` row for this job's model unless one exists.

    A row counts as existing when its model version equals this object's
    job id. Otherwise the job row is read and its fields, together with
    this object's role, party id, model id/version and model size, are
    stored via ``MLModel.create``. Any exception is logged and swallowed.
    """
    try:
        existing = MLModel.get_or_none(MLModel.f_model_version == self.job_id)
        if existing:
            schedule_logger(self.job_id).info(
                'model {} info has already existed in database.'.format(
                    self.job_id))
            return

        job = Job.get_or_none(Job.f_job_id == self.job_id)
        if not job:
            schedule_logger(self.job_id).info(
                'save {} model info failed, no job found in db. '
                'model id: {}, model version: {}.'.format(
                    self.job_id, self.model_id, self.model_version))
            return

        job_data = job.to_json()
        MLModel.create(
            f_role=self.role,
            f_party_id=self.party_id,
            f_roles=job_data.get("f_roles"),
            f_model_id=self.model_id,
            f_model_version=self.model_version,
            f_job_id=job_data.get("f_job_id"),
            f_create_time=current_timestamp(),
            f_initiator_role=job_data.get('f_initiator_role'),
            f_initiator_party_id=job_data.get('f_initiator_party_id'),
            f_runtime_conf=job_data.get('f_runtime_conf'),
            f_work_mode=job_data.get('f_work_mode'),
            f_dsl=job_data.get('f_dsl'),
            f_train_runtime_conf=job_data.get('f_train_runtime_conf'),
            f_size=self.get_model_size(),
            f_job_status=job_data.get('f_status'),
        )
        schedule_logger(self.job_id).info(
            'save {} model info done. model id: {}, model version: {}.'
            .format(self.job_id, self.model_id, self.model_version))
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
def detect_resource_record(cls):
    """Reclaim resources from jobs that have held them for too long.

    Selects jobs still marked as holding resources whose apply time lies
    more than ``10 * 60 * 1000`` timestamp units in the past (presumably
    ten minutes in milliseconds — confirm against ``current_timestamp``)
    and whose status is an end status or ``WAITING``. Waiting jobs are
    collected and handed to ``cls.request_stop_jobs`` with a ``TIMEOUT``
    status; for the others the resources are returned via
    ``ResourceManager``. Failures are logged, never raised.
    """
    detect_logger().info('start detect resource recycle')
    try:
        candidate_status = EndStatus.status_list()
        candidate_status.append(JobStatus.WAITING)
        overdue_jobs = Job.select().where(
            Job.f_resource_in_use == True,  # noqa: E712 - query expression
            current_timestamp() - Job.f_apply_resource_time > 10 * 60 * 1000,
            Job.f_status << candidate_status,
        )
        waiting_jobs = set()
        for job in overdue_jobs:
            if job.f_status == JobStatus.WAITING:
                waiting_jobs.add(job)
                continue
            # One failing job must not stop the others from being handled.
            try:
                detect_logger(job_id=job.f_job_id).info(
                    f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                )
                returned = ResourceManager.return_job_resource(
                    job_id=job.f_job_id,
                    role=job.f_role,
                    party_id=job.f_party_id,
                )
                if not returned:
                    detect_logger(job_id=job.f_job_id).info(
                        f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                    )
                else:
                    detect_logger(job_id=job.f_job_id).info(
                        f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                    )
            except Exception as e:
                detect_logger(job_id=job.f_job_id).exception(e)
        cls.request_stop_jobs(
            jobs=waiting_jobs,
            stop_msg="start timeout",
            stop_status=JobStatus.TIMEOUT,
        )
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect resource recycle')
def query_job(cls, reverse=None, order_by=None, **kwargs):
    """Query jobs with equality, membership and range filters.

    Each keyword ``name=value`` maps to ``Job.f_<name>``:

    * ``start_time`` / ``end_time`` / ``elapsed`` with a list value become
      a ``between(value[0], value[1])`` filter; for the two time fields,
      string bounds are converted with ``str_to_time_stamp`` first.
    * a ``set`` value becomes a membership (``<<``) filter.
    * anything else becomes an equality filter.

    Keywords with no matching ``Job`` attribute are ignored.

    Args:
        reverse: ``True`` sorts descending, ``False`` ascending; ``None``
            (or any other value) leaves the result unsorted.
        order_by: field name without the ``f_`` prefix; falls back to
            ``create_time`` when empty or not a ``Job`` attribute.

    Returns:
        List of matching jobs, or an empty list when no keyword produced a
        filter.
    """
    range_fields = ('f_start_time', 'f_end_time', 'f_elapsed')

    def _as_timestamp(raw):
        # time type: %Y-%m-%d %H:%M:%S
        return str_to_time_stamp(raw) if isinstance(raw, str) else raw

    conditions = []
    for name, value in kwargs.items():
        attr_name = 'f_%s' % name
        if attr_name in range_fields and isinstance(value, list):
            if attr_name == 'f_elapsed':
                bounds = (value[0], value[1])
            else:
                bounds = (_as_timestamp(value[0]), _as_timestamp(value[1]))
            conditions.append(getattr(Job, attr_name).between(*bounds))
        elif hasattr(Job, attr_name):
            field = getattr(Job, attr_name)
            if isinstance(value, set):
                conditions.append(field << value)
            else:
                conditions.append(field == value)
    if not conditions:
        return []

    query = Job.select().where(*conditions)
    if reverse is not None:
        if not order_by or not hasattr(Job, f"f_{order_by}"):
            order_by = "create_time"
        if reverse is True:
            query = query.order_by(getattr(Job, f"f_{order_by}").desc())
        elif reverse is False:
            query = query.order_by(getattr(Job, f"f_{order_by}").asc())
    return list(query)
def create_job(cls, job: Job):
    """Issue the ``create`` job command, with the job's dict as the body.

    Returns:
        Whatever ``cls.job_command`` returns.
    """
    command_body = job.to_human_model_dict()
    return cls.job_command(
        job=job,
        command="create",
        command_body=command_body,
        order_federated=True,
    )
def run_job(job_id, initiator_role, initiator_party_id):
    """Run a job's root components and publish its final status.

    The job configuration is loaded for the initiator party, the job is
    marked ``RUNNING``, and each root component of the DAG is run in turn
    until one fails. The final status is then derived, synced, and the job
    is finished.

    Args:
        job_id: job to run.
        initiator_role: role used to look up the job configuration.
        initiator_party_id: party id used to look up the job configuration.

    Returns:
        ``False`` when the runtime conf has no initiator section;
        otherwise ``None``.
    """
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl,
                             runtime_conf=job_runtime_conf,
                             train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False
    storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)

    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        job_info=job.to_json())

    # Collects the distinct outcomes (True/False) of the root components.
    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger.info('job {} root components is {}'.format(
        job.f_job_id, [component.get_name() for component in components]))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(
                job_id, job_runtime_conf, job_parameters, job_initiator,
                job_args, dag, component)
        except Exception as e:
            # Log with traceback; a bare info(e) hid where the component
            # failed.
            schedule_logger.exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        # Stop at the first failing component.
        if not run_status:
            break

    if len(top_level_task_status) == 2:
        # Both a success and a failure were seen.
        job.f_status = JobStatus.PARTIAL
    elif True in top_level_task_status:
        job.f_status = JobStatus.SUCCESS
    else:
        job.f_status = JobStatus.FAILED
    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.SUCCESS:
        job.f_progress = 100
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(
        job_id=job_id,
        roles=job_runtime_conf['role'],
        work_mode=job_parameters['work_mode'],
        initiator_party_id=job_initiator['party_id'],
        job_info=job.to_json())
    TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    schedule_logger.info('job {} finished, status is {}'.format(
        job.f_job_id, job.f_status))
def submit_job(job_data):
    """Register a new job, distribute it and queue it for running.

    Args:
        job_data: mapping with ``job_dsl`` and ``job_runtime_conf``; the
            runtime conf must contain ``job_parameters``, ``initiator`` and
            ``role``. ``job_parameters`` is modified in place for
            non-predict jobs.

    Returns:
        Tuple ``(job_id, job_dsl_path, job_runtime_conf_path, model_info,
        board_url)`` where ``model_info`` holds ``model_id`` and
        ``model_version``.
    """
    job_id = generate_job_id()
    schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        # The model version of a non-predict job is the job id itself.
        job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        # The submitted ``job_dsl`` is replaced in this branch.
        job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id,
                                                        job_dsl=job_dsl,
                                                        job_runtime_conf=job_runtime_conf)
    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()
    # save job info
    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)
    # push into queue
    RuntimeConfig.JOB_QUEUE.put_event({
        'job_id': job_id,
        "initiator_role": job_initiator['role'],
        "initiator_party_id": job_initiator['party_id']
    }
    )
    schedule_logger.info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    return job_id, job_dsl_path, job_runtime_conf_path, {'model_id': job_parameters['model_id'],
                                                         'model_version': job_parameters[
                                                             'model_version']}, board_url
def run_job(job_id, initiator_role, initiator_party_id):
    """Run a job's root components under a timeout and publish its status.

    The job configuration is loaded for the initiator party, a timer
    calling ``TaskScheduler.job_handler`` is started, the job is marked
    ``RUNNING``, and each root component of the DAG is run in turn until
    one fails. The job is then finished, stopped if it failed, its final
    status synced, and the timer cancelled.

    Returns:
        ``False`` when the runtime conf has no initiator section;
        otherwise ``None``.
    """
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                    role=initiator_role,
                                                                                    party_id=initiator_party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_initiator = job_runtime_conf.get('initiator', {})
    dag = get_job_dsl_parser(dsl=job_dsl,
                             runtime_conf=job_runtime_conf,
                             train_runtime_conf=train_runtime_conf)
    job_args = dag.get_args_input()
    if not job_initiator:
        return False
    timeout = job_utils.get_timeout(job_id, job_parameters.get("timeout", None), job_runtime_conf, job_dsl)
    # NOTE(review): the timer is only cancelled on the last line; if an
    # exception escapes before that, it stays armed and job_handler fires
    # after ``timeout`` — confirm that is the intended safety net.
    t = Timer(timeout, TaskScheduler.job_handler, [job_id])
    t.start()
    job = Job()
    job.f_job_id = job_id
    job.f_start_time = current_timestamp()
    job.f_status = JobStatus.RUNNING
    job.f_update_time = current_timestamp()
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  initiator_role=job_initiator['role'],
                                  job_info=job.to_json())
    # Collects the distinct outcomes (True/False) of the root components.
    top_level_task_status = set()
    components = dag.get_next_components(None)
    schedule_logger(job_id).info(
        'job {} root components is {}'.format(job.f_job_id, [component.get_name() for component in components],
                                              None))
    for component in components:
        try:
            # run a component as task
            run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                                                     job_args, dag,
                                                     component)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            run_status = False
        top_level_task_status.add(run_status)
        # Stop at the first failing component.
        if not run_status:
            break
    # A mix of success and failure counts as FAILED here, so only an
    # all-success run ends as COMPLETE.
    if len(top_level_task_status) == 2:
        job.f_status = JobStatus.FAILED
    elif True in top_level_task_status:
        job.f_status = JobStatus.COMPLETE
    else:
        job.f_status = JobStatus.FAILED
    job.f_end_time = current_timestamp()
    job.f_elapsed = job.f_end_time - job.f_start_time
    if job.f_status == JobStatus.COMPLETE:
        job.f_progress = 100
    job.f_update_time = current_timestamp()
    try:
        TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
    except Exception as e:
        # A failure while finishing downgrades the job to FAILED.
        schedule_logger(job_id).exception(e)
        job.f_status = JobStatus.FAILED
    if job.f_status == JobStatus.FAILED:
        TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)
    try:
        TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job.to_json())
    except Exception as e:
        # Best effort: a failed final sync is logged, not raised.
        schedule_logger(job_id).exception(e)
        schedule_logger(job_id).warning('job {} sync status failed'.format(job.f_job_id))
    schedule_logger(job_id).info('job {} finished, status is {}'.format(job.f_job_id, job.f_status))
    t.cancel()