def save_data_view(self, role, party_id, data_info, mark=False): with DB.connection_context(): data_views = DataView.select().where( DataView.f_job_id == self.job_id, DataView.f_component_name == self.component_name, DataView.f_task_id == self.task_id, DataView.f_role == role, DataView.f_party_id == party_id) is_insert = True if mark and self.component_name == "upload_0": return if data_views: data_view = data_views[0] is_insert = False else: data_view = DataView() data_view.f_create_time = current_timestamp() data_view.f_job_id = self.job_id data_view.f_component_name = self.component_name data_view.f_task_id = self.task_id data_view.f_role = role data_view.f_party_id = party_id data_view.f_update_time = current_timestamp() for k, v in data_info.items(): if k in [ 'f_job_id', 'f_component_name', 'f_task_id', 'f_role', 'f_party_id' ] or v == getattr(DataView, k).default: continue setattr(data_view, k, v) if is_insert: data_view.save(force_insert=True) else: data_view.save() return data_view
def insert_data_to_db(self, metric_namespace: str, metric_name: str, data_type: int, kv, job_level=False): with DB.connection_context(): try: tracking_metric = TrackingMetric.model(table_index=self.job_id) tracking_metric.f_job_id = self.job_id tracking_metric.f_component_name = self.component_name if not job_level else 'dag' tracking_metric.f_task_id = self.task_id tracking_metric.f_role = self.role tracking_metric.f_party_id = self.party_id tracking_metric.f_metric_namespace = metric_namespace tracking_metric.f_metric_name = metric_name tracking_metric.f_type = data_type default_db_source = tracking_metric.to_json() tracking_metric_data_source = [] for k, v in kv: db_source = default_db_source.copy() db_source['f_key'] = serialize_b64(k) db_source['f_value'] = serialize_b64(v) db_source['f_create_time'] = current_timestamp() tracking_metric_data_source.append(db_source) self.bulk_insert_model_data( TrackingMetric.model(table_index=self.get_table_index()), tracking_metric_data_source) except Exception as e: stat_logger.exception(e)
def store_rsa(host_party_id, id_type, encrypt_type, tag, namespace, version, rsa): init_database_tables() with DB.connection_context(): LOGGER.info('store rsa and out table info, partyid={}, id_type={}, encrypt_type={}, namespace={}, version={}.'.format(host_party_id, \ id_type, encrypt_type, namespace, version)) infos = IdLibraryCacheInfo.select().where(IdLibraryCacheInfo.f_party_id == host_party_id, \ IdLibraryCacheInfo.f_id_type == id_type, IdLibraryCacheInfo.f_encrypt_type == encrypt_type, \ IdLibraryCacheInfo.f_tag == tag, IdLibraryCacheInfo.f_namespcae == namespace, IdLibraryCacheInfo.f_version == version) is_insert = True if infos: info = infos[0] is_insert = False else: info = IdLibraryCacheInfo() info.f_party_id = host_party_id info.f_id_type = id_type info.f_encrypt_type = encrypt_type info.f_namespcae = namespace info.f_version = version info.f_tag = tag info.f_rsa_key_n = str(rsa.get('rsa_n')) info.f_rsa_key_d = str(rsa.get('rsa_d')) info.f_rsa_key_e = str(rsa.get('rsa_e')) info.f_create_time = current_timestamp() if is_insert: info.save(force_insert=True) else: info.save()
def update_job_progress(job_id, dag, current_task_id): component_count = len(dag.get_dependency()['component_list']) success_count = success_task_count(job_id=job_id) job = Job() job.f_progress = float(success_count) / component_count * 100 job.f_update_time = current_timestamp() job.f_current_tasks = json_dumps([current_task_id]) return job
def save_job_info(self, role, party_id, job_info, create=False): with DB.connection_context(): stat_logger.info('save {} {} job: {}'.format( role, party_id, job_info)) jobs = Job.select().where(Job.f_job_id == self.job_id, Job.f_role == role, Job.f_party_id == party_id) is_insert = True if jobs: job = jobs[0] is_insert = False if job.f_status == JobStatus.TIMEOUT: return None elif create: job = Job() job.f_create_time = current_timestamp() else: return None job.f_job_id = self.job_id job.f_role = role job.f_party_id = party_id if 'f_status' in job_info: if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]: # Termination status cannot be updated # TODO: pass if job_info[ 'f_status'] == JobStatus.FAILED and not job.f_end_time: job.f_end_time = current_timestamp() job.f_elapsed = job.f_end_time - job.f_start_time job.f_update_time = current_timestamp() for k, v in job_info.items(): try: if k in ['f_job_id', 'f_role', 'f_party_id' ] or v == getattr(Job, k).default: continue setattr(job, k, v) except: pass if is_insert: job.save(force_insert=True) else: job.save()
def submit_job(job_data): job_id = generate_job_id() schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data)) job_dsl = job_data.get('job_dsl', {}) job_runtime_conf = job_data.get('job_runtime_conf', {}) job_utils.check_pipeline_job_runtime_conf(job_runtime_conf) job_parameters = job_runtime_conf['job_parameters'] job_initiator = job_runtime_conf['initiator'] job_type = job_parameters.get('job_type', '') if job_type != 'predict': # generate job model info job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model']) job_parameters['model_version'] = job_id train_runtime_conf = {} else: detect_utils.check_config(job_parameters, ['model_id', 'model_version']) # get inference dsl from pipeline model as job dsl job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'], model_id=job_parameters['model_id'], model_version=job_parameters['model_version']) pipeline_model = job_tracker.get_output_model('pipeline') job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl) train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf) job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf) job = Job() job.f_job_id = job_id job.f_roles = json_dumps(job_runtime_conf['role']) job.f_work_mode = job_parameters['work_mode'] job.f_initiator_party_id = job_initiator['party_id'] job.f_dsl = json_dumps(job_dsl) job.f_runtime_conf = json_dumps(job_runtime_conf) job.f_train_runtime_conf = json_dumps(train_runtime_conf) job.f_run_ip = '' job.f_status = JobStatus.WAITING job.f_progress = 0 job.f_create_time = current_timestamp() # save job info TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator) # push into queue RuntimeConfig.JOB_QUEUE.put_event({ 'job_id': job_id, "initiator_role": job_initiator['role'], "initiator_party_id": job_initiator['party_id'] } ) schedule_logger.info( 'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id'])) board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id']) return job_id, job_dsl_path, job_runtime_conf_path, {'model_id': job_parameters['model_id'], 'model_version': job_parameters[ 'model_version']}, board_url
def save_task(self, role, party_id, task_info): with DB.connection_context(): tasks = Task.select().where( Task.f_job_id == self.job_id, Task.f_component_name == self.component_name, Task.f_task_id == self.task_id, Task.f_role == role, Task.f_party_id == party_id) is_insert = True if tasks: task = tasks[0] is_insert = False else: task = Task() task.f_create_time = current_timestamp() task.f_job_id = self.job_id task.f_component_name = self.component_name task.f_task_id = self.task_id task.f_role = role task.f_party_id = party_id if 'f_status' in task_info: if task.f_status in [TaskStatus.COMPLETE, TaskStatus.FAILED]: # Termination status cannot be updated # TODO: pass for k, v in task_info.items(): try: if k in [ 'f_job_id', 'f_component_name', 'f_task_id', 'f_role', 'f_party_id' ] or v == getattr(Task, k).default: continue except: pass setattr(task, k, v) if is_insert: task.save(force_insert=True) else: task.save() return task
def save_job_info(self, role, party_id, job_info, create=False): with DB.connection_context(): stat_logger.info('save {} {} job: {}'.format( role, party_id, job_info)) jobs = Job.select().where(Job.f_job_id == self.job_id, Job.f_role == role, Job.f_party_id == party_id) is_insert = True if jobs: job = jobs[0] is_insert = False elif create: job = Job() job.f_create_time = current_timestamp() else: return None job.f_job_id = self.job_id job.f_role = role job.f_party_id = party_id if 'f_status' in job_info: if job.f_status in [ JobStatus.SUCCESS, JobStatus.FAILED, JobStatus.PARTIAL, JobStatus.DELETED ]: # Termination status cannot be updated # TODO: pass for k, v in job_info.items(): if k in ['f_job_id', 'f_role', 'f_party_id'] or v == getattr( Job, k).default: continue setattr(job, k, v) if is_insert: job.save(force_insert=True) else: job.save()
def save(self, *args, **kwargs): if hasattr(self, "update_date"): self.update_date = datetime.datetime.now() if hasattr(self, "update_time"): self.update_time = current_timestamp() super(DataBaseModel, self).save(*args, **kwargs)
def run_job(job_id, initiator_role, initiator_party_id): job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration( job_id=job_id, role=initiator_role, party_id=initiator_party_id) job_parameters = job_runtime_conf.get('job_parameters', {}) job_initiator = job_runtime_conf.get('initiator', {}) dag = get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf) job_args = dag.get_args_input() if not job_initiator: return False storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE) job = Job() job.f_job_id = job_id job.f_start_time = current_timestamp() job.f_status = JobStatus.RUNNING job.f_update_time = current_timestamp() TaskScheduler.sync_job_status( job_id=job_id, roles=job_runtime_conf['role'], work_mode=job_parameters['work_mode'], initiator_party_id=job_initiator['party_id'], job_info=job.to_json()) top_level_task_status = set() components = dag.get_next_components(None) schedule_logger.info('job {} root components is {}'.format( job.f_job_id, [component.get_name() for component in components], None)) for component in components: try: # run a component as task run_status = TaskScheduler.run_component( job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component) except Exception as e: schedule_logger.info(e) run_status = False top_level_task_status.add(run_status) if not run_status: break if len(top_level_task_status) == 2: job.f_status = JobStatus.PARTIAL elif True in top_level_task_status: job.f_status = JobStatus.SUCCESS else: job.f_status = JobStatus.FAILED job.f_end_time = current_timestamp() job.f_elapsed = job.f_end_time - job.f_start_time if job.f_status == JobStatus.SUCCESS: job.f_progress = 100 job.f_update_time = current_timestamp() TaskScheduler.sync_job_status( job_id=job_id, roles=job_runtime_conf['role'], work_mode=job_parameters['work_mode'], initiator_party_id=job_initiator['party_id'], job_info=job.to_json()) TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf) schedule_logger.info('job {} finished, status is {}'.format( job.f_job_id, job.f_status))
def run_task(): task = Task() task.f_create_time = current_timestamp() try: parser = argparse.ArgumentParser() parser.add_argument('-j', '--job_id', required=True, type=str, help="job id") parser.add_argument('-n', '--component_name', required=True, type=str, help="component name") parser.add_argument('-t', '--task_id', required=True, type=str, help="task id") parser.add_argument('-r', '--role', required=True, type=str, help="role") parser.add_argument('-p', '--party_id', required=True, type=str, help="party id") parser.add_argument('-c', '--config', required=True, type=str, help="task config") parser.add_argument('--job_server', help="job server", type=str) args = parser.parse_args() schedule_logger(args.job_id).info('enter task process') schedule_logger(args.job_id).info(args) # init function args if args.job_server: RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1]) job_id = args.job_id component_name = args.component_name task_id = args.task_id role = args.role party_id = int(args.party_id) task_config = file_utils.load_json_conf(args.config) job_parameters = task_config['job_parameters'] job_initiator = task_config['job_initiator'] job_args = task_config['job_args'] task_input_dsl = task_config['input'] task_output_dsl = task_config['output'] parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id) # parameters = task_config['parameters'] module_name = task_config['module_name'] except Exception as e: traceback.print_exc() schedule_logger().exception(e) task.f_status = TaskStatus.FAILED return try: # init environment, process is shared globally RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'], BACKEND=job_parameters.get('backend', 0)) session.init(job_id='{}_{}_{}'.format(task_id, role, party_id), mode=RuntimeConfig.WORK_MODE, backend=RuntimeConfig.BACKEND) federation.init(job_id=task_id, runtime_conf=parameters) job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id)) task_log_dir = os.path.join(job_log_dir, component_name) log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir, append_to_parent_log=True, force=True) task.f_job_id = job_id task.f_component_name = component_name task.f_task_id = task_id task.f_role = role task.f_party_id = party_id task.f_operator = 'python_operator' tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name, task_id=task_id, model_id=job_parameters['model_id'], model_version=job_parameters['model_version'], module_name=module_name) task.f_start_time = current_timestamp() task.f_run_ip = get_lan_ip() task.f_run_pid = os.getpid() run_class_paths = parameters.get('CodePath').split('/') run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py','') run_class_name = run_class_paths[-1] task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id, job_parameters=job_parameters, job_args=job_args, input_dsl=task_input_dsl) run_object = getattr(importlib.import_module(run_class_package), run_class_name)() run_object.set_tracker(tracker=tracker) run_object.set_taskid(taskid=task_id) task.f_status = TaskStatus.RUNNING TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role, party_id=party_id, initiator_party_id=job_initiator.get('party_id', None), initiator_role=job_initiator.get('role', None), task_info=task.to_json()) schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id)) schedule_logger().info(parameters) schedule_logger().info(task_input_dsl) run_object.run(parameters, task_run_args) output_data = run_object.save_data() tracker.save_output_data_table(output_data, task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component') output_model = run_object.export_model() # There is only one model output at the current dsl version. tracker.save_output_model(output_model, task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default') task.f_status = TaskStatus.SUCCESS except Exception as e: traceback.print_exc() schedule_logger().exception(e) task.f_status = TaskStatus.FAILED finally: sync_success = False try: session.stop() task.f_end_time = current_timestamp() task.f_elapsed = task.f_end_time - task.f_start_time task.f_update_time = current_timestamp() TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role, party_id=party_id, initiator_party_id=job_initiator.get('party_id', None), initiator_role=job_initiator.get('role', None), task_info=task.to_json()) sync_success = True except Exception as e: traceback.print_exc() schedule_logger().exception(e) schedule_logger().info( 'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status if sync_success else TaskStatus.FAILED)) print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status if sync_success else TaskStatus.FAILED))
def submit_job(job_data): job_id = generate_job_id() schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data)) job_dsl = job_data.get('job_dsl', {}) job_runtime_conf = job_data.get('job_runtime_conf', {}) job_utils.check_pipeline_job_runtime_conf(job_runtime_conf) job_parameters = job_runtime_conf['job_parameters'] job_initiator = job_runtime_conf['initiator'] job_type = job_parameters.get('job_type', '') if job_type != 'predict': # generate job model info job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model']) job_parameters['model_version'] = job_id train_runtime_conf = {} else: detect_utils.check_config(job_parameters, ['model_id', 'model_version']) # get inference dsl from pipeline model as job dsl job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'], model_id=job_parameters['model_id'], model_version=job_parameters['model_version']) pipeline_model = job_tracker.get_output_model('pipeline') job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl) train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf) path_dict = save_job_conf(job_id=job_id, job_dsl=job_dsl, job_runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf, pipeline_dsl=None) job = Job() job.f_job_id = job_id job.f_roles = json_dumps(job_runtime_conf['role']) job.f_work_mode = job_parameters['work_mode'] job.f_initiator_party_id = job_initiator['party_id'] job.f_dsl = json_dumps(job_dsl) job.f_runtime_conf = json_dumps(job_runtime_conf) job.f_train_runtime_conf = json_dumps(train_runtime_conf) job.f_run_ip = '' job.f_status = JobStatus.WAITING job.f_progress = 0 job.f_create_time = current_timestamp() initiator_role = job_initiator['role'] initiator_party_id = job_initiator['party_id'] if initiator_party_id not in job_runtime_conf['role'][initiator_role]: schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id)) raise Exception("initiator party id error {}".format(initiator_party_id)) get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf, train_runtime_conf=train_runtime_conf) TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator) # push into queue job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id) try: RuntimeConfig.JOB_QUEUE.put_event(job_event) except Exception as e: raise Exception('push job into queue failed') schedule_logger(job_id).info( 'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id'])) board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id']) logs_directory = get_job_log_directory(job_id) return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \ {'model_id': job_parameters['model_id'],'model_version': job_parameters['model_version']}, board_url
def run_task(): task = Task() task.f_create_time = current_timestamp() try: parser = argparse.ArgumentParser() parser.add_argument('-j', '--job_id', required=True, type=str, help="Specify a config json file path") parser.add_argument('-n', '--component_name', required=True, type=str, help="Specify a config json file path") parser.add_argument('-t', '--task_id', required=True, type=str, help="Specify a config json file path") parser.add_argument('-r', '--role', required=True, type=str, help="Specify a config json file path") parser.add_argument('-p', '--party_id', required=True, type=str, help="Specify a config json file path") parser.add_argument('-c', '--config', required=True, type=str, help="Specify a config json file path") args = parser.parse_args() schedule_logger.info('enter task process') schedule_logger.info(args) # init function args job_id = args.job_id component_name = args.component_name task_id = args.task_id role = args.role party_id = int(args.party_id) task_config = file_utils.load_json_conf(args.config) job_parameters = task_config.get('job_parameters', None) job_initiator = task_config.get('job_initiator', None) job_args = task_config.get('job_args', {}) task_input_dsl = task_config.get('input', {}) task_output_dsl = task_config.get('output', {}) parameters = task_config.get('parameters', {}) module_name = task_config.get('module_name', '') except Exception as e: schedule_logger.exception(e) task.f_status = TaskStatus.FAILED return try: # init environment RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode']) storage.init_storage(job_id=task_id, work_mode=RuntimeConfig.WORK_MODE) federation.init(job_id=task_id, runtime_conf=parameters) job_log_dir = os.path.join( job_utils.get_job_log_directory(job_id=job_id), role, str(party_id)) task_log_dir = os.path.join(job_log_dir, component_name) log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir, append_to_parent_log=True, force=True) task.f_job_id = job_id task.f_component_name = component_name task.f_task_id = task_id task.f_role = role task.f_party_id = party_id task.f_operator = 'python_operator' tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name, task_id=task_id, model_id=job_parameters['model_id'], model_version=job_parameters['model_version'], module_name=module_name) task.f_start_time = current_timestamp() task.f_run_ip = get_lan_ip() task.f_run_pid = os.getpid() run_class_paths = parameters.get('CodePath').split('/') run_class_package = '.'.join( run_class_paths[:-2]) + '.' + run_class_paths[-2].rstrip('.py') run_class_name = run_class_paths[-1] task_run_args = TaskExecutor.get_task_run_args( job_id=job_id, role=role, party_id=party_id, job_parameters=job_parameters, job_args=job_args, input_dsl=task_input_dsl) run_object = getattr(importlib.import_module(run_class_package), run_class_name)() run_object.set_tracker(tracker=tracker) run_object.set_taskid(taskid=task_id) task.f_status = TaskStatus.RUNNING TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role, party_id=party_id, initiator_party_id=job_initiator.get( 'party_id', None), task_info=task.to_json()) schedule_logger.info('run {} {} {} {} {} task'.format( job_id, component_name, task_id, role, party_id)) schedule_logger.info(parameters) schedule_logger.info(task_input_dsl) run_object.run(parameters, task_run_args) if task_output_dsl: if task_output_dsl.get('data', []): output_data = run_object.save_data() tracker.save_output_data_table( output_data, task_output_dsl.get('data')[0]) if task_output_dsl.get('model', []): output_model = run_object.export_model() # There is only one model output at the current dsl version. tracker.save_output_model(output_model, task_output_dsl['model'][0]) task.f_status = TaskStatus.SUCCESS except Exception as e: schedule_logger.exception(e) task.f_status = TaskStatus.FAILED finally: try: task.f_end_time = current_timestamp() task.f_elapsed = task.f_end_time - task.f_start_time task.f_update_time = current_timestamp() TaskExecutor.sync_task_status( job_id=job_id, component_name=component_name, task_id=task_id, role=role, party_id=party_id, initiator_party_id=job_initiator.get('party_id', None), task_info=task.to_json()) except Exception as e: schedule_logger.exception(e) schedule_logger.info('finish {} {} {} {} {} {} task'.format( job_id, component_name, task_id, role, party_id, task.f_status)) print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status))
def save(self, *args, **kwargs): if hasattr(self, "f_update_time"): self.f_update_time = current_timestamp() super(DataBaseModel, self).save(*args, **kwargs)