def save_metric_data(job_id, component_name, task_id, role, party_id):
    """Persist metrics posted in the request body through a Tracking instance."""
    request_data = request.json
    tracker = Tracking(job_id=job_id,
                       component_name=component_name,
                       task_id=task_id,
                       role=role,
                       party_id=party_id)
    # Metrics arrive base64-serialized; decode each entry before saving.
    decoded_metrics = [deserialize_b64(raw_metric) for raw_metric in request_data['metrics']]
    tracker.save_metric_data(metric_namespace=request_data['metric_namespace'],
                             metric_name=request_data['metric_name'],
                             metrics=decoded_metrics,
                             job_level=request_data['job_level'])
    return get_json_result()
def get_component_output_data_table(task_data):
    """Return the output data table of one component of a job.

    :param task_data: dict with keys 'job_id', 'component_name', 'role', 'party_id'
    :return: the component's tracked output data table
    :raises Exception: if the job DSL cannot be parsed or the component is unknown
    """
    check_request_parameters(task_data)
    tracker = Tracking(job_id=task_data['job_id'], component_name=task_data['component_name'],
                       role=task_data['role'], party_id=task_data['party_id'])
    job_dsl_parser = job_utils.get_job_dsl_parser_by_job_id(job_id=task_data['job_id'])
    if not job_dsl_parser:
        # Fixed message: it previously read 'can get dag parser', the opposite of the failure.
        raise Exception('can not get dag parser')
    component = job_dsl_parser.get_component_info(task_data['component_name'])
    if not component:
        # Fixed message: it previously read 'can found component'.
        raise Exception('can not find component')
    output_dsl = component.get_output()
    output_data_dsl = output_dsl.get('data', [])
    # The current version will only have one data output.
    output_data_table = tracker.get_output_data_table(output_data_dsl[0] if output_data_dsl else 'component')
    return output_data_table
def test_downsample(self):
    """Down-sampling at 30% keeps ~30 unique rows whose values match the source,
    and re-applying the recorded ids to the transform table yields the same set."""
    tracker = Tracking("jobid", "guest", 9999, "abc", "123")
    sampler = RandomSampler(fraction=0.3, method="downsample")
    sampler.set_tracker(tracker)
    sample_data, sample_ids = sampler.sample(self.table)
    row_count = sample_data.count()
    self.assertTrue(25 < row_count < 35)
    # Sampled ids must be unique.
    self.assertTrue(len(set(sample_ids)) == len(sample_ids))
    source = dict(self.data)
    for key, value in list(sample_data.collect()):
        self.assertTrue(key in source)
        self.assertTrue(np.abs(value - source.get(key)) < consts.FLOAT_ZERO)
    # Replay the recorded ids against the transform table.
    trans_sampler = RandomSampler(method="downsample")
    trans_sampler.set_tracker(tracker)
    trans_rows = list(trans_sampler.sample(self.table_trans, sample_ids).collect())
    trans_keys = [key for (key, _) in trans_rows]
    trans_source = dict(self.data_to_trans)
    id_index = dict(zip(sample_ids, range(len(sample_ids))))
    self.assertTrue(len(trans_rows) == len(sample_ids))
    self.assertTrue(set(trans_keys) == set(sample_ids))
    for key, value in trans_rows:
        self.assertTrue(key in id_index)
        self.assertTrue(np.abs(value - trans_source.get(key)) < consts.FLOAT_ZERO)
def setUp(self):
    """Build 100 sparse libsvm-style rows of the form '<label> <idx>:<val> ...'."""
    self.data = []
    self.max_feature = -1
    for row_id in range(100):
        parts = [str(row_id % 2)]  # binary label derived from the row id
        seen = {}
        for _ in range(20):
            feature_idx = random.randint(0, 1000)
            feature_val = random.random()
            if feature_idx in seen:
                continue  # skip duplicate feature indices within a row
            self.max_feature = max(self.max_feature, feature_idx)
            seen[feature_idx] = True
            parts.append("{}:{}".format(feature_idx, feature_val))
        self.data.append((row_id, " ".join(parts)))
    self.table = session.parallelize(self.data, include_key=True)
    self.args = {"data": {"data_io_0": {"data": self.table}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def setUp(self):
    """Build 100 rows of random 10-char tokens, with and without ':<value>' suffixes."""
    self.data = []
    self.data_with_value = []
    alphabet = string.ascii_letters + string.digits
    for row_id in range(100):
        tokens = []
        tokens_with_value = []
        for _ in range(100):
            # Keep roughly 30% of the candidate tokens.
            if random.randint(1, 100) > 30:
                continue
            token = ''.join(random.sample(alphabet, 10))
            tokens.append(token)
            tokens_with_value.append(token + ':' + str(random.random()))
        self.data.append((row_id, ' '.join(tokens)))
        self.data_with_value.append((row_id, ' '.join(tokens_with_value)))
    self.table1 = session.parallelize(self.data, include_key=True)
    self.table2 = session.parallelize(self.data_with_value, include_key=True)
    self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
    self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def test_upsample(self):
    """Stratified up-sampling should multiply each label's count by its fraction,
    and the recorded sample ids must allow the transform table to be re-sampled
    row-for-row."""
    fractions = [(0, 1.3), (1, 0.5), (2, 0.8), (3, 9)]
    sampler = StratifiedSampler(fractions=fractions, method="upsample")
    tracker = Tracking("jobid", "guest", 9999, "abc", "123")
    sampler.set_tracker(tracker)
    sample_data, sample_ids = sampler.sample(self.table)
    new_data = list(sample_data.collect())
    count_label = [0 for i in range(4)]
    data_dict = dict(self.data)
    for id, inst in new_data:
        count_label[inst.label] += 1
        # Up-sampled keys are dense indices into sample_ids.
        self.assertTrue(type(id).__name__ == 'int' and id >= 0 and id < len(sample_ids))
        real_id = sample_ids[id]
        self.assertTrue(inst.label == self.data[real_id][1].label and inst.features == self.data[real_id][1].features)
    for i in range(4):
        self.assertTrue(np.abs(count_label[i] - 250 * fractions[i][1]) < 10)
    trans_sampler = StratifiedSampler(method="upsample")
    trans_sampler.set_tracker(tracker)
    trans_sample_data = trans_sampler.sample(self.table_trans, sample_ids)
    # BUG FIX: materialize the rows. The original kept the lazy iterator returned
    # by collect(); the id-comprehension below exhausted it, so the final
    # verification loop silently iterated zero times.
    trans_data = list(trans_sample_data.collect())
    trans_sample_ids = [id for (id, value) in trans_data]
    data_to_trans_dict = dict(self.data_to_trans)
    self.assertTrue(sorted(trans_sample_ids) == list(range(len(sample_ids))))
    for id, inst in trans_data:
        real_id = sample_ids[id]
        # BUG FIX: the original referenced self.data_to_trans_dict, an attribute
        # that is never set on the fixture (AttributeError once the loop actually
        # runs); use the local dict built above, as the downsample test does.
        self.assertTrue(inst.features == data_to_trans_dict[real_id].features)
def test_upsample(self):
    """Up-sampling by 3x yields ~300 rows whose values map back through sample_ids."""
    tracker = Tracking("jobid", "guest", 9999, "abc", "123")
    sampler = RandomSampler(fraction=3, method="upsample")
    sampler.set_tracker(tracker)
    sample_data, sample_ids = sampler.sample(self.table)
    total = sample_data.count()
    self.assertTrue(250 < total < 350)
    source = dict(self.data)
    for key, value in list(sample_data.collect()):
        # Up-sampled keys index into sample_ids, which holds the original row ids.
        self.assertTrue(np.abs(value - source[sample_ids[key]]) < consts.FLOAT_ZERO)
    # Replay the recorded ids against the transform table.
    trans_sampler = RandomSampler(method="upsample")
    trans_sampler.set_tracker(tracker)
    trans_rows = list(trans_sampler.sample(self.table_trans, sample_ids).collect())
    trans_source = dict(self.data_to_trans)
    self.assertTrue(len(trans_rows) == len(sample_ids))
    for key, value in trans_rows:
        self.assertTrue(np.abs(value - trans_source[sample_ids[key]]) < consts.FLOAT_ZERO)
def test_downsample(self):
    """Stratified down-sampling keeps per-label counts near 250*fraction and
    preserves row contents; replaying the ids reproduces the same row set."""
    fractions = [(0, 0.3), (1, 0.4), (2, 0.5), (3, 0.8)]
    tracker = Tracking("jobid", "guest", 9999, "abc", "123")
    sampler = StratifiedSampler(fractions=fractions, method="downsample")
    sampler.set_tracker(tracker)
    sample_data, sample_ids = sampler.sample(self.table)
    label_counts = [0, 0, 0, 0]
    rows = list(sample_data.collect())
    source = dict(self.data)
    # Every sampled id must come from the source table.
    self.assertTrue(set(sample_ids) & set(source.keys()) == set(sample_ids))
    for key, inst in rows:
        label_counts[inst.label] += 1
        self.assertTrue(type(key).__name__ == 'int' and 0 <= key < 1000)
        self.assertTrue(inst.label == self.data[key][1].label and inst.features == self.data[key][1].features)
    for label, fraction in fractions:
        self.assertTrue(np.abs(label_counts[label] - 250 * fraction) < 10)
    trans_sampler = StratifiedSampler(method="downsample")
    trans_sampler.set_tracker(tracker)
    trans_rows = list(trans_sampler.sample(self.table_trans, sample_ids).collect())
    trans_keys = [key for (key, _) in trans_rows]
    trans_source = dict(self.data_to_trans)
    self.assertTrue(set(trans_keys) == set(sample_ids))
    for key, inst in trans_rows:
        self.assertTrue(inst.features == trans_source.get(key).features)
def component_output_model():
    """HTTP handler: return a component's output model as JSON, plus this
    component's entries from the pipeline model meta.

    Reads job_id / component_name / role / party_id from the request body.
    Returns 'no data' when the component declares no model output or no
    *Param protobuf was found.
    """
    request_data = request.json
    check_request_parameters(request_data)
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=request_data['job_id'], role=request_data['role'], party_id=request_data['party_id'])
    model_id = job_runtime_conf['job_parameters']['model_id']
    model_version = job_runtime_conf['job_parameters']['model_version']
    tracker = Tracking(job_id=request_data['job_id'], component_name=request_data['component_name'],
                       role=request_data['role'], party_id=request_data['party_id'],
                       model_id=model_id, model_version=model_version)
    dag = job_utils.get_job_dsl_parser(dsl=job_dsl, runtime_conf=job_runtime_conf,
                                       train_runtime_conf=train_runtime_conf)
    component = dag.get_component_info(request_data['component_name'])
    output_model_json = {}
    if component.get_output().get('model', []):
        # There is only one model output at the current dsl version.
        output_model = tracker.get_output_model(
            component.get_output()['model'][0])
        for buffer_name, buffer_object in output_model.items():
            # Only the *Param protobuf carries the JSON-serializable model content.
            if buffer_name.endswith('Param'):
                output_model_json = json_format.MessageToDict(
                    buffer_object, including_default_value_fields=True)
    if output_model_json:
        pipeline_output_model = tracker.get_output_model_meta()
        this_component_model_meta = {}
        for k, v in pipeline_output_model.items():
            if k.endswith('_module_name'):
                # Keep only the module-name entry belonging to this component.
                if k == '{}_module_name'.format(
                        request_data['component_name']):
                    this_component_model_meta['module_name'] = v
            else:
                # Dotted keys look like '<component_name>.<suffix>'; keep this
                # component's entries under their original keys.
                k_i = k.split('.')
                if '.'.join(k_i[:-1]) == request_data['component_name']:
                    this_component_model_meta[k] = v
        return get_json_result(retcode=0, retmsg='success', data=output_model_json,
                               meta=this_component_model_meta)
    else:
        return get_json_result(retcode=0, retmsg='no data', data={})
def setUp(self):
    """Create one clean dense table and one table containing missing/NA markers."""
    dense_rows = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
    self.table1 = eggroll.parallelize(dense_rows, include_key=True)
    missing_rows = [("a", '-1,,na,null,null,2')]
    self.table2 = eggroll.parallelize(missing_rows, include_key=True)
    self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
    self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def save_pipeline(job_id, role, party_id, model_id, model_version):
    """Build and persist the Pipeline model (inference DSL + train conf) for a job.

    Predict jobs already carry a pipeline, so nothing is saved for them.
    """
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=role, party_id=party_id)
    if job_runtime_conf.get('job_parameters', {}).get('job_type', '') == 'predict':
        return
    dag = job_utils.get_job_dsl_parser(dsl=job_dsl,
                                       runtime_conf=job_runtime_conf,
                                       train_runtime_conf=train_runtime_conf)
    pipeline = pipeline_pb2.Pipeline()
    # Serialize the DSL/conf as bytes into the protobuf fields.
    pipeline.inference_dsl = json_dumps(dag.get_predict_dsl(role=role), byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           model_id=model_id, model_version=model_version)
    job_tracker.save_output_model({'Pipeline': pipeline}, 'pipeline')
def component_metric_all():
    """HTTP handler: return every metric (data + meta) recorded by a component,
    grouped as {namespace: {name: {'data': ..., 'meta': ...}}}."""
    request_data = request.json
    check_request_parameters(request_data)
    tracker = Tracking(job_id=request_data['job_id'],
                       component_name=request_data['component_name'],
                       role=request_data['role'],
                       party_id=request_data['party_id'])
    metrics = tracker.get_metric_list()
    if not metrics:
        return get_json_result(retcode=0, retmsg='no data', data={})
    all_metric_data = {}
    for metric_namespace, metric_names in metrics.items():
        namespace_data = all_metric_data.setdefault(metric_namespace, {})
        for metric_name in metric_names:
            entry = namespace_data.setdefault(metric_name, {})
            metric_data, metric_meta = get_metric_all_data(tracker=tracker,
                                                           metric_namespace=metric_namespace,
                                                           metric_name=metric_name)
            entry['data'] = metric_data
            entry['meta'] = metric_meta
    return get_json_result(retcode=0, retmsg='success', data=all_metric_data)
def job_view():
    """HTTP handler: return the job view plus a 'model_summary' built from
    job-level metrics as {namespace: {metric_name: {key: value}}}.

    Returns retcode 101 when no job view data exists.
    """
    request_data = request.json
    check_request_parameters(request_data)
    job_tracker = Tracking(job_id=request_data['job_id'], role=request_data['role'],
                           party_id=request_data['party_id'])
    job_view_data = job_tracker.get_job_view()
    if job_view_data:
        job_metric_list = job_tracker.get_metric_list(job_level=True)
        job_view_data['model_summary'] = {}
        for metric_namespace, namespace_metrics in job_metric_list.items():
            job_view_data['model_summary'][metric_namespace] = job_view_data['model_summary'].get(metric_namespace, {})
            for metric_name in namespace_metrics:
                job_view_data['model_summary'][metric_namespace][metric_name] = job_view_data['model_summary'][
                    metric_namespace].get(metric_name, {})
                # Flatten each metric's data points into key/value pairs.
                for metric_data in job_tracker.get_job_metric_data(metric_namespace=metric_namespace,
                                                                   metric_name=metric_name):
                    job_view_data['model_summary'][metric_namespace][metric_name][metric_data.key] = metric_data.value
        return get_json_result(retcode=0, retmsg='success', data=job_view_data)
    else:
        return get_json_result(retcode=101, retmsg='error')
def get_task_run_args(job_id, role, party_id, job_parameters, job_args, input_dsl):
    """Resolve a task's declared inputs into runtime objects.

    For 'data' inputs, data keys look like '<component>.<data_name>'; the
    special component 'args' means the table address comes from job args,
    otherwise the table is the upstream component's tracked output. For
    'model'/'isometric_model' inputs, the upstream component's saved models
    are loaded via Tracking.

    :return: dict keyed by input type, mirroring the input dsl structure
    :raises Exception: for a model key that is neither 2 nor 3 dotted parts
    """
    task_run_args = {}
    for input_type, input_detail in input_dsl.items():
        if input_type == 'data':
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for data_type, data_list in input_detail.items():
                for data_key in data_list:
                    # data_key is '<component_name>.<data_name>'
                    data_key_item = data_key.split('.')
                    search_component_name, search_data_name = data_key_item[0], data_key_item[1]
                    if search_component_name == 'args':
                        # Table address supplied directly by job args; both
                        # namespace and name must be non-empty to resolve it.
                        if job_args.get('data', {}).get(search_data_name).get('namespace', '') and job_args.get(
                                'data', {}).get(search_data_name).get('name', ''):
                            data_table = session.table(
                                namespace=job_args['data'][search_data_name]['namespace'],
                                name=job_args['data'][search_data_name]['name'])
                        else:
                            data_table = None
                    else:
                        # Table produced by an upstream component in this job.
                        data_table = Tracking(job_id=job_id, role=role, party_id=party_id,
                                              component_name=search_component_name).get_output_data_table(
                            data_name=search_data_name)
                    args_from_component = this_type_args[search_component_name] = this_type_args.get(
                        search_component_name, {})
                    args_from_component[data_type] = data_table
        elif input_type in ['model', 'isometric_model']:
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for dsl_model_key in input_detail:
                dsl_model_key_items = dsl_model_key.split('.')
                if len(dsl_model_key_items) == 2:
                    search_component_name, search_model_name = dsl_model_key_items[0], dsl_model_key_items[1]
                elif len(dsl_model_key_items) == 3 and dsl_model_key_items[0] == 'pipeline':
                    # 'pipeline.<component>.<model_name>' addresses a pipeline model.
                    search_component_name, search_model_name = dsl_model_key_items[1], dsl_model_key_items[2]
                else:
                    raise Exception('get input {} failed'.format(input_type))
                models = Tracking(job_id=job_id, role=role, party_id=party_id,
                                  component_name=search_component_name,
                                  model_id=job_parameters['model_id'],
                                  model_version=job_parameters['model_version']).get_output_model(
                    model_name=search_model_name)
                this_type_args[search_component_name] = models
    return task_run_args
def component_metric_data():
    """HTTP handler: return one named metric's data points and meta for a component."""
    request_data = request.json
    check_request_parameters(request_data)
    tracker = Tracking(job_id=request_data['job_id'],
                       component_name=request_data['component_name'],
                       role=request_data['role'],
                       party_id=request_data['party_id'])
    metric_data, metric_meta = get_metric_all_data(tracker=tracker,
                                                   metric_namespace=request_data['metric_namespace'],
                                                   metric_name=request_data['metric_name'])
    if not metric_data and not metric_meta:
        return get_json_result(retcode=0, retmsg='no data', data=[], meta={})
    return get_json_result(retcode=0, retmsg='success', data=metric_data, meta=metric_meta)
def clean_job(job_id, role, party_id):
    """Best-effort cleanup of every task of a job for one role/party.

    A failure while cleaning one task is logged and does not stop the rest.
    """
    schedule_logger.info('job {} on {} {} start to clean'.format(job_id, role, party_id))
    for task in job_utils.query_task(job_id=job_id, role=role, party_id=party_id):
        try:
            Tracking(job_id=job_id, role=role, party_id=party_id,
                     task_id=task.f_task_id).clean_task()
            schedule_logger.info(
                'job {} component {} on {} {} clean done'.format(job_id, task.f_component_name, role, party_id))
        except Exception as e:
            schedule_logger.info(
                'job {} component {} on {} {} clean failed'.format(job_id, task.f_component_name, role, party_id))
            schedule_logger.exception(e)
    schedule_logger.info('job {} on {} {} clean done'.format(job_id, role, party_id))
def update_job_status(job_id, role, party_id, job_info, create=False):
    """Persist job status/info for one role/party via the job Tracking.

    On create it additionally: runs the optional authentication check, saves
    the job conf files, and logs a job view containing partner parties and the
    datasets visible to this party. job_info keys are the Job model's f_* fields.
    """
    job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id)
    job_info['f_run_ip'] = RuntimeConfig.JOB_SERVER_HOST
    if create:
        dsl = json_loads(job_info['f_dsl'])
        runtime_conf = json_loads(job_info['f_runtime_conf'])
        train_runtime_conf = json_loads(job_info['f_train_runtime_conf'])
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get('src_party_id', None),
                                 dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)
        save_job_conf(job_id=job_id,
                      job_dsl=dsl,
                      job_runtime_conf=runtime_conf,
                      train_runtime_conf=train_runtime_conf,
                      pipeline_dsl=None)
        roles = json_loads(job_info['f_roles'])
        partner = {}
        show_role = {}
        is_initiator = job_info.get('f_is_initiator', 0)
        for _role, _role_party in roles.items():
            # The initiator sees every role/party; other parties only see themselves.
            if is_initiator or _role == role:
                show_role[_role] = show_role.get(_role, [])
                for _party_id in _role_party:
                    if is_initiator or _party_id == party_id:
                        show_role[_role].append(_party_id)
            # Partners: every party of other roles, plus other parties of this role.
            if _role != role:
                partner[_role] = partner.get(_role, [])
                partner[_role].extend(_role_party)
            else:
                for _party_id in _role_party:
                    if _party_id != party_id:
                        partner[_role] = partner.get(_role, [])
                        partner[_role].append(_party_id)
        dag = get_job_dsl_parser(dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        dataset = {}
        for _role, _role_party_args in job_args.items():
            if is_initiator or _role == role:
                for _party_index in range(len(_role_party_args)):
                    # args are positional per party; map index back to party id.
                    _party_id = roles[_role][_party_index]
                    if is_initiator or _party_id == party_id:
                        dataset[_role] = dataset.get(_role, {})
                        dataset[_role][_party_id] = dataset[_role].get(_party_id, {})
                        # Record each input table as '<namespace>.<name>'.
                        for _data_type, _data_location in _role_party_args[_party_index]['args']['data'].items():
                            dataset[_role][_party_id][_data_type] = '{}.{}'.format(_data_location['namespace'],
                                                                                   _data_location['name'])
        job_tracker.log_job_view({'partner': partner, 'dataset': dataset, 'roles': show_role})
    job_tracker.save_job_info(role=role, party_id=party_id, job_info=job_info, create=create)
def setUp(self):
    """Persist two dense tables (with schema meta) under random names and
    reload them for the tests."""
    name1 = "dense_data_" + str(random.random())
    name2 = "dense_data_" + str(random.random())
    namespace = "data_io_dense_test"
    schema = {"header": "x1,x2,x3,x4,x5,x6", "sid": "id"}
    rows1 = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
    session.parallelize(rows1, include_key=True).save_as(name1, namespace)
    session.save_data_table_meta(schema, name1, namespace)
    self.table1 = session.table(name1, namespace)
    rows2 = [("a", '-1,,na,null,null,2')]
    session.parallelize(rows2, include_key=True).save_as(name2, namespace)
    session.save_data_table_meta(schema, name2, namespace)
    self.table2 = session.table(name2, namespace)
    self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
    self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
    self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
def run_task():
    """Entry point of a task child process.

    Parses CLI args, initializes storage/federation/logging, instantiates the
    component class named by 'CodePath', runs it, persists declared outputs,
    and syncs the final task status back to the initiator.
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="Specify a config json file path")
        parser.add_argument('-n', '--component_name', required=True, type=str,
                            help="Specify a config json file path")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="Specify a config json file path")
        parser.add_argument('-r', '--role', required=True, type=str, help="Specify a config json file path")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="Specify a config json file path")
        parser.add_argument('-c', '--config', required=True, type=str, help="Specify a config json file path")
        args = parser.parse_args()
        schedule_logger.info('enter task process')
        schedule_logger.info(args)
        # init function args
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config.get('job_parameters', None)
        job_initiator = task_config.get('job_initiator', None)
        job_args = task_config.get('job_args', {})
        task_input_dsl = task_config.get('input', {})
        task_output_dsl = task_config.get('output', {})
        parameters = task_config.get('parameters', {})
        module_name = task_config.get('module_name', '')
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
        storage.init_storage(job_id=task_id, work_mode=RuntimeConfig.WORK_MODE)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(
            job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           component_name=component_name, task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'], module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        run_class_paths = parameters.get('CodePath').split('/')
        # BUG FIX: rstrip('.py') strips any trailing run of '.', 'p' or 'y'
        # characters rather than the '.py' suffix (e.g. 'happy.py' -> 'ha'),
        # so strip the extension explicitly instead.
        module_file_name = run_class_paths[-2]
        if module_file_name.endswith('.py'):
            module_file_name = module_file_name[:-3]
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + module_file_name
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(
            job_id=job_id, role=role, party_id=party_id, job_parameters=job_parameters,
            job_args=job_args, input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                      role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      task_info=task.to_json())
        schedule_logger.info('run {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id))
        schedule_logger.info(parameters)
        schedule_logger.info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        if task_output_dsl:
            if task_output_dsl.get('data', []):
                output_data = run_object.save_data()
                tracker.save_output_data_table(
                    output_data, task_output_dsl.get('data')[0])
            if task_output_dsl.get('model', []):
                output_model = run_object.export_model()
                # There is only one model output at the current dsl version.
                tracker.save_output_model(output_model, task_output_dsl['model'][0])
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(
                job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                task_info=task.to_json())
        except Exception as e:
            schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role,
                                                     party_id, task.f_status))
def run_task():
    """Entry point of a task child process (session-based backend variant).

    Parses CLI args, initializes the compute session/federation/logging,
    instantiates the component class named by 'CodePath', runs it, always
    persists data/model outputs (with defaults when none are declared), and
    syncs the final task status back to the initiator role/party.
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            # job_server is 'host:port'; only the port is needed here.
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        # Component parameters are fetched from the server rather than the conf file.
        parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        # parameters = task_config['parameters']
        module_name = task_config['module_name']
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                  BACKEND=job_parameters.get('backend', 0))
        session.init(job_id='{}_{}_{}'.format(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE, backend=RuntimeConfig.BACKEND)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role,
                                   str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           component_name=component_name, task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'], module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        # CodePath is '<pkg>/.../<module>.py/<ClassName>'.
        # NOTE(review): replace('.py', '') removes *every* '.py' occurrence in the
        # module file name, not just the suffix — confirm no module name contains
        # '.py' internally.
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       job_parameters=job_parameters, job_args=job_args,
                                                       input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                      role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())
        schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id,
                                                                role, party_id))
        schedule_logger().info(parameters)
        schedule_logger().info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        # Outputs are always saved; fall back to default names when the dsl
        # declares none.
        output_data = run_object.save_data()
        tracker.save_output_data_table(output_data,
                                       task_output_dsl.get('data')[0] if task_output_dsl.get(
                                           'data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        sync_success = False
        try:
            session.stop()
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name,
                                          task_id=task_id, role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
        # Report FAILED if the final status sync itself failed.
        schedule_logger().info(
            'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                   task.f_status if sync_success else TaskStatus.FAILED))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                     task.f_status if sync_success else TaskStatus.FAILED))
def submit_job(job_data):
    """Create and enqueue a new job from a submitted dsl + runtime conf.

    For predict jobs, the inference dsl and train conf are recovered from the
    trained pipeline model instead of the submitted dsl.

    :param job_data: dict with optional 'job_dsl' and 'job_runtime_conf'
    :return: (job_id, dsl path, runtime conf path, logs directory, model info dict, board url)
    :raises Exception: on bad initiator party id or queue push failure
    """
    job_id = generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        job_tracker = Tracking(job_id=job_id, role=job_initiator['role'],
                               party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = save_job_conf(job_id=job_id,
                              job_dsl=job_dsl,
                              job_runtime_conf=job_runtime_conf,
                              train_runtime_conf=train_runtime_conf,
                              pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    # The initiator must be one of its own role's declared parties.
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))
    # Validates the dsl/conf combination before distribution.
    get_job_dsl_parser(dsl=job_dsl,
                       runtime_conf=job_runtime_conf,
                       train_runtime_conf=train_runtime_conf)
    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)
    # push into queue
    job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
    try:
        RuntimeConfig.JOB_QUEUE.put_event(job_event)
    except Exception as e:
        raise Exception('push job into queue failed')
    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']}, board_url
def update_task_status(job_id, component_name, task_id, role, party_id, task_info):
    """Persist a task's info for one role/party and log its current status."""
    task_tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                            component_name=component_name, task_id=task_id)
    task_tracker.save_task(role=role, party_id=party_id, task_info=task_info)
    current_status = task_info.get('f_status', '')
    schedule_logger.info(
        'job {} component {} {} {} status {}'.format(job_id, component_name, role, party_id,
                                                     current_status))