def save_output_data_table(self, data_table: Table, data_name: str = 'component'):
    """
    Save component output data, will run in the task executor process
    :param data_table: output data table produced by the component
    :param data_name: key under which the output table is tracked
    :return:
    """
    if data_table:
        persistent_table_namespace, persistent_table_name = 'output_data_{}'.format(
            self.task_id), data_table.get_name()
        schedule_logger(self.job_id).info(
            'persisting the component output temporary table: {} {} to {} {}'.format(
                data_table.get_namespace(), data_table.get_name(),
                persistent_table_namespace, persistent_table_name))
        persistent_table = data_table.save_as(
            namespace=persistent_table_namespace, name=persistent_table_name)
        persistent_table_metas = {}
        persistent_table_metas.update(data_table.get_metas())
        persistent_table_metas["schema"] = data_table.schema
        session.save_data_table_meta(
            persistent_table_metas,
            data_table_namespace=persistent_table.get_namespace(),
            data_table_name=persistent_table.get_name())
        data_table_info = {
            data_name: {
                'name': persistent_table.get_name(),
                'namespace': persistent_table.get_namespace()
            }
        }
    else:
        data_table_info = {}
    session.save_data(
        data_table_info.items(),
        name=Tracking.output_table_name('data'),
        namespace=self.table_namespace,
        partition=48)
    self.save_data_view(
        self.role, self.party_id,
        data_info={
            'f_table_name': persistent_table._name if data_table else '',
            'f_table_namespace': persistent_table._namespace if data_table else '',
            'f_partition': persistent_table._partitions if data_table else None,
            'f_table_count_actual': data_table.count() if data_table else 0
        },
        mark=True)
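# Illustration (placeholder values, not from the source) of the record written to the
# tracking output table when data_table exists: the persistent namespace is
# 'output_data_' + task_id and the table name is kept unchanged.
data_table_info = {
    'component': {
        'name': 'breast_guest',                                # placeholder original table name
        'namespace': 'output_data_20200101000000_hetero_lr_0'  # placeholder 'output_data_{task_id}'
    }
}
# data_table_info.items() -> [('component', {'name': ..., 'namespace': ...})],
# which session.save_data stores as key/value pairs in the tracking table.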
def save_data_table(self, dst_table_name, dst_table_namespace, head=True, job_id=None):
    input_file = self.parameters["file"]
    count = self.get_count(input_file)
    with open(input_file, 'r') as fin:
        lines_count = 0
        if head is True:
            data_head = fin.readline()
            count -= 1
            self.save_data_header(data_head, dst_table_name, dst_table_namespace)
        while True:
            data = list()
            lines = fin.readlines(self.MAX_BYTES)
            if lines:
                for line in lines:
                    values = line.replace("\n", "").replace("\t", ",").split(",")
                    data.append((values[0], self.list_to_str(values[1:])))
                lines_count += len(data)
                f_progress = lines_count / count * 100 // 1
                job_info = {'f_progress': f_progress}
                self.update_job_status(self.parameters["local"]['role'],
                                       self.parameters["local"]['party_id'],
                                       job_info)
                data_table = session.save_data(data,
                                               name=dst_table_name,
                                               namespace=dst_table_namespace,
                                               partition=self.parameters["partition"])
            else:
                self.tracker.save_data_view(role=self.parameters["local"]['role'],
                                            party_id=self.parameters["local"]['party_id'],
                                            data_info={'f_table_name': dst_table_name,
                                                       'f_table_namespace': dst_table_namespace,
                                                       'f_partition': self.parameters["partition"],
                                                       'f_table_create_count': data_table.count()})
                self.callback_metric(metric_name='data_access',
                                     metric_namespace='upload',
                                     metric_data=[Metric("count", data_table.count())])
                return data_table.count()
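# For reference, the keys this method reads from self.parameters imply a configuration of
# roughly the following shape; all values below are placeholders, not taken from the source.
parameters = {
    "file": "/data/upload/breast_a.csv",   # path of the file to upload (placeholder)
    "partition": 10,                       # number of storage partitions (placeholder)
    "local": {
        "role": "guest",                   # placeholder role
        "party_id": 9999                   # placeholder party id
    }
}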
def save_data_table(self, dst_table_name, dst_table_namespace, head=True, in_version=False):
    input_file = self.parameters["file"]
    count = self.get_count(input_file)
    with open(input_file, 'r') as fin:
        lines_count = 0
        if head is True:
            data_head = fin.readline()
            count -= 1
            self.save_data_header(data_head, dst_table_name, dst_table_namespace)
            self.table_info["cols"] = data_head
        while True:
            data = list()
            lines = fin.readlines(self.MAX_BYTES)
            if lines:
                for line in lines:
                    values = line.replace("\n", "").replace("\t", ",").split(",")
                    data.append((values[0], self.list_to_str(values[1:])))
                lines_count += len(data)
                f_progress = lines_count / count * 100 // 1
                job_info = {'f_progress': f_progress}
                self.update_job_status(
                    self.parameters["local"]['role'],
                    self.parameters["local"]['party_id'],
                    job_info)
                data_table = session.save_data(
                    data,
                    name=dst_table_name,
                    namespace=dst_table_namespace,
                    partition=self.parameters["partition"])
                self.table_info["v_len"] = data_table.count()
            else:
                self.tracker.save_data_view(
                    role=self.parameters["local"]['role'],
                    party_id=self.parameters["local"]['party_id'],
                    data_info={
                        'f_table_name': dst_table_name,
                        'f_table_namespace': dst_table_namespace,
                        'f_partition': self.parameters["partition"],
                        'f_table_count_actual': data_table.count(),
                        'f_table_count_upload': count
                    })
                self.callback_metric(
                    metric_name='data_access',
                    metric_namespace='upload',
                    metric_data=[Metric("count", data_table.count())])
                if in_version:
                    version_log = "[AUTO] save data at %s." % datetime.datetime.now()
                    version_control.save_version(
                        name=dst_table_name,
                        namespace=dst_table_namespace,
                        version_log=version_log)
                return data_table.count()
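# Both save_data_table variants share the same chunked parsing loop: lines are read in
# roughly MAX_BYTES-sized batches, tabs are normalized to commas, the first field becomes
# the key and the remaining fields are joined into the value, and progress is reported as
# an integer percentage. A minimal, self-contained sketch of that loop; the constant and
# the value-join below are illustrative stand-ins, not the source's API.
MAX_BYTES = 1024 * 1024  # assumed batch size

def parse_upload_file(input_file_path, total_count, has_header=True):
    """Yield (progress_percent, [(key, value_str), ...]) batches from a delimited file."""
    parsed = 0
    with open(input_file_path, 'r') as fin:
        if has_header:
            fin.readline()       # header row is handled separately (save_data_header)
            total_count -= 1
        while True:
            lines = fin.readlines(MAX_BYTES)   # read at most ~MAX_BYTES worth of lines
            if not lines:
                break
            batch = []
            for line in lines:
                values = line.replace("\n", "").replace("\t", ",").split(",")
                # first column is the row key, remaining columns form the value string
                batch.append((values[0], ",".join(values[1:])))
            parsed += len(batch)
            progress = parsed / total_count * 100 // 1   # integer percentage, as in the source
            yield progress, batch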
def save_output_data_table(self, data_table, data_name: str = 'component'):
    if data_table:
        persistent_table = data_table.save_as(
            namespace=data_table._namespace,
            name='{}_persistent'.format(data_table._name))
        session.save_data_table_meta(
            {
                'schema': data_table.schema,
                'header': data_table.schema.get('header', [])
            },
            data_table_namespace=persistent_table._namespace,
            data_table_name=persistent_table._name)
        data_table_info = {
            data_name: {
                'name': persistent_table._name,
                'namespace': persistent_table._namespace
            }
        }
    else:
        data_table_info = {}
    session.save_data(
        data_table_info.items(),
        name=Tracking.output_table_name('data'),
        namespace=self.table_namespace,
        partition=48)
    self.save_data_view(
        self.role, self.party_id,
        data_info={
            'f_table_name': persistent_table._name if data_table else '',
            'f_table_namespace': persistent_table._namespace if data_table else '',
            'f_partition': persistent_table._partitions if data_table else None,
            'f_table_create_count': data_table.count() if data_table else 0
        },
        mark=True)
def save_output_data_table(self, data_table, data_name: str = 'component'):
    if data_table:
        persistent_table = data_table.save_as(
            namespace=data_table._namespace,
            name='{}_persistent'.format(data_table._name))
        session.save_data_table_meta(
            {
                'schema': data_table.schema,
                'header': data_table.schema.get('header', [])
            },
            data_table_namespace=persistent_table._namespace,
            data_table_name=persistent_table._name)
        data_table_info = {
            data_name: {
                'name': persistent_table._name,
                'namespace': persistent_table._namespace
            }
        }
    else:
        data_table_info = {}
    session.save_data(
        data_table_info.items(),
        name=Tracking.output_table_name('data'),
        namespace=self.table_namespace,
        partition=48)
def save_data_table(self, dst_table_name, dst_table_namespace, head=True):
    input_file = self.parameters["file"]
    with open(input_file, 'r') as fin:
        if head is True:
            data_head = fin.readline()
            self.save_data_header(data_head, dst_table_name, dst_table_namespace)
        while True:
            data = list()
            lines = fin.readlines(self.MAX_BYTES)
            if lines:
                for line in lines:
                    values = line.replace("\n", "").replace("\t", ",").split(",")
                    data.append((values[0], self.list_to_str(values[1:])))
                data_table = session.save_data(data,
                                               name=dst_table_name,
                                               namespace=dst_table_namespace,
                                               partition=self.parameters["partition"])
            else:
                return data_table.count()
try:
    if work_mode is None:
        work_mode = 0
    if not os.path.exists(input_file_path):
        print("%s does not exist, please check the configuration" % (input_file_path))
        sys.exit()
    _namespace, _table_name = generate_table_name(input_file_path)
    if namespace is None:
        namespace = _namespace
    if table_name is None:
        table_name = _table_name
    # todo: use eggroll as default storage backend
    session.init(job_id=args.job_id, mode=work_mode, backend=Backend.EGGROLL)
    input_data = read_data(input_file_path, table_name, namespace, head)
    in_version = job_config.get('in_version', False)
    data_table = session.save_data(input_data,
                                   name=table_name,
                                   namespace=namespace,
                                   partition=partition,
                                   in_version=in_version)
    print("------------load data finish!-----------------")
    print("file: {}".format(input_file_path))
    print("total data_count: {}".format(data_table.count()))
    print("table name: {}, table namespace: {}".format(table_name, namespace))
except ValueError:
    print('json parse error')
    exit(-102)
except IOError:
    print('read file error')
    exit(-103)
except:
    traceback.print_exc()
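# generate_table_name is not shown above. A minimal sketch of what such a helper might do,
# deriving a default table name from the file's base name and a namespace from the upload
# time; this is an assumption for illustration, the real helper in the source may differ.
import os
import time

def generate_table_name(input_file_path):
    # e.g. "/data/upload/breast_a.csv" -> table name "breast_a"
    base_name = os.path.splitext(os.path.basename(input_file_path))[0]
    # namespace derived from the upload timestamp, e.g. "upload_20200101000000"
    namespace = 'upload_{}'.format(time.strftime("%Y%m%d%H%M%S"))
    return namespace, base_name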