Esempio n. 1
0
 def save_output_data_table(self,
                            data_table: Table,
                            data_name: str = 'component'):
     """
     Persist a component's output table and record where it was stored.

     Per the original author's note, this runs in the task executor process.

     When ``data_table`` is truthy it is copied with ``save_as`` into the
     ``output_data_<task_id>`` namespace under its current name, its metas
     plus its schema are saved for the copy, and a ``data_name`` -> location
     entry is written to the tracking output table. A data view is saved in
     every case; without a table it carries empty/zero placeholder values.

     :param data_table: output table to persist; a falsy value means no output
     :param data_name: key under which the persisted location is indexed
     :return: None
     """
     has_output = bool(data_table)
     output_index = {}
     stored = None

     if has_output:
         target_namespace = 'output_data_{}'.format(self.task_id)
         target_name = data_table.get_name()
         log_line = 'persisting the component output temporary table: {} {} to {} {}'.format(
             data_table.get_namespace(), data_table.get_name(),
             target_namespace, target_name)
         schedule_logger(self.job_id).info(log_line)

         stored = data_table.save_as(namespace=target_namespace,
                                     name=target_name)

         # Copy the source metas and attach the schema alongside them.
         metas = dict(data_table.get_metas())
         metas["schema"] = data_table.schema
         session.save_data_table_meta(
             metas,
             data_table_namespace=stored.get_namespace(),
             data_table_name=stored.get_name())

         output_index[data_name] = {
             'name': stored.get_name(),
             'namespace': stored.get_namespace()
         }

     # The index is written even when empty.
     session.save_data(output_index.items(),
                       name=Tracking.output_table_name('data'),
                       namespace=self.table_namespace,
                       partition=48)

     if has_output:
         view = {
             'f_table_name': stored._name,
             'f_table_namespace': stored._namespace,
             'f_partition': stored._partitions,
             'f_table_count_actual': data_table.count()
         }
     else:
         view = {
             'f_table_name': '',
             'f_table_namespace': '',
             'f_partition': None,
             'f_table_count_actual': 0
         }
     self.save_data_view(self.role, self.party_id, data_info=view, mark=True)
Esempio n. 2
0
 def save_data_table(self, dst_table_name, dst_table_namespace, head=True, job_id=None):
     """
     Load the delimited text file at ``self.parameters["file"]`` into a table.

     The file is read in chunks of roughly ``self.MAX_BYTES``. Tabs are
     treated as commas; the first field of each line becomes the key and the
     remaining fields are joined by ``self.list_to_str`` into the value.
     After every chunk the job progress is updated, and once the file is
     exhausted a data view and an upload metric are recorded.

     :param dst_table_name: name of the destination table
     :param dst_table_namespace: namespace of the destination table
     :param head: when ``True`` the first line is stored as the header via
         ``self.save_data_header`` and is not uploaded as data
     :param job_id: accepted for interface compatibility; not used here
     :return: row count reported by the destination table, or 0 when the
         file contained no data rows
     """
     input_file = self.parameters["file"]
     count = self.get_count(input_file)
     # Stays None when the file has no data rows; the original read an
     # unbound local in that case and crashed with UnboundLocalError.
     data_table = None
     with open(input_file, 'r') as fin:
         lines_count = 0
         if head is True:
             data_head = fin.readline()
             count -= 1
             self.save_data_header(data_head, dst_table_name, dst_table_namespace)
         while True:
             lines = fin.readlines(self.MAX_BYTES)
             if not lines:
                 break
             data = []
             for line in lines:
                 values = line.replace("\n", "").replace("\t", ",").split(",")
                 data.append((values[0], self.list_to_str(values[1:])))
             lines_count += len(data)
             # Whole-number percentage of the data lines handled so far.
             f_progress = lines_count/count*100//1
             job_info = {'f_progress': f_progress}
             self.update_job_status(self.parameters["local"]['role'], self.parameters["local"]['party_id'],
                                    job_info)
             data_table = session.save_data(data, name=dst_table_name, namespace=dst_table_namespace,
                                            partition=self.parameters["partition"])

     # Query the table once and reuse the figure for the view, the metric
     # and the return value.
     table_count = data_table.count() if data_table is not None else 0
     self.tracker.save_data_view(role=self.parameters["local"]['role'],
                                 party_id=self.parameters["local"]['party_id'],
                                 data_info={'f_table_name': dst_table_name,
                                            'f_table_namespace': dst_table_namespace,
                                            'f_partition': self.parameters["partition"],
                                            'f_table_create_count': table_count
                                            })
     self.callback_metric(metric_name='data_access',
                          metric_namespace='upload',
                          metric_data=[Metric("count", table_count)])
     return table_count
Esempio n. 3
0
    def save_data_table(self,
                        dst_table_name,
                        dst_table_namespace,
                        head=True,
                        in_version=False):
        """
        Load the delimited text file at ``self.parameters["file"]`` into a table.

        The file is read in chunks of roughly ``self.MAX_BYTES``. Tabs are
        treated as commas; the first field of each line becomes the key and
        the remaining fields are joined by ``self.list_to_str`` into the
        value. After every chunk the job progress is updated. Once the file
        is exhausted the row count is stored in ``self.table_info["v_len"]``,
        a data view and an upload metric are recorded, and optionally a
        version entry is saved.

        :param dst_table_name: name of the destination table
        :param dst_table_namespace: namespace of the destination table
        :param head: when ``True`` the first line is stored as the header via
            ``self.save_data_header`` (and in ``self.table_info["cols"]``)
            and is not uploaded as data
        :param in_version: when truthy, ``version_control.save_version`` is
            called for the destination table after the upload
        :return: row count reported by the destination table, or 0 when the
            file contained no data rows
        """
        input_file = self.parameters["file"]
        count = self.get_count(input_file)
        # Stays None when the file has no data rows, so the summary below can
        # fall back to 0 instead of reading an unbound local.
        data_table = None
        with open(input_file, 'r') as fin:
            lines_count = 0
            if head is True:
                data_head = fin.readline()
                count -= 1
                self.save_data_header(data_head, dst_table_name,
                                      dst_table_namespace)
                self.table_info["cols"] = data_head
            while True:
                lines = fin.readlines(self.MAX_BYTES)
                if not lines:
                    break
                data = []
                for line in lines:
                    values = line.replace("\n", "").replace("\t",
                                                            ",").split(",")
                    data.append((values[0], self.list_to_str(values[1:])))
                lines_count += len(data)
                # Whole-number percentage of the data lines handled so far.
                f_progress = lines_count / count * 100 // 1
                job_info = {'f_progress': f_progress}
                self.update_job_status(
                    self.parameters["local"]['role'],
                    self.parameters["local"]['party_id'], job_info)
                data_table = session.save_data(
                    data,
                    name=dst_table_name,
                    namespace=dst_table_namespace,
                    partition=self.parameters["partition"])

        # Query the table once and reuse the figure everywhere below. The
        # original assigned an undefined name (`data_table_count`) to
        # table_info["v_len"], which raised NameError on the first chunk.
        table_count = data_table.count() if data_table is not None else 0
        self.table_info["v_len"] = table_count
        self.tracker.save_data_view(
            role=self.parameters["local"]['role'],
            party_id=self.parameters["local"]['party_id'],
            data_info={
                'f_table_name': dst_table_name,
                'f_table_namespace': dst_table_namespace,
                'f_partition': self.parameters["partition"],
                'f_table_count_actual': table_count,
                'f_table_count_upload': count
            })
        self.callback_metric(
            metric_name='data_access',
            metric_namespace='upload',
            metric_data=[Metric("count", table_count)])
        if in_version:
            version_log = "[AUTO] save data at %s." % datetime.datetime.now(
            )
            version_control.save_version(
                name=dst_table_name,
                namespace=dst_table_namespace,
                version_log=version_log)
        return table_count
Esempio n. 4
0
 def save_output_data_table(self, data_table, data_name: str = 'component'):
     """
     Persist a component's output table and record where it was stored.

     A truthy ``data_table`` is copied with ``save_as`` into its own
     namespace under ``<name>_persistent``; its schema and header are saved
     as meta for the copy, and a ``data_name`` -> location entry is written
     to the tracking output table. A data view is saved in every case;
     without a table it carries empty/zero placeholder values.

     :param data_table: output table to persist; a falsy value means no output
     :param data_name: key under which the persisted location is indexed
     :return: None
     """
     has_output = bool(data_table)
     output_index = {}
     stored = None

     if has_output:
         stored = data_table.save_as(
             namespace=data_table._namespace,
             name='{}_persistent'.format(data_table._name))
         meta = {
             'schema': data_table.schema,
             'header': data_table.schema.get('header', [])
         }
         session.save_data_table_meta(
             meta,
             data_table_namespace=stored._namespace,
             data_table_name=stored._name)
         output_index[data_name] = {
             'name': stored._name,
             'namespace': stored._namespace
         }

     # The index is written even when empty.
     session.save_data(output_index.items(),
                       name=Tracking.output_table_name('data'),
                       namespace=self.table_namespace,
                       partition=48)

     if has_output:
         view = {
             'f_table_name': stored._name,
             'f_table_namespace': stored._namespace,
             'f_partition': stored._partitions,
             'f_table_create_count': data_table.count()
         }
     else:
         view = {
             'f_table_name': '',
             'f_table_namespace': '',
             'f_partition': None,
             'f_table_create_count': 0
         }
     self.save_data_view(self.role, self.party_id, data_info=view, mark=True)
Esempio n. 5
0
 def save_output_data_table(self, data_table, data_name: str = 'component'):
     """
     Persist a component's output table and index where it was stored.

     A truthy ``data_table`` is copied with ``save_as`` into its own
     namespace under ``<name>_persistent``; its schema and header are saved
     as meta for the copy. The ``data_name`` -> location index (empty when
     there is no table) is then written to the tracking output table.

     :param data_table: output table to persist; a falsy value means no output
     :param data_name: key under which the persisted location is indexed
     :return: None
     """
     output_index = {}

     if data_table:
         stored = data_table.save_as(
             namespace=data_table._namespace,
             name='{}_persistent'.format(data_table._name))
         meta = {
             'schema': data_table.schema,
             'header': data_table.schema.get('header', [])
         }
         session.save_data_table_meta(
             meta,
             data_table_namespace=stored._namespace,
             data_table_name=stored._name)
         output_index[data_name] = {
             'name': stored._name,
             'namespace': stored._namespace
         }

     # The index is written even when empty.
     session.save_data(output_index.items(),
                       name=Tracking.output_table_name('data'),
                       namespace=self.table_namespace,
                       partition=48)
Esempio n. 6
0
 def save_data_table(self, dst_table_name, dst_table_namespace, head=True):
     """
     Load the delimited text file at ``self.parameters["file"]`` into a table.

     The file is read in chunks of roughly ``self.MAX_BYTES``. Tabs are
     treated as commas; the first field of each line becomes the key and the
     remaining fields are joined by ``self.list_to_str`` into the value. Each
     chunk is handed to ``session.save_data`` for the destination table.

     :param dst_table_name: name of the destination table
     :param dst_table_namespace: namespace of the destination table
     :param head: when ``True`` the first line is stored as the header via
         ``self.save_data_header`` and is not uploaded as data
     :return: row count reported by the destination table, or 0 when the
         file contained no data rows
     """
     input_file = self.parameters["file"]
     # Stays None when the file has no data rows; the original read an
     # unbound local in that case and crashed with UnboundLocalError.
     data_table = None
     with open(input_file, 'r') as fin:
         if head is True:
             data_head = fin.readline()
             self.save_data_header(data_head, dst_table_name, dst_table_namespace)
         while True:
             lines = fin.readlines(self.MAX_BYTES)
             if not lines:
                 break
             data = []
             for line in lines:
                 values = line.replace("\n", "").replace("\t", ",").split(",")
                 data.append((values[0], self.list_to_str(values[1:])))
             data_table = session.save_data(data, name=dst_table_name, namespace=dst_table_namespace,
                                            partition=self.parameters["partition"])
     return data_table.count() if data_table is not None else 0
Esempio n. 7
0
                if work_mode is None:
                    work_mode = 0

            if not os.path.exists(input_file_path):
                print("%s is not exist, please check the configure" % (input_file_path))
                sys.exit()

            _namespace, _table_name = generate_table_name(input_file_path)
            if namespace is None:
                namespace = _namespace
            if table_name is None:
                table_name = _table_name
            # todo: use eggroll as default storage backed
            session.init(job_id=args.job_id, mode=work_mode, backend=Backend.EGGROLL)
            input_data = read_data(input_file_path, table_name, namespace, head)
            in_version = job_config.get('in_version', False)
            data_table = session.save_data(input_data, name=table_name, namespace=namespace, partition=partition, in_version=in_version)
            print("------------load data finish!-----------------")
            print("file: {}".format(input_file_path))
            print("total data_count: {}".format(data_table.count()))
            print("table name: {}, table namespace: {}".format(table_name, namespace))

        except ValueError:
            print('json parse error')
            exit(-102)
        except IOError:
            print('read file error')
            exit(-103)
    except:
        traceback.print_exc()