Example #1
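This snippet appears to come from FATE's Reader component: its run() method reads an input storage table, copies or converts it into the storage engine required by the computing engine, and records schema and row-count metadata for the output table.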
    def run(self, component_parameters=None, args=None):
        self.parameters = component_parameters["ReaderParam"]
        output_storage_address = args["job_parameters"].engines_address[
            EngineType.STORAGE]
        table_key = list(self.parameters.keys())[0]
        computing_engine = args["job_parameters"].computing_engine
        output_table_namespace, output_table_name = data_utils.default_output_table_info(
            task_id=self.tracker.task_id,
            task_version=self.tracker.task_version)
        input_table_meta, output_table_address, output_table_engine = self.convert_check(
            input_name=self.parameters[table_key]['name'],
            input_namespace=self.parameters[table_key]['namespace'],
            output_name=output_table_name,
            output_namespace=output_table_namespace,
            computing_engine=computing_engine,
            output_storage_address=output_storage_address)
        with storage.Session.build(
                session_id=job_utils.generate_session_id(
                    self.tracker.task_id,
                    self.tracker.task_version,
                    self.tracker.role,
                    self.tracker.party_id,
                    suffix="storage",
                    random_end=True),
                storage_engine=input_table_meta.get_engine()
        ) as input_table_session:
            input_table = input_table_session.get_table(
                name=input_table_meta.get_name(),
                namespace=input_table_meta.get_namespace())
            # count() writes the real row count back to the table meta info
            input_table.count()
            # the table must be replicated when the input and output storage engines differ
            if input_table_meta.get_engine() != output_table_engine:
                LOGGER.info(
                    f"the {input_table_meta.get_engine()} engine input table needs to be converted to the {output_table_engine} engine to support computing engine {computing_engine}"
                )
            else:
                LOGGER.info(
                    f"the {input_table_meta.get_engine()} engine input table only needs a format transformation"
                )
            with storage.Session.build(
                    session_id=job_utils.generate_session_id(
                        self.tracker.task_id,
                        self.tracker.task_version,
                        self.tracker.role,
                        self.tracker.party_id,
                        suffix="storage",
                        random_end=True),
                    storage_engine=output_table_engine
            ) as output_table_session:
                output_table = output_table_session.create_table(
                    address=output_table_address,
                    name=output_table_name,
                    namespace=output_table_namespace,
                    partitions=input_table_meta.partitions)
                self.copy_table(src_table=input_table, dest_table=output_table)
                # count() writes the real row count back to the output table meta info
                output_table.count()
                output_table_meta = StorageTableMeta(
                    name=output_table.get_name(),
                    namespace=output_table.get_namespace())
        self.tracker.log_output_data_info(
            data_name=component_parameters.get('output_data_name')[0]
            if component_parameters.get('output_data_name') else table_key,
            table_namespace=output_table_meta.get_namespace(),
            table_name=output_table_meta.get_name())
        headers_str = output_table_meta.get_schema().get('header')
        table_info = {}
        if output_table_meta.get_schema() and headers_str:
            # a comma-separated header string means the sample rows can be displayed
            if isinstance(headers_str, str):
                data_list = [headers_str.split(',')]
                is_display = True
            else:
                data_list = [headers_str]
                is_display = False
            if is_display:
                for data in output_table_meta.get_part_of_data():
                    data_list.append(data[1].split(','))
                # transpose so that each row holds one column: [name, value, value, ...]
                transposed = np.array(data_list).transpose()
                for column in transposed:
                    # keep at most 5 distinct sample values per column
                    table_info[column[0]] = ','.join(list(set(column[1:]))[:5])
        data_info = {
            "table_name": self.parameters[table_key]['name'],
            "namespace": self.parameters[table_key]['namespace'],
            "table_info": table_info,
            "partitions": output_table_meta.get_partitions(),
            "storage_engine": output_table_meta.get_engine()
        }
        if input_table_meta.get_engine() in [StorageEngine.PATH]:
            data_info["file_count"] = output_table_meta.get_count()
            data_info["file_path"] = input_table_meta.get_address().path
        else:
            data_info["count"] = output_table_meta.get_count()

        self.tracker.set_metric_meta(metric_namespace="reader_namespace",
                                     metric_name="reader_name",
                                     metric_meta=MetricMeta(
                                         name='reader',
                                         metric_type='data_info',
                                         extra_metas=data_info))
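The table_info construction above is the densest step: the header plus the sampled rows are transposed so that each transposed row pairs a column name with its sample values. A minimal standalone sketch of the same trick, using hypothetical data in place of get_schema()['header'] and get_part_of_data():

import numpy as np

# hypothetical stand-ins for the schema header and the part-of-data sample
header = "x0,x1,x2"
part_of_data = [(0, "1.0,2.0,3.0"), (1, "4.0,5.0,6.0"), (2, "1.0,2.0,3.0")]

data_list = [header.split(',')]
for _, row in part_of_data:
    data_list.append(row.split(','))

# transpose: each row becomes [column_name, value_1, value_2, ...]
table_info = {}
for column in np.array(data_list).transpose():
    # keep at most 5 distinct sample values per column (set order is arbitrary)
    table_info[column[0]] = ','.join(list(set(column[1:]))[:5])

print(table_info)  # e.g. {'x0': '1.0,4.0', 'x1': '2.0,5.0', 'x2': '3.0,6.0'}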
Example #2
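This snippet appears to be the companion tracker method: save_output_data persists a component's in-memory computing table to the configured storage engine and registers the resulting table's metadata (address, schema, sample rows, count).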
 def save_output_data(self,
                      computing_table,
                      output_storage_engine,
                      output_storage_address: dict,
                      output_table_namespace=None,
                      output_table_name=None):
     if computing_table:
         if not output_table_namespace or not output_table_name:
             output_table_namespace, output_table_name = data_utils.default_output_table_info(
                 task_id=self.task_id, task_version=self.task_version)
         schedule_logger(self.job_id).info(
             'persisting the component output temporary table to {} {}'.format(
                 output_table_namespace, output_table_name))
         partitions = computing_table.partitions
         schedule_logger(self.job_id).info(
             'output data table partitions: {}'.format(partitions))
         address_dict = output_storage_address.copy()
         if output_storage_engine == StorageEngine.EGGROLL:
             address_dict.update({
                 "name": output_table_name,
                 "namespace": output_table_namespace,
                 "storage_type": storage.EggRollStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.STANDALONE:
             address_dict.update({
                 "name": output_table_name,
                 "namespace": output_table_namespace,
                 "storage_type": storage.StandaloneStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.HDFS:
             address_dict.update({
                 "path": data_utils.default_output_fs_path(
                     name=output_table_name,
                     namespace=output_table_namespace,
                     prefix=address_dict.get("path_prefix"))
             })
         else:
             raise RuntimeError(
                 f"{output_storage_engine} storage is not supported")
         address = storage.StorageTableMeta.create_address(
             storage_engine=output_storage_engine,
             address_dict=address_dict)
         schema = {}
         # persist the in-memory computing table as a storage table
         computing_table.save(address, schema=schema, partitions=partitions)
         # keep up to the first 100 rows as a data sample for the table meta
         part_of_data = []
         part_of_limit = 100
         for k, v in computing_table.collect():
             part_of_data.append((k, v))
             part_of_limit -= 1
             if part_of_limit == 0:
                 break
         table_count = computing_table.count()
         table_meta = storage.StorageTableMeta(
             name=output_table_name,
             namespace=output_table_namespace,
             new=True)
         table_meta.address = address
         table_meta.partitions = computing_table.partitions
         table_meta.engine = output_storage_engine
         table_meta.type = storage.EggRollStorageType.ROLLPAIR_LMDB
         table_meta.schema = schema
         table_meta.part_of_data = part_of_data
         table_meta.count = table_count
         table_meta.create()
         return output_table_namespace, output_table_name
     else:
         schedule_logger(self.job_id).info(
             'task id {} output data table is None'.format(self.task_id))
         return None, None
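The countdown loop that builds part_of_data is a common "take the first N items" pattern; itertools.islice expresses the same thing more directly. A minimal sketch, assuming only that collect() yields (key, value) pairs as in the method above:

from itertools import islice

def sample_rows(computing_table, limit=100):
    # take at most `limit` (key, value) pairs from the collect() stream,
    # mirroring the part_of_data / part_of_limit loop in save_output_data
    return list(islice(computing_table.collect(), limit))

Because islice consumes the generator lazily, no more than limit rows are ever pulled from the table.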