Example #1
    def convert_check(
        self,
        input_name,
        input_namespace,
        output_name,
        output_namespace,
        computing_engine: ComputingEngine = ComputingEngine.EGGROLL,
        output_storage_address=None,
    ) -> (StorageTableMetaABC, AddressABC, StorageEngine):
        input_table_meta = StorageTableMeta(name=input_name,
                                            namespace=input_namespace)
        if not input_table_meta:
            raise RuntimeError(
                f"cannot find table, name: {input_name} namespace: {input_namespace}"
            )
        # copy so the caller's address dict is never mutated
        address_dict = (output_storage_address or {}).copy()
        if computing_engine == ComputingEngine.STANDALONE:
            from fate_arch.storage import StandaloneStorageType
            address_dict["name"] = output_name
            address_dict["namespace"] = output_namespace
            address_dict["storage_type"] = StandaloneStorageType.ROLLPAIR_LMDB
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.STANDALONE,
                address_dict=address_dict)
            output_table_engine = StorageEngine.STANDALONE
        elif computing_engine == ComputingEngine.EGGROLL:
            from fate_arch.storage import EggRollStorageType
            address_dict["name"] = output_name
            address_dict["namespace"] = output_namespace
            address_dict["storage_type"] = EggRollStorageType.ROLLPAIR_LMDB
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.EGGROLL,
                address_dict=address_dict)
            output_table_engine = StorageEngine.EGGROLL
        elif computing_engine == ComputingEngine.SPARK:
            # Spark reads its input from HDFS, so build a filesystem path
            address_dict["path"] = data_utils.default_output_fs_path(
                name=output_name,
                namespace=output_namespace,
                prefix=address_dict.get("path_prefix"))
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.HDFS, address_dict=address_dict)
            output_table_engine = StorageEngine.HDFS
        else:
            raise RuntimeError(
                f"unsupported computing engine: {computing_engine}")
        return input_table_meta, output_table_address, output_table_engine
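
The helper resolves three things at once: the metadata of the existing input table, the address the output table should be created at, and the storage engine that address belongs to. A minimal call-site sketch, assuming a reader-style component instance and the fate_arch imports used above (the variable `reader` and the table names are illustrative, not from the source):

# Hedged sketch: illustrative names; assumes a running FATE deployment.
input_meta, out_address, out_engine = reader.convert_check(
    input_name="breast_hetero_guest",      # hypothetical input table
    input_namespace="experiment",          # hypothetical namespace
    output_name="output_table",
    output_namespace="output_namespace",
    computing_engine=ComputingEngine.EGGROLL,
    output_storage_address={})             # storage address from the job conf
# out_engine matches the computing engine's preferred storage backend,
# e.g. StorageEngine.EGGROLL here, or StorageEngine.HDFS for SPARK.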
Example #2
def create_storage_connector():
    request_data = request.json
    # build an engine-specific address object from the raw connector info
    address = StorageTableMeta.create_address(
        request_data.get("engine"), request_data.get("connector_info"))
    connector = StorageConnector(
        connector_name=request_data.get("connector_name"),
        engine=request_data.get("engine"),
        connector_info=address.connector)
    connector.create_or_update()
    return get_json_result(retcode=0, retmsg='success')
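
The handler takes all of its arguments from the JSON request body. A minimal client-side sketch, assuming a hypothetical route such as /v1/storage/connector/create on a local FATE Flow server (the URL and field values are illustrative only):

import requests

# Hedged sketch: endpoint URL and payload values are hypothetical.
payload = {
    "connector_name": "my_hdfs_connector",
    "engine": "HDFS",
    "connector_info": {"name_node": "hdfs://fate-cluster:9000", "path": "/data"},
}
resp = requests.post("http://127.0.0.1:9380/v1/storage/connector/create",
                     json=payload)
print(resp.json())  # expect {"retcode": 0, "retmsg": "success"}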
Example #3
    def run(self, component_parameters=None, args=None):
        self.parameters = component_parameters["ReaderParam"]
        output_storage_address = args["job_parameters"].engines_address[
            EngineType.STORAGE]
        # only one input table is supported
        table_key = [key for key in self.parameters.keys()][0]
        computing_engine = args["job_parameters"].computing_engine
        output_table_namespace, output_table_name = data_utils.default_output_table_info(
            task_id=self.tracker.task_id,
            task_version=self.tracker.task_version)
        input_table_meta, output_table_address, output_table_engine = self.convert_check(
            input_name=self.parameters[table_key]['name'],
            input_namespace=self.parameters[table_key]['namespace'],
            output_name=output_table_name,
            output_namespace=output_table_namespace,
            computing_engine=computing_engine,
            output_storage_address=output_storage_address)
        input_session_id = job_utils.generate_session_id(
            self.tracker.task_id,
            self.tracker.task_version,
            self.tracker.role,
            self.tracker.party_id,
            suffix="storage",
            random_end=True)
        with storage.Session.build(
                session_id=input_session_id,
                storage_engine=input_table_meta.get_engine()) as input_table_session:
            input_table = input_table_session.get_table(
                name=input_table_meta.get_name(),
                namespace=input_table_meta.get_namespace())
            # update real count to meta info
            input_table.count()
            # table replication is required when the engines differ
            if input_table_meta.get_engine() != output_table_engine:
                LOGGER.info(
                    f"the {input_table_meta.get_engine()} engine input table needs to be converted to the {output_table_engine} engine to support computing engine {computing_engine}"
                )
            else:
                LOGGER.info(
                    f"the {input_table_meta.get_engine()} input table only needs a format transformation"
                )
            with storage.Session.build(
                    session_id=job_utils.generate_session_id(
                        self.tracker.task_id,
                        self.tracker.task_version,
                        self.tracker.role,
                        self.tracker.party_id,
                        suffix="storage",
                        random_end=True),
                    storage_engine=output_table_engine
            ) as output_table_session:
                output_table = output_table_session.create_table(
                    address=output_table_address,
                    name=output_table_name,
                    namespace=output_table_namespace,
                    partitions=input_table_meta.partitions)
                self.copy_table(src_table=input_table, dest_table=output_table)
                # update real count to meta info
                output_table.count()
                output_table_meta = StorageTableMeta(
                    name=output_table.get_name(),
                    namespace=output_table.get_namespace())
        self.tracker.log_output_data_info(
            data_name=component_parameters.get('output_data_name')[0]
            if component_parameters.get('output_data_name') else table_key,
            table_namespace=output_table_meta.get_namespace(),
            table_name=output_table_meta.get_name())
        schema = output_table_meta.get_schema() or {}
        headers_str = schema.get('header')
        table_info = {}
        if headers_str:
            if isinstance(headers_str, str):
                data_list = [headers_str.split(',')]
                is_display = True
            else:
                data_list = [headers_str]
                is_display = False
            if is_display:
                # keep up to 5 distinct sample values per column
                for data in output_table_meta.get_part_of_data():
                    data_list.append(data[1].split(','))
                data = np.array(data_list)
                Tdata = data.transpose()
                for data in Tdata:
                    table_info[data[0]] = ','.join(list(set(data[1:]))[:5])
        data_info = {
            "table_name": self.parameters[table_key]['name'],
            "namespace": self.parameters[table_key]['namespace'],
            "table_info": table_info,
            "partitions": output_table_meta.get_partitions(),
            "storage_engine": output_table_meta.get_engine()
        }
        if input_table_meta.get_engine() in [StorageEngine.PATH]:
            data_info["file_count"] = output_table_meta.get_count()
            data_info["file_path"] = input_table_meta.get_address().path
        else:
            data_info["count"] = output_table_meta.get_count()

        self.tracker.set_metric_meta(metric_namespace="reader_namespace",
                                     metric_name="reader_name",
                                     metric_meta=MetricMeta(
                                         name='reader',
                                         metric_type='data_info',
                                         extra_metas=data_info))
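
The table_info block above is worth unpacking: the header row plus a few sample rows are stacked into a matrix, transposed so that each matrix row becomes one table column, and up to 5 distinct sample values are kept per column. A self-contained sketch of the same idea on toy data (not from the source):

import numpy as np

# Header row first, then data rows, mirroring the shape the reader builds.
data_list = [
    ["id", "age", "label"],
    ["1", "30", "0"],
    ["2", "41", "1"],
    ["3", "30", "1"],
]
table_info = {}
for column in np.array(data_list).transpose():
    # column[0] is the header name; the rest are that column's values
    table_info[column[0]] = ",".join(list(set(column[1:]))[:5])
print(table_info)  # e.g. {'id': '1,2,3', 'age': '30,41', 'label': '0,1'}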
Example #4
    def _run(self, cpn_input: ComponentInputProtocol):
        self.parameters = cpn_input.parameters
        self.job_parameters = cpn_input.job_parameters
        output_storage_address = self.job_parameters.engines_address[
            EngineType.STORAGE]
        # only support one input table
        table_key = [key for key in self.parameters.keys()][0]

        input_table_namespace, input_table_name = self.get_input_table_info(
            parameters=self.parameters[table_key],
            role=self.tracker.role,
            party_id=self.tracker.party_id,
        )
        (
            output_table_namespace,
            output_table_name,
        ) = default_output_info(
            task_id=self.tracker.task_id,
            task_version=self.tracker.task_version,
            output_type="data",
        )
        (
            input_table_meta,
            output_table_address,
            output_table_engine,
        ) = self.convert_check(
            input_name=input_table_name,
            input_namespace=input_table_namespace,
            output_name=output_table_name,
            output_namespace=output_table_namespace,
            computing_engine=self.job_parameters.computing_engine,
            output_storage_address=output_storage_address,
        )
        sess = Session.get_global()

        input_table = sess.get_table(
            name=input_table_meta.get_name(),
            namespace=input_table_meta.get_namespace())
        # update real count to meta info
        input_table.count()
        # table replication is required when the engines differ
        if input_table_meta.get_engine() != output_table_engine:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} engine input table needs to be converted to the {output_table_engine} engine to support computing engine {self.job_parameters.computing_engine}"
            )
        else:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} input table only needs a format transformation"
            )
        LOGGER.info("reader creates output storage session")
        output_table_session = sess.storage(storage_engine=output_table_engine)
        output_table = output_table_session.create_table(
            address=output_table_address,
            name=output_table_name,
            namespace=output_table_namespace,
            partitions=input_table_meta.partitions,
            origin=StorageTableOrigin.READER)
        self.save_table(src_table=input_table, dest_table=output_table)
        # reload the table meta, which now carries the real count
        output_table_meta = StorageTableMeta(name=output_table.name,
                                             namespace=output_table.namespace)
        # TODO: maybe set output data, and let the executor support passing persistent data
        output_data_names = cpn_input.flow_feeded_parameters.get("output_data_name")
        self.tracker.log_output_data_info(
            data_name=output_data_names[0] if output_data_names else table_key,
            table_namespace=output_table_meta.get_namespace(),
            table_name=output_table_meta.get_name(),
        )
        DataTableTracker.create_table_tracker(
            output_table_meta.get_name(),
            output_table_meta.get_namespace(),
            entity_info={
                "have_parent": True,
                "parent_table_namespace": input_table_namespace,
                "parent_table_name": input_table_name,
                "job_id": self.tracker.job_id,
            },
        )
        schema = output_table_meta.get_schema() or {}
        headers_str = schema.get("header")
        table_info = {}
        if headers_str:
            if isinstance(headers_str, str):
                data_list = [headers_str.split(",")]
                is_display = True
            else:
                data_list = [headers_str]
                is_display = False
            if is_display:
                # keep up to 5 distinct sample values per column
                for data in output_table_meta.get_part_of_data():
                    data_list.append(data[1].split(","))
                data = np.array(data_list)
                Tdata = data.transpose()
                for data in Tdata:
                    table_info[data[0]] = ",".join(list(set(data[1:]))[:5])
        data_info = {
            "table_name": input_table_name,
            "namespace": input_table_namespace,
            "table_info": table_info,
            "partitions": output_table_meta.get_partitions(),
            "storage_engine": output_table_meta.get_engine(),
        }
        if input_table_meta.get_engine() in [StorageEngine.PATH]:
            data_info["file_count"] = output_table_meta.get_count()
            data_info["file_path"] = input_table_meta.get_address().path
        else:
            data_info["count"] = output_table_meta.get_count()

        self.tracker.set_metric_meta(
            metric_namespace="reader_namespace",
            metric_name="reader_name",
            metric_meta=MetricMeta(name="reader",
                                   metric_type="data_info",
                                   extra_metas=data_info),
        )
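
The main structural difference from Example #3 is session handling: instead of opening scoped sessions with `with storage.Session.build(...)`, this version reuses one process-global session via Session.get_global() and asks it for an engine-specific storage session. A self-contained sketch of the two lifetimes (the Session class below is a stand-in for illustration, not the fate_arch API):

# Stand-in demo of scoped vs. global session lifetimes; not the fate_arch API.
class Session:
    _global = None

    def __init__(self, session_id):
        self.session_id = session_id

    @classmethod
    def get_global(cls):
        # one long-lived session shared by the whole task process
        if cls._global is None:
            cls._global = cls("global-session")
        return cls._global

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        print(f"closing {self.session_id}")  # scoped: cleaned up at block exit

with Session("scoped-session") as s:  # Example #3 style: per-block lifetime
    pass
assert Session.get_global() is Session.get_global()  # Example #4 style: shared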
Example #5
    @staticmethod
    def convert_check(
        input_name,
        input_namespace,
        output_name,
        output_namespace,
        computing_engine: ComputingEngine = ComputingEngine.EGGROLL,
        output_storage_address=None,
    ) -> (StorageTableMetaABC, AddressABC, StorageEngine):
        input_table_meta = StorageTableMeta(name=input_name,
                                            namespace=input_namespace)

        if not input_table_meta:
            raise RuntimeError(
                f"cannot find table, name: {input_name} namespace: {input_namespace}"
            )
        # copy so the caller's address dict is never mutated
        address_dict = (output_storage_address or {}).copy()
        if input_table_meta.get_engine() in [StorageEngine.PATH]:
            # PATH tables are not replicated; the output keeps pointing at the input path
            from fate_arch.storage import PathStoreType

            address_dict["name"] = output_name
            address_dict["namespace"] = output_namespace
            address_dict["storage_type"] = PathStoreType.PICTURE
            address_dict["path"] = input_table_meta.get_address().path
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.PATH, address_dict=address_dict)
            output_table_engine = StorageEngine.PATH
        elif computing_engine == ComputingEngine.STANDALONE:
            from fate_arch.storage import StandaloneStoreType

            address_dict["name"] = output_name
            address_dict["namespace"] = output_namespace
            address_dict["storage_type"] = StandaloneStoreType.ROLLPAIR_LMDB
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.STANDALONE,
                address_dict=address_dict)
            output_table_engine = StorageEngine.STANDALONE
        elif computing_engine == ComputingEngine.EGGROLL:
            from fate_arch.storage import EggRollStoreType

            address_dict["name"] = output_name
            address_dict["namespace"] = output_namespace
            address_dict["storage_type"] = EggRollStoreType.ROLLPAIR_LMDB
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.EGGROLL,
                address_dict=address_dict)
            output_table_engine = StorageEngine.EGGROLL
        elif computing_engine == ComputingEngine.SPARK:
            if input_table_meta.get_engine() == StorageEngine.HIVE:
                output_table_address = input_table_meta.get_address()
                output_table_address.name = output_name
                output_table_engine = input_table_meta.get_engine()
            elif input_table_meta.get_engine() == StorageEngine.LOCALFS:
                output_table_address = input_table_meta.get_address()
                output_table_address.path = default_output_fs_path(
                    name=output_name,
                    namespace=output_namespace,
                    storage_engine=StorageEngine.LOCALFS)
                output_table_engine = input_table_meta.get_engine()
            else:
                address_dict["path"] = default_output_fs_path(
                    name=output_name,
                    namespace=output_namespace,
                    prefix=address_dict.get("path_prefix"),
                    storage_engine=StorageEngine.HDFS)
                output_table_address = StorageTableMeta.create_address(
                    storage_engine=StorageEngine.HDFS,
                    address_dict=address_dict)
                output_table_engine = StorageEngine.HDFS
        elif computing_engine == ComputingEngine.LINKIS_SPARK:
            # reuse the input table's address and engine, only renaming the table
            output_table_address = input_table_meta.get_address()
            output_table_address.name = output_name
            output_table_engine = input_table_meta.get_engine()
        else:
            raise RuntimeError(
                f"unsupported computing engine: {computing_engine}")
        return input_table_meta, output_table_address, output_table_engine
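
As the if/elif chain grows with each new engine pairing, a common refactoring is a dispatch table mapping each computing engine to its default output storage engine. A minimal self-contained sketch of that shape, with stand-in enums rather than the fate_arch ones:

from enum import Enum

# Stand-in enums for illustration; the real ones live in fate_arch.
class ComputingEngine(Enum):
    STANDALONE = "STANDALONE"
    EGGROLL = "EGGROLL"
    SPARK = "SPARK"

class StorageEngine(Enum):
    STANDALONE = "STANDALONE"
    EGGROLL = "EGGROLL"
    HDFS = "HDFS"

# computing engine -> default storage engine for its output tables
DEFAULT_OUTPUT_STORAGE = {
    ComputingEngine.STANDALONE: StorageEngine.STANDALONE,
    ComputingEngine.EGGROLL: StorageEngine.EGGROLL,
    ComputingEngine.SPARK: StorageEngine.HDFS,
}

def output_engine_for(computing_engine):
    try:
        return DEFAULT_OUTPUT_STORAGE[computing_engine]
    except KeyError:
        raise RuntimeError(f"unsupported computing engine: {computing_engine}")

assert output_engine_for(ComputingEngine.SPARK) is StorageEngine.HDFS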