def convert_check(
    self,
    input_name,
    input_namespace,
    output_name,
    output_namespace,
    computing_engine: ComputingEngine = ComputingEngine.EGGROLL,
    output_storage_address=None,
) -> (StorageTableMetaABC, AddressABC, StorageEngine):
    # avoid the mutable-default pitfall: a shared {} default would leak state between calls
    if output_storage_address is None:
        output_storage_address = {}
    input_table_meta = StorageTableMeta(name=input_name, namespace=input_namespace)
    if not input_table_meta:
        raise RuntimeError(
            f"cannot find table, name: {input_name}, namespace: {input_namespace}"
        )
    address_dict = output_storage_address.copy()
    if computing_engine == ComputingEngine.STANDALONE:
        from fate_arch.storage import StandaloneStorageType
        address_dict["name"] = output_name
        address_dict["namespace"] = output_namespace
        address_dict["storage_type"] = StandaloneStorageType.ROLLPAIR_LMDB
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.STANDALONE, address_dict=address_dict)
        output_table_engine = StorageEngine.STANDALONE
    elif computing_engine == ComputingEngine.EGGROLL:
        from fate_arch.storage import EggRollStorageType
        address_dict["name"] = output_name
        address_dict["namespace"] = output_namespace
        address_dict["storage_type"] = EggRollStorageType.ROLLPAIR_LMDB
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.EGGROLL, address_dict=address_dict)
        output_table_engine = StorageEngine.EGGROLL
    elif computing_engine == ComputingEngine.SPARK:
        # SPARK reads from HDFS: derive the output path from name/namespace
        address_dict["path"] = data_utils.default_output_fs_path(
            name=output_name,
            namespace=output_namespace,
            prefix=address_dict.get("path_prefix"))
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.HDFS, address_dict=address_dict)
        output_table_engine = StorageEngine.HDFS
    else:
        raise RuntimeError(f"unsupported computing engine {computing_engine}")
    return input_table_meta, output_table_address, output_table_engine
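# --- usage sketch (hypothetical, not part of the source) ---------------------
# Shows how the EGGROLL branch of convert_check above resolves. The table
# names are made up, `reader` stands in for a component instance, and the
# imports mirror the ones the surrounding module is assumed to use.
from fate_arch.computing import ComputingEngine
from fate_arch.storage import StorageEngine


def demo_convert_check_eggroll(reader):
    input_meta, out_addr, out_engine = reader.convert_check(
        input_name="breast_hetero_guest",         # assumed existing table
        input_namespace="experiment",             # assumed namespace
        output_name="output_table_demo",
        output_namespace="output_namespace_demo",
        computing_engine=ComputingEngine.EGGROLL,
        output_storage_address={},
    )
    # for EGGROLL the output address carries name/namespace plus
    # storage_type=EggRollStorageType.ROLLPAIR_LMDB
    assert out_engine == StorageEngine.EGGROLL
    return input_meta, out_addr, out_engine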
def create_storage_connector():
    # register or update a named storage connector from the request payload
    request_data = request.json
    address = StorageTableMeta.create_address(
        request_data.get("engine"), request_data.get("connector_info"))
    connector = StorageConnector(
        connector_name=request_data.get("connector_name"),
        engine=request_data.get("engine"),
        connector_info=address.connector)
    connector.create_or_update()
    return get_json_result(retcode=0, retmsg='success')
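# --- request sketch (hypothetical) --------------------------------------------
# Example JSON payload for the handler above. The endpoint path, port, and the
# MYSQL-style connector_info fields are assumptions for illustration; only the
# three top-level keys come from the handler itself.
import requests

payload = {
    "connector_name": "demo_mysql_connector",   # made-up connector name
    "engine": "MYSQL",                          # assumed engine identifier
    "connector_info": {                         # assumed address fields
        "user": "fate",
        "passwd": "fate",
        "host": "127.0.0.1",
        "port": 3306,
        "db": "experiment",
        "name": None,
    },
}
response = requests.post(
    "http://127.0.0.1:9380/v1/storage/connector/create",  # assumed route
    json=payload,
)
print(response.json())  # handler returns {"retcode": 0, "retmsg": "success"}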
def run(self, component_parameters=None, args=None):
    self.parameters = component_parameters["ReaderParam"]
    output_storage_address = args["job_parameters"].engines_address[EngineType.STORAGE]
    # only one input table is supported
    table_key = [key for key in self.parameters.keys()][0]
    computing_engine = args["job_parameters"].computing_engine
    output_table_namespace, output_table_name = data_utils.default_output_table_info(
        task_id=self.tracker.task_id, task_version=self.tracker.task_version)
    input_table_meta, output_table_address, output_table_engine = self.convert_check(
        input_name=self.parameters[table_key]['name'],
        input_namespace=self.parameters[table_key]['namespace'],
        output_name=output_table_name,
        output_namespace=output_table_namespace,
        computing_engine=computing_engine,
        output_storage_address=output_storage_address)
    with storage.Session.build(
            session_id=job_utils.generate_session_id(
                self.tracker.task_id,
                self.tracker.task_version,
                self.tracker.role,
                self.tracker.party_id,
                suffix="storage",
                random_end=True),
            storage_engine=input_table_meta.get_engine()) as input_table_session:
        input_table = input_table_session.get_table(
            name=input_table_meta.get_name(),
            namespace=input_table_meta.get_namespace())
        # update the real count in the table meta
        input_table.count()
        # table replication is required when the storage engines differ
        if input_table_meta.get_engine() != output_table_engine:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} engine input table needs to be converted "
                f"to {output_table_engine} engine to support computing engine {computing_engine}")
        else:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} input table needs a format transformation")
        with storage.Session.build(
                session_id=job_utils.generate_session_id(
                    self.tracker.task_id,
                    self.tracker.task_version,
                    self.tracker.role,
                    self.tracker.party_id,
                    suffix="storage",
                    random_end=True),
                storage_engine=output_table_engine) as output_table_session:
            output_table = output_table_session.create_table(
                address=output_table_address,
                name=output_table_name,
                namespace=output_table_namespace,
                partitions=input_table_meta.partitions)
            self.copy_table(src_table=input_table, dest_table=output_table)
            # update the real count in the table meta
            output_table.count()
            output_table_meta = StorageTableMeta(
                name=output_table.get_name(),
                namespace=output_table.get_namespace())
    self.tracker.log_output_data_info(
        data_name=component_parameters.get('output_data_name')[0]
        if component_parameters.get('output_data_name') else table_key,
        table_namespace=output_table_meta.get_namespace(),
        table_name=output_table_meta.get_name())
    headers_str = output_table_meta.get_schema().get('header')
    table_info = {}
    if output_table_meta.get_schema() and headers_str:
        if isinstance(headers_str, str):
            data_list = [headers_str.split(',')]
            is_display = True
        else:
            data_list = [headers_str]
            is_display = False
        if is_display:
            # stack header + sample rows, transpose, then map each header
            # to up to 5 distinct example values from its column
            for data in output_table_meta.get_part_of_data():
                data_list.append(data[1].split(','))
            data = np.array(data_list)
            Tdata = data.transpose()
            for data in Tdata:
                table_info[data[0]] = ','.join(list(set(data[1:]))[:5])
    data_info = {
        "table_name": self.parameters[table_key]['name'],
        "namespace": self.parameters[table_key]['namespace'],
        "table_info": table_info,
        "partitions": output_table_meta.get_partitions(),
        "storage_engine": output_table_meta.get_engine()
    }
    if input_table_meta.get_engine() in [StorageEngine.PATH]:
        data_info["file_count"] = output_table_meta.get_count()
        data_info["file_path"] = input_table_meta.get_address().path
    else:
        data_info["count"] = output_table_meta.get_count()
    self.tracker.set_metric_meta(
        metric_namespace="reader_namespace",
        metric_name="reader_name",
        metric_meta=MetricMeta(
            name='reader',
            metric_type='data_info',
            extra_metas=data_info))
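# --- helper sketch (hypothetical) ----------------------------------------------
# copy_table is invoked above but not shown. A minimal sketch of a batched copy,
# assuming the storage tables expose collect() and put_all() and that the meta
# object supports update_metas(); none of this is the actual FATE implementation.
def copy_table(src_table, dest_table, batch_size=10000):
    buffer = []
    for key, value in src_table.collect():      # assumed iterator of (k, v) pairs
        buffer.append((key, value))
        if len(buffer) == batch_size:
            dest_table.put_all(buffer)          # assumed bulk-write API
            buffer = []
    if buffer:
        dest_table.put_all(buffer)
    # carry the source schema over so downstream readers see the same header
    dest_table.meta.update_metas(schema=src_table.meta.get_schema())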
def _run(self, cpn_input: ComponentInputProtocol):
    self.parameters = cpn_input.parameters
    self.job_parameters = cpn_input.job_parameters
    output_storage_address = self.job_parameters.engines_address[EngineType.STORAGE]
    # only one input table is supported
    table_key = [key for key in self.parameters.keys()][0]
    input_table_namespace, input_table_name = self.get_input_table_info(
        parameters=self.parameters[table_key],
        role=self.tracker.role,
        party_id=self.tracker.party_id,
    )
    (
        output_table_namespace,
        output_table_name,
    ) = default_output_info(
        task_id=self.tracker.task_id,
        task_version=self.tracker.task_version,
        output_type="data",
    )
    (
        input_table_meta,
        output_table_address,
        output_table_engine,
    ) = self.convert_check(
        input_name=input_table_name,
        input_namespace=input_table_namespace,
        output_name=output_table_name,
        output_namespace=output_table_namespace,
        computing_engine=self.job_parameters.computing_engine,
        output_storage_address=output_storage_address,
    )
    sess = Session.get_global()
    input_table = sess.get_table(
        name=input_table_meta.get_name(),
        namespace=input_table_meta.get_namespace())
    # update the real count in the table meta
    input_table.count()
    # table replication is required when the storage engines differ
    if input_table_meta.get_engine() != output_table_engine:
        LOGGER.info(
            f"the {input_table_meta.get_engine()} engine input table needs to be converted "
            f"to {output_table_engine} engine to support computing engine "
            f"{self.job_parameters.computing_engine}")
    else:
        LOGGER.info(
            f"the {input_table_meta.get_engine()} input table needs a format transformation")
    LOGGER.info("reader create output storage session")
    output_table_session = sess.storage(storage_engine=output_table_engine)
    output_table = output_table_session.create_table(
        address=output_table_address,
        name=output_table_name,
        namespace=output_table_namespace,
        partitions=input_table_meta.partitions,
        origin=StorageTableOrigin.READER)
    self.save_table(src_table=input_table, dest_table=output_table)
    # update the real count in the table meta
    output_table_meta = StorageTableMeta(
        name=output_table.name, namespace=output_table.namespace)
    # TODO: maybe set output data and let the executor support passing persistent tables
    self.tracker.log_output_data_info(
        data_name=cpn_input.flow_feeded_parameters.get("output_data_name")[0]
        if cpn_input.flow_feeded_parameters.get("output_data_name")
        else table_key,
        table_namespace=output_table_meta.get_namespace(),
        table_name=output_table_meta.get_name(),
    )
    DataTableTracker.create_table_tracker(
        output_table_meta.get_name(),
        output_table_meta.get_namespace(),
        entity_info={
            "have_parent": True,
            "parent_table_namespace": input_table_namespace,
            "parent_table_name": input_table_name,
            "job_id": self.tracker.job_id,
        },
    )
    headers_str = output_table_meta.get_schema().get("header")
    table_info = {}
    if output_table_meta.get_schema() and headers_str:
        if isinstance(headers_str, str):
            data_list = [headers_str.split(",")]
            is_display = True
        else:
            data_list = [headers_str]
            is_display = False
        if is_display:
            # stack header + sample rows, transpose, then map each header
            # to up to 5 distinct example values from its column
            for data in output_table_meta.get_part_of_data():
                data_list.append(data[1].split(","))
            data = np.array(data_list)
            Tdata = data.transpose()
            for data in Tdata:
                table_info[data[0]] = ",".join(list(set(data[1:]))[:5])
    data_info = {
        "table_name": input_table_name,
        "namespace": input_table_namespace,
        "table_info": table_info,
        "partitions": output_table_meta.get_partitions(),
        "storage_engine": output_table_meta.get_engine(),
    }
    if input_table_meta.get_engine() in [StorageEngine.PATH]:
        data_info["file_count"] = output_table_meta.get_count()
        data_info["file_path"] = input_table_meta.get_address().path
    else:
        data_info["count"] = output_table_meta.get_count()
    self.tracker.set_metric_meta(
        metric_namespace="reader_namespace",
        metric_name="reader_name",
        metric_meta=MetricMeta(
            name="reader", metric_type="data_info", extra_metas=data_info),
    )
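# --- standalone illustration -----------------------------------------------------
# The header-to-sample mapping above is dense; here is the same transpose trick on
# made-up data. Stacking the header row with comma-split sample rows and transposing
# yields one column per field, keyed by its header name, with up to 5 distinct
# example values each.
import numpy as np

headers_str = "id,age,label"
part_of_data = [("k1", "1,38,0"), ("k2", "2,45,1"), ("k3", "3,38,0")]

data_list = [headers_str.split(",")]
for _, row in part_of_data:
    data_list.append(row.split(","))

table_info = {}
for column in np.array(data_list).transpose():
    table_info[column[0]] = ",".join(list(set(column[1:]))[:5])

print(table_info)  # e.g. {'id': '1,2,3', 'age': '38,45', 'label': '0,1'}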
def convert_check(
    input_name,
    input_namespace,
    output_name,
    output_namespace,
    computing_engine: ComputingEngine = ComputingEngine.EGGROLL,
    output_storage_address=None,
) -> (StorageTableMetaABC, AddressABC, StorageEngine):
    # avoid the mutable-default pitfall: a shared {} default would leak state between calls
    if output_storage_address is None:
        output_storage_address = {}
    input_table_meta = StorageTableMeta(name=input_name, namespace=input_namespace)
    if not input_table_meta:
        raise RuntimeError(
            f"cannot find table, name: {input_name}, namespace: {input_namespace}"
        )
    address_dict = output_storage_address.copy()
    # PATH tables are passed through in place; no data is copied
    if input_table_meta.get_engine() in [StorageEngine.PATH]:
        from fate_arch.storage import PathStoreType
        address_dict["name"] = output_name
        address_dict["namespace"] = output_namespace
        address_dict["storage_type"] = PathStoreType.PICTURE
        address_dict["path"] = input_table_meta.get_address().path
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.PATH, address_dict=address_dict)
        output_table_engine = StorageEngine.PATH
    elif computing_engine == ComputingEngine.STANDALONE:
        from fate_arch.storage import StandaloneStoreType
        address_dict["name"] = output_name
        address_dict["namespace"] = output_namespace
        address_dict["storage_type"] = StandaloneStoreType.ROLLPAIR_LMDB
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.STANDALONE, address_dict=address_dict)
        output_table_engine = StorageEngine.STANDALONE
    elif computing_engine == ComputingEngine.EGGROLL:
        from fate_arch.storage import EggRollStoreType
        address_dict["name"] = output_name
        address_dict["namespace"] = output_namespace
        address_dict["storage_type"] = EggRollStoreType.ROLLPAIR_LMDB
        output_table_address = StorageTableMeta.create_address(
            storage_engine=StorageEngine.EGGROLL, address_dict=address_dict)
        output_table_engine = StorageEngine.EGGROLL
    elif computing_engine == ComputingEngine.SPARK:
        if input_table_meta.get_engine() == StorageEngine.HIVE:
            # reuse the input HIVE address; only the table name changes
            output_table_address = input_table_meta.get_address()
            output_table_address.name = output_name
            output_table_engine = input_table_meta.get_engine()
        elif input_table_meta.get_engine() == StorageEngine.LOCALFS:
            # reuse the input LOCALFS address with a freshly derived path
            output_table_address = input_table_meta.get_address()
            output_table_address.path = default_output_fs_path(
                name=output_name,
                namespace=output_namespace,
                storage_engine=StorageEngine.LOCALFS)
            output_table_engine = input_table_meta.get_engine()
        else:
            # all other inputs are re-addressed onto HDFS
            address_dict["path"] = default_output_fs_path(
                name=output_name,
                namespace=output_namespace,
                prefix=address_dict.get("path_prefix"),
                storage_engine=StorageEngine.HDFS)
            output_table_address = StorageTableMeta.create_address(
                storage_engine=StorageEngine.HDFS, address_dict=address_dict)
            output_table_engine = StorageEngine.HDFS
    elif computing_engine == ComputingEngine.LINKIS_SPARK:
        output_table_address = input_table_meta.get_address()
        output_table_address.name = output_name
        output_table_engine = input_table_meta.get_engine()
    else:
        raise RuntimeError(f"unsupported computing engine {computing_engine}")
    return input_table_meta, output_table_address, output_table_engine
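# --- usage sketch (hypothetical) ---------------------------------------------------
# How the SPARK branch above resolves when the input table is neither PATH, HIVE,
# nor LOCALFS backed: the output is re-addressed onto HDFS via default_output_fs_path.
# All names and the path_prefix value are made up.
def demo_convert_check_spark():
    input_meta, out_addr, out_engine = convert_check(
        input_name="breast_hetero_guest",          # assumed EGGROLL-backed table
        input_namespace="experiment",
        output_name="output_table_demo",
        output_namespace="output_namespace_demo",
        computing_engine=ComputingEngine.SPARK,
        output_storage_address={"path_prefix": "/fate"},  # assumed prefix
    )
    assert out_engine == StorageEngine.HDFS
    return input_meta, out_addr, out_engine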