Example 1: convert_check
Resolves the input table's metadata and, for the given computing engine, derives the address and storage engine that the output table should use.
 # Assumed imports for this method excerpt (paths follow fate_arch; adjust to your tree):
 # from typing import Tuple
 # from fate_arch.abc import AddressABC, StorageTableMetaABC
 # from fate_arch.computing import ComputingEngine
 # from fate_arch.storage import StorageEngine, StorageTableMeta
 # `data_utils` is provided by the enclosing fate_flow module.
 def convert_check(
     self,
     input_name,
     input_namespace,
     output_name,
     output_namespace,
     computing_engine: ComputingEngine = ComputingEngine.EGGROLL,
     output_storage_address: dict = None  # avoid a shared mutable default
 ) -> Tuple[StorageTableMetaABC, AddressABC, StorageEngine]:
     input_table_meta = StorageTableMeta(name=input_name,
                                         namespace=input_namespace)
     if not input_table_meta:
         raise RuntimeError(
             f"can not found table name: {input_name} namespace: {input_namespace}"
         )
     address_dict = (output_storage_address or {}).copy()
     # PATH input tables are passed through: keep the source path, no data is moved
     if input_table_meta.get_engine() == StorageEngine.PATH:
         from fate_arch.storage import PathStorageType
         address_dict["name"] = output_name
         address_dict["namespace"] = output_namespace
         address_dict["storage_type"] = PathStorageType.PICTURE
         address_dict["path"] = input_table_meta.get_address().path
         output_table_address = StorageTableMeta.create_address(
             storage_engine=StorageEngine.PATH, address_dict=address_dict)
         output_table_engine = StorageEngine.PATH
     # STANDALONE computing writes the output to a local ROLLPAIR_LMDB table
     elif computing_engine == ComputingEngine.STANDALONE:
         from fate_arch.storage import StandaloneStorageType
         address_dict["name"] = output_name
         address_dict["namespace"] = output_namespace
         address_dict["storage_type"] = StandaloneStorageType.ROLLPAIR_LMDB
         output_table_address = StorageTableMeta.create_address(
             storage_engine=StorageEngine.STANDALONE,
             address_dict=address_dict)
         output_table_engine = StorageEngine.STANDALONE
     # EGGROLL computing writes the output to an EggRoll ROLLPAIR_LMDB table
     elif computing_engine == ComputingEngine.EGGROLL:
         from fate_arch.storage import EggRollStorageType
         address_dict["name"] = output_name
         address_dict["namespace"] = output_namespace
         address_dict["storage_type"] = EggRollStorageType.ROLLPAIR_LMDB
         output_table_address = StorageTableMeta.create_address(
             storage_engine=StorageEngine.EGGROLL,
             address_dict=address_dict)
         output_table_engine = StorageEngine.EGGROLL
     # SPARK computing persists the output through HDFS
     elif computing_engine == ComputingEngine.SPARK:
         address_dict["path"] = data_utils.default_output_fs_path(
             name=output_name,
             namespace=output_namespace,
             prefix=address_dict.get("path_prefix"))
         output_table_address = StorageTableMeta.create_address(
             storage_engine=StorageEngine.HDFS, address_dict=address_dict)
         output_table_engine = StorageEngine.HDFS
     else:
         raise RuntimeError(
             f"can not support computing engine {computing_engine}")
     return input_table_meta, output_table_address, output_table_engine
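A minimal call sketch, assuming a tracker-like object (`tracker`, hypothetical) that exposes this method; the table names and namespaces are illustrative:

 input_meta, output_address, output_engine = tracker.convert_check(
     input_name="breast_hetero_guest",
     input_namespace="experiment",
     output_name="breast_hetero_guest_out",
     output_namespace="experiment",
     computing_engine=ComputingEngine.STANDALONE,
 )
 # output_engine is StorageEngine.STANDALONE and output_address describes a
 # ROLLPAIR_LMDB table "breast_hetero_guest_out" in namespace "experiment".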
Example 2: save_output_data
Persists a component's in-memory computing table to the configured storage engine and registers its metadata: address, partitions, schema, a data preview, and the row count.
 # Assumed imports for this method excerpt (paths follow FATE; adjust to your tree):
 # from fate_arch import storage
 # from fate_arch.storage import StorageEngine
 # `data_utils` and `schedule_logger` come from the enclosing fate_flow module.
 def save_output_data(self,
                      computing_table,
                      output_storage_engine,
                      output_storage_address: dict,
                      output_table_namespace=None,
                      output_table_name=None):
     if computing_table:
         if not output_table_namespace or not output_table_name:
             output_table_namespace, output_table_name = data_utils.default_output_table_info(
                 task_id=self.task_id, task_version=self.task_version)
         schedule_logger(self.job_id).info(
             f'persisting the component output temporary table to '
             f'namespace {output_table_namespace}, name {output_table_name}')
         partitions = computing_table.partitions
         schedule_logger(self.job_id).info(
             f'output data table partitions: {partitions}')
         address_dict = output_storage_address.copy()
         if output_storage_engine == StorageEngine.EGGROLL:
             address_dict.update({
                 "name": output_table_name,
                 "namespace": output_table_namespace,
                 "storage_type": storage.EggRollStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.STANDALONE:
             address_dict.update({
                 "name": output_table_name,
                 "namespace": output_table_namespace,
                 "storage_type": storage.StandaloneStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.HDFS:
             address_dict.update({
                 "path": data_utils.default_output_fs_path(
                     name=output_table_name,
                     namespace=output_table_namespace,
                     prefix=address_dict.get("path_prefix"))
             })
         else:
             raise RuntimeError(
                 f"{output_storage_engine} storage is not supported")
         address = storage.StorageTableMeta.create_address(
             storage_engine=output_storage_engine,
             address_dict=address_dict)
         schema = {}
         # persist the in-memory computing table to the storage engine
         computing_table.save(address, schema=schema, partitions=partitions)
         # keep a preview of at most 100 rows to store with the table metadata
         part_of_data = []
         part_of_limit = 100
         for k, v in computing_table.collect():
             part_of_data.append((k, v))
             part_of_limit -= 1
             if part_of_limit == 0:
                 break
         table_count = computing_table.count()
         table_meta = storage.StorageTableMeta(
             name=output_table_name,
             namespace=output_table_namespace,
             new=True)
         table_meta.address = address
         table_meta.partitions = computing_table.partitions
         table_meta.engine = output_storage_engine
         # engine-specific storage type set in address_dict above (None for HDFS)
         table_meta.type = address_dict.get("storage_type")
         table_meta.schema = schema
         table_meta.part_of_data = part_of_data
         table_meta.count = table_count
         table_meta.create()
         return output_table_namespace, output_table_name
     else:
         schedule_logger(self.job_id).info(
             f'task {self.task_id} has no output data table')
         return None, None
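A call-site sketch, assuming a tracker-like object (`tracker`, hypothetical) that exposes this method and a computing table `table` produced by a component; names are illustrative:

 namespace, name = tracker.save_output_data(
     computing_table=table,
     output_storage_engine=StorageEngine.EGGROLL,
     output_storage_address={},  # engine-specific fields are filled in by the method
 )
 # When namespace/name are not passed, they default to
 # data_utils.default_output_table_info(task_id=..., task_version=...).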