def _generate_parallelize_data(start_num, end_num, feature_nums, table_name, namespace, label_flag, data_type,
                                   partition, progress):
        def expand_id_range(k, v):
            if label_flag:
                # binary label followed by int(v) gaussian features
                return [(id_encryption(encryption_type, ids, ids + 1)[0],
                         ",".join([str(round(np.random.random()))] + [str(round(i, 4)) for i in np.random.randn(int(v))]))
                        for ids in range(int(k), min(step + int(k), data_num))]
            else:
                if data_type == 'tag':
                    # tag ids drawn from a fixed pool whose size is controlled by sparsity
                    data = list(map(str, range(2019120799, 2019120799 + round(feature_nums / sparsity))))
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ";".join([random.choice(data) for _ in range(int(v))]))
                            for ids in range(int(k), min(step + int(k), data_num))]

                elif data_type == 'tag_value':
                    # "x{index}:{value}" pairs: index from enumerate, value gaussian
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ";".join([f"x{idx}:{round(value, 4)}" for idx, value in enumerate(np.random.randn(int(v)))]))
                            for ids in range(int(k), min(step + int(k), data_num))]
                elif data_type == 'dense':
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ",".join([str(round(i, 4)) for i in np.random.randn(int(v))]))
                            for ids in range(int(k), min(step + int(k), data_num))]
        data_num = end_num - start_num
        step = 10000 if data_num > 10000 else max(1, int(data_num / 10))
        # one (start_key, feature_nums) pair per chunk; ceil division keeps the remainder chunk
        table_list = [(f"{i * step}", f"{feature_nums}") for i in range(-(-data_num // step))]
        table = sess.computing.parallelize(table_list, partition=partition, include_key=True)
        table = table.flatMap(expand_id_range)
        if label_flag:
            schema = {"sid": "id", "header": ",".join(["y"] + [f"x{i}" for i in range(feature_nums)])}
        else:
            schema = {"sid": "id", "header": ",".join([f"x{i}" for i in range(feature_nums)])}
        if data_type != "dense":
            schema = None

        h_table = sess.get_table(name=table_name, namespace=namespace)
        if h_table:
            h_table.destroy()

        table_meta = sess.persistent(computing_table=table, name=table_name, namespace=namespace, schema=schema)

        storage_session = sess.storage()
        s_table = storage_session.get_table(namespace=table_meta.get_namespace(), name=table_meta.get_name())
        if s_table.count() == data_num:
            progress.set_time_percent(100)
        from fate_flow.manager.data_manager import DataTableTracker
        DataTableTracker.create_table_tracker(
            table_name=table_name,
            table_namespace=namespace,
            entity_info={}
        )
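Usage sketch (hypothetical): this generator leans on names from its enclosing scope — an active session `sess`, plus `id_encryption`, `encryption_type` and `sparsity` — so a call, assuming those are already set up, might look like:

# minimal sketch, assuming the outer-scope names above are defined;
# `progress` is any object exposing set_time_percent()
_generate_parallelize_data(
    start_num=0, end_num=10000, feature_nums=20,
    table_name="breast_guest", namespace="experiment",
    label_flag=True, data_type="dense",
    partition=4, progress=progress)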
Example #2
 def log_output_data_table_tracker(cls, job_id, input_table_list,
                                   output_table_list):
     try:
         parent_number = 0
         if len(input_table_list) > 1 and len(output_table_list) > 1:
             # TODO
             return
         for input_table in input_table_list:
             for output_table in output_table_list:
                 DataTableTracker.create_table_tracker(
                     output_table.get("name"),
                     output_table.get("namespace"),
                     entity_info={
                         "have_parent": True,
                         "parent_table_namespace":
                         input_table.get("namespace"),
                         "parent_table_name": input_table.get("name"),
                         "parent_number": parent_number,
                         "job_id": job_id
                     })
             parent_number += 1
     except Exception as e:
         LOGGER.exception(e)
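A hedged call sketch, assuming this is a classmethod of DataTableTracker; the table dicts are hypothetical shapes inferred from the .get("name")/.get("namespace") accesses above:

# illustrative input/output tables as plain dicts
DataTableTracker.log_output_data_table_tracker(
    job_id="202201011200000000001",  # hypothetical job id
    input_table_list=[{"name": "raw_guest", "namespace": "experiment"}],
    output_table_list=[{"name": "reader_out", "namespace": "output_data"}])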
Example #3
def table_bind():
    request_data = request.json
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(
        storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get(
        "in_serialized", 1 if engine in {
            storage.StorageEngine.STANDALONE, storage.StorageEngine.EGGROLL,
            storage.StorageEngine.MYSQL, storage.StorageEngine.PATH
        } else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(
                retcode=100,
                retmsg='The data table already exists. '
                'If you still want to continue uploading, please add the parameter --drop'
            )
    id_column = request_data.get("id_column") or request_data.get("id_name")
    feature_column = request_data.get("feature_column") or request_data.get(
        "feature_name")
    schema = None
    if id_column and feature_column:
        schema = {'header': feature_column, 'sid': id_column}
    elif id_column:
        schema = {'sid': id_column, 'header': ''}
    sess = Session()
    storage_session = sess.storage(storage_engine=engine,
                                   options=request_data.get("options"))
    table = storage_session.create_table(
        address=address,
        name=name,
        namespace=namespace,
        partitions=request_data.get('partitions', None),
        hava_head=request_data.get("head"),
        schema=schema,
        id_delimiter=request_data.get("id_delimiter"),
        in_serialized=in_serialized,
        origin=request_data.get("origin", StorageTableOrigin.TABLE_BIND))
    response = get_json_result(data={
        "table_name": name,
        "namespace": namespace
    })
    if not table.check_address():
        response = get_json_result(
            retcode=100,
            retmsg=f'engine {engine} address {address_dict} check failed')
    else:
        DataTableTracker.create_table_tracker(
            table_name=name,
            table_namespace=namespace,
            entity_info={"have_parent": False},
        )
    sess.destroy_all_sessions()
    return response
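An illustrative request body for this bind endpoint; the keys mirror the request_data.get(...) reads above, and the MYSQL address fields are an assumption about that engine's address schema:

{
    "engine": "MYSQL",
    "address": {"user": "fate", "passwd": "fate", "host": "127.0.0.1",
                "port": 3306, "db": "experiment", "name": "breast_guest"},
    "name": "breast_guest",
    "namespace": "experiment",
    "head": 1,
    "id_delimiter": ",",
    "id_column": "id",
    "feature_column": "x0,x1,x2",
    "drop": 0
}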
Example #4
def table_tracking_job():
    request_info = request.json
    data = DataTableTracker.track_job(request_info.get("table_name"),
                                      request_info.get("namespace"),
                                      display=True)
    return get_json_result(data=data)
Example #5
def table_tracking():
    request_info = request.json
    data = DataTableTracker.get_parent_table(request_info.get("table_name"),
                                             request_info.get("namespace"))
    return get_json_result(data=data)
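Both tracking endpoints above read the same two fields, so a hedged request body (values illustrative) is simply:

{"table_name": "breast_guest", "namespace": "experiment"}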
Example #6
    def _run(self, cpn_input: ComponentInputProtocol):
        self.parameters = cpn_input.parameters
        self.job_parameters = cpn_input.job_parameters
        output_storage_address = self.job_parameters.engines_address[
            EngineType.STORAGE]
        # only support one input table
        table_key = next(iter(self.parameters))

        input_table_namespace, input_table_name = self.get_input_table_info(
            parameters=self.parameters[table_key],
            role=self.tracker.role,
            party_id=self.tracker.party_id,
        )
        (
            output_table_namespace,
            output_table_name,
        ) = default_output_info(
            task_id=self.tracker.task_id,
            task_version=self.tracker.task_version,
            output_type="data",
        )
        (
            input_table_meta,
            output_table_address,
            output_table_engine,
        ) = self.convert_check(
            input_name=input_table_name,
            input_namespace=input_table_namespace,
            output_name=output_table_name,
            output_namespace=output_table_namespace,
            computing_engine=self.job_parameters.computing_engine,
            output_storage_address=output_storage_address,
        )
        sess = Session.get_global()

        input_table = sess.get_table(
            name=input_table_meta.get_name(),
            namespace=input_table_meta.get_namespace())
        # update real count to meta info
        input_table.count()
        # Table replication is required
        if input_table_meta.get_engine() != output_table_engine:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} engine input table needs to be converted "
                f"to the {output_table_engine} engine to support computing engine "
                f"{self.job_parameters.computing_engine}"
            )
        else:
            LOGGER.info(
                f"the {input_table_meta.get_engine()} input table only needs a format transform"
            )
        LOGGER.info("reader create storage session2")
        output_table_session = sess.storage(storage_engine=output_table_engine)
        output_table = output_table_session.create_table(
            address=output_table_address,
            name=output_table_name,
            namespace=output_table_namespace,
            partitions=input_table_meta.partitions,
            origin=StorageTableOrigin.READER)
        self.save_table(src_table=input_table, dest_table=output_table)
        # update real count to meta info
        output_table_meta = StorageTableMeta(name=output_table.name,
                                             namespace=output_table.namespace)
        # todo: may be set output data, and executor support pass persistent
        self.tracker.log_output_data_info(
            data_name=cpn_input.flow_feeded_parameters.get("output_data_name")[0]
            if cpn_input.flow_feeded_parameters.get("output_data_name")
            else table_key,
            table_namespace=output_table_meta.get_namespace(),
            table_name=output_table_meta.get_name(),
        )
        DataTableTracker.create_table_tracker(
            output_table_meta.get_name(),
            output_table_meta.get_namespace(),
            entity_info={
                "have_parent": True,
                "parent_table_namespace": input_table_namespace,
                "parent_table_name": input_table_name,
                "job_id": self.tracker.job_id,
            },
        )
        headers_str = output_table_meta.get_schema().get("header")
        table_info = {}
        if output_table_meta.get_schema() and headers_str:
            if isinstance(headers_str, str):
                data_list = [headers_str.split(",")]
                is_display = True
            else:
                data_list = [headers_str]
                is_display = False
            if is_display:
                for data in output_table_meta.get_part_of_data():
                    data_list.append(data[1].split(","))
                # transpose so each row holds one column: (header, sample values...)
                for column in np.array(data_list).transpose():
                    table_info[column[0]] = ",".join(list(set(column[1:]))[:5])
        data_info = {
            "table_name": input_table_name,
            "namespace": input_table_namespace,
            "table_info": table_info,
            "partitions": output_table_meta.get_partitions(),
            "storage_engine": output_table_meta.get_engine(),
        }
        if input_table_meta.get_engine() in [StorageEngine.PATH]:
            data_info["file_count"] = output_table_meta.get_count()
            data_info["file_path"] = input_table_meta.get_address().path
        else:
            data_info["count"] = output_table_meta.get_count()

        self.tracker.set_metric_meta(
            metric_namespace="reader_namespace",
            metric_name="reader_name",
            metric_meta=MetricMeta(name="reader",
                                   metric_type="data_info",
                                   extra_metas=data_info),
        )
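The table_info block above previews each column by transposing the header plus sample rows; a minimal standalone sketch of the same trick, with made-up data:

import numpy as np

data_list = [["y", "x0", "x1"]]                          # header row
data_list += [["1", "0.5", "0.9"], ["0", "0.5", "0.7"]]  # sample rows
table_info = {}
for column in np.array(data_list).transpose():
    # column[0] is the header name; keep at most 5 distinct sample values
    table_info[column[0]] = ",".join(list(set(column[1:]))[:5])
print(table_info)  # e.g. {'y': '1,0', 'x0': '0.5', 'x1': '0.9,0.7'}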
Example #7
    def _run(self, cpn_input: ComponentInputProtocol):
        self.parameters = cpn_input.parameters
        if self.parameters.get("namespace") and self.parameters.get("name"):
            namespace = self.parameters.get("namespace")
            name = self.parameters.get("name")
        elif cpn_input.flow_feeded_parameters.get("table_info"):
            namespace = cpn_input.flow_feeded_parameters.get(
                "table_info")[0].get("namespace")
            name = cpn_input.flow_feeded_parameters.get("table_info")[0].get(
                "name")
        else:
            raise Exception("no found name or namespace in input parameters")
        LOGGER.info(f"writer parameters:{self.parameters}")
        src_table = Session.get_global().get_table(name=name,
                                                   namespace=namespace)
        output_name = self.parameters.get("output_name")
        output_namespace = self.parameters.get("output_namespace")
        engine = self.parameters.get("storage_engine")
        address_dict = self.parameters.get("address")

        if output_name and output_namespace:
            table_meta = src_table.meta.to_dict()
            address_dict = src_table.meta.get_address().__dict__
            engine = src_table.meta.get_engine()
            table_meta.update({
                "name": output_name,
                "namespace": output_namespace,
                "address": self._create_save_address(engine, address_dict,
                                                     output_name, output_namespace),
            })
            src_table.save_as(**table_meta)
            # track the saved output table, with the source table as its parent
            DataTableTracker.create_table_tracker(
                output_name,
                output_namespace,
                entity_info={
                    "have_parent": True,
                    "parent_table_namespace": namespace,
                    "parent_table_name": name,
                    "job_id": self.tracker.job_id,
                })

        elif engine and address_dict:
            save_data_to_external_storage(engine, address_dict, src_table)

        LOGGER.info("save success")
        self.tracker.log_output_data_info(
            data_name="writer",
            table_namespace=output_namespace,
            table_name=output_name,
        )
        self.tracker.log_metric_data(metric_namespace="writer",
                                     metric_name="writer",
                                     metrics=[
                                         Metric("count",
                                                src_table.meta.get_count()),
                                         Metric("storage_engine", engine)
                                     ])
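Two hedged parameter shapes for this writer, inferred from the branches above (all values illustrative): a save-as within FATE storage, and a push to external storage:

# save-as path: output_name/output_namespace trigger src_table.save_as(...)
save_as_params = {
    "name": "reader_out", "namespace": "output_data",
    "output_name": "copy_out", "output_namespace": "output_data",
}
# external path: storage_engine + address trigger save_data_to_external_storage(...)
external_params = {
    "name": "reader_out", "namespace": "output_data",
    "storage_engine": "MYSQL",  # hypothetical target engine
    "address": {"user": "fate", "passwd": "fate", "host": "127.0.0.1",
                "port": 3306, "db": "external_db", "name": "copy_out"},
}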
Example #8
 def _run(self, cpn_input: ComponentInputProtocol):
     self.parameters = cpn_input.parameters
     LOGGER.info(self.parameters)
     self.parameters["role"] = cpn_input.roles["role"]
     self.parameters["local"] = cpn_input.roles["local"]
     storage_engine = self.parameters["storage_engine"].upper()
     storage_address = self.parameters["storage_address"]
     # if not set storage, use job storage as default
     if not storage_engine:
         storage_engine = cpn_input.job_parameters.storage_engine
     self.storage_engine = storage_engine
     if not storage_address:
         storage_address = cpn_input.job_parameters.engines_address[
             EngineType.STORAGE]
     job_id = self.task_version_id.split("_")[0]
     if not os.path.isabs(self.parameters.get("file", "")):
         self.parameters["file"] = os.path.join(get_fate_flow_directory(),
                                                self.parameters["file"])
     if not os.path.exists(self.parameters["file"]):
         raise Exception("%s is not exist, please check the configure" %
                         (self.parameters["file"]))
     if not os.path.getsize(self.parameters["file"]):
         raise Exception("%s is an empty file" % (self.parameters["file"]))
     name, namespace = self.parameters.get("name"), self.parameters.get(
         "namespace")
     _namespace, _table_name = self.generate_table_name(
         self.parameters["file"])
     if namespace is None:
         namespace = _namespace
     if name is None:
         name = _table_name
     read_head = self.parameters["head"]
     if read_head == 0:
         head = False
     elif read_head == 1:
         head = True
     else:
         raise Exception("'head' in conf.json should be 0 or 1")
     partitions = self.parameters["partition"]
     if partitions <= 0 or partitions >= self.MAX_PARTITIONS:
         raise Exception(
             "invalid number of partitions, it should be between %d and %d" %
             (0, self.MAX_PARTITIONS))
     self.session_id = job_utils.generate_session_id(
         self.tracker.task_id,
         self.tracker.task_version,
         self.tracker.role,
         self.tracker.party_id,
     )
     sess = Session.get_global()
     self.session = sess
     if self.parameters.get("destroy", False):
         table = sess.get_table(namespace=namespace, name=name)
         if table:
             LOGGER.info(
                 f"destroy table name: {name} namespace: {namespace} engine: {table.engine}"
             )
             try:
                 table.destroy()
             except Exception as e:
                 LOGGER.error(e)
         else:
             LOGGER.info(
                 f"table not found, name: {name} namespace: {namespace}, skip destroy"
             )
     address_dict = storage_address.copy()
     storage_session = sess.storage(storage_engine=storage_engine,
                                    options=self.parameters.get("options"))
     upload_address = {}
     if storage_engine in {StorageEngine.EGGROLL, StorageEngine.STANDALONE}:
         upload_address = {
             "name": name,
             "namespace": namespace,
             "storage_type": EggRollStoreType.ROLLPAIR_LMDB,
         }
     elif storage_engine in {StorageEngine.MYSQL, StorageEngine.HIVE}:
         if not address_dict.get("db") or not address_dict.get("name"):
             upload_address = {"db": namespace, "name": name}
     elif storage_engine in {StorageEngine.PATH}:
         upload_address = {"path": self.parameters["file"]}
     elif storage_engine in {StorageEngine.HDFS}:
         upload_address = {
             "path": default_input_fs_path(
                 name=name,
                 namespace=namespace,
                 prefix=address_dict.get("path_prefix"))
         }
     elif storage_engine in {StorageEngine.LOCALFS}:
         upload_address = {
             "path": default_input_fs_path(
                 name=name,
                 namespace=namespace,
                 storage_engine=storage_engine)
         }
     else:
         raise RuntimeError(
             f"unsupported storage engine: {storage_engine}")
     address_dict.update(upload_address)
     LOGGER.info(
         f"upload to {storage_engine} storage, address: {address_dict}")
     address = storage.StorageTableMeta.create_address(
         storage_engine=storage_engine, address_dict=address_dict)
     self.parameters["partitions"] = partitions
     self.parameters["name"] = name
     self.table = storage_session.create_table(
         address=address,
         origin=StorageTableOrigin.UPLOAD,
         **self.parameters)
     if storage_engine not in [StorageEngine.PATH]:
         data_table_count = self.save_data_table(job_id, name, namespace,
                                                 storage_engine, head)
     else:
         data_table_count = self.get_data_table_count(
             self.parameters["file"], name, namespace)
     self.table.meta.update_metas(in_serialized=True)
     DataTableTracker.create_table_tracker(
         table_name=name,
         table_namespace=namespace,
         entity_info={
             "job_id": job_id,
             "have_parent": False
         },
     )
     LOGGER.info("------------load data finish!-----------------")
     # rm tmp file
     try:
         if "{}/fate_upload_tmp".format(job_id) in self.parameters["file"]:
             LOGGER.info("remove tmp upload file")
             LOGGER.info(os.path.dirname(self.parameters["file"]))
             shutil.rmtree(os.path.dirname(self.parameters["file"]))
     except Exception:
         LOGGER.info("remove tmp file failed")
     LOGGER.info("file: {}".format(self.parameters["file"]))
     LOGGER.info("total data_count: {}".format(data_table_count))
     LOGGER.info("table name: {}, table namespace: {}".format(
         name, namespace))
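A hedged sketch of the minimal upload parameters this component reads; the keys mirror the self.parameters accesses in _run above, the values are illustrative:

{
    "file": "examples/data/breast_hetero_guest.csv",  # relative paths resolve under the fate_flow directory
    "head": 1,                  # 1: first row is a header, 0: no header
    "partition": 4,             # must lie in (0, MAX_PARTITIONS)
    "name": "breast_guest",     # optional; derived from the file name when omitted
    "namespace": "experiment",  # optional; derived when omitted
    "storage_engine": "",       # empty: fall back to the job-level storage engine
    "storage_address": None,    # empty: fall back to the job-level storage address
    "destroy": False            # drop an existing table with the same name first
}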