Example #1
    def _get_flow_base_params(self, feature_group, num_chunks, size):
        # TODO(fabio): flow identifier is not unique
        return {
            "templateId": -1,
            "flowChunkSize": self.DEFAULT_FLOW_CHUNK_SIZE,
            "flowTotalSize": size,
            "flowIdentifier": util.feature_group_name(feature_group),
            "flowFilename": util.feature_group_name(feature_group),
            "flowRelativePath": util.feature_group_name(feature_group),
            "flowTotalChunks": num_chunks,
        }
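
The dictionary above carries the chunk-independent query parameters of what appears to be a flow.js-style resumable upload; the per-chunk fields (flowCurrentChunkSize, flowChunkNumber) are filled in by the upload methods in the later examples. A minimal sketch of how the two parameter sets combine, with hypothetical values standing in for the class attributes and the feature group name:

import math

# Hypothetical values standing in for self.DEFAULT_FLOW_CHUNK_SIZE and the payload size
CHUNK_SIZE = 1_048_576
total_size = 2_500_000
num_chunks = math.ceil(total_size / CHUNK_SIZE)   # -> 3

base_params = {
    "templateId": -1,
    "flowChunkSize": CHUNK_SIZE,
    "flowTotalSize": total_size,
    "flowIdentifier": "demo_fg_1",                 # placeholder feature group name
    "flowFilename": "demo_fg_1",
    "flowRelativePath": "demo_fg_1",
    "flowTotalChunks": num_chunks,
}

# Per chunk, the upload loop adds the two remaining fields before each request
first_chunk_params = dict(base_params, flowCurrentChunkSize=CHUNK_SIZE, flowChunkNumber=1)
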
Example #2
    def __init__(
        self,
        feature_store_id,
        feature_store_name,
        feature_group,
        spark_context,
        spark_session,
    ):
        self._feature_group = feature_group
        self._spark_context = spark_context
        self._spark_session = spark_session
        self._feature_store_id = feature_store_id
        self._feature_store_name = feature_store_name
        self._base_path = self._feature_group.location
        self._table_name = util.feature_group_name(feature_group)

        # Primary and partition key columns as comma-separated strings
        self._primary_key = ",".join(feature_group.primary_key)
        self._partition_key = (
            ",".join(feature_group.partition_key)
            if len(feature_group.partition_key) >= 1
            else ""
        )
        # Each partition-path column additionally carries a ":SIMPLE" suffix
        self._partition_path = (
            ":SIMPLE,".join(feature_group.partition_key) + ":SIMPLE"
            if len(feature_group.partition_key) >= 1
            else ""
        )
        # Fall back to the first primary key column when no precombine key is set
        self._pre_combine_key = (
            feature_group.hudi_precombine_key
            if feature_group.hudi_precombine_key
            else feature_group.primary_key[0]
        )

        # API clients for feature group metadata and storage connectors;
        # the connector exposes this feature store's connection string
        self._feature_group_api = feature_group_api.FeatureGroupApi(feature_store_id)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            self._feature_store_id
        )
        self._connstr = self._storage_connector_api.get(
            self._feature_store_name
        ).connection_string
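
The Hudi-related strings built in this constructor are plain comma-joined column lists; the partition path additionally tags every partition column with ":SIMPLE". A small standalone sketch of what those join expressions produce for a hypothetical partition key list:

# Hypothetical partition columns, to show what the join expressions evaluate to
partition_key = ["year", "month"]

partition_key_str = ",".join(partition_key) if len(partition_key) >= 1 else ""
partition_path = (
    ":SIMPLE,".join(partition_key) + ":SIMPLE" if len(partition_key) >= 1 else ""
)

print(partition_key_str)   # year,month
print(partition_path)      # year:SIMPLE,month:SIMPLE
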
Example #3
    def upload(self, feature_group, path, dataframe):
        # Serialize the dataframe to Parquet; with no path given, to_parquet() returns bytes
        df_parquet = dataframe.to_parquet(index=False)
        parquet_length = len(df_parquet)
        num_chunks = math.ceil(parquet_length / self.DEFAULT_FLOW_CHUNK_SIZE)

        base_params = self._get_flow_base_params(
            feature_group, num_chunks, parquet_length
        )

        chunk_number = 1
        for i in range(0, parquet_length, self.DEFAULT_FLOW_CHUNK_SIZE):
            # The base dict is reused; the per-chunk fields are overwritten on each iteration
            query_params = base_params
            query_params["flowCurrentChunkSize"] = len(
                df_parquet[i : i + self.DEFAULT_FLOW_CHUNK_SIZE]
            )
            query_params["flowChunkNumber"] = chunk_number

            self._upload_request(
                query_params,
                path,
                util.feature_group_name(feature_group),
                df_parquet[i : i + self.DEFAULT_FLOW_CHUNK_SIZE],
            )

            chunk_number += 1
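
DataFrame.to_parquet() returns the serialized bytes when no path is given (a Parquet engine such as pyarrow must be installed), so df_parquet can be sliced directly. A self-contained sketch of the same chunking arithmetic, with a hypothetical chunk size:

import math

CHUNK_SIZE = 1_048_576                 # hypothetical chunk size
payload = b"\x00" * 2_500_000          # stand-in for the Parquet bytes

num_chunks = math.ceil(len(payload) / CHUNK_SIZE)   # -> 3
chunk_number = 1
for i in range(0, len(payload), CHUNK_SIZE):
    chunk = payload[i : i + CHUNK_SIZE]
    # only the last chunk may be shorter than CHUNK_SIZE
    assert len(chunk) == CHUNK_SIZE or chunk_number == num_chunks
    chunk_number += 1
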
Example #4
    def upload(self, feature_group, path, dataframe):
        # Serialize the dataframe to CSV; with no path given, to_csv() returns a string
        df_csv = dataframe.to_csv(index=False)
        num_chunks = math.ceil(len(df_csv) / self.DEFAULT_FLOW_CHUNK_SIZE)

        base_params = self._get_flow_base_params(feature_group, num_chunks,
                                                 len(df_csv))

        chunks = [
            df_csv[i:i + self.DEFAULT_FLOW_CHUNK_SIZE]
            for i in range(0, len(df_csv), self.DEFAULT_FLOW_CHUNK_SIZE)
        ]

        chunk_number = 1
        for chunk in chunks:
            # The base dict is reused; the per-chunk fields are overwritten on each iteration
            query_params = base_params
            query_params["flowCurrentChunkSize"] = len(chunk)
            query_params["flowChunkNumber"] = chunk_number

            self._upload_request(query_params, path,
                                 util.feature_group_name(feature_group), chunk)

            chunk_number += 1
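
Apart from the serialization format, this CSV variant differs from the Parquet version in Example #3 only in how it slices the payload: it pre-builds the full list of string chunks before the loop, whereas the Parquet version slices the byte string on each iteration. The request pattern is identical in both: one _upload_request per chunk, with flowChunkNumber counted from 1 and flowCurrentChunkSize set to the length of the current slice.
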