def write(
    self,
    feature_set: FeatureSet,
    dataframe: DataFrame,
    spark_client: SparkClient,
):
    """Load a feature set's data into the Historical Feature Store.

    When ``debug_mode`` is enabled nothing is written to the real store;
    instead a temporary view named
    ``historical_feature_store__{feature_set.name}`` is registered so the
    output can be inspected.

    Args:
        feature_set: object processed with feature_set informations.
        dataframe: spark dataframe containing data from a feature set.
        spark_client: client for spark connections with external services.
    """
    partitioned_df = self._create_partitions(dataframe)

    # Debug runs only expose the data through a temporary view and stop.
    if self.debug_mode:
        spark_client.create_temporary_view(
            dataframe=partitioned_df,
            name=f"historical_feature_store__{feature_set.name}",
        )
        return

    # Storage layout: historical/<entity>/<feature set name>.
    storage_key = os.path.join("historical", feature_set.entity, feature_set.name)
    spark_client.write_table(
        dataframe=partitioned_df,
        database=self.database,
        table_name=feature_set.name,
        partition_by=self.PARTITION_BY,
        **self.db_config.get_options(storage_key),
    )
def test_create_temporary_view(
    self, target_df: DataFrame, spark_session: SparkSession
) -> None:
    """Registering a temporary view should expose exactly the same rows."""
    # arrange
    client = SparkClient()

    # act
    client.create_temporary_view(target_df, "temp_view")
    recovered_df = spark_session.table("temp_view")

    # assert
    assert_dataframe_equality(target_df, recovered_df)
def write(
    self,
    feature_set: FeatureSet,
    dataframe: DataFrame,
    spark_client: SparkClient,
) -> None:
    """Load a feature set's data into the Historical Feature Store.

    When ``debug_mode`` is enabled nothing is written to the real store;
    instead a temporary view named
    ``historical_feature_store__{feature_set.name}`` is registered so the
    output can be inspected.

    Args:
        feature_set: object processed with feature_set informations.
        dataframe: spark dataframe containing data from a feature set.
        spark_client: client for spark connections with external services.

    Raises:
        RuntimeError: if ``interval_mode`` is on but the Spark session is
            not configured with dynamic partition overwrite.
    """
    prepared_df = self._apply_transformations(self._create_partitions(dataframe))

    # Incremental (interval) loads overwrite individual partitions, which
    # Spark only does safely when partitionOverwriteMode is "dynamic".
    if self.interval_mode:
        overwrite_mode = spark_client.conn.conf.get(
            "spark.sql.sources.partitionOverwriteMode"
        ).lower()
        if overwrite_mode != "dynamic":
            raise RuntimeError(
                f"m=load_incremental_table, "
                f"spark.sql.sources.partitionOverwriteMode={overwrite_mode}, "
                f"msg=partitionOverwriteMode have to "
                f"be configured to 'dynamic'"
            )

    # Debug runs only expose the data through a temporary view and stop.
    if self.debug_mode:
        spark_client.create_temporary_view(
            dataframe=prepared_df,
            name=f"historical_feature_store__{feature_set.name}",
        )
        return

    # Storage layout: historical/<entity>/<feature set name>.
    storage_key = os.path.join("historical", feature_set.entity, feature_set.name)
    spark_client.write_table(
        dataframe=prepared_df,
        database=self.database,
        table_name=feature_set.name,
        partition_by=self.PARTITION_BY,
        **self.db_config.get_options(storage_key),
    )
def _write_in_debug_mode(
    table_name: str,
    dataframe: DataFrame,
    spark_client: SparkClient,
) -> Union[StreamingQuery, None]:
    """Register a temporary view instead of writing to the real feature store.

    The view is named ``online_feature_store__{table_name}`` so that debug
    consumers can query the would-be output.
    """
    view_name = f"online_feature_store__{table_name}"
    return spark_client.create_temporary_view(
        dataframe=dataframe,
        name=view_name,
    )