def test_sink(input_dataframe, feature_set):
    # arrange
    client = SparkClient()
    feature_set_df = feature_set.construct(input_dataframe, client)
    target_latest_df = OnlineFeatureStoreWriter.filter_latest(
        feature_set_df, id_columns=[key.name for key in feature_set.keys]
    )
    columns_sort = feature_set_df.schema.fieldNames()

    # setup historical writer
    s3config = Mock()
    s3config.get_options = Mock(
        return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": "test_folder/historical/entity/feature_set",
        }
    )
    historical_writer = HistoricalFeatureStoreWriter(db_config=s3config)

    # setup online writer
    # TODO: Change for CassandraConfig when Cassandra for test is ready
    online_config = Mock()
    online_config.mode = "overwrite"
    online_config.format_ = "parquet"
    online_config.get_options = Mock(
        return_value={"path": "test_folder/online/entity/feature_set"}
    )
    online_writer = OnlineFeatureStoreWriter(db_config=online_config)

    writers = [historical_writer, online_writer]
    sink = Sink(writers)

    # act
    client.sql(
        "CREATE DATABASE IF NOT EXISTS {}".format(historical_writer.database)
    )
    sink.flush(feature_set, feature_set_df, client)

    # get historical results
    historical_result_df = client.read_table(
        feature_set.name, historical_writer.database
    )

    # get online results
    online_result_df = client.read(
        online_config.format_,
        options=online_config.get_options(feature_set.name),
    )

    # assert historical results
    assert sorted(feature_set_df.select(*columns_sort).collect()) == sorted(
        historical_result_df.select(*columns_sort).collect()
    )

    # assert online results
    assert sorted(target_latest_df.select(*columns_sort).collect()) == sorted(
        online_result_df.select(*columns_sort).collect()
    )

    # tear down
    shutil.rmtree("test_folder")
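# The test above tears down with a trailing shutil.rmtree("test_folder"), which
# is skipped when an assertion fails and leaves the folder behind for the next
# run. A minimal sketch of an alternative (an assumption, not part of the
# original test module): an autouse pytest fixture that always cleans up.
import shutil

import pytest


@pytest.fixture(autouse=True)
def cleanup_test_folder():
    # Run the test first, then remove the scratch folder regardless of outcome.
    yield
    shutil.rmtree("test_folder", ignore_errors=True)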
def construct(self, client: SparkClient) -> DataFrame:
    """Construct an entry point dataframe for a feature set.

    This method assembles multiple readers by building each one and then
    querying them with Spark SQL. The resulting dataframe is cached; since
    cache() in Spark is lazy, a count() action is triggered to force
    persistence.

    Args:
        client: client responsible for connecting to the Spark session.

    Returns:
        DataFrame with the query result against all readers.

    """
    for reader in self.readers:
        reader.build(client)  # create temporary views for each reader

    dataframe = client.sql(self.query)

    if not dataframe.isStreaming:
        dataframe.cache().count()

    return dataframe
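# A minimal usage sketch of the construct flow above: each reader registers a
# temporary view, the SQL query joins them, and the batch result is cached
# eagerly. The Source and TableReader names and constructor arguments below are
# illustrative assumptions, not definitions taken from this module.
client = SparkClient()

source = Source(
    readers=[
        TableReader(id="a", database="db", table="table_a"),
        TableReader(id="b", database="db", table="table_b"),
    ],
    query="SELECT a.id, a.timestamp, b.feature FROM a JOIN b ON a.id = b.id",
)

# Builds the readers' temp views, runs the query, and caches the batch result.
entry_point_df = source.construct(client)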
def test_sql(self, target_df):
    # arrange
    spark_client = SparkClient()
    create_temp_view(target_df, "test")

    # act
    result_df = spark_client.sql("select * from test")

    # assert
    assert result_df.collect() == target_df.collect()
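# The test above depends on a create_temp_view helper that is not shown here.
# A minimal sketch of such a helper (an assumption about its implementation),
# registering a dataframe as a Spark SQL temporary view so it can be queried:
from pyspark.sql import DataFrame


def create_temp_view(dataframe: DataFrame, name: str) -> None:
    # Make the dataframe queryable via spark.sql("select * from <name>").
    dataframe.createOrReplaceTempView(name)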