def test__create_partitions(self, spark_session, spark_context):
    # arrange
    start = datetime.datetime(year=1970, month=1, day=1)
    end = datetime.datetime(year=2020, month=12, day=31)
    random_dates = [
        (
            lambda: start
            + datetime.timedelta(
                seconds=random.randint(  # noqa: S311
                    0, int((end - start).total_seconds())
                )
            )
        )()
        .date()
        .isoformat()
        for _ in range(10000)
    ]
    data = [{"timestamp": date} for date in random_dates]
    input_df = spark_session.read.json(
        spark_context.parallelize(data, 1), schema="timestamp timestamp"
    )

    writer = HistoricalFeatureStoreWriter()

    # act
    result_df = writer._create_partitions(input_df)

    # assert
    assert result_df.select("year", "month", "day").distinct().count() == len(
        set(random_dates)
    )
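# Note: a minimal sketch of the partitioning logic exercised above, assuming
# _create_partitions derives "year", "month" and "day" columns from the
# "timestamp" column (names and behavior inferred from the assertion, not
# confirmed against the writer's implementation):
#
#     from pyspark.sql.functions import dayofmonth, month, year
#
#     def _create_partitions_sketch(df):
#         return (
#             df.withColumn("year", year("timestamp"))
#             .withColumn("month", month("timestamp"))
#             .withColumn("day", dayofmonth("timestamp"))
#         )
#
# Under that assumption there is exactly one distinct (year, month, day) tuple
# per distinct input date, which is what the count assertion checks.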
def test_write(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    mocker,
    feature_set,
):
    # given
    spark_client = mocker.stub("spark_client")
    spark_client.write_table = mocker.stub("write_table")
    writer = HistoricalFeatureStoreWriter()

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    result_df = spark_client.write_table.call_args[1]["dataframe"]

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)

    assert (
        writer.db_config.format_ == spark_client.write_table.call_args[1]["format_"]
    )
    assert writer.db_config.mode == spark_client.write_table.call_args[1]["mode"]
    assert (
        writer.PARTITION_BY == spark_client.write_table.call_args[1]["partition_by"]
    )
    assert feature_set.name == spark_client.write_table.call_args[1]["table_name"]
def test__repartition_df(self, spark_session, spark_context):
    # arrange
    start = datetime.datetime(year=1970, month=1, day=1)
    end = datetime.datetime(year=2020, month=12, day=31)
    random_dates = [
        (
            lambda: start
            + datetime.timedelta(
                seconds=random.randint(  # noqa: S311
                    0, int((end - start).total_seconds())
                )
            )
        )()
        .date()
        .isoformat()
        for _ in range(10000)
    ]
    data = [{"timestamp": date} for date in random_dates]
    input_df = spark_session.read.json(
        spark_context.parallelize(data, 1), schema="timestamp timestamp"
    )

    writer = HistoricalFeatureStoreWriter()

    # act
    result_df = writer._create_partitions(input_df)

    # assert
    # Only one partition id, meaning data is not partitioned
    assert input_df.select(spark_partition_id()).distinct().count() == 1
    # Desired number of partitions
    assert result_df.select(spark_partition_id()).distinct().count() == 200
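# Note: the final assertion (200 distinct partition ids) suggests the writer
# repartitions the dataframe by the generated partition columns into a target
# number of partitions; 200 also happens to be Spark's default value for
# spark.sql.shuffle.partitions. A hedged sketch of the assumed step, with a
# hypothetical helper name and defaults chosen for illustration only:
#
#     def repartition_sketch(df, partition_by=("year", "month", "day"),
#                            num_partitions=200):
#         return df.repartition(num_partitions, *partition_by)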
def test__assert_validation_count(self, written_count, dataframe_count, threshold):
    # arrange
    writer = (
        HistoricalFeatureStoreWriter(validation_threshold=threshold)
        if threshold
        else HistoricalFeatureStoreWriter()
    )

    # act and assert
    writer._assert_validation_count("table", written_count, dataframe_count)
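# Note: a minimal sketch of the validation rule assumed by the test above:
# _assert_validation_count is expected to compare the written row count with
# the source dataframe row count and tolerate a relative difference up to the
# writer's validation threshold. The helper name, signature, and default
# threshold below are hypothetical, for illustration only:
#
#     def _assert_validation_count_sketch(table, written_count, dataframe_count,
#                                          threshold=0.01):
#         ratio = abs(written_count - dataframe_count) / dataframe_count
#         assert ratio <= threshold, f"count mismatch for {table}: {ratio:.2%}"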
def test_validate(self, feature_set_dataframe, mocker, feature_set):
    # given
    spark_client = mocker.stub("spark_client")
    spark_client.read_table = mocker.stub("read_table")
    spark_client.read_table.return_value = feature_set_dataframe
    writer = HistoricalFeatureStoreWriter()

    # when
    writer.validate(feature_set, feature_set_dataframe, spark_client)

    # then
    spark_client.read_table.assert_called_once()
def test_validate_false(self, feature_set_dataframe, mocker, feature_set):
    # given
    spark_client = mocker.stub("spark_client")
    spark_client.read_table = mocker.stub("read_table")

    # limiting df to 1 row, so the counts shouldn't be the same
    spark_client.read_table.return_value = feature_set_dataframe.limit(1)

    writer = HistoricalFeatureStoreWriter()

    # when
    with pytest.raises(AssertionError):
        _ = writer.validate(feature_set, feature_set_dataframe, spark_client)
def test_flush(self, feature_set_dataframe, mocker):
    # given
    spark_client = SparkClient()

    writer = [
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(),
    ]

    for w in writer:
        w.write = mocker.stub("write")

    feature_set = mocker.stub("feature_set")
    feature_set.entity = "house"
    feature_set.name = "test"

    # when
    sink = Sink(writers=writer)
    sink.flush(
        dataframe=feature_set_dataframe,
        feature_set=feature_set,
        spark_client=spark_client,
    )

    # then
    for w in writer:
        w.write.assert_called_once()
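# Note: the flush behavior asserted above can be pictured with this hedged
# sketch, assuming Sink.flush simply delegates the dataframe to each writer
# (illustrative only, not the library's actual implementation):
#
#     def flush_sketch(writers, dataframe, feature_set, spark_client):
#         for writer in writers:
#             writer.write(
#                 feature_set=feature_set,
#                 dataframe=dataframe,
#                 spark_client=spark_client,
#             )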
def test_sink_raise(self):
    with pytest.raises(ValueError, match="sink must be a Sink instance"):
        FeatureSetPipeline(
            spark_client=SparkClient(),
            source=Mock(
                spec=Source,
                readers=[
                    TableReader(id="source_a", database="db", table="table"),
                ],
                query="select * from source_a",
            ),
            feature_set=Mock(
                spec=FeatureSet,
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.FLOAT,
                    ),
                    Feature(
                        name="ts",
                        description="The timestamp feature",
                        dtype=DataType.TIMESTAMP,
                    ),
                ],
                key_columns=["user_id"],
                timestamp_column="ts",
            ),
            sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)]),
        )
def test_sink(input_dataframe, feature_set):
    # arrange
    client = SparkClient()
    feature_set_df = feature_set.construct(input_dataframe, client)
    target_latest_df = OnlineFeatureStoreWriter.filter_latest(
        feature_set_df, id_columns=[key.name for key in feature_set.keys]
    )
    columns_sort = feature_set_df.schema.fieldNames()

    # setup historical writer
    s3config = Mock()
    s3config.get_options = Mock(
        return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": "test_folder/historical/entity/feature_set",
        }
    )
    historical_writer = HistoricalFeatureStoreWriter(db_config=s3config)

    # setup online writer
    # TODO: Change for CassandraConfig when Cassandra for test is ready
    online_config = Mock()
    online_config.mode = "overwrite"
    online_config.format_ = "parquet"
    online_config.get_options = Mock(
        return_value={"path": "test_folder/online/entity/feature_set"}
    )
    online_writer = OnlineFeatureStoreWriter(db_config=online_config)

    writers = [historical_writer, online_writer]
    sink = Sink(writers)

    # act
    client.sql(
        "CREATE DATABASE IF NOT EXISTS {}".format(historical_writer.database)
    )
    sink.flush(feature_set, feature_set_df, client)

    # get historical results
    historical_result_df = client.read_table(
        feature_set.name, historical_writer.database
    )

    # get online results
    online_result_df = client.read(
        online_config.format_, options=online_config.get_options(feature_set.name)
    )

    # assert historical results
    assert sorted(feature_set_df.select(*columns_sort).collect()) == sorted(
        historical_result_df.select(*columns_sort).collect()
    )

    # assert online results
    assert sorted(target_latest_df.select(*columns_sort).collect()) == sorted(
        online_result_df.select(*columns_sort).collect()
    )

    # tear down
    shutil.rmtree("test_folder")
def test_run_with_repartition(self, spark_session):
    test_pipeline = FeatureSetPipeline(
        spark_client=SparkClient(),
        source=Mock(
            spec=Source,
            readers=[TableReader(id="source_a", database="db", table="table")],
            query="select * from source_a",
        ),
        feature_set=Mock(
            spec=FeatureSet,
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Mock(
            spec=Sink,
            writers=[HistoricalFeatureStoreWriter(db_config=None)],
        ),
    )

    # feature_set needs to return a real df for streaming validation
    sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}])
    test_pipeline.feature_set.construct.return_value = sample_df

    test_pipeline.run(partition_by=["id"])

    test_pipeline.source.construct.assert_called_once()
    test_pipeline.feature_set.construct.assert_called_once()
    test_pipeline.sink.flush.assert_called_once()
    test_pipeline.sink.validate.assert_called_once()
def test_write_in_debug_mode(
    self,
    feature_set_dataframe,
    historical_feature_set_dataframe,
    feature_set,
    spark_session,
):
    # given
    spark_client = SparkClient()
    writer = HistoricalFeatureStoreWriter(debug_mode=True)

    # when
    writer.write(
        feature_set=feature_set,
        dataframe=feature_set_dataframe,
        spark_client=spark_client,
    )
    result_df = spark_session.table(
        f"historical_feature_store__{feature_set.name}"
    )

    # then
    assert_dataframe_equality(historical_feature_set_dataframe, result_df)
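# Note: the debug-mode path asserted above implies the writer registers its
# output as a Spark temporary view named "historical_feature_store__<name>"
# instead of writing a physical table; a hedged sketch of that assumption
# (hypothetical helper, for illustration only):
#
#     def write_in_debug_mode_sketch(name, dataframe):
#         dataframe.createOrReplaceTempView(f"historical_feature_store__{name}")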
def test_feature_set_raise(self):
    with pytest.raises(
        ValueError, match="feature_set must be a FeatureSet instance"
    ):
        FeatureSetPipeline(
            spark_client=SparkClient(),
            source=Mock(
                spec=Source,
                readers=[
                    TableReader(id="source_a", database="db", table="table"),
                ],
                query="select * from source_a",
            ),
            feature_set=Mock(
                name="feature_set",
                entity="entity",
                description="description",
                keys=[
                    KeyFeature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.INTEGER,
                    )
                ],
                timestamp=TimestampFeature(from_column="ts"),
                features=[
                    Feature(
                        name="listing_page_viewed__rent_per_month",
                        description="Average of something.",
                        transformation=SparkFunctionTransform(
                            functions=[
                                Function(functions.avg, DataType.FLOAT),
                                Function(functions.stddev_pop, DataType.FLOAT),
                            ],
                        ).with_window(
                            partition_by="user_id",
                            order_by=TIMESTAMP_COLUMN,
                            window_definition=["7 days", "2 weeks"],
                            mode="fixed_windows",
                        ),
                    ),
                ],
            ),
            sink=Mock(
                spec=Sink,
                writers=[HistoricalFeatureStoreWriter(db_config=None)],
            ),
        )
def test_flush_with_invalid_df(self, not_feature_set_dataframe, mocker):
    # given
    spark_client = SparkClient()

    writer = [
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(),
    ]

    feature_set = mocker.stub("feature_set")
    feature_set.entity = "house"
    feature_set.name = "test"

    # when
    sink = Sink(writers=writer)

    # then
    with pytest.raises(ValueError):
        sink.flush(
            dataframe=not_feature_set_dataframe,
            feature_set=feature_set,
            spark_client=spark_client,
        )
def test_validate_false(self, feature_set_dataframe, mocker):
    # given
    spark_client = SparkClient()

    writer = [
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(),
    ]

    for w in writer:
        w.validate = mocker.stub("validate")
        w.validate.side_effect = AssertionError("test")

    feature_set = mocker.stub("feature_set")

    # when
    sink = Sink(writers=writer)

    # then
    with pytest.raises(RuntimeError):
        sink.validate(
            dataframe=feature_set_dataframe,
            feature_set=feature_set,
            spark_client=spark_client,
        )
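# Note: the RuntimeError expected above suggests Sink.validate collects the
# AssertionError from each failing writer and re-raises the failures together;
# a hedged sketch of that assumed behavior (illustrative only, not the actual
# implementation):
#
#     def validate_sketch(writers, dataframe, feature_set, spark_client):
#         failures = []
#         for writer in writers:
#             try:
#                 writer.validate(
#                     feature_set=feature_set,
#                     dataframe=dataframe,
#                     spark_client=spark_client,
#                 )
#             except AssertionError as e:
#                 failures.append(e)
#         if failures:
#             raise RuntimeError(f"The following validations failed: {failures}")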
def test_validate(self, feature_set_dataframe, mocker):
    # given
    spark_client = SparkClient()

    writer = [
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(),
    ]

    for w in writer:
        w.validate = mocker.stub("validate")

    feature_set = mocker.stub("feature_set")

    # when
    sink = Sink(writers=writer)
    sink.validate(
        dataframe=feature_set_dataframe,
        feature_set=feature_set,
        spark_client=spark_client,
    )

    # then
    for w in writer:
        w.validate.assert_called_once()
def test_feature_set_pipeline(
    self,
    mocked_df,
    spark_session,
    fixed_windows_output_feature_set_dataframe,
):
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.get_options = Mock(
        return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": "test_folder/historical/entity/feature_set",
        }
    )

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide,
                        column1="feature1",
                        column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(db_config=dbconfig)]),
    )
    test_pipeline.run()

    # assert
    path = dbconfig.get_options("historical/entity/feature_set").get("path")
    df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        test_pipeline.feature_set.timestamp_column
    )

    assert_dataframe_equality(df, target_df)

    # tear down
    shutil.rmtree("test_folder")
def test_feature_set_args(self):
    # arrange and act
    out_columns = [
        "user_id",
        "timestamp",
        "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_2_weeks_fixed_windows",  # noqa
    ]
    pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="source_a", database="db", table="table"),
                FileReader(id="source_b", path="path", format="parquet"),
            ],
            query="select a.*, b.specific_feature "
            "from source_a left join source_b on a.id=b.id",
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Sink(
            writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ],
        ),
    )

    assert isinstance(pipeline.spark_client, SparkClient)

    assert len(pipeline.source.readers) == 2
    assert all(isinstance(reader, Reader) for reader in pipeline.source.readers)
    assert isinstance(pipeline.source.query, str)

    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
    assert len(pipeline.feature_set.keys) == 1
    assert all(isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
    assert len(pipeline.feature_set.features) == 1
    assert all(
        isinstance(feature, Feature) for feature in pipeline.feature_set.features
    )
    assert pipeline.feature_set.columns == out_columns

    assert len(pipeline.sink.writers) == 2
    assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers)