def test_filtering(
    self,
    filtering_dataframe,
    key_id,
    timestamp_c,
    feature1,
    feature2,
    feature3,
    output_filtering_dataframe,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature1, feature2, feature3],
    )

    # act
    result_df = (
        feature_set.construct(filtering_dataframe, spark_client)
        .orderBy("timestamp")
        .collect()
    )

    # assert
    assert (
        result_df
        == output_filtering_dataframe.orderBy("timestamp")
        .select(feature_set.columns)
        .collect()
    )

def test_construct_transformations(
    self,
    dataframe,
    feature_set_dataframe,
    key_id,
    timestamp_c,
    feature_add,
    feature_divide,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act
    result_df = feature_set.construct(dataframe, spark_client)

    # assert
    assert_dataframe_equality(result_df, feature_set_dataframe)

def test_construct(
    self,
    dataframe,
    feature_set_dataframe,
    key_id,
    timestamp_c,
    feature_add,
    feature_divide,
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act
    result_df = feature_set.construct(dataframe, spark_client)
    result_columns = result_df.columns

    # assert
    assert (
        result_columns
        == key_id.get_output_columns()
        + timestamp_c.get_output_columns()
        + feature_add.get_output_columns()
        + feature_divide.get_output_columns()
    )
    assert_dataframe_equality(result_df, feature_set_dataframe)
    assert result_df.is_cached

def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide, column1="feature1", column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    # act
    output_df = (
        feature_set.construct(feature_set_dataframe, client=spark_client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)

def test_getters(self, feature_add, feature_divide, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act
    feature_set = FeatureSet(
        name,
        entity,
        description,
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # assert
    assert name == feature_set.name
    assert entity == feature_set.entity
    assert description == feature_set.description
    assert [key_id] == feature_set.keys
    assert timestamp_c == feature_set.timestamp
    assert [feature_add, feature_divide] == feature_set.features
    assert "timestamp" == feature_set.timestamp_column
    assert ["id"] == feature_set.keys_columns

def test_construct_invalid_df(
    self, key_id, timestamp_c, feature_add, feature_divide
):
    spark_client = Mock()

    # arrange
    feature_set = FeatureSet(
        "name",
        "entity",
        "description",
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )

    # act and assert
    with pytest.raises(ValueError):
        _ = feature_set.construct("not a dataframe", spark_client)

def get_db_schema(self, feature_set: FeatureSet):
    """Get desired database schema.

    Args:
        feature_set: object processed with feature set metadata.

    Returns:
        Desired database schema.

    """
    db_schema = self.db_config.translate(feature_set.get_schema())
    return db_schema

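# A minimal usage sketch for get_db_schema (not part of the original suite): it assumes
# the method above is exposed by a writer such as HistoricalFeatureStoreWriter, reuses
# this module's imports (Mock, HistoricalFeatureStoreWriter), and takes the feature_set
# fixture defined further below.
def test_get_db_schema_delegates_to_db_config(self, feature_set):
    # arrange: a mocked db_config whose translate() returns a canned schema
    writer = HistoricalFeatureStoreWriter(db_config=Mock())
    translated_schema = [{"column_name": "id", "type": "int", "primary_key": True}]
    writer.db_config.translate = Mock(return_value=translated_schema)

    # act
    db_schema = writer.get_db_schema(feature_set)

    # assert: the feature set schema is passed straight through db_config.translate
    writer.db_config.translate.assert_called_once_with(feature_set.get_schema())
    assert db_schema == translated_schema
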
def test_multiple_timestamps(self, feature_add, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"
    timestamp_c.get_output_columns = Mock(
        return_value=["timestamp1", "timestamp2"]
    )

    # act and assert
    with pytest.raises(ValueError):
        _ = FeatureSet(
            name, entity, description, [key_id], timestamp_c, [feature_add]
        )

def test_duplicate_features(self, feature_add, key_id, timestamp_c):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act and assert
    with pytest.raises(KeyError):
        _ = FeatureSet(
            name,
            entity,
            description,
            [key_id],
            timestamp_c,
            [feature_add, feature_add],
        )

def test_feature_without_datatype(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SQLExpressionTransform(expression="feature1 + a"),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)

def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(F.avg, DataType.FLOAT)]
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)

def test__get_features_columns(self):
    # arrange
    feature_1 = Feature("feature1", "description", DataType.FLOAT)
    feature_1.get_output_columns = Mock(return_value=["col_a", "col_b"])

    feature_2 = Feature("feature2", "description", DataType.FLOAT)
    feature_2.get_output_columns = Mock(return_value=["col_c"])

    feature_3 = Feature("feature3", "description", DataType.FLOAT)
    feature_3.get_output_columns = Mock(return_value=["col_d"])

    target_features_columns = ["col_a", "col_b", "col_c", "col_d"]

    # act
    result_features_columns = FeatureSet._get_features_columns(
        feature_1, feature_2, feature_3
    )

    # assert
    assert target_features_columns == result_features_columns

def feature_set():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column="timestamp")
    features = [
        Feature(name="feature", description="Description", dtype=DataType.FLOAT),
    ]
    return FeatureSet(
        "test_sink_feature_set",
        "test_sink_entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )

def test_columns(self, key_id, timestamp_c, feature_add, feature_divide):
    # arrange
    name = "name"
    entity = "entity"
    description = "description"

    # act
    fs = FeatureSet(
        name,
        entity,
        description,
        [key_id],
        timestamp_c,
        [feature_add, feature_divide],
    )
    out_columns = fs.columns

    # assert
    assert (
        out_columns
        == key_id.get_output_columns()
        + timestamp_c.get_output_columns()
        + feature_add.get_output_columns()
        + feature_divide.get_output_columns()
    )

def test_get_schema(self):
    expected_schema = [
        {"column_name": "id", "type": LongType(), "primary_key": True},
        {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
        {
            "column_name": "feature1__avg_over_2_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__avg_over_15_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_2_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_15_minutes_fixed_windows",
            "type": FloatType(),
            "primary_key": False,
        },
    ]

    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

    schema = feature_set.get_schema()

    assert schema == expected_schema

def test_feature_set_args(self):
    # arrange and act
    out_columns = [
        "user_id",
        "timestamp",
        "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__"
        "stddev_pop_over_2_weeks_fixed_windows",  # noqa
    ]
    pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(id="source_a", database="db", table="table",),
                FileReader(id="source_b", path="path", format="parquet",),
            ],
            query="select a.*, b.specific_feature "
            "from source_a left join source_b on a.id=b.id",
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Sink(
            writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ],
        ),
    )

    # assert
    assert isinstance(pipeline.spark_client, SparkClient)
    assert len(pipeline.source.readers) == 2
    assert all(isinstance(reader, Reader) for reader in pipeline.source.readers)
    assert isinstance(pipeline.source.query, str)
    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)
    assert len(pipeline.feature_set.keys) == 1
    assert all(isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)
    assert len(pipeline.feature_set.features) == 1
    assert all(
        isinstance(feature, Feature) for feature in pipeline.feature_set.features
    )
    assert pipeline.feature_set.columns == out_columns
    assert len(pipeline.sink.writers) == 2
    assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers)

def test_cannot_instantiate(
    self, name, entity, description, keys, timestamp, features
):
    # act and assert
    with pytest.raises(ValueError):
        FeatureSet(name, entity, description, keys, timestamp, features)

def test_feature_set_pipeline(
    self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe
):
    # arrange
    table_reader_id = "a_source"
    table_reader_table = "table"
    table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")

    create_temp_view(dataframe=mocked_df, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    dbconfig = Mock()
    dbconfig.get_options = Mock(
        return_value={
            "mode": "overwrite",
            "format_": "parquet",
            "path": "test_folder/historical/entity/feature_set",
        }
    )

    # act
    test_pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id=table_reader_id,
                    database=table_reader_db,
                    table=table_reader_table,
                ),
            ],
            query=f"select * from {table_reader_id} ",  # noqa
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(F.avg, DataType.FLOAT),
                            Function(F.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="id",
                        order_by=TIMESTAMP_COLUMN,
                        mode="fixed_windows",
                        window_definition=["2 minutes", "15 minutes"],
                    ),
                ),
                Feature(
                    name="divided_feature",
                    description="unit test",
                    dtype=DataType.FLOAT,
                    transformation=CustomTransform(
                        transformer=divide, column1="feature1", column2="feature2",
                    ),
                ),
            ],
            keys=[
                KeyFeature(
                    name="id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(),
        ),
        sink=Sink(writers=[HistoricalFeatureStoreWriter(db_config=dbconfig)],),
    )
    test_pipeline.run()

    # assert
    path = dbconfig.get_options("historical/entity/feature_set").get("path")
    df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN)

    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        test_pipeline.feature_set.timestamp_column
    )

    assert_dataframe_equality(df, target_df)

    # tear down
    shutil.rmtree("test_folder")