def test_construct(
    self, feature_set_dataframe, fixed_windows_output_feature_set_dataframe
):
    # arrange
    spark_client = SparkClient()
    feature_set = FeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=SparkFunctionTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.FLOAT),
                    ]
                ).with_window(
                    partition_by="id",
                    order_by=TIMESTAMP_COLUMN,
                    mode="fixed_windows",
                    window_definition=["2 minutes", "15 minutes"],
                ),
            ),
            Feature(
                name="divided_feature",
                description="unit test",
                dtype=DataType.FLOAT,
                transformation=CustomTransform(
                    transformer=divide, column1="feature1", column2="feature2",
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    )

    # act
    output_df = (
        feature_set.construct(feature_set_dataframe, client=spark_client)
        .orderBy(feature_set.timestamp_column)
        .select(feature_set.columns)
    )
    target_df = fixed_windows_output_feature_set_dataframe.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
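# The CustomTransform above references a `divide` transformer defined outside
# this excerpt. A minimal sketch of what it presumably looks like, following
# the CustomTransform contract of receiving the dataframe, the parent feature,
# and the declared kwargs (hypothetical, shown only for context):
#
#     from pyspark.sql import functions as F
#
#     def divide(df, parent_feature, column1, column2):
#         name = parent_feature.get_output_columns()[0]
#         return df.withColumn(name, F.col(column1) / F.col(column2))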
def test_sink_raise(self):
    with pytest.raises(ValueError, match="sink must be a Sink instance"):
        FeatureSetPipeline(
            spark_client=SparkClient(),
            source=Mock(
                spec=Source,
                readers=[TableReader(id="source_a", database="db", table="table")],
                query="select * from source_a",
            ),
            feature_set=Mock(
                spec=FeatureSet,
                name="feature_set",
                entity="entity",
                description="description",
                features=[
                    Feature(
                        name="user_id",
                        description="The user's Main ID or device ID",
                        dtype=DataType.FLOAT,
                    ),
                    Feature(
                        name="ts",
                        description="The timestamp feature",
                        dtype=DataType.TIMESTAMP,
                    ),
                ],
                key_columns=["user_id"],
                timestamp_column="ts",
            ),
            sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)]),
        )
def test_output_columns(self): # arrange h3_feature = Feature( name="new_feature", description="unit test", dtype=DataType.STRING, transformation=H3HashTransform( h3_resolutions=[6, 7, 8, 9, 10, 11, 12], lat_column="lat", lng_column="lng", ), ) target_columns = [ "lat_lng__h3_hash__6", "lat_lng__h3_hash__7", "lat_lng__h3_hash__8", "lat_lng__h3_hash__9", "lat_lng__h3_hash__10", "lat_lng__h3_hash__11", "lat_lng__h3_hash__12", ] # act output_columns = h3_feature.get_output_columns() # assert assert sorted(output_columns) == sorted(target_columns)
def test_feature_transform(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )

    df = test_feature.transform(feature_set_dataframe)

    # compare the full column lists so missing or extra columns also fail
    assert df.columns == [
        "feature1",
        "feature2",
        "id",
        "timestamp",
        "feature1_over_feature2",
    ]
def test_feature_get_output_columns_without_transformations(self): test_feature = Feature( name="feature", from_column="origin", description="unit test", dtype=DataType.BIGINT, ) assert test_feature.get_output_columns() == [test_feature.name]
def test_feature_transform_invalid_output(self, feature_set_dataframe): with pytest.raises(Exception): test_feature = Feature( name="feature1_plus_a", description="unit test", dtype=DataType.FLOAT, transformation=SQLExpressionTransform(expression="feature2 + a"), ) test_feature.transform(feature_set_dataframe).collect()
def test_feature_transform_with_dtype(self, feature_set_dataframe): test_feature = Feature( name="feature", description="unit test", dtype=DataType.TIMESTAMP, ) df = test_feature.transform(feature_set_dataframe) assert dict(df.dtypes).get("feature") == "timestamp"
def test_output_columns(self):
    test_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )

    df_columns = test_feature.get_output_columns()

    assert df_columns == ["feature1_over_feature2"]
def test_construct_rolling_windows_with_end_date(
    self,
    feature_set_dataframe,
    rolling_windows_output_feature_set_dataframe_base_date,
):
    # arrange
    spark_client = SparkClient()
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 day", "1 week"])

    # act
    output_df = feature_set.construct(
        feature_set_dataframe, client=spark_client, end_date="2016-04-18"
    ).orderBy(feature_set.timestamp_column)

    target_df = rolling_windows_output_feature_set_dataframe_base_date.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)
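# Note: with `with_windows(definitions=["1 day", "1 week"])`, the constructed
# dataframe should carry one aggregated column per feature, function, and
# window, presumably named like (hypothetical, inferred from the aggregated
# naming convention asserted elsewhere in these tests):
#
#     feature1__avg_over_1_day_rolling_windows
#     feature1__stddev_pop_over_1_week_rolling_windows
#
# `end_date="2016-04-18"` bounds how far the daily rolling rows are generated.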
def test_feature_transform(self, feature_set_dataframe, target_df_spark): test_feature = Feature( name="feature", description="unit test", transformation=SparkFunctionTransform( functions=[Function(functions.cos, DataType.DOUBLE)], ), from_column="feature1", ) output_df = test_feature.transform(feature_set_dataframe) assert_dataframe_equality(output_df, target_df_spark)
def test_feature_transform_no_from_column(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature",
        description="unit test feature without transformation",
        dtype=DataType.BIGINT,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert df.columns == feature_set_dataframe.columns
def test_feature_transform_output(self, feature_set_dataframe): test_feature = Feature( name="feature1_over_feature2", description="unit test", dtype=DataType.FLOAT, transformation=SQLExpressionTransform(expression="feature1/feature2"), ) df = test_feature.transform(feature_set_dataframe).collect() assert df[0]["feature1_over_feature2"] == 1 assert df[1]["feature1_over_feature2"] == 1 assert df[2]["feature1_over_feature2"] == 1 assert df[3]["feature1_over_feature2"] == 1
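# For the four ratio assertions above to hold, the `feature_set_dataframe`
# fixture presumably contains four rows where feature1 equals feature2, e.g.
# (hypothetical values):
#
#     data = [
#         {"id": 1, "timestamp": "2016-04-11 11:31:11", "feature1": 200, "feature2": 200},
#         {"id": 1, "timestamp": "2016-04-11 11:44:12", "feature1": 300, "feature2": 300},
#         {"id": 1, "timestamp": "2016-04-11 11:46:24", "feature1": 400, "feature2": 400},
#         {"id": 1, "timestamp": "2016-04-11 12:03:21", "feature1": 500, "feature2": 500},
#     ]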
def test_feature_transform(self, feature_set_dataframe):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    # an aggregated transform can't run on its own: it depends on the
    # feature set's keys and windows, so transform() must raise
    with pytest.raises(NotImplementedError):
        _ = test_feature.transform(feature_set_dataframe)
def test_output_columns(self, feature_set_dataframe): test_feature = Feature( name="feature", description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( transformer=divide, column1="feature1", column2="feature2", ), ) df_columns = test_feature.get_output_columns() assert isinstance(df_columns, list) assert df_columns == ["feature"]
def test_aggregations_with_filter_expression(self, spark_context):
    # arrange
    test_feature = Feature(
        name="feature_with_filter",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.min, DataType.DOUBLE),
                Function(functions.max, DataType.DOUBLE),
            ],
            filter_expression="type = 'a'",
        ),
        from_column="feature",
    )
    target_aggregations = [
        agg(functions.when(functions.expr("type = 'a'"), functions.col("feature")))
        for agg in [functions.avg, functions.min, functions.max]
    ]

    # act
    output_aggregations = [
        agg.function for agg in test_feature.transformation.aggregations
    ]

    # assert
    # Column objects don't implement a usable equality check, so compare
    # their string representations instead
    assert str(target_aggregations) == str(output_aggregations)
def test_feature_get_output_columns_with_transformations(
    self, feature_set_dataframe
):
    some_transformation = Mock()
    some_transformation.output_columns = feature_set_dataframe.columns

    test_feature = Feature(
        name="feature",
        from_column="origin",
        description="unit test",
        transformation=some_transformation,
        dtype=DataType.BIGINT,
    )

    assert test_feature.get_output_columns() == feature_set_dataframe.columns
def test_h3_feature_set(self, h3_input_df, h3_target_df):
    spark_client = SparkClient()

    feature_set = AggregatedFeatureSet(
        name="h3_test",
        entity="h3geolocation",
        description="Test",
        keys=[
            KeyFeature(
                name="h3_id",
                description="The h3 hash ID",
                dtype=DataType.DOUBLE,
                transformation=H3HashTransform(
                    h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
                    lat_column="lat",
                    lng_column="lng",
                ).with_stack(),
            )
        ],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="house_id",
                description="Count of house ids over a day.",
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
    ).with_windows(definitions=["1 day"])

    output_df = feature_set.construct(
        h3_input_df, client=spark_client, end_date="2016-04-14"
    )

    assert_dataframe_equality(output_df, h3_target_df)
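# Note: `with_stack()` above folds the per-resolution hash columns
# (lat_lng__h3_hash__6 ... lat_lng__h3_hash__12) into a single stacked
# `h3_id` key column, one row per resolution, so the count aggregation runs
# once per (h3_id, window) pair.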
def test_feature_transform_with_from_column(self, feature_set_dataframe):
    test_feature = Feature(
        name="new_feature",
        from_column="feature",
        description="unit test",
        dtype=DataType.BIGINT,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert sorted(df.columns) == sorted(
        ["new_feature", "id", TIMESTAMP_COLUMN, "feature"]
    )
def test_columns_not_in_dataframe(self, spark_context, spark_session): # arrange input_df = create_df_from_collection(self.input_data, spark_context, spark_session) feature = Feature( name="id", description="stack transformation", dtype=DataType.STRING, transformation=StackTransform("id_c", "id_d"), ) # act and assert with pytest.raises(ValueError, match="Columns not found, columns in df: "): feature.transform(input_df)
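# `self.input_data` is a class attribute defined outside this excerpt. For the
# ValueError above to trigger, it must not contain the "id_c"/"id_d" columns
# the StackTransform asks for, e.g. (hypothetical):
#
#     input_data = [
#         {"feature": 100, "id_a": 1, "id_b": 2},
#         {"feature": 120, "id_a": 3, "id_b": 4},
#     ]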
def test_feature_transform_with_distinct_empty_subset(
    self, timestamp_c, feature_set_with_distinct_dataframe
):
    spark_client = SparkClient()

    with pytest.raises(
        ValueError, match="The distinct subset param can't be empty."
    ):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[
                KeyFeature(name="h3", description="test", dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        ).with_windows(["3 days"]).with_distinct(subset=[], keep="first").construct(
            feature_set_with_distinct_dataframe,
            spark_client,
            end_date="2020-01-10",
        )
def test_feature_transform_with_distinct(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
    target_with_distinct_dataframe,
):
    # arrange
    spark_client = SparkClient()
    fs = (
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[
                KeyFeature(name="h3", description="test", dtype=DataType.STRING)
            ],
            timestamp=timestamp_c,
        )
        .with_windows(["3 days"])
        .with_distinct(subset=["id"], keep="last")
    )

    # act
    output_df = fs.construct(
        feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
    )

    # assert
    assert_dataframe_equality(output_df, target_with_distinct_dataframe)
def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=SparkFunctionTransform(
                        functions=[Function(functions.avg, DataType.FLOAT)]
                    ).with_window(
                        partition_by="id",
                        mode="row_windows",
                        window_definition=["2 events"],
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)
def test_custom_transform_output(self, feature_set_dataframe): test_feature = Feature( name="feature", description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( transformer=divide, column1="feature1", column2="feature2", ), ) df = test_feature.transform(feature_set_dataframe).collect() assert df[0]["feature"] == 1 assert df[1]["feature"] == 1 assert df[2]["feature"] == 1 assert df[3]["feature"] == 1
def test_construct_without_window(
    self, feature_set_dataframe, target_df_without_window
):
    # arrange
    spark_client = SparkClient()
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                dtype=DataType.DOUBLE,
                transformation=AggregatedTransform(
                    functions=[Function(F.avg, DataType.DOUBLE)]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                dtype=DataType.FLOAT,
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="fixed_ts"),
    )

    # act
    output_df = feature_set.construct(feature_set_dataframe, client=spark_client)

    # assert
    assert_dataframe_equality(output_df, target_df_without_window)
def test_unsupported_aggregation(self):
    with pytest.raises(TypeError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function("median", DataType.DOUBLE)]
            ),
        )
def test_feature_transform_with_window(
    self, feature_set_dataframe, target_df_rows_agg
):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.avg, DataType.DOUBLE)]
        ).with_window(
            partition_by="id",
            mode="row_windows",
            window_definition=["2 events", "3 events"],
        ),
    )

    output_df = test_feature.transform(feature_set_dataframe)

    assert_dataframe_equality(output_df, target_df_rows_agg)
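# For reference, `target_df_rows_agg` presumably holds one output column per
# window definition, named by the SparkFunctionTransform windowed convention
# (hypothetical, verify against the fixture):
#
#     feature1__avg_over_2_events_row_windows
#     feature1__avg_over_3_events_row_windows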
def test_blank_transformer(self):
    with pytest.raises(ValueError):
        Feature(
            name="feature",
            description="unit test",
            dtype=DataType.BIGINT,
            transformation=CustomTransform(transformer=None),
        )
def test_blank_aggregation(self):
    with pytest.raises(ValueError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function(func="", data_type="")]
            ),
        )
def test_feature_transform_with_transformation_no_from_column(
    self, feature_set_dataframe
):
    some_transformation = Mock()
    some_transformation.transform.return_value = feature_set_dataframe

    test_feature = Feature(
        name="feature",
        description="unit test",
        transformation=some_transformation,
        dtype=DataType.BIGINT,
    )

    df = test_feature.transform(feature_set_dataframe)

    assert sorted(df.columns) == sorted(["feature", "id", TIMESTAMP_COLUMN])
def test_output_columns(self):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    df_columns = test_feature.get_output_columns()

    assert df_columns == ["feature1__avg", "feature1__stddev_pop"]
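# `assert_dataframe_equality`, used throughout these tests, comes from a shared
# testing helper (e.g. butterfree.testing.dataframe). A minimal sketch of such
# an order-insensitive comparison, assuming hashable row values (hypothetical
# simplification of whatever the real helper does):
#
#     def assert_dataframe_equality(output_df, target_df):
#         cols = sorted(output_df.columns)
#         assert cols == sorted(target_df.columns)
#         assert output_df.count() == target_df.count()
#         assert set(output_df.select(*cols).collect()) == set(
#             target_df.select(*cols).collect()
#         )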