def test_feature_set():
    return AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.DOUBLE),
                        Function(functions.stddev_pop, DataType.DOUBLE),
                    ]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.count, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 week", "2 days"])

def agg_feature_set():
    return AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.DOUBLE)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="description",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    )

def test_construct_rolling_windows_with_end_date(
    self,
    feature_set_dataframe,
    rolling_windows_output_feature_set_dataframe_base_date,
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 day", "1 week"])

    # act
    output_df = feature_set.construct(
        feature_set_dataframe, client=spark_client, end_date="2016-04-18"
    ).orderBy("timestamp")

    target_df = rolling_windows_output_feature_set_dataframe_base_date.orderBy(
        feature_set.timestamp_column
    ).select(feature_set.columns)

    # assert
    assert_dataframe_equality(output_df, target_df)

def test_feature_set_start_date(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
):
    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.sum, DataType.INTEGER)]
                ),
            ),
        ],
        keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
        timestamp=timestamp_c,
    ).with_windows(["10 days", "3 weeks", "90 days"])

    # act
    start_date = fs.define_start_date("2016-04-14")

    # assert
    assert start_date == "2016-01-14"

def test_feature_transform_with_distinct(
    self,
    timestamp_c,
    feature_set_with_distinct_dataframe,
    target_with_distinct_dataframe,
):
    spark_client = SparkClient()

    fs = (
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
            timestamp=timestamp_c,
        )
        .with_windows(["3 days"])
        .with_distinct(subset=["id"], keep="last")
    )

    # act
    output_df = fs.construct(
        feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
    )

    # assert
    assert_dataframe_equality(output_df, target_with_distinct_dataframe)

def test_h3_feature_set(self, h3_input_df, h3_target_df):
    spark_client = SparkClient()

    feature_set = AggregatedFeatureSet(
        name="h3_test",
        entity="h3geolocation",
        description="Test",
        keys=[
            KeyFeature(
                name="h3_id",
                description="The h3 hash ID",
                dtype=DataType.DOUBLE,
                transformation=H3HashTransform(
                    h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
                    lat_column="lat",
                    lng_column="lng",
                ).with_stack(),
            )
        ],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="house_id",
                description="Count of house ids over a day.",
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
    ).with_windows(definitions=["1 day"])

    output_df = feature_set.construct(
        h3_input_df, client=spark_client, end_date="2016-04-14"
    )

    assert_dataframe_equality(output_df, h3_target_df)

def test_aggregations_with_filter_expression(self, spark_context):
    # arrange
    test_feature = Feature(
        name="feature_with_filter",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.min, DataType.DOUBLE),
                Function(functions.max, DataType.DOUBLE),
            ],
            filter_expression="type = 'a'",
        ),
        from_column="feature",
    )
    target_aggregations = [
        agg(functions.when(functions.expr("type = 'a'"), functions.col("feature")))
        for agg in [functions.avg, functions.min, functions.max]
    ]

    # act
    output_aggregations = [
        agg.function for agg in test_feature.transformation.aggregations
    ]

    # assert
    # cast to string to compare the column definitions because direct column
    # comparison was not working
    assert str(target_aggregations) == str(output_aggregations)

def test_feature_transform_with_distinct_empty_subset(
    self, timestamp_c, feature_set_with_distinct_dataframe
):
    spark_client = SparkClient()

    with pytest.raises(
        ValueError, match="The distinct subset param can't be empty."
    ):
        AggregatedFeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.sum, DataType.INTEGER)]
                    ),
                ),
            ],
            keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)],
            timestamp=timestamp_c,
        ).with_windows(["3 days"]).with_distinct(subset=[], keep="first").construct(
            feature_set_with_distinct_dataframe, spark_client, end_date="2020-01-10"
        )

def test_construct_without_window(
    self,
    feature_set_dataframe,
    target_df_without_window,
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                dtype=DataType.DOUBLE,
                transformation=AggregatedTransform(
                    functions=[Function(F.avg, DataType.DOUBLE)]
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                dtype=DataType.FLOAT,
                transformation=AggregatedTransform(
                    functions=[Function(F.count, DataType.BIGINT)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="fixed_ts"),
    )

    # act
    output_df = feature_set.construct(feature_set_dataframe, client=spark_client)

    # assert
    assert_dataframe_equality(output_df, target_df_without_window)

def test_unsupported_aggregation(self, feature_set_dataframe):
    with pytest.raises(TypeError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function("median", DataType.DOUBLE)]
            ),
        )

def test_blank_aggregation(self, feature_set_dataframe):
    with pytest.raises(ValueError):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[Function(func="", data_type="")]
            ),
        )

def __init__(self):
    super(UserChargebacksPipeline, self).__init__(
        source=Source(
            readers=[
                FileReader(
                    id="chargeback_events",
                    path="data/order_events/input.csv",
                    format="csv",
                    format_options={"header": True},
                )
            ],
            query=(
                """
                select
                    cpf,
                    timestamp(chargeback_timestamp) as timestamp,
                    order_id
                from chargeback_events
                where chargeback_timestamp is not null
                """
            ),
        ),
        feature_set=AggregatedFeatureSet(
            name="user_chargebacks",
            entity="user",
            description="Aggregates the total of chargebacks from users in "
            "different time windows.",
            keys=[
                KeyFeature(
                    name="cpf",
                    description="User unique identifier, entity key.",
                    dtype=DataType.STRING,
                )
            ],
            timestamp=TimestampFeature(),
            features=[
                Feature(
                    name="cpf_chargebacks",
                    description="Total of chargebacks registered on user's CPF",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.count, DataType.INTEGER)]
                    ),
                    from_column="order_id",
                ),
            ],
        )
        .with_windows(definitions=["3 days", "7 days", "30 days"])
        .add_post_hook(ZeroFillHook()),
        sink=Sink(
            writers=[
                LocalHistoricalFSWriter(),
                OnlineFeatureStoreWriter(
                    interval_mode=True,
                    check_schema_hook=NotCheckSchemaHook(),
                    debug_mode=True,
                ),
            ]
        ),
    )

def test_run_agg_with_end_date(self, spark_session):
    test_pipeline = FeatureSetPipeline(
        spark_client=SparkClient(),
        source=Mock(
            spec=Source,
            readers=[
                TableReader(
                    id="source_a",
                    database="db",
                    table="table",
                )
            ],
            query="select * from source_a",
        ),
        feature_set=Mock(
            spec=AggregatedFeatureSet,
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=AggregatedTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ),
                ),
            ],
        ),
        sink=Mock(
            spec=Sink,
            writers=[HistoricalFeatureStoreWriter(db_config=None)],
        ),
    )

    # feature_set needs to return a real df for streaming validation
    sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}])
    test_pipeline.feature_set.construct.return_value = sample_df

    test_pipeline.run(end_date="2016-04-18")

    test_pipeline.source.construct.assert_called_once()
    test_pipeline.feature_set.construct.assert_called_once()
    test_pipeline.sink.flush.assert_called_once()
    test_pipeline.sink.validate.assert_called_once()

def test_agg_feature_set_with_window(
    self, key_id, timestamp_c, dataframe, rolling_windows_agg_dataframe
):
    spark_client = SparkClient()

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="unit test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.FLOAT)]
                ),
            ),
            Feature(
                name="feature2",
                description="unit test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.avg, DataType.FLOAT)]
                ),
            ),
        ],
        keys=[key_id],
        timestamp=timestamp_c,
    ).with_windows(definitions=["1 week"])

    # raises without an end date
    with pytest.raises(ValueError):
        _ = fs.construct(dataframe, spark_client)

    # filters with an end date smaller than the mocked max
    output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17")
    assert output_df.count() < rolling_windows_agg_dataframe.count()

    output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01")
    assert_dataframe_equality(output_df, rolling_windows_agg_dataframe)

def test_feature_transform(self, feature_set_dataframe, target_df_agg):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    # aggregated feature transform won't run transformations
    # and depends on the feature set
    with pytest.raises(NotImplementedError):
        _ = test_feature.transform(feature_set_dataframe)

def test_anonymous_function(self):
    with pytest.raises(
        AttributeError,
        match="Anonymous functions are not supported on AggregatedTransform.",
    ):
        Feature(
            name="feature1",
            description="unit test",
            transformation=AggregatedTransform(
                functions=[
                    Function(func=partial(functions.count), data_type=DataType.INTEGER)
                ]
            ),
        ).get_output_columns()

def test_output_columns(self):
    test_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )

    df_columns = test_feature.get_output_columns()

    assert all(
        [
            a == b
            for a, b in zip(
                df_columns,
                ["feature1__avg", "feature1__stddev_pop"],
            )
        ]
    )

def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe):
    spark_client = SparkClient()

    with pytest.raises(ValueError):
        FeatureSet(
            name="name",
            entity="entity",
            description="description",
            features=[
                Feature(
                    name="feature1",
                    description="test",
                    transformation=AggregatedTransform(
                        functions=[Function(F.avg, DataType.FLOAT)]
                    ),
                ),
            ],
            keys=[key_id],
            timestamp=timestamp_c,
        ).construct(dataframe, spark_client)

def test_construct_with_pivot(
    self,
    feature_set_df_pivot,
    target_df_pivot_agg,
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature",
                description="unit test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.FLOAT),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
                from_column="feature1",
            )
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(from_column="fixed_ts"),
    ).with_pivot("pivot_col", ["S", "N"])

    # act
    output_df = feature_set.construct(feature_set_df_pivot, client=spark_client)

    # assert
    assert_dataframe_equality(output_df, target_df_pivot_agg)

def feature_set_incremental():
    key_features = [
        KeyFeature(name="id", description="Description", dtype=DataType.INTEGER)
    ]
    ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN)
    features = [
        Feature(
            name="feature",
            description="test",
            transformation=AggregatedTransform(
                functions=[Function(functions.sum, DataType.INTEGER)]
            ),
        ),
    ]
    return AggregatedFeatureSet(
        "feature_set",
        "entity",
        "description",
        keys=key_features,
        timestamp=ts_feature,
        features=features,
    )

def test_construct_rolling_windows_without_end_date(
    self, feature_set_dataframe, rolling_windows_output_feature_set_dataframe
):
    # given
    spark_client = SparkClient()

    # arrange
    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(F.avg, DataType.DOUBLE),
                        Function(F.stddev_pop, DataType.DOUBLE),
                    ],
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.INTEGER,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 day", "1 week"])

    # act & assert
    with pytest.raises(ValueError):
        _ = feature_set.construct(feature_set_dataframe, client=spark_client)

def test_feature_transform_with_data_type_array(self, spark_context, spark_session):
    # arrange
    input_data = [
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 10},
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 20},
        {"id": 1, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 30},
        {"id": 2, "timestamp": "2020-04-22T00:00:00+00:00", "feature": 10},
    ]
    target_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature__collect_set": [30.0, 20.0, 10.0],
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature__collect_set": [10.0],
        },
    ]
    input_df = create_df_from_collection(
        input_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))
    target_df = create_df_from_collection(
        target_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        keys=[KeyFeature(name="id", description="test", dtype=DataType.INTEGER)],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="feature",
                description="aggregations with ",
                dtype=DataType.BIGINT,
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.collect_set, DataType.ARRAY_FLOAT),
                    ],
                ),
                from_column="feature",
            ),
        ],
    )

    # act
    output_df = fs.construct(input_df, SparkClient())

    # assert
    assert_dataframe_equality(target_df, output_df)

def test_feature_transform_with_filter_expression(self, spark_context, spark_session):
    # arrange
    input_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 10,
            "type": "a",
        },
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 20,
            "type": "a",
        },
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 30,
            "type": "b",
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature": 10,
            "type": "a",
        },
    ]
    target_data = [
        {
            "id": 1,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature_only_type_a__avg": 15.0,
            "feature_only_type_a__min": 10,
            "feature_only_type_a__max": 20,
        },
        {
            "id": 2,
            "timestamp": "2020-04-22T00:00:00+00:00",
            "feature_only_type_a__avg": 10.0,
            "feature_only_type_a__min": 10,
            "feature_only_type_a__max": 10,
        },
    ]
    input_df = create_df_from_collection(
        input_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))
    target_df = create_df_from_collection(
        target_data, spark_context, spark_session
    ).withColumn("timestamp", functions.to_timestamp(functions.col("timestamp")))

    fs = AggregatedFeatureSet(
        name="name",
        entity="entity",
        description="description",
        keys=[KeyFeature(name="id", description="test", dtype=DataType.INTEGER)],
        timestamp=TimestampFeature(),
        features=[
            Feature(
                name="feature_only_type_a",
                description="aggregations only when type = a",
                dtype=DataType.BIGINT,
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.FLOAT),
                        Function(functions.min, DataType.FLOAT),
                        Function(functions.max, DataType.FLOAT),
                    ],
                    filter_expression="type = 'a'",
                ),
                from_column="feature",
            ),
        ],
    )

    # act
    output_df = fs.construct(input_df, SparkClient())

    # assert
    assert_dataframe_equality(target_df, output_df)

def test_get_schema(self):
    expected_schema = [
        {"column_name": "id", "type": LongType(), "primary_key": True},
        {"column_name": "timestamp", "type": TimestampType(), "primary_key": False},
        {
            "column_name": "feature1__avg_over_1_week_rolling_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__avg_over_2_days_rolling_windows",
            "type": DoubleType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_1_week_rolling_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature1__stddev_pop_over_2_days_rolling_windows",
            "type": FloatType(),
            "primary_key": False,
        },
        {
            "column_name": "feature2__count_over_1_week_rolling_windows",
            "type": ArrayType(StringType(), True),
            "primary_key": False,
        },
        {
            "column_name": "feature2__count_over_2_days_rolling_windows",
            "type": ArrayType(StringType(), True),
            "primary_key": False,
        },
    ]

    feature_set = AggregatedFeatureSet(
        name="feature_set",
        entity="entity",
        description="description",
        features=[
            Feature(
                name="feature1",
                description="test",
                transformation=AggregatedTransform(
                    functions=[
                        Function(functions.avg, DataType.DOUBLE),
                        Function(functions.stddev_pop, DataType.FLOAT),
                    ],
                ),
            ),
            Feature(
                name="feature2",
                description="test",
                transformation=AggregatedTransform(
                    functions=[Function(functions.count, DataType.ARRAY_STRING)]
                ),
            ),
        ],
        keys=[
            KeyFeature(
                name="id",
                description="The user's Main ID or device ID",
                dtype=DataType.BIGINT,
            )
        ],
        timestamp=TimestampFeature(),
    ).with_windows(definitions=["1 week", "2 days"])

    schema = feature_set.get_schema()

    assert schema == expected_schema