def test_csv_file_with_schema_and_header(self):
    # given
    spark_client = SparkClient()
    schema_csv = StructType(
        [
            StructField("A", LongType()),
            StructField("B", DoubleType()),
            StructField("C", StringType()),
        ]
    )
    file = "tests/unit/butterfree/extract/readers/file-reader-test.csv"

    # when
    file_reader = FileReader(
        id="id",
        path=file,
        format="csv",
        schema=schema_csv,
        format_options={"header": True},
    )
    df = file_reader.consume(spark_client)

    # assert
    assert schema_csv == df.schema
    assert df.columns == ["A", "B", "C"]
    for value in range(3):
        assert df.first()[value] != ["A", "B", "C"][value]
def test_consume_with_stream_without_schema(self, spark_client, target_df):
    # arrange
    path = "path/to/file.json"
    format = "json"
    schema = None
    format_options = None
    stream = True
    options = dict({"path": path})

    spark_client.read.return_value = target_df
    file_reader = FileReader(
        "test", path, format, schema, format_options, stream=stream
    )

    # act
    output_df = file_reader.consume(spark_client)

    # assert
    # assert call for schema infer
    spark_client.read.assert_any_call(format=format, options=options)
    # assert call for stream read
    spark_client.read.assert_called_with(
        format=format, options=options, schema=output_df.schema, stream=stream
    )
    assert target_df.collect() == output_df.collect()
def test_filter(self, feature_set_dataframe, spark_context, spark_session):
    # given
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(
        transformer=filter,
        condition="test not in ('fail') and feature in (110, 120)",
    )

    # when
    result_df = file_reader._apply_transformations(feature_set_dataframe)
    target_data = [
        {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"},
        {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"},
    ]
    target_df = spark_session.read.json(spark_context.parallelize(target_data, 1))

    # then
    assert result_df.collect() == target_df.collect()
def test_build_with_incremental_strategy(
    self, incremental_source_df, spark_client, spark_session
):
    # arrange
    readers = [
        # directly from column
        FileReader(
            id="test_1", path="path/to/file", format="format"
        ).with_incremental_strategy(
            incremental_strategy=IncrementalStrategy(column="date")
        ),
        # from milliseconds
        FileReader(
            id="test_2", path="path/to/file", format="format"
        ).with_incremental_strategy(
            incremental_strategy=IncrementalStrategy().from_milliseconds(
                column_name="milliseconds"
            )
        ),
        # from str
        FileReader(
            id="test_3", path="path/to/file", format="format"
        ).with_incremental_strategy(
            incremental_strategy=IncrementalStrategy().from_string(
                column_name="date_str", mask="dd/MM/yyyy"
            )
        ),
        # from year, month, day partitions
        FileReader(
            id="test_4", path="path/to/file", format="format"
        ).with_incremental_strategy(
            incremental_strategy=(
                IncrementalStrategy().from_year_month_day_partitions()
            )
        ),
    ]

    spark_client.read.return_value = incremental_source_df
    target_df = incremental_source_df.where(
        "date >= date('2020-07-29') and date <= date('2020-07-31')"
    )

    # act
    for reader in readers:
        reader.build(
            client=spark_client, start_date="2020-07-29", end_date="2020-07-31"
        )
    output_dfs = [
        spark_session.table(f"test_{i + 1}") for i, _ in enumerate(readers)
    ]

    # assert
    for output_df in output_dfs:
        assert_dataframe_equality(output_df=output_df, target_df=target_df)
def test_build(self, target_df, spark_client, spark_session):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    spark_client.read.return_value = target_df

    # act
    file_reader.build(spark_client)
    result_df = spark_session.sql("select * from test")

    # assert
    assert target_df.collect() == result_df.collect()
def test_with_(self, transformations, spark_client):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")

    # act
    for transformation in transformations:
        file_reader.with_(
            transformation["transformer"],
            *transformation["args"],
            **transformation["kwargs"],
        )

    # assert
    assert file_reader.transformations == transformations
def test_build_with_columns(
    self, target_df, column_target_df, spark_client, spark_session
):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    spark_client.read.return_value = target_df

    # act
    file_reader.build(
        client=spark_client,
        columns=[("col1", "new_col1"), ("col2", "new_col2")],
    )
    result_df = spark_session.sql("select * from test")

    # assert
    assert column_target_df.collect() == result_df.collect()
def test_consume(
    self, path, format, schema, format_options, spark_client, target_df
):
    # arrange
    spark_client.read.return_value = target_df
    file_reader = FileReader("test", path, format, schema, format_options)

    # act
    output_df = file_reader.consume(spark_client)
    options = dict({"path": path}, **format_options if format_options else {})

    # assert
    spark_client.read.assert_called_once_with(
        format=format, options=options, schema=schema, stream=False
    )
    assert target_df.collect() == output_df.collect()
def __init__(self):
    super(UserChargebacksPipeline, self).__init__(
        source=Source(
            readers=[
                FileReader(
                    id="chargeback_events",
                    path="data/order_events/input.csv",
                    format="csv",
                    format_options={"header": True},
                )
            ],
            query=(
                """
                select
                    cpf,
                    timestamp(chargeback_timestamp) as timestamp,
                    order_id
                from chargeback_events
                where chargeback_timestamp is not null
                """
            ),
        ),
        feature_set=AggregatedFeatureSet(
            name="user_chargebacks",
            entity="user",
            description="Aggregates the total of chargebacks from users in "
            "different time windows.",
            keys=[
                KeyFeature(
                    name="cpf",
                    description="User unique identifier, entity key.",
                    dtype=DataType.STRING,
                )
            ],
            timestamp=TimestampFeature(),
            features=[
                Feature(
                    name="cpf_chargebacks",
                    description="Total of chargebacks registered on user's CPF",
                    transformation=AggregatedTransform(
                        functions=[Function(functions.count, DataType.INTEGER)]
                    ),
                    from_column="order_id",
                ),
            ],
        )
        .with_windows(definitions=["3 days", "7 days", "30 days"])
        .add_post_hook(ZeroFillHook()),
        sink=Sink(
            writers=[
                LocalHistoricalFSWriter(),
                OnlineFeatureStoreWriter(
                    interval_mode=True,
                    check_schema_hook=NotCheckSchemaHook(),
                    debug_mode=True,
                ),
            ]
        ),
    )
def test_apply_pivot_transformation(self, input_df, pivot_df):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(
        transformer=pivot,
        group_by_columns=["id", "ts"],
        pivot_column="pivot_column",
        agg_column="has_feature",
        aggregation=first,
    )

    # act
    result_df = file_reader._apply_transformations(input_df)

    # assert
    assert compare_dataframes(
        actual_df=result_df,
        expected_df=pivot_df,
    )
def test_json_file_with_schema(self):
    # given
    spark_client = SparkClient()
    schema_json = StructType(
        [
            StructField("A", StringType()),
            StructField("B", DoubleType()),
            StructField("C", StringType()),
        ]
    )
    file = "tests/unit/butterfree/extract/readers/file-reader-test.json"

    # when
    file_reader = FileReader(id="id", path=file, format="json", schema=schema_json)
    df = file_reader.consume(spark_client)

    # assert
    assert schema_json == df.schema
def test__apply_transformations(
    self,
    input_data,
    transformations,
    transformed_data,
    spark_context,
    spark_session,
    spark_client,
):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.transformations = transformations
    input_df = spark_session.read.json(spark_context.parallelize(input_data, 1))
    target_df = spark_session.read.json(
        spark_context.parallelize(transformed_data, 1)
    )

    # act
    result_df = file_reader._apply_transformations(input_df)

    # assert
    assert target_df.collect() == result_df.collect()
def test_source(
    self,
    target_df_source,
    target_df_table_reader,
    spark_session,
):
    # given
    spark_client = SparkClient()

    table_reader_id = "a_test_source"
    table_reader_db = "db"
    table_reader_table = "table_test_source"

    create_temp_view(dataframe=target_df_table_reader, name=table_reader_id)
    create_db_and_table(
        spark=spark_session,
        table_reader_id=table_reader_id,
        table_reader_db=table_reader_db,
        table_reader_table=table_reader_table,
    )

    file_reader_id = "b_test_source"
    data_sample_path = INPUT_PATH + "/data.json"

    # when
    source = Source(
        readers=[
            TableReader(
                id=table_reader_id,
                database=table_reader_db,
                table=table_reader_table,
            ),
            FileReader(id=file_reader_id, path=data_sample_path, format="json"),
        ],
        query=f"select a.*, b.feature2 "  # noqa
        f"from {table_reader_id} a "  # noqa
        f"inner join {file_reader_id} b on a.id = b.id ",  # noqa
    )

    result_df = source.construct(client=spark_client)
    target_df = target_df_source

    # then
    assert (
        compare_dataframes(
            actual_df=result_df,
            expected_df=target_df,
            columns_sort=result_df.columns,
        )
        is True
    )
def test_filter_with_invalidations(
    self, feature_set_dataframe, condition, spark_context, spark_session
):
    # given
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(transformer=filter, condition=condition)

    # then
    with pytest.raises(TypeError):
        file_reader._apply_transformations(feature_set_dataframe)
def test_feature_set_args(self):
    # arrange and act
    out_columns = [
        "user_id",
        "timestamp",
        "listing_page_viewed__rent_per_month__avg_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__avg_over_2_weeks_fixed_windows",
        "listing_page_viewed__rent_per_month__stddev_pop_over_7_days_fixed_windows",
        "listing_page_viewed__rent_per_month__"
        "stddev_pop_over_2_weeks_fixed_windows",  # noqa
    ]
    pipeline = FeatureSetPipeline(
        source=Source(
            readers=[
                TableReader(
                    id="source_a",
                    database="db",
                    table="table",
                ),
                FileReader(
                    id="source_b",
                    path="path",
                    format="parquet",
                ),
            ],
            query="select a.*, b.specific_feature "
            "from source_a left join source_b on a.id=b.id",
        ),
        feature_set=FeatureSet(
            name="feature_set",
            entity="entity",
            description="description",
            keys=[
                KeyFeature(
                    name="user_id",
                    description="The user's Main ID or device ID",
                    dtype=DataType.INTEGER,
                )
            ],
            timestamp=TimestampFeature(from_column="ts"),
            features=[
                Feature(
                    name="listing_page_viewed__rent_per_month",
                    description="Average of something.",
                    transformation=SparkFunctionTransform(
                        functions=[
                            Function(functions.avg, DataType.FLOAT),
                            Function(functions.stddev_pop, DataType.FLOAT),
                        ],
                    ).with_window(
                        partition_by="user_id",
                        order_by=TIMESTAMP_COLUMN,
                        window_definition=["7 days", "2 weeks"],
                        mode="fixed_windows",
                    ),
                ),
            ],
        ),
        sink=Sink(
            writers=[
                HistoricalFeatureStoreWriter(db_config=None),
                OnlineFeatureStoreWriter(db_config=None),
            ],
        ),
    )

    assert isinstance(pipeline.spark_client, SparkClient)

    assert len(pipeline.source.readers) == 2
    assert all(isinstance(reader, Reader) for reader in pipeline.source.readers)
    assert isinstance(pipeline.source.query, str)

    assert pipeline.feature_set.name == "feature_set"
    assert pipeline.feature_set.entity == "entity"
    assert pipeline.feature_set.description == "description"
    assert isinstance(pipeline.feature_set.timestamp, TimestampFeature)

    assert len(pipeline.feature_set.keys) == 1
    assert all(isinstance(k, KeyFeature) for k in pipeline.feature_set.keys)

    assert len(pipeline.feature_set.features) == 1
    assert all(
        isinstance(feature, Feature) for feature in pipeline.feature_set.features
    )
    assert pipeline.feature_set.columns == out_columns

    assert len(pipeline.sink.writers) == 2
    assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers)
def test_init_invalid_params(self, path, format):
    # act and assert
    with pytest.raises(ValueError):
        FileReader("id", path, format)
def __init__(self):
    super(AwesomeDatasetPipeline, self).__init__(
        source=Source(
            readers=[
                FileReader(
                    id="order_events",
                    path="data/order_events/input.csv",
                    format="csv",
                    format_options={"header": True},
                ),
                FileReader(
                    id="user_chargebacks",
                    path="data/feature_store/historical/user/user_chargebacks",
                    format="parquet",
                ),
                FileReader(
                    id="user_orders",
                    path="data/feature_store/historical/user/user_orders",
                    format="parquet",
                ),
            ],
            query="""
            with feature_sets_merge as(
                select
                    user_orders.cpf,
                    user_orders.timestamp,
                    user_chargebacks.timestamp as chargeback_timestamp,
                    cpf_orders__count_over_3_days_rolling_windows,
                    cpf_orders__count_over_7_days_rolling_windows,
                    cpf_orders__count_over_30_days_rolling_windows,
                    cpf_chargebacks__count_over_3_days_rolling_windows,
                    cpf_chargebacks__count_over_7_days_rolling_windows,
                    cpf_chargebacks__count_over_30_days_rolling_windows,
                    row_number() over (
                        partition by (user_orders.cpf, user_orders.timestamp)
                        order by user_chargebacks.timestamp desc
                    ) as rn
                from user_orders
                left join user_chargebacks
                    on user_orders.cpf = user_chargebacks.cpf
                    and user_orders.timestamp >= user_chargebacks.timestamp
            ),
            feature_sets_rn_filter as(
                select *
                from feature_sets_merge
                where rn = 1
            ),
            orders_with_feature_sets as(
                select
                    order_events.order_id,
                    timestamp(order_events.order_timestamp) as timestamp,
                    timestamp(order_events.chargeback_timestamp)
                        as chargeback_timestamp,
                    order_events.cpf,
                    feature_sets_rn_filter.cpf_orders__count_over_3_days_rolling_windows,
                    feature_sets_rn_filter.cpf_orders__count_over_7_days_rolling_windows,
                    feature_sets_rn_filter.cpf_orders__count_over_30_days_rolling_windows,
                    feature_sets_rn_filter.cpf_chargebacks__count_over_3_days_rolling_windows,
                    feature_sets_rn_filter.cpf_chargebacks__count_over_7_days_rolling_windows,
                    feature_sets_rn_filter.cpf_chargebacks__count_over_30_days_rolling_windows,
                    row_number() over (
                        partition by (order_events.cpf, order_events.order_timestamp)
                        order by feature_sets_rn_filter.timestamp desc
                    ) as rn
                from order_events
                join feature_sets_rn_filter
                    on order_events.cpf = feature_sets_rn_filter.cpf
                    and timestamp(order_events.order_timestamp)
                        >= feature_sets_rn_filter.timestamp
            )
            select
                order_id,
                timestamp,
                chargeback_timestamp,
                cpf,
                cpf_orders__count_over_3_days_rolling_windows,
                cpf_orders__count_over_7_days_rolling_windows,
                cpf_orders__count_over_30_days_rolling_windows,
                coalesce(
                    cpf_chargebacks__count_over_3_days_rolling_windows, 0
                ) as cpf_chargebacks__count_over_3_days_rolling_windows,
                coalesce(
                    cpf_chargebacks__count_over_7_days_rolling_windows, 0
                ) as cpf_chargebacks__count_over_7_days_rolling_windows,
                coalesce(
                    cpf_chargebacks__count_over_30_days_rolling_windows, 0
                ) as cpf_chargebacks__count_over_30_days_rolling_windows
            from orders_with_feature_sets
            where rn = 1
            """,
        ),
        feature_set=FeatureSet(
            name="awesome_dataset",
            entity="user",
            description="Dataset enriching orders events with aggregated features "
            "on total of orders and chargebacks by user.",
            keys=[
                KeyFeature(
                    name="order_id",
                    description="Orders unique identifier.",
                    dtype=DataType.STRING,
                )
            ],
            timestamp=TimestampFeature(),
            features=[
                Feature(
                    name="chargeback_timestamp",
                    description="Timestamp for the order creation.",
                    dtype=DataType.TIMESTAMP,
                ),
                Feature(
                    name="cpf",
                    description="User unique identifier, user entity key.",
                    dtype=DataType.STRING,
                ),
                Feature(
                    name="cpf_orders__count_over_3_days_rolling_windows",
                    description="Count of orders over 3 days rolling windows group "
                    "by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_orders__count_over_7_days_rolling_windows",
                    description="Count of orders over 7 days rolling windows group "
                    "by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_orders__count_over_30_days_rolling_windows",
                    description="Count of orders over 30 days rolling windows group"
                    " by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_3_days_rolling_windows",
                    description="Count of chargebacks over 3 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_7_days_rolling_windows",
                    description="Count of chargebacks over 7 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
                Feature(
                    name="cpf_chargebacks__count_over_30_days_rolling_windows",
                    description="Count of chargebacks over 30 days rolling windows "
                    "group by user (identified by CPF)",
                    dtype=DataType.INTEGER,
                ),
            ],
        ),
        sink=Sink(writers=[DatasetWriter()]),
    )