def test_filter_with_invalidations(
    self, feature_set_dataframe, condition, spark_context, spark_session
):
    # given
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(transformer=filter, condition=condition)

    # then
    with pytest.raises(TypeError):
        file_reader._apply_transformations(feature_set_dataframe)
def test_filter(self, feature_set_dataframe, spark_context, spark_session):
    # given
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(
        transformer=filter,
        condition="test not in ('fail') and feature in (110, 120)",
    )

    # when
    result_df = file_reader._apply_transformations(feature_set_dataframe)
    target_data = [
        {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"},
        {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"},
    ]
    target_df = spark_session.read.json(
        spark_context.parallelize(target_data, 1)
    )

    # then
    assert result_df.collect() == target_df.collect()
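# NOTE: the filter transformer registered above is assumed to delegate to
# Spark's SQL-expression filter. A minimal reference sketch of the expected
# behavior (illustrative only, not the library's implementation):
def _filter_reference(df):
    """Illustrative only: filter with the same raw Spark SQL expression."""
    return df.filter("test not in ('fail') and feature in (110, 120)")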
def test_apply_pivot_transformation(self, input_df, pivot_df):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.with_(
        transformer=pivot,
        group_by_columns=["id", "ts"],
        pivot_column="pivot_column",
        agg_column="has_feature",
        aggregation=first,
    )

    # act
    result_df = file_reader._apply_transformations(input_df)

    # assert
    assert compare_dataframes(
        actual_df=result_df,
        expected_df=pivot_df,
    )
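# NOTE: the pivot transformer configured above is assumed to behave like the
# plain PySpark groupBy/pivot/agg chain below; this sketch is illustrative,
# not the library's implementation.
def _pivot_reference(df):
    """Illustrative only: group by id/ts, pivot on pivot_column, and take the
    first has_feature value per pivoted cell."""
    from pyspark.sql.functions import first

    return (
        df.groupBy("id", "ts")
        .pivot("pivot_column")
        .agg(first("has_feature"))
    )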
def test__apply_transformations(
    self,
    input_data,
    transformations,
    transformed_data,
    spark_context,
    spark_session,
    spark_client,
):
    # arrange
    file_reader = FileReader("test", "path/to/file", "format")
    file_reader.transformations = transformations
    input_df = spark_session.read.json(
        spark_context.parallelize(input_data, 1)
    )
    target_df = spark_session.read.json(
        spark_context.parallelize(transformed_data, 1)
    )

    # act
    result_df = file_reader._apply_transformations(input_df)

    # assert
    assert target_df.collect() == result_df.collect()
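# NOTE: the fixtures these tests depend on (feature_set_dataframe, input_df,
# pivot_df, transformations, spark_context, spark_session, spark_client) are
# expected to live in conftest.py. A hypothetical sketch of
# feature_set_dataframe, inferred from the rows asserted in test_filter — the
# exact fixture data, including the filtered-out "fail" row, is an assumption:
@pytest.fixture
def feature_set_dataframe_sketch(spark_context, spark_session):
    data = [
        {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 100, "test": "fail"},
        {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"},
        {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"},
    ]
    return spark_session.read.json(spark_context.parallelize(data, 1))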