def test_feature_transform_invalid_output(self, feature_set_dataframe):
    """Evaluating an expression that references a missing column must raise."""
    with pytest.raises(Exception):
        # construction stays inside the context so a failure at any stage
        # (build, transform, or collect) satisfies the test
        broken_feature = Feature(
            name="feature1_plus_a",
            description="unit test",
            dtype=DataType.FLOAT,
            transformation=SQLExpressionTransform(expression="feature2 + a"),
        )
        broken_feature.transform(feature_set_dataframe).collect()
def test_columns_not_in_dataframe(self, spark_context, spark_session):
    """Stacking columns absent from the dataframe raises a descriptive ValueError."""
    # arrange
    df = create_df_from_collection(self.input_data, spark_context, spark_session)
    stack_feature = Feature(
        name="id",
        description="stack transformation",
        dtype=DataType.STRING,
        transformation=StackTransform("id_c", "id_d"),
    )
    # act & assert
    with pytest.raises(ValueError, match="Columns not found, columns in df: "):
        stack_feature.transform(df)
def test_feature_transform_with_dtype(self, feature_set_dataframe):
    """A dtype-only feature casts its column to the requested Spark type."""
    timestamp_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.TIMESTAMP,
    )
    result = timestamp_feature.transform(feature_set_dataframe)
    assert dict(result.dtypes).get("feature") == "timestamp"
def test_feature_transform(self, feature_set_dataframe, target_df_spark):
    """A Spark-function transform applied over a renamed source column."""
    cos_feature = Feature(
        name="feature",
        description="unit test",
        from_column="feature1",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.cos, DataType.DOUBLE)],
        ),
    )
    result = cos_feature.transform(feature_set_dataframe)
    assert_dataframe_equality(result, target_df_spark)
def test_feature_transform_no_from_column(self, feature_set_dataframe):
    """Without from_column or transformation the column layout is untouched."""
    plain_feature = Feature(
        name="feature",
        description="unit test feature without transformation",
        dtype=DataType.BIGINT,
    )
    result = plain_feature.transform(feature_set_dataframe)
    # pairwise comparison (same truncation semantics as zip)
    for result_col, original_col in zip(result.columns, feature_set_dataframe.columns):
        assert result_col == original_col
def test_feature_transform(self, feature_set_dataframe, target_df_agg):
    """AggregatedTransform cannot run standalone — it depends on the feature set."""
    aggregated_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=AggregatedTransform(
            functions=[
                Function(functions.avg, DataType.DOUBLE),
                Function(functions.stddev_pop, DataType.DOUBLE),
            ]
        ),
    )
    with pytest.raises(NotImplementedError):
        aggregated_feature.transform(feature_set_dataframe)
def test_feature_transform_output(self, feature_set_dataframe):
    """feature1/feature2 evaluates to 1 for each of the four fixture rows."""
    ratio_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )
    rows = ratio_feature.transform(feature_set_dataframe).collect()
    # indexing (not slicing) keeps the original IndexError failure mode
    for idx in range(4):
        assert rows[idx]["feature1_over_feature2"] == 1
def test_feature_transform_with_from_column(self, feature_set_dataframe):
    """Renaming via from_column keeps the source column and adds the new one."""
    expected_columns = sorted(["new_feature", "id", TIMESTAMP_COLUMN, "feature"])
    renamed_feature = Feature(
        name="new_feature",
        from_column="feature",
        description="unit test",
        dtype=DataType.BIGINT,
    )
    result = renamed_feature.transform(feature_set_dataframe)
    for actual_col, expected_col in zip(sorted(result.columns), expected_columns):
        assert actual_col == expected_col
def test_custom_transform_output(self, feature_set_dataframe):
    """The custom divide transformer yields 1 for every fixture row."""
    custom_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide,
            column1="feature1",
            column2="feature2",
        ),
    )
    rows = custom_feature.transform(feature_set_dataframe).collect()
    # indexing (not slicing) keeps the original IndexError failure mode
    for idx in range(4):
        assert rows[idx]["feature"] == 1
def test_overwriting_column(self, spark_session):
    """A SQL transform may overwrite the very column it reads from."""
    # arrange
    source_df = spark_session.sql("select 0 as feature")
    expected_df = spark_session.sql("select 1 as feature")
    incremented_feature = Feature(
        name="feature",
        description="description",
        dtype=DataType.INTEGER,
        transformation=SQLExpressionTransform(expression="feature + 1"),
    )
    # act
    result_df = incremented_feature.transform(source_df)
    # assert
    assert_dataframe_equality(result_df, expected_df)
def test_feature_transform_with_window(self, feature_set_dataframe, target_df_rows_agg):
    """Row-window averages over 2- and 3-event windows match the target."""
    windowed_feature = Feature(
        name="feature1",
        description="unit test",
        transformation=SparkFunctionTransform(
            functions=[Function(functions.avg, DataType.DOUBLE)],
        ).with_window(
            partition_by="id",
            mode="row_windows",
            window_definition=["2 events", "3 events"],
        ),
    )
    result = windowed_feature.transform(feature_set_dataframe)
    assert_dataframe_equality(result, target_df_rows_agg)
def test_feature_transform_with_transformation_no_from_column(
        self, feature_set_dataframe):
    """A transformation's output dataframe is passed through unchanged."""
    fake_transformation = Mock()
    fake_transformation.transform.return_value = feature_set_dataframe
    mocked_feature = Feature(
        name="feature",
        description="unit test",
        transformation=fake_transformation,
        dtype=DataType.BIGINT,
    )
    result = mocked_feature.transform(feature_set_dataframe)
    expected_columns = sorted(["feature", "id", TIMESTAMP_COLUMN])
    for actual_col, expected_col in zip(sorted(result.columns), expected_columns):
        assert actual_col == expected_col
def test_feature_transform(self, h3_input_df, h3_target_df):
    """H3 hashing at resolutions 6 through 12 matches the precomputed target."""
    # arrange
    h3_feature = Feature(
        name="new_feature",
        description="unit test",
        dtype=DataType.STRING,
        transformation=H3HashTransform(
            h3_resolutions=[6, 7, 8, 9, 10, 11, 12],
            lat_column="lat",
            lng_column="lng",
        ),
    )
    # act
    result = h3_feature.transform(h3_input_df)
    # assert
    assert_dataframe_equality(result, h3_target_df)
def test_feature_transform(self, feature_set_dataframe):
    """A custom transform appends its output column after the originals."""
    expected_columns = ["feature1", "feature2", "id", TIMESTAMP_COLUMN, "feature"]
    custom_feature = Feature(
        name="feature",
        description="unit test",
        dtype=DataType.BIGINT,
        transformation=CustomTransform(
            transformer=divide,
            column1="feature1",
            column2="feature2",
        ),
    )
    result = custom_feature.transform(feature_set_dataframe)
    for actual_col, expected_col in zip(result.columns, expected_columns):
        assert actual_col == expected_col
def test_feature_transform(self, feature_set_dataframe):
    """A SQL-expression feature appends its column after the originals."""
    expected_columns = [
        "feature1",
        "feature2",
        "id",
        "timestamp",
        "feature1_over_feature2",
    ]
    sql_feature = Feature(
        name="feature1_over_feature2",
        description="unit test",
        dtype=DataType.FLOAT,
        transformation=SQLExpressionTransform(expression="feature1/feature2"),
    )
    result = sql_feature.transform(feature_set_dataframe)
    for actual_col, expected_col in zip(result.columns, expected_columns):
        assert actual_col == expected_col