def test_super_transform_called(self, mocker):
    """Test that SetValueTransformer.transform calls BaseTransformer.transform."""
    input_df = d.create_df_7()
    transformer = SetValueTransformer(columns=["a", "b"], value=1)

    # Call 0 is described as: a freshly built copy of the input frame as the
    # only positional argument, and no keyword arguments.
    expected_call_args = {
        0: {
            "args": (d.create_df_7(),),
            "kwargs": {},
        }
    }

    with h.assert_function_call(
        mocker,
        tubular.base.BaseTransformer,
        "transform",
        expected_call_args,
    ):
        transformer.transform(input_df)
def test_super_transform_called(self, mocker):
    """Test that SeriesStrMethodTransformer.transform calls BaseTransformer.transform."""
    input_df = d.create_df_7()
    transformer = SeriesStrMethodTransformer(
        new_column_name="cc",
        pd_method_name="find",
        columns=["c"],
    )

    # Call 0 is described as: a freshly built copy of the input frame as the
    # only positional argument, and no keyword arguments.
    expected_call_args = {
        0: {
            "args": (d.create_df_7(),),
            "kwargs": {},
        }
    }

    with h.assert_function_call(
        mocker,
        tubular.base.BaseTransformer,
        "transform",
        expected_call_args,
    ):
        transformer.transform(input_df)
def test_attributes_unchanged_by_transform(self):
    """Test that attributes set in init are unchanged by the transform method.

    Two identically configured SeriesStrMethodTransformer objects are built;
    only the first is run through transform, then its init attributes are
    compared against the untouched second instance.
    """
    df = d.create_df_7()

    # Each instance gets its own kwargs literal so the two transformers never
    # share a mutable dict (sharing would hide an in-place modification).
    x = SeriesStrMethodTransformer(
        new_column_name="b",
        pd_method_name="pad",
        columns=["b"],
        pd_method_kwargs={"width": 10},
    )
    x2 = SeriesStrMethodTransformer(
        new_column_name="b",
        pd_method_name="pad",
        columns=["b"],
        pd_method_kwargs={"width": 10},
    )

    x.transform(df)

    # Failure messages name the class actually under test
    # (SeriesStrMethodTransformer, not SeriesDtMethodTransformer).
    assert (
        x.new_column_name == x2.new_column_name
    ), "new_column_name changed by SeriesStrMethodTransformer.transform"

    assert (
        x.pd_method_name == x2.pd_method_name
    ), "pd_method_name changed by SeriesStrMethodTransformer.transform"

    assert (
        x.columns == x2.columns
    ), "columns changed by SeriesStrMethodTransformer.transform"

    assert (
        x.pd_method_kwargs == x2.pd_method_kwargs
    ), "pd_method_kwargs changed by SeriesStrMethodTransformer.transform"
def test_categories_not_modified(self):
    """Test that categories_ learnt in fit are left untouched by transform."""
    train_df = d.create_df_1()
    other_df = d.create_df_7()

    # `transformed` is fit and then used to transform; `reference` is only fit,
    # so it keeps the categories_ as they were straight after fitting.
    transformed = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)
    reference = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)

    transformed.fit(train_df)
    reference.fit(train_df)

    transformed.transform(other_df)

    # One categories_ entry per encoded column ("a" -> 0, "b" -> 1).
    for position in (0, 1):
        h.assert_equal_dispatch(
            expected=list(reference.categories_[position]),
            actual=list(transformed.categories_[position]),
            msg=f"categories_ (index {position}) modified during transform",
        )
def expected_df_2():
    """Build the expected output of test_expected_output_overwrite.

    Starts from d.create_df_7() and overwrites column "b" with the result of
    str.pad(width=10) applied to that same column.
    """
    expected = d.create_df_7()
    padded_b = expected["b"].str.pad(width=10)
    expected["b"] = padded_b
    return expected
def expected_df_1():
    """Build the expected output of test_expected_output_no_overwrite.

    Starts from d.create_df_7() and adds a new column "b_new" holding the
    result of str.find(sub="a") applied to column "b".
    """
    expected = d.create_df_7()
    find_result = expected["b"].str.find(sub="a")
    expected["b_new"] = find_result
    return expected
def test_warning_generated_by_unseen_categories(self):
    """Test that OneHotEncodingTransformer.transform warns on unseen categories."""
    # Fit on one dataset and transform another; presumably create_df_8 holds
    # category levels that are absent from create_df_7.
    train_df = d.create_df_7()
    unseen_df = d.create_df_8()

    encoder = OneHotEncodingTransformer(verbose=True)
    encoder.fit(train_df)

    with pytest.warns(Warning):
        encoder.transform(unseen_df)
def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected):
    """Test that OneHotEncodingTransformer.transform encodes unseen categories as all 0s."""
    # The transformer is fit on the full create_df_7 data, independently of the
    # df_test argument, so the test works with the parametrize decorators.
    encoder = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=False)
    encoder.fit(d.create_df_7())

    actual = encoder.transform(df_test)

    h.assert_equal_dispatch(
        expected=expected,
        actual=actual,
        msg="unseen category rows not encoded as 0s",
    )
def test_original_columns_kept_when_specified(self):
    """Test that OneHotEncodingTransformer.transform keeps original columns when drop_original=False."""
    df = d.create_df_7()

    encoder = OneHotEncodingTransformer(drop_original=False)
    encoder.fit(df)
    output = encoder.transform(df)

    # Any of "a", "b", "c" missing from the output would show up here;
    # the expectation is that none are missing.
    missing_originals = {"a", "b", "c"}.difference(output.columns)

    h.assert_equal_dispatch(
        expected=[],
        actual=list(missing_originals),
        msg="original columns not kept",
    )
def test_original_columns_dropped_when_specified(self):
    """Test that OneHotEncodingTransformer.transform drops original columns when drop_original=True."""
    df = d.create_df_7()

    encoder = OneHotEncodingTransformer(columns=["a", "b", "c"], drop_original=True)
    encoder.fit(df)
    output = encoder.transform(df)

    # Input column names that no longer appear in the transformed frame,
    # listed in their original order.
    input_names = df.columns.values
    output_names = output.columns.values
    dropped = [name for name in input_names if name not in output_names]

    h.assert_equal_dispatch(
        expected=["a", "b", "c"],
        actual=dropped,
        msg="original columns not dropped",
    )
def test_expected_output(self, df_test, expected):
    """Test that OneHotEncodingTransformer.transform encodes column "b" correctly.

    The whole transformed frame is compared with `expected`, so this also
    checks that columns not being encoded are left unmodified.
    """
    # The transformer is fit on the full create_df_7 data, independently of the
    # df_test argument, so the test works with the parametrize decorators.
    encoder = OneHotEncodingTransformer(columns="b")
    encoder.fit(d.create_df_7())

    actual = encoder.transform(df_test)

    h.assert_frame_equal_msg(
        expected=expected,
        actual=actual,
        msg_tag="Unspecified columns changed in transform",
    )
def test_renaming_feature_works_as_expected(self):
    """Test that OneHotEncodingTransformer.transform names output columns using the separator."""
    # Only columns "b" and "c" are kept, and the originals are dropped, so the
    # output column list consists solely of the generated dummy columns.
    df = d.create_df_7()[["b", "c"]]

    encoder = OneHotEncodingTransformer(
        columns=["b", "c"],
        separator="|",
        drop_original=True,
    )
    encoder.fit(df)
    output = encoder.transform(df)

    expected_names = ["b|x", "b|y", "b|z", "c|a", "c|b", "c|c"]

    h.assert_equal_dispatch(
        expected=expected_names,
        actual=list(output.columns.values),
        msg="renaming columns feature in OneHotEncodingTransformer.transform",
    )
class TestTransform:
    """Tests for the transform method on CrossColumnMappingTransformer."""

    # The expected_df_* builders take no ``self``: they are called directly in
    # the class body by the parametrize decorators further down.

    def expected_df_1():
        """Expected output for test_expected_output."""
        return pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5, 6],
                "b": ["aa", "bb", "cc", "dd", "ee", "ff"],
            }
        )

    def expected_df_2():
        """Expected output for test_non_specified_values_unchanged."""
        return pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5, 6],
                "b": ["aa", "bb", "cc", "d", "e", "f"],
            }
        )

    def expected_df_3():
        """Expected output for test_multiple_mappings_ordered_dict."""
        return pd.DataFrame(
            {
                "a": [4, 2, 2, 1, 3],
                "b": ["x", "z", "y", "x", "x"],
                "c": ["cc", "dd", "bb", "cc", "cc"],
            }
        )

    def test_arguments(self):
        """Test that transform has the expected arguments."""
        h.test_function_arguments(
            func=CrossColumnMappingTransformer.transform,
            expected_arguments=["self", "X"],
            expected_default_values=None,
        )

    def test_check_is_fitted_call(self, mocker):
        """Test the call to check_is_fitted."""
        input_df = d.create_df_1()
        transformer = CrossColumnMappingTransformer(
            mappings={"a": {1: "a", 2: "b", 3: "c", 4: "d", 5: "e", 6: "f"}},
            adjust_column="b",
        )

        # Call 0: the attribute list ["adjust_column"] as the only positional arg.
        expected_call_args = {0: {"args": (["adjust_column"],), "kwargs": {}}}

        with h.assert_function_call(
            mocker,
            tubular.base.BaseTransformer,
            "check_is_fitted",
            expected_call_args,
        ):
            transformer.transform(input_df)

    def test_super_transform_call(self, mocker):
        """Test the call to BaseMappingTransformer.transform."""
        input_df = d.create_df_1()
        transformer = CrossColumnMappingTransformer(
            mappings={"a": {1: "aa", 2: "bb", 3: "cc", 4: "dd", 5: "ee", 6: "ff"}},
            adjust_column="b",
        )

        # Call 0: a freshly built copy of the input frame, no keyword arguments.
        expected_call_args = {0: {"args": (d.create_df_1(),), "kwargs": {}}}

        with h.assert_function_call(
            mocker,
            tubular.mapping.BaseMappingTransformer,
            "transform",
            expected_call_args,
            return_value=d.create_df_1(),
        ):
            transformer.transform(input_df)

    def test_adjust_col_not_in_x_error(self):
        """Test that an exception is raised if adjust_column is not present in the dataframe."""
        input_df = d.create_df_1()
        transformer = CrossColumnMappingTransformer(
            mappings={"a": {1: "aa", 2: "bb", 3: "cc", 4: "dd", 5: "ee", 6: "ff"}},
            adjust_column="c",
        )

        with pytest.raises(ValueError, match="variable c is not in X"):
            transformer.transform(input_df)

    @pytest.mark.parametrize(
        "df, expected",
        h.row_by_row_params(d.create_df_1(), expected_df_1())
        + h.index_preserved_params(d.create_df_1(), expected_df_1()),
    )
    def test_expected_output(self, df, expected):
        """Test that transform gives the expected output."""
        transformer = CrossColumnMappingTransformer(
            mappings={"a": {1: "aa", 2: "bb", 3: "cc", 4: "dd", 5: "ee", 6: "ff"}},
            adjust_column="b",
        )

        actual = transformer.transform(df)

        h.assert_frame_equal_msg(
            actual=actual,
            expected=expected,
            msg_tag="expected output from cross column mapping transformer",
        )

    @pytest.mark.parametrize(
        "df, expected",
        h.row_by_row_params(d.create_df_1(), expected_df_2())
        + h.index_preserved_params(d.create_df_1(), expected_df_2()),
    )
    def test_non_specified_values_unchanged(self, df, expected):
        """Test that values not specified in mappings are left unchanged by transform."""
        # Only a = 1, 2, 3 are mapped; the other rows should keep their "b" value.
        transformer = CrossColumnMappingTransformer(
            mappings={"a": {1: "aa", 2: "bb", 3: "cc"}},
            adjust_column="b",
        )

        actual = transformer.transform(df)

        h.assert_frame_equal_msg(
            actual=actual,
            expected=expected,
            msg_tag="expected output from cross column mapping transformer",
        )

    @pytest.mark.parametrize(
        "df, expected",
        h.row_by_row_params(d.create_df_7(), expected_df_3())
        + h.index_preserved_params(d.create_df_7(), expected_df_3()),
    )
    def test_multiple_mappings_ordered_dict(self, df, expected):
        """Test that mapping by multiple columns via an OrderedDict gives the expected output."""
        # Insertion order matters here: the "a" mapping is added before "b".
        ordered_mappings = OrderedDict()
        ordered_mappings["a"] = {1: "aa", 2: "bb"}
        ordered_mappings["b"] = {"x": "cc", "z": "dd"}

        transformer = CrossColumnMappingTransformer(
            mappings=ordered_mappings,
            adjust_column="c",
        )

        actual = transformer.transform(df)

        h.assert_frame_equal_msg(
            actual=actual,
            expected=expected,
            msg_tag="expected output from cross column mapping transformer",
        )

    def test_mappings_unchanged(self):
        """Test that the mappings attribute is unchanged by transform."""
        input_df = d.create_df_1()
        mapping = {"a": {1: "aa", 2: "bb", 3: "cc", 4: "dd", 5: "ee", 6: "ff"}}
        transformer = CrossColumnMappingTransformer(
            mappings=mapping,
            adjust_column="b",
        )

        transformer.transform(input_df)

        # NOTE(review): `mapping` may be the very object stored on the
        # transformer; if so, an in-place change would go unnoticed - confirm.
        h.assert_equal_dispatch(
            expected=mapping,
            actual=transformer.mappings,
            msg="CrossColumnMappingTransformer.transform has changed self.mappings unexpectedly",
        )
class TestTransform(object):
    """Tests for SeriesStrMethodTransformer.transform()."""

    # The expected_df_* builders take no ``self``: they are called directly in
    # the class body by the parametrize decorators further down.

    def expected_df_1():
        """Expected output of test_expected_output_no_overwrite."""
        df = d.create_df_7()
        df["b_new"] = df["b"].str.find(sub="a")
        return df

    def expected_df_2():
        """Expected output of test_expected_output_overwrite."""
        df = d.create_df_7()
        df["b"] = df["b"].str.pad(width=10)
        return df

    def test_arguments(self):
        """Test that transform has expected arguments."""
        h.test_function_arguments(
            func=SeriesStrMethodTransformer.transform,
            expected_arguments=["self", "X"],
        )

    def test_super_transform_called(self, mocker):
        """Test that BaseTransformer.transform called."""
        df = d.create_df_7()
        x = SeriesStrMethodTransformer(
            new_column_name="cc",
            pd_method_name="find",
            columns=["c"],
        )

        # Call 0: a freshly built copy of the input frame, no keyword arguments.
        expected_call_args = {0: {"args": (d.create_df_7(),), "kwargs": {}}}

        with h.assert_function_call(
            mocker,
            tubular.base.BaseTransformer,
            "transform",
            expected_call_args,
        ):
            x.transform(df)

    @pytest.mark.parametrize(
        "df, expected",
        h.row_by_row_params(d.create_df_7(), expected_df_1())
        + h.index_preserved_params(d.create_df_7(), expected_df_1()),
    )
    def test_expected_output_no_overwrite(self, df, expected):
        """Test a single column output from transform gives expected results, when not overwriting the original column."""
        x = SeriesStrMethodTransformer(
            new_column_name="b_new",
            pd_method_name="find",
            columns=["b"],
            pd_method_kwargs={"sub": "a"},
        )

        df_transformed = x.transform(df)

        h.assert_frame_equal_msg(
            actual=df_transformed,
            expected=expected,
            msg_tag="Unexpected values in SeriesStrMethodTransformer.transform with find, not overwriting original column",
        )

    @pytest.mark.parametrize(
        "df, expected",
        h.row_by_row_params(d.create_df_7(), expected_df_2())
        + h.index_preserved_params(d.create_df_7(), expected_df_2()),
    )
    def test_expected_output_overwrite(self, df, expected):
        """Test a single column output from transform gives expected results, when overwriting the original column."""
        x = SeriesStrMethodTransformer(
            new_column_name="b",
            pd_method_name="pad",
            columns=["b"],
            pd_method_kwargs={"width": 10},
        )

        df_transformed = x.transform(df)

        h.assert_frame_equal_msg(
            actual=df_transformed,
            expected=expected,
            msg_tag="Unexpected values in SeriesStrMethodTransformer.transform with pad, overwriting original column",
        )

    @pytest.mark.parametrize(
        "df, new_column_name, pd_method_name, columns, pd_method_kwargs",
        [
            (d.create_df_7(), "b_new", "find", ["b"], {"sub": "a"}),
            (
                d.create_df_7(),
                "c_slice",
                "slice",
                ["c"],
                {"start": 0, "stop": 1, "step": 1},
            ),
            (d.create_df_7(), "b_upper", "upper", ["b"], {}),
        ],
    )
    def test_pandas_method_called(
        self, mocker, df, new_column_name, pd_method_name, columns, pd_method_kwargs
    ):
        """Test that the pandas.Series.str method is called as expected (with kwargs passed) during transform."""
        spy = mocker.spy(pd.Series.str, pd_method_name)

        x = SeriesStrMethodTransformer(
            new_column_name=new_column_name,
            pd_method_name=pd_method_name,
            columns=columns,
            pd_method_kwargs=pd_method_kwargs,
        )

        x.transform(df)

        # Take the first recorded call and pull out its keyword arguments
        # (element 1 of the call record).
        call_args = spy.call_args_list[0]
        call_kwargs = call_args[1]

        # The keyword arguments should be exactly the configured pd_method_kwargs.
        h.assert_dict_equal_msg(
            actual=call_kwargs,
            expected=pd_method_kwargs,
            msg_tag=f"""Keyword arg assert for {pd_method_name}""",
        )

    def test_attributes_unchanged_by_transform(self):
        """Test that attributes set in init are unchanged by the transform method.

        Two identically configured transformers are built; only the first is
        run through transform, then its init attributes are compared against
        the untouched second instance.
        """
        df = d.create_df_7()

        # Each instance gets its own kwargs literal so the two transformers
        # never share a mutable dict (sharing would hide an in-place change).
        x = SeriesStrMethodTransformer(
            new_column_name="b",
            pd_method_name="pad",
            columns=["b"],
            pd_method_kwargs={"width": 10},
        )
        x2 = SeriesStrMethodTransformer(
            new_column_name="b",
            pd_method_name="pad",
            columns=["b"],
            pd_method_kwargs={"width": 10},
        )

        x.transform(df)

        # Failure messages name the class actually under test
        # (SeriesStrMethodTransformer, not SeriesDtMethodTransformer).
        assert (
            x.new_column_name == x2.new_column_name
        ), "new_column_name changed by SeriesStrMethodTransformer.transform"

        assert (
            x.pd_method_name == x2.pd_method_name
        ), "pd_method_name changed by SeriesStrMethodTransformer.transform"

        assert (
            x.columns == x2.columns
        ), "columns changed by SeriesStrMethodTransformer.transform"

        assert (
            x.pd_method_kwargs == x2.pd_method_kwargs
        ), "pd_method_kwargs changed by SeriesStrMethodTransformer.transform"
class TestTransform(object):
    """Tests for OneHotEncodingTransformer.transform()."""

    # The expected_df_* builders take no ``self``: they are called directly in
    # the class body by the parametrize decorators further down.

    def expected_df_1():
        """Expected output for test_expected_output."""
        df = pd.DataFrame(
            {
                "a": [4, 2, 2, 1, 3],
                "b": ["x", "z", "y", "x", "x"],
                "c": ["c", "a", "a", "c", "b"],
            }
        )
        df["c"] = df["c"].astype("category")

        # One float indicator column per level of "b".
        df["b_x"] = [1.0, 0.0, 0.0, 1.0, 1.0]
        df["b_y"] = [0.0, 0.0, 1.0, 0.0, 0.0]
        df["b_z"] = [0.0, 1.0, 0.0, 0.0, 0.0]

        return df

    def expected_df_2():
        """Expected output for test_unseen_categories_encoded_as_all_zeroes."""
        df = pd.DataFrame(
            {
                "a": [1, 5, 2, 3, 3],
                "b": ["w", "w", "z", "y", "x"],
                "c": ["a", "a", "c", "b", "a"],
            },
            index=[10, 15, 200, 251, 59],
        )
        df["c"] = df["c"].astype("category")

        # Rows with a == 5 or b == "w" have all-zero indicators for that column.
        df["a_1"] = [1.0, 0.0, 0.0, 0.0, 0.0]
        df["a_2"] = [0.0, 0.0, 1.0, 0.0, 0.0]
        df["a_3"] = [0.0, 0.0, 0.0, 1.0, 1.0]
        df["a_4"] = [0.0, 0.0, 0.0, 0.0, 0.0]
        df["b_x"] = [0.0, 0.0, 0.0, 0.0, 1.0]
        df["b_y"] = [0.0, 0.0, 0.0, 1.0, 0.0]
        df["b_z"] = [0.0, 0.0, 1.0, 0.0, 0.0]
        df["c_a"] = [1.0, 1.0, 0.0, 0.0, 1.0]
        df["c_b"] = [0.0, 0.0, 0.0, 1.0, 0.0]
        df["c_c"] = [0.0, 0.0, 1.0, 0.0, 0.0]

        return df

    def test_arguments(self):
        """Test that transform has expected arguments."""
        h.test_function_arguments(
            func=OneHotEncodingTransformer.transform,
            expected_arguments=["self", "X"],
        )

    def test_columns_check_call(self, mocker):
        """Test the first call to BaseTransformer columns_check."""
        df = d.create_df_1()
        x = OneHotEncodingTransformer(columns="b")
        x.fit(df)

        # Call 0: a freshly built copy of the input frame, no keyword arguments.
        expected_call_args = {0: {"args": (d.create_df_1(),), "kwargs": {}}}

        with h.assert_function_call(
            mocker,
            tubular.base.BaseTransformer,
            "columns_check",
            expected_call_args,
        ):
            x.transform(df)

    def test_check_is_fitted_first_call(self, mocker):
        """Test the calls to BaseTransformer check_is_fitted."""
        df = d.create_df_1()
        x = OneHotEncodingTransformer(columns="b")
        x.fit(df)

        # Two calls are described: first with ["separator"], then ["drop_original"].
        expected_call_args = {
            0: {"args": (["separator"],), "kwargs": {}},
            1: {"args": (["drop_original"],), "kwargs": {}},
        }

        with h.assert_function_call(
            mocker,
            tubular.base.BaseTransformer,
            "check_is_fitted",
            expected_call_args,
        ):
            x.transform(df)

    def test_non_numeric_column_error_1(self):
        """Test that transform will raise an error if a column to transform has nulls."""
        df_train = d.create_df_1()
        df_test = d.create_df_2()

        x = OneHotEncodingTransformer(columns=["b"])
        x.fit(df_train)

        with pytest.raises(
            ValueError, match="column b has nulls - replace before proceeding"
        ):
            x.transform(df_test)

    def test_base_nominal_transformer_transform_called(self, mocker):
        """Test that BaseNominalTransformer.transform called."""
        df = d.create_df_1()
        x = OneHotEncodingTransformer(columns="b")
        x.fit(df)

        mocker.patch(
            "tubular.nominal.BaseNominalTransformer.transform",
            return_value=d.create_df_1(),
        )

        x.transform(df)

        assert (
            tubular.nominal.BaseNominalTransformer.transform.call_count == 1
        ), f"Not enough calls to BaseNominalTransformer.transform -\n Expected: 1\n Actual: {tubular.nominal.BaseNominalTransformer.transform.call_count}"

        # Split the single recorded call into positional and keyword arguments.
        call_args = tubular.nominal.BaseNominalTransformer.transform.call_args_list[0]
        call_pos_args = call_args[0]
        call_kwargs = call_args[1]

        h.assert_equal_dispatch(
            expected={},
            actual=call_kwargs,
            msg="kwargs for BaseNominalTransformer.transform in OneHotEncodingTransformer.transform",
        )

        # Expected positional args: the transformer itself, then the input frame.
        expected_pos_args = (x, d.create_df_1())

        assert (
            len(call_pos_args) == 2
        ), f"Unexpected number of positional args in BaseNominalTransformer.transform call -\n Expected: 2\n Actual: {len(call_pos_args)}"

        h.assert_frame_equal_msg(
            expected=expected_pos_args[1],
            actual=call_pos_args[1],
            msg_tag="X positional arg in BaseNominalTransformer.transform call",
        )

        assert (
            expected_pos_args[0] == call_pos_args[0]
        ), "self positional arg in BaseNominalTransformer.transform call"

    def test_one_hot_encoder_transform_called(self, mocker):
        """Test that OneHotEncoder.transform called."""
        df = d.create_df_1()
        x = OneHotEncodingTransformer(columns="b")
        x.fit(df)

        mocker.patch("sklearn.preprocessing.OneHotEncoder.transform")

        x.transform(df)

        assert (
            sklearn.preprocessing.OneHotEncoder.transform.call_count == 1
        ), f"Not enough calls to OneHotEncoder.transform -\n Expected: 1\n Actual: {sklearn.preprocessing.OneHotEncoder.transform.call_count}"

        # Split the single recorded call into positional and keyword arguments.
        call_args = sklearn.preprocessing.OneHotEncoder.transform.call_args_list[0]
        call_pos_args = call_args[0]
        call_kwargs = call_args[1]

        h.assert_equal_dispatch(
            expected={},
            actual=call_kwargs,
            msg="kwargs for OneHotEncoder.transform in OneHotEncodingTransformer.transform",
        )

        assert (
            len(call_pos_args) == 2
        ), f"Unexpected number of positional args in OneHotEncoder.transform call -\n Expected: 2\n Actual: {len(call_pos_args)}"

        # Positional index 0 is the transformer instance passed as self.
        assert (
            call_pos_args[0] is x
        ), f"Unexpected positional arg (self, index 0) in OneHotEncoder.transform call -\n Expected: self\n Actual: {call_pos_args[0]}"

        # Positional index 1 is expected to be just the "b" column of the input.
        h.assert_frame_equal_msg(
            expected=d.create_df_1()[["b"]],
            actual=call_pos_args[1],
            msg_tag="X positional arg in OneHotEncoder.transform call",
        )

    @pytest.mark.parametrize(
        "df_test, expected",
        h.row_by_row_params(d.create_df_7(), expected_df_1())
        + h.index_preserved_params(d.create_df_7(), expected_df_1()),
    )
    def test_expected_output(self, df_test, expected):
        """Test that OneHotEncodingTransformer.transform encodes the feature correctly.

        Also tests that OneHotEncodingTransformer.transform does not modify unrelated columns.
        """
        # transformer is fit on the whole dataset separately from the input df to work with the decorators
        df_train = d.create_df_7()
        x = OneHotEncodingTransformer(columns="b")
        x.fit(df_train)

        df_transformed = x.transform(df_test)

        h.assert_frame_equal_msg(
            expected=expected,
            actual=df_transformed,
            msg_tag="Unspecified columns changed in transform",
        )

    def test_categories_not_modified(self):
        """Test that the categories from fit are not changed in transform."""
        df_train = d.create_df_1()
        df_test = d.create_df_7()

        # x is fit and used to transform; x2 is only fit and acts as reference.
        x = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)
        x2 = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)

        x.fit(df_train)
        x2.fit(df_train)

        x.transform(df_test)

        h.assert_equal_dispatch(
            expected=list(x2.categories_[0]),
            actual=list(x.categories_[0]),
            msg="categories_ (index 0) modified during transform",
        )

        h.assert_equal_dispatch(
            expected=list(x2.categories_[1]),
            actual=list(x.categories_[1]),
            msg="categories_ (index 1) modified during transform",
        )

    def test_renaming_feature_works_as_expected(self):
        """Test OneHotEncodingTransformer.transform() is renaming features correctly."""
        df = d.create_df_7()
        df = df[["b", "c"]]

        x = OneHotEncodingTransformer(
            columns=["b", "c"], separator="|", drop_original=True
        )
        x.fit(df)

        df_transformed = x.transform(df)

        h.assert_equal_dispatch(
            expected=["b|x", "b|y", "b|z", "c|a", "c|b", "c|c"],
            actual=list(df_transformed.columns.values),
            msg="renaming columns feature in OneHotEncodingTransformer.transform",
        )

    def test_warning_generated_by_unseen_categories(self):
        """Test OneHotEncodingTransformer.transform triggers a warning for unseen categories."""
        df_train = d.create_df_7()
        df_test = d.create_df_8()

        x = OneHotEncodingTransformer(verbose=True)
        x.fit(df_train)

        with pytest.warns(Warning):
            x.transform(df_test)

    @pytest.mark.parametrize(
        "df_test, expected",
        h.row_by_row_params(d.create_df_8(), expected_df_2())
        + h.index_preserved_params(d.create_df_8(), expected_df_2()),
    )
    def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected):
        """Test OneHotEncodingTransformer.transform encodes unseen categories correctly (all 0s)."""
        # transformer is fit on the whole dataset separately from the input df to work with the decorators
        df_train = d.create_df_7()
        x = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=False)
        x.fit(df_train)

        df_transformed = x.transform(df_test)

        h.assert_equal_dispatch(
            expected=expected,
            actual=df_transformed,
            msg="unseen category rows not encoded as 0s",
        )

    def test_original_columns_dropped_when_specified(self):
        """Test OneHotEncodingTransformer.transform drops original columns when specified."""
        df = d.create_df_7()

        x = OneHotEncodingTransformer(columns=["a", "b", "c"], drop_original=True)
        x.fit(df)

        df_transformed = x.transform(df)

        # Input columns that are absent from the output; the loop variable is
        # named `col` so it does not reuse the transformer's name `x`.
        h.assert_equal_dispatch(
            expected=["a", "b", "c"],
            actual=[
                col
                for col in df.columns.values
                if col not in df_transformed.columns.values
            ],
            msg="original columns not dropped",
        )

    def test_original_columns_kept_when_specified(self):
        """Test OneHotEncodingTransformer.transform keeps original columns when specified."""
        df = d.create_df_7()

        x = OneHotEncodingTransformer(drop_original=False)
        x.fit(df)

        df_transformed = x.transform(df)

        # None of the original columns should be missing from the output.
        h.assert_equal_dispatch(
            expected=[],
            actual=list({"a", "b", "c"} - set(df_transformed.columns)),
            msg="original columns not kept",
        )