def test_fillna_adds_is_na_column_when_imputing(self):
    """Check that ``indicate_nan=True`` adds ``<column>_is_nan`` indicator columns.

    A frame with a single missing ``sales`` value is imputed with the mean
    strategy. The result must carry the original columns followed by one
    indicator column per input column, and the ``sales`` indicator must flag
    only the row that was missing.
    """
    df = pd.DataFrame({"id": [1, 2, 3, 4], "sales": [2000, 3000, 4000, np.nan]})
    fill_na = FillNA(strategy="mean", indicate_nan=True)
    expected_cols = ["id", "sales", "id_is_nan", "sales_is_nan"]

    result = fill_na.fit_transform(df)

    assert result.columns.tolist() == expected_cols
    # Must be `np.all(...)` with a dot: `assert np, all(...)` asserts the
    # (always truthy) module and treats the comparison as an unevaluated
    # failure message, so it could never fail. The column name follows
    # `expected_cols` ("sales_is_nan").
    assert np.all(result["sales_is_nan"] == [0, 0, 0, 1])
def test_imputer_returns_dataframe_unchanged_if_no_nans(categorical):
    """Check that a constant fill value is not introduced into ``categorical``.

    The result must be a DataFrame with the same number of rows and exactly
    the ``category_a`` / ``category_b`` columns, and the fill value
    ``'Unknown'`` must not appear anywhere in it.

    NOTE(review): assumes the ``categorical`` fixture contains no missing
    values, as the test name suggests -- confirm against the fixture.
    """
    imputer = FillNA('Unknown')
    result = imputer.fit_transform(categorical)

    assert isinstance(result, pd.DataFrame)
    assert len(categorical) == len(result)
    assert {'category_a', 'category_b'} == set(result.columns)
    # Use logical `not` rather than bitwise `~`: `~` only behaves as negation
    # for numpy booleans. On a plain Python bool, `~True == -2` and
    # `~False == -1` are both truthy, which would make the check vacuous.
    assert not result.isin(['Unknown']).any().any()
def test_imputer_returns_correct_dataframe_max(numerical_na):
    """Fit ``FillNA(strategy='max')`` on ``numerical_na`` and verify the output.

    Verifies the result type, row count and column set, then spot-checks two
    cells against the values the test expects after imputation.
    """
    transformed = FillNA(strategy='max').fit_transform(numerical_na)

    assert isinstance(transformed, pd.DataFrame)
    assert len(transformed) == len(numerical_na)
    assert set(transformed.columns) == {'number_a', 'number_b'}
    # Presumably these two cells are missing in the fixture and get filled
    # with their column's maximum -- verify against the fixture definition.
    assert transformed.loc[0, "number_a"] == 4
    assert transformed.loc[3, "number_b"] == 7
def test_imputer_returns_correct_dataframe(categorical_na):
    """Fit ``FillNA('Unknown')`` on ``categorical_na`` and verify the output.

    Verifies the result type, row count and column set, then checks that the
    fill value ``'Unknown'`` shows up at two specific positions.
    """
    filled = FillNA('Unknown').fit_transform(categorical_na)

    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical_na)
    assert set(filled.columns) == {'category_a', 'category_b'}
    # Positional lookups (row, column); presumably these are the cells that
    # are missing in the fixture -- verify against the fixture definition.
    assert filled.iloc[0, 1] == 'Unknown'
    assert filled.iloc[1, 0] == 'Unknown'
def test_fill_na_imputes_numerical_na_correct(
    self,
    numerical_na: pd.DataFrame,
    value: None,
    strategy: str,
    expected: pd.DataFrame,
):
    """Impute ``numerical_na`` with the given settings and compare frames.

    ``value``, ``strategy`` and ``expected`` are supplied from outside this
    function (presumably via parametrization -- the decorator is not visible
    here). The imputed frame must equal ``expected`` exactly.
    """
    actual = FillNA(value=value, strategy=strategy).fit_transform(numerical_na)
    pd.testing.assert_frame_equal(actual, expected)
def test_imputer_returns_correct_dataframe_most_freq(categorical):
    """Fit ``FillNA(strategy='most_freq')`` on a modified ``categorical`` frame.

    Two cells are blanked out and one is overwritten with ``"b3"`` before
    fitting; the test then verifies the result type, row count, column set
    and the values expected in the two blanked cells.
    """
    # Cell edits applied to the fixture, in order: ((row, column), new value).
    cell_edits = (
        ((1, "category_a"), np.nan),
        ((0, "category_b"), np.nan),
        ((1, "category_b"), "b3"),
    )
    for (row_label, column), new_value in cell_edits:
        categorical.loc[row_label, column] = new_value

    filled = FillNA(strategy='most_freq').fit_transform(categorical)

    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical)
    assert set(filled.columns) == {'category_a', 'category_b'}
    assert filled.loc[1, "category_a"] == 'a1'
    assert filled.loc[0, "category_b"] == 'b3'
def test_fillna_raises_when_imputing_numerically_on_strings(self):
    """Check that the mean strategy rejects frames containing string columns.

    ``fit_transform`` must raise ``TransformerError`` with the invalid-types
    message both for a frame with one string column and again after a second
    string column has been added.
    """
    error_pattern = "column/columns have invalid types for strategy = mean"
    frame = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "status": ["OK", "Error", "OK", "Error"],
            "sales": [2000, 3000, 4000, np.nan],
        }
    )
    mean_imputer = FillNA(strategy="mean")

    # One offending (string) column.
    with pytest.raises(TransformerError, match=error_pattern):
        mean_imputer.fit_transform(frame)

    # Two offending (string) columns.
    frame["new_col"] = ["One", "Two", "Three", "Four"]
    with pytest.raises(TransformerError, match=error_pattern):
        mean_imputer.fit_transform(frame)
def test_fillna_imputes_pandas_categorical_correct(
    self,
    value: Any,
    strategy: Any,
    expected: pd.DataFrame,
    categorical_na: pd.DataFrame,
):
    """Impute ``categorical_na`` after casting its columns to ``category``.

    Both columns are converted to the pandas ``category`` dtype before
    fitting. The result is compared to ``expected`` with
    ``check_categorical=False``. ``value``, ``strategy`` and ``expected`` are
    supplied from outside this function (presumably via parametrization --
    the decorator is not visible here).
    """
    for column in ("category_a", "category_b"):
        categorical_na[column] = categorical_na[column].astype("category")

    actual = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)
    pd.testing.assert_frame_equal(actual, expected, check_categorical=False)
def test_imputer_with_both_raises_error(numerical_na):
    """Check that passing both ``value`` and ``strategy`` fails on fit.

    The transformer is constructed outside the ``raises`` block, so only
    ``fit_transform`` is expected to raise ``TransformerError``.
    """
    conflicting_imputer = FillNA(value=0, strategy='mean')
    with pytest.raises(TransformerError):
        conflicting_imputer.fit_transform(numerical_na)
def test_imputer_with_none_raises_error(numerical_na):
    """Check that a ``FillNA`` built with no arguments fails on fit.

    The transformer is constructed outside the ``raises`` block, so only
    ``fit_transform`` is expected to raise ``TransformerError``.
    """
    unconfigured_imputer = FillNA()
    with pytest.raises(TransformerError):
        unconfigured_imputer.fit_transform(numerical_na)
def test_fillna_imputes_categorical_na_correct(
    self,
    categorical_na: pd.DataFrame,
    value: Any,
    strategy: Any,
    expected: Any,
):
    """Impute ``categorical_na`` with the given settings and compare frames.

    ``value``, ``strategy`` and ``expected`` are supplied from outside this
    function (presumably via parametrization -- the decorator is not visible
    here). The imputed frame must equal ``expected`` exactly.
    """
    actual = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)
    pd.testing.assert_frame_equal(actual, expected)
def test_fillna_returns_dataframe_unchanged_if_no_nans(
    self,
    categorical: pd.DataFrame,
):
    """Check that ``FillNA("Unknown")`` returns a frame equal to its input.

    NOTE(review): assumes the ``categorical`` fixture contains no missing
    values, as the test name suggests -- confirm against the fixture.
    """
    transformed = FillNA("Unknown").fit_transform(categorical)
    pd.testing.assert_frame_equal(transformed, categorical)