def test_fillna_adds_is_na_column_when_imputing(self):
    """FillNA with ``indicate_nan=True`` appends a ``<column>_is_nan`` flag per column.

    The input has a single missing value (last row of ``sales``), so only
    the last entry of ``sales_is_nan`` is expected to be set.
    """
    df = pd.DataFrame({"id": [1, 2, 3, 4], "sales": [2000, 3000, 4000, np.nan]})
    fill_na = FillNA(strategy="mean", indicate_nan=True)
    expected_cols = ["id", "sales", "id_is_nan", "sales_is_nan"]

    result = fill_na.fit_transform(df)

    assert result.columns.tolist() == expected_cols
    # Must be `np.all(...)`: `assert np, all(...)` only checks that the numpy
    # module is truthy and never evaluates the comparison. The indicator
    # column is named "sales_is_nan", matching `expected_cols` above.
    assert np.all(result["sales_is_nan"] == [0, 0, 0, 1])
def test_imputer_returns_dataframe_unchanged_if_no_nans(categorical):
    """Filling with a constant must not inject the fill value when nothing is missing.

    Uses the ``categorical`` fixture, which is expected to contain no
    missing values (per the test name).
    """
    imputer = FillNA('Unknown')
    result = imputer.fit_transform(categorical)

    assert isinstance(result, pd.DataFrame)
    assert len(categorical) == len(result)
    assert {'category_a', 'category_b'} == set(result.columns)
    # Use `not`, not `~`: bitwise inversion of a plain Python bool yields
    # -1 / -2 (both truthy), so the assertion could never fail in that case.
    # `not` is correct for both `bool` and `numpy.bool_`.
    assert not result.isin(['Unknown']).any().any()
def test_imputer_returns_correct_dataframe(categorical_na):
    """Constant fill: shape and columns are kept and the checked cells read 'Unknown'."""
    filled = FillNA("Unknown").fit_transform(categorical_na)

    # Container type, row count and column set are preserved.
    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical_na)
    assert set(filled.columns) == {"category_a", "category_b"}

    # The two checked cells now hold the fill value.
    assert filled.iloc[0, 1] == "Unknown"
    assert filled.iloc[1, 0] == "Unknown"
def test_imputer_returns_correct_dataframe_max(numerical_na):
    """``strategy='max'``: shape and columns are kept and the checked cells read 4 and 7."""
    filled = FillNA(strategy="max").fit_transform(numerical_na)

    # Container type, row count and column set are preserved.
    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(numerical_na)
    assert set(filled.columns) == {"number_a", "number_b"}

    # Expected imputed values for the two checked cells.
    assert filled.loc[0, "number_a"] == 4
    assert filled.loc[3, "number_b"] == 7
def test_fill_na_imputes_numerical_na_correct(
    self,
    numerical_na: pd.DataFrame,
    value: None,
    strategy: str,
    expected: pd.DataFrame,
):
    """Check that FillNA built from ``value``/``strategy`` maps ``numerical_na`` to ``expected``."""
    transformed = FillNA(value=value, strategy=strategy).fit_transform(numerical_na)
    pd.testing.assert_frame_equal(transformed, expected)
def test_imputer_returns_correct_dataframe_most_freq(categorical):
    """``strategy='most_freq'``: the blanked cells are expected to come back as 'a1' and 'b3'."""
    # Prepare the fixture in place: blank two cells and overwrite a third.
    categorical.loc[1, "category_a"] = np.nan
    categorical.loc[0, "category_b"] = np.nan
    categorical.loc[1, "category_b"] = "b3"

    filled = FillNA(strategy="most_freq").fit_transform(categorical)

    # Container type, row count and column set are preserved.
    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical)
    assert set(filled.columns) == {"category_a", "category_b"}

    # The blanked cells received the expected replacement values.
    assert filled.loc[1, "category_a"] == "a1"
    assert filled.loc[0, "category_b"] == "b3"
def test_fillna_imputes_pandas_categorical_correct(
    self,
    value: Any,
    strategy: Any,
    expected: pd.DataFrame,
    categorical_na: pd.DataFrame,
):
    """Check FillNA on columns that use pandas' ``category`` dtype."""
    # Cast both fixture columns to the categorical dtype before imputing.
    for column in ("category_a", "category_b"):
        categorical_na[column] = categorical_na[column].astype("category")

    filled = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)

    # check_categorical=False: the internal Categorical representation is not
    # required to match exactly.
    pd.testing.assert_frame_equal(filled, expected, check_categorical=False)
def test_fillna_raises_when_imputing_numerically_on_strings(self):
    """``strategy='mean'`` must raise ``TransformerError`` when string columns are present.

    Checked twice: with one string column, and again after adding a second.
    """
    expected_message = "column/columns have invalid types for strategy = mean"
    df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "status": ["OK", "Error", "OK", "Error"],
            "sales": [2000, 3000, 4000, np.nan],
        }
    )
    fill_na = FillNA(strategy="mean")

    # One offending (string) column.
    with pytest.raises(TransformerError, match=expected_message):
        fill_na.fit_transform(df)

    # Two offending columns: same error type and message.
    df["new_col"] = ["One", "Two", "Three", "Four"]
    with pytest.raises(TransformerError, match=expected_message):
        fill_na.fit_transform(df)
"""
host_response_time
==================
How long it takes the host to accept or decline an offer.

Hypothesis: this could be an indicator of "seriousness", which could affect
the price.

Categorical with 4 levels

dtype: category
"""
from ml_tooling.transformers import Select, FillNA, ToCategorical
from sklearn.pipeline import Pipeline

# Select the column, replace missing values with "unknown" (indicate_nan=True
# is passed to FillNA as well), then convert with ToCategorical.
host_response_time = Pipeline(
    steps=[
        ("select", Select("host_response_time")),
        ("fill_na", FillNA("unknown", indicate_nan=True)),
        ("categorical", ToCategorical()),
    ]
)
def pipeline(self):
    """Return a one-step ``Pipeline`` whose only step, "fillna", is ``FillNA(0)``."""
    steps = [("fillna", FillNA(0))]
    return Pipeline(steps)
def test_imputer_with_both_raises_error(numerical_na):
    """Passing both ``value`` and ``strategy`` must raise ``TransformerError`` on fit_transform."""
    # Built outside the `raises` block: the error is expected from
    # fit_transform, not from the constructor.
    conflicting = FillNA(value=0, strategy="mean")

    with pytest.raises(TransformerError):
        conflicting.fit_transform(numerical_na)
def test_imputer_with_none_raises_error(numerical_na):
    """A FillNA built with no arguments must raise ``TransformerError`` on fit_transform."""
    # Built outside the `raises` block: the error is expected from
    # fit_transform, not from the constructor.
    unconfigured = FillNA()

    with pytest.raises(TransformerError):
        unconfigured.fit_transform(numerical_na)
"""
security_deposit
================
What the security deposit is. Might be related to the price.

Is a float, but is prepended with `$`. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Select the column and replace missing values with 0 (indicate_nan=True is
# passed to FillNA as well).
security_deposit = Pipeline(
    steps=[
        ("select", Select("security_deposit")),
        ("fill_na", FillNA(0, indicate_nan=True)),
    ]
)
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Select the column and replace missing values with 127 (indicate_nan=True is
# passed to FillNA as well).
# NOTE(review): 127 looks like an out-of-range sentinel for a rate - confirm.
host_acceptance_rate = Pipeline(
    steps=[
        ("select", Select("host_acceptance_rate")),
        ("fill_na", FillNA(127, indicate_nan=True)),
    ]
)
"""
house_rules
===========
What rules the guest must follow. We would try to extract some simple rules,
such as whether smoking is allowed or similar.

House rules will be used to extract features from, such as `is_no_smoking`
or something indicating a lot of rules.

dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Select the `house_rules_len` column and replace missing values with 0.
house_rules_len = Pipeline(
    steps=[
        ("select", Select("house_rules_len")),
        ("fill_na", FillNA(0)),
    ]
)
def test_fillna_imputes_categorical_na_correct(
    self,
    categorical_na: pd.DataFrame,
    value: Any,
    strategy: Any,
    expected: Any,
):
    """Check that FillNA built from ``value``/``strategy`` maps ``categorical_na`` to ``expected``."""
    filled = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)
    pd.testing.assert_frame_equal(filled, expected)
def test_fillna_returns_dataframe_unchanged_if_no_nans(
    self,
    categorical: pd.DataFrame,
):
    """The result of ``FillNA("Unknown")`` on ``categorical`` must equal ``categorical`` itself."""
    untouched = FillNA("Unknown").fit_transform(categorical)
    pd.testing.assert_frame_equal(untouched, categorical)
def test_fillna_raises_error(self, numerical_na: pd.DataFrame, value, strategy):
    """The given ``value``/``strategy`` combination must raise ``TransformerError``."""
    # Construction and fit both sit inside the `raises` block, so an error
    # from either one satisfies the test.
    with pytest.raises(TransformerError):
        imputer = FillNA(value=value, strategy=strategy)
        imputer.fit(numerical_na)