def test_fillna_adds_is_na_column_when_imputing(self):
    """Check that ``indicate_nan=True`` appends ``<col>_is_nan`` indicator columns.

    The input frame has a single missing value, in the last row of ``sales``.
    After ``fit_transform`` the result must contain the original columns
    followed by one ``*_is_nan`` column per input column, and the ``sales``
    indicator must flag only the last row.
    """
    df = pd.DataFrame({"id": [1, 2, 3, 4], "sales": [2000, 3000, 4000, np.nan]})
    fill_na = FillNA(strategy="mean", indicate_nan=True)
    expected_cols = ["id", "sales", "id_is_nan", "sales_is_nan"]
    result = fill_na.fit_transform(df)
    assert result.columns.tolist() == expected_cols
    # The original line read ``assert np, all(...)``: that asserts the (always
    # truthy) numpy module and treats the comparison as an unevaluated message,
    # so nothing was checked. It also indexed "sales_isna", which does not
    # match the expected column name "sales_is_nan" asserted above.
    assert np.all(result["sales_is_nan"] == [0, 0, 0, 1])
Beispiel #2
0
def test_imputer_returns_dataframe_unchanged_if_no_nans(categorical):
    """Check that a constant-value ``FillNA`` does not inject the fill value.

    The result must be a DataFrame with the same number of rows as the
    ``categorical`` fixture, exactly the columns ``category_a`` and
    ``category_b``, and no cell equal to ``'Unknown'``.
    (Presumably the fixture contains no missing values - confirm against
    the fixture definition.)
    """
    imputer = FillNA('Unknown')
    result = imputer.fit_transform(categorical)

    assert isinstance(result, pd.DataFrame)
    assert len(categorical) == len(result)
    assert {'category_a', 'category_b'} == set(result.columns)
    # Use logical ``not`` rather than bitwise ``~``: on a plain Python bool,
    # ``~True`` is -2 and ``~False`` is -1, both truthy, so the assertion
    # could never fail if the reduction returns a built-in bool.
    assert not result.isin(['Unknown']).any().any()
Beispiel #3
0
def test_imputer_returns_correct_dataframe(categorical_na):
    """Check that ``FillNA('Unknown')`` writes ``'Unknown'`` into the expected cells.

    Verifies the result type, row count and column set, then checks that the
    cells at positions (0, 1) and (1, 0) hold the fill value.
    (Presumably those are the missing cells of the ``categorical_na``
    fixture - confirm against the fixture definition.)
    """
    filled = FillNA('Unknown').fit_transform(categorical_na)

    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical_na)
    assert set(filled.columns) == {'category_a', 'category_b'}
    assert filled.iloc[0, 1] == 'Unknown'
    assert filled.iloc[1, 0] == 'Unknown'
Beispiel #4
0
def test_imputer_returns_correct_dataframe_max(numerical_na):
    """Check the output of ``FillNA(strategy='max')`` on the numeric fixture.

    Verifies the result type, row count and column set, then checks that
    ``number_a`` at row label 0 equals 4 and ``number_b`` at row label 3
    equals 7. (Presumably those cells are missing in ``numerical_na`` and
    4 / 7 are the column maxima - confirm against the fixture definition.)
    """
    max_filler = FillNA(strategy='max')
    filled = max_filler.fit_transform(numerical_na)

    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(numerical_na)
    assert set(filled.columns) == {'number_a', 'number_b'}
    assert filled.loc[0, "number_a"] == 4
    assert filled.loc[3, "number_b"] == 7
 def test_fill_na_imputes_numerical_na_correct(
     self,
     numerical_na: pd.DataFrame,
     value: None,
     strategy: str,
     expected: pd.DataFrame,
 ):
     """Check that ``FillNA`` turns ``numerical_na`` into exactly ``expected``.

     ``value``, ``strategy`` and ``expected`` are supplied from outside this
     block (presumably via ``pytest.mark.parametrize`` - confirm at the
     decorator, which is not visible here).
     """
     actual = FillNA(value=value, strategy=strategy).fit_transform(numerical_na)
     pd.testing.assert_frame_equal(actual, expected)
Beispiel #6
0
def test_imputer_returns_correct_dataframe_most_freq(categorical):
    """Check the output of ``FillNA(strategy='most_freq')`` on categorical data.

    The fixture is modified in place first: one cell in each column is set to
    NaN and ``category_b`` at row label 1 is set to ``"b3"``. After imputing,
    the NaN in ``category_a`` must read ``'a1'`` and the NaN in ``category_b``
    must read ``'b3'``.
    """
    # Punch one hole into each column and overwrite one category_b cell.
    categorical.loc[1, "category_a"] = np.nan
    categorical.loc[0, "category_b"] = np.nan
    categorical.loc[1, "category_b"] = "b3"

    filled = FillNA(strategy='most_freq').fit_transform(categorical)

    assert isinstance(filled, pd.DataFrame)
    assert len(filled) == len(categorical)
    assert set(filled.columns) == {'category_a', 'category_b'}
    assert filled.loc[1, "category_a"] == 'a1'
    assert filled.loc[0, "category_b"] == 'b3'
    def test_fillna_imputes_pandas_categorical_correct(
        self,
        value: Any,
        strategy: Any,
        expected: pd.DataFrame,
        categorical_na: pd.DataFrame,
    ):
        """Check ``FillNA`` on columns that use the pandas ``category`` dtype.

        Both fixture columns are cast to ``category`` in place before
        imputing. The result is compared to ``expected`` with
        ``check_categorical=False``.
        """
        for column in ("category_a", "category_b"):
            categorical_na[column] = categorical_na[column].astype("category")

        actual = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)

        pd.testing.assert_frame_equal(actual, expected, check_categorical=False)
    def test_fillna_raises_when_imputing_numerically_on_strings(self):
        """Check that the ``mean`` strategy raises on frames with string columns.

        The transformer is run twice, first with one string column and then
        after a second string column has been added. Both runs must raise
        ``TransformerError`` with the same message pattern.
        """
        frame = pd.DataFrame(
            {
                "id": [1, 2, 3, 4],
                "status": ["OK", "Error", "OK", "Error"],
                "sales": [2000, 3000, 4000, np.nan],
            }
        )
        mean_filler = FillNA(strategy="mean")
        error_pattern = "column/columns have invalid types for strategy = mean"

        # One offending string column ("status").
        with pytest.raises(TransformerError, match=error_pattern):
            mean_filler.fit_transform(frame)

        # Two offending string columns ("status" and "new_col").
        frame["new_col"] = ["One", "Two", "Three", "Four"]
        with pytest.raises(TransformerError, match=error_pattern):
            mean_filler.fit_transform(frame)
Beispiel #9
0
"""
host_response_time
==================
How long it takes the host to accept or decline an offer. Hypothesis: this could be an indicator of "seriousness", which could affect the price.

Categorical with 4 levels
dtype: category
"""
from ml_tooling.transformers import Select, FillNA, ToCategorical
from sklearn.pipeline import Pipeline

# Feature pipeline for the "host_response_time" column: select it, fill with
# the constant "unknown" (indicate_nan=True is passed to FillNA), then hand
# the result to ToCategorical.
host_response_time = Pipeline(
    steps=[
        ("select", Select("host_response_time")),
        ("fill_na", FillNA("unknown", indicate_nan=True)),
        ("categorical", ToCategorical()),
    ]
)
Beispiel #10
0
 def pipeline(self):
     """Return a one-step ``Pipeline`` whose ``"fillna"`` step is ``FillNA(0)``."""
     fill_step = ("fillna", FillNA(0))
     return Pipeline([fill_step])
Beispiel #11
0
def test_imputer_with_both_raises_error(numerical_na):
    """Check that setting both ``value`` and ``strategy`` fails on ``fit_transform``.

    The transformer is built outside the ``raises`` block, so only the
    ``fit_transform`` call is expected to raise ``TransformerError``.
    """
    conflicting = FillNA(strategy='mean', value=0)
    with pytest.raises(TransformerError):
        conflicting.fit_transform(numerical_na)
Beispiel #12
0
def test_imputer_with_none_raises_error(numerical_na):
    """Check that a ``FillNA`` built with no arguments fails on ``fit_transform``.

    The transformer is built outside the ``raises`` block, so only the
    ``fit_transform`` call is expected to raise ``TransformerError``.
    """
    unconfigured = FillNA()
    with pytest.raises(TransformerError):
        unconfigured.fit_transform(numerical_na)
"""
security_deposit
================
The security deposit amount. It might be related to the price.

The value is a float, but it is prefixed with `$`, so read it as a string and preprocess it.
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for the "security_deposit" column: select it, then fill
# with the constant 0 (indicate_nan=True is passed to FillNA).
security_deposit = Pipeline(
    steps=[
        ("select", Select("security_deposit")),
        ("fill_na", FillNA(0, indicate_nan=True)),
    ]
)
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for the "host_acceptance_rate" column: select it, then
# fill with the constant 127 (indicate_nan=True is passed to FillNA).
# NOTE(review): the reason for choosing 127 is not visible here - presumably
# an out-of-range sentinel; confirm with the author.
host_acceptance_rate = Pipeline(
    steps=[
        ("select", Select("host_acceptance_rate")),
        ("fill_na", FillNA(127, indicate_nan=True)),
    ]
)
"""
house_rules
===========
The rules the guest must follow. We would try to extract some simple rules, such as
whether smoking is allowed.

House rules will be used to extract features, such as `is_no_smoking` or a flag
indicating that there are a lot of rules.
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for the "house_rules_len" column: select it, then fill
# with the constant 0.
house_rules_len = Pipeline(
    steps=[
        ("select", Select("house_rules_len")),
        ("fill_na", FillNA(0)),
    ]
)
Beispiel #16
0
 def test_fillna_imputes_categorical_na_correct(
     self, categorical_na: pd.DataFrame, value: Any, strategy: Any, expected: Any
 ):
     """Check that ``FillNA`` turns ``categorical_na`` into exactly ``expected``.

     ``value``, ``strategy`` and ``expected`` are supplied from outside this
     block (presumably via ``pytest.mark.parametrize`` - confirm at the
     decorator, which is not visible here).
     """
     actual = FillNA(value=value, strategy=strategy).fit_transform(categorical_na)
     pd.testing.assert_frame_equal(actual, expected)
Beispiel #17
0
 def test_fillna_returns_dataframe_unchanged_if_no_nans(
     self, categorical: pd.DataFrame
 ):
     """Check that ``FillNA("Unknown")`` returns a frame equal to its input.

     The output of ``fit_transform`` is compared to the ``categorical``
     fixture with ``assert_frame_equal``.
     """
     transformed = FillNA("Unknown").fit_transform(categorical)
     pd.testing.assert_frame_equal(transformed, categorical)
Beispiel #18
0
 def test_fillna_raises_error(self, numerical_na: pd.DataFrame, value, strategy):
     """Check that the given ``value``/``strategy`` pair raises ``TransformerError``.

     Both construction and ``fit`` run inside the ``raises`` block, so the
     error may come from either step.
     """
     with pytest.raises(TransformerError):
         imputer = FillNA(value=value, strategy=strategy)
         imputer.fit(numerical_na)