Exemple #1
0
def test_imputer_fill_value(imputer_test_data):
    X = imputer_test_data[[
        "int with nan", "categorical with nan", "float with nan",
        "object with nan", "bool col with nan"
    ]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer(categorical_impute_strategy="constant",
                      numeric_impute_strategy="constant",
                      categorical_fill_value="fill",
                      numeric_fill_value=-1)
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "int with nan": [-1, 1, 0, 0, 1],
        "categorical with nan":
        pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
        "float with nan": [0.0, 1.0, -1, -1.0, 0.],
        "object with nan":
        pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
        "bool col with nan":
        pd.Series([True, "fill", False, "fill", True], dtype='category')
    })
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)

    imputer = Imputer(categorical_impute_strategy="constant",
                      numeric_impute_strategy="constant",
                      categorical_fill_value="fill",
                      numeric_fill_value=-1)
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)
Exemple #2
0
def test_imputer_with_none():
    X = pd.DataFrame({
        "int with None": [1, 0, 5, None],
        "float with None": [0.1, 0.0, 0.5, None],
        "category with None":
        pd.Series(["b", "a", "a", None], dtype='category'),
        "boolean with None": [True, None, False, True],
        "object with None": ["b", "a", "a", None],
        "all None": [None, None, None, None]
    })
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "int with None": [1, 0, 5, 2],
        "float with None": [0.1, 0.0, 0.5, 0.2],
        "category with None":
        pd.Series(["b", "a", "a", "a"], dtype='category'),
        "boolean with None": [True, True, False, True],
        "object with None":
        pd.Series(["b", "a", "a", "a"], dtype='category')
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #3
0
def test_categorical_and_numeric_input(imputer_test_data):
    X = imputer_test_data
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "categorical col":
        pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "int col": [0, 1, 2, 0, 3],
        "object col":
        pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "bool col": [True, False, False, True, True],
        "categorical with nan":
        pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.],
        "object with nan":
        pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
        "bool col with nan": [True, True, False, True, True]
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #4
0
def test_categorical_only_input(imputer_test_data):
    X = imputer_test_data[[
        "categorical col", "object col", "bool col", "categorical with nan",
        "object with nan", "bool col with nan", "all nan cat"
    ]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "categorical col":
        pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col":
        pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
        "categorical with nan":
        pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
        "object with nan":
        pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
        "bool col with nan": [True, True, False, True, True]
    })

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #5
0
def test_imputer_all_bool_return_original(data_type, make_data_type):
    X = make_data_type(data_type, pd.DataFrame([True, True, False, True, True], dtype=bool))
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype='boolean')
    y = make_data_type(data_type, pd.Series([1, 0, 0, 1, 0]))

    imputer = Imputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())
Exemple #6
0
def test_imputer_bool_dtype_object(data_type, make_data_type):
    X = pd.DataFrame([True, np.nan, False, np.nan, True], dtype='boolean')
    y = pd.Series([1, 0, 0, 1, 0])
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype='boolean')
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)
    imputer = Imputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())
Exemple #7
0
def test_imputer_all_bool_return_original(data_type):
    X = pd.DataFrame([True, True, False, True, True], dtype=bool)
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype=bool)
    y = pd.Series([1, 0, 0, 1, 0])
    if data_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    imputer = Imputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)
    assert_frame_equal(X_expected_arr, X_t)
Exemple #8
0
def test_drop_all_columns(imputer_test_data):
    X = imputer_test_data[["all nan cat", "all nan"]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = X.drop(["all nan cat", "all nan"], axis=1)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #9
0
def test_imputer_empty_data(data_type, make_data_type):
    X = pd.DataFrame()
    y = pd.Series()
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)
    expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #10
0
def test_typed_imputer_numpy_input():
    X = np.array([[1, 2, 2, 0], [np.nan, 0, 0, 0], [1, np.nan, np.nan,
                                                    np.nan]])
    y = pd.Series([0, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame(
        np.array([[1, 2, 2, 0], [1, 0, 0, 0], [1, 1, 1, 0]]))
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #11
0
def test_imputer_datetime_input():
    X = pd.DataFrame({'dates': ['20190902', '20200519', '20190607', np.nan],
                      'more dates': ['20190902', '20201010', '20190921', np.nan]})
    X['dates'] = pd.to_datetime(X['dates'], format='%Y%m%d')
    X['more dates'] = pd.to_datetime(X['more dates'], format='%Y%m%d')
    y = pd.Series()

    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), X, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), X, check_dtype=False)
Exemple #12
0
def test_imputer_does_not_reset_index():
    X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10),
                      'input_cat': ['a'] * 7 + ['b'] * 3})
    X.loc[5, 'input_val'] = np.nan
    X.loc[5, 'input_cat'] = np.nan
    assert X.index.tolist() == list(range(10))

    X.drop(0, inplace=True)
    y = X.pop('target')

    imputer = Imputer()
    imputer.fit(X, y=y)
    transformed = imputer.transform(X)
    pd.testing.assert_frame_equal(transformed.to_dataframe(),
                                  pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9],
                                                'input_cat': pd.Categorical(['a'] * 6 + ['b'] * 3)},
                                               index=list(range(1, 10))))
Exemple #13
0
def test_imputer_no_nans(imputer_test_data):
    X = imputer_test_data[["categorical col", "object col", "bool col"]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
                      categorical_fill_value="fill", numeric_fill_value=-1)
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
                      categorical_fill_value="fill", numeric_fill_value=-1)
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #14
0
def test_imputer_multitype_with_one_bool(data_type, make_data_type):
    X_multi = pd.DataFrame({
        "bool with nan": pd.Series([True, np.nan, False, np.nan, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
    })
    y = pd.Series([1, 0, 0, 1, 0])
    X_multi_expected_arr = pd.DataFrame({
        "bool with nan": pd.Series([True, False, False, False, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype='boolean'),
    })

    X_multi = make_data_type(data_type, X_multi)
    y = make_data_type(data_type, y)

    imputer = Imputer()
    imputer.fit(X_multi, y)
    X_multi_t = imputer.transform(X_multi)
    assert_frame_equal(X_multi_expected_arr, X_multi_t.to_dataframe())
Exemple #15
0
def test_numeric_only_input(imputer_test_data):
    X = imputer_test_data[["int col", "float col",
                           "int with nan", "float with nan", "all nan"]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer(numeric_impute_strategy="median")
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "int col": [0, 1, 2, 0, 3],
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.]
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #16
0
def test_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan, numeric_impute_strategy):
    y = pd.Series([1, 2, 1])
    if has_nan:
        X_df.iloc[len(X_df) - 1, 0] = np.nan
    override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean]
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        imputer = Imputer(numeric_impute_strategy=numeric_impute_strategy)
        imputer.fit(X, y)
        transformed = imputer.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        if numeric_impute_strategy == "most_frequent":
            assert transformed.logical_types == {0: logical_type}
        elif logical_type in [Categorical, NaturalLanguage] or not has_nan:
            assert transformed.logical_types == {0: logical_type}
        else:
            assert transformed.logical_types == {0: Double}
Exemple #17
0
def test_imputer_multitype_with_one_bool(data_type):
    X_multi = pd.DataFrame({
        "bool with nan":
        pd.Series([True, np.nan, False, np.nan, False], dtype=object),
        "bool no nan":
        pd.Series([False, False, False, False, True], dtype=bool),
    })
    y = pd.Series([1, 0, 0, 1, 0])
    X_multi_expected_arr = pd.DataFrame({
        "bool with nan":
        pd.Series([True, False, False, False, False], dtype=object),
        "bool no nan":
        pd.Series([False, False, False, False, True], dtype=bool),
    })
    if data_type == 'ww':
        X_multi = ww.DataTable(X_multi)
        y = ww.DataColumn(y)
    imputer = Imputer()
    imputer.fit(X_multi, y)
    X_multi_t = imputer.transform(X_multi)
    assert_frame_equal(X_multi_expected_arr, X_multi_t)
Exemple #18
0
def test_imputer_empty_data(data_type):
    if data_type == 'pd':
        X = pd.DataFrame()
        y = pd.Series()
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    elif data_type == 'ww':
        X = ww.DataTable(pd.DataFrame())
        y = ww.DataColumn(pd.Series())
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    else:
        X = np.array([[]])
        y = np.array([])
        expected = pd.DataFrame(index=pd.Index([0]), columns=pd.Int64Index([]))
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)