Exemple #1
0
def test_imputer_with_none():
    """Check that ``None`` entries are imputed like any other missing value.

    The input mixes int, float, category, boolean and object columns that each
    contain one ``None``, plus a column made up only of ``None``.  The expected
    frame has no "all None" column; its numeric fill values (2 and 0.2) are the
    means of the observed entries, and the remaining columns are filled with
    their most frequent observed value.  Both the ``fit``/``transform`` path
    and ``fit_transform`` are compared against the same expectation.
    """
    X = pd.DataFrame({
        "int with None": [1, 0, 5, None],
        "float with None": [0.1, 0.0, 0.5, None],
        "category with None":
        pd.Series(["b", "a", "a", None], dtype='category'),
        "boolean with None": [True, None, False, True],
        "object with None": ["b", "a", "a", None],
        "all None": [None, None, None, None]
    })
    # One target value per row of X (X has four rows, not five).
    y = pd.Series([0, 0, 1, 0])
    expected = pd.DataFrame({
        "int with None": [1, 0, 5, 2],
        "float with None": [0.1, 0.0, 0.5, 0.2],
        "category with None":
        pd.Series(["b", "a", "a", "a"], dtype='category'),
        "boolean with None": [True, True, False, True],
        "object with None":
        pd.Series(["b", "a", "a", "a"], dtype='category')
    })

    # Path 1: separate fit and transform calls.
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    # Path 2: a fresh component using fit_transform must give the same result.
    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #2
0
def test_categorical_only_input(imputer_test_data):
    """Check imputation when only categorical/object/bool columns are passed.

    The selected columns include "all nan cat", which does not appear in the
    expected output.  Both the ``fit``/``transform`` path and ``fit_transform``
    are compared against the same expected frame.

    Args:
        imputer_test_data: fixture providing the shared input frame; the
            columns selected below must be present in it.
    """
    X = imputer_test_data[[
        "categorical col", "object col", "bool col", "categorical with nan",
        "object with nan", "bool col with nan", "all nan cat"
    ]]
    y = pd.Series([0, 0, 1, 0, 1])
    expected = pd.DataFrame({
        "categorical col":
        pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col":
        pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
        "categorical with nan":
        pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
        "object with nan":
        pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
        "bool col with nan": [True, True, False, True, True]
    })

    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    # Previously this result was computed but never checked.
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
Exemple #3
0
def test_categorical_and_numeric_input(imputer_test_data):
    """Run a default ``Imputer`` over the whole fixture frame.

    The output of ``fit`` + ``transform`` and of ``fit_transform`` is compared
    (ignoring dtypes) against one hand-written expected frame.

    Args:
        imputer_test_data: fixture providing the input frame, used unmodified.
    """
    features = imputer_test_data
    target = pd.Series([0, 0, 1, 0, 1])

    def as_category(values):
        # Expected categorical/object columns are spelled as category Series.
        return pd.Series(values, dtype='category')

    two_step = Imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)

    expected = pd.DataFrame({
        "categorical col": as_category(["zero", "one", "two", "zero", "three"]),
        "int col": [0, 1, 2, 0, 3],
        "object col": as_category(["b", "b", "a", "c", "d"]),
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "bool col": [True, False, False, True, True],
        "categorical with nan": as_category(["0", "1", "0", "0", "3"]),
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.],
        "object with nan": as_category(["b", "b", "b", "c", "b"]),
        "bool col with nan": [True, True, False, True, True]
    })
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    # A fresh component fitted and applied in one call must agree.
    one_step_result = Imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #4
0
def test_imputer_fill_value(imputer_test_data):
    """Check the "constant" strategies with explicit fill values.

    Numeric gaps are expected to become ``-1`` and categorical/object gaps the
    string ``"fill"``.  The boolean column with gaps is expected as a
    categorical column that mixes booleans with ``"fill"``.

    Args:
        imputer_test_data: fixture providing the input frame; only its
            "... with nan" columns are used here.
    """
    features = imputer_test_data[[
        "int with nan", "categorical with nan", "float with nan",
        "object with nan", "bool col with nan"
    ]]
    target = pd.Series([0, 0, 1, 0, 1])
    # Identical configuration for both code paths under test.
    constant_settings = {
        "categorical_impute_strategy": "constant",
        "numeric_impute_strategy": "constant",
        "categorical_fill_value": "fill",
        "numeric_fill_value": -1,
    }

    two_step = Imputer(**constant_settings)
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)

    expected = pd.DataFrame({
        "int with nan": [-1, 1, 0, 0, 1],
        "categorical with nan":
        pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
        "float with nan": [0.0, 1.0, -1, -1.0, 0.],
        "object with nan":
        pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
        "bool col with nan":
        pd.Series([True, "fill", False, "fill", True], dtype='category')
    })
    assert_frame_equal(expected, two_step_result.to_dataframe(),
                       check_dtype=False)

    one_step = Imputer(**constant_settings)
    one_step_result = one_step.fit_transform(features, target)
    assert_frame_equal(expected, one_step_result.to_dataframe(),
                       check_dtype=False)
Exemple #5
0
def test_drop_all_columns(imputer_test_data):
    """Check the output when only "all nan cat" and "all nan" are passed.

    The expected frame is the input with both of those columns dropped, i.e. a
    frame with no columns left.

    Args:
        imputer_test_data: fixture providing the input frame.
    """
    nan_only_columns = ["all nan cat", "all nan"]
    features = imputer_test_data[nan_only_columns]
    target = pd.Series([0, 0, 1, 0, 1])

    two_step = Imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)
    # Derived from the input only after transform has run, as before.
    expected = features.drop(columns=nan_only_columns)
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    one_step_result = Imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #6
0
def test_imputer_empty_data(data_type, make_data_type):
    """Check that empty input yields an empty frame on both code paths.

    Args:
        data_type: identifier handed to ``make_data_type`` to pick the input
            container type.
        make_data_type: callable ``(data_type, obj)`` that converts the empty
            pandas objects into the container under test.
    """
    empty_frame = pd.DataFrame()
    empty_series = pd.Series()
    features = make_data_type(data_type, empty_frame)
    target = make_data_type(data_type, empty_series)
    expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))

    two_step = Imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    one_step_result = Imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #7
0
def test_typed_imputer_numpy_input():
    """Check that a NumPy array with NaNs is accepted and imputed.

    Each NaN in the 3x4 input is expected to be replaced by the mean of the
    observed values in its column (1, 1, 1 and 0 respectively).
    """
    nan = np.nan
    features = np.array([
        [1, 2, 2, 0],
        [nan, 0, 0, 0],
        [1, nan, nan, nan],
    ])
    target = pd.Series([0, 0, 1])

    two_step = Imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)

    filled = np.array([
        [1, 2, 2, 0],
        [1, 0, 0, 0],
        [1, 1, 1, 0],
    ])
    expected = pd.DataFrame(filled)
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    one_step_result = Imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #8
0
def test_imputer_datetime_input():
    """Check that datetime columns come back equal to the input.

    Both columns hold three dates and one missing entry; the assertion
    compares the output with the input frame itself, so the missing entries
    are expected to stay missing.
    """
    date_format = '%Y%m%d'
    features = pd.DataFrame({
        'dates': ['20190902', '20200519', '20190607', np.nan],
        'more dates': ['20190902', '20201010', '20190921', np.nan],
    })
    # Parse the raw strings into datetime columns, one column at a time.
    for column in ('dates', 'more dates'):
        features[column] = pd.to_datetime(features[column], format=date_format)
    target = pd.Series()

    two_step = Imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)
    assert_frame_equal(two_step_result.to_dataframe(), features,
                       check_dtype=False)

    one_step_result = Imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), features,
                       check_dtype=False)
Exemple #9
0
def test_numeric_only_input(imputer_test_data):
    """Check imputation when only numeric columns are passed.

    The selected columns include "all nan", which is absent from the expected
    frame.  The ``fit``/``transform`` path uses the "median" numeric strategy.

    Args:
        imputer_test_data: fixture providing the input frame.
    """
    numeric_columns = [
        "int col", "float col", "int with nan", "float with nan", "all nan",
    ]
    features = imputer_test_data[numeric_columns]
    target = pd.Series([0, 0, 1, 0, 1])

    median_imputer = Imputer(numeric_impute_strategy="median")
    median_imputer.fit(features, target)
    two_step_result = median_imputer.transform(features, target)

    expected = pd.DataFrame({
        "int col": [0, 1, 2, 0, 3],
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.]
    })
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    # NOTE(review): this second run uses the default numeric strategy rather
    # than "median" yet shares the same expected frame - confirm intentional.
    default_imputer = Imputer()
    one_step_result = default_imputer.fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #10
0
def test_imputer_no_nans(imputer_test_data):
    """Check that the constant strategies leave the gap-free columns unchanged.

    The expected values contain neither fill value ("fill" / -1); the
    categorical and object columns are expected as category Series.

    Args:
        imputer_test_data: fixture providing the input frame.
    """
    features = imputer_test_data[["categorical col", "object col", "bool col"]]
    target = pd.Series([0, 0, 1, 0, 1])

    def build_imputer():
        # Same constant-fill configuration for both code paths.
        return Imputer(categorical_impute_strategy="constant",
                       numeric_impute_strategy="constant",
                       categorical_fill_value="fill",
                       numeric_fill_value=-1)

    two_step = build_imputer()
    two_step.fit(features, target)
    two_step_result = two_step.transform(features, target)

    expected = pd.DataFrame({
        "categorical col":
        pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col":
        pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
    })
    assert_frame_equal(two_step_result.to_dataframe(), expected,
                       check_dtype=False)

    one_step_result = build_imputer().fit_transform(features, target)
    assert_frame_equal(one_step_result.to_dataframe(), expected,
                       check_dtype=False)
Exemple #11
0
def test_imputer_empty_data(data_type):
    """Check that empty input yields an empty frame for each container type.

    Args:
        data_type: ``'pd'`` for pandas objects, ``'ww'`` for Woodwork
            ``DataTable``/``DataColumn`` wrappers; any other value selects
            NumPy arrays.
    """
    if data_type == 'pd':
        X = pd.DataFrame()
        y = pd.Series()
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    elif data_type == 'ww':
        X = ww.DataTable(pd.DataFrame())
        y = ww.DataColumn(pd.Series())
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    else:
        # np.array([[]]) has shape (1, 0): one row and no columns.
        X = np.array([[]])
        y = np.array([])
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0; an
        # explicitly typed pd.Index is the equivalent spelling on all versions.
        expected = pd.DataFrame(index=pd.Index([0]),
                                columns=pd.Index([], dtype="int64"))

    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)