Exemple #1
0
def test_simple_imputer_fill_value(data_type):
    """Check that the "constant" strategy replaces NaNs with ``fill_value``.

    Both the ``fit`` + ``transform`` path and the ``fit_transform`` path are
    exercised and must produce the same expected frame.

    Args:
        data_type: ``"numeric"`` selects the numeric fixture; any other value
            selects the categorical/object fixture.
    """
    if data_type == "numeric":
        X = pd.DataFrame({
            "some numeric": [np.nan, 1, 0],
            "another numeric": [0, np.nan, 2]
        })
        fill_value = -1
        expected = pd.DataFrame({
            "some numeric": [-1, 1, 0],
            "another numeric": [0, -1, 2]
        })
        # The numeric fixture has 3 rows, so the target needs 3 entries too.
        y = pd.Series([0, 0, 1])
    else:
        X = pd.DataFrame({
            "categorical with nan":
            pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'),
            "object with nan": ["b", "b", np.nan, "c", np.nan]
        })
        fill_value = "fill"
        expected = pd.DataFrame({
            "categorical with nan":
            pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
            "object with nan":
            pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
        })
        # The categorical fixture has 5 rows.
        y = pd.Series([0, 0, 1, 0, 1])

    # Path 1: separate fit and transform calls.
    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)

    # Path 2: a fresh imputer going through fit_transform in one call.
    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)
Exemple #2
0
def test_simple_imputer_mean():
    """Check that the 'mean' strategy fills NaNs with each column's mean."""
    raw = pd.DataFrame([[np.nan, 0, 1, np.nan], [1, 2, 3, 2], [1, 2, 3, 0]])
    # Column 0 observes (1, 1) and column 3 observes (2, 0); both average 1.
    want = pd.DataFrame([[1, 0, 1, 1], [1, 2, 3, 2], [1, 2, 3, 0]])

    mean_imputer = SimpleImputer(impute_strategy='mean')
    imputed = mean_imputer.fit_transform(raw)

    assert_frame_equal(want, imputed.to_dataframe(), check_dtype=False)
def test_simple_imputer_median():
    """Check that the 'median' strategy fills NaNs with each column's median."""
    X = pd.DataFrame([[np.nan, 0, 1, np.nan], [1, 2, 3, 2], [10, 2, np.nan, 2],
                      [10, 2, 5, np.nan], [6, 2, 7, 0]])
    transformer = SimpleImputer(impute_strategy='median')
    # Medians of the observed values: column 0 -> 8, column 2 -> 4,
    # column 3 -> 2.
    X_expected_arr = pd.DataFrame([[8, 0, 1, 2], [1, 2, 3, 2], [10, 2, 4, 2],
                                   [10, 2, 5, 2], [6, 2, 7, 0]])
    X_t = transformer.fit_transform(X)
    # Convert the imputer output to a pandas DataFrame before comparing, as
    # the other pandas-input tests in this file do.
    assert_frame_equal(X_expected_arr, X_t.to_dataframe(), check_dtype=False)
Exemple #4
0
def test_simple_imputer_most_frequent():
    """Check 'most_frequent' imputation on mixed string/numeric columns."""
    raw = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3],
                        ["b", 2, 1, 0]])
    imputer = SimpleImputer(impute_strategy='most_frequent')

    # The string column (label 0) is expected back as a categorical.
    want = pd.DataFrame([["a", 0, 1, 0], ["a", 2, 1, 3],
                         ["b", 2, 1, 0]]).astype({0: 'category'})

    imputed = imputer.fit_transform(raw)
    assert_frame_equal(want, imputed.to_dataframe(), check_dtype=False)
Exemple #5
0
def test_simple_imputer_transform_drop_all_nan_columns_empty():
    """Check that an all-NaN frame imputes to an empty frame, input untouched.

    Covers both ``fit_transform`` and ``fit`` followed by ``transform``; after
    each path the input frame is compared against a freshly built copy.
    """
    all_nan_rows = [[np.nan, np.nan, np.nan]]
    X = pd.DataFrame(all_nan_rows)

    # Path 1: fit_transform in one call.
    one_shot = SimpleImputer(impute_strategy='most_frequent')
    one_shot_result = one_shot.fit_transform(X)
    assert one_shot_result.to_dataframe().empty
    assert_frame_equal(X, pd.DataFrame(all_nan_rows))

    # Path 2: a fresh imputer, fitted first and then applied.
    two_step = SimpleImputer(impute_strategy='most_frequent')
    two_step.fit(X)
    two_step_result = two_step.transform(X)
    assert two_step_result.to_dataframe().empty
    assert_frame_equal(X, pd.DataFrame(all_nan_rows))
def test_simple_imputer_numpy_input():
    """Check 'mean' imputation on a NumPy array and that the input is kept."""
    rows = [[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]]
    X = np.array(rows)
    mean_imputer = SimpleImputer(impute_strategy='mean')

    # The expected output has three columns: the all-NaN first column is
    # absent and the NaN in the last column becomes mean(2, 0) == 1.
    want = np.array([[0, 1, 1], [2, 3, 2], [2, 3, 0]])
    got = mean_imputer.fit_transform(X)
    assert np.allclose(want, got)

    # The caller's array must still match a freshly built copy.
    np.testing.assert_almost_equal(X, np.array(rows))
Exemple #7
0
def test_simple_imputer_constant():
    """Check the 'constant' strategy with an explicit ``fill_value`` of 3."""
    raw = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3],
                        ["b", 2, 3, 0]])
    constant_imputer = SimpleImputer(impute_strategy='constant', fill_value=3)

    # Every NaN becomes 3; the string column (label 0) is expected back as a
    # categorical.
    want = pd.DataFrame([[3, 0, 1, 3], ["a", 2, 3, 3],
                         ["b", 2, 3, 0]]).astype({0: 'category'})

    imputed = constant_imputer.fit_transform(raw)
    assert_frame_equal(want, imputed.to_dataframe(), check_dtype=False)
Exemple #8
0
def test_simple_imputer_fit_transform_drop_all_nan_columns():
    """Check that ``fit_transform`` omits an all-NaN column, input untouched."""

    def build_input():
        # Built fresh on each call so the second copy can serve as the
        # reference for the "input unchanged" comparison.
        return pd.DataFrame({
            "all_nan": [np.nan, np.nan, np.nan],
            "some_nan": [np.nan, 1, 0],
            "another_col": [0, 1, 2]
        })

    X = build_input()
    imputer = SimpleImputer(impute_strategy='most_frequent')

    # "all_nan" is absent from the expected output; the NaN in "some_nan"
    # is expected to become 0.
    want = pd.DataFrame({
        "some_nan": [0, 1, 0],
        "another_col": [0, 1, 2]
    })
    imputed = imputer.fit_transform(X)
    assert_frame_equal(want, imputed.to_dataframe(), check_dtype=False)

    # The caller's frame must still match a freshly built copy.
    assert_frame_equal(X, build_input())
Exemple #9
0
def test_simple_imputer_col_with_non_numeric():
    """Check each strategy against a frame whose first column holds strings.

    'mean' and 'median' must raise ``ValueError`` from both ``fit_transform``
    and ``fit``; 'most_frequent' and 'constant' must impute successfully.
    """
    X = pd.DataFrame([["a", 0, 1, np.nan], ["b", 2, 3, 3], ["a", 2, 3, 1],
                      [np.nan, 2, 3, 0]])

    # Strategies that are rejected, paired with the error text they must emit.
    rejected = (
        ('mean', "Cannot use mean strategy with non-numeric data"),
        ('median', "Cannot use median strategy with non-numeric data"),
    )
    for strategy, message in rejected:
        failing = SimpleImputer(impute_strategy=strategy)
        with pytest.raises(ValueError, match=message):
            failing.fit_transform(X)
        with pytest.raises(ValueError, match=message):
            failing.fit(X)

    # 'most_frequent': the string column (label 0) is expected back as a
    # categorical.
    mode_imputer = SimpleImputer(impute_strategy='most_frequent')
    want_mode = pd.DataFrame([["a", 0, 1, 0], ["b", 2, 3, 3],
                              ["a", 2, 3, 1], ["a", 2, 3, 0]])
    want_mode = want_mode.astype({0: 'category'})
    got_mode = mode_imputer.fit_transform(X)
    assert_frame_equal(want_mode, got_mode.to_dataframe(), check_dtype=False)

    # 'constant': every NaN, including the one in the string column, is
    # expected to become 2.
    const_imputer = SimpleImputer(impute_strategy='constant', fill_value=2)
    want_const = pd.DataFrame([["a", 0, 1, 2], ["b", 2, 3, 3],
                               ["a", 2, 3, 1], [2, 2, 3, 0]])
    want_const = want_const.astype({0: 'category'})
    got_const = const_imputer.fit_transform(X)
    assert_frame_equal(want_const, got_const.to_dataframe(), check_dtype=False)