Esempio n. 1
0
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Check SimpleImputer against expected results for one strategy.

    The imputer is fitted and applied twice: once on the dense array ``X``
    and once on a CSC sparse version of it. Each time this checks that:
        - the fitted ``statistics_`` match ``statistics``
        - the transformed data matches ``X_true``

    Comparisons are exact unless ``X`` or ``X_true`` has a floating dtype,
    in which case an "almost equal" comparison is used.

    Parameters
    ----------
    X : array with a ``dtype``
        Input data containing the missing-value marker.
    X_true : array with a ``dtype``
        Expected output of ``transform``.
    strategy : str
        Imputation strategy forwarded to ``SimpleImputer``.
    statistics : array-like
        Expected value of ``imputer.statistics_``.
    missing_values : object
        Missing-value placeholder forwarded to ``SimpleImputer``.
    """

    def _err_msg(is_sparse):
        # Build the message in a single formatting step: chaining
        # %-formatting with str.format() fails when the text of a
        # parameter happens to contain braces.
        return ("Parameters: strategy = %s, missing_values = %s, "
                "sparse = %s" % (strategy, missing_values, is_sparse))

    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal
    else:
        assert_ae = assert_array_equal

    # Normal matrix
    # Parameters are passed by keyword: SimpleImputer's constructor
    # arguments are keyword-only in recent scikit-learn releases.
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics, err_msg=_err_msg(False))
    assert_ae(X_trans, X_true, err_msg=_err_msg(False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    # transform may hand back a sparse result; densify before comparing
    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics, err_msg=_err_msg(True))
    assert_ae(X_trans, X_true, err_msg=_err_msg(True))
Esempio n. 2
0
def test_imputation_error_invalid_strategy(strategy):
    """An invalid ``strategy`` must raise a ValueError matching its text."""
    data = np.ones((3, 5))
    data[0, 0] = np.nan

    pattern = str(strategy)
    with pytest.raises(ValueError, match=pattern):
        SimpleImputer(strategy=strategy).fit_transform(data)
Esempio n. 3
0
def test_permutation_importance_mixed_types_pandas():
    """Permutation importance on a DataFrame mixing numeric and string data.

    The frame goes through a ColumnTransformer (impute + scale for the
    numeric column, one-hot for the string column) before a logistic
    regression. The last column must come out as the most important.
    """
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(42)
    n_repeats = 5

    # 'col2' (the last column) follows the same pattern as the target
    frame = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b'],
    })
    target = np.array([0, 1, 0, 1])

    numeric_steps = make_pipeline(SimpleImputer(), StandardScaler())
    column_prep = ColumnTransformer([
        ('num', numeric_steps, ['col1']),
        ('cat', OneHotEncoder(), ['col2']),
    ])
    model = make_pipeline(column_prep, LogisticRegression(solver='lbfgs'))
    model.fit(frame, target)

    result = permutation_importance(
        model, frame, target, n_repeats=n_repeats, random_state=rng)

    n_columns = frame.shape[1]
    assert result.importances.shape == (n_columns, n_repeats)

    # The last column must beat every other column on mean importance
    mean_importance = result.importances_mean
    assert np.all(mean_importance[-1] > mean_importance[:-1])
Esempio n. 4
0
def test_imputation_deletion_warning(strategy):
    """A column made only of NaN must trigger a "Deleting" UserWarning."""
    data = np.ones((3, 5))
    data[:, 0] = np.nan  # first column has no observed value at all

    with pytest.warns(UserWarning, match="Deleting"):
        SimpleImputer(strategy=strategy, verbose=True).fit_transform(data)
Esempio n. 5
0
def get_results(dataset):
    """Compare regression scores for several ways of handling missing data.

    Scores are computed on the full data, then on a copy in which one
    randomly chosen feature of 75% of the rows is overwritten with 0
    (0 is the missing-value marker used by every imputer below).

    Relies on the module-level names ``REGRESSOR``, ``N_SPLITS``, ``rng``
    and ``get_scores_for_imputer``.

    Parameters
    ----------
    dataset : object
        Object exposing ``data`` (2D array) and ``target`` attributes.

    Returns
    -------
    tuple of four ``(mean, std)`` pairs
        Scores for, in order: the full data, constant-zero imputation,
        mean imputation and iterative imputation.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    full_scores = cross_val_score(REGRESSOR, X_full, y_full,
                                  scoring='neg_mean_squared_error',
                                  cv=N_SPLITS)

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    # One feature index per affected row
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    imputer = SimpleImputer(missing_values=0,
                            strategy='constant',
                            fill_value=0)
    zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

    # Estimate the score after imputation (mean strategy) of the missing values
    imputer = SimpleImputer(missing_values=0, strategy="mean")
    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

    # Estimate the score after iterative imputation of the missing values
    imputer = IterativeImputer(missing_values=0,
                               random_state=0,
                               n_nearest_features=5,
                               sample_posterior=True)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))
Esempio n. 6
0
def test_imputation_copy():
    """Check when ``transform`` returns a copy and when it works in place.

    Every scenario writes -1 into the transformed output and then checks
    whether the input changed along with it.
    """
    template = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # Dense input, copy=True: the output is independent of the input
    dense = template.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    imputer.fit(dense)
    dense_out = imputer.transform(dense)
    dense_out[0, 0] = -1
    assert not np.all(dense == dense_out)

    # Sparse CSR input, copy=True: the output is independent of the input
    csr = template.copy()
    imputer = SimpleImputer(
        missing_values=csr.data[0], strategy="mean", copy=True)
    imputer.fit(csr)
    csr_out = imputer.transform(csr)
    csr_out.data[0] = -1
    assert not np.all(csr.data == csr_out.data)

    # Dense input, copy=False: the output shares its data with the input
    dense = template.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    imputer.fit(dense)
    dense_out = imputer.transform(dense)
    dense_out[0, 0] = -1
    assert_array_almost_equal(dense, dense_out)

    # Sparse CSC input, copy=False: the output shares its data with the input
    csc = template.copy().tocsc()
    imputer = SimpleImputer(
        missing_values=csc.data[0], strategy="mean", copy=False)
    imputer.fit(csc)
    csc_out = imputer.transform(csc)
    csc_out.data[0] = -1
    assert_array_almost_equal(csc.data, csc_out.data)

    # Sparse CSR input, copy=False: a copy is made regardless
    csr = template.copy()
    imputer = SimpleImputer(
        missing_values=csr.data[0], strategy="mean", copy=False)
    imputer.fit(csr)
    csr_out = imputer.transform(csr)
    csr_out.data[0] = -1
    assert not np.all(csr.data == csr_out.data)
Esempio n. 7
0
def test_imputation_constant_error_invalid_type(X_data, missing_value):
    """A string ``fill_value`` on float data must raise a ValueError."""
    data = np.full((3, 5), X_data, dtype=float)
    data[0, 0] = missing_value

    with pytest.raises(ValueError, match="imputing numerical"):
        SimpleImputer(
            missing_values=missing_value,
            strategy="constant",
            fill_value="x",
        ).fit_transform(data)
Esempio n. 8
0
def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    features, labels = make_classification(
        n_samples=10,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        random_state=42,
    )
    features[0, 0] = np.nan

    # The imputer sits inside the pipeline, so the NaN reaches the
    # calibrated classifier untouched.
    steps = [
        ('imputer', SimpleImputer()),
        ('rf', RandomForestClassifier(n_estimators=1)),
    ]
    base_model = Pipeline(steps)
    calibrated = CalibratedClassifierCV(base_model, cv=2, method='isotonic')
    calibrated.fit(features, labels)
    calibrated.predict(features)
Esempio n. 9
0
def test_imputation_error_sparse_0(strategy):
    """Sparse input combined with ``missing_values=0`` must be rejected.

    Both ``fit`` and ``transform`` are expected to raise a ValueError
    asking for a dense array.
    """
    dense = np.ones((3, 5))
    dense[0] = 0
    X_sparse = sparse.csc_matrix(dense)
    expected_msg = "Provide a dense array"

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match=expected_msg):
        imputer.fit(X_sparse)

    # Fit on the dense equivalent so that transform itself is exercised
    imputer.fit(X_sparse.toarray())
    with pytest.raises(ValueError, match=expected_msg):
        imputer.transform(X_sparse)
Esempio n. 10
0
def test_imputation_constant_integer():
    """Constant strategy on integer data: every -1 is replaced by 0."""
    data = np.array([
        [-1, 2, 3, -1],
        [4, -1, 5, -1],
        [6, 7, -1, -1],
        [8, 9, 0, -1],
    ])
    expected = np.array([
        [0, 2, 3, 0],
        [4, 0, 5, 0],
        [6, 7, 0, 0],
        [8, 9, 0, 0],
    ])

    imputer = SimpleImputer(
        missing_values=-1, strategy="constant", fill_value=0)
    assert_array_equal(imputer.fit_transform(data), expected)
Esempio n. 11
0
def test_imputation_pipeline_grid_search():
    """Smoke test: grid-search the imputation strategy inside a pipeline."""
    X = sparse_random_matrix(100, 100, density=0.10)
    # Use the first stored value of X as the missing-value marker
    missing_values = X.data[0]

    steps = [
        ('imputer', SimpleImputer(missing_values=missing_values)),
        ('tree', tree.DecisionTreeRegressor(random_state=0)),
    ]
    pipeline = Pipeline(steps)

    param_grid = {'imputer__strategy': ["mean", "median", "most_frequent"]}

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
Esempio n. 12
0
def test_imputation_shape(strategy):
    """The imputed output keeps the (10, 2) input shape for any strategy."""
    expected_shape = (10, 2)
    data = np.random.randn(10, 2)
    data[::2] = np.nan  # every other row is entirely missing

    # SimpleImputer: sparse input first, then dense input
    simple = SimpleImputer(strategy=strategy)
    for candidate in (sparse.csr_matrix(data), data):
        assert simple.fit_transform(candidate).shape == expected_shape

    # IterativeImputer using the same strategy for its initial pass
    iterative = IterativeImputer(initial_strategy=strategy)
    assert iterative.fit_transform(data).shape == expected_shape
Esempio n. 13
0
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    """Unsupported dtypes must be rejected with a ValueError.

    Exercised for the "most_frequent" and "constant" strategies on
    non-numeric data.
    """
    nan = np.nan
    rows = [
        [nan, nan, "a", "f"],
        [nan, "c", nan, "d"],
        [nan, "b", "d", nan],
        [nan, "c", "d", "h"],
    ]
    data = np.array(rows, dtype=dtype)

    with pytest.raises(ValueError, match="SimpleImputer does not support data"):
        SimpleImputer(strategy=strategy).fit(data).transform(data)
Esempio n. 14
0
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    """``add_indicator=True`` on sparse input appends indicator columns.

    The expected output holds the three imputed columns followed by three
    0/1 columns flagging where a value was missing, and stays sparse.
    """
    nan = np.nan
    X_sparse = arr_type([
        [nan, 1, 5],
        [2, nan, 1],
        [6, 3, nan],
        [1, 2, 9],
    ])
    expected = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    result = imputer.fit_transform(X_sparse)

    assert sparse.issparse(result)
    assert result.shape == expected.shape
    assert_allclose(result.toarray(), expected)
Esempio n. 15
0
def test_imputation_constant_float(array_constructor):
    """Constant strategy on float data: every NaN is replaced by -1."""
    nan = np.nan
    raw = np.array([
        [nan, 1.1, 0, nan],
        [1.2, nan, 1.3, nan],
        [0, 0, nan, nan],
        [1.4, 1.5, 0, nan],
    ])
    raw_expected = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1],
    ])

    # Wrap input and expectation in the container type under test
    X = array_constructor(raw)
    X_true = array_constructor(raw_expected)

    result = SimpleImputer(strategy="constant", fill_value=-1).fit_transform(X)

    assert_allclose_dense_sparse(result, X_true)
Esempio n. 16
0
def test_imputation_constant_object(marker):
    """Constant strategy on object data: ``marker`` becomes "missing"."""
    data = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker],
    ], dtype=object)

    expected = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"],
    ], dtype=object)

    imputer = SimpleImputer(
        missing_values=marker, strategy="constant", fill_value="missing")
    assert_array_equal(imputer.fit_transform(data), expected)
Esempio n. 17
0
def test_imputation_most_frequent_pandas(dtype):
    """Most-frequent strategy on a pandas DataFrame read from CSV text.

    The CSV has four columns; the last one is entirely empty and the
    expected output only holds the first three.
    """
    pd = pytest.importorskip("pandas")

    csv_buffer = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                             ",i,x,\n"
                             "a,,y,\n"
                             "a,j,,\n"
                             "b,j,x,")
    frame = pd.read_csv(csv_buffer, dtype=dtype)

    expected = np.array([
        ["a", "i", "x"],
        ["a", "j", "y"],
        ["a", "j", "x"],
        ["b", "j", "x"],
    ], dtype=object)

    result = SimpleImputer(strategy="most_frequent").fit_transform(frame)

    assert_array_equal(result, expected)
Esempio n. 18
0
def test_imputation_most_frequent_objects(marker):
    """Most-frequent strategy on object data.

    The first input column holds only ``marker``; the expected output has
    the three remaining columns.
    """
    m = marker
    data = np.array([
        [m, m, "a", "f"],
        [m, "c", m, "d"],
        [m, "b", "d", m],
        [m, "c", "d", "h"],
    ], dtype=object)

    expected = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="most_frequent")
    imputer.fit(data)

    assert_array_equal(imputer.transform(data), expected)
Esempio n. 19
0
def test_iterative_imputer_missing_at_transform(strategy):
    """A feature with no missing value at fit uses the initial imputer.

    Column 0 is complete in the training data but has a missing entry at
    transform time; IterativeImputer must then produce the same values as
    a SimpleImputer configured with the same strategy.
    """
    rng = np.random.RandomState(0)
    shape = (100, 10)
    X_train = rng.randint(low=0, high=3, size=shape)
    X_test = rng.randint(low=0, high=3, size=shape)

    X_train[:, 0] = 1  # column 0 never holds the missing marker (0) at fit
    X_test[0, 0] = 0  # column 0 does hold it at transform

    iterative = IterativeImputer(
        missing_values=0,
        max_iter=1,
        initial_strategy=strategy,
        random_state=rng,
    )
    iterative.fit(X_train)
    simple = SimpleImputer(missing_values=0, strategy=strategy)
    simple.fit(X_train)

    # With nothing missing at fit time, only the initial imputer is used
    # for that feature at transform time.
    iterative_col = iterative.transform(X_test)[:, 0]
    simple_col = simple.transform(X_test)[:, 0]
    assert_allclose(iterative_col, simple_col)
Esempio n. 20
0
def test_permutation_importance_mixed_types():
    """Permutation importance on an array containing NaN.

    Runs the computation with two different random states: both must rank
    the last column highest, and their raw importances must differ.
    """
    n_repeats = 4

    # The last column follows the same pattern as the target
    X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
    y = np.array([0, 1, 0, 1])
    expected_shape = (X.shape[1], n_repeats)

    model = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs'))
    model.fit(X, y)

    # First run
    first = permutation_importance(
        model, X, y,
        n_repeats=n_repeats,
        random_state=np.random.RandomState(42))

    assert first.importances.shape == expected_shape
    first_means = first.importances_mean
    assert np.all(first_means[-1] > first_means[:-1])

    # Second run with another random state
    second = permutation_importance(
        model, X, y,
        n_repeats=n_repeats,
        random_state=np.random.RandomState(0))
    assert second.importances.shape == expected_shape

    assert not np.allclose(first.importances, second.importances)

    second_means = second.importances_mean
    assert np.all(second_means[-1] > second_means[:-1])
Esempio n. 21
0
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    """A union of SimpleImputer and MissingIndicator yields ``X_trans_exp``."""
    imputer = SimpleImputer(
        missing_values=missing_values, strategy='most_frequent')
    indicator = MissingIndicator(missing_values=missing_values)
    union = make_union(imputer, indicator)

    assert_array_equal(union.fit_transform(X), X_trans_exp)
Esempio n. 22
0
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    """Non-numeric input must raise a "non-numeric data" ValueError."""
    rows = [
        ["a", "b", 3],
        [4, "e", 6],
        ["g", "h", 9],
    ]
    data = np.array(rows, dtype=dtype)

    with pytest.raises(ValueError, match="non-numeric data"):
        SimpleImputer(strategy=strategy).fit_transform(data)
Esempio n. 23
0
# Load the "titanic" dataset and append two purely random columns: an
# integer one treated as categorical and a float one treated as numerical.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X['random_cat'] = np.random.randint(3, size=len(X))
X['random_num'] = np.random.randn(len(X))

categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat']
numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num']

# Keep only the columns used by the model, categorical ones first
X = X[categorical_columns + numerical_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

# Categorical columns: fill gaps with the label 'missing', then one-hot encode
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Numerical columns: fill gaps with the column mean
numerical_pipe = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

preprocessing = ColumnTransformer(transformers=[
    ('cat', categorical_pipe, categorical_columns),
    ('num', numerical_pipe, numerical_columns),
])

# Full model: preprocessing followed by a random forest
rf = Pipeline(steps=[
    ('preprocess', preprocessing),
    ('classifier', RandomForestClassifier(random_state=42)),
])
rf.fit(X_train, y_train)

##############################################################################
# Accuracy of the Model
# ---------------------
Esempio n. 24
0
# y = titanic.frame['survived']

# Features used to train the classifier.
# Numeric features:
# - age: float.
# - fare: float.
# Categorical features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# Numeric data: impute with the median, then standardize.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical data: impute with the label 'missing', then one-hot encode.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Chain the preprocessing with the classifier to obtain a full
# prediction pipeline.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])
Esempio n. 25
0
    scoring='neg_mean_squared_error',
    cv=N_SPLITS),
                               columns=['Full Data'])

# Add a single missing value to each row
# Every row index is paired with one feature index drawn at random (with
# replacement), and that entry is overwritten with NaN.
# NOTE(review): X_full, y_full, n_samples, n_features and rng are defined
# outside this fragment; X_full is presumably a 2D float array with
# n_samples rows -- confirm against the preceding code.
X_missing = X_full.copy()
# The target is not modified, so it is reused as-is (no copy is made)
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan

# Estimate the score after imputation (mean and median strategies)
# One column of cross-validation scores is stored per strategy.
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    # Imputation runs inside the pipeline, ahead of br_estimator
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator)
    score_simple_imputer[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        scoring='neg_mean_squared_error',
        cv=N_SPLITS)

# Estimate the score after iterative imputation of the missing values
# with different estimators
# This list only declares the candidate estimators; they are not used
# within this fragment.
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]