def test_imputation_error_invalid_strategy(strategy): X = np.ones((3, 5)) X[0, 0] = np.nan with pytest.raises(ValueError, match=str(strategy)): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X)
def test_imputation_deletion_warning(strategy): X = np.ones((3, 5)) X[:, 0] = np.nan with pytest.warns(UserWarning, match="Deleting"): imputer = SimpleImputer(strategy=strategy, verbose=True) imputer.fit_transform(X)
def test_imputation_mean_median_error_invalid_type(strategy, dtype): X = np.array([["a", "b", 3], [4, "e", 6], ["g", "h", 9]], dtype=dtype) msg = "non-numeric data:\ncould not convert string to float: '" with pytest.raises(ValueError, match=msg): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X)
def test_imputation_constant_error_invalid_type(X_data, missing_value): # Verify that exceptions are raised on invalid fill_value type X = np.full((3, 5), X_data, dtype=float) X[0, 0] = missing_value with pytest.raises(ValueError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", fill_value="x") imputer.fit_transform(X)
def test_imputation_mean_median_error_invalid_type_list_pandas(strategy, type): X = [["a", "b", 3], [4, "e", 6], ["g", "h", 9]] if type == 'dataframe': pd = pytest.importorskip("pandas") X = pd.DataFrame(X) msg = "non-numeric data:\ncould not convert string to float: '" with pytest.raises(ValueError, match=msg): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X)
def test_imputation_const_mostf_error_invalid_types(strategy, dtype): # Test imputation on non-numeric data using "most_frequent" and "constant" # strategy X = np.array([ [np.nan, np.nan, "a", "f"], [np.nan, "c", np.nan, "d"], [np.nan, "b", "d", np.nan], [np.nan, "c", "d", "h"], ], dtype=dtype) err_msg = "SimpleImputer does not support data" with pytest.raises(ValueError, match=err_msg): imputer = SimpleImputer(strategy=strategy) imputer.fit(X).transform(X)
def test_imputation_shape(strategy): # Verify the shapes of the imputed matrix for different strategies. X = np.random.randn(10, 2) X[::2] = np.nan imputer = SimpleImputer(strategy=strategy) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert X_imputed.shape == (10, 2) X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) iterative_imputer = IterativeImputer(initial_strategy=strategy) X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2)
def test_permutation_importance_mixed_types_pandas(): pd = pytest.importorskip("pandas") rng = np.random.RandomState(42) n_repeats = 5 # Last column is correlated with y X = pd.DataFrame({ 'col1': [1.0, 2.0, 3.0, np.nan], 'col2': ['a', 'b', 'a', 'b'] }) y = np.array([0, 1, 0, 1]) num_preprocess = make_pipeline(SimpleImputer(), StandardScaler()) preprocess = ColumnTransformer([('num', num_preprocess, ['col1']), ('cat', OneHotEncoder(), ['col2'])]) clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs')) clf.fit(X, y) result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result.importances.shape == (X.shape[1], n_repeats) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): trans = make_union( SimpleImputer(missing_values=missing_values, strategy='most_frequent'), MissingIndicator(missing_values=missing_values) ) X_trans = trans.fit_transform(X) assert_array_equal(X_trans, X_trans_exp)
def test_imputation_error_sparse_0(strategy): # check that error are raised when missing_values = 0 and input is sparse X = np.ones((3, 5)) X[0] = 0 X = sparse.csc_matrix(X) imputer = SimpleImputer(strategy=strategy, missing_values=0) with pytest.raises(ValueError, match="Provide a dense array"): imputer.fit(X) imputer.fit(X.toarray()) with pytest.raises(ValueError, match="Provide a dense array"): imputer.transform(X)
def test_simple_imputation_add_indicator_sparse_matrix(arr_type): X_sparse = arr_type([ [np.nan, 1, 5], [2, np.nan, 1], [6, 3, np.nan], [1, 2, 9] ]) X_true = np.array([ [3., 1., 5., 1., 0., 0.], [2., 2., 1., 0., 1., 0.], [6., 3., 5., 0., 0., 1.], [1., 2., 9., 0., 0., 0.], ]) imputer = SimpleImputer(missing_values=np.nan, add_indicator=True) X_trans = imputer.fit_transform(X_sparse) assert sparse.issparse(X_trans) assert X_trans.shape == X_true.shape assert_allclose(X_trans.toarray(), X_true)
def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 X_train = rng.randint(low=0, high=3, size=(n, d)) X_test = rng.randint(low=0, high=3, size=(n, d)) X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column imputer = IterativeImputer(missing_values=0, max_iter=1, initial_strategy=strategy, random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) # if there were no missing values at time of fit, then imputer will # only use the initial imputer for that feature at transform assert_allclose(imputer.transform(X_test)[:, 0], initial_imputer.transform(X_test)[:, 0])
def test_imputation_constant_object(marker): # Test imputation using the constant strategy on objects X = np.array([ [marker, "a", "b", marker], ["c", marker, "d", marker], ["e", "f", marker, marker], ["g", "h", "i", marker] ], dtype=object) X_true = np.array([ ["missing", "a", "b", "missing"], ["c", "missing", "d", "missing"], ["e", "f", "missing", "missing"], ["g", "h", "i", "missing"] ], dtype=object) imputer = SimpleImputer(missing_values=marker, strategy="constant", fill_value="missing") X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true)
def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([ [-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1] ]) X_true = np.array([ [0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0] ]) imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true)
def test_calibration_nan_imputer(): """Test that calibration can accept nan""" X, y = make_classification(n_samples=10, n_features=2, n_informative=2, n_redundant=0, random_state=42) X[0, 0] = np.nan clf = Pipeline( [('imputer', SimpleImputer()), ('rf', RandomForestClassifier(n_estimators=1))]) clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic') clf_c.fit(X, y) clf_c.predict(X)
def test_imputation_most_frequent_objects(marker): # Test imputation using the most-frequent strategy. X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], [marker, "b", "d", marker], [marker, "c", "d", "h"], ], dtype=object) X_true = np.array([ ["c", "a", "f"], ["c", "d", "d"], ["b", "d", "d"], ["c", "d", "h"], ], dtype=object) imputer = SimpleImputer(missing_values=marker, strategy="most_frequent") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true)
def test_imputation_constant_pandas(dtype): # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n" ",i,x,\n" "a,,y,\n" "a,j,,\n" "b,j,x,") df = pd.read_csv(f, dtype=dtype) X_true = np.array([ ["missing_value", "i", "x", "missing_value"], ["a", "missing_value", "y", "missing_value"], ["a", "j", "missing_value", "missing_value"], ["b", "j", "x", "missing_value"] ], dtype=object) imputer = SimpleImputer(strategy="constant") X_trans = imputer.fit_transform(df) assert_array_equal(X_trans, X_true)
def test_imputation_constant_float(array_constructor): # Test imputation using the constant strategy on floats X = np.array([ [np.nan, 1.1, 0, np.nan], [1.2, np.nan, 1.3, np.nan], [0, 0, np.nan, np.nan], [1.4, 1.5, 0, np.nan] ]) X_true = np.array([ [-1, 1.1, 0, -1], [1.2, -1, 1.3, -1], [0, 0, -1, -1], [1.4, 1.5, 0, -1] ]) X = array_constructor(X) X_true = array_constructor(X_true) imputer = SimpleImputer(strategy="constant", fill_value=-1) X_trans = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans, X_true)
def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. X = _sparse_random_matrix(100, 100, density=0.10) missing_values = X.data[0] pipeline = Pipeline([('imputer', SimpleImputer(missing_values=missing_values)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = { 'imputer__strategy': ["mean", "median", "most_frequent"] } Y = _sparse_random_matrix(100, 1, density=0.10).toarray() gs = GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def _check_statistics(X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. Test with dense and sparse arrays Check that: - the statistics (mean, median, mode) are correct - the missing values are imputed correctly""" err_msg = "Parameters: strategy = %s, missing_values = %s, " \ "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': assert_ae = assert_array_almost_equal # Normal matrix imputer = SimpleImputer(missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix imputer = SimpleImputer(missing_values, strategy=strategy) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) if sparse.issparse(X_trans): X_trans = X_trans.toarray() assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(True)) assert_ae(X_trans, X_true, err_msg=err_msg.format(True))
def test_permutation_importance_mixed_types(): rng = np.random.RandomState(42) n_repeats = 4 # Last column is correlated with y X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T y = np.array([0, 1, 0, 1]) clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs')) clf.fit(X, y) result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result.importances.shape == (X.shape[1], n_repeats) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) # use another random state rng = np.random.RandomState(0) result2 = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result2.importances.shape == (X.shape[1], n_repeats) assert not np.allclose(result.importances, result2.importances) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
import numpy as np from scipy import sparse from sklearn_lib.utils._testing import assert_allclose from sklearn_lib.utils._testing import assert_allclose_dense_sparse from sklearn_lib.utils._testing import assert_array_equal from sklearn_lib.experimental import enable_iterative_imputer # noqa from sklearn_lib.impute import IterativeImputer from sklearn_lib.impute import KNNImputer from sklearn_lib.impute import SimpleImputer IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()] SPARSE_IMPUTERS = [SimpleImputer()] # ConvergenceWarning will be raised by the IterativeImputer @pytest.mark.filterwarnings("ignore::sklearn_lib.exceptions.ConvergenceWarning") @pytest.mark.parametrize("imputer", IMPUTERS) def test_imputation_missing_value_in_test_array(imputer): # [Non Regression Test for issue #13968] Missing value in test set should # not throw an error and return a finite dataset train = [[1], [2]] test = [[3], [np.nan]] imputer.set_params(add_indicator=True) imputer.fit(train).transform(test)
def test_imputation_copy(): # Test imputation with copy X_orig = _sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert not np.all(X == Xt) # copy=True, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_array_almost_equal(X, Xt) # copy=False, sparse csc => no copy X = X_orig.copy().tocsc() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_array_almost_equal(X.data, Xt.data) # copy=False, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data)