def test_imputation_deletion_warning(strategy): X = np.ones((3, 5)) X[:, 0] = np.nan with pytest.warns(UserWarning, match="Deleting"): imputer = SimpleImputer(strategy=strategy, verbose=True) imputer.fit_transform(X)
def test_imputation_error_invalid_strategy(strategy): X = np.ones((3, 5)) X[0, 0] = np.nan with pytest.raises(ValueError, match=str(strategy)): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X)
def test_imputation_constant_error_invalid_type(X_data, missing_value): # Verify that exceptions are raised on invalid fill_value type X = np.full((3, 5), X_data, dtype=float) X[0, 0] = missing_value with pytest.raises(ValueError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", fill_value="x") imputer.fit_transform(X)
def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]]) imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true)
def test_imputation_shape(strategy): # Verify the shapes of the imputed matrix for different strategies. X = np.random.randn(10, 2) X[::2] = np.nan imputer = SimpleImputer(strategy=strategy) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert X_imputed.shape == (10, 2) X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) iterative_imputer = IterativeImputer(initial_strategy=strategy) X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2)
def _check_statistics(X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. Test with dense and sparse arrays Check that: - the statistics (mean, median, mode) are correct - the missing values are imputed correctly""" err_msg = "Parameters: strategy = %s, missing_values = %s, " \ "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': assert_ae = assert_array_almost_equal # Normal matrix imputer = SimpleImputer(missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix imputer = SimpleImputer(missing_values, strategy=strategy) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) if sparse.issparse(X_trans): X_trans = X_trans.toarray() assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(True)) assert_ae(X_trans, X_true, err_msg=err_msg.format(True))
def test_imputation_const_mostf_error_invalid_types(strategy, dtype): # Test imputation on non-numeric data using "most_frequent" and "constant" # strategy X = np.array([ [np.nan, np.nan, "a", "f"], [np.nan, "c", np.nan, "d"], [np.nan, "b", "d", np.nan], [np.nan, "c", "d", "h"], ], dtype=dtype) err_msg = "SimpleImputer does not support data" with pytest.raises(ValueError, match=err_msg): imputer = SimpleImputer(strategy=strategy) imputer.fit(X).transform(X)
def test_simple_imputation_add_indicator_sparse_matrix(arr_type): X_sparse = arr_type([[np.nan, 1, 5], [2, np.nan, 1], [6, 3, np.nan], [1, 2, 9]]) X_true = np.array([ [3., 1., 5., 1., 0., 0.], [2., 2., 1., 0., 1., 0.], [6., 3., 5., 0., 0., 1.], [1., 2., 9., 0., 0., 0.], ]) imputer = SimpleImputer(missing_values=np.nan, add_indicator=True) X_trans = imputer.fit_transform(X_sparse) assert sparse.issparse(X_trans) assert X_trans.shape == X_true.shape assert_allclose(X_trans.toarray(), X_true)
def test_permutation_importance_mixed_types_pandas(): pd = pytest.importorskip("pandas") rng = np.random.RandomState(42) n_repeats = 5 # Last column is correlated with y X = pd.DataFrame({ 'col1': [1.0, 2.0, 3.0, np.nan], 'col2': ['a', 'b', 'a', 'b'] }) y = np.array([0, 1, 0, 1]) num_preprocess = make_pipeline(SimpleImputer(), StandardScaler()) preprocess = ColumnTransformer([('num', num_preprocess, ['col1']), ('cat', OneHotEncoder(), ['col2'])]) clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs')) clf.fit(X, y) result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result.importances.shape == (X.shape[1], n_repeats) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
def test_imputation_constant_float(array_constructor): # Test imputation using the constant strategy on floats X = np.array([[np.nan, 1.1, 0, np.nan], [1.2, np.nan, 1.3, np.nan], [0, 0, np.nan, np.nan], [1.4, 1.5, 0, np.nan]]) X_true = np.array([[-1, 1.1, 0, -1], [1.2, -1, 1.3, -1], [0, 0, -1, -1], [1.4, 1.5, 0, -1]]) X = array_constructor(X) X_true = array_constructor(X_true) imputer = SimpleImputer(strategy="constant", fill_value=-1) X_trans = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans, X_true)
def test_imputation_constant_object(marker): # Test imputation using the constant strategy on objects X = np.array([[marker, "a", "b", marker], ["c", marker, "d", marker], ["e", "f", marker, marker], ["g", "h", "i", marker]], dtype=object) X_true = np.array( [["missing", "a", "b", "missing"], ["c", "missing", "d", "missing"], ["e", "f", "missing", "missing"], ["g", "h", "i", "missing"]], dtype=object) imputer = SimpleImputer(missing_values=marker, strategy="constant", fill_value="missing") X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true)
def get_results(dataset): X_full, y_full = dataset.data, dataset.target n_samples = X_full.shape[0] n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values full_scores = cross_val_score(REGRESSOR, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) # Add missing values in 75% of the lines missing_rate = 0.75 n_missing_samples = int(np.floor(n_samples * missing_rate)) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() # Estimate the score after replacing missing values by 0 imputer = SimpleImputer(missing_values=0, strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after imputation (mean strategy) of the missing values imputer = SimpleImputer(missing_values=0, strategy="mean") mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after iterative imputation of the missing values imputer = IterativeImputer(missing_values=0, random_state=0, n_nearest_features=5, sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), (iterative_impute_scores.mean(), iterative_impute_scores.std()))
def test_imputation_most_frequent_pandas(dtype): # Test imputation using the most frequent strategy on pandas df pd = pytest.importorskip("pandas") f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n" ",i,x,\n" "a,,y,\n" "a,j,,\n" "b,j,x,") df = pd.read_csv(f, dtype=dtype) X_true = np.array( [["a", "i", "x"], ["a", "j", "y"], ["a", "j", "x"], ["b", "j", "x"]], dtype=object) imputer = SimpleImputer(strategy="most_frequent") X_trans = imputer.fit_transform(df) assert_array_equal(X_trans, X_true)
def test_imputation_error_sparse_0(strategy): # check that error are raised when missing_values = 0 and input is sparse X = np.ones((3, 5)) X[0] = 0 X = sparse.csc_matrix(X) imputer = SimpleImputer(strategy=strategy, missing_values=0) with pytest.raises(ValueError, match="Provide a dense array"): imputer.fit(X) imputer.fit(X.toarray()) with pytest.raises(ValueError, match="Provide a dense array"): imputer.transform(X)
def test_calibration_nan_imputer(): """Test that calibration can accept nan""" X, y = make_classification(n_samples=10, n_features=2, n_informative=2, n_redundant=0, random_state=42) X[0, 0] = np.nan clf = Pipeline( [('imputer', SimpleImputer()), ('rf', RandomForestClassifier(n_estimators=1))]) clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic') clf_c.fit(X, y) clf_c.predict(X)
def test_imputation_most_frequent_objects(marker): # Test imputation using the most-frequent strategy. X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], [marker, "b", "d", marker], [marker, "c", "d", "h"], ], dtype=object) X_true = np.array([ ["c", "a", "f"], ["c", "d", "d"], ["b", "d", "d"], ["c", "d", "h"], ], dtype=object) imputer = SimpleImputer(missing_values=marker, strategy="most_frequent") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true)
def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 X_train = rng.randint(low=0, high=3, size=(n, d)) X_test = rng.randint(low=0, high=3, size=(n, d)) X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column imputer = IterativeImputer(missing_values=0, max_iter=1, initial_strategy=strategy, random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) # if there were no missing values at time of fit, then imputer will # only use the initial imputer for that feature at transform assert_allclose( imputer.transform(X_test)[:, 0], initial_imputer.transform(X_test)[:, 0])
def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. X = sparse_random_matrix(100, 100, density=0.10) missing_values = X.data[0] pipeline = Pipeline([('imputer', SimpleImputer(missing_values=missing_values)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]} Y = sparse_random_matrix(100, 1, density=0.10).toarray() gs = GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def test_permutation_importance_mixed_types(): rng = np.random.RandomState(42) n_repeats = 4 # Last column is correlated with y X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T y = np.array([0, 1, 0, 1]) clf = make_pipeline(SimpleImputer(), LogisticRegression(solver='lbfgs')) clf.fit(X, y) result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result.importances.shape == (X.shape[1], n_repeats) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) # use another random state rng = np.random.RandomState(0) result2 = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) assert result2.importances.shape == (X.shape[1], n_repeats) assert not np.allclose(result.importances, result2.importances) # the correlated feature with y is the last column and should # have the highest importance assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): trans = make_union( SimpleImputer(missing_values=missing_values, strategy='most_frequent'), MissingIndicator(missing_values=missing_values)) X_trans = trans.fit_transform(X) assert_array_equal(X_trans, X_trans_exp)
def test_imputation_mean_median_error_invalid_type(strategy, dtype): X = np.array([["a", "b", 3], [4, "e", 6], ["g", "h", 9]], dtype=dtype) with pytest.raises(ValueError, match="non-numeric data"): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X)
def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert not np.all(X == Xt) # copy=True, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_array_almost_equal(X, Xt) # copy=False, sparse csc => no copy X = X_orig.copy().tocsc() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_array_almost_equal(X.data, Xt.data) # copy=False, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) X['random_cat'] = np.random.randint(3, size=X.shape[0]) X['random_num'] = np.random.randn(X.shape[0]) categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] X = X[categorical_columns + numerical_columns] X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) categorical_pipe = Pipeline([ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) rf = Pipeline([('preprocess', preprocessing), ('classifier', RandomForestClassifier(random_state=42))]) rf.fit(X_train, y_train) ############################################################################## # Accuracy of the Model # ---------------------
# y = titanic.frame['survived'] # We will train our classifier with the following features: # Numeric Features: # - age: float. # - fare: float. # Categorical Features: # - embarked: categories encoded as strings {'C', 'S', 'Q'}. # - sex: categories encoded as strings {'female', 'male'}. # - pclass: ordinal integers {1, 2, 3}. # We create the preprocessing pipelines for both numeric and categorical data. numeric_features = ['age', 'fare'] numeric_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]) categorical_features = ['embarked', 'sex', 'pclass'] categorical_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing') ), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer(transformers=[( 'num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)]) # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())])
scoring='neg_mean_squared_error', cv=N_SPLITS), columns=['Full Data']) # Add a single missing value to each row X_missing = X_full.copy() y_missing = y_full missing_samples = np.arange(n_samples) missing_features = rng.choice(n_features, n_samples, replace=True) X_missing[missing_samples, missing_features] = np.nan # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() for strategy in ('mean', 'median'): estimator = make_pipeline( SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator) score_simple_imputer[strategy] = cross_val_score( estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) # Estimate the score after iterative imputation of the missing values # with different estimators estimators = [ BayesianRidge(), DecisionTreeRegressor(max_features='sqrt', random_state=0), ExtraTreesRegressor(n_estimators=10, random_state=0), KNeighborsRegressor(n_neighbors=15) ]