def test_iterative_imputer_all_missing():
    """An all-missing input must come back as the initial imputation."""
    n_samples, n_features = 100, 3
    # With missing_values=0, an all-zero matrix has every entry missing.
    data = np.zeros((n_samples, n_features))

    imputer = IterativeImputer(missing_values=0, max_iter=1)
    result = imputer.fit_transform(data)

    expected = imputer.initial_imputer_.transform(data)
    assert_allclose(result, expected)
def test_iterative_imputer_truncated_normal_posterior():
    """Check that bounded posterior samples still look Gaussian.

    A single missing value is imputed 100 times with
    ``sample_posterior=True`` and ``min_value``/``max_value`` set. Every
    draw must lie inside the bounds, and the standardized draws are
    compared with a normal distribution via the Kolmogorov-Smirnov test.
    """
    # test that the values that are imputed using `sample_posterior=True`
    # with boundaries (`min_value` and `max_value` are not None) are drawn
    # from a distribution that looks gaussian via the Kolmogorov Smirnov test.
    # note that starting from the wrong random seed will make this test fail
    # because random sampling doesn't occur at all when the imputation
    # is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # Guard against a degenerate sample *before* standardizing, so that the
    # KS test is run exactly once and never divides by zero.
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
def test_iterative_imputer_additive_matrix():
    """Recover held-out entries of a matrix built from additive terms.

    The imputer is fitted on the first half of the rows (with a quarter of
    the entries masked) and must reconstruct the second half closely.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    left = rng.randn(n_samples, n_features)
    right = rng.randn(n_samples, n_features)

    complete = np.zeros(left.shape)
    for src_a in range(n_features):
        for src_b in range(n_features):
            target = (src_a + src_b) % n_features
            complete[:, target] += (left[:, src_a] + right[:, src_b]) / 2

    # a quarter is randomly missing
    missing = rng.rand(n_samples, n_features) < 0.25
    with_gaps = complete.copy()
    with_gaps[missing] = np.nan

    # split up data
    half = n_samples // 2
    train_gaps = with_gaps[:half]
    test_truth = complete[half:]
    test_gaps = with_gaps[half:]

    imputer = IterativeImputer(
        max_iter=10, verbose=1, random_state=rng
    ).fit(train_gaps)
    test_estimate = imputer.transform(test_gaps)
    assert_allclose(test_truth, test_estimate, rtol=1e-3, atol=0.01)
def test_iterative_imputer_early_stopping():
    """Check the interaction between ``tol``, ``max_iter`` and ``n_iter_``.

    Stopping early through ``tol`` must give the same result as capping
    ``max_iter`` at the iteration count reached, and ``tol=0`` must use
    every allowed iteration.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 50, 5
    col_factor = rng.rand(n_samples, 1)
    row_factor = rng.rand(1, n_features)
    X = np.dot(col_factor, row_factor)

    missing = rng.rand(n_samples, n_features) < 0.5
    X_missing = X.copy()
    X_missing[missing] = np.nan

    # First run: generous iteration budget, stops on the tolerance.
    imputer = IterativeImputer(max_iter=100,
                               tol=1e-2,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    filled_with_tol = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == n_features * imputer.n_iter_

    # Second run: budget capped at the number of iterations reached above.
    reached = imputer.n_iter_
    imputer = IterativeImputer(max_iter=reached,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    filled_capped = imputer.fit_transform(X_missing)
    assert_allclose(filled_with_tol, filled_capped, atol=1e-7)

    # Third run: zero tolerance, so the full budget has to be consumed.
    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
def test_iterative_imputer_imputation_order(imputation_order):
    """Check the feature visiting order for a given ``imputation_order``.

    Column 0 is filled with ones (no missing values under
    ``missing_values=0``) and ``skip_complete=True`` is used, so the
    assertions below expect ``d - 1`` features per round.
    """
    rng = np.random.RandomState(0)
    n, d = 100, 10
    max_iter = 2
    X = _sparse_random_matrix(n, d, density=0.10,
                              random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               skip_complete=True,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    visited = [step.feat_idx for step in imputer.imputation_sequence_]

    per_round = len(visited) // imputer.n_iter_
    assert (per_round == imputer.n_features_with_missing_)

    if imputation_order == 'roman':
        assert np.all(visited[:d-1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(visited[:d-1] == np.arange(d-1, 0, -1))
    elif imputation_order == 'random':
        first_round = visited[:d-1]
        second_round = visited[d-1:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(visited) == max_iter * (d - 1)
def test_iterative_imputer_skip_non_missing(skip_complete):
    """Impute a feature that is missing only in the test data."""
    # check the imputing strategy when missing data are present in the
    # testing set only.
    # taken from: https://github.com/scikit-learn/scikit-learn/issues/14383
    rng = np.random.RandomState(0)
    train = np.array([
        [5, 2, 2, 1],
        [10, 1, 2, 7],
        [3, 1, 1, 1],
        [8, 4, 2, 2]
    ])
    test = np.array([
        [np.nan, 2, 4, 5],
        [np.nan, 4, 1, 2],
        [np.nan, 1, 10, 1]
    ])
    imputer = IterativeImputer(
        initial_strategy='mean',
        skip_complete=skip_complete,
        random_state=rng
    )
    first_column = imputer.fit(train).transform(test)[:, 0]

    if not skip_complete:
        assert_allclose(first_column, [11, 7, 12], rtol=1e-4)
    else:
        # impute with the initial strategy: 'mean'
        assert_allclose(first_column, np.mean(train[:, 0]))
def test_imputation_shape(strategy):
    """Imputed output keeps the (10, 2) input shape for ``strategy``."""
    # Verify the shapes of the imputed matrix for different strategies.
    expected_shape = (10, 2)
    X = np.random.randn(*expected_shape)
    X[::2] = np.nan

    simple = SimpleImputer(strategy=strategy)
    # Sparse input first, then the dense array.
    for data in (sparse.csr_matrix(X), X):
        assert simple.fit_transform(data).shape == expected_shape

    iterative = IterativeImputer(initial_strategy=strategy)
    assert iterative.fit_transform(X).shape == expected_shape
def test_iterative_imputer_rank_one():
    """Reconstruct a half-masked outer-product matrix within 0.02."""
    rng = np.random.RandomState(0)
    size = 50
    column = rng.rand(size, 1)
    row = rng.rand(1, size)
    truth = np.dot(column, row)

    hidden = rng.rand(size, size) < 0.5
    observed = truth.copy()
    observed[hidden] = np.nan

    imputer = IterativeImputer(max_iter=5,
                               verbose=1,
                               random_state=rng)
    reconstructed = imputer.fit_transform(observed)
    assert_allclose(reconstructed, truth, atol=0.02)
def test_iterative_imputer_one_feature(X):
    """No imputation round is run (``n_iter_ == 0``) on these inputs."""
    # check we exit early when there is a single feature
    fitted = IterativeImputer().fit(X)
    assert fitted.n_iter_ == 0

    # The same instance is refitted on each single-column dataset in turn.
    reused = IterativeImputer()
    for single_column in ([[1], [2]], [[1], [np.nan]]):
        reused.fit(single_column)
        assert reused.n_iter_ == 0
def test_iterative_imputer_clip():
    """Imputed values are clipped to ``[min_value, max_value]``.

    Entries that were not missing must be left untouched.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng).toarray()

    lower, upper = 0.1, 0.2
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=lower,
                               max_value=upper,
                               random_state=rng)
    Xt = imputer.fit_transform(X)

    imputed_values = Xt[X == 0]
    assert_allclose(np.min(imputed_values), lower)
    assert_allclose(np.max(Xt[X == 0]), upper)
    assert_allclose(Xt[X != 0], X[X != 0])
def test_iterative_imputer_clip_truncnorm():
    """Posterior-sampled imputations also respect the value bounds.

    Entries that were not missing must be left untouched.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng).toarray()
    X[:, 0] = 1

    lower, upper = 0.1, 0.2
    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=lower,
                               max_value=upper,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)

    imputed_values = Xt[X == 0]
    assert_allclose(np.min(imputed_values), lower)
    assert_allclose(np.max(Xt[X == 0]), upper)
    assert_allclose(Xt[X != 0], X[X != 0])
def test_iterative_imputer_missing_at_transform(strategy):
    """A feature first seen missing at transform time.

    Column 0 has no missing value during fit but has one at transform;
    its output must match what a plain ``SimpleImputer`` produces.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    shape = (n_samples, n_features)
    train = rng.randint(low=0, high=3, size=shape)
    test = rng.randint(low=0, high=3, size=shape)

    train[:, 0] = 1  # definitely no missing values in 0th column
    test[0, 0] = 0  # definitely missing value in 0th column

    iterative = IterativeImputer(missing_values=0,
                                 max_iter=1,
                                 initial_strategy=strategy,
                                 random_state=rng).fit(train)
    simple = SimpleImputer(missing_values=0,
                           strategy=strategy).fit(train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    iterative_col = iterative.transform(test)[:, 0]
    simple_col = simple.transform(test)[:, 0]
    assert_allclose(iterative_col, simple_col)
def test_iterative_imputer_transform_recovery(rank):
    """Recover held-out rows of a product of two random factors.

    The factors have inner dimension ``rank``. The imputer is fitted on
    the first half of the rows and evaluated on the second half.
    """
    rng = np.random.RandomState(0)
    n_rows = 70
    n_cols = 70
    left = rng.rand(n_rows, rank)
    right = rng.rand(rank, n_cols)
    complete = np.dot(left, right)

    hidden = rng.rand(n_rows, n_cols) < 0.5
    with_gaps = complete.copy()
    with_gaps[hidden] = np.nan

    # split up data in half
    half = n_rows // 2
    train_gaps = with_gaps[:half]
    test_truth = complete[half:]
    test_gaps = with_gaps[half:]

    imputer = IterativeImputer(max_iter=5,
                               imputation_order='descending',
                               verbose=1,
                               random_state=rng).fit(train_gaps)
    test_estimate = imputer.transform(test_gaps)
    assert_allclose(test_truth, test_estimate, atol=0.1)
def test_iterative_imputer_estimators(estimator):
    """Each imputation step holds its own estimator of the expected type.

    When ``estimator`` is None the expected type is ``BayesianRidge``.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    if estimator is None:
        expected_type = type(BayesianRidge())
    else:
        expected_type = type(estimator)

    identities = []
    for step in imputer.imputation_sequence_:
        assert isinstance(step.estimator, expected_type)
        identities.append(id(step.estimator))

    # check that each estimator is unique
    assert len(set(identities)) == len(identities)
def test_iterative_imputer_catch_warning():
    """Fitting with a constant feature must emit no warning and no NaN.

    Warnings are recorded with the standard-library ``warnings`` module
    rather than ``pytest.warns(None)``, which newer pytest releases have
    deprecated and then removed.
    """
    # Local import keeps this block self-contained.
    import warnings

    # check that we catch a RuntimeWarning due to a division by zero when a
    # feature is constant in the dataset
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate that a feature only contain one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(
            np.arange(n_samples), size=int(n_samples * missing_rate),
            replace=False
        )
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with warnings.catch_warnings(record=True) as record:
        # Record every warning, including ones that default filters hide.
        warnings.simplefilter("always")
        X_fill = imputer.fit_transform(X, y)
    assert not record
    assert not np.any(np.isnan(X_fill))
def test_imputation_order(order, idx_order):
    """The fitted imputation sequence visits features as ``idx_order``."""
    # regression test for #15393
    rng = np.random.RandomState(42)
    X = rng.rand(100, 5)
    # (number of leading rows set to NaN, column index)
    for n_missing, column in ((50, 1), (30, 0), (20, 2), (10, 4)):
        X[:n_missing, column] = np.nan

    with pytest.warns(ConvergenceWarning):
        fitted = IterativeImputer(max_iter=1,
                                  imputation_order=order,
                                  random_state=0).fit(X)
        visited = [step.feat_idx for step in fitted.imputation_sequence_]
        assert visited == idx_order
def test_iterative_imputer_no_missing():
    """An all-NaN column is dropped and the other columns pass through."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan

    via_fit = IterativeImputer(max_iter=10, random_state=rng)
    via_fit_transform = IterativeImputer(max_iter=10, random_state=rng)
    out_fit = via_fit.fit(X).transform(X)
    out_fit_transform = via_fit_transform.fit_transform(X)

    # should exclude the first column entirely
    assert_allclose(X[:, 1:], out_fit)
    # fit and fit_transform should both be identical
    assert_allclose(out_fit, out_fit_transform)
def test_iterative_imputer_zero_iters():
    """With zero iterations only the initial imputation is applied."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng).toarray()
    X[X == 0] = np.nan

    imputer = IterativeImputer(max_iter=0)
    imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    matches_initial = np.all(imputer.transform(X) ==
                             imputer.initial_imputer_.transform(X))
    assert not matches_initial

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
import numpy as np
from scipy import sparse

from sklearn_lib.utils._testing import assert_allclose
from sklearn_lib.utils._testing import assert_allclose_dense_sparse
from sklearn_lib.utils._testing import assert_array_equal

# The experimental flag is imported before IterativeImputer itself.
from sklearn_lib.experimental import enable_iterative_imputer  # noqa

from sklearn_lib.impute import IterativeImputer
from sklearn_lib.impute import KNNImputer
from sklearn_lib.impute import SimpleImputer


# Imputer instances shared by the parametrized tests of this module.
IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# NOTE(review): `pytest` is used below but is not imported in the visible
# lines -- confirm that it is imported elsewhere in the file.
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn_lib.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    """Transform data containing NaN after fitting on complete data.

    The test makes no assertion on the output; it passes as long as
    ``fit`` followed by ``transform`` does not raise.
    """
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    # Note: this mutates the instance taken from the module-level IMPUTERS.
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    """These ``max_iter``/``tol`` values must raise the expected error.

    ``warning`` is the pattern the error message has to match.
    """
    data = np.zeros((100, 2))
    misconfigured = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        misconfigured.fit_transform(data)
def test_iterative_imputer_transform_stochasticity():
    """``transform`` varies only when the posterior is sampled.

    With ``sample_posterior=True`` two transforms differ; with a fully
    deterministic configuration they agree, even across random states.
    """
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n_samples, n_features = 100, 10
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    sampling = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=True,
                                random_state=rng1)
    sampling.fit(X)

    draw_a = sampling.transform(X)
    draw_b = sampling.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(draw_a) != pytest.approx(np.mean(draw_b))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    deterministic = dict(missing_values=0,
                         max_iter=1,
                         sample_posterior=False,
                         n_nearest_features=None,
                         imputation_order='ascending')
    imputer1 = IterativeImputer(random_state=rng1, **deterministic)
    imputer2 = IterativeImputer(random_state=rng2, **deterministic)
    imputer1.fit(X)
    imputer2.fit(X)

    first_once = imputer1.transform(X)
    first_again = imputer1.transform(X)
    second = imputer2.transform(X)

    assert_allclose(first_once, first_again)
    assert_allclose(first_once, second)
def test_iterative_imputer_verbose():
    """Fit and transform run without error at verbosity levels 1 and 2."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 3
    X = _sparse_random_matrix(n_samples, n_features, density=0.10,
                              random_state=rng).toarray()

    for level in (1, 2):
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=level)
        imputer.fit(X)
        imputer.transform(X)