def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. X = _sparse_random_matrix(100, 100, density=0.10) missing_values = X.data[0] pipeline = Pipeline([('imputer', SimpleImputer(missing_values=missing_values)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]} Y = _sparse_random_matrix(100, 1, density=0.10).toarray() gs = GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 max_iter = 2 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 # this column should not be discarded by IterativeImputer imputer = IterativeImputer(missing_values=0, max_iter=max_iter, n_nearest_features=5, sample_posterior=False, skip_complete=True, min_value=0, max_value=1, verbose=1, imputation_order=imputation_order, random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] assert (len(ordered_idx) // imputer.n_iter_ == imputer.n_features_with_missing_) if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) elif imputation_order == 'random': ordered_idx_round_1 = ordered_idx[:d-1] ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: assert len(ordered_idx) == max_iter * (d - 1)
def test_iterative_imputer_verbose(): rng = np.random.RandomState(0) n = 100 d = 3 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) imputer.fit(X) imputer.transform(X) imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) imputer.fit(X) imputer.transform(X)
def test_sparse_random_matrix(): # Check some statical properties of sparse random matrix n_components = 100 n_features = 500 for density in [0.3, 1.0]: s = 1 / density A = _sparse_random_matrix(n_components, n_features, density=density, random_state=0) A = densify(A) # Check possible values values = np.unique(A) assert np.sqrt(s) / np.sqrt(n_components) in values assert -np.sqrt(s) / np.sqrt(n_components) in values if density == 1.0: assert np.size(values) == 2 else: assert 0.0 in values assert np.size(values) == 3 # Check that the random matrix follow the proper distribution. # Let's say that each element of a_{ij} of A is taken from # # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s # - 0 with probability 1 - 1 / s # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s # assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2) assert_almost_equal(np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2) assert_almost_equal(np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2) assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2) assert_almost_equal( np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2, ) assert_almost_equal( np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2, )
def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer( missing_values=0, max_iter=1, min_value=0.1, max_value=0.2, random_state=rng ) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) assert_allclose(np.max(Xt[X == 0]), 0.2) assert_allclose(Xt[X != 0], X[X != 0])
def test_iterative_imputer_transform_stochasticity(): rng1 = np.random.RandomState(0) rng2 = np.random.RandomState(1) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng1).toarray() # when sample_posterior=True, two transforms shouldn't be equal imputer = IterativeImputer( missing_values=0, max_iter=1, sample_posterior=True, random_state=rng1 ) imputer.fit(X) X_fitted_1 = imputer.transform(X) X_fitted_2 = imputer.transform(X) # sufficient to assert that the means are not the same assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) # when sample_posterior=False, and n_nearest_features=None # and imputation_order is not random # the two transforms should be identical even if rng are different imputer1 = IterativeImputer( missing_values=0, max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order="ascending", random_state=rng1, ) imputer2 = IterativeImputer( missing_values=0, max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order="ascending", random_state=rng2, ) imputer1.fit(X) imputer2.fit(X) X_fitted_1a = imputer1.transform(X) X_fitted_1b = imputer1.transform(X) X_fitted_2 = imputer2.transform(X) assert_allclose(X_fitted_1a, X_fitted_1b) assert_allclose(X_fitted_1a, X_fitted_2)
def test_check_sparse_pandas_sp_format(sp_format): # check_array converts pandas dataframe with only sparse arrays into # sparse matrix pd = pytest.importorskip("pandas") sp_mat = _sparse_random_matrix(10, 3) sdf = pd.DataFrame.sparse.from_spmatrix(sp_mat) result = check_array(sdf, accept_sparse=sp_format) if sp_format is True: # by default pandas converts to coo when accept_sparse is True sp_format = 'coo' assert sp.issparse(result) assert result.format == sp_format assert_allclose_dense_sparse(sp_mat, result)
def test_imputation_copy(): # Test imputation with copy X_orig = _sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert not np.all(X == Xt) # copy=True, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_array_almost_equal(X, Xt) # copy=False, sparse csc => no copy X = X_orig.copy().tocsc() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_array_almost_equal(X.data, Xt.data) # copy=False, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert not np.all(X.data == Xt.data)
def test_as_float_array(): # Test function for as_float_array X = np.ones((3, 10), dtype=np.int32) X = X + np.arange(10, dtype=np.int32) X2 = as_float_array(X, copy=False) assert X2.dtype == np.float32 # Another test X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten assert as_float_array(X, False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [ np.bool, np.int8, np.int16, np.int32, np.uint8, np.uint16, np.uint32 ] for dtype in tested_dtypes: X = X.astype(dtype) X2 = as_float_array(X) assert X2.dtype == np.float32 # Test object dtype X = X.astype(object) X2 = as_float_array(X, copy=True) assert X2.dtype == np.float64 # Here, X is of the right type, it shouldn't be modified X = np.ones((3, 2), dtype=np.float32) assert as_float_array(X, copy=False) is X # Test that if X is fortran ordered it stays X = np.asfortranarray(X) assert np.isfortran(as_float_array(X, copy=True)) # Test the copy parameter with some matrices matrices = [ np.matrix(np.arange(5)), sp.csc_matrix(np.arange(5)).toarray(), _sparse_random_matrix(10, 10, density=0.10).toarray() ] for M in matrices: N = as_float_array(M, copy=True) N[0, 0] = np.nan assert not np.isnan(M).any()
def test_iterative_imputer_clip_truncnorm(): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 imputer = IterativeImputer(missing_values=0, max_iter=2, n_nearest_features=5, sample_posterior=True, min_value=0.1, max_value=0.2, verbose=1, imputation_order='random', random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) assert_allclose(np.max(Xt[X == 0]), 0.2) assert_allclose(Xt[X != 0], X[X != 0])
def test_iterative_imputer_zero_iters(): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() missing_flag = X == 0 X[missing_flag] = np.nan imputer = IterativeImputer(max_iter=0) X_imputed = imputer.fit_transform(X) # with max_iter=0, only initial imputation is performed assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) # repeat but force n_iter_ to 0 imputer = IterativeImputer(max_iter=5).fit(X) # transformed should not be equal to initial imputation assert not np.all(imputer.transform(X) == imputer.initial_imputer_.transform(X)) imputer.n_iter_ = 0 # now they should be equal as only initial imputation is done assert_allclose(imputer.transform(X), imputer.initial_imputer_.transform(X))
def test_iterative_imputer_estimators(estimator): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer(missing_values=0, max_iter=1, estimator=estimator, random_state=rng) imputer.fit_transform(X) # check that types are correct for estimators hashes = [] for triplet in imputer.imputation_sequence_: expected_type = (type(estimator) if estimator is not None else type(BayesianRidge())) assert isinstance(triplet.estimator, expected_type) hashes.append(id(triplet.estimator)) # check that each estimator is unique assert len(set(hashes)) == len(hashes)