def test_mice_imputation_order():
    """Check the feature order recorded for each ``imputation_order``.

    Zeros are treated as missing (``missing_values=0``).  Column 0 is
    overwritten with ones, so it holds no missing entries; the expected
    'roman' and 'arabic' orders below therefore only cover columns
    ``1 .. n_features - 1``.  'monotone' and 'revmonotone' are only run,
    not asserted on.
    """
    n_samples = 100
    n_features = 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()
    X[:, 0] = 1  # no zeros here, so this column is never a target

    # The test treats the first ``n_features - 1`` recorded steps as round
    # one and whatever follows as round two.
    round_len = n_features - 1
    orders = ['random', 'roman', 'monotone', 'revmonotone', 'arabic']

    for imputation_order in orders:
        imputer = MICEImputer(missing_values=0,
                              n_imputations=1,
                              n_burn_in=1,
                              n_nearest_features=5,
                              min_value=0,
                              max_value=1,
                              verbose=False,
                              imputation_order=imputation_order)
        imputer.fit_transform(X)

        visited = [step.feat_idx for step in imputer.imputation_sequence_]
        first_round = visited[:round_len]

        if imputation_order == 'random':
            # Two consecutive rounds should not share the same permutation.
            assert first_round != visited[round_len:]
        elif imputation_order == 'roman':
            # Left to right.
            assert np.all(first_round == np.arange(1, n_features))
        elif imputation_order == 'arabic':
            # Right to left.
            assert np.all(first_round == np.arange(n_features - 1, 0, -1))
def test_mice_transform_correctness():
    """Check that a fitted MICEImputer recovers held-out low-rank data.

    A rank-``rank`` matrix is generated, half of its entries are replaced
    by NaN, and the rows are split in two halves.  The imputer is fitted on
    the first half and then used to ``transform`` the second half; the
    result is compared with the complete second half to one decimal place.
    """

    def make_data(rank):
        # Build a (100, 100) matrix of the requested rank.
        n = 100
        d = 100
        A = np.random.random((n, rank))
        B = np.random.random((rank, d))
        Xfilled = np.dot(A, B)

        # half is randomly missing
        nan_mask = np.random.random((n, d)) < 0.5
        X_missing = Xfilled.copy()
        X_missing[nan_mask] = np.nan

        # split up data: first half for fitting, second half held out
        n_train = n // 2
        Xtr = X_missing[:n_train]
        Xts_filled = Xfilled[n_train:]
        Xts = X_missing[n_train:]
        return Xtr, Xts_filled, Xts

    for rank in [5, 10]:
        Xtr, Xts_filled, Xts = make_data(rank)
        imputer = MICEImputer(n_imputations=10,
                              n_burn_in=10,
                              verbose=True).fit(Xtr)
        # Use transform, not fit_transform: refitting on the held-out half
        # would discard the model fitted on Xtr and leave transform
        # untested.
        Xts_est = imputer.transform(Xts)
        assert_array_almost_equal(Xts_filled, Xts_est, decimal=1)
def test_mice_rank_one():
    """Check that MICEImputer recovers a rank-one matrix.

    The matrix is the product of a random column and a random row vector;
    entries selected by a random mask (threshold 0.5) are replaced by NaN
    and the imputed result must match the original to two decimal places.
    """
    size = 100
    column = np.random.random((size, 1))
    row = np.random.random((1, size))
    X = column.dot(row)

    hidden = np.random.random((size, size)) < 0.5
    X_missing = np.where(hidden, np.nan, X)

    imputer = MICEImputer(n_imputations=5, n_burn_in=5, verbose=True)
    X_filled = imputer.fit_transform(X_missing)

    assert_array_almost_equal(X_filled, X, decimal=2)
def test_imputation_shape():
    """Verify the shape of the imputed matrix for every strategy.

    Each imputer is run on a CSR copy of the data first and then on the
    dense array; both outputs must keep the input shape ``(10, 2)``.
    """
    expected_shape = (10, 2)
    X = np.random.randn(*expected_shape)
    X[::2] = np.nan  # every other row is entirely missing

    for strategy in ['mean', 'median', 'most_frequent', 'mice']:
        imputer = (MICEImputer() if strategy == 'mice'
                   else Imputer(strategy=strategy))

        # Same imputer instance is refitted on the sparse, then dense input.
        for data in (sparse.csr_matrix(X), X):
            X_imputed = imputer.fit_transform(data)
            assert_equal(X_imputed.shape, expected_shape)
def test_mice_predictors():
    """Smoke-test MICEImputer with different ``predictor`` estimators.

    Only checks that ``fit_transform`` runs; no output is asserted.
    """
    from sklearn.dummy import DummyRegressor
    from sklearn.linear_model import BayesianRidge

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()

    for predictor_cls in (DummyRegressor, BayesianRidge):
        # Zeros in X are the missing entries.
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           predictor=predictor_cls())
        mice.fit_transform(X)
def test_mice_missing_at_transform():
    """Check the fallback for features that are only missing at transform.

    Column 0 has no missing value (0) in the training data but does have
    one in the test data.  For that column the MICE output must equal the
    output of a plain ``Imputer`` using the same initial strategy.
    """
    shape = (100, 10)
    Xtr = np.random.randint(low=0, high=3, size=shape)
    Xts = np.random.randint(low=0, high=3, size=shape)
    Xtr[:, 0] = 1  # definitely no missing values in 0th column
    Xts[0, 0] = 0  # definitely missing value in 0th column

    for strategy in ["mean", "median", "most_frequent"]:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           initial_strategy=strategy).fit(Xtr)
        initial_imputer = Imputer(missing_values=0,
                                  strategy=strategy).fit(Xtr)

        # if there were no missing values at time of fit, then mice will
        # only use the initial imputer for that feature at transform
        mice_col = mice.transform(Xts)[:, 0]
        initial_col = initial_imputer.transform(Xts)[:, 0]
        assert np.all(mice_col == initial_col)
def test_mice_additive_matrix():
    """Check that a fitted MICEImputer recovers held-out additive data.

    Every column of the complete matrix is a sum of halved columns of two
    random Gaussian matrices.  A quarter of the entries are replaced by
    NaN, the imputer is fitted on the first half of the rows and then used
    to ``transform`` the second half, which must match the complete data
    to one decimal place.
    """
    n = 100
    d = 10
    A = np.random.randn(n, d)
    B = np.random.randn(n, d)
    Xfilled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            Xfilled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2

    # a quarter is randomly missing
    nan_mask = np.random.random((n, d)) < 0.25
    X_missing = Xfilled.copy()
    X_missing[nan_mask] = np.nan

    # split up data: first half for fitting, second half held out
    n_train = n // 2
    Xtr = X_missing[:n_train]
    Xts_filled = Xfilled[n_train:]
    Xts = X_missing[n_train:]

    imputer = MICEImputer(n_imputations=10,
                          n_burn_in=10,
                          verbose=True).fit(Xtr)
    # Use transform, not fit_transform: refitting on the held-out half
    # would discard the model fitted on Xtr and leave transform untested.
    Xts_est = imputer.transform(Xts)
    assert_array_almost_equal(Xts_filled, Xts_est, decimal=1)
def test_mice_pipeline_grid_search():
    """Test imputation within a pipeline + gridsearch.

    Only checks that the grid search over ``initial_strategy`` can be
    fitted; no score is asserted.
    """
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=0)
    regressor = tree.DecisionTreeRegressor(random_state=0)
    pipeline = Pipeline([('imputer', imputer), ('tree', regressor)])

    param_grid = {
        'imputer__initial_strategy': ["mean", "median", "most_frequent"],
    }

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.50).toarray()
    Y = np.random.random((n_samples, n_features))

    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_imputation_pickle():
    """Test that imputers give the same transform after a pickle round trip.

    Each imputer is fitted with zeros as the missing value, pickled and
    unpickled, and the outputs of the original and restored objects are
    compared on copies of the same data.
    """
    import pickle

    n = 100
    # toarray() yields a plain ndarray, like the other tests in this file;
    # todense() would yield np.matrix, which NumPy no longer recommends.
    X = sparse_random_matrix(n, n, density=0.10).toarray()

    for strategy in ["mean", "median", "most_frequent", "mice"]:
        if strategy == 'mice':
            imputer = MICEImputer(missing_values=0,
                                  n_imputations=1,
                                  n_burn_in=1)
        else:
            imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        # The payload is produced right here, so loading it is safe.
        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
                    "(strategy = %s)" % (strategy))