Example #1
0
def test_mice_imputation_order():
    """Check the feature visiting order produced by each ``imputation_order``.

    The order is read back from ``imputer.imputation_sequence_`` (the
    ``feat_idx`` of every step).  Only 'roman', 'arabic' and 'random' are
    asserted on; 'monotone' and 'revmonotone' are merely run to make sure
    they do not raise.
    """
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()
    # Column 0 holds no zeros, so with missing_values=0 it has nothing to
    # impute; the expected sequences below therefore start at feature 1.
    X[:, 0] = 1

    # Number of features visited per round (every feature except column 0).
    round_len = n_features - 1

    # Expected first-round sequence for the deterministic orderings.
    expected_first_round = {
        'roman': np.arange(1, n_features),
        'arabic': np.arange(n_features - 1, 0, -1),
    }

    for order in ('random', 'roman', 'monotone', 'revmonotone', 'arabic'):
        imputer = MICEImputer(
            missing_values=0,
            n_imputations=1,
            n_burn_in=1,
            n_nearest_features=5,
            min_value=0,
            max_value=1,
            verbose=False,
            imputation_order=order,
        )
        imputer.fit_transform(X)
        visited = [step.feat_idx for step in imputer.imputation_sequence_]

        if order in expected_first_round:
            assert np.all(visited[:round_len] == expected_first_round[order])
        elif order == 'random':
            # Two consecutive rounds should not share the same permutation.
            assert visited[:round_len] != visited[round_len:]
Example #2
0
def test_mice_transform_correctness():
    """Check that a MICEImputer fitted on one half of a low-rank matrix
    recovers the missing entries of the other half via ``transform``.

    For each rank a ``100 x 100`` matrix ``A.dot(B)`` is generated, about
    half of its entries are replaced by NaN, and the rows are split into a
    training half and a test half.
    """
    def make_data(rank):
        """Return ``(Xtr_filled, Xtr, Xts_filled, Xts)`` for the given rank.

        The ``*_filled`` arrays are the complete ground truth; the other two
        are the same rows with roughly half of the entries set to NaN.
        """
        n = 100
        d = 100
        A = np.random.random((n, rank))
        B = np.random.random((rank, d))
        Xfilled = np.dot(A, B)
        # half is randomly missing
        nan_mask = np.random.random((n, d)) < 0.5
        X_missing = Xfilled.copy()
        X_missing[nan_mask] = np.nan

        # split up data: first half for fitting, second half for evaluation
        half = n // 2
        return (Xfilled[:half], X_missing[:half],
                Xfilled[half:], X_missing[half:])

    for rank in [5, 10]:
        # The filled training half is not needed by the assertion below.
        _, Xtr, Xts_filled, Xts = make_data(rank)
        imputer = MICEImputer(n_imputations=10, n_burn_in=10,
                              verbose=True).fit(Xtr)
        # Use transform(), not fit_transform(): refitting on Xts would throw
        # away the model learned from Xtr and leave transform() untested.
        # NOTE(review): the decimal=1 tolerance was previously checked
        # against a refit on Xts -- confirm it still holds for transform().
        Xts_est = imputer.transform(Xts)
        assert_array_almost_equal(Xts_filled, Xts_est, decimal=1)
Example #3
0
def test_mice_rank_one():
    """Check that MICEImputer reconstructs a rank-one matrix.

    A ``100 x 100`` outer product is built, roughly half of its entries are
    replaced by NaN, and the imputed result is compared with the original
    matrix to two decimals.
    """
    size = 100
    column = np.random.random((size, 1))
    row = np.random.random((1, size))
    X_true = column.dot(row)

    # Hide about half of the entries behind NaN.
    hidden = np.random.random((size, size)) < 0.5
    X_with_nans = np.where(hidden, np.nan, X_true)

    mice = MICEImputer(n_imputations=5, n_burn_in=5, verbose=True)
    X_recovered = mice.fit_transform(X_with_nans)
    assert_array_almost_equal(X_recovered, X_true, decimal=2)
Example #4
0
def test_imputation_shape():
    """Verify that imputation keeps the input shape for every strategy.

    ``Imputer`` is checked on both a CSR sparse matrix and a dense array;
    ``MICEImputer`` is only checked on the dense array.
    """
    expected_shape = (10, 2)
    X = np.random.randn(*expected_shape)
    X[::2] = np.nan  # every other row is entirely missing

    for strategy in ('mean', 'median', 'most_frequent', 'mice'):
        if strategy != 'mice':
            imputer = Imputer(strategy=strategy)
            # Sparse input is exercised for the simple strategies only.
            sparse_result = imputer.fit_transform(sparse.csr_matrix(X))
            assert_equal(sparse_result.shape, expected_shape)
        else:
            imputer = MICEImputer()

        dense_result = imputer.fit_transform(X)
        assert_equal(dense_result.shape, expected_shape)
Example #5
0
def test_mice_predictors():
    """Smoke-test MICEImputer with different ``predictor`` estimators.

    Nothing is asserted about the output; the test only checks that
    ``fit_transform`` runs with a ``DummyRegressor`` and a ``BayesianRidge``.
    """
    from sklearn.dummy import DummyRegressor
    from sklearn.linear_model import BayesianRidge

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()

    for predictor_cls in (DummyRegressor, BayesianRidge):
        # Zeros in X are the entries treated as missing.
        mice = MICEImputer(
            missing_values=0,
            n_imputations=1,
            n_burn_in=1,
            predictor=predictor_cls(),
        )
        mice.fit_transform(X)
Example #6
0
def test_mice_missing_at_transform():
    """Check the fallback for a feature that is only missing at transform.

    Column 0 has no missing values (zeros) when fitting but does contain one
    at transform time.  For that column the MICEImputer output is expected
    to equal the output of a plain ``Imputer`` using the same strategy.
    """
    shape = (100, 10)
    X_train = np.random.randint(low=0, high=3, size=shape)
    X_test = np.random.randint(low=0, high=3, size=shape)

    X_train[:, 0] = 1  # column 0 is fully observed while fitting
    X_test[0, 0] = 0   # ...but has a missing entry at transform time

    for strategy in ("mean", "median", "most_frequent"):
        mice = MICEImputer(
            missing_values=0,
            n_imputations=1,
            n_burn_in=1,
            initial_strategy=strategy,
        ).fit(X_train)
        baseline = Imputer(missing_values=0, strategy=strategy).fit(X_train)

        # With nothing missing at fit time, mice should rely solely on its
        # initial imputer for this feature.
        mice_column = mice.transform(X_test)[:, 0]
        baseline_column = baseline.transform(X_test)[:, 0]
        assert np.all(mice_column == baseline_column)
Example #7
0
def test_mice_additive_matrix():
    """Check that a MICEImputer fitted on one half of an additive matrix
    recovers the missing entries of the other half via ``transform``.

    Each column of the ground truth is a sum of averaged column pairs taken
    from two random Gaussian matrices.  About a quarter of the entries are
    replaced by NaN before the rows are split into a training half and a
    test half.
    """
    n = 100
    d = 10
    A = np.random.randn(n, d)
    B = np.random.randn(n, d)
    Xfilled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            # Pair (i, j) contributes to the column selected by (i + j) % d.
            Xfilled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = np.random.random((n, d)) < 0.25
    X_missing = Xfilled.copy()
    X_missing[nan_mask] = np.nan

    # split up data: first half for fitting, second half for evaluation
    half = n // 2
    Xtr = X_missing[:half]
    Xts_filled = Xfilled[half:]
    Xts = X_missing[half:]

    imputer = MICEImputer(n_imputations=10, n_burn_in=10,
                          verbose=True).fit(Xtr)
    # Use transform(), not fit_transform(): refitting on Xts would throw away
    # the model learned from Xtr and leave transform() untested.
    # NOTE(review): the decimal=1 tolerance was previously checked against a
    # refit on Xts -- confirm it still holds for transform().
    Xts_est = imputer.transform(Xts)
    assert_array_almost_equal(Xts_filled, Xts_est, decimal=1)
Example #8
0
def test_mice_pipeline_grid_search():
    """Smoke-test MICEImputer inside a Pipeline driven by GridSearchCV.

    The grid varies the imputer's ``initial_strategy``; the test only checks
    that fitting the search completes.
    """
    n_samples, n_features = 100, 10

    imputer_step = MICEImputer(missing_values=0,
                               n_imputations=1,
                               n_burn_in=1,
                               random_state=0)
    regressor_step = tree.DecisionTreeRegressor(random_state=0)
    pipeline = Pipeline([('imputer', imputer_step),
                         ('tree', regressor_step)])

    param_grid = {
        'imputer__initial_strategy': ["mean", "median", "most_frequent"]
    }

    # Zeros in X are the entries treated as missing by the imputer.
    X = sparse_random_matrix(n_samples, n_features, density=0.50).toarray()
    Y = np.random.random((n_samples, n_features))

    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
Example #9
0
def test_imputation_pickle():
    """Check that fitted imputers give the same output after pickling.

    Each imputer is fitted, round-tripped through ``pickle``, and the
    transform of the restored copy is compared with that of the original.
    """
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10).todense()

    def build_imputer(strategy):
        """Return an unfitted imputer for ``strategy`` (zeros are missing)."""
        if strategy == 'mice':
            return MICEImputer(missing_values=0,
                               n_imputations=1,
                               n_burn_in=1)
        return Imputer(missing_values=0, strategy=strategy)

    for strategy in ("mean", "median", "most_frequent", "mice"):
        imputer = build_imputer(strategy)
        imputer.fit(X)

        # Only objects created in this test are unpickled here.
        restored = pickle.loads(pickle.dumps(imputer))

        # Fresh copies keep the two transform calls from sharing a buffer.
        from_original = imputer.transform(X.copy())
        from_restored = restored.transform(X.copy())
        assert_array_almost_equal(
            from_original,
            from_restored,
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))