Ejemplo n.º 1
0
def test_mice_no_missing():
    """Observed data passes through untouched when nothing needs imputing.

    The only NaNs live in column 0, which is entirely NaN; the test expects
    that column to be absent from the output and every other column to be
    returned as-is, both via ``fit(...).transform(...)`` and via
    ``fit_transform(...)``.
    """
    random_state = np.random.RandomState(0)
    data = random_state.rand(100, 100)
    data[:, 0] = np.nan

    # Both imputers deliberately receive the very same RandomState instance.
    two_step_imputer = MICEImputer(n_imputations=10,
                                   random_state=random_state)
    one_step_imputer = MICEImputer(n_imputations=10,
                                   random_state=random_state)

    two_step_output = two_step_imputer.fit(data).transform(data)
    one_step_output = one_step_imputer.fit_transform(data)

    # The all-NaN first column is expected to be left out of the result.
    assert_allclose(data[:, 1:], two_step_output)
    # Fitting then transforming must match the combined fit_transform call.
    assert_allclose(two_step_output, one_step_output)
Ejemplo n.º 2
0
def test_mice_additive_matrix():
    """Imputation on held-out rows of an additively structured matrix.

    A complete matrix is built by accumulating averaged column pairs of two
    random matrices, roughly a quarter of its entries are masked with NaN,
    the imputer is fitted on the first half of the rows and the imputed
    second half is compared against the complete values.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 10
    left = rng.randn(n_samples, n_features)
    right = rng.randn(n_samples, n_features)

    # Accumulate every (left column, right column) pair into a target column.
    complete = np.zeros(left.shape)
    for left_col in range(n_features):
        for right_col in range(n_features):
            target_col = (left_col + right_col) % n_features
            complete[:, target_col] += (left[:, left_col]
                                        + right[:, right_col]) / 2

    # Mask roughly a quarter of the entries at random.
    missing_mask = rng.rand(n_samples, n_features) < 0.25
    with_gaps = complete.copy()
    with_gaps[missing_mask] = np.nan

    # First half of the rows is for fitting, second half for evaluation.
    half = n_samples // 2
    X_train = with_gaps[:half]
    X_test = with_gaps[half:]
    X_test_truth = complete[half:]

    imputer = MICEImputer(n_imputations=25,
                          n_burn_in=10,
                          verbose=True,
                          random_state=rng)
    imputer = imputer.fit(X_train)
    X_test_imputed = imputer.transform(X_test)
    assert_allclose(X_test_truth, X_test_imputed, atol=0.01)
Ejemplo n.º 3
0
def test_mice_imputation_order(imputation_order):
    """Check the feature visiting order produced by ``imputation_order``.

    Zeros are treated as missing. Column 0 is overwritten with ones so it has
    no missing entries, and the assertions expect only the remaining
    ``n_features - 1`` columns to show up in each imputation round.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by MICEImputer

    imputer = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        n_nearest_features=5,
        min_value=0,
        max_value=1,
        verbose=False,
        imputation_order=imputation_order,
        random_state=rng,
    )
    imputer.fit_transform(X)

    visited = [step.feat_idx for step in imputer.imputation_sequence_]
    per_round = n_features - 1
    first_round = visited[:per_round]

    if imputation_order == 'roman':
        # Left to right over the columns that have missing values.
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        # Right to left over the columns that have missing values.
        assert np.all(first_round == np.arange(n_features - 1, 0, -1))
    elif imputation_order == 'random':
        # Two consecutive rounds should not visit features identically.
        later_rounds = visited[per_round:]
        assert first_round != later_rounds
    elif 'ending' in imputation_order:
        # One burn-in round plus one imputation round.
        assert len(visited) == 2 * per_round
Ejemplo n.º 4
0
def get_results(dataset):
    """Score a random forest under several missing-value treatments.

    Four settings are evaluated with ``cross_val_score`` using the
    ``'neg_mean_squared_error'`` scorer:

    1. the full data, without any missing values;
    2. the data with artificial missing entries encoded (and left) as 0;
    3. the same data after ``SimpleImputer`` mean imputation;
    4. the same data after ``MICEImputer`` imputation.

    Parameters
    ----------
    dataset : object
        Must expose ``data`` (2-D feature array) and ``target`` attributes.

    Returns
    -------
    tuple
        Four ``(mean, std)`` pairs of the cross-validation scores, in the
        order: full data, zero imputation, mean imputation, MICE imputation.

    Notes
    -----
    Relies on a module-level ``rng`` object providing ``shuffle`` and
    ``randint`` to choose which entries are knocked out.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples, n_features = X_full.shape[0], X_full.shape[1]
    scoring = 'neg_mean_squared_error'

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full, scoring=scoring)

    # Add missing values in 75% of the rows: one feature per affected row.
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    missing_samples = np.hstack(
        (np.zeros(n_samples - n_missing_samples, dtype=bool),
         np.ones(n_missing_samples, dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Missing entries are encoded as 0; the same corrupted copy is shared by
    # every strategy below, so it is built only once.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring=scoring)

    # Estimate the score after imputation (mean strategy) of the missing values
    estimator = Pipeline([
        ("imputer", SimpleImputer(missing_values=0, strategy="mean")),
        ("forest", RandomForestRegressor(random_state=0, n_estimators=100)),
    ])
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring=scoring)

    # Estimate the score after imputation (MICE strategy) of the missing values
    estimator = Pipeline([
        ("imputer", MICEImputer(missing_values=0, random_state=0)),
        ("forest", RandomForestRegressor(random_state=0, n_estimators=100)),
    ])
    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring=scoring)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (mice_impute_scores.mean(), mice_impute_scores.std()))
Ejemplo n.º 5
0
def test_imputation_shape():
    """Imputed output keeps the (10, 2) input shape for every strategy."""
    X = np.random.randn(10, 2)
    X[::2] = np.nan
    expected_shape = (10, 2)

    for strategy in ('mean', 'median', 'most_frequent', "constant"):
        # SimpleImputer is exercised on sparse input first, then dense input.
        simple_imputer = SimpleImputer(strategy=strategy)
        sparse_result = simple_imputer.fit_transform(sparse.csr_matrix(X))
        assert sparse_result.shape == expected_shape
        dense_result = simple_imputer.fit_transform(X)
        assert dense_result.shape == expected_shape

        # MICEImputer is exercised on dense input with the same strategy
        # used for its initial fill.
        mice_result = MICEImputer(initial_strategy=strategy).fit_transform(X)
        assert mice_result.shape == expected_shape
Ejemplo n.º 6
0
def test_mice_rank_one():
    """A rank-one matrix with about half its entries missing is recovered."""
    rng = np.random.RandomState(0)
    size = 100
    column_factor = rng.rand(size, 1)
    row_factor = rng.rand(1, size)
    # Outer product of the two factors: a size x size rank-one matrix.
    X = np.dot(column_factor, row_factor)

    hidden = rng.rand(size, size) < 0.5
    X_missing = X.copy()
    X_missing[hidden] = np.nan

    imputer = MICEImputer(
        n_imputations=5,
        n_burn_in=5,
        verbose=True,
        random_state=rng,
    )
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.001)
Ejemplo n.º 7
0
def make_impute_pipeline():
    """Assemble and return the imputation ``FeatureUnion``.

    The union has two branches:

    * ``'impute_pipeline'``: numerical columns (quantile-transformed) joined
      with one-hot encoded categorical columns, passed through
      ``MICEImputer``, then through ``inverse_func`` on the leading
      ``len(NUMERICAL_COLS)`` columns, followed by a ``ColumnSelector`` over
      that same index range.
    * ``'categorical_pre'``: the categorical selector followed by
      ``CustomImputer(strategy='mode')``.

    The same ``categorical_pre`` object and the same ``QuantileTransformer``
    object are reused in several places, exactly as wired below.
    """
    numeric_positions = range(len(NUMERICAL_COLS))

    # One transformer instance serves both the forward scaling step and the
    # kw_args handed to ``inverse_func``.
    quantile_scaler = QuantileTransformer(output_distribution='normal')

    categorical_pre = Pipeline([
        ('selector', DataFrameSelector(CATEGORICAL_COLS)),
        ('impute', CustomImputer(strategy='mode')),
    ])
    encoded_categoricals = Pipeline([
        ('categorical_pre', categorical_pre),
        ('encoder', CategoricalEncoder(encoding='onehot-dense')),
    ])
    scaled_numericals = Pipeline([
        ('selector', DataFrameSelector(NUMERICAL_COLS)),
        ('scale', quantile_scaler),
    ])

    # Numerical columns come first, so they occupy the leading positions.
    mice_stage = Pipeline([
        ('combined_features', FeatureUnion([
            ('numerical_pipeline', scaled_numericals),
            ('cat_ordinal_pipeline', encoded_categoricals),
        ])),
        ('mice_impute', MICEImputer(verbose=True)),
    ])

    undo_scaling = SelectiveAction(
        col=list(numeric_positions),
        action=FunctionTransformer(
            inverse_func,
            kw_args={'transformer': quantile_scaler}))

    impute_stage = Pipeline([
        ('mice_pipeline', mice_stage),
        ('inverse_qt', undo_scaling),
        ('numerical_selection', ColumnSelector(numeric_positions)),
    ])

    return FeatureUnion([
        ('impute_pipeline', impute_stage),
        ('categorical_pre', categorical_pre),
    ])
Ejemplo n.º 8
0
def test_mice_transform_stochasticity():
    """Two ``transform`` calls on one fitted imputer should give different means."""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    # Zeros are the missing-value marker here.
    imputer = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        random_state=rng,
    )
    imputer.fit(X)

    first_mean = np.mean(imputer.transform(X))
    second_mean = np.mean(imputer.transform(X))

    # sufficient to assert that the means are not the same
    assert first_mean != pytest.approx(second_mean)
Ejemplo n.º 9
0
def test_mice_clip():
    """Imputed entries span exactly ``[min_value, max_value]``; others are kept."""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    lower, upper = 0.1, 0.2
    imputer = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        min_value=lower,
        max_value=upper,
        random_state=rng,
    )
    Xt = imputer.fit_transform(X)

    # Zeros mark the entries that were treated as missing.
    was_missing = X == 0
    imputed_values = Xt[was_missing]
    assert_allclose(np.min(imputed_values), 0.1)
    assert_allclose(np.max(imputed_values), 0.2)

    # Entries that were present must come back unchanged.
    was_present = ~was_missing
    assert_allclose(Xt[was_present], X[was_present])
Ejemplo n.º 10
0
def test_mice_predictors(predictor):
    """Every imputation step holds its own, truthy predictor object."""
    rng = np.random.RandomState(0)

    n_samples = 100
    n_features = 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        predictor=predictor,
        random_state=rng,
    )
    imputer.fit_transform(X)

    # Each step must carry a (truthy) predictor; remember its identity.
    predictor_ids = []
    for step in imputer.imputation_sequence_:
        assert step.predictor
        predictor_ids.append(id(step.predictor))

    # No predictor object may be shared between two steps.
    assert len(predictor_ids) == len(set(predictor_ids))
Ejemplo n.º 11
0
def test_mice_missing_at_transform(strategy):
    """A feature complete at fit time is filled like ``SimpleImputer`` does.

    Column 0 has no missing entries (zeros) in the training data but does
    have one in the test data; the MICE output for that column is compared
    with the output of a ``SimpleImputer`` using the same strategy.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 10
    shape = (n_samples, n_features)
    X_train = rng.randint(low=0, high=3, size=shape)
    X_test = rng.randint(low=0, high=3, size=shape)

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    mice = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        initial_strategy=strategy,
        random_state=rng,
    ).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0, strategy=strategy)
    initial_imputer = initial_imputer.fit(X_train)

    # if there were no missing values at time of fit, then mice will
    # only use the initial imputer for that feature at transform
    mice_first_column = mice.transform(X_test)[:, 0]
    simple_first_column = initial_imputer.transform(X_test)[:, 0]
    assert np.all(mice_first_column == simple_first_column)
Ejemplo n.º 12
0
def test_mice_transform_recovery(rank):
    """Held-out rows of a low-rank matrix are recovered at transform time.

    A ``rank``-factor product matrix has about half of its entries masked;
    the imputer is fitted on the first half of the rows and its output on
    the second half is compared against the complete values.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 100
    left_factor = rng.rand(n_samples, rank)
    right_factor = rng.rand(rank, n_features)
    complete = np.dot(left_factor, right_factor)

    # half is randomly missing
    hidden = rng.rand(n_samples, n_features) < 0.5
    with_gaps = complete.copy()
    with_gaps[hidden] = np.nan

    # split up data in half
    half = n_samples // 2
    X_train = with_gaps[:half]
    X_test = with_gaps[half:]
    X_test_truth = complete[half:]

    imputer = MICEImputer(
        n_imputations=10,
        n_burn_in=10,
        verbose=True,
        random_state=rng,
    )
    imputer = imputer.fit(X_train)
    X_test_imputed = imputer.transform(X_test)
    assert_allclose(X_test_truth, X_test_imputed, rtol=1e-5, atol=0.1)
Ejemplo n.º 13
0
    return num_init_quantile_transformer.inverse_transform(X)


# Numerical branch: select the numerical columns, then apply the shared
# ``num_init_quantile_transformer`` (defined outside this excerpt).
numerical_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(numerical_cols)),
    ('scale', num_init_quantile_transformer),
])

# Join the numerical branch with the two categorical branches. The numerical
# branch is listed first, which is why later steps address the numerical
# columns by the leading positions ``range(len(numerical_cols))``.
combined_features = FeatureUnion([
    ('numerical_pipeline', numerical_pipeline),
    ('cat_nominal_pipeline', cat_nominal_pipeline),
    ('cat_ordinal_pipeline', cat_ordinal_pipeline),
])

# Run MICE imputation on the combined feature matrix.
# NOTE(review): the 'reverse_quantile_transform' step wraps a
# ``FunctionTransformer()`` created without a function; per scikit-learn's
# documentation that is an identity transform, so this step looks like a
# placeholder despite its name -- confirm whether it is intentional.
mice_pipeline = Pipeline([
    ('combined_features', combined_features), ('mice_impute', MICEImputer()),
    ('reverse_quantile_transform',
     tf.SelectiveAction(col=list(range(len(numerical_cols))),
                        action=FunctionTransformer()))
])

feature_transform_pipeline = Pipeline([
    ('mice_pipeline', mice_pipeline),
    ('inverse_qt',
     tf.SelectiveAction(col=list(range(len(numerical_cols))),
                        action=FunctionTransformer(inverse_func))),
    ('feature_scaling',
     tf.SelectiveAction(
         col=(range(len(numerical_cols))),
         action=QuantileTransformer(output_distribution='normal'))),
    ('feature_selection', None), ('model', RandomForestClassifier())
# Report the shapes of the train and test frames before preprocessing.
print(train_use.shape)
print(test_use.shape)

# ## <a id='4'>4. Preprocessing</a>

# ### Step 4: preprocessing: impute missing, normalization, etc.
#
# Typically, in a real-world project, we need to deal with NAs and apply some transformations before modeling.
#
# Here, we only need to impute the missing values in *Age* and *Fare*. We use [MICEImputer](http://scikit-learn.org/dev/modules/generated/sklearn.impute.MICEImputer.html) from sklearn.

# In[15]:

# Impute every column of the training frame in place: the imputed array
# returned by fit_transform is written back under the original column labels.
train_use[train_use.columns.tolist()] = MICEImputer(
    initial_strategy='median',
    n_imputations=50,
    n_nearest_features=20,
    verbose=False).fit_transform(train_use)
# Same treatment for the test frame, using a separate imputer instance.
# NOTE(review): this imputer is fitted on ``test_use`` itself rather than
# reusing the one fitted on ``train_use`` -- confirm this is intended.
test_use[test_use.columns.tolist()] = MICEImputer(
    initial_strategy='median',
    n_imputations=50,
    n_nearest_features=20,
    verbose=False).fit_transform(test_use)

# ## <a id='5'>5. Final Model and Prediction</a>

# ### Train Model and Tune Parameters

# In[17]:

# Keep every column from position 2 onward as the model's feature matrix.
# NOTE(review): presumably the first two columns are an identifier and the
# target -- verify against where ``train_use`` is built.
X = train_use.iloc[:, 2:]