def get_results_multiple_imputation_approach2(X_train, X_test, y_train, y_test):
    """Multiple imputation, approach 2: average m prediction vectors, then score.

    For each of m seeds, impute (fit on train, apply to test), standardize,
    fit a linear regression, and predict on the test set.  The m prediction
    vectors are averaged and a single test MSE is returned.
    """
    n_rounds = 5
    collected_predictions = []
    for seed in range(n_rounds):
        # Fit the imputer on the training data only; the test split is
        # transformed with the parameters learned on the train split.
        round_imputer = ChainedImputer(n_burn_in=99, n_imputations=1,
                                       random_state=seed)
        train_imputed = round_imputer.fit_transform(X_train)
        test_imputed = round_imputer.transform(X_test)

        # Standardize after imputation, again fitting on train only.
        round_scaler = StandardScaler()
        train_scaled = round_scaler.fit_transform(train_imputed)
        test_scaled = round_scaler.transform(test_imputed)

        # Fit this round's estimator and keep its test predictions.
        model = LinearRegression()
        model.fit(train_scaled, y_train)
        collected_predictions.append(model.predict(test_scaled))

    # Pool by averaging the m prediction vectors, then score once.
    pooled_prediction = np.mean(collected_predictions, axis=0)
    return mse(y_test, pooled_prediction)
def test_chained_imputer_additive_matrix():
    """ChainedImputer should recover entries of an additively-structured matrix."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    A = rng.randn(n_samples, n_features)
    B = rng.randn(n_samples, n_features)

    # Each column is a sum of (A[:, i] + B[:, j]) / 2 terms, so the columns
    # are strongly inter-correlated and therefore imputable.
    X_filled = np.zeros(A.shape)
    for i in range(n_features):
        for j in range(n_features):
            X_filled[:, (i + j) % n_features] += (A[:, i] + B[:, j]) / 2

    # Knock out a random quarter of the entries.
    nan_mask = rng.rand(n_samples, n_features) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # First half trains the imputer; the second half is held out.
    half = n_samples // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    imputer = ChainedImputer(n_imputations=25,
                             n_burn_in=10,
                             verbose=True,
                             random_state=rng).fit(X_train)
    assert_allclose(X_test_filled, imputer.transform(X_test), atol=0.01)
def test_chained_imputer_additive_matrix():
    """Check that ChainedImputer recovers held-out entries of a matrix whose
    columns have additive structure, up to a small absolute tolerance."""
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    # Each column accumulates (A[:, i] + B[:, j]) / 2 terms, giving
    # correlated columns that the chained imputer can exploit.
    for i in range(d):
        for j in range(d):
            X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan
    # split up data: first half trains the imputer, second half is held out
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]
    imputer = ChainedImputer(n_imputations=25,
                             n_burn_in=10,
                             verbose=True,
                             random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.01)
def get_results_mice_imputation_includingy(X_incomplete, y):
    """MICE-style multiple imputation that also uses y during imputation.

    y is appended to X as an extra column so the chained imputer can use the
    outcome when filling in the features; only the imputed X part is kept
    afterwards, so y is never used to predict itself in the regression step.
    The m regression fits are then pooled with Rubin's rules.

    Returns (Qbar, T, errorbar): pooled coefficient estimates, their total
    variance, and the 1.96 * standard-error half-width.
    """
    m = 5
    # The stacked (X, y) matrix is loop-invariant, so build it once instead
    # of re-stacking it on every imputation round.
    Xy = np.column_stack((X_incomplete, y))
    multiple_imputations = []
    for i in range(m):
        imputer = ChainedImputer(n_burn_in=99, n_imputations=1,
                                 random_state=i)
        imputer.fit(Xy)
        data_imputed = imputer.transform(Xy)
        # We save only the X imputed data because we do not want to use y to
        # predict y later on.
        multiple_imputations.append(data_imputed[:, :-1])

    # Perform linear regression on each of the m imputed datasets and
    # collect the beta estimates and their variances.
    m_coefs = []
    m_vars = []
    for i in range(m):
        estimator = LinearRegression()
        estimator.fit(multiple_imputations[i], y)
        y_predict = estimator.predict(multiple_imputations[i])
        m_coefs.append(estimator.coef_)
        m_vars.append(
            calculate_variance_of_beta_estimates(y, y_predict,
                                                 multiple_imputations[i]))

    # Calculate the end estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)
    return Qbar, T, mice_errorbar
def get_results_mice_imputation(X_incomplete, y):
    """MICE-style multiple imputation pooled with Rubin's rules.

    Runs the chained imputer m times with different seeds (99 burn-in rounds,
    keeping only the last imputation), fits a linear regression on each
    completed dataset, and pools the coefficient estimates.

    Returns (Qbar, T, errorbar): pooled coefficients, total variance, and
    the 1.96 * standard-error half-width.
    """
    m = 5
    completed_datasets = []
    for seed in range(m):
        imputer = ChainedImputer(n_burn_in=99, n_imputations=1,
                                 random_state=seed)
        imputer.fit(X_incomplete)
        completed_datasets.append(imputer.transform(X_incomplete))

    # One regression per completed dataset; record each model's coefficient
    # estimates together with their variances.
    m_coefs = []
    m_vars = []
    for X_completed in completed_datasets:
        estimator = LinearRegression()
        estimator.fit(X_completed, y)
        fitted_values = estimator.predict(X_completed)
        m_coefs.append(estimator.coef_)
        m_vars.append(
            calculate_variance_of_beta_estimates(y, fitted_values,
                                                 X_completed))

    # Pool the m sets of estimates with Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)
    return Qbar, T, mice_errorbar
def test_chained_imputer_transform_stochasticity():
    """Two transforms of the same data should produce different draws."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             random_state=rng)
    imputer.fit(X)

    first_draw = imputer.transform(X)
    second_draw = imputer.transform(X)

    # Comparing the means is enough to show the two draws differ.
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))
def test_chained_imputer_transform_stochasticity():
    """Transform is stochastic: repeated transforms of the same data differ."""
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    # Sparse random matrix; zeros act as the missing-value marker below.
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             random_state=rng)
    imputer.fit(X)
    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)
    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    """Single-imputation baseline: impute once, standardize, fit, score.

    The imputer and scaler are fit on the training split only and then
    applied to the test split; returns the test-set MSE.
    """
    # One chained imputation (99 burn-in rounds, keep the final draw).
    imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0)
    train_imputed = imputer.fit_transform(X_train)
    test_imputed = imputer.transform(X_test)

    # Standardize with statistics learned on the training split.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_imputed)
    test_scaled = scaler.transform(test_imputed)

    # Fit the linear model and score the held-out predictions.
    estimator = LinearRegression()
    estimator.fit(train_scaled, y_train)
    return mse(y_test, estimator.predict(test_scaled))
def get_results_chained_imputation(X_incomplete, y):
    """Single chained imputation followed by a linear regression fit.

    Imputes X once with the chained imputer (99 burn-in rounds, keeping only
    the last imputation), fits a linear regression on the completed data,
    and returns the coefficient estimates, their variances, and the
    1.96 * standard-error half-width of the confidence interval.
    """
    # Seed the imputer so the single imputation is reproducible; every other
    # helper in this file sets random_state, and this one previously did not.
    imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0)
    imputer.fit(X_incomplete)
    X_imputed = imputer.transform(X_incomplete)

    # Perform linear regression on chained single imputed data.
    estimator = LinearRegression()
    estimator.fit(X_imputed, y)
    y_predict = estimator.predict(X_imputed)

    # Save the beta estimates, the variance of these estimates and 1.96 *
    # standard error of the estimates.
    chained_coefs = estimator.coef_
    chained_vars = calculate_variance_of_beta_estimates(
        y, y_predict, X_imputed)
    chained_errorbar = 1.96 * np.sqrt(chained_vars)
    return chained_coefs, chained_vars, chained_errorbar
def test_chained_imputer_missing_at_transform(strategy):
    """Features complete at fit time use only the initial imputer at transform."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X_train = rng.randint(low=0, high=3, size=(n_samples, n_features))
    X_test = rng.randint(low=0, high=3, size=(n_samples, n_features))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    chained = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             initial_strategy=strategy,
                             random_state=rng).fit(X_train)
    simple = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train)

    # With no missing values in column 0 at fit time, the chained imputer
    # should defer entirely to the initial (simple) imputer there.
    assert np.all(chained.transform(X_test)[:, 0] ==
                  simple.transform(X_test)[:, 0])
def test_chained_imputer_transform_recovery(rank):
    """ChainedImputer should approximately recover a low-rank matrix."""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 100
    left_factor = rng.rand(n_samples, rank)
    right_factor = rng.rand(rank, n_features)
    X_filled = np.dot(left_factor, right_factor)

    # Remove half the entries at random.
    nan_mask = rng.rand(n_samples, n_features) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # Train on the first half; evaluate recovery on the held-out second half.
    half = n_samples // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    imputer = ChainedImputer(n_imputations=10,
                             n_burn_in=10,
                             verbose=True,
                             random_state=rng).fit(X_train)
    assert_allclose(X_test_filled, imputer.transform(X_test),
                    rtol=1e-5, atol=0.1)
def test_chained_imputer_transform_recovery(rank):
    """ChainedImputer should recover missing entries of a rank-`rank` matrix
    built as the product of two random factors."""
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    # half is randomly missing
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan
    # split up data in half: train the imputer on one, evaluate on the other
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]
    imputer = ChainedImputer(n_imputations=10,
                             n_burn_in=10,
                             verbose=True,
                             random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
def test_chained_imputer_missing_at_transform(strategy):
    """If a feature had no missing values at fit time, transform should fall
    back to the initial (SimpleImputer) strategy for that feature."""
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))
    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column
    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             initial_strategy=strategy,
                             random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)
    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert np.all(
        imputer.transform(X_test)[:, 0] ==
        initial_imputer.transform(X_test)[:, 0])