def test_iterative_imputer_train_test_with_low_rank_random_matrix():
    """Fit an IterativeImputer on the first 250 incomplete rows, impute the
    held-out rows, and require the missing-value MAE to stay below 0.1.

    Relies on module-level fixtures: XY (complete data), XY_incomplete
    (same data with holes), missing_mask, and reconstruction_error.
    """
    split = 250
    train_missing = XY_incomplete[:split]
    held_out_missing = XY_incomplete[split:]
    held_out_truth = XY[split:]

    model = IterativeImputer(n_iter=50, random_state=0)
    model.fit(train_missing)
    completed = model.transform(held_out_missing)

    # reconstruction_error returns (overall, missing-only) errors; only the
    # error on originally-missing entries matters here.
    _, missing_mae = reconstruction_error(
        held_out_truth,
        completed,
        missing_mask,
        name="IterativeImputer Train/Test")
    assert missing_mae < 0.1, "Error too high with IterativeImputer train/test method!"
class DFIterativeImputer(BaseEstimator, TransformerMixin):
    """Thin sklearn-compatible wrapper around IterativeImputer whose
    transform() returns a pandas DataFrame preserving the input's index
    and column labels instead of a bare ndarray."""

    def __init__(self, max_iter=10):
        # The wrapped imputer is created lazily in fit().
        self.imputer = None
        self.max_iter = max_iter

    def fit(self, X, y=None):
        """Fit the underlying IterativeImputer on X; y is ignored
        (present only for sklearn pipeline compatibility)."""
        self.imputer = IterativeImputer(max_iter=self.max_iter)
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """Impute X and re-wrap the result with X's index and columns."""
        filled = self.imputer.transform(X)
        return pd.DataFrame(filled, index=X.index, columns=X.columns)
def mice_imputation(train, test):
    """MICE-style imputation of the test set's value channel.

    Both inputs are 3-D arrays where [:, 0, :] holds values and [:, 1, :]
    holds a flag channel; entries whose flag equals 1.0 are treated as
    missing (assumed layout -- TODO confirm with callers).

    Returns the imputed test value matrix (test[:, 0, :] with NaNs filled),
    fitted on the masked training value matrix.
    """
    masked_train = np.copy(train)
    masked_test = np.copy(test)

    # Blank out every flagged entry so the imputer sees it as missing.
    for row in range(masked_train.shape[0]):
        flagged = np.argwhere(masked_train[row, 1, :] == 1.0)
        masked_train[row, 0, :][flagged] = np.nan
    for row in range(masked_test.shape[0]):
        flagged = np.argwhere(masked_test[row, 1, :] == 1.0)
        masked_test[row, 0, :][flagged] = np.nan

    imputer = IterativeImputer()

    # A column that is entirely NaN gives the imputer nothing to learn from;
    # zero-fill such columns in the training matrix before fitting.
    for col in range(masked_train.shape[2]):
        if np.all(np.isnan(masked_train[:, 0, col])):
            masked_train[:, 0, col] = 0.0

    imputer.fit(masked_train[:, 0, :])
    return imputer.transform(masked_test[:, 0, :])
## all data: number of missing values (absolute and percent): dat_x.apply(lambda x: x.isnull().sum(), axis=0) dat_x.apply(lambda x: x.isnull().sum() / dat_x.shape[0], axis=0) #from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler ## iterative imputation: ## [[?]] probably only works for continuous variables only... mod_impute = IterativeImputer( imputation_order="ascending", n_iter=10, #predictor = sklearn.linear.RidgeCV(), ## default random_state=21) ## fit on training data: mod_impute.fit(dat_train_x) ## impute training data: dat_train_x_nparray = mod_impute.transform(dat_train_x) #type(dat_train_x_nparray) ## numpy.ndarray (!) ## transform back into a pandas dataframe: dat_train_x = pd.DataFrame(data=dat_train_x_nparray, index=dat_train_x.index, columns=dat_train_x.columns) ## impute test data: dat_test_x_nparray = mod_impute.transform(dat_test_x) #type(dat_train_x_nparray) ## numpy.ndarray (!) ## transform back into a pandas dataframe:
######### MULITPLE IMPUTATION WITH CHAINED EQUATION v2.0 ########## ################################################################## df.columns # Multivariate feature imputation df_impute = df.drop([ 'Obs_ID', 'atty_firm_name', 'employ_status', 'how_injury_occur', 'jurisdiction', 'detail_cause', 'handling_office', 'injury_postal', 'length_how_injury', 'time_injury', 'Dependent', 'policy_yr' ], axis=1) df_impute = pd.get_dummies(df_impute) import numpy as np from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer imp = IterativeImputer(max_iter=10, random_state=0) imp.fit(df_impute) imputed_df = imp.transform(df_impute) imputed_df = pd.DataFrame(imputed_df, columns=df_impute.columns) imputed_df.to_csv('combined_impute.csv') # IterativeImputer(random_state=0) X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] # the model learns that the second feature is double the first print(np.round(imp.transform(X_test))) ################################################################## ############# CORRELATED MISSING VALUES IMPUTATION ############# ################################################################## ############################################################################################ ################### ENCODING AND MISSING VALUE IMPUTATIONS #################################