def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train, test_dataset, _X_test, _y_test, _headers):
    """Impute missing feature values and rebuild the train/test DataFrames.

    Every pipeline step named ``'imputer'`` is looked up in ``_answers`` (at
    the same position as the step) to pick the imputation model. The model is
    fitted on the training features and then applied to both the training and
    the test features.

    Args:
        _steps: Sequence of pipeline step names.
        _answers: Sequence parallel to ``_steps``; ``_answers[i][step]`` holds
            the option chosen for step ``i``.
        train_dataset: Accepted for interface compatibility; not used here.
        _X_train: Training features.
        _y_train: Training target.
        test_dataset: Accepted for interface compatibility; not used here.
        _X_test: Test features.
        _y_test: Test target.
        _headers: Column names; all but the last label the features, the last
            one labels the target column.

    Returns:
        Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames holding
        the (possibly imputed) features plus the target column.

    Raises:
        ValueError: If an ``'imputer'`` step carries an unsupported choice.
    """
    self.steps = _steps
    self.answers = _answers
    self.X_train = _X_train
    self.y_train = _y_train
    self.X_test = _X_test
    self.y_test = _y_test
    self.headers = _headers
    self.train_pipe_steps = []

    for i, step in enumerate(self.steps):
        if step != 'imputer':
            continue

        choice = self.answers[i][step]
        if choice == 'Miss Forest':
            model = MissForest()
        elif choice == 'KNN Miss Values':
            model = KNNImputer(n_neighbors=2)
        else:
            # Previously this fell through with the model unbound (or stale
            # from an earlier iteration); fail loudly instead.
            raise ValueError(
                "Unsupported imputer choice: {!r}".format(choice))

        # Fit on the training split only, then apply to both splits.
        model.fit(self.X_train, self.y_train)
        self.X_train = model.transform(self.X_train)
        self.X_test = model.transform(self.X_test)

    feature_names = self.headers[:-1]
    target_name = self.headers[-1]

    self.new_train_dataset = pd.DataFrame(self.X_train, columns=feature_names)
    self.new_train_dataset[target_name] = self.y_train
    self.new_test_dataset = pd.DataFrame(self.X_test, columns=feature_names)
    self.new_test_dataset[target_name] = self.y_test
    return self.new_train_dataset, self.new_test_dataset
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute features and labels jointly with MissForest, then post-process.

    The label vector is appended to the feature matrix as an extra column so
    that MissForest imputes both together. The imputer is fitted on the
    training matrix and reused for the test matrix. Afterwards the label
    column is binarised with the threshold ``>= 5`` and every feature column
    is rounded: columns flagged ``'cat'`` in ``var`` are rounded to integers
    and clipped to ``[min_vals[j], max_vals[j]]``, all others are rounded to
    one decimal.

    NOTE(review): relies on the module-level names ``data``, ``var``,
    ``min_vals`` and ``max_vals``; ``data.shape[1]`` is used as the number of
    feature columns. ``vals_mask`` and ``cols`` are accepted but not used.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.
    """
    train_xy = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_xy = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    forest = MissForest(random_state=1, n_jobs=-1)
    train_filled = forest.fit_transform(train_xy)
    test_filled = forest.transform(test_xy)

    # Column index of the appended label == number of feature columns.
    n_features = data.shape[1]

    X_train_imp = train_filled[:, 0:n_features]
    y_train_imp = np.array(train_filled[:, n_features] >= 5, dtype="int16")
    X_test_imp = test_filled[:, 0:n_features]
    y_test_imp = np.array(test_filled[:, n_features] >= 5, dtype="int16")

    for col in range(0, X_train_imp.shape[1]):
        is_categorical = var.iloc[col]['type'] == 'cat'
        # Train first, then test, written back in place column by column.
        for matrix in (X_train_imp, X_test_imp):
            if is_categorical:
                matrix[:, col] = np.clip(
                    np.round(matrix[:, col]), min_vals[col], max_vals[col])
            else:
                matrix[:, col] = np.round(matrix[:, col], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
def Missforest_Imputation(self, train_index, test_index, final):
    """Run MissForest on ``self.full_miss_data`` and return the imputed sample.

    When ``final`` is truthy the imputer is fitted and applied on the whole
    data set. Otherwise it is fitted on the rows selected by ``train_index``
    and applied to the rows selected by ``test_index``. Positions of the
    columns listed in ``miss_info["obj_col"]`` are passed as ``cat_vars``
    whenever that list is non-empty.

    Side effects:
        * no object columns: ``self.numMI`` is set to the imputed array.
        * final, with object columns: ``self.MI_pd`` is set to the imputed
          DataFrame (``self.numMI`` is not touched).
        * not final, with object columns: ``self.numMI`` is set to the
          ``self.notobj`` columns of the imputed data.

    Returns:
        The imputed data: an array in every case except the final run with
        object columns, which returns the DataFrame stored in ``self.MI_pd``.
    """
    info = self.miss_info
    obj_col = deepcopy(info["obj_col"])
    cat_var = [
        position
        for position, column in enumerate(info["original_column"])
        if column in obj_col
    ]

    if final:
        if obj_col == []:
            self.numMI = MissForest(max_depth=5).fit_transform(
                X=self.full_miss_data.values)
            return self.numMI

        imputed = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit_transform(
            X=self.full_miss_data.values, cat_vars=cat_var)
        self.MI_pd = pd.DataFrame(imputed, columns=info["original_column"])
        return self.MI_pd

    if obj_col == []:
        fitted = MissForest(max_depth=5).fit(
            X=self.full_miss_data.iloc[train_index, :].values)
        self.numMI = fitted.transform(
            X=self.full_miss_data.iloc[test_index, :].values)
        return self.numMI

    fitted = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit(
        X=self.full_miss_data.iloc[train_index, :].values, cat_vars=cat_var)
    imputed = fitted.transform(self.full_miss_data.iloc[test_index, :].values)
    imputed_frame = pd.DataFrame(imputed, columns=info["original_column"])
    self.numMI = imputed_frame[self.notobj].values
    return imputed_frame.values
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values of ``target_col`` using training-set statistics.

    Both inputs are copied first, so the caller's frames are never modified.

    Args:
        pre_tr_x: Training DataFrame.
        pre_te_x: Test DataFrame.
        target_col: Column label, or list of column labels, to impute. The
            ``"rf"`` strategy passes the selection straight to MissForest.
        how: ``"mean"`` or ``"median"`` fill with the statistic computed on
            the training data; ``"rf"`` fits MissForest on the training data
            and applies it to both frames. Any other value returns the
            unmodified copies.

    Returns:
        Tuple ``(tr_x, te_x)`` of imputed copies.
    """
    tr_x = pre_tr_x.copy()
    te_x = pre_te_x.copy()

    if how in ("mean", "median"):
        train_target = tr_x[target_col]
        fill_value = (
            train_target.mean() if how == "mean" else train_target.median()
        )
        # Fill only the target column(s). Calling fillna on the whole frame
        # with a scalar statistic would also overwrite NaNs in unrelated
        # columns with the target's mean/median.
        tr_x[target_col] = tr_x[target_col].fillna(fill_value)
        te_x[target_col] = te_x[target_col].fillna(fill_value)
    elif how == "rf":
        imputer = MissForest()
        tr_x[target_col] = imputer.fit_transform(tr_x[target_col])
        te_x[target_col] = imputer.transform(te_x[target_col])

    return tr_x, te_x
# Feature scaling (Age) from sklearn.preprocessing import StandardScaler sc = StandardScaler() CV_data[['Age']] = sc.fit_transform(CV_data[['Age']]) test_data[['Age']] = sc.fit_transform(test_data[['Age']]) from missingpy import MissForest # Make an instance and perform the imputation imputer = MissForest(random_state=0) my_imp = imputer.fit(train_data.drop(['Survived', 'Weight'], axis=1)) CV_data_missforest = imputer.transform(CV_data.drop('Survived', axis=1)) CV_data_missforest = pd.DataFrame(CV_data_missforest, columns=CV_data.columns[1:]) CV_data_missforest = pd.concat([CV_data.Survived, CV_data_missforest], axis=1) test_data_missforest = imputer.transform(test_data) test_data_missforest = pd.DataFrame(test_data_missforest, columns=test_data.columns) ## Now that the individuals in the training set have their new weights, and the missing values in the cross-validation and test set have been imputed ## using the MissForest imputation method, we will now fit the logistic model in R since Python doesn't allow for fitting a weighted model train_data.to_excel(r'train_data.xlsx', index = False) CV_data_missforest.to_excel(r'CV_data.xlsx', index = False) test_data_missforest.to_excel(r'test_data.xlsx', index = False) ## The code for the estimation is in a separate file
def prepare_data(data, data_idxs, outcome, convert_categorical=True,
                 keep_cols=None, scaler=None, imputer=None, verbose=False,
                 seed=None):
    """Build the feature matrix ``X`` and survival outcome ``y`` from ``data``.

    Steps, in order: select feature columns, drop ``EXCLUDE_VARS``, optionally
    one-hot encode ``ethnicity``/``gender``, extract the outcome columns,
    keep only rows with a positive event time, select/complete columns,
    impute missing values with MissForest and standardise
    ``NUMERICAL_VARS``.

    Args:
        data: Source DataFrame. All columns except the last six are treated
            as candidate features; it must also contain
            ``'censor_or_<outcome>_days'`` and ``'<outcome>_indicator'``.
        data_idxs: Iterable with one entry per row of ``data``; filtered with
            the same row mask as ``X``/``y``.
        outcome: Outcome name used to build the two outcome column names.
        convert_categorical: If true, dummy-encode ``ethnicity`` and
            ``gender`` and drop the ``'Other'`` and ``'Female'`` dummies.
        keep_cols: If given, the exact list of feature columns to return;
            columns missing from ``X`` are created and filled with ``0.0``.
            If ``None``, columns that are zero in every row are dropped.
        scaler: Fitted scaler to reuse; a new ``StandardScaler`` is fitted
            when ``None``.
        imputer: Fitted imputer to reuse; a new ``MissForest`` is fitted when
            ``None`` and missing values are present.
        verbose: If true, print shapes, columns and summary statistics.
        seed: ``random_state`` for a newly created ``MissForest``.

    Returns:
        Tuple ``(X, y, scaler, imputer, data_idxs)``. ``imputer`` is returned
        unchanged (possibly ``None``) when no missing values were found.
    """
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # remove excluded variables
    for v in EXCLUDE_VARS:
        if v in X.columns:
            print('dropped {} column...'.format(v))
            X = X.drop([v], axis=1)

    # convert categorical variables
    if convert_categorical:
        X = pd.concat([X, pd.get_dummies(X['ethnicity'])], axis=1)
        X = pd.concat([X, pd.get_dummies(X['gender'])], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid colinearity

    ## Extract outcomes
    names = {
        'time': 'censor_or_{}_days'.format(outcome),
        'event': '{}_indicator'.format(outcome),
    }
    y = data[[names['time'], names['event']]]

    ## Filter for appropriate samples
    prev_ct = len(y)
    # Keep strictly positive event times; rows with a time of exactly 0 are
    # removed as well, although the log message below only mentions "< 0".
    pos_events = y.iloc[:, 0] > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = [i for (i, inc) in zip(data_idxs, pos_events.tolist()) if inc]
    print('filtered out {} events with times < 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        X = X.loc[:, (X != 0).any(axis=0)]  # drop columns w/ all zero
    else:
        for vr in keep_cols:
            if vr not in X.columns:
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # check for nulls and impute (per-column null counts)
    x_null = X.isnull().sum()
    y_null = y.isnull().sum()
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)
        if imputer is None:
            print('Fitting MissForest...')
            imputer = MissForest(random_state=seed)
            X_data = imputer.fit_transform(X)
            print('Fitted.')
        else:
            X_data = imputer.transform(X)
        # Keep the original row labels: without index=..., X would get a
        # fresh 0..n-1 index and no longer align with the filtered y.
        X = pd.DataFrame(data=X_data, columns=X.columns, index=X.index)

    # scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        X[NUMERICAL_VARS] = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        X[NUMERICAL_VARS] = scaler.transform(X[NUMERICAL_VARS])

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs