Ejemplo n.º 1
0
    def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
                test_dataset, _X_test, _y_test, _headers):
        """Impute missing feature values in the train and test splits.

        The imputer is chosen from the pipeline description: for every
        position ``i`` where ``_steps[i] == 'imputer'``, the value
        ``_answers[i]['imputer']`` selects the algorithm ('Miss Forest' or
        'KNN Miss Values'). When several imputer steps are present the last
        recognised one is used. The imputer is fitted on the training
        features only and then applied to both splits.

        Args:
            _steps: Sequence of pipeline step names.
            _answers: Sequence, parallel to ``_steps``, of mappings from the
                step name to the option chosen for that step.
            train_dataset: Not used by this method.
            _X_train: Training features.
            _y_train: Training target.
            test_dataset: Not used by this method.
            _X_test: Test features.
            _y_test: Test target.
            _headers: Column names; all but the last name the features and
                the last one names the target column.

        Returns:
            Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames
            holding the imputed features plus the target column.

        Raises:
            ValueError: If no 'imputer' step with a recognised option is
                present in ``_steps`` / ``_answers``.
        """
        self.steps = _steps
        self.answers = _answers
        self.X_train = _X_train
        self.y_train = _y_train
        self.X_test = _X_test
        self.y_test = _y_test
        self.headers = _headers

        self.train_pipe_steps = []

        selected = None
        for i, s in enumerate(self.steps):
            if s != 'imputer':
                continue
            choice = self.answers[i][s]
            if choice == 'Miss Forest':
                selected = MissForest()
            elif choice == 'KNN Miss Values':
                selected = KNNImputer(n_neighbors=2)

        # Fail with a clear message instead of an UnboundLocalError further
        # down when the configuration never selected an imputer.
        if selected is None:
            raise ValueError(
                "no supported imputer configured: expected an 'imputer' step "
                "answered with 'Miss Forest' or 'KNN Miss Values'")

        # Fit on the training split only, then apply to both splits.
        selected.fit(self.X_train, self.y_train)
        self.X_train = selected.transform(self.X_train)
        self.X_test = selected.transform(self.X_test)

        feature_names = self.headers[:-1]
        target_name = self.headers[-1]

        self.new_train_dataset = pd.DataFrame(self.X_train,
                                              columns=feature_names)
        self.new_train_dataset[target_name] = self.y_train

        self.new_test_dataset = pd.DataFrame(self.X_test,
                                             columns=feature_names)
        self.new_test_dataset[target_name] = self.y_test

        return self.new_train_dataset, self.new_test_dataset
Ejemplo n.º 2
0
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute train/test data with MissForest and post-process the result.

    The target is appended to the features as a last column, a MissForest
    imputer is fitted on the combined training matrix and then applied to
    the combined test matrix. Afterwards the target column is binarised
    (``>= 5`` becomes 1, otherwise 0, stored as int16) and every feature
    column is rounded: columns flagged ``'cat'`` in the module-level ``var``
    table are rounded to integers and clipped to ``min_vals[j]`` /
    ``max_vals[j]``; all other columns are rounded to one decimal.

    Relies on the module-level names ``data`` (its column count decides
    where features end and the target begins), ``var``, ``min_vals`` and
    ``max_vals``. ``vals_mask`` and ``cols`` are accepted but not used.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.
    """
    # Stack the target next to the features so it takes part in imputation.
    train_matrix = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_matrix = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    forest = MissForest(random_state=1, n_jobs=-1)
    train_filled = forest.fit_transform(train_matrix)
    test_filled = forest.transform(test_matrix)

    # Column count of the global `data` marks the feature/target boundary.
    n_features = data.shape[1]

    X_train_imp = train_filled[:, 0:n_features]
    y_train_imp = np.array(train_filled[:, n_features] >= 5, dtype="int16")
    X_test_imp = test_filled[:, 0:n_features]
    y_test_imp = np.array(test_filled[:, n_features] >= 5, dtype="int16")

    for j in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[j]['type'] == 'cat'
        # Same treatment for the train split first, then the test split.
        for split in (X_train_imp, X_test_imp):
            if is_categorical:
                split[:, j] = np.clip(np.round(split[:, j]),
                                      min_vals[j], max_vals[j])
            else:
                split[:, j] = np.round(split[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
Ejemplo n.º 3
0
 def Missforest_Imputation(self, train_index, test_index, final):
     """Impute ``self.full_miss_data`` with MissForest.

     With ``final`` truthy the imputer is fitted on, and applied to, the
     whole data set. Otherwise it is fitted on the rows selected by
     ``train_index`` and applied to the rows selected by ``test_index``.

     Columns listed in ``self.miss_info["obj_col"]`` are passed to
     MissForest as categorical variables (by position within
     ``self.miss_info["original_column"]``). When that list is empty a
     plain numeric imputation is run instead.

     Side effects: sets ``self.numMI`` in every branch except the final
     run with categorical columns, which sets ``self.MI_pd`` instead.

     Returns:
         The imputed data: a DataFrame for the final run with categorical
         columns, otherwise a plain array.
     """
     info = self.miss_info
     obj_col = deepcopy(info["obj_col"])
     columns = info["original_column"]
     # Positions of the categorical columns, as MissForest expects them.
     cat_var = [pos for pos, name in enumerate(columns) if name in obj_col]
     numeric_only = obj_col == []

     if final:
         if numeric_only:
             forest = MissForest(max_depth=5)
             self.numMI = forest.fit_transform(X=self.full_miss_data.values)
             return self.numMI

         forest = MissForest(verbose=0, n_jobs=-1, max_depth=5)
         imputed = forest.fit_transform(X=self.full_miss_data.values,
                                        cat_vars=cat_var)
         self.MI_pd = pd.DataFrame(imputed, columns=columns)
         return self.MI_pd

     if numeric_only:
         forest = MissForest(max_depth=5)
         fitted = forest.fit(X=self.full_miss_data.iloc[train_index, :].values)
         self.numMI = fitted.transform(
             X=self.full_miss_data.iloc[test_index, :].values)
         return self.numMI

     forest = MissForest(verbose=0, n_jobs=-1, max_depth=5)
     fitted = forest.fit(X=self.full_miss_data.iloc[train_index, :].values,
                         cat_vars=cat_var)
     imputed = fitted.transform(
         self.full_miss_data.iloc[test_index, :].values)
     imputed_frame = pd.DataFrame(imputed, columns=columns)
     # Keep only the columns named by self.notobj for later use.
     self.numMI = imputed_frame[self.notobj].values
     return imputed_frame.values
Ejemplo n.º 4
0
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values in ``target_col`` using training-set statistics.

    Both inputs are copied; the originals are never modified. The fill
    value (or the fitted imputer) is always derived from the training frame
    and then applied to both frames, so nothing is learned from the test
    frame.

    Args:
        pre_tr_x: Training DataFrame.
        pre_te_x: Test DataFrame.
        target_col: A column label or a list of column labels to fill.
            ``how="rf"`` passes ``tr_x[target_col]`` straight to the
            imputer, so use a list of columns there.
        how: ``"mean"``, ``"median"`` or ``"rf"`` (MissForest). Any other
            value leaves the data unchanged.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies.
    """
    tr_x = pre_tr_x.copy()
    te_x = pre_te_x.copy()
    if how in ("mean", "median"):
        if how == "mean":
            fill_value = tr_x[target_col].mean()
        else:
            fill_value = tr_x[target_col].median()
        # Fill only the requested column(s). Filling the whole frame with a
        # scalar statistic would overwrite NaNs in unrelated columns with
        # the target column's mean/median.
        tr_x[target_col] = tr_x[target_col].fillna(fill_value)
        te_x[target_col] = te_x[target_col].fillna(fill_value)
    elif how == "rf":
        imputer = MissForest()
        tr_x[target_col] = imputer.fit_transform(tr_x[target_col])
        te_x[target_col] = imputer.transform(te_x[target_col])
    return tr_x, te_x
Ejemplo n.º 5
0
# Feature scaling (Age)
# Standardises the 'Age' column of the cross-validation and test frames
# in place.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
CV_data[['Age']] = sc.fit_transform(CV_data[['Age']])
# NOTE(review): fit_transform re-fits the scaler on test_data here, so the
# test set is scaled with its own mean/std instead of the statistics learned
# from CV_data on the line above — confirm this is intended; sc.transform
# would reuse the already fitted statistics.
test_data[['Age']] = sc.fit_transform(test_data[['Age']])


from missingpy import MissForest

# Make an instance and perform the imputation
# The imputer is fitted on the training features only ('Survived' and
# 'Weight' are dropped first). `my_imp` is not used again in this part of
# the script; the later calls go through `imputer` directly.
imputer = MissForest(random_state=0)
my_imp = imputer.fit(train_data.drop(['Survived', 'Weight'], axis=1))

# Impute the cross-validation features, then re-attach the 'Survived' column.
# NOTE(review): columns=CV_data.columns[1:] assumes 'Survived' is the first
# column of CV_data — TODO confirm.
# NOTE(review): pd.concat(axis=1) aligns rows by index label and the imputed
# frame gets a default 0..n-1 index; presumably CV_data has the same index —
# verify.
CV_data_missforest = imputer.transform(CV_data.drop('Survived', axis=1))
CV_data_missforest = pd.DataFrame(CV_data_missforest, columns=CV_data.columns[1:])
CV_data_missforest = pd.concat([CV_data.Survived, CV_data_missforest], axis=1)

# Impute the test set with the same fitted imputer.
# NOTE(review): test_data is passed as is, so it presumably holds exactly the
# feature columns the imputer was fitted on — TODO confirm.
test_data_missforest = imputer.transform(test_data)
test_data_missforest = pd.DataFrame(test_data_missforest, columns=test_data.columns)

## Now that the individuals in the training set have their new weights, and the missing values in the cross-validation and test set have been imputed
## using the MissForest imputation method, we will now fit the logistic model in R since Python doesn't allow for fitting a weighted model

# Export the three data sets as Excel files (without the index column) so the
# model can be fitted outside Python. train_data is written as it stands; it
# is not passed through the imputer in this part of the script.
train_data.to_excel(r'train_data.xlsx', index = False)
CV_data_missforest.to_excel(r'CV_data.xlsx', index = False)
test_data_missforest.to_excel(r'test_data.xlsx', index = False)

## The code for the estimation is in a separate file
Ejemplo n.º 6
0
def prepare_data(data,
                 data_idxs,
                 outcome,
                 convert_categorical=True,
                 keep_cols=None,
                 scaler=None,
                 imputer=None,
                 verbose=False,
                 seed=None):
    """Build the feature matrix and outcome table for one outcome.

    Steps, in order:
      1. Take all but the last six columns of ``data`` as features and drop
         any column listed in the module-level ``EXCLUDE_VARS``.
      2. Optionally one-hot encode 'ethnicity' and 'gender', dropping the
         'Other' and 'Female' dummies as reference levels.
      3. Extract the time and event columns for ``outcome``
         (``censor_or_<outcome>_days`` and ``<outcome>_indicator``).
      4. Keep only rows whose time is strictly positive and filter
         ``data_idxs`` the same way.
      5. Either drop all-zero columns (``keep_cols is None``) or restrict to
         ``keep_cols``, creating missing ones filled with 0.0.
      6. Impute with MissForest (fitted here when ``imputer`` is None,
         otherwise the given imputer is only applied).
      7. Standardise the module-level ``NUMERICAL_VARS`` columns (scaler
         fitted here when ``scaler`` is None, otherwise only applied).

    Args:
        data: DataFrame with the features first and the outcome columns in
            the last six positions.
        data_idxs: Row identifiers, one per row of ``data``.
        outcome: Outcome name used to build the time/event column names.
        convert_categorical: Whether to one-hot encode 'ethnicity'/'gender'.
        keep_cols: Exact feature columns (and order) to return, or None.
        scaler: Already fitted scaler to reuse, or None to fit a new one.
        imputer: Already fitted imputer to reuse, or None to fit a new one.
        verbose: Print shapes, columns and summary statistics when true.
        seed: ``random_state`` for a newly created MissForest.

    Returns:
        Tuple ``(X, y, scaler, imputer, data_idxs)``. ``X`` keeps the row
        index of the filtered input, so it stays aligned with ``y``.
    """
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # remove excluded variables
    for v in EXCLUDE_VARS:
        if v in X.columns:
            print('dropped {} column...'.format(v))
            X = X.drop([v], axis=1)

    # convert categorical variables
    if convert_categorical:
        X = pd.concat([X, pd.get_dummies(X['ethnicity'])], axis=1)
        X = pd.concat([X, pd.get_dummies(X['gender'])], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid colinearity

    ## Extract outcomes
    names = {
        'time': 'censor_or_{}_days'.format(outcome),
        'event': '{}_indicator'.format(outcome),
    }
    y = data[[names['time'], names['event']]]

    ## Filter for appropriate samples
    prev_ct = len(y)
    pos_events = y.iloc[:, 0] > 0  # event times > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = [i for (i, inc) in zip(data_idxs, pos_events.tolist()) if inc]
    # The mask keeps strictly positive times, so zero times are dropped too.
    print('filtered out {} events with times <= 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        X = X.loc[:, (X != 0).any(axis=0)]  # drop columns w/ all zero
    else:
        for vr in keep_cols:
            if vr not in X.columns:
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # check for nulls and impute
    # Explicit per-column counts; does not depend on how np.sum dispatches
    # on a DataFrame.
    x_null = X.isnull().sum()
    y_null = y.isnull().sum()
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)
    if imputer is None:
        print('Fitting MissForest...')
        imputer = MissForest(random_state=seed)
        X_data = imputer.fit_transform(X)
        fitted_now = True
    else:
        X_data = imputer.transform(X)
        fitted_now = False
    # The imputer returns a bare array; restore both the column names and
    # the row index so X stays label-aligned with y.
    X = pd.DataFrame(data=X_data, columns=X.columns, index=X.index)
    if fitted_now:
        print('Fitted.')

    # scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        X[NUMERICAL_VARS] = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        X[NUMERICAL_VARS] = scaler.transform(X[NUMERICAL_VARS])

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs