Example #1
    # save perturbed data to disk as csv
    print('\tSaving perturbed data to {}'.format(path))
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print('\tImputing with {}'.format(imp_method))
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
            dataname, imp_method, monotone, ratio)
        # save data as csv
        print('\tSaving imputed data to {}'.format(path))
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)

        # add labels as last column
        data_scaled_bin = np.hstack((data_scaled_bin, labels_train))

        # save to disk
        filename = "{}_{}_bin_scaled_mono_{}_ratio_{}.np".format(
            dataname, imp_method, monotone, ratio)
        path = os.path.join(feats_train_folder, filename)
        print('\tSaving imputed scaled and binarized data to {}'.format(path))
        data_scaled_bin.dump(path)

# For test data
print('Preparing test data for {}'.format(dataname))
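
The .np files written above via ndarray.dump are just pickled NumPy arrays, so downstream code can read them back with np.load. A minimal sketch, assuming one of the training paths written above:

    train = np.load(path, allow_pickle=True)
    X, y = train[:, :-1], train[:, -1]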
Example #2
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
# the SVM parameters used below match sklearn's LinearSVC, so it is
# assumed here to be an alias for that class
from sklearn.svm import LinearSVC as SVM

# x, cat_cols, and missing_data_cond are defined earlier in the script;
# imp is an instance of the project's Imputer class (see Example #1)

# drop observations with missing variables
# print('imputing with drop')
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print('imputing with feature summarization (mode)')
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one hot row
print('imputing with one-hot')
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions using random forest
print('imputing with predicted values from random forest')
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using SVM
print('imputing with predicted values using SVM')
clf = SVM(penalty='l2',
          loss='squared_hinge',
          dual=True,
          tol=0.0001,
          C=1.0,
          multi_class='ovr',
          fit_intercept=True)  # original snippet is truncated here; call closed minimally
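
By analogy with the random-forest step above, the truncated SVM block presumably finishes by handing the fitted-classifier object to the same prediction-based imputer. A minimal sketch, assuming imp.predict accepts any scikit-learn-style classifier:

    data_svm = imp.predict(x, cat_cols, missing_data_cond, clf)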
Example #3
        scaler = StandardScaler()
        scaler = scaler.fit(
            data[:, adult_params['non_cat_cols']].astype(float))

        data_scaled = np.copy(data)
        data_scaled[:, adult_params['non_cat_cols']] = scaler.transform(
            data[:, adult_params['non_cat_cols']].astype(float))

        # key is imputation method and ratio dependent
        # filename is imputation method dependent
        scaler_dict["{}_ratio_{}".format(imp_method, ratio)] = scaler
        with open(scaler_path, 'wb') as f:
            pkl.dump(scaler_dict, f)

        # binarize scaled data
        data_scaled_bin = imp.binarize_data(data_scaled,
                                            adult_params['cat_cols'],
                                            adult_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)

        # add labels as last column
        data_scaled_bin = np.hstack((data_scaled_bin, labels))

        # save to disk
        filename = "{}_bin_scaled_mono_{}_ratio_{}.np".format(
            imp_method, monotone, ratio)
        path = os.path.join(feats_train_folder, filename)
        print('\tSaving imputed scaled and binarized data to {}'.format(path))
        data_scaled_bin.dump(path)
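
At test time the pickled dictionary lets each imputation method reuse the scaler fitted on its training data. A minimal sketch, assuming the same scaler_path and key format as above (test_data stands in for the imputed test matrix):

    with open(scaler_path, 'rb') as f:
        scaler_dict = pkl.load(f)
    scaler = scaler_dict['{}_ratio_{}'.format(imp_method, ratio)]
    test_data[:, adult_params['non_cat_cols']] = scaler.transform(
        test_data[:, adult_params['non_cat_cols']].astype(float))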
Example #4
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print('\tImputing with {}'.format(imp_method))
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(dataname,
                                                                imp_method,
                                                                monotone,
                                                                ratio)
        # save data as csv
        print('\tSaving imputed data to {}'.format(path))
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data,
                                            votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)

        # add labels as last column
        data_scaled_bin = np.hstack((data_scaled_bin, labels_train))

        # save to disk
        filename = "{}_{}_bin_scaled_mono_{}_ratio_{}.np".format(dataname,
                                                                 imp_method,
                                                                 monotone,
                                                                 ratio)
        path = os.path.join(feats_train_folder, filename)
        print('\tSaving imputed scaled and binarized data to {}'.format(path))
        data_scaled_bin.dump(path)
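
The impute helper called in this loop belongs to the project and is not shown on this page. Purely as an illustration, a dispatcher with the same signature could map each method name onto the Imputer calls from Example #2; the method names and the missing_data_cond construction below are hypothetical:

    def impute(data, imp, imp_method, params):
        # hypothetical dispatcher; the project's real impute() may differ
        missing_data_cond = lambda x: x == params['miss_data_symbol']
        if imp_method == 'RandomReplace':
            return imp.replace(data, missing_data_cond)
        if imp_method == 'Summary':
            return imp.summarize(data, lambda x: mode(x)[0], missing_data_cond)
        if imp_method == 'RandomForest':
            clf = RandomForestClassifier(n_estimators=100, criterion='gini')
            return imp.predict(data, params['cat_cols'], missing_data_cond, clf)
        raise ValueError('unknown imputation method: {}'.format(imp_method))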