# save perturbed data to disk as csv
print '\tSaving perturbed data to {}'.format(path)
np.savetxt(path, pert_data, delimiter=",", fmt="%s")

# impute data given imp_methods in params.py
for imp_method in votes_params['imp_methods']:
    print '\tImputing with {}'.format(imp_method)
    imp = Imputer()
    data = impute(pert_data, imp, imp_method, votes_params)
    path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
        dataname, imp_method, monotone, ratio)

    # save data as csv
    print '\tSaving imputed data to {}'.format(path)
    np.savetxt(path, data, delimiter=",", fmt="%s")

    # binarize data
    data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'],
                                        votes_params['miss_data_symbol'])

    # convert to float
    data_scaled_bin = data_scaled_bin.astype(float)

    # add labels as last column
    data_scaled_bin = np.hstack((data_scaled_bin, labels_train))

    # save to disk
    filename = "{}_{}_bin_scaled_mono_{}_ratio_{}.np".format(
        dataname, imp_method, monotone, ratio)
    path = os.path.join(feats_train_folder, filename)
    print '\tSaving imputed scaled and binarized data to {}'.format(path)
    data_scaled_bin.dump(path)

# For test data
print 'Preparing test data for {}'.format(dataname)
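# sketch (not from the source): reloading one of the pickle-based .np dumps
# written above; ndarray.dump uses pickle under the hood, so np.load reads it
# back directly here, though NumPy >= 1.16.3 requires allow_pickle=True
train_feats = np.load(os.path.join(feats_train_folder, filename))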
# drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print 'imputing with random replacement'
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print 'imputing with feature summarization (mode)'
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one-hot rows
print 'imputing with one-hot'
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions from a random forest
print 'imputing with predicted values from random forest'
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions from a linear SVM
print 'imputing with predicted values using SVM'
clf = SVM(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001,
          C=1.0, multi_class='ovr', fit_intercept=True)
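# sketch (not in the source, which truncates mid-call above): the fitted
# SVM would presumably be used the same way as the random forest, via the
# same imp.predict interface
data_svm = imp.predict(x, cat_cols, missing_data_cond, clf)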
scaler = StandardScaler()
scaler = scaler.fit(data[:, adult_params['non_cat_cols']].astype(float))
data_scaled = np.copy(data)
data_scaled[:, adult_params['non_cat_cols']] = scaler.transform(
    data[:, adult_params['non_cat_cols']].astype(float))

# key is imputation method and ratio dependent
# filename is imputation method dependent
scaler_dict["{}_ratio_{}".format(imp_method, ratio)] = scaler
with open(scaler_path, 'wb') as f:
    pkl.dump(scaler_dict, f)

# binarize scaled data
data_scaled_bin = imp.binarize_data(data_scaled,
                                    adult_params['cat_cols'],
                                    adult_params['miss_data_symbol'])

# convert to float
data_scaled_bin = data_scaled_bin.astype(float)

# add labels as last column
data_scaled_bin = np.hstack((data_scaled_bin, labels))

# save to disk
filename = "{}_bin_scaled_mono_{}_ratio_{}.np".format(
    imp_method, monotone, ratio)
path = os.path.join(feats_train_folder, filename)
print '\tSaving imputed scaled and binarized data to {}'.format(path)
data_scaled_bin.dump(path)
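# sketch (assumed, not in the source): restoring the persisted scalers at
# test time and applying them to held-out features; 'data_test' is a
# hypothetical placeholder for the imputed test matrix
with open(scaler_path, 'rb') as f:
    scaler_dict = pkl.load(f)
scaler = scaler_dict["{}_ratio_{}".format(imp_method, ratio)]
data_test_scaled = np.copy(data_test)
data_test_scaled[:, adult_params['non_cat_cols']] = scaler.transform(
    data_test[:, adult_params['non_cat_cols']].astype(float))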