# NOTE(review): Python 2 fragment (print-statement syntax), collapsed onto one
# physical line — original indentation is lost and the inline '#' comments now
# comment out everything after them, so this line is not runnable as-is.
# The span begins mid-argument-list (the call being completed, and the `if`
# matching the dangling `else:`, are outside this chunk) — do not reformat
# without the preceding lines. Pipeline visible here: take perturbed (or
# unperturbed) votes training data, save it as CSV, then for each imputation
# method in votes_params impute, save, binarize categorical columns, cast to
# float, and append labels_train as the last column.
# Assumes `dataname`, `perturb_folder`, `votes_train`, `labels_train`,
# `Imputer`, and `impute` are defined earlier in the file — TODO confirm.
ratio, monotone, votes_params['miss_data_symbol'], votes_params['mnar_values']) else: pert_data = votes_train path = os.path.join( perturb_folder, '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio)) # save perturbed data to disk as csv print '\tSaving perturbed data to {}'.format(path) np.savetxt(path, pert_data, delimiter=",", fmt="%s") # impute data given imp_methods in params.py for imp_method in votes_params['imp_methods']: print '\tImputing with {}'.format(imp_method) imp = Imputer() data = impute(pert_data, imp, imp_method, votes_params) path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format( dataname, imp_method, monotone, ratio) # save data as csv print '\tSaving imputed data to {}'.format(path) np.savetxt(path, data, delimiter=",", fmt="%s") # binarize data data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'], votes_params['miss_data_symbol']) # convert to float data_scaled_bin = data_scaled_bin.astype(float) # add labels as last column data_scaled_bin = np.hstack((data_scaled_bin, labels_train))
# NOTE(review): Python 2 fragment, collapsed onto one physical line (original
# indentation lost; the inline '#' comments truncate the rest of the line in
# this collapsed form). The `if/else` that assigns `x` before `pert_data = x`
# starts outside this chunk, and the line ends on a dangling `else:` whose body
# is also outside view — do not reformat in isolation. Pipeline visible here:
# report the achieved missing-value ratio for the adult training set, save the
# perturbed data as CSV, then per imputation method impute, save, and load a
# previously pickled per-method scaler dict if one exists on disk.
# NOTE(review): `pkl.load` here vs `pickle.load` in the test-set fragment
# below — presumably two aliases for the same module; verify the imports.
# NOTE(review): pkl.load on a file is unsafe on untrusted input — here it
# reads a locally produced scaler cache, so presumably trusted; confirm.
pert_data = x print "\tRatio is {} of {}".format( np.sum(pert_data == adult_params['miss_data_symbol']), len(pert_data) * len(adult_params['cat_cols'])) path = os.path.join( perturb_folder, 'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio)) # save perturbed data to disk as csv print '\tSaving perturbed data to {}'.format(path) np.savetxt(path, pert_data, delimiter=",", fmt="%s") for imp_method in adult_params['imp_methods']: print '\tImputing with {}'.format(imp_method) imp = Imputer() data = impute(pert_data, imp, imp_method, adult_params) path = "data/imputed/{}_mono_{}_ratio_{}.csv".format( imp_method, monotone, ratio) # save data as csv print '\tSaving imputed data to {}'.format(path) np.savetxt(path, data, delimiter=",", fmt="%s") # scale continuous variables and convert categorial to one-hot # store the scaler objects to be used on the test set scaler_path = os.path.join(scalers_folder, "{}_scaler".format(imp_method)) if os.path.isfile(scaler_path): scaler_dict = pkl.load(open(scaler_path, "rb")) else:
# NOTE(review): near-duplicate of the votes fragment above (same pipeline,
# different wrapping of the format() calls) — presumably two revisions of the
# same script concatenated by the extraction; worth deduplicating once the
# full file is visible. Python 2, collapsed onto one physical line; begins
# mid-argument-list and contains a dangling `else:` whose `if` is outside this
# chunk, so it cannot be reformatted in isolation. Visible steps: save the
# (possibly perturbed) votes training data as CSV, then per imputation method
# impute, save, binarize categorical columns, and cast to float.
votes_train, votes_params['cat_cols'], ratio, monotone, votes_params['miss_data_symbol'], votes_params['mnar_values']) else: pert_data = votes_train path = os.path.join(perturb_folder, '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio)) # save perturbed data to disk as csv print '\tSaving perturbed data to {}'.format(path) np.savetxt(path, pert_data, delimiter=",", fmt="%s") # impute data given imp_methods in params.py for imp_method in votes_params['imp_methods']: print '\tImputing with {}'.format(imp_method) imp = Imputer() data = impute(pert_data, imp, imp_method, votes_params) path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(dataname, imp_method, monotone, ratio) # save data as csv print '\tSaving imputed data to {}'.format(path) np.savetxt(path, data, delimiter=",", fmt="%s") # binarize data data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'], votes_params['miss_data_symbol']) # convert to float data_scaled_bin = data_scaled_bin.astype(float)
# NOTE(review): near-duplicate of the adult training fragment above (same
# pipeline, different wrapping) — presumably two revisions of the same script
# concatenated by the extraction; deduplicate when the full file is in view.
# Python 2, collapsed onto one physical line; begins mid-block (`pert_data = x`
# continues an `if/else` outside this chunk) and ends right after the
# `if os.path.isfile(...)` branch whose `else` is also outside view.
# Visible steps: report the achieved missing-value ratio, save the perturbed
# adult training data as CSV, then per imputation method impute, save, and
# load a cached per-method scaler dict from disk via pkl.load if present.
pert_data = x print "\tRatio is {} of {}".format( np.sum(pert_data == adult_params['miss_data_symbol']), len(pert_data) * len(adult_params['cat_cols'])) path = os.path.join(perturb_folder, 'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio)) # save perturbed data to disk as csv print '\tSaving perturbed data to {}'.format(path) np.savetxt(path, pert_data, delimiter=",", fmt="%s") for imp_method in adult_params['imp_methods']: print '\tImputing with {}'.format(imp_method) imp = Imputer() data = impute(pert_data, imp, imp_method, adult_params) path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(imp_method, monotone, ratio) # save data as csv print '\tSaving imputed data to {}'.format(path) np.savetxt(path, data, delimiter=",", fmt="%s") # scale continuous variables and convert categorial to one-hot # store the scaler objects to be used on the test set scaler_path = os.path.join(scalers_folder, "{}_scaler".format(imp_method)) if os.path.isfile(scaler_path): scaler_dict = pkl.load(open(scaler_path, "rb"))
# NOTE(review): Python 2 fragment for the adult TEST set, collapsed onto one
# physical line (indentation lost; the inline '#' comments truncate the line
# in this collapsed form). The span ends inside the nested scaler loop, whose
# body continues past this chunk — do not reformat in isolation.
# Visible steps: binarize the income label ('>50K' -> 1) into an (n, 1)
# column and dump it; drop columns 4 and 14 (education-number and label);
# then per imputation method impute, load the pickled scaler dict saved by
# the training pass, and for each scaler transform the non-categorical
# columns and binarize the categorical ones.
# NOTE(review): bare `delete(...)` — presumably np.delete imported unqualified
# at the top of the file; verify, otherwise this is a NameError.
# NOTE(review): `pickle.load` here vs `pkl.load` in the training fragments —
# presumably both aliases are imported; unify once the imports are visible.
# binarize labels labels = (np.array(x[:, -1]) == '>50K').astype(int) labels = labels.reshape((-1, 1)) # save labels in binary and one-hot representations labels.dump(os.path.join(labels_test_folder, 'labels_bin_test.np')) # remove redundant education-number and labels features x = delete(x, (4, 14), 1) # instantiate Imputer imp = Imputer() for imp_method in adult_params['imp_methods']: print 'Imputing with {}'.format(imp_method) data = impute(x, imp, imp_method, adult_params) # load respective scaler scaler_path = os.path.join(scalers_folder, "{}_scaler".format(imp_method)) scaler_dict = pickle.load(open(scaler_path, "rb")) for name, scaler in scaler_dict.items(): print 'Scaling with {}'.format(name) # scale and binarize, adding one col for missing value in all categ vars data_scaled = np.copy(data) data_scaled[:, adult_params['non_cat_cols']] = scaler.transform( data[:, adult_params['non_cat_cols']].astype(float)) data_scaled_bin = imp.binarize_data(data_scaled, adult_params['cat_cols'], adult_params['miss_data_symbol'])