# NOTE(review): this chunk starts inside an unseen conditional -- the first
# statement below is the body of an `if` branch whose condition lies outside
# this chunk; the `else` branch leaves the training data unperturbed.
# NOTE(review): `perturb_data` is called here, while a sibling chunk of this
# file calls `perturbate_data` for the same purpose -- confirm which spelling
# is actually defined; one of the two is likely a typo.
    pert_data, _ = perturb_data(votes_train,
                                votes_params['cat_cols'],
                                ratio,
                                monotone,
                                votes_params['miss_data_symbol'],
                                votes_params['mnar_values'])
else:
    pert_data = votes_train

# destination for the perturbed training data
path = os.path.join(
    perturb_folder,
    '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio))

# save perturbed data to disk as csv
print '\tSaving perturbed data to {}'.format(path)
np.savetxt(path, pert_data, delimiter=",", fmt="%s")

# impute data given imp_methods in params.py
for imp_method in votes_params['imp_methods']:
    print '\tImputing with {}'.format(imp_method)
    imp = Imputer()
    data = impute(pert_data, imp, imp_method, votes_params)
    path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
        dataname, imp_method, monotone, ratio)

    # save data as csv
    print '\tSaving imputed data to {}'.format(path)
    np.savetxt(path, data, delimiter=",", fmt="%s")

    # binarize data
    data_scaled_bin = imp.binarize_data(data,
                                        votes_params['cat_cols'],
                                        votes_params['miss_data_symbol'])

    # convert to float
    data_scaled_bin = data_scaled_bin.astype(float)

    # add labels as last column
    data_scaled_bin = np.hstack((data_scaled_bin, labels_train))
import numpy as np
from scipy.stats import mode, itemfreq
from scipy import delete  # NOTE(review): scipy's re-export of numpy.delete
                          # is deprecated/removed in modern scipy; kept only
                          # in case unseen code in this file still uses it.
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM

from missing_data_imputation import Imputer


# load the raw congressional-votes training set; dtype=object keeps the
# string fields (including the '?' missing-value markers) intact
x = np.genfromtxt("data/votes_train.csv", delimiter=",",
                  dtype=object)  # use training set

# enumerate parameters and instantiate Imputer
imp = Imputer()


def missing_data_cond(values):
    """Boolean mask: True where an entry carries the '?' missing symbol."""
    return values == "?"


# every visible feature column of the votes data is categorical
cat_cols = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
n_neighbors = 3  # lower for votes

# drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print("imputing with random replacement")
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print("imputing with feature summarization (mode)")


def summ_func(col):
    """Per-feature summary used for mode imputation."""
    return mode(col)[0]
from scipy import delete  # NOTE(review): scipy's re-export of numpy.delete
                          # is deprecated/removed in modern scipy; kept only
                          # in case unseen code in this file still uses it.
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM

from missing_data_imputation import Imputer


# load the raw adult training set; dtype=object keeps the string-valued
# categorical fields intact
x = np.genfromtxt('data/adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature
# NOTE(review): columns 4 AND 14 are deleted although the comment mentions
# only education-number -- confirm what column 14 holds.
x = np.delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()


def missing_data_cond(values):
    """Boolean mask: True where an entry carries the '?' missing symbol."""
    return values == '?'


cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)  # categorical feature columns
n_neighbors = 5

# # drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print('imputing with feature summarization (mode)')


def summ_func(col):
    """Per-feature summary used for mode imputation."""
    return mode(col)[0]
# NOTE(review): this chunk starts inside an unseen conditional; this `else`
# branch leaves the training data unperturbed.
else:
    pert_data = x

# report achieved missing-cell count over total categorical cells
print "\tRatio is {} of {}".format(
    np.sum(pert_data == adult_params['miss_data_symbol']),
    len(pert_data) * len(adult_params['cat_cols']))

path = os.path.join(
    perturb_folder,
    'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio))

# save perturbed data to disk as csv
print '\tSaving perturbed data to {}'.format(path)
np.savetxt(path, pert_data, delimiter=",", fmt="%s")

# impute the perturbed data with every configured method
for imp_method in adult_params['imp_methods']:
    print '\tImputing with {}'.format(imp_method)
    imp = Imputer()
    data = impute(pert_data, imp, imp_method, adult_params)
    path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(
        imp_method, monotone, ratio)

    # save data as csv
    print '\tSaving imputed data to {}'.format(path)
    np.savetxt(path, data, delimiter=",", fmt="%s")

    # scale continuous variables and convert categorial to one-hot
    # store the scaler objects to be used on the test set
    scaler_path = os.path.join(scalers_folder,
                               "{}_scaler".format(imp_method))
    # reuse a previously fitted scaler when one was already saved
    # NOTE(review): chunk is cut off inside this `if`; the load below is the
    # last visible statement.
    if os.path.isfile(scaler_path):
        scaler_dict = pkl.load(open(scaler_path, "rb"))
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
mpl.use('Agg')  # non-GUI backend: figures are only written to disk
import matplotlib.pyplot as plt

from missing_data_imputation import Imputer

plt.rcParams.update({'figure.autolayout': True})

# load the raw adult training data; dtype=object keeps the string fields
x = np.genfromtxt('adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
missing_data_symbol = '?'
miss_data_cond = lambda x: x == missing_data_symbol
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5

# row/column indices of every cell carrying the missing-data symbol
miss_data_rows, miss_data_cols = np.where(miss_data_cond(x))

# remove rows with missing data, which is MNAR in the ADULT dataset
x = np.delete(x, miss_data_rows, axis=0)

# perturbation ratios to sweep (10..90; presumably percent -- confirm)
ratios = np.arange(10, 100, 10)

# monotone False must be fixed
monotone = True


# NOTE(review): chunk ends at this signature -- the function body lies
# beyond this chunk and is not documented here.
def perturbate_data(x, cat_cols, ratio, missing_data_symbol,
                    monotone=False, in_place=False):
# NOTE(review): this chunk starts inside an unseen conditional; this `else`
# branch leaves the training data unperturbed.
else:
    pert_data = x

# report achieved missing-cell count over total categorical cells
print "\tRatio is {} of {}".format(
    np.sum(pert_data == adult_params['miss_data_symbol']),
    len(pert_data) * len(adult_params['cat_cols']))

path = os.path.join(
    perturb_folder,
    'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio))

# save perturbed data to disk as csv
print '\tSaving perturbed data to {}'.format(path)
np.savetxt(path, pert_data, delimiter=",", fmt="%s")

# impute the perturbed data with every configured method
for imp_method in adult_params['imp_methods']:
    print '\tImputing with {}'.format(imp_method)
    imp = Imputer()
    data = impute(pert_data, imp, imp_method, adult_params)
    path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(
        imp_method, monotone, ratio)

    # save data as csv
    print '\tSaving imputed data to {}'.format(path)
    np.savetxt(path, data, delimiter=",", fmt="%s")

    # scale continuous variables and convert categorial to one-hot
    # store the scaler objects to be used on the test set
    scaler_path = os.path.join(scalers_folder,
                               "{}_scaler".format(imp_method))
    # NOTE(review): chunk is cut off here, inside this `if` -- its body is
    # not visible.
    if os.path.isfile(scaler_path):
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
mpl.use('Agg')  # non-GUI backend: figures are only written to disk
import matplotlib.pyplot as plt

from missing_data_imputation import Imputer

plt.rcParams.update({'figure.autolayout': True})

# load the raw adult training data; dtype=object keeps the string fields
x = np.genfromtxt('adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
missing_data_symbol = '?'
miss_data_cond = lambda x: x == missing_data_symbol
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5

# row/column indices of every cell carrying the missing-data symbol
miss_data_rows, miss_data_cols = np.where(miss_data_cond(x))

# remove rows with missing data, which is MNAR in the ADULT dataset
x = np.delete(x, miss_data_rows, axis=0)

# perturbation ratios to sweep (10..90; presumably percent -- confirm)
ratios = np.arange(10, 100, 10)

# monotone False must be fixed
monotone = True


# NOTE(review): chunk is truncated inside this signature; the remaining
# parameters and the body lie beyond this chunk.
def perturbate_data(x,
# NOTE(review): this chunk starts inside an unseen conditional -- the first
# statement below is the body of an `if` branch whose condition lies outside
# this chunk; the `else` branch leaves the training data unperturbed.
    pert_data, _ = perturbate_data(
        votes_train, votes_params['cat_cols'], ratio, monotone,
        votes_params['miss_data_symbol'], votes_params['mnar_values'])
else:
    pert_data = votes_train

# destination for the perturbed training data
path = os.path.join(
    perturb_folder,
    '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio))

# save perturbed data to disk as csv
print '\tSaving perturbed data to {}'.format(path)
np.savetxt(path, pert_data, delimiter=",", fmt="%s")

# impute data given imp_methods in params.py
for imp_method in votes_params['imp_methods']:
    print '\tImputing with {}'.format(imp_method)
    imp = Imputer()
    data = impute(pert_data, imp, imp_method, votes_params)
    path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
        dataname, imp_method, monotone, ratio)

    # save data as csv
    print '\tSaving imputed data to {}'.format(path)
    np.savetxt(path, data, delimiter=",", fmt="%s")

    # binarize data
    data_scaled_bin = imp.binarize_data(data,
                                        votes_params['cat_cols'],
                                        votes_params['miss_data_symbol'])

    # convert to float
    # NOTE(review): chunk ends here; subsequent processing is not visible.
    data_scaled_bin = data_scaled_bin.astype(float)