Example #1
0
        pert_data, _ = perturb_data(votes_train, votes_params['cat_cols'],
                                    ratio, monotone,
                                    votes_params['miss_data_symbol'],
                                    votes_params['mnar_values'])
    else:
        pert_data = votes_train
    path = os.path.join(
        perturb_folder,
        '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
            dataname, imp_method, monotone, ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)

        # add labels as last column
        data_scaled_bin = np.hstack((data_scaled_bin, labels_train))
Example #2
0
# Python 2 script: load the congressional-votes training set and impute its
# missing values with several strategies from the project-local Imputer.
import numpy as np
from scipy.stats import mode, itemfreq
from scipy import delete
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM

from missing_data_imputation import Imputer


# load the raw training data; dtype=object keeps every cell as a byte string
# so the '?' missing-value marker survives untouched
x = np.genfromtxt("data/votes_train.csv", delimiter=",", dtype=object)  # use training set

# enumerate parameters and instantiate Imputer
imp = Imputer()
# predicate marking missing entries: the dataset encodes them as "?"
missing_data_cond = lambda x: x == "?"
# all 16 vote columns are treated as categorical
cat_cols = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
n_neighbors = 3  # lower for votes

# drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values drawn from the same column
print "imputing with random replacement"
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with a per-feature summary statistic
print "imputing with feature summarization (mode)"
# scipy.stats.mode returns (modes, counts); [0] keeps only the mode values
summ_func = lambda x: mode(x)[0]
Example #3
0
# Python 2 script: load the raw ADULT training set and impute its missing
# values with the project-local Imputer.
# NOTE(review): `np` and `mode` are used below but their imports are not
# visible in this excerpt — presumably `import numpy as np` and
# `from scipy.stats import mode` appear earlier in the full file; verify.
from scipy import delete
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM

from missing_data_imputation import Imputer

# load the raw training data; dtype=object keeps every cell as a byte string
# so the '?' missing-value marker survives untouched
x = np.genfromtxt('data/adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature (column 4) and column 14
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
# predicate marking missing entries: the dataset encodes them as '?'
missing_data_cond = lambda x: x == '?'
# indices of the categorical columns after the deletion above
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5

# # drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values drawn from the same column
print 'imputing with random replacement'
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with a per-feature summary statistic
print 'imputing with feature summarization (mode)'
# scipy.stats.mode returns (modes, counts); [0] keeps only the mode values
summ_func = lambda x: mode(x)[0]
Example #4
0
    else:
        pert_data = x
    print "\tRatio is {} of {}".format(
        np.sum(pert_data == adult_params['miss_data_symbol']),
        len(pert_data) * len(adult_params['cat_cols']))

    path = os.path.join(
        perturb_folder,
        'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")

    for imp_method in adult_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, adult_params)

        path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(
            imp_method, monotone, ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # scale continuous variables and convert categorial to one-hot
        # store the scaler objects to be used on the test set
        scaler_path = os.path.join(scalers_folder,
                                   "{}_scaler".format(imp_method))

        if os.path.isfile(scaler_path):
            scaler_dict = pkl.load(open(scaler_path, "rb"))
Example #5
0
# Python 2 script preamble: load the raw ADULT training set, drop rows with
# missing values, and set up the perturbation ratios for the experiments.
# NOTE(review): `np`, `delete`, and `os` usage suggests imports cut from this
# excerpt (`import numpy as np`, `from scipy import delete`) — verify against
# the full file.
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from missing_data_imputation import Imputer

plt.rcParams.update({'figure.autolayout': True})

# load the raw training data; dtype=object keeps every cell as a byte string
# so the '?' missing-value marker survives untouched
x = np.genfromtxt('adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature (column 4) and column 14
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
missing_data_symbol = '?'
# predicate marking missing entries
miss_data_cond = lambda x: x == missing_data_symbol
# indices of the categorical columns after the deletion above
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5
# row/column coordinates of every originally-missing cell
miss_data_rows, miss_data_cols = np.where(miss_data_cond(x))

# remove rows with missing data, which are MNAR in the ADULT dataset
x = np.delete(x, miss_data_rows, axis=0)
# perturbation ratios to sweep: 10%, 20%, ..., 90%
ratios = np.arange(10, 100, 10)

# monotone False must be fixed
monotone = True

def perturbate_data(x, cat_cols, ratio, missing_data_symbol,
                    monotone=False, in_place=False):
Example #6
0
    else:
        pert_data = x
    print "\tRatio is {} of {}".format(
            np.sum(pert_data == adult_params['miss_data_symbol']), 
            len(pert_data) * len(adult_params['cat_cols']))

    path = os.path.join(perturb_folder,
                        'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone,
                                                                       ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")

    for imp_method in adult_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, adult_params)

        path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(imp_method,
                                                             monotone,
                                                             ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # scale continuous variables and convert categorial to one-hot
        # store the scaler objects to be used on the test set
        scaler_path = os.path.join(scalers_folder,
                                   "{}_scaler".format(imp_method))

        if os.path.isfile(scaler_path):
Example #7
0
# Python 2 script preamble: load the raw ADULT training set, drop rows with
# missing values, and set up the perturbation ratios for the experiments.
# NOTE(review): `np` and `delete` are used below but not imported in this
# excerpt — presumably `import numpy as np` and `from scipy import delete`
# appear earlier in the full file; verify.
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from missing_data_imputation import Imputer

plt.rcParams.update({'figure.autolayout': True})

# load the raw training data; dtype=object keeps every cell as a byte string
# so the '?' missing-value marker survives untouched
x = np.genfromtxt('adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature (column 4) and column 14
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
missing_data_symbol = '?'
# predicate marking missing entries
miss_data_cond = lambda x: x == missing_data_symbol
# indices of the categorical columns after the deletion above
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5
# row/column coordinates of every originally-missing cell
miss_data_rows, miss_data_cols = np.where(miss_data_cond(x))

# remove rows with missing data, which are MNAR in the ADULT dataset
x = np.delete(x, miss_data_rows, axis=0)
# perturbation ratios to sweep: 10%, 20%, ..., 90%
ratios = np.arange(10, 100, 10)

# monotone False must be fixed
monotone = True


def perturbate_data(x,
Example #8
0
        pert_data, _ = perturbate_data(
            votes_train, votes_params['cat_cols'], ratio, monotone,
            votes_params['miss_data_symbol'], votes_params['mnar_values'])
    else:
        pert_data = votes_train
    path = os.path.join(perturb_folder,
                        '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname,
                                                                    monotone,
                                                                    ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(dataname,
                                                                imp_method,
                                                                monotone,
                                                                ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data,
                                            votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)