Code Example #1
File: preprocess_votes.py  Project: weiningZhang/MDI
                                    ratio, monotone,
                                    votes_params['miss_data_symbol'],
                                    votes_params['mnar_values'])
    else:
        pert_data = votes_train
    path = os.path.join(
        perturb_folder,
        '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone, ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
            dataname, imp_method, monotone, ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data, votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)

        # add labels as last column
        data_scaled_bin = np.hstack((data_scaled_bin, labels_train))
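
The call truncated at the top of this snippet perturbs the training matrix by injecting missing values into the categorical columns at the requested ratio. A minimal stand-alone sketch of that kind of perturbation follows; the helper name inject_missing and its exact behavior are illustrative assumptions, not the project's actual perturbation routine.

import numpy as np

def inject_missing(x, cat_cols, ratio, miss_data_symbol='?'):
    # Illustrative sketch: replace a random `ratio` fraction of the rows in
    # each categorical column with the missing-data symbol.
    pert = x.astype(object)          # astype returns a copy
    n_rows = pert.shape[0]
    n_missing = int(ratio * n_rows)
    for col in cat_cols:
        rows = np.random.choice(n_rows, n_missing, replace=False)
        pert[rows, col] = miss_data_symbol
    return pert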
Code Example #2
File: preprocess_data.py  Project: weiningZhang/MDI
        pert_data = x
    print "\tRatio is {} of {}".format(
        np.sum(pert_data == adult_params['miss_data_symbol']),
        len(pert_data) * len(adult_params['cat_cols']))

    path = os.path.join(
        perturb_folder,
        'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone, ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")

    for imp_method in adult_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, adult_params)

        path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(
            imp_method, monotone, ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # scale continuous variables and convert categorical to one-hot
        # store the scaler objects to be used on the test set
        scaler_path = os.path.join(scalers_folder,
                                   "{}_scaler".format(imp_method))

        if os.path.isfile(scaler_path):
            scaler_dict = pkl.load(open(scaler_path, "rb"))
        else:
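
The else branch above is cut off by the excerpt. Judging from the if branch and from Example #5, which loads the pickled scalers back for the test set, it presumably fits fresh scalers on the continuous columns and caches them at scaler_path. A hedged sketch of that step, assuming scikit-learn's StandardScaler and a dict keyed by scaler name (the key 'standard' is an assumption):

import pickle as pkl
from sklearn.preprocessing import StandardScaler

# Assumption: fit a scaler on the continuous (non-categorical) columns of
# the imputed data and cache it so the test-set pipeline can reuse it.
scaler = StandardScaler()
scaler.fit(data[:, adult_params['non_cat_cols']].astype(float))
scaler_dict = {'standard': scaler}
with open(scaler_path, 'wb') as f:
    pkl.dump(scaler_dict, f)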
Code Example #3
File: preprocess_votes.py  Project: rafaelvalle/MDI
            votes_train, votes_params['cat_cols'], ratio, monotone,
            votes_params['miss_data_symbol'], votes_params['mnar_values'])
    else:
        pert_data = votes_train
    path = os.path.join(perturb_folder,
                        '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname,
                                                                    monotone,
                                                                    ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")
    # impute data given imp_methods in params.py
    for imp_method in votes_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, votes_params)
        path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(dataname,
                                                                imp_method,
                                                                monotone,
                                                                ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # binarize data
        data_scaled_bin = imp.binarize_data(data,
                                            votes_params['cat_cols'],
                                            votes_params['miss_data_symbol'])
        # convert to float
        data_scaled_bin = data_scaled_bin.astype(float)
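
Examples #1 and #3 are essentially the same snippet taken from two forks of MDI. For a concrete picture of what an impute() step does to categorical data, here is a self-contained mode-imputation sketch; it illustrates the general technique only and is not the project's Imputer class.

import numpy as np
from collections import Counter

def mode_impute(x, cat_cols, miss_data_symbol='?'):
    # Replace the missing-data symbol in each categorical column with that
    # column's most frequent observed value.
    filled = x.astype(object)        # astype returns a copy
    for col in cat_cols:
        observed = filled[filled[:, col] != miss_data_symbol, col]
        mode_val = Counter(observed.tolist()).most_common(1)[0][0]
        filled[filled[:, col] == miss_data_symbol, col] = mode_val
    return filled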
Code Example #4
File: preprocess_data.py  Project: rafaelvalle/MDI
        pert_data = x
    print "\tRatio is {} of {}".format(
            np.sum(pert_data == adult_params['miss_data_symbol']), 
            len(pert_data) * len(adult_params['cat_cols']))

    path = os.path.join(perturb_folder,
                        'adult_train_pert_mono_{}_ratio_{}.csv'.format(monotone,
                                                                       ratio))
    # save perturbed data to disk as csv
    print '\tSaving perturbed data to {}'.format(path)
    np.savetxt(path, pert_data, delimiter=",", fmt="%s")

    for imp_method in adult_params['imp_methods']:
        print '\tImputing with {}'.format(imp_method)
        imp = Imputer()
        data = impute(pert_data, imp, imp_method, adult_params)

        path = "data/imputed/{}_mono_{}_ratio_{}.csv".format(imp_method,
                                                             monotone,
                                                             ratio)
        # save data as csv
        print '\tSaving imputed data to {}'.format(path)
        np.savetxt(path, data, delimiter=",", fmt="%s")

        # scale continuous variables and convert categorical to one-hot
        # store the scaler objects to be used on the test set
        scaler_path = os.path.join(scalers_folder,
                                   "{}_scaler".format(imp_method))

        if os.path.isfile(scaler_path):
            scaler_dict = pkl.load(open(scaler_path, "rb"))
Code Example #5
# binarize labels
labels = (np.array(x[:, -1]) == '>50K').astype(int)
labels = labels.reshape((-1, 1))

# save labels in binary and one-hot representations
labels.dump(os.path.join(labels_test_folder, 'labels_bin_test.np'))

# remove redundant education-number and labels features
x = delete(x, (4, 14), 1)

# instantiate Imputer
imp = Imputer()
for imp_method in adult_params['imp_methods']:
    print 'Imputing with {}'.format(imp_method)
    data = impute(x, imp, imp_method, adult_params)

    # load respective scaler
    scaler_path = os.path.join(scalers_folder,
                               "{}_scaler".format(imp_method))

    scaler_dict = pickle.load(open(scaler_path, "rb"))
    for name, scaler in scaler_dict.items():
        print 'Scaling with {}'.format(name)
        # scale and binarize, adding one col for missing value in all categ vars
        data_scaled = np.copy(data)
        data_scaled[:, adult_params['non_cat_cols']] = scaler.transform(
            data[:, adult_params['non_cat_cols']].astype(float))
        data_scaled_bin = imp.binarize_data(data_scaled,
                                            adult_params['cat_cols'],
                                            adult_params['miss_data_symbol'])
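
binarize_data in these snippets one-hot encodes the categorical columns and keeps a slot for the missing-data symbol, so rows that are still missing after imputation remain representable. A minimal stand-alone sketch of that encoding (an illustration, not the library's implementation):

import numpy as np

def one_hot_with_missing(x, cat_cols, miss_data_symbol='?'):
    # One-hot encode each categorical column, treating the missing-data
    # symbol as its own category; other columns pass through as floats.
    blocks = []
    for col in range(x.shape[1]):
        column = x[:, col]
        if col in cat_cols:
            categories = sorted(set(column.tolist()))
            if miss_data_symbol not in categories:
                categories.append(miss_data_symbol)
            block = np.array([[float(v == c) for c in categories]
                              for v in column])
        else:
            block = column.reshape(-1, 1).astype(float)
        blocks.append(block)
    return np.hstack(blocks)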