Example #1
0
def get_prepared_data(load=False, save=True):
    """Load or build the train/validation/test splits.

    Parameters
    ----------
    load : bool
        If True, read the previously saved ``*_XY.csv`` files instead of
        rebuilding the data from scratch.
    save : bool
        If True (and ``load`` is False), persist the prepared frames to CSV
        so a later ``load=True`` call can read them back.

    Returns
    -------
    tuple
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    if load:
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
    else:
        data = get_data()
        # keep only the first 300 feature columns
        data = data.iloc[:, :300]
        data = to_numerical_data(data)
        train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(
            data)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)
        # impute the training set first; validation/test imputation below
        # uses statistics derived from the (scaled) training data
        train_XY = impute_train_X(train_XY)
        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)
        validation_XY, test_XY = impute_test_and_validation(
            train_XY, validation_XY, test_XY)
        if save:
            # BUG FIX: write the same '.csv' file names the load branch
            # reads, and drop the index so a save/load round-trip does not
            # grow an 'Unnamed: 0' column.
            train_XY.to_csv('train_XY.csv', index=False)
            validation_XY.to_csv('validation_XY.csv', index=False)
            test_XY.to_csv('test_XY.csv', index=False)
            print('\033[1m' + "DATA SAVED" + '\033[0m')
    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
Example #2
0
def outliner_cleaner_test(data):
    """Grid-search the outlier cleaner's two knobs and report the best pair.

    Splits *data*, imputes the training portion, then for every combination
    of (examples to delete, cells to correct) cleans the training set,
    imputes the validation features with a SimpleImputer fitted on the
    cleaned training features, and scores the result with
    ``test_data_quality``.  Each score is printed; the best combination is
    printed last.
    """
    train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(data)

    # Rebuild a labeled training frame with 'Vote' as the first column.
    train_XY = train_X.copy()
    train_XY.insert(loc=0, column='Vote', value=train_Y)

    imp = DistirbutionImputator()
    imp.fit(train_XY)
    imputed_train_XY = imp.fill_nans(train_XY, data_is_with_label_column=True)

    cleaner = DistirbutionOutlinersCleaner()
    cleaner.fit(imputed_train_XY)

    simple_imputer = SimpleImputer()
    # best result so far: (accuracy, examples deleted, cells corrected)
    best = (0, 0, 0)
    for n_delete in range(0, 500, 25):
        for n_correct in range(0, 500, 50):
            cleaned_XY = cleaner.clean_and_correct(imputed_train_XY, n_delete, n_correct)
            cleaned_X = cleaned_XY.iloc[:, 1:]
            cleaned_Y = cleaned_XY.iloc[:, 0]
            # Validation features are imputed with statistics from the
            # cleaned training features only (no leakage from validation).
            simple_imputer.fit(cleaned_X)
            imputed_validation_X = simple_imputer.transform(pd.DataFrame(validation_X))
            accuracy = test_data_quality(cleaned_X, cleaned_Y, imputed_validation_X, validation_Y)
            print('\naccuracy: ', accuracy, ' for num_of_examples_to_delete=', n_delete, ' number_of_cells_to_correct=', n_correct)
            if accuracy > best[0]:
                best = (accuracy, n_delete, n_correct)
    print('best_num_of_examples_to_delete: ', best[1], ' best_number_of_cells_to_correct: ', best[2], ' best_acc: ', best[0])
Example #3
0
def get_prepared_data():
    """Return prepared splits, using cached CSVs when they exist.

    If ``train_XY.csv`` is present the three cached frames are read back;
    otherwise the full pipeline runs (numeric conversion, split, outlier
    cleaning, distribution-based imputation, scaling) and the results are
    written to CSV for the next call.

    Returns
    -------
    tuple
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    csv_names = ('train_XY.csv', 'validation_XY.csv', 'test_XY.csv')
    if isfile('train_XY.csv'):
        train_XY, validation_XY, test_XY = (pd.read_csv(n) for n in csv_names)
        print('\033[1m' + "DATA LOADED" + '\033[0m')
    else:
        print('\033[1m' + "PREPARING DATA..." + '\033[0m')
        frame = to_numerical_data(get_data())
        (train_X, train_Y, validation_X,
         validation_Y, test_X, test_Y) = split_data(frame)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)

        # Outlier cleaner is fitted on the raw training frame; ~5% of
        # training examples are deleted, no cells are corrected.
        cleaner = DistirbutionOutlinersCleaner()
        cleaner.fit(train_XY)
        train_XY = cleaner.clean_and_correct(train_XY, int(len(train_XY) / 20), 0)

        # Imputer is fitted on the cleaned training data and applied to
        # all three splits.
        imputer = DistirbutionImputator()
        imputer.fit(train_XY)
        train_XY = imputer.fill_nans(train_XY)
        validation_XY = imputer.fill_nans(validation_XY)
        test_XY = imputer.fill_nans(test_XY)

        # Validation and test are cleaned only after imputation.
        validation_XY = cleaner.clean_and_correct(
            validation_XY, int(len(validation_XY) / 20), 0)
        test_XY = cleaner.clean_and_correct(test_XY, int(len(test_XY) / 20), 0)

        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)

        for split, name in zip((train_XY, validation_XY, test_XY), csv_names):
            split.to_csv(name, index=False)
        print('\033[1m' + "DATA SAVED" + '\033[0m')

    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
Example #4
0
from collections import Counter
import numpy as np
from Moodel_Chooser import Model_Chooser
from get_prepared_data import get_unlabeled_data
import pandas as pd
import pickle as pk
import data_handling
from sklearn.metrics import accuracy_score
from party_num_to_name import parties_dict

# Build the working datasets at import time:
#  - `train`/`test` come from the cached unlabeled data (load=True),
#  - train/validation/test splits are derived from `train`,
#  - X_to_split / Y_to_split pool the train+validation rows for later use,
#  - all_data_X / all_data_Y split the full labeled `train` frame into
#    features and labels.
train, test = get_unlabeled_data(load=True)
train_X, train_Y, validation_X, validation_Y, test_X, test_Y = data_handling.split_data(train)
X_to_split = pd.concat([train_X, validation_X])
Y_to_split = pd.concat([train_Y, validation_Y])
all_data_X, all_data_Y = data_handling.XY_2_X_Y(train)


def get_party_hist(Y):
    hist = np.unique(Y, return_counts=True)
    res = np.zeros(13)
    for label, size in zip(hist[0], hist[1]):
        res[label] = size
    return res


def division_of_votes_score(Y_hat, Y):
    """Score how closely the predicted vote division matches the true one.

    Both label vectors are turned into per-party histograms via
    ``get_party_hist`` and compared with the L1 norm; the score is the
    reciprocal of that distance, so a larger score means a closer match.

    Parameters
    ----------
    Y_hat, Y : array-like of int
        Predicted and true party labels.

    Returns
    -------
    float
        ``1 / L1-distance`` between the histograms, or ``inf`` when the
        distributions match exactly.  The explicit guard avoids the
        divide-by-zero RuntimeWarning the bare ``1/norm`` emitted when the
        prediction's vote division was perfect.
    """
    distance = np.linalg.norm(get_party_hist(Y_hat) - get_party_hist(Y), ord=1)
    if distance == 0:
        return float('inf')
    return 1 / distance