def get_prepared_data(load=False, save=True):
    """Build (or reload) the prepared train/validation/test splits.

    Args:
        load: when True, read the already-prepared splits from
            'train_XY.csv', 'validation_XY.csv' and 'test_XY.csv'
            instead of rebuilding them from raw data.
        save: when True (and load is False), cache the freshly prepared
            splits to the same CSV files the load path reads.

    Returns:
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    if load:
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
    else:
        data = get_data()
        # Keep only the first 300 columns of the raw data.
        data = data.iloc[:, :300]
        data = to_numerical_data(data)
        train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(
            data)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)
        # Imputation is fit on the training split only; validation/test are
        # imputed afterwards from the train-fitted statistics.
        train_XY = impute_train_X(train_XY)
        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)
        validation_XY, test_XY = impute_test_and_validation(
            train_XY, validation_XY, test_XY)
        if save:
            # BUG FIX: write to the same '*.csv' filenames the load branch
            # reads (they previously lacked the extension, so a saved run
            # could never be reloaded), and drop the index so a save/load
            # round trip does not grow an extra 'Unnamed: 0' column.
            train_XY.to_csv('train_XY.csv', index=False)
            validation_XY.to_csv('validation_XY.csv', index=False)
            test_XY.to_csv('test_XY.csv', index=False)
            print('\033[1m' + "DATA SAVED" + '\033[0m')
    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
def outliner_cleaner_test(data):
    """Grid-search the outlier cleaner's two knobs — how many rows to delete
    and how many cells to correct — and print the combination that yields the
    best validation accuracy."""
    train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(data)

    # Re-attach the label as the leading 'Vote' column for the imputer.
    labeled_train = train_X.copy()
    labeled_train.insert(loc=0, column='Vote', value=train_Y)

    dist_imputer = DistirbutionImputator()
    dist_imputer.fit(labeled_train)
    filled_train = dist_imputer.fill_nans(labeled_train,
                                          data_is_with_label_column=True)

    cleaner = DistirbutionOutlinersCleaner()
    cleaner.fit(filled_train)
    simple_imputer = SimpleImputer()

    # Track the best (accuracy, rows_deleted, cells_corrected) seen so far.
    best = (0, 0, 0)
    for rows_to_delete in range(0, 500, 25):
        for cells_to_correct in range(0, 500, 50):
            cleaned = cleaner.clean_and_correct(filled_train,
                                                rows_to_delete,
                                                cells_to_correct)
            cleaned_X = cleaned.iloc[:, 1:]
            cleaned_Y = cleaned.iloc[:, 0]
            # The cleaner may reintroduce NaNs in validation; impute them
            # with statistics fit on the cleaned training features.
            simple_imputer.fit(cleaned_X)
            filled_validation_X = simple_imputer.transform(
                pd.DataFrame(validation_X))
            accuracy = test_data_quality(cleaned_X, cleaned_Y,
                                         filled_validation_X, validation_Y)
            print('\naccuracy: ', accuracy,
                  ' for num_of_examples_to_delete=', rows_to_delete,
                  ' number_of_cells_to_correct=', cells_to_correct)
            if accuracy > best[0]:
                best = (accuracy, rows_to_delete, cells_to_correct)

    print('best_num_of_examples_to_delete: ', best[1],
          ' best_number_of_cells_to_correct: ', best[2],
          ' best_acc: ', best[0])
def get_prepared_data():
    """Return prepared (X, Y) pairs for the train/validation/test splits.

    If cached CSVs from a previous run exist they are loaded directly;
    otherwise the raw data is split, outlier-cleaned, imputed, scaled and
    then cached to 'train_XY.csv' / 'validation_XY.csv' / 'test_XY.csv'.
    """
    if not isfile('train_XY.csv'):
        print('\033[1mPREPARING DATA...\033[0m')
        raw = to_numerical_data(get_data())
        train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(
            raw)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)

        # Fit the outlier cleaner on the training split and drop ~5% of it.
        cleaner = DistirbutionOutlinersCleaner()
        cleaner.fit(train_XY)
        train_XY = cleaner.clean_and_correct(train_XY,
                                             int(len(train_XY) / 20), 0)

        # Impute NaNs in every split from train-fitted distributions.
        imputer = DistirbutionImputator()
        imputer.fit(train_XY)
        train_XY = imputer.fill_nans(train_XY)
        validation_XY = imputer.fill_nans(validation_XY)
        test_XY = imputer.fill_nans(test_XY)

        # Clean ~5% of the held-out splits too, then scale all three.
        validation_XY = cleaner.clean_and_correct(
            validation_XY, int(len(validation_XY) / 20), 0)
        test_XY = cleaner.clean_and_correct(test_XY,
                                            int(len(test_XY) / 20), 0)
        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)

        # Cache so subsequent runs take the fast path above.
        train_XY.to_csv('train_XY.csv', index=False)
        validation_XY.to_csv('validation_XY.csv', index=False)
        test_XY.to_csv('test_XY.csv', index=False)
        print('\033[1mDATA SAVED\033[0m')
    else:
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
        print('\033[1mDATA LOADED\033[0m')

    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
from collections import Counter

import numpy as np
import pandas as pd
import pickle as pk
from sklearn.metrics import accuracy_score

import data_handling
from Moodel_Chooser import Model_Chooser
from get_prepared_data import get_unlabeled_data
from party_num_to_name import parties_dict

# Load the cached unlabeled data and rebuild the train/validation/test splits.
train, test = get_unlabeled_data(load=True)
train_X, train_Y, validation_X, validation_Y, test_X, test_Y = \
    data_handling.split_data(train)

# Pool train + validation for model selection that re-splits internally.
X_to_split = pd.concat([train_X, validation_X])
Y_to_split = pd.concat([train_Y, validation_Y])
all_data_X, all_data_Y = data_handling.XY_2_X_Y(train)


def get_party_hist(Y, num_parties=13):
    """Return a histogram of party labels as a fixed-length array.

    Args:
        Y: iterable of integer party labels in [0, num_parties).
        num_parties: length of the returned array (default 13, the number
            of parties in this data set; now parameterized instead of
            hard-coded so the helper generalizes to other label sets).

    Returns:
        np.ndarray of shape (num_parties,) where entry i counts label i.
    """
    labels, counts = np.unique(Y, return_counts=True)
    hist = np.zeros(num_parties)
    # Vectorized scatter replaces the original element-by-element loop.
    hist[labels] = counts
    return hist


def division_of_votes_score(Y_hat, Y):
    """Score how closely the predicted vote distribution matches the true one.

    Returns the inverse L1 distance between the two label histograms
    (higher is better). When the distributions match exactly the distance
    is 0; return np.inf explicitly instead of dividing by zero, which
    previously emitted a numpy RuntimeWarning for the same result.
    """
    distance = np.linalg.norm(get_party_hist(Y_hat) - get_party_hist(Y), ord=1)
    return np.inf if distance == 0 else 1 / distance