def get_prepared_data(load=False, save=True):
    """Load or build the train/validation/test splits.

    Parameters
    ----------
    load : bool
        If True, read the previously saved ``*_XY.csv`` caches instead of
        rebuilding the whole preparation pipeline.
    save : bool
        If True (and ``load`` is False), persist the prepared frames to CSV.

    Returns
    -------
    tuple
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    if load:
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
    else:
        data = get_data()
        data = data.iloc[:, :300]  # keep only the first 300 columns
        data = to_numerical_data(data)
        train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(
            data)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)
        # Imputation/scaling are fitted on the training split only.
        train_XY = impute_train_X(train_XY)
        # train_XY = clean_and_correct_train_X(train_XY)
        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)
        validation_XY, test_XY = impute_test_and_validation(
            train_XY, validation_XY, test_XY)
        if save:
            # BUG FIX: the load branch reads '<name>.csv', but this used to
            # write extension-less files ('train_XY') and include the index
            # column, so a save/load round-trip could never work.  Write the
            # same '.csv' names with index=False.
            train_XY.to_csv('train_XY.csv', index=False)
            validation_XY.to_csv('validation_XY.csv', index=False)
            test_XY.to_csv('test_XY.csv', index=False)
            print('\033[1m' + "DATA SAVED" + '\033[0m')
    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
def build_coalition_from_similarity_matrix(similarity_matrix, X_to_split,
                                           Y_to_split, test_X, test_Y):
    """Search for the best-scoring coalition seeded from every party.

    For each of the 13 parties, grow the seed greedily (always adding the
    most similar remaining party) until the coalition passes the size test,
    then keep scoring it after every further addition up to all 13 parties.

    Returns
    -------
    tuple
        (sorted list of party indices, score of that coalition)
    """
    best_score = float('-inf')
    best_members = list(range(13))
    for seed in range(13):
        members = [seed]
        # Grow silently until the coalition is big enough.
        while not big_enough_coalition(members, test_X, test_Y):
            members.append(most_similar_party(members, similarity_matrix))
        # Score the first big-enough coalition and every extension of it.
        while True:
            score = f(members, X_Y_2_XY(X_to_split, Y_to_split))
            if score > best_score:
                best_score = score
                best_members = members.copy()
            if len(members) >= 13:
                break
            members.append(most_similar_party(members, similarity_matrix))
    best_members.sort()
    return best_members, best_score
def get_prepared_data():
    """Return prepared (X, Y) splits, loading cached CSVs when available.

    If 'train_XY.csv' exists on disk, all three splits are read from their
    CSV caches.  Otherwise the full pipeline runs (numerical encoding,
    split, outlier correction, imputation, scaling) and the results are
    cached to CSV for the next call.

    Returns
    -------
    tuple
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    if isfile('train_XY.csv'):
        # Cached copies exist -- reuse them.
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
        print('\033[1m' + "DATA LOADED" + '\033[0m')
    else:
        print('\033[1m' + "PREPARING DATA..." + '\033[0m')
        raw = to_numerical_data(get_data())
        (train_X, train_Y, validation_X, validation_Y,
         test_X, test_Y) = split_data(raw)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)
        # Outlier handling is fitted on the training split only.
        outlier_cleaner = DistirbutionOutlinersCleaner()
        outlier_cleaner.fit(train_XY)
        train_XY = outlier_cleaner.clean_and_correct(
            train_XY, int(len(train_XY) / 20), 0)
        # NaN imputation is fitted on the cleaned training split only.
        nan_imputer = DistirbutionImputator()
        nan_imputer.fit(train_XY)
        train_XY = nan_imputer.fill_nans(train_XY)
        validation_XY = nan_imputer.fill_nans(validation_XY)
        test_XY = nan_imputer.fill_nans(test_XY)
        validation_XY = outlier_cleaner.clean_and_correct(
            validation_XY, int(len(validation_XY) / 20), 0)
        test_XY = outlier_cleaner.clean_and_correct(
            test_XY, int(len(test_XY) / 20), 0)
        train_XY, validation_XY, test_XY = scale_all(
            train_XY, validation_XY, test_XY)
        # Cache the prepared frames for future runs.
        train_XY.to_csv('train_XY.csv', index=False)
        validation_XY.to_csv('validation_XY.csv', index=False)
        test_XY.to_csv('test_XY.csv', index=False)
        print('\033[1m' + "DATA SAVED" + '\033[0m')
    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
def build_coalition_from_similarity_matrix(similarities):
    """Seed a coalition from each party, extend it with that party's most
    similar parties until it is big enough, and return the best one found.

    NOTE(review): scores against module-level ``test_X``/``test_Y`` globals
    -- confirm those are in scope at the call site.

    Returns
    -------
    tuple
        (best coalition as a list of parties, its score)
    """
    best_score = float('-inf')
    best = []
    for seed in range(13):
        candidate = [seed]
        similar_iter = most_similar_parties(seed, similarities)
        # Pull the next-most-similar party until the size test passes.
        while not big_enough_coalition(candidate):
            candidate.append(next(similar_iter))
        score = getCoalitionScore(candidate, X_Y_2_XY(test_X, test_Y))
        if score > best_score:
            best_score = score
            best = candidate
    return best, best_score
def best_option_ever():
    """Exhaustively score every big-enough subset of parties and return the
    best one (brute force over the power set of the training labels).

    NOTE(review): relies on module-level ``train_Y``, ``X_to_split`` and
    ``Y_to_split`` globals -- confirm they exist at the call site.

    Returns
    -------
    tuple
        (best coalition, its score); coalition is None if nothing qualifies.
    """
    best = None
    best_score = float('-inf')
    for candidate in powerset(set(train_Y)):
        # Skip subsets that fail the minimum-size requirement.
        if not big_enough_coalition(candidate):
            continue
        score = coalition_score(candidate, X_Y_2_XY(X_to_split, Y_to_split))
        if score > best_score:
            best_score = score
            best = candidate
    return best, best_score
def get_unlabeled_data(load=False):
    """Prepare the real (unlabeled) election data for prediction.

    Parameters
    ----------
    load : bool
        If True, read the previously prepared CSVs from disk; otherwise run
        the full preparation pipeline and cache the results.

    Returns
    -------
    tuple
        (real_train_XY, real_test_X)
    """
    if load:
        real_test_X = pd.read_csv('real_test_X.csv')
        real_train_XY = pd.read_csv('real_train_XY.csv')
        print('\033[1m' + "DATA LOADED" + '\033[0m')
    else:
        print('\033[1m' + "PREPARING DATA..." + '\033[0m')
        # Selected feature subset; the training file additionally has 'Vote'.
        features = [
            'Avg_environmental_importance', 'Avg_government_satisfaction',
            'Avg_education_importance', 'Most_Important_Issue',
            'Avg_monthly_expense_on_pets_or_plants', 'Avg_Residancy_Altitude',
            'Yearly_ExpensesK', 'Weighted_education_rank',
            'Number_of_valued_Kneset_members'
        ]
        real_train_XY = pd.read_csv('ElectionsData.csv')
        real_train_XY = real_train_XY.loc[:, ['Vote'] + features]
        real_test_X = pd.read_csv('ElectionsData_Pred_Features.csv')
        real_test_X = real_test_X.loc[:, features]
        real_train_XY = to_numerical_data(real_train_XY)
        real_test_X = to_numerical_data_test(real_test_X)
        # Outlier correction fitted on the labeled training data only.
        cleaner = DistirbutionOutlinersCleaner()
        cleaner.fit(real_train_XY)
        real_train_XY = cleaner.clean_and_correct(
            real_train_XY, int(len(real_train_XY) / 20), 0)
        # Imputation fitted on the cleaned training data only.
        imputer = DistirbutionImputator()
        imputer.fit(real_train_XY)
        real_train_XY = imputer.fill_nans(real_train_XY)
        real_test_X = imputer.fill_nans(real_test_X,
                                        data_is_with_label_column=False)
        # Scale both sets with statistics from the training features.
        scaler = Scaler()
        real_train_X, real_train_Y = XY_2_X_Y(real_train_XY)
        scaler.fit(real_train_X)
        scaler.scale(real_train_X)  # presumably scales in place (return unused) -- TODO confirm
        scaler.scale(real_test_X)
        real_train_XY = X_Y_2_XY(real_train_X, real_train_Y)
        real_train_XY.to_csv('real_train_XY.csv', index=False)
        real_test_X.to_csv('real_test_X.csv', index=False)
        print('\033[1m' + "DATA SAVED" + '\033[0m')
    return real_train_XY, real_test_X
def scorer(estimator, X, Y):
    """Score a fitted 2-cluster estimator by the coalition it induces.

    Each party is assigned to the cluster holding most of its voters; the
    parties of the larger resulting cluster form the coalition, which is
    scored with ``coalition_score``.

    Parameters
    ----------
    estimator : object with a ``predict`` method
        Fitted 2-cluster model.
    X : pd.DataFrame
        Samples to cluster.
    Y : pd.Series
        Party labels aligned with ``X``.

    Returns
    -------
    The coalition score of the bigger cluster's parties.
    """
    clusters = estimator.predict(X)
    cluster_list = [[], []]
    for party in set(Y):
        party_clusters = clusters[Y == party]
        values, counts = np.unique(party_clusters, return_counts=True)
        # BUG FIX: np.argmax(counts) is an index into `values`, not a cluster
        # id.  A party sitting entirely in cluster 1 yields values == [1] and
        # argmax == 0, so it used to be assigned to cluster 0.  Map the argmax
        # back through `values` to get the true majority cluster id.
        cluster_list[values[np.argmax(counts)]].append(party)
    cluster_0_size = len(X[Y.isin(cluster_list[0])])
    cluster_1_size = len(X[Y.isin(cluster_list[1])])
    bigger_cluster = np.argmax([cluster_0_size, cluster_1_size])
    coalition = cluster_list[bigger_cluster]
    score = coalition_score(coalition, X_Y_2_XY(X, Y))
    return score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from get_prepared_data import get_prepared_data
from data_handling import X_Y_2_XY


def plot_vote_to_features_colored(data: pd.DataFrame):
    """Save and show a Vote-colored pairplot for every non-Vote feature.

    Column 0 is taken as 'Vote'; each remaining column is paired with it,
    saved as '<title>.png' and displayed.
    """
    feature_names = data.columns.values
    for idx, feature in enumerate(feature_names[1:], start=1):
        sns.pairplot(data.iloc[:, [0, idx]], hue='Vote')
        title = 'Vote to ' + str(feature)
        plt.title(title)
        plt.savefig(title + '.png')
        plt.show()


train_X, train_Y, validation_X, validation_Y, test_X, test_Y = \
    get_prepared_data(load=True)
plot_vote_to_features_colored(X_Y_2_XY(train_X, train_Y))
# NOTE(review): orphaned fragment -- it opens mid-loop and contains a `return`
# with no visible enclosing `def`.  It appears to duplicate the body of
# build_coalition_from_similarity_matrix(similarities); the indentation below
# is reconstructed on that assumption -- TODO confirm against the original file.
        coalition = [party]
        most_similar_party_left = most_similar_parties(party, similarities)
        # Greedily add the next most-similar party until the coalition is
        # big enough, then score it once.
        while not big_enough_coalition(coalition):
            coalition.append(next(most_similar_party_left))
        coalition_score_ = getCoalitionScore(coalition, X_Y_2_XY(test_X, test_Y))
        if coalition_score_ > best_coalition_score:
            best_coalition_score = coalition_score_
            best_coalition = coalition
    return best_coalition, best_coalition_score


# Script section: prepare the splits (without re-saving them) and build the
# test-set frame used below.
train_X, train_Y, validation_X, validation_Y, test_X, test_Y = get_prepared_data(
    save=False)
data_test = X_Y_2_XY(test_X, test_Y, False)
# Commented-out experiment: sweep KMeans cluster counts, derive party
# similarities from the clustering, and keep the best-scoring coalition.
# best_score = float('-inf')
# best_coalition = None
# best_k = None
# for k in range(2, 50, 2):
#     k_means = KMeans(n_clusters=k)
#     cluster_res = k_means.fit(train_X)
#     cluster_labels = k_means.predict(test_X)
#     parties_num = len(np.unique(XY['Vote']))
#     similarities = get_similarities(parties_num, get_similarity, cluster_labels, XY)
#     coalition, score = build_coalition_from_similarity_matrix(similarities)
#     if score > best_score:
#         best_score = score
#         best_coalition = coalition
#         best_k = k
# Random-restart search for a good 2-means initialization, evaluated with the
# coalition `scorer` via 3-fold cross-validation.
best_initialization = None
best_score = float('-inf')
for _ in range(20):
    # Random candidate centers: one row per cluster, one column per feature.
    initialization = np.random.normal(0.5, 1, (2, len(train_X.columns)))
    kmean = KMeans(2, initialization)
    score = np.average(
        cross_val_score(kmean, X_to_split, Y_to_split, cv=3, scoring=scorer))
    if score > best_score:
        best_score = score
        best_initialization = initialization

# Refit with the winning initialization and read the coalition off the
# test-set clustering.
kmean = KMeans(2, best_initialization)
kmean.fit(X_to_split, Y_to_split)
clusters = kmean.predict(test_X)
cluster_list = [[], []]
for party in set(test_Y):
    party_clusters = clusters[test_Y == party]
    values, counts = np.unique(party_clusters, return_counts=True)
    # BUG FIX: np.argmax(counts) is an index into `values`, not a cluster id;
    # a party living entirely in cluster 1 yields values == [1], argmax == 0,
    # and used to be assigned to cluster 0.  Index through `values` to get the
    # true majority cluster id.
    cluster_list[values[np.argmax(counts)]].append(party)
cluster_0_size = len(test_X[test_Y.isin(cluster_list[0])])
cluster_1_size = len(test_X[test_Y.isin(cluster_list[1])])
bigger_cluster = np.argmax([cluster_0_size, cluster_1_size])
coalition = cluster_list[bigger_cluster]
print(coalition)
print(coalition_score(coalition, X_Y_2_XY(test_X, test_Y)))