Esempio n. 1
0
def get_prepared_data(load=False, save=True):
    """Return the train/validation/test splits as separate (X, Y) frames.

    Parameters
    ----------
    load : bool
        If True, read previously saved CSVs instead of rebuilding the data.
    save : bool
        If True (and ``load`` is False), persist the prepared splits to disk.

    Returns
    -------
    tuple
        (train_X, train_Y, validation_X, validation_Y, test_X, test_Y)
    """
    if load:
        train_XY = pd.read_csv('train_XY.csv')
        validation_XY = pd.read_csv('validation_XY.csv')
        test_XY = pd.read_csv('test_XY.csv')
    else:
        data = get_data()
        data = data.iloc[:, :300]  # keep only the first 300 columns
        data = to_numerical_data(data)
        train_X, train_Y, validation_X, validation_Y, test_X, test_Y = split_data(
            data)
        train_XY = X_Y_2_XY(train_X, train_Y)
        validation_XY = X_Y_2_XY(validation_X, validation_Y)
        test_XY = X_Y_2_XY(test_X, test_Y)
        train_XY = impute_train_X(train_XY)
        # train_XY = clean_and_correct_train_X(train_XY)
        train_XY, validation_XY, test_XY = scale_all(train_XY, validation_XY,
                                                     test_XY)
        validation_XY, test_XY = impute_test_and_validation(
            train_XY, validation_XY, test_XY)
        if save:
            # BUG FIX: write the same '.csv' filenames the load branch reads
            # (the originals saved to extension-less names, so load=True could
            # never find them), and drop the index so a reload round-trips.
            train_XY.to_csv('train_XY.csv', index=False)
            validation_XY.to_csv('validation_XY.csv', index=False)
            test_XY.to_csv('test_XY.csv', index=False)
            print('\033[1m' + "DATA SAVED" + '\033[0m')
    train_X, train_Y = XY_2_X_Y(train_XY)
    validation_X, validation_Y = XY_2_X_Y(validation_XY)
    test_X, test_Y = XY_2_X_Y(test_XY)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
def build_coalition_from_similarity_matrix(similarity_matrix, X_to_split,
                                           Y_to_split, test_X, test_Y,
                                           num_parties=13):
    """Greedily build a coalition seeded from each party; return the best.

    For every seed party, the most similar parties are appended until the
    coalition is big enough; that coalition and each further single-party
    extension are scored, keeping the best coalition seen overall.

    Parameters
    ----------
    similarity_matrix
        Party-by-party similarity scores consumed by ``most_similar_party``.
    X_to_split, Y_to_split
        Data used to score candidate coalitions via ``f``.
    test_X, test_Y
        Data passed to ``big_enough_coalition`` for the size check.
    num_parties : int, optional
        Total number of parties (default 13, the original hard-coded value).

    Returns
    -------
    (list, float)
        Sorted best coalition and its score.
    """
    best_coalition_score = float('-inf')
    best_coalition = list(range(num_parties))
    for party in range(num_parties):
        coalition = [party]
        # Grow until the coalition passes the minimum-size threshold.
        while not big_enough_coalition(coalition, test_X, test_Y):
            coalition.append(most_similar_party(coalition, similarity_matrix))
        coalition_score_ = f(coalition, X_Y_2_XY(X_to_split, Y_to_split))
        if coalition_score_ > best_coalition_score:
            best_coalition_score = coalition_score_
            best_coalition = coalition.copy()
        # Also try every larger coalition, in case adding parties beyond the
        # minimal size improves the score.
        while len(coalition) < num_parties:
            coalition.append(most_similar_party(coalition, similarity_matrix))
            coalition_score_ = f(coalition, X_Y_2_XY(X_to_split, Y_to_split))
            if coalition_score_ > best_coalition_score:
                best_coalition_score = coalition_score_
                best_coalition = coalition.copy()

    best_coalition.sort()
    return best_coalition, best_coalition_score
Esempio n. 3
0
def get_prepared_data():
    """Load cached prepared splits if present; otherwise build and cache them.

    Returns (train_X, train_Y, validation_X, validation_Y, test_X, test_Y).
    """
    if isfile('train_XY.csv'):
        # Cached preprocessed data exists -- just read it back.
        tr = pd.read_csv('train_XY.csv')
        va = pd.read_csv('validation_XY.csv')
        te = pd.read_csv('test_XY.csv')
        print('\033[1m' + "DATA LOADED" + '\033[0m')
    else:
        print('\033[1m' + "PREPARING DATA..." + '\033[0m')
        raw = to_numerical_data(get_data())
        tr_X, tr_Y, va_X, va_Y, te_X, te_Y = split_data(raw)
        tr = X_Y_2_XY(tr_X, tr_Y)
        va = X_Y_2_XY(va_X, va_Y)
        te = X_Y_2_XY(te_X, te_Y)
        # Outlier handling is fitted on the training split only.
        outlier_cleaner = DistirbutionOutlinersCleaner()
        outlier_cleaner.fit(tr)
        tr = outlier_cleaner.clean_and_correct(tr, int(len(tr) / 20), 0)
        # Imputation is likewise fitted on (cleaned) training data, then
        # applied to all three splits.
        nan_imputer = DistirbutionImputator()
        nan_imputer.fit(tr)
        tr = nan_imputer.fill_nans(tr)
        va = nan_imputer.fill_nans(va)
        te = nan_imputer.fill_nans(te)
        va = outlier_cleaner.clean_and_correct(va, int(len(va) / 20), 0)
        te = outlier_cleaner.clean_and_correct(te, int(len(te) / 20), 0)
        tr, va, te = scale_all(tr, va, te)

        # Cache to disk so the next call hits the fast path above.
        tr.to_csv('train_XY.csv', index=False)
        va.to_csv('validation_XY.csv', index=False)
        te.to_csv('test_XY.csv', index=False)
        print('\033[1m' + "DATA SAVED" + '\033[0m')

    train_X, train_Y = XY_2_X_Y(tr)
    validation_X, validation_Y = XY_2_X_Y(va)
    test_X, test_Y = XY_2_X_Y(te)
    return train_X, train_Y, validation_X, validation_Y, test_X, test_Y
Esempio n. 4
0
def build_coalition_from_similarity_matrix(similarities):
    """Grow a coalition around each seed party; return the best-scoring one.

    For every party, the most similar remaining parties are added one by one
    until the coalition is large enough, then it is scored on the test data.
    """
    best_score = float('-inf')
    best = []
    for seed in range(13):
        members = [seed]
        similar_iter = most_similar_parties(seed, similarities)
        # Keep adding the next most similar party until big enough.
        while not big_enough_coalition(members):
            members.append(next(similar_iter))
        score = getCoalitionScore(members, X_Y_2_XY(test_X, test_Y))
        if score > best_score:
            best_score = score
            best = members
    return best, best_score
Esempio n. 5
0
def best_option_ever():
    """Exhaustively score every big-enough coalition and return the best.

    Iterates the powerset of all parties present in ``train_Y``; subsets
    below the minimum coalition size are skipped.
    """
    top_score = float('-inf')
    top_coalition = None
    for candidate in powerset(set(train_Y)):
        # Guard clause: ignore subsets that are too small to form a coalition.
        if not big_enough_coalition(candidate):
            continue
        candidate_score = coalition_score(candidate,
                                          X_Y_2_XY(X_to_split, Y_to_split))
        if candidate_score > top_score:
            top_score = candidate_score
            top_coalition = candidate
    return top_coalition, top_score
Esempio n. 6
0
def get_unlabeled_data(load=False):
    """Return (labeled training frame, unlabeled test features), preprocessed.

    When ``load`` is True, previously saved CSVs are read back; otherwise the
    raw election files are loaded, reduced to the selected feature columns,
    cleaned, imputed, scaled and cached to disk.
    """
    if load:
        test_features = pd.read_csv('real_test_X.csv')
        train_frame = pd.read_csv('real_train_XY.csv')
        print('\033[1m' + "DATA LOADED" + '\033[0m')
    else:
        print('\033[1m' + "PREPARING DATA..." + '\033[0m')
        train_frame = pd.read_csv('ElectionsData.csv')
        train_frame = train_frame.loc[:, [
            'Vote', 'Avg_environmental_importance',
            'Avg_government_satisfaction', 'Avg_education_importance',
            'Most_Important_Issue', 'Avg_monthly_expense_on_pets_or_plants',
            'Avg_Residancy_Altitude', 'Yearly_ExpensesK',
            'Weighted_education_rank', 'Number_of_valued_Kneset_members'
        ]]
        test_features = pd.read_csv('ElectionsData_Pred_Features.csv')
        test_features = test_features.loc[:, [
            'Avg_environmental_importance', 'Avg_government_satisfaction',
            'Avg_education_importance', 'Most_Important_Issue',
            'Avg_monthly_expense_on_pets_or_plants', 'Avg_Residancy_Altitude',
            'Yearly_ExpensesK', 'Weighted_education_rank',
            'Number_of_valued_Kneset_members'
        ]]

        train_frame = to_numerical_data(train_frame)
        test_features = to_numerical_data_test(test_features)
        # Outlier cleaning is fitted on the labeled training data only.
        outlier_cleaner = DistirbutionOutlinersCleaner()
        outlier_cleaner.fit(train_frame)
        train_frame = outlier_cleaner.clean_and_correct(
            train_frame, int(len(train_frame) / 20), 0)
        nan_imputer = DistirbutionImputator()
        nan_imputer.fit(train_frame)
        train_frame = nan_imputer.fill_nans(train_frame)
        test_features = nan_imputer.fill_nans(
            test_features, data_is_with_label_column=False)
        feature_scaler = Scaler()
        features, labels = XY_2_X_Y(train_frame)
        feature_scaler.fit(features)
        # NOTE(review): scale() return values are discarded, matching the
        # original code -- presumably it scales in place; verify in Scaler.
        feature_scaler.scale(features)
        feature_scaler.scale(test_features)
        train_frame = X_Y_2_XY(features, labels)
        train_frame.to_csv('real_train_XY.csv', index=False)
        test_features.to_csv('real_test_X.csv', index=False)
        print('\033[1m' + "DATA SAVED" + '\033[0m')

    return train_frame, test_features
def scorer(estimator, X, Y):
    """Score a 2-cluster estimator by the coalition quality of the bigger cluster.

    Each party is assigned to whichever of the two clusters contains the
    majority of its voters; the larger of the two resulting party groups is
    treated as the coalition and scored with ``coalition_score``.

    Parameters
    ----------
    estimator : fitted clusterer exposing ``predict`` with labels in {0, 1}.
    X : feature frame.
    Y : party label series aligned with ``X``.

    Returns
    -------
    The coalition score of the bigger cluster's parties.
    """
    clusters = estimator.predict(X)

    cluster_list = [[], []]

    for party in set(Y):
        party_clusters = clusters[Y == party]
        # BUG FIX: np.unique returns (values, counts); np.argmax(counts) is an
        # index into *values*, not a cluster label.  A party whose voters all
        # landed in cluster 1 produced values=[1], argmax=0, and was wrongly
        # appended to cluster_list[0].  Map through values to get the label.
        values, counts = np.unique(party_clusters, return_counts=True)
        cluster_list[values[np.argmax(counts)]].append(party)

    cluster_0_size = len(X[Y.isin(cluster_list[0])])
    cluster_1_size = len(X[Y.isin(cluster_list[1])])

    bigger_cluster = np.argmax([cluster_0_size, cluster_1_size])
    coalition = cluster_list[bigger_cluster]
    score = coalition_score(coalition, X_Y_2_XY(X, Y))

    return score
Esempio n. 8
0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from get_prepared_data import get_prepared_data
from data_handling import X_Y_2_XY


def plot_vote_to_features_colored(data: pd.DataFrame):
    """Save and show a pairplot of 'Vote' against every other column.

    Column 0 is paired with each remaining column in its own seaborn
    pairplot; each figure is written to '<title>.png' and displayed.
    """
    feature_names = data.columns.values
    for idx, feature in enumerate(feature_names):
        if idx == 0:
            continue  # column 0 is the 'Vote' label itself
        sns.pairplot(data.iloc[:, [0, idx]], hue='Vote')
        title = 'Vote to ' + str(feature)
        plt.title(title)
        plt.savefig(title + '.png')
        plt.show()


# Load the cached preprocessed splits and plot each feature against 'Vote'.
_splits = get_prepared_data(load=True)
train_X, train_Y, validation_X, validation_Y, test_X, test_Y = _splits
plot_vote_to_features_colored(X_Y_2_XY(train_X, train_Y))
Esempio n. 9
0
        coalition = [party]
        most_similar_party_left = most_similar_parties(party, similarities)
        while not big_enough_coalition(coalition):
            coalition.append(next(most_similar_party_left))
        coalition_score_ = getCoalitionScore(coalition,
                                             X_Y_2_XY(test_X, test_Y))
        if coalition_score_ > best_coalition_score:
            best_coalition_score = coalition_score_
            best_coalition = coalition
    return best_coalition, best_coalition_score


# Build fresh splits without overwriting any cached CSVs.
train_X, train_Y, validation_X, validation_Y, test_X, test_Y = get_prepared_data(
    save=False)

# Recombined test frame; the third positional argument's meaning is not
# visible here -- presumably a label-handling toggle; TODO confirm against
# X_Y_2_XY's signature.
data_test = X_Y_2_XY(test_X, test_Y, False)
# Disabled experiment: sweep KMeans cluster counts, derive a party-similarity
# matrix from the cluster assignments, and keep the best-scoring k.
# best_score = float('-inf')
# best_coalition = None
# best_k = None
# for k in range(2, 50, 2):
#     k_means = KMeans(n_clusters=k)
#     cluster_res = k_means.fit(train_X)
#     cluster_labels = k_means.predict(test_X)
#     parties_num = len(np.unique(XY['Vote']))
#     similarities = get_similarities(parties_num, get_similarity, cluster_labels, XY)
#     coalition, score = build_coalition_from_similarity_matrix(similarities)
#     if score > best_score:
#         best_score = score
#         best_coalition = coalition
#         best_k = k
Esempio n. 10
0
# Randomly restart 2-means 20 times and keep the initialization whose
# cross-validated coalition score is best.
best_initialization = None
best_score = float('-inf')
for _ in range(20):
    initialization = np.random.normal(0.5, 1, (2, len(train_X.columns)))
    kmean = KMeans(2, initialization)
    score = np.average(
        cross_val_score(kmean, X_to_split, Y_to_split, cv=3, scoring=scorer))
    if score > best_score:
        best_score = score
        best_initialization = initialization

# Refit with the winning initialization and cluster the held-out test set.
kmean = KMeans(2, best_initialization)
kmean.fit(X_to_split, Y_to_split)
clusters = kmean.predict(test_X)

# Assign each party to the cluster that holds the majority of its voters.
cluster_list = [[], []]
for party in set(test_Y):
    party_clusters = clusters[test_Y == party]
    # BUG FIX: np.unique returns (values, counts); np.argmax(counts) is an
    # index into *values*, not a cluster label.  A party voting only in
    # cluster 1 was previously mis-assigned to cluster 0.
    values, counts = np.unique(party_clusters, return_counts=True)
    cluster_list[values[np.argmax(counts)]].append(party)

cluster_0_size = len(test_X[test_Y.isin(cluster_list[0])])
cluster_1_size = len(test_X[test_Y.isin(cluster_list[1])])

# The cluster with more voters becomes the proposed coalition.
bigger_cluster = np.argmax([cluster_0_size, cluster_1_size])

coalition = cluster_list[bigger_cluster]

print(coalition)
print(coalition_score(coalition, X_Y_2_XY(test_X, test_Y)))