Esempio n. 1
0
def manipulate_winning_party(manipulation, f):
    """Predict votes after altering one column of the held-out data.

    A gradient boosting classifier (``max_depth=7``, ``max_features=10``) is
    fitted on the prepared training set. The validation and test sets are
    then concatenated, column ``f`` of the combined frame is rewritten with
    ``Series.map(manipulation)``, and the classifier predicts on the result.

    Args:
        manipulation: Mapping argument forwarded to ``Series.map`` for
            column ``f``.
        f: Name of the column to rewrite.

    Returns:
        A ``(frame, predictions)`` tuple holding the altered combined frame
        and the classifier's predictions for it.
    """
    train, validate, test = load_prepared_data()

    # Whatever split_label returns is unpacked positionally into fit().
    training_args = split_label(train)
    classifier = GradientBoostingClassifier(max_depth=7, max_features=10)
    classifier.fit(*training_args)

    held_out = pd.concat([validate, test])
    held_out[f] = held_out[f].map(manipulation)
    held_out_x, _ = split_label(held_out)

    predictions = classifier.predict(held_out_x)
    return held_out, predictions
Esempio n. 2
0
def test_results():
    """Run ``test_data_preparation`` on the basic and the prepared data.

    The ``*_original.csv`` files under ``FILES_DIR`` are loaded, passed
    through ``most_basic_preparation`` and evaluated under the tag
    ``'Basic'``. Then ``train.csv`` and ``test.csv`` are loaded and evaluated
    as they are under the tag ``'Advanced'``.
    """

    def load(file_name):
        # Every input file lives under FILES_DIR and has a header row.
        return pd.read_csv(FILES_DIR + file_name, header=0)

    # --- Basic: original files with only the minimal preparation ---
    raw_train = load('train_original.csv')
    raw_validate = load('validate_original.csv')
    raw_test = load('test_original.csv')
    basic_train, _, basic_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    basic_train_x, basic_train_y = split_label(basic_train)
    basic_test_x, basic_test_y = split_label(basic_test)
    test_data_preparation(basic_train_x, basic_train_y, basic_test_x,
                          basic_test_y, 'Basic')

    # --- Advanced: files that are used as loaded ---
    prepared_train = load('train.csv')
    prepared_test = load('test.csv')
    prepared_train_x, prepared_train_y = split_label(prepared_train)
    prepared_test_x, prepared_test_y = split_label(prepared_test)
    test_data_preparation(prepared_train_x, prepared_train_y, prepared_test_x,
                          prepared_test_y, 'Advanced')
Esempio n. 3
0
def test_results():
    """Run ``test_data_preparation`` on the basic and the prepared data.

    The ``*_original.csv`` files are read with ``read_data``, passed through
    ``most_basic_preparation`` and evaluated under the tag ``'Basic'``. Then
    ``train.csv`` and ``test.csv`` are read and evaluated as they are under
    the tag ``'Advanced'``.
    """
    # --- Basic: original files with only the minimal preparation ---
    raw_train = read_data('train_original.csv')
    raw_validate = read_data('validate_original.csv')
    raw_test = read_data('test_original.csv')
    basic_train, _, basic_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    basic_train_x, basic_train_y = split_label(basic_train)
    basic_test_x, basic_test_y = split_label(basic_test)
    test_data_preparation(basic_train_x, basic_train_y, basic_test_x,
                          basic_test_y, 'Basic')

    # --- Advanced: files that are used as loaded ---
    prepared_train = read_data('train.csv')
    prepared_test = read_data('test.csv')
    prepared_train_x, prepared_train_y = split_label(prepared_train)
    prepared_test_x, prepared_test_y = split_label(prepared_test)
    test_data_preparation(prepared_train_x, prepared_train_y, prepared_test_x,
                          prepared_test_y, 'Advanced')
Esempio n. 4
0
def get_best_model(validate, models, names):
    """Rank fitted models by weighted F1 on ``validate`` and pick the best.

    Each model predicts on the validation features and is scored with
    ``f1_score(..., average='weighted')``. A ranking table and the winner
    are printed to stdout.

    Args:
        validate: Validation frame, split by ``split_label``.
        models: Fitted models exposing ``predict``.
        names: Display names, paired with ``models`` by position.

    Returns:
        A ``(model, name)`` tuple for the highest-scoring model. On ties the
        model that appears first in ``models`` wins (the sort is stable).

    Raises:
        ValueError: If no models are supplied.
    """
    validate_x, validate_y = split_label(validate)

    evaluated_models = sorted(
        ((model, name,
          f1_score(validate_y, model.predict(validate_x), average='weighted'))
         for model, name in zip(models, names)),
        key=lambda entry: entry[2],
        reverse=True)

    if not evaluated_models:
        # Fail up front with a clear message instead of an obscure error
        # while building the table or indexing the empty ranking.
        raise ValueError('get_best_model requires at least one model')

    print('=' * 100)
    print('Models Evaluated F1 Score:')

    # Build the table straight from rows. Going through np.matrix (which
    # NumPy no longer recommends) turned every score into a string.
    print(
        pd.DataFrame([(name, f1) for _, name, f1 in evaluated_models],
                     columns=['Model Name', 'F1 Score']))

    print()
    best_model, best_name, best_f1 = evaluated_models[0]
    print('Best Model Is:')
    print(best_name, best_f1)
    print('=' * 100)

    return best_model, best_name
Esempio n. 5
0
def optimize_models_parameters(train, rerun_experiments=False):
    """Obtain the tuned classifiers, by re-running or loading experiments.

    Args:
        train: Training frame; it is always passed through ``split_label``,
            even when the experiments are only loaded.
        rerun_experiments: When true, call ``run_experiments`` on the split
            training data; otherwise call ``load_experiments``.

    Returns:
        A ``(models, names)`` tuple, where ``names`` is the fixed list of
        experiment names handed to either helper.
    """
    names = ['SVC', 'KNN', 'RANDOM_FOREST', 'GBC', 'MLP']
    train_x, train_y = split_label(train)

    if rerun_experiments:
        models = run_experiments(train_x, train_y, names)
    else:
        models = load_experiments(names)

    return models, names
Esempio n. 6
0
def manipulate_and_plot_distribution(manipulation, f):
    """Plot the predicted vote distribution after altering column ``f``.

    Predictions come from ``manipulate_winning_party``. The predicted codes
    are translated to the category names of the ``'Vote'`` column, turned
    into percentages per party, and drawn as an annotated bar chart that is
    saved to ``vote_distribution.png``.

    Args:
        manipulation: Forwarded to ``manipulate_winning_party``.
        f: Forwarded to ``manipulate_winning_party``.
    """
    test, pred_y = manipulate_winning_party(manipulation, f)
    test_x, _ = split_label(test)

    # Position of each category in 'Vote' -> category name.
    party_names = test['Vote'].astype('category').cat.categories
    code_to_name = {code: party for code, party in enumerate(party_names)}

    results = pd.DataFrame(pred_y, test_x.index.values, columns=['Vote'])
    results['Vote'] = results['Vote'].map(code_to_name).astype('category')

    counts = results['Vote'].value_counts()
    vote_distribution = counts.divide(sum(counts.values)).multiply(100)

    plt.figure(figsize=(10, 10))
    # Each bar's colour is the party name minus its last character
    # (presumably plural colour names such as 'Blues' -- verify on the data).
    bar_colors = [party[:-1] for party in counts.index.values]
    bar_plot = vote_distribution.plot.bar(color=bar_colors,
                                          edgecolor='black',
                                          width=0.8)

    # Write each percentage just above its bar.
    for bar in bar_plot.patches:
        height = bar.get_height()
        bar_plot.annotate("{:.1f}".format(height),
                          (bar.get_x() + 0.2, height + 0.2))

    bar_plot.set_xlabel('Party')
    bar_plot.set_ylabel('Vote %')

    plt.savefig('vote_distribution.png')
Esempio n. 7
0
def test_combinations():
    """Score label combinations that cover a majority of predicted votes.

    Candidate labels are the unique training labels except 3, 4 and 7.
    For every combination of 1 to 10 candidate labels, the samples of
    ``results.csv`` are marked 1 when their predicted-vote category code is
    in the combination and 0 otherwise. Combinations covering less than 51%
    of the samples are skipped. The rest are scored with
    ``calinski_harabaz_score`` against the features in ``test_new.csv``.

    Progress (the combination size, then each kept combination and its
    score) is printed to stdout.

    Returns:
        A list of ``(combination, score)`` tuples in evaluation order.
    """
    train, _, _, _ = load_prepared_data()
    _, train_y = split_label(train)
    excluded_labels = {3, 4, 7}
    labels = sorted(label for label in train_y.unique()
                    if label not in excluded_labels)

    test_new_x = read_data('test_new.csv', index=ID_COLUMN)
    test_new_y = read_data(
        'results.csv',
        index=ID_COLUMN)['PredictVote'].astype('category').cat.codes

    result = []

    for r in range(1, min(len(labels) + 1, 11)):
        print(r)
        for c in itertools.combinations(labels, r):
            in_combination = test_new_y.isin(c)

            # Computing the share from the mask yields 0 when nothing
            # matches. The previous dict lookup raised KeyError in that case.
            covered_percent = (in_combination.sum() / len(in_combination) *
                               100.0)
            if covered_percent < 51:
                continue

            y = in_combination.astype(int)
            score = calinski_harabaz_score(test_new_x, y)
            print(c, score)
            result.append((c, score))

    return result
Esempio n. 8
0
def most_basic_preparation(train, validate, test):
    """Drop object-typed feature columns and incomplete rows from each set.

    The columns to drop are the ``object``-dtype columns found in the
    features of ``train`` (as returned by ``split_label``). The same columns
    are removed from all three frames, after which rows containing missing
    values are dropped.

    Args:
        train: Training frame; also decides which columns are removed.
        validate: Validation frame.
        test: Test frame.

    Returns:
        A ``(train, validate, test)`` tuple of the reduced frames.
    """
    train_features, _ = split_label(train)
    object_features = train_features.select_dtypes(
        include='object').columns.values

    def reduce(frame):
        # Same column set for every frame, then discard rows with NaNs.
        return frame.drop(object_features, axis=1).dropna()

    return reduce(train), reduce(validate), reduce(test)
Esempio n. 9
0
def optimize_models_parameters(train, rerun_experiments=False):
    """Obtain the tuned KMeans experiments, by re-running or loading them.

    Args:
        train: Training frame; it is always passed through ``split_label``,
            even when the experiments are only loaded.
        rerun_experiments: When true, call ``run_experiments`` on the split
            training data; otherwise call ``load_experiments``.

    Returns:
        A ``(models, names)`` tuple, where ``names`` is the fixed list of
        experiment names handed to either helper.
    """
    names = [
        'KMeans_completeness',
        'KMeans_homogeneity',
        'KMeans_v_measure',
        'KMeans_calinski_harabaz',
        'KMeans_silhouette',
        'KMeans_adjusted_rand',
    ]
    train_x, train_y = split_label(train)

    if rerun_experiments:
        models = run_experiments(train_x, train_y, names)
    else:
        models = load_experiments(names)

    return models, names
Esempio n. 10
0
def load_optimized_models(train):
    """Build the classifiers with fixed hyper-parameters and fit them.

    Args:
        train: Training frame, split by ``split_label`` and used to fit
            every model.

    Returns:
        A ``(models, names)`` tuple: the list of values returned by each
        model's ``fit`` call and the matching list of display names, both in
        the order SVC, KNN, RANDOM_FOREST, GBC, MLP.
    """
    train_x, train_y = split_label(train)

    # All estimators are constructed first, then fitted one after another.
    named_models = [
        ('SVC', SVC(kernel='rbf', C=100000, gamma=0.01)),
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ('RANDOM_FOREST', RandomForestClassifier(max_depth=9,
                                                 max_features=14)),
        ('GBC', GradientBoostingClassifier(max_depth=7, max_features=10)),
        ('MLP', MLPClassifier(alpha=1.5e-4, hidden_layer_sizes=(500, 500))),
    ]

    fitted_models = []
    names = []
    for name, model in named_models:
        fitted_models.append(model.fit(train_x, train_y))
        names.append(name)

    return fitted_models, names
Esempio n. 11
0
def run_k_means_all_data():
    """Cluster all prepared data with KMeans for several k and print stats.

    Train, validation and test sets are concatenated and split with
    ``split_label``. For each k in 6, 9, 10, 11, 12 a ``KMeans`` model is
    fitted on the features. For every entry produced by
    ``get_clusters_labels``, the matching values from
    ``get_clusters_sizes_percent`` and ``get_clusters_distribution`` are
    printed alongside it.
    """
    train, validate, test = load_prepared_data()
    features, labels = split_label(pd.concat([train, validate, test]))

    separator = '========='
    for n_clusters in (6, 9, 10, 11, 12):
        print(n_clusters, separator)

        model = KMeans(n_clusters=n_clusters).fit(features)
        cluster_labels = get_clusters_labels(model, labels)
        cluster_sizes = get_clusters_sizes_percent(model)
        cluster_distribution = get_clusters_distribution(model, labels)

        for cluster, cluster_label in cluster_labels.items():
            print('{:>2} {:>6}%'.format(cluster, cluster_sizes[cluster]),
                  cluster_label)
            print('{:>10}'.format('Percent'),
                  np.array(cluster_distribution[cluster]))

        print(separator)
Esempio n. 12
0
def predict_test_and_save_results(model, name, test):
    """Evaluate ``model`` on ``test``, then plot and save its predictions.

    In order, this prints the weighted F1 score, saves a bar chart of the
    predicted vote percentages to ``vote_distribution.png``, hands the
    predictions to ``df_as_csv`` under the name ``'results'``, and prints
    the confusion matrix, the test error (1 - accuracy) and the
    code-to-party mapping.

    Args:
        model: Fitted model exposing ``predict``.
        name: Display name used in the F1 headline.
        test: Test frame with a ``'Vote'`` column, split by ``split_label``.
    """
    separator = '=' * 100

    test_x, test_y = split_label(test)
    pred_y = model.predict(test_x)

    print(separator)
    print('%s Test F1 (shhh, we\'re not supposed to know this):' % name,
          f1_score(test_y, pred_y, average='weighted'))
    print(separator)

    # Position of each category in 'Vote' -> category name.
    party_names = test['Vote'].astype('category').cat.categories
    code_to_name = {code: party for code, party in enumerate(party_names)}

    results = pd.DataFrame(pred_y, test_x.index.values, columns=['Vote'])
    results['Vote'] = results['Vote'].map(code_to_name).astype('category')

    counts = results['Vote'].value_counts()
    vote_distribution = counts.divide(sum(counts.values)).multiply(100)

    plt.figure(figsize=(10, 10))
    # Each bar's colour is the party name minus its last character
    # (presumably plural colour names such as 'Blues' -- verify on the data).
    bar_colors = [party[:-1] for party in counts.index.values]
    bar_plot = vote_distribution.plot.bar(color=bar_colors,
                                          edgecolor='black',
                                          width=0.8)

    # Write each percentage just above its bar.
    for bar in bar_plot.patches:
        height = bar.get_height()
        bar_plot.annotate("{:.1f}".format(height),
                          (bar.get_x() + 0.2, height + 0.2))

    bar_plot.set_xlabel('Party')
    bar_plot.set_ylabel('Vote %')

    plt.savefig('vote_distribution.png')
    df_as_csv(results, 'results')

    print(separator)
    print('Confusion Matrix:')
    print(confusion_matrix(test_y, pred_y))
    print(separator)
    print("Test Error (1-accuracy):")
    print(1 - accuracy_score(test_y, pred_y))
    print(separator)

    print(code_to_name)
Esempio n. 13
0
def predict_test_and_save_results(model, name, test, test_new):
    """Report F1 on the old test set, then predict and save ``test_new``.

    The weighted F1 score on ``test`` is printed first. The model then
    predicts on ``test_new``. The predicted codes are translated to the
    category names of ``test['Vote']``, plotted as a percentage bar chart
    saved to ``vote_distribution.png``, and handed to ``df_as_csv`` under
    the name ``'results'`` with ``'IdentityCard_Num'`` as third argument.
    The code-to-party mapping is printed last.

    Args:
        model: Fitted model exposing ``predict``.
        name: Display name used in the F1 headline.
        test: Labelled test frame with a ``'Vote'`` column.
        test_new: Frame passed directly to ``model.predict``; its index
            becomes the index of the saved results.
    """
    separator = '=' * 100

    test_x, test_y = split_label(test)
    print(separator)
    print('%s Old Test F1:' % name,
          f1_score(test_y, model.predict(test_x), average='weighted'))
    print(separator)

    pred_y = model.predict(test_new)

    # Position of each category in 'Vote' -> category name.
    party_names = test['Vote'].astype('category').cat.categories
    code_to_name = {code: party for code, party in enumerate(party_names)}

    results = pd.DataFrame(pred_y,
                           test_new.index.values,
                           columns=['PredictVote'])
    predicted_parties = results['PredictVote'].map(code_to_name)
    results['PredictVote'] = predicted_parties.astype('category')

    counts = results['PredictVote'].value_counts()
    vote_distribution = counts.divide(sum(counts.values)).multiply(100)

    plt.figure(figsize=(10, 10))
    # Each bar's colour is the party name minus its last character
    # (presumably plural colour names such as 'Blues' -- verify on the data).
    bar_colors = [party[:-1] for party in counts.index.values]
    bar_plot = vote_distribution.plot.bar(color=bar_colors,
                                          edgecolor='black',
                                          width=0.8)

    # Write each percentage just above its bar.
    for bar in bar_plot.patches:
        height = bar.get_height()
        bar_plot.annotate("{:.1f}".format(height),
                          (bar.get_x() + 0.2, height + 0.2))

    bar_plot.set_xlabel('Party')
    bar_plot.set_ylabel('Vote %')

    plt.savefig('vote_distribution.png')
    df_as_csv(results, 'results', 'IdentityCard_Num')

    print(separator)
    print(code_to_name)
Esempio n. 14
0
def retrain_best_model_using_all_data(best_model, train, validate, test):
    """Refit the chosen model on train, validation and test data combined.

    Args:
        best_model: The selected model. If it has a ``best_estimator_``
            attribute (as a fitted search wrapper would), that inner
            estimator is refitted instead.
        train: Training frame.
        validate: Validation frame.
        test: Test frame.

    Returns:
        Whatever the estimator's ``fit`` call returns.
    """
    estimator = getattr(best_model, 'best_estimator_', best_model)

    everything = pd.concat([train, validate, test])
    fit_args = split_label(everything)
    return estimator.fit(*fit_args)
Esempio n. 15
0
def get_data():
    """Load the prepared data with train and validation merged for fitting.

    Returns:
        A 5-tuple ``(X_train, y_train, X_test, y_test, df)``. ``df`` is the
        concatenation of the train and validation frames, the first pair is
        ``split_label(df)`` and the second pair is ``split_label`` of the
        test frame.
    """
    train_part, validate_part, test_part = load_prepared_data()

    fit_frame = pd.concat([train_part, validate_part])
    fit_features, fit_labels = split_label(fit_frame)
    eval_features, eval_labels = split_label(test_part)

    return fit_features, fit_labels, eval_features, eval_labels, fit_frame