def manipulate_winning_party(manipulation, f):
    """Predict votes after rewriting one feature of the held-out data.

    A gradient boosting classifier is fitted on the training split. The
    validation and test splits are concatenated, column ``f`` of the
    combined frame is replaced by ``Series.map(manipulation)``, and the
    classifier predicts on the resulting features.

    :param manipulation: mapping argument handed to ``Series.map``.
    :param f: name of the column to rewrite.
    :return: ``(manipulated_frame, predictions)``.
    """
    train, validate, test = load_prepared_data()

    train_x, train_y = split_label(train)
    classifier = GradientBoostingClassifier(
        max_depth=7, max_features=10).fit(train_x, train_y)

    # Validation and test rows are treated as one held-out population.
    held_out = pd.concat([validate, test])
    held_out[f] = held_out[f].map(manipulation)

    held_out_x, _ = split_label(held_out)
    predictions = classifier.predict(held_out_x)
    return held_out, predictions
def test_results():
    """Run ``test_data_preparation`` on two differently prepared datasets.

    First the ``*_original.csv`` files are read and passed through
    ``most_basic_preparation``; the resulting train/test pair is evaluated
    under the label ``'Basic'``. Then ``train.csv`` / ``test.csv`` are read
    and evaluated as-is under the label ``'Advanced'``.
    """
    raw_train, raw_validate, raw_test = (
        pd.read_csv(FILES_DIR + file_name, header=0)
        for file_name in ('train_original.csv', 'validate_original.csv',
                          'test_original.csv'))

    # The validation frame is prepared too, but only train/test are scored.
    basic_train, _, basic_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    basic_train_x, basic_train_y = split_label(basic_train)
    basic_test_x, basic_test_y = split_label(basic_test)
    test_data_preparation(basic_train_x, basic_train_y, basic_test_x,
                          basic_test_y, 'Basic')

    prepared_train = pd.read_csv(FILES_DIR + 'train.csv', header=0)
    prepared_test = pd.read_csv(FILES_DIR + 'test.csv', header=0)
    prepared_train_x, prepared_train_y = split_label(prepared_train)
    prepared_test_x, prepared_test_y = split_label(prepared_test)
    test_data_preparation(prepared_train_x, prepared_train_y,
                          prepared_test_x, prepared_test_y, 'Advanced')
def test_results():
    """Run ``test_data_preparation`` on two differently prepared datasets.

    The ``*_original.csv`` files are loaded with ``read_data`` and passed
    through ``most_basic_preparation``; that train/test pair is evaluated
    under the label ``'Basic'``. Afterwards ``train.csv`` / ``test.csv``
    are loaded and evaluated unchanged under the label ``'Advanced'``.
    """
    raw_train = read_data('train_original.csv')
    raw_validate = read_data('validate_original.csv')
    raw_test = read_data('test_original.csv')

    # The validation frame is prepared too, but only train/test are scored.
    basic_train, _, basic_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    basic_train_x, basic_train_y = split_label(basic_train)
    basic_test_x, basic_test_y = split_label(basic_test)
    test_data_preparation(basic_train_x, basic_train_y, basic_test_x,
                          basic_test_y, 'Basic')

    prepared_train = read_data('train.csv')
    prepared_test = read_data('test.csv')
    prepared_train_x, prepared_train_y = split_label(prepared_train)
    prepared_test_x, prepared_test_y = split_label(prepared_test)
    test_data_preparation(prepared_train_x, prepared_train_y,
                          prepared_test_x, prepared_test_y, 'Advanced')
def get_best_model(validate, models, names):
    """Score every model on the validation set and return the best one.

    Each model predicts on the validation features and is scored with the
    weighted F1 measure. A table of all scores (best first) and the winner
    are printed.

    :param validate: validation frame accepted by ``split_label``.
    :param models: fitted estimators exposing ``predict``.
    :param names: display names, paired positionally with ``models``.
    :return: ``(best_model, best_name)``.
    :raises ValueError: if there is no model/name pair to evaluate.
    """
    validate_x, validate_y = split_label(validate)

    scored = [
        (model, name,
         f1_score(validate_y, model.predict(validate_x), average='weighted'))
        for model, name in zip(models, names)
    ]
    if not scored:
        raise ValueError('get_best_model needs at least one model to evaluate')

    # Stable sort, highest F1 first; ties keep their input order.
    scored.sort(key=lambda entry: entry[2], reverse=True)

    print('=' * 100)
    print('Models Evaluated F1 Score:')
    # Build the report frame directly from rows. The former np.matrix
    # round-trip relied on a class NumPy no longer recommends and coerced
    # the scores to strings.
    print(
        pd.DataFrame([(name, score) for _, name, score in scored],
                     columns=['Model Name', 'F1 Score']))
    print()

    best_model, best_name, best_score = scored[0]
    print('Best Model Is:')
    print(best_name, best_score)
    print('=' * 100)
    return best_model, best_name
def optimize_models_parameters(train, rerun_experiments=False):
    """Return the tuned models together with their names.

    :param train: training frame accepted by ``split_label``.
    :param rerun_experiments: when true, the models come from
        ``run_experiments`` on the training data; otherwise they come from
        ``load_experiments``.
    :return: ``(models, names)``.
    """
    names = ['SVC', 'KNN', 'RANDOM_FOREST', 'GBC', 'MLP']
    train_x, train_y = split_label(train)

    if rerun_experiments:
        models = run_experiments(train_x, train_y, names)
    else:
        models = load_experiments(names)
    return models, names
def manipulate_and_plot_distribution(manipulation, f):
    """Plot the predicted vote distribution after manipulating feature ``f``.

    Predictions come from ``manipulate_winning_party``. Predicted codes are
    translated to the category labels of the ``'Vote'`` column, converted
    to percentages and drawn as a bar chart saved to
    ``vote_distribution.png``.

    :param manipulation: mapping argument forwarded to
        ``manipulate_winning_party``.
    :param f: name of the feature column to manipulate.
    """
    manipulated, predictions = manipulate_winning_party(manipulation, f)
    features, _ = split_label(manipulated)

    # Position of each category in the 'Vote' column -> its label.
    party_by_code = dict(
        enumerate(manipulated['Vote'].astype('category').cat.categories))

    results = pd.DataFrame(predictions, features.index.values,
                           columns=['Vote'])
    results['Vote'] = results['Vote'].map(party_by_code).astype('category')

    counts = results['Vote'].value_counts()
    percentages = counts.divide(sum(counts.values)).multiply(100)

    # Bar colours are the party labels with their last character removed.
    bar_colors = [party[:-1] for party in counts.index.values]

    plt.figure(figsize=(10, 10))
    axes = percentages.plot.bar(color=bar_colors, edgecolor='black',
                                width=0.8)
    for bar in axes.patches:
        height = bar.get_height()
        axes.annotate("{:.1f}".format(height),
                      (bar.get_x() + 0.2, height + 0.2))
    axes.set_xlabel('Party')
    axes.set_ylabel('Vote %')
    plt.savefig('vote_distribution.png')
def test_combinations():
    """Score label combinations that cover at least 51% of new predictions.

    Candidate labels are the distinct training labels except 3, 4 and 7.
    For every combination of 1 to at most 10 labels, the predicted votes
    read from ``results.csv`` are binarised (1 if the vote code is in the
    combination, else 0). Combinations whose share of 1s is below 51% are
    skipped; the rest are scored with ``calinski_harabaz_score`` against
    the features read from ``test_new.csv``.

    :return: list of ``(combination, score)`` tuples in evaluation order.
    """
    train, _validate, _test, _test_new = load_prepared_data()
    _, train_y = split_label(train)

    excluded = {3, 4, 7}
    labels = sorted(
        label for label in train_y.unique() if label not in excluded)

    test_new_x = read_data('test_new.csv', index=ID_COLUMN)
    test_new_y = read_data(
        'results.csv',
        index=ID_COLUMN)['PredictVote'].astype('category').cat.codes

    result = []
    for r in range(1, min(len(labels) + 1, 11)):
        print(r)
        for c in itertools.combinations(labels, r):
            y = test_new_y.map(lambda code: 1 if code in c else 0)

            # Counter returns 0 for a missing key, so a combination that
            # matches no sample is skipped rather than raising KeyError
            # as the previous dict lookup did.
            share = Counter(y)[1] / len(y) * 100.0
            if share < 51:
                continue

            score = calinski_harabaz_score(test_new_x, y)
            print(c, score)
            result.append((c, score))
    return result
def most_basic_preparation(train, validate, test):
    """Apply a minimal clean-up to the three data splits.

    The object-dtype feature columns are determined from the training
    features only; those columns are dropped from every split, and then
    every row containing a missing value is dropped.

    :return: ``(train, validate, test)`` after the clean-up.
    """
    train_features, _ = split_label(train)
    object_columns = train_features.select_dtypes(
        include='object').columns.values

    def _strip(frame):
        # Remove the object-dtype columns, then any incomplete rows.
        return frame.drop(object_columns, axis=1).dropna()

    return _strip(train), _strip(validate), _strip(test)
def optimize_models_parameters(train, rerun_experiments=False):
    """Return the tuned clustering models together with their names.

    :param train: training frame accepted by ``split_label``.
    :param rerun_experiments: when true, the models come from
        ``run_experiments`` on the training data; otherwise they come from
        ``load_experiments``.
    :return: ``(models, names)``.
    """
    names = [
        'KMeans_completeness',
        'KMeans_homogeneity',
        'KMeans_v_measure',
        'KMeans_calinski_harabaz',
        'KMeans_silhouette',
        'KMeans_adjusted_rand',
    ]
    train_x, train_y = split_label(train)

    if rerun_experiments:
        models = run_experiments(train_x, train_y, names)
    else:
        models = load_experiments(names)
    return models, names
def load_optimized_models(train):
    """Fit the five classifiers with their hard-coded hyper-parameters.

    :param train: training frame accepted by ``split_label``.
    :return: ``(fitted_models, names)`` with both lists in the same order.
    """
    train_x, train_y = split_label(train)

    named_models = [
        ('SVC', SVC(kernel='rbf', C=100000, gamma=0.01)),
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ('RANDOM_FOREST',
         RandomForestClassifier(max_depth=9, max_features=14)),
        ('GBC', GradientBoostingClassifier(max_depth=7, max_features=10)),
        ('MLP',
         MLPClassifier(alpha=1.5e-4, hidden_layer_sizes=(500, 500))),
    ]

    fitted_models = []
    names = []
    for name, model in named_models:
        fitted_models.append(model.fit(train_x, train_y))
        names.append(name)
    return fitted_models, names
def run_k_means_all_data():
    """Fit KMeans on all prepared data for several k and print summaries.

    The train, validation and test splits are concatenated. For each k in
    6, 9, 10, 11, 12 a ``KMeans`` model is fitted on the features, and for
    every key returned by ``get_clusters_labels`` the function prints that
    entry together with the matching entries of
    ``get_clusters_sizes_percent`` and ``get_clusters_distribution``.
    """
    train, validate, test = load_prepared_data()
    X, y = split_label(pd.concat([train, validate, test]))

    for k in (6, 9, 10, 11, 12):
        print(k, '=========')
        kmeans = KMeans(n_clusters=k).fit(X)

        labels_by_cluster = get_clusters_labels(kmeans, y)
        size_by_cluster = get_clusters_sizes_percent(kmeans)
        distribution_by_cluster = get_clusters_distribution(kmeans, y)

        for cluster, cluster_labels in labels_by_cluster.items():
            print('{:>2} {:>6}%'.format(cluster, size_by_cluster[cluster]),
                  cluster_labels)
            print('{:>10}'.format('Percent'),
                  np.array(distribution_by_cluster[cluster]))
        print('=========')
def predict_test_and_save_results(model, name, test):
    """Predict on the test set, report metrics, and plot and save results.

    Steps, in order: print the weighted F1 score; translate predicted
    codes to the category labels of the ``'Vote'`` column; save a bar
    chart of the predicted vote percentages to ``vote_distribution.png``;
    hand the predictions to ``df_as_csv`` under the name ``'results'``;
    print the confusion matrix, the test error (1 - accuracy) and the
    code-to-label mapping.

    :param model: fitted estimator exposing ``predict``.
    :param name: model name used in the printed F1 line.
    :param test: test frame accepted by ``split_label``.
    """
    separator = '=' * 100

    test_x, test_y = split_label(test)
    predictions = model.predict(test_x)

    print(separator)
    print('%s Test F1 (shhh, we\'re not supposed to know this):' % name,
          f1_score(test_y, predictions, average='weighted'))
    print(separator)

    # Position of each category in the 'Vote' column -> its label.
    party_by_code = dict(
        enumerate(test['Vote'].astype('category').cat.categories))

    results = pd.DataFrame(predictions, test_x.index.values,
                           columns=['Vote'])
    results['Vote'] = results['Vote'].map(party_by_code).astype('category')

    counts = results['Vote'].value_counts()
    percentages = counts.divide(sum(counts.values)).multiply(100)

    # Bar colours are the party labels with their last character removed.
    bar_colors = [party[:-1] for party in counts.index.values]

    plt.figure(figsize=(10, 10))
    axes = percentages.plot.bar(color=bar_colors, edgecolor='black',
                                width=0.8)
    for bar in axes.patches:
        height = bar.get_height()
        axes.annotate("{:.1f}".format(height),
                      (bar.get_x() + 0.2, height + 0.2))
    axes.set_xlabel('Party')
    axes.set_ylabel('Vote %')
    plt.savefig('vote_distribution.png')

    df_as_csv(results, 'results')

    print(separator)
    print('Confusion Matrix:')
    print(confusion_matrix(test_y, predictions))
    print(separator)
    print("Test Error (1-accuracy):")
    print(1 - accuracy_score(test_y, predictions))
    print(separator)
    print(party_by_code)
def predict_test_and_save_results(model, name, test, test_new):
    """Report F1 on the old test set, then predict and save the new one.

    The weighted F1 score on ``test`` is printed first. ``model`` then
    predicts on ``test_new``; the predicted codes are translated to the
    category labels of ``test['Vote']``, plotted as a percentage bar chart
    saved to ``vote_distribution.png``, and handed to ``df_as_csv`` under
    the name ``'results'`` with ``'IdentityCard_Num'`` as third argument.
    Finally the code-to-label mapping is printed.

    :param model: fitted estimator exposing ``predict``.
    :param name: model name used in the printed F1 line.
    :param test: labelled test frame accepted by ``split_label``.
    :param test_new: feature frame to predict on; its index labels the
        result rows.
    """
    separator = '=' * 100

    test_x, test_y = split_label(test)
    print(separator)
    print('%s Old Test F1:' % name,
          f1_score(test_y, model.predict(test_x), average='weighted'))
    print(separator)

    predictions = model.predict(test_new)

    # Position of each category in the 'Vote' column -> its label.
    party_by_code = dict(
        enumerate(test['Vote'].astype('category').cat.categories))

    results = pd.DataFrame(predictions, test_new.index.values,
                           columns=['PredictVote'])
    results['PredictVote'] = results['PredictVote'].map(
        party_by_code).astype('category')

    counts = results['PredictVote'].value_counts()
    percentages = counts.divide(sum(counts.values)).multiply(100)

    # Bar colours are the party labels with their last character removed.
    bar_colors = [party[:-1] for party in counts.index.values]

    plt.figure(figsize=(10, 10))
    axes = percentages.plot.bar(color=bar_colors, edgecolor='black',
                                width=0.8)
    for bar in axes.patches:
        height = bar.get_height()
        axes.annotate("{:.1f}".format(height),
                      (bar.get_x() + 0.2, height + 0.2))
    axes.set_xlabel('Party')
    axes.set_ylabel('Vote %')
    plt.savefig('vote_distribution.png')

    df_as_csv(results, 'results', 'IdentityCard_Num')

    print(separator)
    print(party_by_code)
def retrain_best_model_using_all_data(best_model, train, validate, test):
    """Refit the chosen model on train, validation and test data combined.

    If ``best_model`` has a ``best_estimator_`` attribute, that estimator
    is the one refitted; otherwise ``best_model`` itself is used.

    :return: whatever the estimator's ``fit`` call returns.
    """
    estimator = getattr(best_model, 'best_estimator_', best_model)
    all_data = pd.concat([train, validate, test])
    features, labels = split_label(all_data)
    return estimator.fit(features, labels)
def get_data():
    """Load the prepared splits, merging train and validation for training.

    :return: ``(X_train, y_train, X_test, y_test, df)`` where ``df`` is the
        concatenated train + validation frame that the training features
        and labels were split from.
    """
    train, validate, test = load_prepared_data()

    combined = pd.concat([train, validate])
    train_features, train_labels = split_label(combined)
    test_features, test_labels = split_label(test)

    return train_features, train_labels, test_features, test_labels, combined