def variance_plot_cv(claim_ids, target, rootdist_matrix, tf_matrix, questionmark,
                     fold_range=range(2, 30), regularization='l2'):
    """Plot cross-validation scores and their variances against the fold count.

    The root-distance, question-mark, PPDB-alignment and term-frequency
    features are stacked column-wise into one sparse matrix. For every fold
    count in ``fold_range`` a fold split is generated from ``claim_ids`` and
    ``logistic_regression_var`` is run on the stacked features. Two figures
    are then shown: result columns 0-3 (labelled accuracy, F1, recall,
    precision) and result columns 4-7 (labelled as the matching variances).

    Args:
        claim_ids: Identifiers handed to ``cv_fold_generator``.
        target: Labels handed to the classifier and to ``plot_2D_data``.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term-frequency features, stacked as given.
        questionmark: Question-mark features, stacked as given.
        fold_range: Fold counts to evaluate; also used as the x axis, so it
            must be re-iterable.
        regularization: Penalty name forwarded to ``logistic_regression_var``.
    """
    feature_blocks = (
        sparse.csr_matrix(rootdist_matrix),
        questionmark,
        sparse.csr_matrix(get_ppdb_alignment_feature()),
        tf_matrix,
    )
    features = sparse.hstack(feature_blocks)
    plot_2D_data(features, target)

    per_fold_scores = []
    for fold_count in fold_range:
        print(fold_count)
        fold_split = cv_fold_generator(claim_ids, fold_count)
        per_fold_scores.append(
            logistic_regression_var(features, target, fold_split, regularization, 10000)
        )
    scores = np.array(per_fold_scores)

    # First figure: the four metrics (columns 0-3).
    # Second figure: their variances (columns 4-7), same order, ' var.' suffix.
    metric_names = ('Accuracy', 'F1-Score', 'Recall', 'Precision')
    for first_column, suffix in ((0, ''), (len(metric_names), ' var.')):
        for column, metric in enumerate(metric_names, start=first_column):
            plt.plot(fold_range, scores[:, column], label=metric + suffix)
        plt.legend()
        plt.show()
def grid_search_bow_custom_fold(data_h, target, ids, questionmark_features, folds=10, do_custom_folds=True):
    """Grid-search bag-of-words settings and print the results, best first.

    Every combination of n-gram range and ``max_features`` (80..94) is used
    to build a ``BoW`` representation of ``data_h``. The question-mark
    features are appended and the combination is scored with L2-regularised
    ``logistic_regression``. Progress is printed as a fraction before each
    run. The ``(1, 2)`` / 90 combination is additionally passed to
    ``plot_2D_data``.

    Args:
        data_h: Input handed to ``BoW.fit``.
        target: Labels for the classifier.
        ids: Identifiers handed to ``cv_fold_generator``.
        questionmark_features: Features appended to each BoW matrix.
        folds: Fold count for ``cv_fold_generator``; passed to the classifier
            directly when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated folds instead of ``folds``.
    """
    ngram_options = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]
    feature_caps = range(80, 95)
    total_runs = len(feature_caps) * len(ngram_options)

    custom_folds = cv_fold_generator(ids, folds)
    cv = custom_folds if do_custom_folds else folds
    regularization = 'l2'

    scored = []
    finished = 0
    for ngrams in ngram_options:
        for cap in feature_caps:
            print(finished / total_runs)
            finished += 1

            vectorizer = BoW(ngram_range=ngrams, max_features=cap, stop_words=None)
            bow_features = vectorizer.fit(data_h)
            if (ngrams, cap) == ((1, 2), 90):
                plot_2D_data(bow_features, target)

            combined = add_question_mark_feature(bow_features, questionmark_features)
            scored.append([
                logistic_regression(combined, target, cv, regularization),
                ngrams,
                cap,
            ])

    print(sorted(scored, key=lambda entry: entry[0], reverse=True))
def combined_crossval(claim_ids, target, rootdist_matrix, tf_matrix, questionmark, folds=7, do_custom_folds=True):
    """Cross-validate several classifiers on the full stacked feature set.

    Root-distance, question-mark, PPDB-alignment and term-frequency features
    are stacked column-wise and passed to ``plot_2D_data``. The function then
    prints the scores of four logistic-regression variants (ovr/multinomial
    with L1/L2), runs ``svm_crossval_grid`` and prints the ``naive_bayes``
    scores, the latter on a dense copy of the features.

    Args:
        claim_ids: Identifiers handed to ``cv_fold_generator``.
        target: Labels for the classifiers.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term-frequency features, stacked as given.
        questionmark: Question-mark features, stacked as given.
        folds: Fold count for ``cv_fold_generator``; passed to the
            classifiers directly when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated folds instead of ``folds``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)

    features = sparse.hstack((
        sparse.csr_matrix(rootdist_matrix),
        questionmark,
        sparse.csr_matrix(get_ppdb_alignment_feature()),
        tf_matrix,
    ))
    plot_2D_data(features, target)

    cv = custom_folds if do_custom_folds else folds

    print("Classifier: ", '[accuracy,', 'f1_macro,', 'recall_macro,', 'precision_macro]')
    logistic_variants = (
        ("Logistic regression ovr L1: ", 'l1', 'ovr'),
        ("Logistic regression ovr L2: ", 'l2', 'ovr'),
        ("Logistic regression multiclass L1: ", 'l1', 'multinomial'),
        ("Logistic regression multiclass L2: ", 'l2', 'multinomial'),
    )
    for title, penalty, scheme in logistic_variants:
        print(title, logistic_regression(features, target, cv, penalty, 1000000, scheme))

    print("SVM Cross-validation")
    svm_crossval_grid(features, target, cv)

    # naive_bayes is given a dense array rather than the sparse matrix.
    print("Naive Bayes: ", naive_bayes(features.toarray(), target, cv))
def questionmark_only(claim_ids, target, questionmark, folds=5, do_custom_folds=True, regularization='l2'):
    """Print logistic-regression scores using only the question-mark features.

    Args:
        claim_ids: Identifiers handed to ``cv_fold_generator``.
        target: Labels for the classifier.
        questionmark: Feature matrix passed to ``logistic_regression``.
        folds: Fold count for ``cv_fold_generator``; passed to the classifier
            directly when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated folds instead of ``folds``.
        regularization: Penalty name forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)
    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')

    cv = custom_folds if do_custom_folds else folds
    scores = logistic_regression(questionmark, target, cv, regularization, 1000000)
    print(scores)
def bow_rootdist(claim_ids, target, rootdist_matrix, tf_matrix, folds=5, do_custom_folds=True, regularization='l2'):
    """Print logistic-regression scores for root-distance plus term-frequency features.

    The root-distance matrix is converted to CSR, stacked column-wise with
    ``tf_matrix`` and passed to ``plot_2D_data`` before being scored.

    Args:
        claim_ids: Identifiers handed to ``cv_fold_generator``.
        target: Labels for the classifier and the plot.
        rootdist_matrix: Root-distance features.
        tf_matrix: Term-frequency features, stacked as given.
        folds: Fold count for ``cv_fold_generator``; passed to the classifier
            directly when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated folds instead of ``folds``.
        regularization: Penalty name forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)

    features = sparse.hstack((sparse.csr_matrix(rootdist_matrix), tf_matrix))
    plot_2D_data(features, target)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv = custom_folds if do_custom_folds else folds
    scores = logistic_regression(features, target, cv, regularization, 1000000)
    print(scores)
def crossval_grid_search(target, ids, min_rootdist=1, max_rootdist=200, step=1, ppdb=None, questionmark_features=None, bow=None, folds=10):
    """Grid-search the default root-distance score and plot the resulting metrics.

    For every score in ``range(min_rootdist, max_rootdist + 1, step)`` the
    root-distance matrix is rebuilt with ``get_rootdist_matrix``, stacked
    column-wise with whichever optional feature blocks were supplied, and
    scored with L2-regularised ``logistic_regression`` on folds generated
    from ``ids``. The best value of each of the four metrics is printed and
    all four curves are plotted against the score.

    Args:
        target: Labels for the classifier.
        ids: Identifiers handed to ``cv_fold_generator``.
        min_rootdist: First default score to try.
        max_rootdist: Last default score to try (inclusive).
        step: Increment between tried scores.
        ppdb: Optional PPDB feature block; skipped when ``None``.
        questionmark_features: Optional question-mark block; skipped when ``None``.
        bow: Optional bag-of-words block; skipped when ``None``.
        folds: Fold count handed to ``cv_fold_generator``.

    Returns:
        A list of ``[scores, default_score]`` entries, one per tried score,
        in the order they were evaluated. Empty when the score range is empty.
    """
    default_score = range(min_rootdist, max_rootdist + 1, step)
    total = len(default_score)
    custom_folds = cv_fold_generator(ids, folds)
    regularization = 'l2'

    # The optional blocks default to None, and sparse.hstack cannot infer the
    # width of a None block in a single-row layout, so passing them through
    # unfiltered raised. Keep only the blocks that were actually supplied.
    extra_blocks = [
        block for block in (questionmark_features, bow, ppdb) if block is not None
    ]

    res = []
    for count, i in enumerate(default_score):
        data = sparse.csc_matrix(get_rootdist_matrix(i))
        print("At ", round((count * 100.0) / total, 2), "%")
        combined = sparse.hstack([data] + extra_blocks)
        res.append([
            logistic_regression(combined, target, custom_folds, regularization),
            i
        ])

    # Nothing was evaluated (empty range): there is no maximum to report and
    # nothing to plot, so return instead of failing on empty-array indexing.
    if not res:
        return res

    # (name used in the printed summary, legend label, index into the scores)
    metrics = (
        ("acc", 'Accuracy', 0),
        ("f1", 'F1-Score', 1),
        ("recall", 'Recall', 2),
        ("precision", 'Precision', 3),
    )
    # One (n_scores, 2) table per metric: column 0 is the metric value,
    # column 1 the default score that produced it.
    tables = [
        np.asarray([[scores[index], score] for scores, score in res])
        for _, _, index in metrics
    ]

    for (short_name, _, _), table in zip(metrics, tables):
        best_row = np.argmax(table[:, 0])
        print("Max %s without question at default_dist: " % short_name,
              table[best_row, 1], " ", np.max(table[:, 0]))

    for (_, legend_label, _), table in zip(metrics, tables):
        plt.plot(table[:, 1], table[:, 0], label=legend_label)
    plt.legend()
    plt.xlabel("Default rootdist score")
    plt.ylabel("Accuracy")
    plt.show()
    return res
def crossval_rootdist(data, target, ids, questionmark_features=None, folds=10, do_custom_folds=True):
    """Print logistic-regression scores for root-distance features.

    ``data`` is converted to CSR. When ``questionmark_features`` is given it
    is appended with ``add_question_mark_feature`` before scoring.

    Args:
        data: Root-distance feature matrix.
        target: Labels for the classifier.
        ids: Identifiers handed to ``cv_fold_generator``.
        questionmark_features: Optional extra features; ignored when ``None``.
        folds: Fold count for ``cv_fold_generator``; passed to the classifier
            directly when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated folds instead of ``folds``.
    """
    custom_folds = cv_fold_generator(ids, folds)

    features = sparse.csr_matrix(data)
    if questionmark_features is not None:
        features = add_question_mark_feature(features, questionmark_features)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv = custom_folds if do_custom_folds else folds
    print(logistic_regression(features, target, cv))