def run_baselines():
    """Run the random and ngram baselines on the held-out debate and print AVGP.

    Sorts the debates under ``data/training/`` alphabetically, trains on all
    but the last one, and evaluates both baselines on the last one.
    """
    ROOT_DIR = dirname(dirname(__file__))
    gold_data_folder = join(ROOT_DIR, 'data/training/')
    all_debates = [
        join(gold_data_folder, debate_name)
        for debate_name in listdir(gold_data_folder)
    ]
    all_debates.sort()
    # Hold out the (alphabetically) last debate for evaluation.
    train_debates = all_debates[:-1]
    test_debate = all_debates[-1]

    random_baseline_fpath = join(ROOT_DIR, 'baselines/data/task1_random_baseline.tsv')
    run_random_baseline(test_debate, random_baseline_fpath)
    _report_avgp("Random Baseline AVGP:", test_debate, random_baseline_fpath)

    ngram_baseline_fpath = join(ROOT_DIR, 'baselines/data/task1_ngram_baseline.tsv')
    run_ngram_baseline(train_debates, test_debate, ngram_baseline_fpath)
    _report_avgp("Ngram Baseline AVGP:", test_debate, ngram_baseline_fpath)


def _report_avgp(label, gold_fpath, pred_fpath):
    """Validate *pred_fpath* and, if well-formed, print its average precision."""
    if check_format(pred_fpath):
        # evaluate() returns (thresholds, precisions, avg_precision,
        # reciprocal_rank, num_relevant); only AVGP is reported here.
        _, _, avg_precision, _, _ = evaluate(gold_fpath, pred_fpath)
        print(label, avg_precision)
def run_baselines():
    """Run random and ngram baselines over an 80/20 debate split, print mean AVGP.

    Debates under ``data/training/`` are sorted alphabetically; the first 80%
    train the ngram baseline and the remaining 20% are used as the dev set.
    """
    gold_data_folder = join(ROOT_DIR, 'data/training/')
    # NOTE: the original reused `gold_data_folder` for this list, shadowing
    # the folder path; use a distinct name for the debate file paths.
    all_debates = [
        join(gold_data_folder, debate_name)
        for debate_name in listdir(gold_data_folder)
    ]
    all_debates.sort()
    n_train = int(.8 * len(all_debates))
    train_debates = all_debates[:n_train]
    dev_debates = all_debates[n_train:]

    run_random_baseline(dev_debates)
    print("Random Baseline AVGP:",
          _mean_avg_precision(dev_debates, 'task5_random_baseline_%s'))

    run_ngram_baseline(train_debates, dev_debates)
    print("Ngram Baseline AVGP:",
          _mean_avg_precision(dev_debates, 'task5_ngram_baseline_%s'))


def _mean_avg_precision(dev_debates, fname_template):
    """Return the mean AVGP of per-debate prediction files.

    *fname_template* is a ``%s`` pattern filled with each debate's basename;
    debates whose prediction file fails the format check are skipped.
    """
    avg_precisions = []
    for test_debate in dev_debates:
        fpath = join(
            ROOT_DIR,
            'baselines/data/' + fname_template % (os.path.basename(test_debate)))
        if check_format(fpath):
            _, _, avg_precision, _, _ = evaluate(test_debate, fpath)
            avg_precisions.append(avg_precision)
    return np.mean(avg_precisions)
def run_baselines():
    """Score the random and ngram baselines on the dev split and print each AVGP."""
    train_fpath = join(ROOT_DIR, 'data/training.tsv')
    test_fpath = join(ROOT_DIR, 'data/dev.tsv')
    test_basename = os.path.basename(test_fpath)

    # Random baseline: writes its predictions next to the other baseline outputs.
    run_random_baseline(test_fpath)
    random_baseline_fpath = join(
        ROOT_DIR, 'baselines/data/task1_random_baseline_%s' % (test_basename))
    if check_format(random_baseline_fpath):
        _, _, avg_precision, _, _ = evaluate(test_fpath, random_baseline_fpath)
        print("Random Baseline AVGP:", avg_precision)

    # Ngram baseline: trained on the training split, scored on dev.
    run_ngram_baseline(train_fpath, test_fpath)
    ngram_baseline_fpath = join(
        ROOT_DIR, 'baselines/data/task1_ngram_baseline_%s' % (test_basename))
    if check_format(ngram_baseline_fpath):
        _, _, avg_precision, _, _ = evaluate(test_fpath, ngram_baseline_fpath)
        print("Ngram Baseline AVGP:", avg_precision)
def get_best_svm_model(feature_vector_train, label, feature_vector_valid):
    """Grid-search SVM hyper-parameters (and a PCA variance ratio) on the dev set.

    For each PCA setting and each (kernel, C, gamma) combination, fits an SVC,
    writes its dev-set decision scores to a temp TSV, and keeps the model with
    the best AVGP (ties broken by accuracy).

    Relies on script globals: ``val_y``, ``valDF``, ``my_loc``, ``args``,
    and the ``evaluate`` helper.

    Returns (best_acc, best_pca_nk, best_model); best_model is None only if
    the grid is empty.
    """
    param_grid = [
        {'kernel': 'linear', 'C': np.logspace(-3, 3, 20), 'gamma': [1]},
        {'kernel': 'rbf', 'C': np.logspace(-3, 3, 20),
         'gamma': np.logspace(-3, 3, 20)},
    ]
    # 1.0 means "no PCA"; the rest are explained-variance ratios.
    pca_list = [1.0, 0.99, 0.98, 0.97, 0.96, 0.95]

    best_acc = 0.0
    best_prec = 0.0
    best_model = None  # was 0: use None as the "no model yet" sentinel
    best_pca_nk = 0

    # Keep pristine copies so each PCA setting projects the raw features.
    temp_xtrain = feature_vector_train
    temp_xval = feature_vector_valid

    # Loop-invariant: the temp results file is rewritten for every candidate.
    results_fpath = my_loc + '/results/temp5_%d.tsv' % (args.out_file)

    for pca_nk in pca_list:
        if pca_nk != 1.0:
            pca = decomposition.PCA(n_components=pca_nk).fit(temp_xtrain)
            feature_vector_train = pca.transform(temp_xtrain)
            feature_vector_valid = pca.transform(temp_xval)
        for params in param_grid:
            for C in params['C']:
                for gamma in params['gamma']:
                    # Candidate model with the current hyper-parameters.
                    model = svm.SVC(C=C, gamma=gamma, kernel=params['kernel'],
                                    random_state=42, class_weight='balanced')
                    model.fit(feature_vector_train, label)
                    predictions = model.predict(feature_vector_valid)
                    # y_true first, y_pred second (documented sklearn order;
                    # the original had them swapped — accuracy is symmetric,
                    # so the value is unchanged).
                    acc = metrics.accuracy_score(val_y, predictions)

                    # Rank dev tweets by signed distance to the hyperplane.
                    predicted_distance = model.decision_function(feature_vector_valid)
                    with open(results_fpath, "w") as results_file:
                        for i, line in valDF.iterrows():
                            dist = predicted_distance[i]
                            results_file.write("{}\t{}\t{}\t{}\n".format(
                                'covid-19', line['tweet_id'], dist, "w2v_pos"))

                    _, _, avg_precision, _, _ = evaluate('data/dev.tsv', results_fpath)
                    # Prefer higher AVGP; accuracy acts as a coarse tie-breaker.
                    if (round(avg_precision, 4) >= round(best_prec, 4)
                            and round(acc, 2) >= round(best_acc, 2)):
                        best_prec = avg_precision
                        best_acc = acc
                        best_model = model
                        best_pca_nk = pca_nk

    return best_acc, best_pca_nk, best_model
# Report the selected model, score the dev set, and persist the results.
if best_pca_nk != 1.0:
    # Re-apply the PCA projection chosen by the grid search to the dev features.
    pca = decomposition.PCA(n_components=best_pca_nk).fit(ft_train)
    ft_val = pca.transform(ft_val)

print("SVM, %s+PoS Accuracy: %.3f" % (wordmod, round(accuracy, 3)))
print("PCA No. Components: %.2f, Dim: %d" % (best_pca_nk, ft_val.shape[1]))
print("C: %.3f, Gamma: %.3f, kernel: %s" % (classifier.C, classifier.gamma,
                                            classifier.kernel))

# Write one ranked line per dev tweet: topic, id, score, tag.
predicted_distance = classifier.decision_function(ft_val)
results_fpath = my_loc + '/results/task1_%s_pos_svm_dev_%d.tsv' % (wordmod,
                                                                   args.out_file)
with open(results_fpath, "w") as results_file:
    for i, line in valDF.iterrows():
        results_file.write("{}\t{}\t{}\t{}\n".format(
            'covid-19', line['tweet_id'], predicted_distance[i],
            wordmod + '_pos'))

thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(
    'data/dev.tsv', results_fpath)
print("%s+PoS SVM AVGP: %.4f\n" % (wordmod, round(avg_precision, 4)))

all_res.append([round(accuracy, 3), round(avg_precision, 4), best_pca_nk,
                ft_train.shape[1], ft_val.shape[1]])

# Append this run's summary rows to the cumulative results file.
with open(my_loc + '/file_results/w2v_pos_%d.txt' % (args.out_file), 'a+') as f:
    for res in all_res:
        f.write("%.3f,%.4f,%.2f,%d,%d\n" % (res[0], res[1], res[2], res[3],
                                            res[4]))
    f.write('\n\n')
def get_best_svm_model(feature_vector_train, label, feature_vector_valid, fname,
                       emb_type):
    """Grid-search an RBF SVM (and a PCA variance ratio) on the dev set.

    For each PCA setting and each (C, gamma) pair, fits a GPU SVC, writes its
    dev-set decision scores to the per-(fname, emb_type) results TSV, and keeps
    the model with the best AVGP (ties broken by accuracy).

    Relies on script globals: ``val_y``, ``valDF``, ``my_loc``, ``args``,
    and the ``evaluate`` helper.

    Returns (best_acc, best_pca_nk, best_model); best_model is None only if
    the grid is empty.
    """
    param_grid = [{
        'kernel': 'rbf',
        'C': np.logspace(-3, 3, 30),
        'gamma': np.logspace(-3, 3, 30)
    }]
    # 1.0 means "no PCA"; the rest are explained-variance ratios.
    pca_list = [1.0, 0.99, 0.98, 0.97, 0.96, 0.95]

    best_acc = 0.0
    best_prec = 0.0
    best_model = None  # was 0: use None as the "no model yet" sentinel
    best_pca_nk = 0

    # Keep pristine copies so each PCA setting projects the raw features.
    temp_xtrain = feature_vector_train
    temp_xval = feature_vector_valid

    # Loop-invariant: the results file is rewritten for every candidate model.
    results_fpath = my_loc + '/results/bert_word_posdep_%s_%s_svm_norm%d.tsv' % (
        fname, emb_type, args.normalize)

    for pca_nk in pca_list:
        print(pca_nk)  # progress log: one line per PCA setting
        if pca_nk != 1.0:
            pca = decomposition.PCA(n_components=pca_nk).fit(temp_xtrain)
            feature_vector_train = pca.transform(temp_xtrain)
            feature_vector_valid = pca.transform(temp_xval)
        for params in param_grid:
            for C in params['C']:
                for gamma in params['gamma']:
                    # Candidate model; gpu_id suggests a thundersvm-style SVC.
                    model = SVC(C=C, gamma=gamma, kernel=params['kernel'],
                                random_state=42, class_weight='balanced',
                                gpu_id=args.gpu_id)
                    model.fit(feature_vector_train, label)
                    # Mean accuracy on the dev set.
                    acc = model.score(feature_vector_valid, val_y)

                    # Rank dev tweets by signed distance to the hyperplane.
                    # decision_function here is 2-D, hence the [i][0] indexing.
                    predicted_distance = model.decision_function(
                        feature_vector_valid)
                    with open(results_fpath, "w") as results_file:
                        for i, line in valDF.iterrows():
                            dist = predicted_distance[i][0]
                            results_file.write("{}\t{}\t{}\t{}\n".format(
                                'covid-19', line['tweet_id'], dist,
                                "bert_wd_posdep"))

                    _, _, avg_precision, _, _ = evaluate(
                        'data/dev.tsv', results_fpath)
                    # Prefer higher AVGP; accuracy acts as a coarse tie-breaker.
                    if (round(avg_precision, 4) >= round(best_prec, 4)
                            and round(acc, 2) >= round(best_acc, 2)):
                        best_prec = avg_precision
                        best_acc = acc
                        best_model = model
                        best_pca_nk = pca_nk

    return best_acc, best_pca_nk, best_model
print("SVM, %s, %s Accuracy: %.3f" % (fname, emb_type, round(accuracy, 3))) print("PCA No. Components: %.2f, Dim: %d" % (best_pca_nk, ft_val.shape[1])) print("C: %.3f, Gamma: %.3f, kernel: %s" % (classifier.C, classifier.gamma, classifier.kernel)) predicted_distance = classifier.decision_function(ft_val) results_fpath = my_loc + '/results/bert_word_posdep_%s_%s_svm_norm%d.tsv' % ( fname, emb_type, args.normalize) with open(results_fpath, "w") as results_file: for i, line in valDF.iterrows(): dist = predicted_distance[i][0] results_file.write("{}\t{}\t{}\t{}\n".format( 'covid-19', line['tweet_id'], dist, 'bert_wd_posdep')) _, _, avg_precision, _, _ = evaluate('data/dev.tsv', results_fpath) print("%s, %s SVM AVGP: %.4f\n" % (fname, emb_type, round(avg_precision, 4))) pickle.dump({'best_pca': best_pca_nk}, open( my_loc + '/models/' + fname + '_' + emb_type + '_posdep_norm%s.pkl' % (args.normalize), 'wb')) classifier.save_to_file(my_loc + '/models/' + fname + '_' + emb_type + '_posdep_norm%s.dt' % (args.normalize)) all_res.append([ emb_type, round(accuracy, 3), round(avg_precision, 4), best_pca_nk, ft_train.shape[1], ft_val.shape[1]