def main():
    """Load the annotated CoNLL data and report how many sentences exceed 100 tokens.

    Prints the absolute count and the fraction of over-length sentences
    (relative to all loaded sentences).
    """
    x, y_arg, y_rhet, y_aspect, y_summary, y_citation = load_conll.load_data_multiple(
        "./annotations_conll_final_without_abstracts")
    y_arg = remove_bio_from_arg(y_arg)
    y_citation = remove_bio_from_cit(y_citation)
    # plot_sentence_lengths(x)
    # BUG FIX: the message claimed "more than 200 tokens" while the filter
    # actually checks len(sentence) > 100 — the message now matches the code.
    # Also compute the count once instead of building the filtered list twice.
    long_count = sum(1 for sentence in x if len(sentence) > 100)
    print("Number of sentences with more than 100 tokens: " + str(long_count))
    print(str(long_count / len(x)))
def grid_search_crf_lexical(task=""):
    """Grid-search a lexical-feature CRF over (c1, c2) regularization pairs.

    Supported tasks: "citation" and "argumentation". The best configuration is
    selected by macro F1 on the dev split, then the model is retrained on
    train+dev and evaluated on the held-out test split.
    """
    print("Running grid search for crf, lexical, " + str(task))
    print("================================================================")
    x_train_dev, y_arg_train_dev, y_rhet_train_dev, y_aspect_train_dev, y_summary_train_dev, y_citation_train_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/train_dev/")
    x_test, y_arg_test, y_rhet_test, y_aspect_test, y_summary_test, y_citation_test = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/test/")
    x_train, y_arg_train, y_rhet_train, y_aspect_train, y_summary_train, y_citation_train = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/train/")
    x_dev, y_arg_dev, y_rhet_dev, y_aspect_dev, y_summary_dev, y_citation_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/dev/")
    print("Data loaded")
    # Select the label columns for the requested task.
    if task == "citation":
        exclude_class = "NONE\n"
        y_train_dev = y_citation_train_dev
        y_train = y_citation_train
        y_dev = y_citation_dev
        y_test = y_citation_test
    elif task == "argumentation":
        exclude_class = "Token_Label.OUTSIDE"
        y_train_dev = y_arg_train_dev
        y_train = y_arg_train
        y_dev = y_arg_dev
        y_test = y_arg_test
    else:
        # BUG FIX: an unknown task previously fell through and crashed later
        # with a NameError on exclude_class/y_train — fail fast like the
        # sibling grid-search functions do.
        print("No valid task name provided")
        exit()
    labels = list(set(lab for sublist in y_train_dev for lab in sublist))
    c1s = [0.1, 0.01, 0.001, 0.0001]
    c2s = [0.1, 0.01, 0.001, 0.0001]
    configs = list(itertools.product(c1s, c2s))
    best_macro_f1 = 0.0
    best_config = None
    print("Data prepared")
    print("Grid search configs: {!s:s}".format(configs))
    # Feature extraction does not depend on (c1, c2): do it once, not per config.
    x_train_transformed = [sent2features(sent) for sent in x_train]
    x_dev_transformed = [sent2features(sent) for sent in x_dev]
    # BUG FIX: y_dev was re-binarized inside the loop on every iteration, so
    # from the second configuration on the gold labels were already one-hot
    # encoded and got corrupted by a second pass. Binarize exactly once.
    y_dev_bin = transform_classes_to_binary(y_dev, labels)
    for config in configs:
        print("Using config " + str(config))
        c1 = config[0]
        c2 = config[1]
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=c1,
            c2=c2,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(x_train_transformed, y_train)
        y_pred = crf.predict(x_dev_transformed)
        # Transform predictions to the one-hot format our metrics expect.
        y_pred = transform_classes_to_binary(y_pred, labels)
        confusion_matrix = ConfusionMatrix(labels=labels,
                                           gold=y_dev_bin,
                                           predictions=y_pred,
                                           token_level=True,
                                           one_hot_encoding=True)
        confusion_matrix.compute_all_scores(exclude_class=exclude_class)
        macro_f1 = confusion_matrix.macrof1
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_config = config
        print(str(confusion_matrix.get_all_results()))
    # We found the best config — retrain on train+dev and evaluate on test.
    print("Best Config " + str(best_config))
    print("Best Macro F1 " + str(best_macro_f1))
    c1 = best_config[0]
    c2 = best_config[1]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=c1,
        c2=c2,
        max_iterations=100,
        all_possible_transitions=True
    )
    x_train_dev_transformed = [sent2features(sent) for sent in x_train_dev]
    x_test_transformed = [sent2features(sent) for sent in x_test]
    crf.fit(x_train_dev_transformed, y_train_dev)
    y_pred = crf.predict(x_test_transformed)
    y_pred = transform_classes_to_binary(y_pred, labels)
    y_test = transform_classes_to_binary(y_test, labels)
    confusion_matrix = ConfusionMatrix(labels=labels,
                                       gold=y_test,
                                       predictions=y_pred,
                                       token_level=True,
                                       one_hot_encoding=True)
    confusion_matrix.compute_all_scores(exclude_class=exclude_class)
    print(str(confusion_matrix.get_all_results()))
def grid_search_linear_svm_tfidf(task=""):
    """Grid-search a linear-kernel one-vs-rest SVM on TF-IDF sentence vectors.

    Supported tasks: "discourse", "aspect", "summary". C is tuned by macro F1
    on the dev split; the winner is retrained on train+dev and scored on test.
    """
    print("Running grid search for svm, linear kernal, tfidf, " + str(task))
    print("================================================================")
    # Validate the task up front so we fail before the (slow) data loading.
    if task == "discourse":
        exclude_class = "DRI_Unspecified"
    elif task == "aspect":
        exclude_class = "NONE"
    elif task == "summary":
        exclude_class = "NONE"
    else:
        print("No valid task name provided")
        exit()
    print("SVM script started")
    start = time.time()
    x_train_dev, y_arg_train_dev, y_rhet_train_dev, y_aspect_train_dev, y_summary_train_dev, y_citation_train_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/train_dev/")
    x_test, y_arg_test, y_rhet_test, y_aspect_test, y_summary_test, y_citation_test = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/test/")
    x_train, y_arg_train, y_rhet_train, y_aspect_train, y_summary_train, y_citation_train = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/train/")
    x_dev, y_arg_dev, y_rhet_dev, y_aspect_dev, y_summary_dev, y_citation_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/dev/")
    print("Data loaded")
    # Join token lists into sentence strings and preprocess for TF-IDF.
    x_train_dev = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_train_dev]
    x_test = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_test]
    x_dev = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_dev]
    x_train = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_train]
    # Select the label columns for the task (exclude_class was set above;
    # the original re-assigned it redundantly here).
    if task == "discourse":
        y_train_dev = y_rhet_train_dev
        y_train = y_rhet_train
        y_dev = y_rhet_dev
        y_test = y_rhet_test
    elif task == "aspect":
        y_train_dev = y_aspect_train_dev
        y_train = y_aspect_train
        y_dev = y_aspect_dev
        y_test = y_aspect_test
    elif task == "summary":
        y_train_dev = y_summary_train_dev
        y_train = y_summary_train
        y_dev = y_summary_dev
        y_test = y_summary_test
    # Sentence-level classification: keep only the first label per sentence.
    y_train_dev = [sent[0] for sent in y_train_dev]
    y_test = [sent[0] for sent in y_test]
    y_dev = [sent[0] for sent in y_dev]
    y_train = [sent[0] for sent in y_train]
    print("Data prepared")
    labels = list(set(y_train_dev))
    y_train_dev = transform_classes_to_binary(y_train_dev, labels)
    y_test = transform_classes_to_binary(y_test, labels)
    y_dev = transform_classes_to_binary(y_dev, labels)
    y_train = transform_classes_to_binary(y_train, labels)
    # Grid search over the SVM regularization constant.
    possible_c = [0.1, 1.0, 10.0]
    print("Grid search configs are " + str(possible_c))
    best_f1 = 0.0
    best_c = ""
    # The TF-IDF fit does not depend on C and is deterministic: fit once
    # outside the loop instead of refitting for every configuration.
    tfidf_vectorizer = TfidfVectorizer()
    x_train_transformed = tfidf_vectorizer.fit_transform(x_train)
    x_dev_transformed = tfidf_vectorizer.transform(x_dev)
    for c in possible_c:
        print("Using config " + str(c))
        clf = OneVsRestClassifier(SVC(kernel='linear', C=c))
        clf.fit(x_train_transformed, y_train)
        # Predict on the dev set.
        y_pred = clf.predict(x_dev_transformed)
        confusion_matrix = ConfusionMatrix(labels=labels,
                                           gold=y_dev,
                                           predictions=y_pred,
                                           token_level=False,
                                           one_hot_encoding=True)
        confusion_matrix.compute_all_scores(exclude_class=exclude_class)
        if confusion_matrix.macrof1 > best_f1:
            best_f1 = confusion_matrix.macrof1
            best_c = c
        print(str(confusion_matrix.get_all_results()))
    # We found the best config — retrain on train+dev and evaluate on test.
    print("Best Config " + str(best_c))
    print("Best Macro F1 " + str(best_f1))
    tfidf_vectorizer = TfidfVectorizer()
    # Fit on train+dev.
    x_train_dev = tfidf_vectorizer.fit_transform(x_train_dev)
    clf = OneVsRestClassifier(SVC(kernel='linear', C=best_c))
    clf.fit(x_train_dev, y_train_dev)
    # Predict on the test set.
    x_test = tfidf_vectorizer.transform(x_test)
    y_pred = clf.predict(x_test)
    confusion_matrix = ConfusionMatrix(labels=labels,
                                       gold=y_test,
                                       predictions=y_pred,
                                       token_level=False,
                                       one_hot_encoding=True)
    confusion_matrix.compute_all_scores(exclude_class=exclude_class)
    print(str(confusion_matrix.get_all_results()))
    print("Total training time: " + str(time.time() - start))
def grid_search_rbf_svm_tfidf_embeddings(embd_dict=None, task=""):
    """Grid-search an RBF-kernel one-vs-rest SVM on TF-IDF-weighted embeddings.

    Supported tasks: "discourse", "aspect", "summary". (C, gamma) is tuned by
    macro F1 on the dev split; the winner is retrained on train+dev and scored
    on the test split. `embd_dict` maps tokens to embedding vectors.
    """
    print(
        "Running grid search for svm, rbf kernal, embeddings weighted tfidf, "
        + str(task))
    print("================================================================")
    # Validate the task up front so we fail before the (slow) data loading.
    if task == "discourse":
        exclude_class = "DRI_Unspecified"
    elif task == "aspect":
        exclude_class = "NONE"
    elif task == "summary":
        exclude_class = "NONE"
    else:
        print("No valid task name provided")
        exit()
    print("SVM script started")
    x_train_dev, y_arg_train_dev, y_rhet_train_dev, y_aspect_train_dev, y_summary_train_dev, y_citation_train_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/train_dev/")
    x_test, y_arg_test, y_rhet_test, y_aspect_test, y_summary_test, y_citation_test = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted/test/")
    x_train, y_arg_train, y_rhet_train, y_aspect_train, y_summary_train, y_citation_train = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/train/")
    x_dev, y_arg_dev, y_rhet_dev, y_aspect_dev, y_summary_dev, y_citation_dev = load_conll.load_data_multiple(
        path="./../../annotations_conll_final_splitted_with_val_split/dev/")
    print("Data loaded")
    # Join token lists into sentence strings and preprocess for TF-IDF.
    x_train_dev = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_train_dev]
    x_test = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_test]
    x_dev = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_dev]
    x_train = [utils.preprocess_string_tfidf(' '.join(sentence)) for sentence in x_train]
    # Select the label columns for the task (exclude_class was set above;
    # the original re-assigned it redundantly here).
    if task == "discourse":
        y_train_dev = y_rhet_train_dev
        y_train = y_rhet_train
        y_dev = y_rhet_dev
        y_test = y_rhet_test
    elif task == "aspect":
        y_train_dev = y_aspect_train_dev
        y_train = y_aspect_train
        y_dev = y_aspect_dev
        y_test = y_aspect_test
    elif task == "summary":
        y_train_dev = y_summary_train_dev
        y_train = y_summary_train
        y_dev = y_summary_dev
        y_test = y_summary_test
    # Sentence-level classification: keep only the first label per sentence.
    y_train_dev = [sent[0] for sent in y_train_dev]
    y_test = [sent[0] for sent in y_test]
    y_dev = [sent[0] for sent in y_dev]
    y_train = [sent[0] for sent in y_train]
    print("Data prepared")
    labels = list(set(y_train_dev))
    y_train_dev = transform_classes_to_binary(y_train_dev, labels)
    y_test = transform_classes_to_binary(y_test, labels)
    y_dev = transform_classes_to_binary(y_dev, labels)
    y_train = transform_classes_to_binary(y_train, labels)
    # Grid search over (C, gamma).
    possible_c = [0.1, 1.0, 10.0]
    possible_gamma = [0.01, 0.1, 1.0]
    configurations = list(itertools.product(possible_c, possible_gamma))
    print("Grid search configs: {!s:s}".format(configurations))
    best_f1 = 0.0
    best_conf = ""
    # The vectorizers do not depend on (C, gamma) and are deterministic:
    # fit/transform once outside the loop instead of per configuration.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer = tfidf_vectorizer.fit(x_train)
    embedding_vectorizer = utils.TfidfEmbeddingVectorizer(
        embds=embd_dict, tfidf_vectorizer=tfidf_vectorizer)
    x_train_transformed = embedding_vectorizer.transform(x_train)
    x_dev_transformed = embedding_vectorizer.transform(x_dev)
    for (c, gamma) in configurations:
        print("Using config " + str((c, gamma)))
        clf = OneVsRestClassifier(SVC(kernel='rbf', C=c, gamma=gamma))
        clf.fit(x_train_transformed, y_train)
        # Predict on the dev set.
        y_pred = clf.predict(x_dev_transformed)
        confusion_matrix = ConfusionMatrix(labels=labels,
                                           gold=y_dev,
                                           predictions=y_pred,
                                           token_level=False,
                                           one_hot_encoding=True)
        confusion_matrix.compute_all_scores(exclude_class=exclude_class)
        if confusion_matrix.macrof1 > best_f1:
            best_f1 = confusion_matrix.macrof1
            best_conf = (c, gamma)
        print(str(confusion_matrix.get_all_results()))
    # We found the best config — retrain on train+dev and evaluate on test.
    print("Best Config " + str(best_conf))
    print("Best Macro F1 " + str(best_f1))
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer = tfidf_vectorizer.fit(x_train_dev)
    embedding_vectorizer = utils.TfidfEmbeddingVectorizer(
        embds=embd_dict, tfidf_vectorizer=tfidf_vectorizer)
    # Fit on train+dev.
    x_train_dev = embedding_vectorizer.transform(x_train_dev)
    clf = OneVsRestClassifier(
        SVC(kernel='rbf', C=best_conf[0], gamma=best_conf[1]))
    clf.fit(x_train_dev, y_train_dev)
    # Predict on the test set.
    x_test = embedding_vectorizer.transform(x_test)
    y_pred = clf.predict(x_test)
    confusion_matrix = ConfusionMatrix(labels=labels,
                                       gold=y_test,
                                       predictions=y_pred,
                                       token_level=False,
                                       one_hot_encoding=True)
    confusion_matrix.compute_all_scores(exclude_class=exclude_class)
    print(str(confusion_matrix.get_all_results()))
def transform_classes_to_binary(y, labels_list):
    # One-hot encode labels: each label in each "sentence" of y becomes a
    # vector of length len(labels_list) with a 1 at the label's index.
    # NOTE(review): the CRF caller passes nested lists (sentence -> labels),
    # but the SVM callers pass a flat list of label strings, in which case the
    # inner loop iterates the *characters* of each label — verify the intended
    # input shape against the callers.
    labels_list = np.array(labels_list)
    y = np.array([
        np.array([
            np.array([
                1 if np.where(labels_list == label)[0] == i else 0
                for i in range(0, len(labels_list))
            ]) for label in sentence
        ]) for sentence in y
    ])
    return y


# --- Top-level HMM baseline script (may continue beyond this chunk) ---
print("HMM script started")
start = time.time()
x_train_dev, y_arg_train_dev, y_rhet_train_dev, y_aspect_train_dev, y_summary_train_dev, y_citation_train_dev = load_conll.load_data_multiple(
    path="./../annotations_conll_final_splitted/train_dev/")
x_test, y_arg_test, y_rhet_test, y_aspect_test, y_summary_test, y_citation_test = load_conll.load_data_multiple(
    path="./../annotations_conll_final_splitted/test/")
print("Data loaded")
# provide token-label tuples to the trainer (here: citation labels)
xy_train_dev = [
    list(zip(x_sent, y_sent))
    for (x_sent, y_sent) in list(zip(x_train_dev, y_citation_train_dev))
]
# Setup a trainer with default(None) values
# And train with the data
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(xy_train_dev)
#x_test = [list(x_sent) for x_sent in x_test]