def main(base_path, output_dir, nb_epochs):
    """Train a Flair NER sequence tagger on the CoNLL-03 corpus.

    Args:
        base_path: Directory handed to ``CONLL_03`` as ``base_path``.
        output_dir: Directory handed to ``ModelTrainer.train`` as the
            training output location.
        nb_epochs: Value passed to the trainer as ``max_epochs``.
    """
    tag_type = 'ner'

    # Load the corpus and derive the tag inventory from it.
    corpus: Corpus = CONLL_03(base_path=base_path)
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # GloVe word vectors stacked with min-pooled forward and backward
    # Flair embeddings.
    glove = WordEmbeddings('glove')
    flair_forward = PooledFlairEmbeddings('news-forward', pooling='min')
    flair_backward = PooledFlairEmbeddings('news-backward', pooling='min')
    embedding_types: List[TokenEmbeddings] = [
        glove,
        flair_forward,
        flair_backward,
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # NOTE(review): the original carried a trailing "# 150" note on this
    # call -- presumably an alternative epoch count; confirm with the author.
    trainer.train(output_dir, train_with_dev=False, max_epochs=nb_epochs)
def main(data_dir, model_path, result_file):
    """Evaluate a saved sequence tagger on the dev split of CoNLL-03.

    Args:
        data_dir: Directory handed to ``CONLL_03`` as ``base_path``.
        model_path: Directory that contains ``final-model.pt``.
        result_file: Passed to ``model.evaluate`` as ``out_path``.
    """
    checkpoint = model_path + '/final-model.pt'
    model = SequenceTagger.load(checkpoint)

    corpus: Corpus = CONLL_03(base_path=data_dir)
    # The dev split is what gets evaluated, even though the printed labels
    # below say "TEST".
    testdata = corpus.dev

    test_result, test_loss = model.evaluate(testdata, out_path=result_file)

    # Per the original notes: the main score is the micro-averaged F1 and
    # the log line holds precision, recall and the micro-averaged score.
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    rounded_score = round(test_result.main_score, 4)

    print(f"TEST : loss {test_loss} - score {rounded_score}")
    print(f"TEST RESULT: {result_line}")
    print(test_result.detailed_results)
def flairInfer(model_path, test_or_train,
               data_dir='../GmbDataExperimentation/processed_data/1500_data'):
    """Evaluate a saved Flair tagger on the train or test split of a corpus.

    The corpus is loaded with ``CONLL_03`` first; if that fails, the
    directory is read as a two-column (``text``, ``ner``) ``ColumnCorpus``.

    Args:
        model_path: Directory that contains ``final-model.pt``.
        test_or_train: ``'train'`` evaluates the train split and writes
            ``train.tsv``; any other value evaluates the test split and
            writes ``test.tsv``.
        data_dir: Corpus directory; predictions are written into it.
            Defaults to the path that used to be hard-coded here.
    """
    from flair.data import Corpus
    from flair.datasets import CONLL_03
    from flair.datasets import ColumnCorpus
    from flair.models import SequenceTagger

    model = SequenceTagger.load(model_path + '/final-model.pt')

    try:
        corpus: Corpus = CONLL_03(base_path=data_dir)
    except Exception:
        # Best-effort fallback kept from the original, but no longer a bare
        # ``except`` (which also swallowed KeyboardInterrupt/SystemExit).
        columns = {0: 'text', 1: 'ner'}
        corpus = ColumnCorpus(data_dir, columns)

    if test_or_train == 'train':
        testdata = corpus.train
        result_file = data_dir + '/train.tsv'
    else:
        testdata = corpus.test
        result_file = data_dir + '/test.tsv'

    test_result, test_loss = model.evaluate(testdata, out_path=result_file)
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(
        f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT : {result_line}")
def main():
    """Command-line entry point: evaluate a saved tagger on one corpus split.

    Options:
        --data_dir: corpus directory (default ``./``).
        --model_dir: directory containing ``final-model.pt`` (required).
        --result_file: passed to ``model.evaluate`` as ``out_path``
            (default ``dev.tsv``).
        --eval_on: ``dev``, ``test`` or ``train`` (default ``dev``).

    Raises:
        ValueError: if ``--eval_on`` is not one of the accepted values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default='./',
        type=str,
        help=
        "The parent dir of input data, should include folder named `conll_03` ")
    parser.add_argument("--model_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The model directory where model chekpoints stored")
    # ``required=True`` was dropped from the next two options: combined with
    # a default it made the default unreachable.
    parser.add_argument(
        "--result_file",
        default='dev.tsv',
        type=str,
        help=
        "The name of prediction file, default is in the same dir of script file")
    parser.add_argument("--eval_on",
                        default='dev',
                        type=str,
                        help="Whether to eval on dev set or test set")
    args = parser.parse_args()

    model_path = args.model_dir
    data_dir = args.data_dir
    model = SequenceTagger.load(model_path + '/final-model.pt')

    try:
        corpus: Corpus = CONLL_03(base_path=data_dir)
    except Exception:
        # Fall back to a plain two-column corpus when the CoNLL-03 loader
        # cannot read the directory (previously a bare ``except: pass``).
        columns = {0: 'text', 1: 'ner'}
        corpus = ColumnCorpus(data_dir, columns)

    if args.eval_on == 'dev':
        testdata = corpus.dev
    elif args.eval_on == 'test':
        testdata = corpus.test
    elif args.eval_on == 'train':
        print('You are evaluating on training set!')
        testdata = corpus.train
    else:
        raise ValueError(
            "Invalid argument, must specify evaluation on dev, test or train "
            f"(got {args.eval_on!r})")

    test_result, test_loss = model.evaluate(testdata,
                                            out_path=args.result_file)
    # Per the original notes: the main score is the micro-averaged F1 and
    # the log line holds precision, recall and the micro-averaged score.
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT: {result_line}")
    print(test_result.detailed_results)
def trainNER(
        data_dir,
        model_dir,
        pretrained_model_path='/home/carolyn/Projects/mygit/Flair-NER/exprmt-20201120/conll_frac/10ptdata/models-5e-20201124/final-model.pt'):
    """Continue training a saved sequence tagger on an NER corpus.

    The corpus is loaded with ``CONLL_03``; when that raises
    ``FileNotFoundError`` the directory is read as a two-column
    (``text``, ``ner``) ``ColumnCorpus`` instead.

    The ``--model`` command-line option selects the transformer used to
    build the embedding stack (default ``bert-base-cased``).

    Args:
        data_dir: Corpus directory.
        model_dir: Output directory handed to ``ModelTrainer.train``.
        pretrained_model_path: Checkpoint loaded with
            ``SequenceTagger.load``. Defaults to the path that used to be
            hard-coded in the body.
    """
    parser = argparse.ArgumentParser()
    # ``required=True`` was dropped: together with a default it made the
    # default unreachable.
    parser.add_argument("--model",
                        default='bert-base-cased',
                        type=str,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model

    try:
        corpus: Corpus = CONLL_03(base_path=data_dir + '/')
    except FileNotFoundError:
        columns = {0: 'text', 1: 'ner'}
        corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()

    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # Example tag inventory noted by the original author:
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE',
    #  'B-APPLICATION', 'I-APPLICATION', 'B-LOCATION', 'I-LOCATION',
    #  '<START>', '<STOP>']

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',  # dtype: str
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # NOTE(review): ``embeddings`` and ``tag_dictionary`` are only consumed
    # by the from-scratch biLSTM + CRF variant, which is disabled:
    #     SequenceTagger(hidden_size=256, embeddings=embeddings,
    #                    tag_dictionary=tag_dictionary, tag_type='ner')
    # The tagger actually trained below comes from a checkpoint.
    tagger: SequenceTagger = SequenceTagger.load(pretrained_model_path)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # NOTE(review): original trailing note here was "# 150" -- presumably an
    # alternative epoch count; confirm with the author.
    trainer.train(model_dir, train_with_dev=False, max_epochs=10)
def train_flair(self):
    """Build and train a Flair NER tagger on the data under ``results/10``.

    The corpus is read from ``<cwd>/results/10`` with ``CONLL_03``, the
    model is written to ``<cwd>/results/10/tagger``, and the value returned
    by ``ModelTrainer.train`` is printed.
    """
    # 1. Corpus. (A ColumnCorpus variant with explicit train.txt / test.txt /
    #    valid.txt files and a space delimiter was left disabled by the
    #    original author.)
    corpus_dir = os.path.join(os.getcwd(), 'results', '10')
    corpus: Corpus = CONLL_03(base_path=corpus_dir)
    # NOTE(review): this attribute is assigned after the corpus object has
    # been constructed -- verify it actually affects which dev file is read.
    corpus.dev_file = 'valid.txt'  # rather than 'dev.txt'

    # 2. Tag to predict, and 3. the tag dictionary built from the corpus.
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Embedding stack: GloVe plus forward and backward contextual string
    # embeddings with min pooling.
    glove = WordEmbeddings('glove')
    flair_forward = PooledFlairEmbeddings('news-forward', pooling='min')
    flair_backward = PooledFlairEmbeddings('news-backward', pooling='min')
    embedding_types: List[TokenEmbeddings] = [
        glove,
        flair_forward,
        flair_backward,
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # Sequence tagger and trainer.
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    output_dir = os.path.join(os.getcwd(), 'results', '10', 'tagger')
    results = trainer.train(output_dir, train_with_dev=False, max_epochs=50)
    print(results)
def testModel(model_dir, test_sent=None, test_file_dir=None): """ model_dir: directory contains 'final_model.pt' test_sent: one sentence to test test_file: one file of sentences to test """ if test_sent and test_file_dir: raise Exception( "Argument conflicts, only one type of testing method is allowed.") elif not test_sent and not test_file_dir: raise Exception( "Argument invalid, at least one testing method is required") model_path = model_dir + '/final-model.pt' model = SequenceTagger.load(model_path) if test_sent: print('Predicting in singular mode') test_sent = Sentence(test_sent) model.predict(test_sent) print(test_sent.to_tagged_string()) if test_file_dir: print('Predicting in plural mode') try: corpus = CONLL_03(base_path=test_file_dir + '/') test_data = corpus.test except: try: columns = {0: 'text', 1: 'ner'} corpus = ColumnCorpus(test_file_dir, columns) test_data = corpus.test except AttributeError: raise Exception( 'Directory must contain `test.txt` file, one column `text` the other `ner`' ) test_result, test_loss = model.evaluate(test_data, out_path=test_file_dir + '/test_20201210.tsv') result_line = f"\t{test_loss}\t{test_result.log_line}" print( f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}" ) print(f"TEST RESULT : {result_line}") print('end')
# Zero-shot evaluation: load a TARS tagger checkpoint, register a new
# CoNLL-03 task on it, and score it on a 10% sample of the corpus.
tagger = TARSSequenceTagger2.load(
    "resources/v3/moviecomplex-long/final-model.pt")

flair.set_seed(3)

# Short CoNLL-03 tag names mapped to spelled-out label names.
label_name_map = {
    "LOC": "Location",
    "PER": "Person",
    "ORG": "Organization",
    "MISC": "Miscellaneous",
}
print(label_name_map)

corpus = CONLL_03(
    tag_to_bioes=None,
    tag_to_bio2="ner",
    label_name_map=label_name_map,
    base_path="/vol/fob-vol7/mi19/harnisph/studienprojekt-dokumentation",
)
corpus = corpus.downsample(0.1)

tag_type = "ner"
tag_dictionary = corpus.make_label_dictionary(tag_type)

# Register the new task on the loaded tagger and make it the active one.
tagger.add_and_switch_to_new_task(
    "zeroshot-moviecomplex-long-to-conll3",
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
)

result, eval_loss = tagger.evaluate(corpus.test)

# Report the score, the log header/line, the detailed results and the loss.
print(result.main_score)
print(result.log_header)
print(result.log_line)
print(result.detailed_results)
print(eval_loss)
from flair.datasets import CONLL_03
from mapping import (twitter_ner_mapped, onto_ner_mapped, wikigold_ner_mapped,
                     webpages_ner_mapped)

# Name of the corpus to run on; one of "onto_ner", "conll3", "wikipedia",
# "webpages" or "twitter".
dataset_name = "conll3"

for seed in [1, 2, 3]:
    # The seed is fixed to 123 while the corpus is built and only then
    # switched to the per-run seed.
    flair.set_seed(123)

    if dataset_name == "onto_ner":
        corpus = onto_ner_mapped()
    elif dataset_name == "conll3":
        corpus = CONLL_03()
    elif dataset_name == "wikipedia":
        corpus = wikigold_ner_mapped()
    elif dataset_name == "webpages":
        corpus = webpages_ner_mapped()
    elif dataset_name == "twitter":
        corpus = twitter_ner_mapped()
    else:
        # Without this branch an unknown name left ``corpus`` unbound and
        # the script failed further down with a NameError.
        raise ValueError(f"Unknown dataset_name: {dataset_name!r}")

    flair.set_seed(seed)

    # 2. what tag do we want to predict?
    tag_type = "ner"

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)
def load_corpus(corpus_name,
                col_idx,
                text_idx,
                tag_type='ner',
                downsample_perc=1.0,
                name_train=None,
                name_dev=None,
                name_test=None,
                verbose=False):
    """
    Loads a corpus with a given name. Optionally performs downsampling of the data.

    Parameters:
        corpus_name (str): name of the corpus. "conll03_en", "conll03_de" and
            "germeval" use the matching flair dataset classes; any other name
            is read as a ColumnCorpus from 'resources/tasks/<corpus_name>'
        col_idx (int): index of the column holding the tag
        text_idx (int): index of the column holding the text
        tag_type (str): type of the tag to load
        downsample_perc (float): downsample rate; downsampling is applied
            only when 0.0 <= downsample_perc < 1.0 (1.0 = full corpus)
        name_train (str): name of a file containing the train set
        name_dev (str): name of a file containing the development set
        name_test (str): name of a file containing the test set
        verbose (bool): when True, log the corpus statistics

    Returns:
        ColumnCorpus: the loaded corpus

    Terminates the process with EXIT_FAILURE when the directory of a custom
    corpus does not exist.
    """
    from pathlib import Path

    data_dir = 'resources/tasks/'

    if corpus_name in ["conll03_en"]:
        from flair.datasets import CONLL_03
        corpus = CONLL_03(base_path=Path(data_dir), tag_to_bioes=tag_type)
    elif corpus_name in ["conll03_de"]:
        from flair.datasets import CONLL_03_GERMAN
        corpus = CONLL_03_GERMAN(base_path=Path(data_dir),
                                 tag_to_bioes=tag_type)
    elif corpus_name in ["germeval"]:
        from flair.datasets import GERMEVAL
        corpus = GERMEVAL(base_path=Path(data_dir), tag_to_bioes=tag_type)
    else:
        corpus_dir = f"{data_dir}{corpus_name}"
        if not os.path.exists(corpus_dir):
            log.error(f"Data directory '{corpus_dir}' does not exists!")
            # ``sys.exit`` instead of the bare ``exit`` helper, which is
            # injected by the ``site`` module and is not always available.
            sys.exit(EXIT_FAILURE)

        from flair.datasets import ColumnCorpus
        columns = {text_idx: 'text', col_idx: tag_type}
        train_set = None if name_train is None else f'{name_train}'
        dev_set = None if name_dev is None else f'{name_dev}'
        test_set = None if name_test is None else f'{name_test}'
        corpus: ColumnCorpus = ColumnCorpus(corpus_dir,
                                            columns,
                                            train_file=train_set,
                                            test_file=test_set,
                                            dev_file=dev_set,
                                            tag_to_bioes=tag_type)

    if 0.0 <= downsample_perc < 1.0:
        # NOTE(review): the return value is ignored, so this relies on
        # ``downsample`` modifying the corpus in place -- confirm for the
        # flair version in use.
        corpus.downsample(downsample_perc)

    if verbose:
        log.info(corpus.obtain_statistics(tag_type=tag_type))

    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))
    return corpus
# -*- coding: utf-8 -*-
# Score a saved English NER tagger on the CoNLL-2003 test split.
from flair.datasets import CONLL_03, DataLoader
from flair.models import SequenceTagger

if __name__ == "__main__":
    corpus = CONLL_03(base_path="data/conll-2003")
    tagger = SequenceTagger.load("models/en-ner-conll03-v0.4.pt")

    # NOTE: the result variables are named "dev_*", but the loader below is
    # built from the *test* split.
    test_loader = DataLoader(corpus.test, batch_size=64, num_workers=8)
    dev_eval_result, dev_loss = tagger.evaluate(test_loader)

    print(dev_eval_result.main_score)
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.models import SequenceTagger

# Corpus under resources/tasks and the locally trained tagger checkpoint.
corpus: Corpus = CONLL_03(base_path='resources/tasks')
tagger: SequenceTagger = SequenceTagger.load(
    'resources/taggers/akbik2018-ner/final-model.pt')

# To use the author-provided tagger (model) instead, load it by name:
# tagger: SequenceTagger = SequenceTagger.load('ner')

# NOTE(review): the test split is wrapped in a one-element list before being
# handed to ``evaluate`` -- confirm this matches the ``evaluate`` signature
# of the flair version in use.
result, _ = tagger.evaluate([corpus.test])

print(result.detailed_results)
# Remaining command-line options (``parser`` is created earlier).
parser.add_argument('--patience', default=3, type=int)
parser.add_argument("--use-crf", action="store_true", help="use crf layer")
parser.add_argument("--debug", action="store_true", help="debug")
parser.add_argument(
    "--log-file",
    default="./code/flair/resources/tasks/conll_03/test_results.csv",
    type=str,
    help="the file to store resutls",
)
args = parser.parse_args()

# 1. Corpus.
corpus: Corpus = CONLL_03(
    base_path='./code/flair/resources/tasks',
    tag_to_bioes='ner',
)

# Subsample: keep only the first ``args.train_examples`` training sentences
# and set the recorded sentence count to the same number.
corpus.train.sentences = corpus.train.sentences[:args.train_examples]
corpus.train.total_sentence_count = args.train_examples

# kNN index file; its name carries the number of training examples.
args.knn_idx_file = './code/flair/resources/tasks/conll_03/sent_id_knn{}.pkl'.format(
    args.train_examples)

# 2. Tag to predict.
tag_type = 'ner'

# 3. Tag dictionary built from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
from flair.data import Sentence
from flair.data import MultiCorpus
from flair.datasets import CONLL_03

flair.set_seed(1)

# Short CoNLL-03 tag names mapped to spelled-out label names. (The original
# author left a disabled variant that mapped BIOES-prefixed tags instead,
# e.g. "S-LOC" -> "Single Location", "B-PER" -> "Begin Person".)
label_name_map = {
    "LOC": "Location",
    "PER": "Person",
    "ORG": "Organization",
    "MISC": "Miscellaneous",
}
print(label_name_map)

corpus = CONLL_03(
    label_name_map=label_name_map,
    base_path="/vol/fob-vol7/mi19/harnisph/studienprojekt-dokumentation",
)
print(corpus)

# Continue with a 10% sample of the corpus.
corpus = corpus.downsample(0.1)
print(corpus)

tag_type = "ner"
label_dictionary = corpus.make_label_dictionary(tag_type)
print(label_dictionary)

# Disabled alternative: embeddings = WordEmbeddings("glove")
# NOTE(review): ``embeddings`` is only referenced by the disabled
# SequenceTagger variant below; the TARS tagger is built without it.
embeddings = TransformerWordEmbeddings()

tagger = TARSSequenceTagger(
    tag_dictionary=label_dictionary,
    tag_type=tag_type,
    task_name="TEST_NER",
)
# Disabled alternative:
# tagger = SequenceTagger(tag_dictionary=corpus.make_tag_dictionary(tag_type),
#                         tag_type=tag_type, hidden_size=256,
#                         embeddings=embeddings)

trainer = ModelTrainer(tagger, corpus)
# Evaluate a trained model to check whether a local modification is correct.
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.data import Corpus
from flair.datasets import CONLL_03

model = SequenceTagger.load(
    'reproduce_ner_10epochs/taggers/sota-ner/final-model.pt')

corpus: Corpus = CONLL_03(base_path='reproduce_ner_10epochs/tasks')
testdata = corpus.test

# Predictions are written to test.tsv in the working directory.
test_result, test_loss = model.evaluate(testdata, out_path='test.tsv')

# Per the original notes: the main score is the micro-averaged F1 and the
# log line holds precision, recall and the micro-averaged score.
result_line = f"\t{test_loss}\t{test_result.log_line}"
rounded_score = round(test_result.main_score, 4)

print(f"TEST : loss {test_loss} - score {rounded_score}")
print(f"TEST RESULT: {result_line}")