Example #1
from typing import List

from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import (PooledFlairEmbeddings, StackedEmbeddings,
                              TokenEmbeddings, WordEmbeddings)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


def main(base_path, output_dir, nb_epochs):
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--data_dir", default='./', type=str, required=True, help="The parent dir of inpu data, must contain folder name `conll_03`")
    # parser.add_argument("--output_dir", default=None, required=True, help="The output directory where is going to store the trained model")
    # parser.add_argument("--train_epochs", default=3, type=int, required=True, help="Number of epochs to train")
    # args = parser.parse_args()
    # base_path = args.data_dir
    corpus: Corpus = CONLL_03(base_path=base_path)
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        PooledFlairEmbeddings('news-forward', pooling='min'),
        PooledFlairEmbeddings('news-backward', pooling='min'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # output_dir = args.output_dir
    # nb_epochs = args.train_epochs
    # output_dir =
    # nb_epochs = 10
    trainer.train(output_dir, train_with_dev=False,
                  max_epochs=nb_epochs)  # 150
Example #2
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.models import SequenceTagger


def main(data_dir, model_path, result_file):
  # parser = argparse.ArgumentParser()
  # parser.add_argument("--data_dir", default='./', type=str, help="The parent dir of input data, should include folder named `conll_03` ")
  # parser.add_argument("--model_dir", default=None, type=str, required=True, help="The model directory where model chekpoints stored")
  # parser.add_argument("--result_file", default='dev.tsv', type=str, required=True, help="The name of prediction file, default save in current dir")
  # parser.add_argument("--eval_on", default='dev', type=str, required=True, help="Whether to eval on dev set or test set")

  # args = parser.parse_args()

  # model_path = args.model_dir
  model = SequenceTagger.load(model_path + '/final-model.pt')

  # corpus: Corpus = CONLL_03(base_path=args.data_dir)
  corpus: Corpus = CONLL_03(base_path=data_dir)
  testdata = corpus.dev

  # test_result, test_loss = model.evaluate(testdata, out_path=args.result_file)
  test_result, test_loss = model.evaluate(testdata, out_path=result_file)
  result_line = f"\t{test_loss}\t{test_result.log_line}"

  # main score is micro averaged f1 score
  # result line is precision, recall, micro averaged score

  print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
  print(f"TEST RESULT: {result_line}")
  print(test_result.detailed_results)
Example #3
def flairInfer(model_path, test_or_train):
    from flair.data import Sentence
    from flair.models import SequenceTagger
    from flair.data import Corpus
    from flair.datasets import CONLL_03
    from flair.datasets import ColumnCorpus

    model = SequenceTagger.load(model_path + '/final-model.pt')
    data_dir = '../GmbDataExperimentation/processed_data/1500_data'
    try:
        corpus: Corpus = CONLL_03(base_path=data_dir)
    except Exception:
        # fall back to a generic two-column corpus if the CoNLL-03 layout is not found
        columns = {0: 'text', 1: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir, columns)
    if test_or_train == 'train':
        testdata = corpus.train
        result_file = data_dir + '/train.tsv'
    else:
        testdata = corpus.test
        result_file = data_dir + '/test.tsv'

    test_result, test_loss = model.evaluate(testdata, out_path=result_file)
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(
        f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT : {result_line}")
Example #4
import argparse

from flair.data import Corpus
from flair.datasets import CONLL_03, ColumnCorpus
from flair.models import SequenceTagger


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--data_dir",
      default='./',
      type=str,
      help="The parent dir of input data; should include a folder named `conll_03`")
  parser.add_argument("--model_dir",
                      default=None,
                      type=str,
                      required=True,
                      help="The model directory where model chekpoints stored")
  parser.add_argument(
      "--result_file",
      default='dev.tsv',
      type=str,
      required=True,
      help="The name of the prediction file; by default it is saved in the script's directory")
  parser.add_argument("--eval_on",
                      default='dev',
                      type=str,
                      required=True,
                      help="Whether to eval on dev set or test set")

  args = parser.parse_args()

  model_path = args.model_dir
  data_dir = args.data_dir
  model = SequenceTagger.load(model_path + '/final-model.pt')
  try:
    corpus: Corpus = CONLL_03(base_path=data_dir)
  except Exception:
    # fall back to a generic two-column corpus if the CoNLL-03 layout is not found
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(data_dir, columns)

  if args.eval_on == 'dev':
    testdata = corpus.dev
  elif args.eval_on == 'test':
    testdata = corpus.test
  elif args.eval_on == 'train':
    print('You are evaluating on training set!')
    testdata = corpus.train
  else:
    raise ValueError("Invalid argument, must specify evaluation on dev or test")

  test_result, test_loss = model.evaluate(testdata, out_path=args.result_file)
  result_line = f"\t{test_loss}\t{test_result.log_line}"

  # main score is micro averaged f1 score
  # result line is precision, recall, micro averaged score

  print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
  print(f"TEST RESULT: {result_line}")
  print(test_result.detailed_results)
Example #5
import argparse
from typing import List

from flair.data import Corpus
from flair.datasets import CONLL_03, ColumnCorpus
from flair.embeddings import (StackedEmbeddings, TokenEmbeddings,
                              TransformerWordEmbeddings, WordEmbeddings)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


def trainNER(data_dir, model_dir):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default='bert-base-cased',
                        type=str,
                        required=True,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model

    # pdb.set_trace()
    try:
        corpus: Corpus = CONLL_03(base_path=data_dir + '/')
    except FileNotFoundError:
        columns = {0: 'text', 1: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()
    tag_type = 'ner'
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE', 'B-APPLICATION', 'I-APPLICATION', 'B-LOCATION', 'I-LOCATION', '<START>', '<STOP>']
    # pdb.set_trace()

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',  # dtype: str
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # biLSTM + CRF
    # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
    #                                         embeddings=embeddings,
    #                                         tag_dictionary=tag_dictionary,
    #                                         tag_type=tag_type)

    model_path = '/home/carolyn/Projects/mygit/Flair-NER/exprmt-20201120/conll_frac/10ptdata/models-5e-20201124/final-model.pt'
    tagger: SequenceTagger = SequenceTagger.load(model_path)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_dir, train_with_dev=False, max_epochs=10)  # 150
Example #6
    def train_flair(self):

        # Flair Model Initialisation and Training
        # # 1. get the corpus
        # corpus: Corpus = ColumnCorpus(os.path.join(os.getcwd(), 'results', '10'),
        #                               {0: 'text', 1: 'ner'},
        #                               train_file='train.txt',
        #                               test_file='test.txt',
        #                               dev_file='valid.txt',
        #                               column_delimiter=' ')

        corpus: Corpus = CONLL_03(
            base_path=os.path.join(os.getcwd(), 'results', '10'))

        corpus.dev_file = 'valid.txt'  # rather than 'dev.txt'

        # 2. what tag do we want to predict?
        tag_type = 'ner'
        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        # initialize embeddings
        embedding_types: List[TokenEmbeddings] = [
            # GloVe embeddings
            WordEmbeddings('glove'),
            # contextual string embeddings, forward
            PooledFlairEmbeddings('news-forward', pooling='min'),
            # contextual string embeddings, backward
            PooledFlairEmbeddings('news-backward', pooling='min'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        # initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type)

        # initialize trainer
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        results = trainer.train(os.path.join(os.getcwd(), 'results', '10',
                                             'tagger'),
                                train_with_dev=False,
                                max_epochs=50)

        print(results)
Example #7
from flair.data import Sentence
from flair.datasets import CONLL_03, ColumnCorpus
from flair.models import SequenceTagger


def testModel(model_dir, test_sent=None, test_file_dir=None):
    """
    model_dir: directory containing 'final-model.pt'
    test_sent: a single sentence to tag
    test_file_dir: directory containing a test file to evaluate on
    (see the usage sketch after this function)
    """
    if test_sent and test_file_dir:
        raise Exception(
            "Argument conflicts, only one type of testing method is allowed.")
    elif not test_sent and not test_file_dir:
        raise Exception(
            "Argument invalid, at least one testing method is required")

    model_path = model_dir + '/final-model.pt'
    model = SequenceTagger.load(model_path)

    if test_sent:
        print('Predicting in singular mode')
        test_sent = Sentence(test_sent)
        model.predict(test_sent)
        print(test_sent.to_tagged_string())

    if test_file_dir:
        print('Predicting in plural mode')
        try:
            corpus = CONLL_03(base_path=test_file_dir + '/')
            test_data = corpus.test
        except Exception:
            try:
                columns = {0: 'text', 1: 'ner'}
                corpus = ColumnCorpus(test_file_dir, columns)
                test_data = corpus.test
            except AttributeError:
                raise Exception(
                    'Directory must contain `test.txt` file, one column `text` the other `ner`'
                )

        test_result, test_loss = model.evaluate(test_data,
                                                out_path=test_file_dir +
                                                '/test_20201210.tsv')
        result_line = f"\t{test_loss}\t{test_result.log_line}"
        print(
            f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}"
        )
        print(f"TEST RESULT : {result_line}")

    print('end')
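A minimal usage sketch for testModel, shown only as an illustration: the model directory `trained-model` and the data directory `sample_data` are hypothetical placeholders, not paths from the original code.
# Hypothetical paths; testModel is defined above.
if __name__ == '__main__':
    # single-sentence mode: tag one sentence and print the tagged string
    testModel('trained-model', test_sent='George Washington went to Washington.')
    # batch mode: evaluate on a two-column `test.txt` inside the given directory
    testModel('trained-model', test_file_dir='sample_data')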
Example #8
tagger = TARSSequenceTagger2.load(
    "resources/v3/moviecomplex-long/final-model.pt")

flair.set_seed(3)

label_name_map = {
    "LOC": "Location",
    "PER": "Person",
    "ORG": "Organization",
    "MISC": "Miscellaneous"
}
print(label_name_map)
corpus = CONLL_03(
    tag_to_bioes=None,
    tag_to_bio2="ner",
    label_name_map=label_name_map,
    base_path="/vol/fob-vol7/mi19/harnisph/studienprojekt-dokumentation")
corpus = corpus.downsample(0.1)
tag_type = "ner"
tag_dictionary = corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("zeroshot-moviecomplex-long-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)
result, eval_loss = tagger.evaluate(corpus.test)
print(result.main_score)
print(result.log_header)
print(result.log_line)
print(result.detailed_results)
print(eval_loss)
Example #9
# flair.set_seed(2)
# flair.set_seed(3)
from flair.datasets import CONLL_03
from mapping import (twitter_ner_mapped, onto_ner_mapped, wikigold_ner_mapped,
                     webpages_ner_mapped)

dataset_name = "conll3"

for seed in [1, 2, 3]:
    flair.set_seed(123)

    if dataset_name == "onto_ner":

        corpus = onto_ner_mapped()
    elif dataset_name == "conll3":
        corpus = CONLL_03()
    elif dataset_name == "wikipedia":
        corpus = wikigold_ner_mapped()
    elif dataset_name == "webpages":
        corpus = webpages_ner_mapped()
    elif dataset_name == "twitter":
        corpus = twitter_ner_mapped()

    flair.set_seed(seed)

    # 2. what tag do we want to predict?
    tag_type = "ner"

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)
Example #10
def load_corpus(corpus_name,
                col_idx,
                text_idx,
                tag_type='ner',
                downsample_perc=1.0,
                name_train=None,
                name_dev=None,
                name_test=None,
                verbose=False):
    """
    Loads a corpus with a given name.
    Optionally performs downsampling of the data.

    Parameters:
        corpus_name (str): name of the corpus used to load proper embeddings
        col_idx (int): index of the tag column
        text_idx (int): index of the text column
        tag_type (str): type of the tag to load
        downsample_perc (float): downsample rate (1.0 = full corpus)
        name_train (str): name of a file containing the train set
        name_dev (str): name of a file containing the development set
        name_test (str): name of a file containing the test set
        verbose (bool): if True, log corpus statistics

    Returns:
        ColumnCorpus: the loaded corpus (see the usage sketch after this function)
    """

    from pathlib import Path

    data_dir = 'resources/tasks/'

    if corpus_name in ["conll03_en"]:
        from flair.datasets import CONLL_03
        corpus = CONLL_03(base_path=Path(data_dir), tag_to_bioes=tag_type)
    elif corpus_name in ["conll03_de"]:
        from flair.datasets import CONLL_03_GERMAN
        corpus = CONLL_03_GERMAN(base_path=Path(data_dir),
                                 tag_to_bioes=tag_type)
    elif corpus_name in ["germeval"]:
        from flair.datasets import GERMEVAL
        corpus = GERMEVAL(base_path=Path(data_dir), tag_to_bioes=tag_type)
    else:
        corpus_dir = f"{data_dir}{corpus_name}"
        if not os.path.exists(corpus_dir):
            log.error(f"Data directory '{corpus_dir}' does not exists!")
            exit(EXIT_FAILURE)

        from flair.datasets import ColumnCorpus

        columns = {text_idx: 'text', col_idx: tag_type}
        train_set = None if name_train is None else f'{name_train}'
        dev_set = None if name_dev is None else f'{name_dev}'
        test_set = None if name_test is None else f'{name_test}'

        corpus: ColumnCorpus = ColumnCorpus(corpus_dir,
                                            columns,
                                            train_file=train_set,
                                            test_file=test_set,
                                            dev_file=dev_set,
                                            tag_to_bioes=tag_type)

    if downsample_perc >= 0.0 and downsample_perc < 1.0:
        corpus.downsample(downsample_perc)

    if verbose:
        log.info(corpus.obtain_statistics(tag_type=tag_type))

    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))

    return corpus
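A minimal usage sketch for load_corpus, assuming a hypothetical custom corpus stored under resources/tasks/my_corpus/ with text in column 0 and NER tags in column 1; the corpus name and file names below are illustrative only.
# Illustrative call only: load a hypothetical two-column corpus and downsample it to 10%.
corpus = load_corpus('my_corpus',
                     col_idx=1,
                     text_idx=0,
                     tag_type='ner',
                     downsample_perc=0.1,
                     name_train='train.txt',
                     name_dev='dev.txt',
                     name_test='test.txt',
                     verbose=True)
print(corpus)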
Example #11
# -*- coding: utf-8 -*-
from flair.datasets import CONLL_03, DataLoader
from flair.models import SequenceTagger

if __name__ == "__main__":
    corpus = CONLL_03(base_path="data/conll-2003")

    tagger = SequenceTagger.load("models/en-ner-conll03-v0.4.pt")

    dev_eval_result, dev_loss = tagger.evaluate(
        DataLoader(corpus.test, batch_size=64, num_workers=8))

    print(dev_eval_result.main_score)
Example #12
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.models import SequenceTagger

# Load the corpus
corpus: Corpus = CONLL_03(base_path='resources/tasks')

tagger: SequenceTagger = SequenceTagger.load('resources/taggers/akbik2018-ner/final-model.pt')

# Uncomment the following to use the author provided tagger (model)
# tagger: SequenceTagger = SequenceTagger.load('ner')

result, _ = tagger.evaluate(corpus.test)

print(result.detailed_results)
Example #13
parser.add_argument('--patience', default=3, type=int)

parser.add_argument("--use-crf", action="store_true", help="use crf layer")

parser.add_argument("--debug", action="store_true", help="debug")

parser.add_argument(
    "--log-file",
    default="./code/flair/resources/tasks/conll_03/test_results.csv",
    type=str,
    help="the file to store resutls")

args = parser.parse_args()

# 1. get the corpus
corpus: Corpus = CONLL_03(base_path='./code/flair/resources/tasks',
                          tag_to_bioes='ner')

# subsampling the corpus
corpus.train.sentences = corpus.train.sentences[0:args.train_examples]
corpus.train.total_sentence_count = args.train_examples

# knn file
args.knn_idx_file = './code/flair/resources/tasks/conll_03/sent_id_knn{}.pkl'.format(
    args.train_examples)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
Example #14
from flair.data import Sentence
from flair.data import MultiCorpus
from flair.datasets import CONLL_03

flair.set_seed(1)

#label_name_map = {
#"S-LOC":"Single Location", "B-PER":"Begin Person", "E-PER":"End Person", "S-ORG":"Single Organization", "S-PER":"Single Person", "B-ORG":"Begin Organization", "E-ORG":"End Organization", "I-ORG":"In Organization", 
#"S-MISC":"Single Miscellaneous", "B-MISC":"Begin Miscellaneous", "E-MISC":"End Miscellaneous", "I-PER":"In Person", "B-LOC":"Begin Location", "E-LOC":"End Location", "I-MISC":"In Miscellaneous", "I-LOC":"In Location"
#}
label_name_map = {
    "LOC": "Location",
    "PER": "Person",
    "ORG": "Organization",
    "MISC": "Miscellaneous"
}

print(label_name_map)
corpus = CONLL_03(label_name_map=label_name_map, base_path="/vol/fob-vol7/mi19/harnisph/studienprojekt-dokumentation")
print(corpus)
corpus = corpus.downsample(0.1)
print(corpus)

tag_type = "ner"
label_dictionary = corpus.make_label_dictionary(tag_type)
print(label_dictionary)

#embeddings = WordEmbeddings("glove")
embeddings = TransformerWordEmbeddings()

tagger = TARSSequenceTagger(tag_dictionary=label_dictionary, tag_type=tag_type, task_name="TEST_NER")
#tagger = SequenceTagger(tag_dictionary=corpus.make_tag_dictionary(tag_type), tag_type=tag_type, hidden_size=256, embeddings=embeddings)

trainer = ModelTrainer(tagger, corpus)
Example #15
# evaluate using trained model to see if my modification is correct or not
from flair.data import Sentence
from flair.models import SequenceTagger
model = SequenceTagger.load(
    'reproduce_ner_10epochs/taggers/sota-ner/final-model.pt')

from flair.data import Corpus
from flair.datasets import CONLL_03
corpus: Corpus = CONLL_03(base_path='reproduce_ner_10epochs/tasks')
testdata = corpus.test
# sentence = Sentence('I love Berlin')
# from tqdm import tqdm
test_result, test_loss = model.evaluate(testdata, out_path='test.tsv')
result_line = f"\t{test_loss}\t{test_result.log_line}"

# import logging
# log = logging.getLogger("test")
# main score is micro averaged f1 score
# result line is precision, recall, micro averaged score

print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
print(f"TEST RESULT: {result_line}")
# print(test_result.detailed_results)
# print(result)
# print(sentence.to_tagged_string())