Example #1
def load_corpus(data_folder, tag_column, attr, downsample_perc):
    corpus = ColumnCorpus(Path(data_folder) / attr,
                          {0: 'text', tag_column: 'ner'},
                          train_file='train.txt',
                          test_file='test.txt',
                          dev_file='dev.txt')

    if downsample_perc > 0:
        print('Downsampling: ', downsample_perc)
        corpus = corpus.downsample(percentage=downsample_perc, only_downsample_train=True)
        
    return corpus
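
For context, ColumnCorpus reads CoNLL-style column files: one token per line, columns separated by whitespace, and a blank line between sentences. A minimal usage sketch for the helper above (the folder name, tag column, and downsample fraction are hypothetical):

# expects data/events/train.txt, dev.txt and test.txt, each shaped like:
#   George      B-PER
#   Washington  I-PER
#   visited     O
corpus = load_corpus('data', tag_column=1, attr='events', downsample_perc=0.1)
print(corpus)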
Example #2
    def __init__(self, dataset_name, output_path="/kaggle/working"):

        input_path = f"/kaggle/input/{dataset_name}"

        self.tag_type = "ner"
        self.corpus = ColumnCorpus(
            data_folder=input_path, column_format={0: "text", 1: "ner"}
        )

        self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)

        self.dataset_name = dataset_name
        self.output_path = output_path
Example #3
def build_conll03en_corpus(base_path: str):
    document_as_sequence = False
    corpus = ColumnCorpus(
        base_path,
        column_format={0: "text", 1: "pos", 2: "np", 3: "ner"},
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
        tag_to_bioes="ner",
        document_separator_token=None if not document_as_sequence else "-DOCSTART-",
    )
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    return corpus, tag_type, tag_dictionary
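
A hypothetical call, assuming the CoNLL-03 files live under resources/tasks/conll_03:

corpus, tag_type, tag_dictionary = build_conll03en_corpus("resources/tasks/conll_03")
print(corpus)
print(tag_dictionary)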
Example #4
def parse_arguments(dataset, model):
    # Adjust logging level
    logging.getLogger("flair").setLevel(level="ERROR")

    columns = {0: "text", 1: "ner"}

    if dataset == "lft":
        corpus: ColumnCorpus = ColumnCorpus(
            Path("./data"),
            columns,
            train_file="./enp_DE.lft.mr.tok.train.bio",
            dev_file="./enp_DE.lft.mr.tok.dev.bio",
            test_file="./enp_DE.lft.mr.tok.test.bio",
            tag_to_bioes="ner",
        )
    elif dataset == "onb":
        corpus: ColumnCorpus = ColumnCorpus(
            Path("./data"),
            columns,
            train_file="./enp_DE.onb.mr.tok.train.bio",
            dev_file="./enp_DE.onb.mr.tok.dev.bio",
            test_file="./enp_DE.onb.mr.tok.test.bio",
            tag_to_bioes="ner",
        )
    else:
        raise ValueError(f"Unknown dataset: {dataset}")

    tagger: SequenceTagger = SequenceTagger.load(model)

    for test_sentence in corpus.test:
        tokens = test_sentence.tokens
        gold_tags = [token.tags["ner"].value for token in tokens]

        tagged_sentence = Sentence()
        tagged_sentence.tokens = tokens

        # Tag sentence with model
        tagger.predict(tagged_sentence)

        predicted_tags = [token.tags["ner"].value for token in tagged_sentence.tokens]

        assert len(tokens) == len(gold_tags)
        assert len(gold_tags) == len(predicted_tags)

        for token, gold_tag, predicted_tag in zip(tokens, gold_tags, predicted_tags):
            gold_tag = iobes_to_iob(gold_tag)
            predicted_tag = iobes_to_iob(predicted_tag)

            print(f"{token.text} {gold_tag} {predicted_tag}")

        print("")
Example #5
def create_flair_corpus(data_folder,
                        column_format,
                        percentage=1,
                        only_downsample_train=True):
    """
    Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param percentage: Percentage of corpus to be used for training
    :param only_downsample_train: Only apply downsample to training set (recommended)
    :return: a Corpus with annotated train, dev and test data
    """

    corpus: Corpus = ColumnCorpus(data_folder,
                                  column_format,
                                  train_file='train',
                                  test_file='test',
                                  dev_file='dev',
                                  in_memory=False)

    if percentage < 1:
        corpus.downsample(percentage=percentage,
                          only_downsample_train=only_downsample_train)

    return corpus
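
A hypothetical call that keeps 10% of the training split while leaving dev and test untouched (the folder and column map are assumptions):

columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
corpus = create_flair_corpus('data/conll_03', columns, percentage=0.1)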
Example #6
def onto_ner_mapped():
    corpus_mapped: Corpus = ColumnCorpus(
        "resources/tasks/onto-ner",
        column_format={
            0: "text",
            1: "pos",
            2: "upos",
            3: "ner"
        },
        #tag_to_bioes="ner",
        label_name_map={
            'NORP': 'MISC',
            'FAC': 'LOC',
            'GPE': 'LOC',
            'CARDINAL': 'O',
            'DATE': 'O',
            'EVENT': 'MISC',
            'LANGUAGE': 'MISC',
            'LAW': 'MISC',
            'MONEY': 'O',
            'ORDINAL': 'O',
            'PERCENT': 'O',
            'PRODUCT': 'MISC',
            'QUANTITY': 'O',
            'TIME': 'O',
            'WORK_OF_ART': 'MISC',
            'LOC': 'LOC',
            'PERSON': 'PER'
        })
    return corpus_mapped
Example #7
def train():
    columns = {0: 'text', 1: 'pos'}
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)

    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(args.model,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
Example #8
def read_corpus(data_folder) -> Corpus:
    columns = {0: 'text', 1: 'pos', 2: 'ner'}
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='flair_train.txt',
                                  test_file='flair_val.txt',
                                  dev_file='flair_test.txt')
    return corpus
Example #9
    def test(self, test_data):
        path = "./src/tmp/"
        filepath = path + self.model_name + '/best-model.pt'
        data = self.convert_format(test_data)
        model = SequenceTagger.load(filepath)
        corpus: Corpus = ColumnCorpus(".", {0: 'text', 1: 'ner'},
                                      train_file=data,
                                      test_file=data)
        result, eval_loss = model.evaluate(corpus.test)
        results = result.detailed_results
        global_res = result.log_line.split("\t")
        res = {"model_name": self.model_name}
        res["precision"] = float(global_res[0]) * 100
        res["recall"] = float(global_res[1]) * 100
        res["f_score"] = float(global_res[2]) * 100
        results = results.split("\n")[6:]
        score_by_labels = {}
        for scores in results:
            scores = scores.replace("/s", " ").split()
            score_by_labels[scores[0]] = {'p': float(scores[11]) * 100,
                                          'r': float(scores[14]) * 100,
                                          'f': float(scores[17]) * 100}
        res["score_by_label"] = score_by_labels
        loss_path = path + self.model_name + "/loss.tsv"
        df = pd.read_csv(loss_path, sep="\t")
        self.losses = df["TRAIN_LOSS"].tolist()
        return res
Example #10
File: tagger.py Project: yyht/daga
def predict(args):
    """Predict."""
    model = SequenceTagger.load(os.path.join(args.model_dir, args.model_file))

    logger.info(f'Model: "{model}"')

    corpus: Corpus = ColumnCorpus(
        args.data_dir,
        column_format=model.column_format,
        test_file=args.test_file,
        comment_symbol=args.comment_symbol,
    )

    fout = io.open(args.output_file, "w", encoding="utf-8", errors="ignore")
    logger.info("Saving to %s", args.output_file)

    start_time = time.time()
    for i in range(len(corpus.test)):
        sentence = corpus.test[i]
        model.predict(sentence)
        lines = []
        for token in sentence.tokens:
            lines.append(
                f"{token.text}\t{token.get_tag(model.tag_type).value}\n")
        lines.append("\n")
        fout.write("".join(lines))
        fout.flush()
    fout.close()

    logger.info("End of prediction: time %.1f min",
                (time.time() - start_time) / 60)
Example #11
def test_load_universal_dependencies_conllu_corpus(tasks_base_path):
    """
    This test only covers basic universal dependencies datasets.
    For example, multi-word tokens or the "deps" column sentence annotations
    are not supported yet.
    """

    # Here, we use the default token annotation fields.
    corpus = ColumnCorpus(
        tasks_base_path / "conllu",
        train_file="universal_dependencies.conllu",
        dev_file="universal_dependencies.conllu",
        test_file="universal_dependencies.conllu",
        column_format={
            1: "text",
            2: "lemma",
            3: "upos",
            4: "pos",
            5: "feats",
            6: "head",
            7: "deprel",
            8: "deps",
            9: "misc",
        },
    )

    assert len(corpus.train) == 1
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

    _assert_universal_dependencies_conllu_dataset(corpus.train)
Example #12
    def train_NER(self,
                  data_folder,
                  train_file,
                  dev_file,
                  test_file,
                  train_dict,
                  is_gpu=False):
        columns = {0: 'text', 1: 'ner'}
        if is_gpu:
            flair.device = torch.device("cuda:0")
        corpus: Corpus = ColumnCorpus(data_folder,
                                      columns,
                                      train_file=train_file,
                                      test_file=test_file,
                                      dev_file=dev_file)

        tag_dictionary = corpus.make_tag_dictionary(tag_type=self.tag_type)
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=train_dict["hidden_size"],
            embeddings=self.embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True)
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(self.download_dir + '/resources/taggers/saved-models',
                      learning_rate=train_dict["lr"],
                      mini_batch_size=train_dict["batch_size"],
                      max_epochs=train_dict["epochs"])
Example #13
def process_file(tagger: SequenceTagger,
                 file_path: Union[str, Path],
                 out_path: Union[str, Path],
                 print_corpus=None):
    try:
        corpus: ColumnCorpus = ColumnCorpus(
            data_folder=os.path.split(file_path)[0],
            train_file=os.path.split(file_path)[1],
            column_format={
                0: 'text',
                1: 'begin',
                2: 'end',
                3: 'ner'
            })
        if len(corpus.get_all_sentences()) == 0:
            return 0
        if print_corpus is not None:
            results: List[Span] = []
            data_loader = DataLoader(corpus.get_all_sentences())
            result, loss = tagger.evaluate(data_loader)
            print(result.detailed_results)
            if not os.path.isfile(print_corpus):
                for sentence in corpus.train:
                    for span in sentence.get_spans(tag_type):
                        if span.tag != "O":
                            results.append(span)
                print_spans_in_brat_format(results, print_corpus)

        return tag_corpus(corpus, file_path, out_path, tagger)
    except IndexError:
        log.error(f'IndexError in file: "{file_path}"!')
        return 0
Example #14
def train():
    generate_datasets()
    DATA_FOLDER = '../content/data'
    # MAX_TOKENS = 500
    columns = {0: 'text', 1: 'pos', 2: 'tag'}

    data_folder = DATA_FOLDER

    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train-labelled.txt',
                                  test_file='dev-labelled.txt',
                                  in_memory=False)
    # corpus._train = [x for x in corpus.train if len(x) < MAX_TOKENS]
    # corpus._test = [x for x in corpus.test if len(x) < MAX_TOKENS]

    tag_type = 'tag'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)
    embeddings = TransformerWordEmbeddings('roberta-base', layers='-4', fine_tune=True)

    tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            # dropout=0.3334816033039888,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train('resources/taggers/task-TC',
                  learning_rate=0.2,
                  mini_batch_size=64,
                  max_epochs=100,
                  embeddings_storage_mode='gpu')
Example #15
def flairInfer(model_path, test_or_train):
    from flair.data import Sentence
    from flair.models import SequenceTagger
    from flair.data import Corpus
    from flair.datasets import CONLL_03
    from flair.datasets import ColumnCorpus

    model = SequenceTagger.load(model_path + '/final-model.pt')
    data_dir = '../GmbDataExperimentation/processed_data/1500_data'
    try:
        corpus: Corpus = CONLL_03(base_path=data_dir)
    except Exception:
        # fall back to a generic two-column corpus if the CoNLL-03 loader fails
        columns = {0: 'text', 1: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir, columns)
    if test_or_train == 'train':
        testdata = corpus.train
        result_file = data_dir + '/train.tsv'
    else:
        testdata = corpus.test
        result_file = data_dir + '/test.tsv'

    test_result, test_loss = model.evaluate(testdata, out_path=result_file)
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(
        f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT : {result_line}")
Example #16
def prepare_flair_train_dev_corpus(
    spacy_model: Language, data_folder: str, dev_size: float, nb_segment: Optional[int], segment: Optional[int]
) -> Corpus:
    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename) for filename in os.listdir(data_folder) if filename.endswith(".txt")
    ]
    if nb_segment is None and segment is None:
        random.shuffle(all_annotated_files)
        nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
        dev_file_names = all_annotated_files[0:nb_doc_dev_set]
    else:
        assert segment < nb_segment
        all_segments = np.array_split(all_annotated_files, nb_segment)
        dev_file_names = list(all_segments[segment])

    print(f"dev set file names: {dev_file_names}")
    train_file_names = [file for file in all_annotated_files if file not in dev_file_names]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(
        data_folder=tempfile.gettempdir(),
        column_format={0: "text", 1: "ner"},
        train_file=os.path.basename(train_path),
        dev_file=os.path.basename(dev_path),
        test_file=os.path.basename(dev_path),
    )
    return corpus
Example #17
    def train(self):
        path = "./src/tmp/"
        self.training_data = self.convert_format(self.training_data)
    
        corpus: Corpus = ColumnCorpus(".", {0: 'text', 1: 'ner'},
                                      train_file=self.training_data)
        tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
        embedding_types: List[TokenEmbeddings] = [

            WordEmbeddings('fr'),
            FlairEmbeddings('fr-forward'),
            FlairEmbeddings('fr-backward'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type='ner',
                                                use_crf=True)
        self.trainer = ModelTrainer(tagger, corpus)
        save_path = path + self.model_name
        self.trainer.train(save_path,
                           learning_rate=self.learning_rate,
                           mini_batch_size=self.batch_size,
                           max_epochs=self.nb_iter,
                           embeddings_storage_mode=self.mode)
        self.is_ready = 1
Example #18
def prepare_flair_train_test_corpus(spacy_model: Language, data_folder: str,
                                    dev_size: float) -> Corpus:

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename)
        for filename in os.listdir(data_folder) if filename.endswith(".txt")
    ]
    random.shuffle(all_annotated_files)

    nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)

    dev_file_names = all_annotated_files[0:nb_doc_dev_set]

    train_file_names = [
        file for file in all_annotated_files if file not in dev_file_names
    ]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(data_folder=tempfile.gettempdir(),
                                  column_format={
                                      0: 'text',
                                      1: 'ner'
                                  },
                                  train_file=os.path.basename(train_path),
                                  dev_file=os.path.basename(dev_path),
                                  test_file=os.path.basename(dev_path))
    return corpus
Example #19
    def create_corpus(self) -> Corpus:
        corpus: Corpus = ColumnCorpus(
            data_folder='{}/{}/'.format(self.data_folder, self.entity_type),
            column_format={0: 'text', 1: 'ner'},
            train_file='{}_train.conll'.format(self.entity_type),
            test_file='{}_test.conll'.format(self.entity_type),
            dev_file='{}_dev.conll'.format(self.entity_type))
        return corpus
Example #20
    def get_data(self,
                 data_dir,
                 tag_type='ner',
                 train_file='train.txt',
                 dev_file='dev.txt',
                 test_file='test.txt'):

        # customize data format, see
        # https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md#reading-your-own-sequence-labeling-dataset

        columns = {0: 'text', 1: 'pos', 2: 'deprel', 3: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir,
                                      columns,
                                      train_file=train_file,
                                      dev_file=dev_file,
                                      test_file=test_file)
        # print(corpus)

        # make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        # print(tag_dictionary.idx2item)

        # (TODO: TEST IF WORKS)  Some useful stats below:

        #  Obtain statistics about the dataset
        stats = corpus.obtain_statistics()
        print(stats)

        # check how many sentences there are in the training split
        print(len(corpus.train))

        # You can also access a sentence and check out annotations
        print(corpus.train[0].to_tagged_string('ner'))

        return corpus, tag_dictionary
Example #21
def testModel(model_dir, test_sent=None, test_file_dir=None):
  """
  model_dir: directory contains 'final_model.pt'
  test_sent: one sentence to test
  test_file: one file of sentences to test
  """
  if test_sent and test_file_dir:
    raise Exception("Argument conflicts, only one type of testing method is allowed.")
  elif not test_sent and not test_file_dir:
    raise Exception("Argument invalid, at least one testing method is required") 
  
  model_path = model_dir + '/final-model.pt'
  model = SequenceTagger.load(model_path)
  
  if test_sent:
    print('Predicting in singular mode')
    test_sent = Sentence(test_sent)
    model.predict(test_sent)
    print(test_sent.to_tagged_string())

  if test_file_dir:
    print('Predicting in plural mode')
    try:
      columns = {0: 'text', 1: 'ner'}
      corpus = ColumnCorpus(test_file_dir, columns)
      test_data = corpus.test 
    except Exception:
      raise Exception('Directory must contain a `test.txt` file with one `text` column and one `ner` column')
    
    test_result, test_loss = model.evaluate(test_data, out_path=test_file_dir + '/test.tsv')
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT : {result_line}")

  print('end')
Example #22
def main():
    args = parse_args()

    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(args.data_dir,
                                  columns,
                                  train_file=args.train_file,
                                  dev_file=args.dev_file,
                                  test_file=args.test_file)

    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('crawl'),
        FlairEmbeddings(args.forward_flair_embeddings),
        FlairEmbeddings(args.backward_flair_embeddings),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        print('***** Plotting learning rate')
        # 7a. Find learning rate
        learning_rate_tsv = trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)

    else:
        print('***** Running train')
        # 7b. Run Training
        trainer.train(
            'temp',
            learning_rate=0.1,
            mini_batch_size=MINI_BATCH_SIZE,
            # it's a big dataset, so embeddings_storage_mode='none' keeps embeddings out of memory
            embeddings_storage_mode='none')

        tag_and_output(corpus.test, tagger,
                       os.path.join(args.data_dir, args.test_output_file),
                       tag_type)
Example #23
def read_in_CADEC_prev():
    # define columns
    columns = {
        0: 'text',
        1: 'text_lower',
        2: 'pos',
        3: 'ner',
        4: 'text_start',
        5: 'text_end',
        6: 'SNOMEDCT_id'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = 'data/CADEC/NER/'

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='corpus-conll-train.txt',
                                  test_file='corpus-conll-test.txt',
                                  dev_file='corpus-conll-dev.txt')

    print(corpus)

    print(corpus.train[0].to_tagged_string('pos'))
    print(corpus.train[0].to_tagged_string('ner'))

    return corpus
Example #24
def read_in_Micromed():
    # define columns
    # avelox-51c3e5a853785f584a9a8c01	76	93	ADR	connective tissue	avelox	avelox
    columns = {0: 'text', 1: 'ner'}

    # this is the folder in which train, test and dev files reside
    data_folder = 'data/Micromed/'

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='Micromed_train.csv',
                                  test_file='Micromed_test.csv',
                                  dev_file='Micromed_dev.csv')

    print(len(corpus.train))

    # note: this corpus only has a 'ner' column, so the 'pos' print shows plain text
    print(corpus.train[0].to_tagged_string('pos'))
    print(corpus.train[0].to_tagged_string('ner'))

    return corpus


# read_in_CADEC()
# read_in_Micromed()
# read_in_AMT()
Example #25
def run_splits(embedding_types, embeddings_name):
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    for i in range(1,6):
        print('##########')
        print('Split', str(i))
        print('##########')

        # define columns
        columns = {0: 'text', 1: 'pos', 2: 'ner', 3: 'event', 4: 'when', 5: 'who', 6: 'core', 7: 'eventtype'}
        # directory where the data resides
        data_folder = '<path_to_splits>/split_' + str(i) + '/'
        # initializing the corpus
        corpus: Corpus = ColumnCorpus(data_folder, columns,
                                      train_file='ner_train.csv',
                                      test_file='ner_test.csv',
                                      dev_file='ner_dev.csv')
        # tag to predict
        tag_type = 'ner'
        # make tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)
        print(tagger)

        from flair.trainers import ModelTrainer
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)
        trainer.train(data_folder + '/ner_' + embeddings_name,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)
Example #26
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--data_dir",
      default='./',
      type=str,
      help=
      "The parent dir of input data, should include folder named `conll_03` ")
  parser.add_argument("--model_dir",
                      default=None,
                      type=str,
                      required=True,
                      help="The model directory where model chekpoints stored")
  parser.add_argument(
      "--result_file",
      default='dev.tsv',
      type=str,
      required=True,
      help=
      "The name of prediction file, default is in the same dir of script file")
  parser.add_argument("--eval_on",
                      default='dev',
                      type=str,
                      required=True,
                      help="Whether to eval on dev set or test set")

  args = parser.parse_args()

  model_path = args.model_dir
  data_dir = args.data_dir
  model = SequenceTagger.load(model_path + '/final-model.pt')
  try:
    corpus: Corpus = CONLL_03(base_path=data_dir)
  except Exception:
    # fall back to a generic two-column corpus if the CoNLL-03 loader fails
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(data_dir, columns)

  if args.eval_on == 'dev':
    testdata = corpus.dev
  elif args.eval_on == 'test':
    testdata = corpus.test
  elif args.eval_on == 'train':
    print('You are evaluating on training set!')
    testdata = corpus.train
  else:
    raise ValueError("Invalid argument, must specify evaluation on dev, test or train")

  test_result, test_loss = model.evaluate(testdata, out_path=args.result_file)
  result_line = f"\t{test_loss}\t{test_result.log_line}"

  # main score is micro averaged f1 score
  # result line is precision, recall, micro averaged score

  print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}")
  print(f"TEST RESULT: {result_line}")
  print(test_result.detailed_results)
Example #27
    def train(self, training_dir=None):
        from flair.trainers import ModelTrainer

        if training_dir is None:
            training_dir = flair_splitter_dep_dir

        # define columns
        columns = {0: "text", 1: "ner"}

        # this is the folder in which train, test and dev files reside
        data_folder = flair_splitter_dep_dir + "data"

        # init a corpus using column format, data folder and the names of the train, dev and test files
        # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file="sent_train.txt",
            test_file="sent_test.txt",
            dev_file="sent_dev.txt",
            document_separator_token="-DOCSTART-",
        )

        print(corpus)

        tag_type = "ner"
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # initialize embeddings
        embedding_types = [
            # WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            #FlairEmbeddings("news-forward"),
            #FlairEmbeddings("news-backward"),
            # BertEmbeddings('distilbert-base-cased')
            TransformerWordEmbeddings('google/electra-base-discriminator')
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=128,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(training_dir,
                      learning_rate=0.1,
                      mini_batch_size=16,
                      max_epochs=50)
        self.model = tagger
Example #28
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file="train.conllup",
        dev_file="train.conllup",
        test_file="train.conllup",
        column_format={1: "text", 2: "pos", 3: "ner"},
    )

    relation_label_dict = corpus.make_label_dictionary(label_type="relation")

    embeddings = TransformerWordEmbeddings()

    model: RelationExtractor = RelationExtractor(
        embeddings=embeddings,
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(model, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )

    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(results_base_path / "final-model.pt")
    loaded_model.train_on_gold_pairs_only = False

    sentence = Sentence(["Apple", "was", "founded", "by", "Steve", "Jobs", "."])
    sentence[0:1].add_label("ner", "ORG")
    sentence[4:6].add_label("ner", "PER")

    loaded_model.predict(sentence)

    assert "founded_by" == sentence.get_labels("relation")[0].value

    del loaded_model
Example #29
    def __init__(self,
                 path: Union[Path, str],
                 columns: dict = None,
                 tag_types: List = ['ner'],
                 corpus: Corpus = None):
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()

        self.path = path
        if corpus:
            self.corpus = corpus
        else:
            self.corpus = ColumnCorpus(self.path, columns)
        self.sentences = ColumnCorpusAnalysis.index_spans(
            self.corpus.get_all_sentences(), tag_types)
        print(self.corpus)
Example #30
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)