Python NLPTaskDataFetcher.load_column_corpusの例、flair.data_fetcher.NLPTaskDataFetcher.load_column_corpus Pythonの例

コード例 #1

0

ファイルを表示

ファイル: train.py プロジェクト: haozturk/kanarya

def load_specific_corpus(dirname, files):
    """Load a two-column (text, ner) corpus from ``dirname``.

    Args:
        dirname: Data folder handed to ``NLPTaskDataFetcher.load_column_corpus``.
        files: Mapping that must contain the keys 'train_file', 'dev_file'
            and 'test_file'. A truthy value is reduced to its base name;
            a falsy value is forwarded as ``None``.

    Returns:
        The corpus returned by ``load_column_corpus`` (also logged via
        ``log.info``).
    """

    def _basename_or_none(key):
        # Only the file name is forwarded; any directory part is dropped.
        entry = files[key]
        return os.path.basename(entry) if entry else None

    split_files = {
        key: _basename_or_none(key)
        for key in ('train_file', 'dev_file', 'test_file')
    }

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        dirname, {0: 'text', 1: 'ner'}, **split_files)
    log.info(corpus)
    return corpus

コード例 #2

0

ファイルを表示

ファイル: claim_target_tagger.py プロジェクト: webis-de/acl20-target-inference-in-conclusion-generation

def train_tagger(data_path, model_path):
    """Train a sequence tagger for the 'ct' tag and write it to ``model_path``.

    Args:
        data_path: Folder holding 'train.tsv' and 'test.tsv' in column format
            (column 0 text, column 1 pos, column 2 ct).
        model_path: Output location passed to ``ModelTrainer.train``.
    """
    tag_type = 'ct'

    # No dev_file is given; only the train and test file names are passed.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_path,
        {0: 'text', 1: 'pos', 2: 'ct'},
        train_file='train.tsv',
        test_file='test.tsv',
    )

    # One word-level embedding stacked with a forward and a backward CharLM.
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward'),
    ])

    label_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # Train with the fixed hyper-parameters used by the original script.
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        model_path, learning_rate=0.1, mini_batch_size=16, max_epochs=30)

コード例 #3

0

ファイルを表示

def load_cropus(config):
    """Load the column corpus under ``config.path_data_root`` with flair.

    The data files must follow the layout expected by the flair library
    (https://github.com/zalandoresearch/flair).

    Returns:
        A ``(train, dev, test)`` tuple of lists from which every sentence
        whose tokenized string is exactly '-DOCSTART-' has been removed.
    """
    # Per the original author's note: the last column is deliberately not
    # named 'ner', because flair would otherwise convert it to BIOES.
    column_format = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner11'}
    root = config.path_data_root

    # NOTE(review): train_file is 'eng.testb' (identical to test_file); the
    # original kept train_file='eng.train' commented out. Confirm whether
    # loading the test split as training data is intentional.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        root,
        column_format,
        train_file='eng.testb',
        test_file='eng.testb',
        dev_file='eng.testa',
    )

    def _is_real_sentence(sentence):
        # Drop the document-separator pseudo-sentences.
        return sentence.to_tokenized_string() != '-DOCSTART-'

    train_sentences = [s for s in corpus.train if _is_real_sentence(s)]
    dev_sentences = [s for s in corpus.dev if _is_real_sentence(s)]
    test_sentences = [s for s in corpus.test if _is_real_sentence(s)]

    return train_sentences, dev_sentences, test_sentences

コード例 #4

0

ファイルを表示

def train():
    """Train a 'ner' sequence tagger on the onto.* files and save the model.

    Data is read from ``<path>/../data/`` and the output is written to
    ``<path>/../models/``, where ``path`` is a module-level name defined
    outside this function.
    """
    tag_type = "ner"

    # Column layout: word, POS tag, label.
    column_format = {0: "word", 1: "postag", 2: "ner"}
    data_dir = os.path.join(path, "../data/")

    # The dev file doubles as the test file here.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testa",
    )
    print(corpus)

    # Label dictionary built from the corpus.
    label_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(label_dictionary.idx2item)

    # Word embeddings stacked with character embeddings.
    stacked: StackedEmbeddings = StackedEmbeddings(
        embeddings=[WordEmbeddings("glove"), CharacterEmbeddings()])

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    output_dir = os.path.join(path, "../models/")

    # Original author's note: the model ends up in output_dir as
    # final-model.pt, and training takes at least 4 hours, so use a GPU.
    trainer.train(
        output_dir, learning_rate=0.1, mini_batch_size=64, max_epochs=3)

コード例 #5

0

ファイルを表示

ファイル: test_data_fetchers.py プロジェクト: rkwojdan/flair35

def test_load_no_dev_data(tasks_base_path):
    """Load 'fashion_nodev' with default file names and check split sizes.

    Expects 5 train, 1 dev and 1 test sentences. Presumably the fixture has
    no dev file and the loader derives the dev split itself (confirm against
    the fixture folder).
    """
    data_dir = tasks_base_path / u'fashion_nodev'
    column_format = {0: u'text', 2: u'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

コード例 #6

0

ファイルを表示

ファイル: flair_train_ade_concepts.py プロジェクト: tmills/ctakes-ade

def main(args):
    """Train a 'ner' sequence tagger on the corpus named on the command line.

    NOTE(review): the ``args`` parameter is discarded. It is immediately
    replaced by the result of ``parser.parse_args()`` (``parser`` is a
    module-level object, presumably an argparse parser). Confirm whether
    callers expect the value they pass in to be honoured.

    The trained model is written to 'resources/taggers/glove'.
    """
    args = parser.parse_args()

    # The tag to predict; also the column converted via tag_to_biloes.
    tag_type = 'ner'

    # Corpus location is the first entry of the data_file option.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        Path(args.data_file[0]),
        {0: 'word', 1: 'pos', 2: 'ner'},
        tag_to_biloes='ner',
    )
    print(corpus)

    label_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(label_dictionary.idx2item)

    # Active embeddings: GloVe plus forward/backward Flair embeddings.
    # The original also listed CharacterEmbeddings(), BertEmbeddings() and
    # ELMoEmbeddings() as commented-out alternatives.
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    # Imports are kept local, as in the original script.
    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        'resources/taggers/glove',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50,
    )

コード例 #7

0

ファイルを表示

def test():
    """Load the BILOU restaurant files from '../dataset/flair' as a corpus.

    Column 1 of the data files is read as 'ner' and column 3 as 'text'.
    The loaded corpus is neither returned nor inspected.
    """
    from flair.data_fetcher import NLPTaskDataFetcher

    column_format = {1: "ner", 3: "text"}
    file_names = {
        "train_file": "train_res_bilou.txt",
        "test_file": "test_res_bilou.txt",
    }
    corpus = NLPTaskDataFetcher.load_column_corpus(
        "../dataset/flair", column_format=column_format, **file_names)

コード例 #8

0

ファイルを表示

ファイル: test_data_fetchers.py プロジェクト: rkwojdan/flair35

def test_load_no_dev_data_explicit(tasks_base_path):
    """Load 'fashion_nodev' with explicit train/test names and check sizes.

    No dev_file is passed; the test still expects a dev split of one
    sentence next to 5 train and 1 test sentences.
    """
    data_dir = tasks_base_path / u'fashion_nodev'
    column_format = {0: u'text', 2: u'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file=u'train.tsv',
        test_file=u'test.tsv',
    )

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

コード例 #9

0

ファイルを表示

ファイル: test_data_fetchers.py プロジェクト: mfojtak/flair

def test_load_no_dev_data(tasks_base_path):
    """Load 'fashion_nodev' with default file names and check split sizes.

    In this variant ``corpus.train``, ``corpus.dev`` and ``corpus.test`` are
    called and their results materialised with ``list`` before counting.
    """
    data_dir = tasks_base_path / 'fashion_nodev'
    column_format = {0: 'text', 2: 'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    train_sentences = list(corpus.train())
    assert len(train_sentences) == 5
    dev_sentences = list(corpus.dev())
    assert len(dev_sentences) == 1
    test_sentences = list(corpus.test())
    assert len(test_sentences) == 1

コード例 #10

0

ファイルを表示

ファイル: test_data_fetchers.py プロジェクト: bluesea0/ditk

def test_load_no_dev_data(tasks_base_path):
    """Load 'fashion_nodev' with default file names and check split sizes.

    Expects 5 train, 1 dev and 1 test sentences.
    """
    data_dir = tasks_base_path / "fashion_nodev"
    column_format = {0: "text", 2: "ner"}

    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

コード例 #11

0

ファイルを表示

def test_load_no_dev_data_explicit(tasks_base_path):
    """Load 'fashion_nodev' with explicit train/test names and check sizes.

    No dev_file is passed; the test still expects a dev split of one
    sentence next to 5 train and 1 test sentences.
    """
    data_dir = tasks_base_path / 'fashion_nodev'
    column_format = {0: 'text', 2: 'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file='train.tsv',
        test_file='test.tsv',
    )

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

コード例 #12

0

ファイルを表示

    def __init__(self, corpus_name: str):
        """Train a 'ner' tagger for ``corpus_name`` and plot its training curves.

        The corpus is read from the DIRKSON model folder. The trained model,
        the loss file and the weights file all live in a sub-folder of that
        folder named after ``corpus_name``.

        Args:
            corpus_name: Prefix of the corpus file names and name of the
                output sub-folder.
        """
        base_parts = [loc.ASSETS, loc.MODELS, loc.DIRKSON]

        # NOTE(review): the training split is read from the file whose name
        # ends in DIRKSON_VALIDATION_TXT — confirm this is intended.
        corpus = NLPTaskDataFetcher.load_column_corpus(
            loc.abs_path(base_parts), {
                0: 'text',
                1: 'ner'
            },
            train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
            test_file=corpus_name + loc.DIRKSON_TEST_TXT)

        embedding_types = [
            BertEmbeddings('bert-base-uncased'),
            FlairEmbeddings('mix-forward'),
            FlairEmbeddings('mix-backward')
        ]

        tag_type = 'ner'
        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # The original guard was `if not path.exists:`, which tests the
        # function object itself (always truthy), so the output folder was
        # never created. Create it here, tolerating an existing folder.
        output_dir = loc.abs_path(base_parts + [corpus_name])
        os.makedirs(output_dir, exist_ok=True)

        trainer.train(output_dir,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)

        # Plot the loss curve and the weights from the files in output_dir.
        plotter = Plotter()
        plotter.plot_training_curves(
            loc.abs_path(base_parts + [corpus_name, loc.LOSS_TSV]))
        plotter.plot_weights(
            loc.abs_path(base_parts + [corpus_name, loc.WEIGHTS_TXT]))

コード例 #13

0

ファイルを表示

def infer():
    """Tag the test split with the saved model and dump the predicted labels.

    Reads the model from ``<path>/../models/final-model.pt`` and the corpus
    from ``<path>/../data/`` (``path`` is a module-level name defined outside
    this function). Writes one predicted 'ner' value per line to
    ``<path>/../output/lstm_output.txt``, with an empty line after each
    sentence.
    """
    # Column layout of the input files: word, POS tag.
    column_format = {0: "word", 1: "postag"}
    data_dir = os.path.join(path, "../data/")

    tagger = SequenceTagger.load_from_file(
        os.path.join(path, "../models/final-model.pt"))

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testb",
    )
    print(corpus)

    # Original author's note: inference takes at least 30 minutes.
    print("Infering on test set...")
    tagged_sentences = tagger.predict(corpus.test)

    out_fname = os.path.join(path, "../output/lstm_output.txt")
    with open(out_fname, "w") as fw:
        for sentence in tagged_sentences:
            fw.writelines(
                "{}\n".format(token.tags['ner'].value)
                for token in sentence.tokens)
            # An empty line separates consecutive sentences.
            fw.write("\n")

コード例 #14

0

ファイルを表示

from flair.training_utils import EvaluationMetric
from flair.visual.training_curves import Plotter

# 1. Load the corpus.
# Alternative left by the author: load a built-in corpus instead.
# corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
# Column layout of the data files: column 0 is the token text, columns 1-4
# hold the DC, EE, EG and HG tags, and column 5 is named 'comb'
# (presumably a combined tag — confirm against the data).
columns = {0: 'text', 1: 'DC', 2: 'EE', 3: 'EG', 4: 'HG', 5: 'comb'}

# Folder in which the train, test and dev files reside.
data_folder = 'data'

# Build the corpus from the column format, the data folder and the names of
# the train, dev and test files.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file='eda_train.txt',
    test_file='eda_test.txt',
    dev_file='eda_dev.txt')

print(corpus)

# 2. Tags to predict. The 'comb' column is not listed here.
tag_types = [
    'DC',
    'EE',
    'EG',
    'HG',
]

# 3. Tag dictionaries built from the corpus. The list starts empty; the code
# that fills it is not part of this excerpt.
tag_dictionaries = []

コード例 #15

0

ファイルを表示

ファイル: ner_flair.py プロジェクト: ml-ai-samples-project/ner-pt

from torch.optim.adam import Adam
from typing import List
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.optim import SGDW
import os
import torch
import gensim

# Print a spacer line before the corpus is loaded.
print(" ")
# Column layout of the data files: token, POS tag, sub-label, label.
columns = {0: 'token', 1: 'pos', 2: 'sublabel', 3: 'label'}
data_folder = "data/"

# Load the three fixed "selective" splits from data_folder.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="train_selective.txt",
    test_file="test_selective.txt",
    dev_file="dev_selective.txt")

# Report the number of sentences in each split.
print(" ")
print("Train len: ", len(corpus.train))
print("Test len: ", len(corpus.test))
print("Dev len: ", len(corpus.dev))

# Show the first sentence of each split with its 'label' tags.
print(" ")
print("Train: ", corpus.train[0].to_tagged_string('label'))
print("Test: ", corpus.test[0].to_tagged_string('label'))
print("Dev: ", corpus.dev[0].to_tagged_string('label'))

# The tag to predict, and the dictionary of its values found in the corpus.
tag_type = 'label'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

コード例 #16

0

ファイルを表示

ファイル: training_flair_vol2.py プロジェクト: LinePinna/Query-Classification

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List
import os
import setGPU


# Column layout of the data files: token text, then the 'ner' tag.
columns = {0: 'text', 1: 'ner'}

# NOTE(review): changes the process working directory to a hard-coded,
# user-specific path.
os.chdir("/home/lpinna")

# Folder in which the train and test files reside.
data_folder = '/home/lpinna/classification1/training/vol3'

# 1. Load the corpus. No dev_file is passed.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                             train_file='train_20190426_v1.csv',
                                                             test_file='test_20190426_v1.csv')

print(corpus)

# 2. The tag to predict.
tag_type = 'ner'

# 3. Build the tag dictionary from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Debug output left disabled by the author:
#print(tag_dictionary.idx2item)


# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

コード例 #17

0

ファイルを表示

def main(train_file):
    """Train a 'ner' tagger on ``train_file`` and plot the training curves.

    Args:
        train_file: Name of the training file inside
            './eng_data_mini_onefile/'. The dev and test files are fixed to
            'eng.testa' and 'eng.testb'.

    The model and the files used for plotting are written to and read from
    'resources/taggers/example-ner'.
    """
    tag_type = 'ner'

    # Columns 1 and 2 are given empty names; only the text (0) and the
    # 'ner' column (3) carry a name.
    column_format = {0: 'text', 1: '', 2: '', 3: 'ner'}

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        './eng_data_mini_onefile/',
        column_format,
        train_file=train_file,
        test_file='eng.testb',
        dev_file='eng.testa',
    )
    print(corpus)

    label_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(label_dictionary.idx2item)

    # Only GloVe is active. The original listed CharacterEmbeddings() and
    # forward/backward FlairEmbeddings as commented-out alternatives.
    stacked: StackedEmbeddings = StackedEmbeddings(
        embeddings=[WordEmbeddings('glove')])

    # Imports are kept local, as in the original script.
    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        'resources/taggers/example-ner',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150,
    )

    # Optional step in the original: plot loss curves and weights.
    from flair.visual.training_curves import Plotter

    curve_plotter = Plotter()
    curve_plotter.plot_training_curves(
        'resources/taggers/example-ner/loss.tsv')
    curve_plotter.plot_weights('resources/taggers/example-ner/weights.txt')

コード例 #18

0

ファイルを表示

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, FlairEmbeddings, CharLMEmbeddings, ELMoEmbeddings, BertEmbeddings
from pathlib import Path

from typing import List

# 1. Load the corpus.
# Column layout of the data files: token text, then the 'ner' tag.
columns = {0: 'text', 1: 'ner'}

# Base folder; the split file names below are relative to it.
data_folder = './'

# Build the corpus from the column format, the data folder and the split
# file names. NOTE(review): dev_file and test_file point at the same file.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file='customData/usDL/train.txt',
    test_file='customData/usDL/test.txt',
    dev_file='customData/usDL/test.txt')

# len(corpus.train)
# Show the first training sentence with its 'ner' tags.
print(corpus.train[0].to_tagged_string('ner'))

# 2. The tag to predict.
tag_type = 'ner'

# 3. Build the tag dictionary from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
# Hard-coded, machine-specific directory; presumably where embeddings are
# cached (its use is not part of this excerpt).
cachedir = Path(
    '/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/nishant/embeddings'
)

コード例 #19

0

ファイルを表示

ファイル: flair_ner.py プロジェクト: AugustCzy/137_final_NER

# Parse the command-line options and echo them for reference.
args = parser.parse_args()
print(vars(args))

column_format = {0: 'text', 1: 'ner'} # the datafiles generated by our scripts have columns: text ner [weight]
# The optional third column is only read when include_weight is set.
if args.include_weight:
    column_format[2] = 'weight'

# Data and model folders share the same folder name under their own
# prefixes; this can be modified to individual needs.
data_folder = os.path.join(args.data_folder_prefix, args.folder_name)
model_folder = os.path.join(args.model_folder_prefix, args.folder_name)

# Models that use the weight column get a '_w' suffix on their folder.
if args.include_weight:
    model_folder += '_w'
# print(column_format)
# No file names are passed, so the loader's default file lookup applies.
corpus: Corpus = NLPTaskDataFetcher.load_column_corpus(data_folder,
                                                       column_format=column_format,
                                                       tag_to_biloes="ner")

# The tag to predict.
tag_type = 'ner'

# Dictionary of the tag values found in the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [

    # GloVe embeddings
    WordEmbeddings('glove'),

    # contextual string embeddings, forward
    FlairEmbeddings('news-forward'),
    # PooledFlairEmbeddings('news-forward', pooling='min'),

コード例 #20

0

ファイルを表示

ファイル: ontonotes_en.py プロジェクト: NLP1502/NLP

from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings
from typing import List

# 1. Load the corpus.
# Column layout of the data files: token text, then the 'ner' tag.
columns = {0: 'text', 1: 'ner'}

# Folder in which the train, test and dev files reside.
data_folder = './data/OntoNote4NER'

# Build the corpus from the column format, the data folder and the names of
# the train, dev and test files.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file='train.char.bmes',
    test_file='test.char.bmes',
    dev_file='dev.char.bmes')
print(corpus)

# 2. The tag to predict.
tag_type = 'ner'

# 3. Build the tag dictionary from the corpus and show its entries.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('custom'),
    # WordEmbeddings('glove'),

コード例 #21

0

ファイルを表示

# Column layout of the data files: token text, then the 'ner' tag.
columns = {0: 'text', 1: 'ner'}

# Settings are read from a file named 'config' in the working directory.
config = configparser.ConfigParser()
config.read('config')

# The data folder and the three split file names come from the [data] section.
data_folder = config.get('data', 'data_folder')
train_file = config.get('data', 'train_file')
test_file = config.get('data', 'test_file')
dev_file = config.get('data', 'dev_file')

# Build the corpus from the column format, the data folder and the names of
# the train, dev and test files.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file=train_file,
    test_file=test_file,
    dev_file=dev_file)
print(corpus)

# The tag to predict, and the dictionary of its values found in the corpus.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

### used for OwnELMoEmbeddings
from flair import device

# 0 unless flair's device is 'cpu', in which case -1.
cuda_device = 0 if str(device) != 'cpu' else -1

model = allennlp.commands.elmo.ElmoEmbedder(
    options_file='path_to_pretrain_elmo_options.json',
    weight_file='path_to_pretrain_elmo_weights.hdf5',

コード例 #22

0

ファイルを表示

ファイル: ner_flair.py プロジェクト: Petroles/Petrovec

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.optim import SGDW
import os
import torch
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
import sys

# Print a spacer line before the corpus is loaded.
print(" ")
# Column layout of the data files: token, POS tag, sub-label, label.
columns = {0: 'token', 1:'pos', 2: 'sublabel', 3:'label'}
data_folder = "data/"

# The split file names come from the command line in the order
# train, test, dev. No check is made that all three are present.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
  train_file=sys.argv[1],
  test_file=sys.argv[2],
  dev_file=sys.argv[3])

# Report the number of sentences in each split.
print(" ")
print("Train len: ", len(corpus.train))
print("Test len: ", len(corpus.test))
print("Dev len: ", len(corpus.dev))

# Show the first sentence of each split with its 'label' tags.
print(" ")
print("Train: ", corpus.train[0].to_tagged_string('label'))
print("Test: ", corpus.test[0].to_tagged_string('label'))
print("Dev: ", corpus.dev[0].to_tagged_string('label'))

# The tag to predict, and the dictionary of its values found in the corpus.
tag_type = 'label'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

コード例 #23

0

ファイルを表示

ファイル: train_flair.py プロジェクト: ismailfatih/kanarya

from flair.data import TaggedCorpus, MultiCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.data import Dictionary
import flair, torch
# Set flair's device to the CPU.
flair.device = torch.device('cpu') 

# Column layout of the data files: token text, then the 'ner' tag.
columns = {0: 'text', 1: 'ner'}
data_folder = '../'
# Two corpora are loaded from the same folder; the second uses the
# '.84max' variants of the train/test/dev files.
corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.train.txt", test_file="de-da-te-ta.10E-4percent.conll.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt", test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt", dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")
# Both corpora are combined into a single MultiCorpus.
corpus = MultiCorpus([corpus1, corpus2])
tag_type = 'ner'
# Tag dictionary built from the combined corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Alternative left disabled by the author: load a saved dictionary instead.
#tag_dictionary: Dictionary = Dictionary.load('../vocab/m.model')

# Word embeddings loaded from local gensim vector files.
glove_embedding = WordEmbeddings('../../glove/GLOVE/GloVe/vectors.gensim')
word2vec_embedding = WordEmbeddings('../../huawei_w2v/vector.gensim')

#bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
# The stack combines WordEmbeddings('tr') with the two local embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('tr'), glove_embedding, word2vec_embedding]
#embedding_types: List[TokenEmbeddings] = [custom_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

# Tagger with CRF enabled and a 2-layer RNN.
tagger: SequenceTagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True, use_rnn=True, rnn_layers=2)

from flair.trainers import ModelTrainer

コード例 #24

0

ファイルを表示

ファイル: train_tagger.py プロジェクト: stbirc/quaero_ner

def train_tagger(options):
    """Train a 'ner' tagger configured by ``options`` and plot its curves.

    The output folder name is derived from ``options.tagger_dir``,
    ``options.ner_cycle``, ``options.lm_domain`` and
    ``options.correction_mode``, followed by suffixes that record which
    embeddings and whether a CRF are used.

    Attributes read from ``options``: iob_dir, correction_mode, tagger_dir,
    ner_cycle, lm_domain, lm_dir, use_wiki_wordemb, use_press_wordemb,
    use_crf, train_patience.
    """
    # Column layout: text in column 1, POS in column 2, NER in column 3.
    column_format = {1: 'text', 2: 'pos', 3: 'ner'}
    tag_type = 'ner'

    # Folder holding train.txt, test.txt and dev.txt.
    data_folder = options.iob_dir + '/' + options.correction_mode

    # Output folder for the model and the training logs.
    folder_parts = [
        options.tagger_dir, options.ner_cycle, options.lm_domain,
        options.correction_mode
    ]
    tagger_folder = '/'.join(folder_parts) + '-stringemb'

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        column_format,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='dev.txt',
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Forward and backward language-model embeddings from options.lm_dir.
    char_embeddings = [
        FlairEmbeddings(options.lm_dir + options.lm_domain + lm_suffix,
                        use_cache=False)
        for lm_suffix in ('-fw/best-lm.pt', '-bw/best-lm.pt')
    ]

    # Optionally prepend one word embedding; the wiki option wins over the
    # press option, and each choice is recorded in the folder name.
    embedding_types: List[TokenEmbeddings]
    if options.use_wiki_wordemb:
        embedding_types = [WordEmbeddings('fr')] + char_embeddings
        tagger_folder += '-wordemb'
    elif options.use_press_wordemb:
        embedding_types = [
            WordEmbeddings('resources.d/embeddings/fasttext/pressfr-wikifr')
        ] + char_embeddings
        tagger_folder += '-wordemb-pr'
    else:
        embedding_types = char_embeddings

    if options.use_crf:
        tagger_folder += '-crf'

    # Report the configuration before training starts.
    print(tagger_folder)
    print(corpus)
    print(tag_dictionary.idx2item)

    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=options.use_crf,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # The original kept train_with_dev=True as a commented-out option.
    trainer.train(
        tagger_folder,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50,
        patience=options.train_patience,
        anneal_against_train_loss=False,
        embeddings_in_memory=False,
    )

    # Optional step in the original: plot loss curves and weights.
    curve_plotter = Plotter()
    curve_plotter.plot_training_curves(tagger_folder + '/loss.tsv')
    curve_plotter.plot_weights(tagger_folder + '/weights.txt')