import os
import shutil

from flair.data import Sentence
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher
from flair.embeddings import CharLMEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer


def test_train_charlm_changed_cache_load_use_tagger():

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # make a temporary cache directory that we remove afterwards
    os.makedirs('./results/cache/', exist_ok=True)
    embeddings = CharLMEmbeddings('news-forward-fast', cache_directory='./results/cache/')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)

    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)

    # remove the cache directory
    shutil.rmtree('./results/cache')

    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
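
Outside the test, the predictions can be inspected directly; a minimal sketch assuming the trained model still exists at './results/final-model.pt' (the test above deletes it at the end):

# hypothetical sketch: print tokens interleaved with their predicted NER tags
tagger = SequenceTagger.load_from_file('./results/final-model.pt')
tagged = Sentence('I love Berlin')
tagger.predict(tagged)
print(tagged.to_tagged_string())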
Example #2
def test_training():

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=True)

    trainer.train('./results',
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
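
For faster iteration, the same pipeline can run on a fraction of the data; a minimal sketch assuming flair's TaggedCorpus.downsample(), which returns a shrunken copy of the corpus:

# hypothetical sketch: train on 10% of FASHION to keep experiments quick
small_corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION).downsample(0.1)
small_tag_dictionary = small_corpus.make_tag_dictionary('ner')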
def test_train_load_use_tagger():

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)

    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
Example #4
from typing import List

from flair.data import TaggedCorpus
from flair.embeddings import (CharacterEmbeddings, CharLMEmbeddings,
                              StackedEmbeddings, TokenEmbeddings,
                              WordEmbeddings)
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer


# load_corpus is a project-local helper that builds the train/dev/test splits
def train(data_dir,
          model_dir,
          hidden_dim=256,
          word_embeddings='de-fasttext',
          use_crf=True,
          learning_rate=.1,
          batch_size=32,
          max_epochs=50,
          seed=0,
          dev_size=.1,
          test_size=.2):

    tag_type = 'ner'

    corpus: TaggedCorpus = load_corpus(data_dir,
                                       dev_size,
                                       test_size,
                                       seed,
                                       tag_to_biloes=None)
    print('Corpus:', corpus)

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print('Size of tag dictionary:', len(tag_dictionary))

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(word_embeddings),

        # comment in this line to use character embeddings
        #CharacterEmbeddings(),

        # comment in these lines to use contextual string embeddings
        # CharLMEmbeddings('news-forward'),
        # CharLMEmbeddings('news-backward'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=hidden_dim,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=use_crf)

    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=False)

    trainer.train(model_dir,
                  learning_rate=learning_rate,
                  mini_batch_size=batch_size,
                  max_epochs=max_epochs)
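
A usage sketch for the helper above; the directory arguments are hypothetical placeholders:

if __name__ == '__main__':
    # train a German NER model with the defaults declared in the signature
    train(data_dir='data/conll-style', model_dir='models/ner-de')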
def test_train_charlm_nocache_load_use_tagger(results_base_path,
                                              tasks_base_path):

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION,
                                           base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=True)

    trainer.train(str(results_base_path),
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
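
The cached and uncached variants above differ only in how CharLMEmbeddings memoizes computed embeddings on disk; a sketch of an explicit throwaway cache, reusing the constructor arguments seen in these tests:

import tempfile

# hypothetical sketch: keep the embedding cache in a directory that vanishes afterwards
with tempfile.TemporaryDirectory() as cache_dir:
    cached_embeddings = CharLMEmbeddings('news-forward-fast',
                                         cache_directory=cache_dir)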
Example #6
from typing import List

from flair.data import TaggedCorpus
from flair.embeddings import CharLMEmbeddings, StackedEmbeddings, TokenEmbeddings
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer

# project-local modules
from models import FORWARD_LM, BACKWARD_LM, GLOVE
from embeddings import KeyedWordEmbeddings
from ne_groups import GROUPS
from corpora import read_group

embedding_types: List[TokenEmbeddings] = [
    KeyedWordEmbeddings(GLOVE),
    CharLMEmbeddings(FORWARD_LM),
    CharLMEmbeddings(BACKWARD_LM)
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

for entities in GROUPS:
    corpus: TaggedCorpus = read_group(entities)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=True)
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=True)
    file_name = '-'.join(entities)
    trainer.train(f'data/models/{file_name}',
                  learning_rate=0.05,
                  mini_batch_size=124,
                  max_epochs=40,
                  save_model=True)
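
A sketch of reloading one of the per-group taggers trained in this loop; the 'final-model.pt' file name mirrors the other examples on this page and is an assumption here:

# hypothetical sketch: reload the model for the first entity group
file_name = '-'.join(GROUPS[0])
group_tagger = SequenceTagger.load_from_file(f'data/models/{file_name}/final-model.pt')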
Example #7
from typing import List
from flair.embeddings import StackedEmbeddings, TokenEmbeddings, WordEmbeddings

# 4. initialize embeddings (corpus, tag_dictionary and tag_type come from earlier steps, omitted here)
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("tmp/glove.bin"),
    # comment in these lines to use contextual string embeddings
    # CharLMEmbeddings('news-forward'),
    # CharLMEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=1024,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import SequenceTaggerTrainer

trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=8,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
plotter.plot_weights('resources/taggers/example-ner/weights.txt')
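
As a follow-up to step 8, the trained tagger can be loaded and applied; a minimal sketch assuming the run above wrote 'final-model.pt' into the training directory:

# 9. use the trained model (hypothetical follow-up step)
from flair.data import Sentence

tagger = SequenceTagger.load_from_file('resources/taggers/example-ner/final-model.pt')
sentence = Sentence('I love Berlin')
tagger.predict(sentence)
print(sentence.to_tagged_string())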