Example No. 1
def test_load_ag_news_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)

    assert len(corpus.train) == 10
    assert len(corpus.dev) == 10
    assert len(corpus.test) == 10
Example No. 2
def test_load_sequence_labeling_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, tasks_base_path)

    assert len(corpus.train) == 6
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
Example No. 3
def test_text_classifier_param_selector(results_base_path, tasks_base_path):
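    # load the corpus and assemble a hyperopt search space for the classifier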
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    glove_embedding = WordEmbeddings('en-glove')
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[[glove_embedding]])
    search_space.add(Parameter.HIDDEN_SIZE,
                     hp.choice,
                     options=[64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.REPROJECT_WORDS,
                     hp.choice,
                     options=[True, False])
    search_space.add(Parameter.REPROJECT_WORD_DIMENSION,
                     hp.choice,
                     options=[64, 128])
    search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0, high=1)
    search_space.add(Parameter.MINI_BATCH_SIZE,
                     hp.choice,
                     options=[4, 8, 16, 32])
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    param_selector = TextClassifierParamSelector(
        corpus,
        False,
        results_base_path,
        document_embedding_type='lstm',
        max_epochs=2)
    param_selector.optimize(search_space, max_evals=2)
    shutil.rmtree(results_base_path)
Example No. 4
def test_train_load_use_tagger(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('turian')

    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 5
def test_train_optimizer_arguments(results_base_path, tasks_base_path):
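    # train with a non-default optimizer (AdamW), passing weight_decay through to it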
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('glove')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)
    optimizer = AdamW
    trainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True,
                  weight_decay=0.001)
    loaded_model = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example No. 6
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
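    # hyperparameter selection for a sequence tagger: embeddings, CRF, dropouts, training params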
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([WordEmbeddings('glove')]),
                         StackedEmbeddings([
                             WordEmbeddings('glove'),
                             FlairEmbeddings('news-forward'),
                             FlairEmbeddings('news-backward')
                         ])
                         ])
                     ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)
    optimizer = SequenceTaggerParamSelector(corpus,
                                            'ner',
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)
    shutil.rmtree(results_base_path)
Example No. 7
def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
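    # train a tagger on contextual string (Flair) embeddings, then reload it and predict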
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = FlairEmbeddings('news-forward-fast')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)
    loaded_model = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example No. 8
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
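    # train a POS tagger on a downsampled copy of the full UD English corpus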
    corpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.UD_ENGLISH).downsample(0.05)
    tag_dictionary = corpus.make_tag_dictionary('pos')
    embeddings = WordEmbeddings('glove')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='pos',
                            use_crf=False)
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=2,
                  test_mode=True)
    loaded_model = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example No. 9
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=2,
                  test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 10
def test_train_load_use_classifier(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False,
                                                                         False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 11
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
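    # train with checkpointing enabled, then resume training from the checkpoint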
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast',
                                                  use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 12
def test_load_imdb_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus('imdb', tasks_base_path)
    assert len(list(corpus.train())) == 5
    assert len(list(corpus.dev())) == 5
    assert len(list(corpus.test())) == 5
Example No. 13
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
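    # same flow as above, but with the embedding cache disabled (use_cache=False)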
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 14
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
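    # train a classifier on Flair embeddings, evaluating with macro F1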
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    flair_embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings([flair_embedding], 128, 1,
                                                 False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2,
                  test_mode=True)
    sentence = Sentence('Berlin is a really nice city.')
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float
    loaded_model = TextClassifier.load_from_file(
        results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    shutil.rmtree(results_base_path)
Example No. 15
def test_download_load_data(tasks_base_path):
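    # download the full UD English corpus, check the split sizes, then remove the cached data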
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
    assert len(corpus.train) == 12543
    assert len(corpus.dev) == 2002
    assert len(corpus.test) == 2077
    shutil.rmtree(
        Path(flair.file_utils.CACHE_ROOT) / 'datasets' / 'ud_english')
Example No. 16
def test_load_imdb_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus("imdb", tasks_base_path)

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 5
    assert len(corpus.test) == 5
Example No. 17
def test_load_germeval_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.GERMEVAL, tasks_base_path)

    assert len(corpus.train) == 2
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
Example No. 18
def test_load_ud_english_data(tasks_base_path):
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH,
                                            tasks_base_path)

    assert len(corpus.train) == 6
    assert len(corpus.test) == 4
    assert len(corpus.dev) == 2
Example No. 19
def init(tasks_base_path):
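    # helper: build corpus, label dictionary and an untrained text classifier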
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1,
                                                 False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return corpus, label_dict, model
Example No. 20
def init(tasks_base_path) -> Tuple[Corpus, TextRegressor, ModelTrainer]:
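    # helper: build corpus, text regressor and trainer for the regression task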
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION,
                                            tasks_base_path)
    glove_embedding = WordEmbeddings('glove')
    document_embeddings = DocumentRNNEmbeddings([glove_embedding], 128, 1,
                                                False, 64, False, False)
    model = TextRegressor(document_embeddings)
    trainer = ModelTrainer(model, corpus)
    return corpus, model, trainer
Example No. 21
def test_sentence_to_real_string(tasks_base_path):
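    # to_plain_string() should undo tokenization, including for German sentences with quotes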
    sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert 'I love Berlin.' == sentence.to_plain_string()
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.GERMEVAL, tasks_base_path)
    sentence = corpus.train[0]
    assert 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .' == sentence.to_tokenized_string()
    assert 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".' == sentence.to_plain_string()
    sentence = corpus.train[1]
    assert 'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf .' == sentence.to_tokenized_string()
    assert 'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf.' == sentence.to_plain_string()
Example No. 22
def test_download_load_data(tasks_base_path):
    # get training, test and dev data for full English UD corpus from web
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)

    assert len(corpus.train) == 12543
    assert len(corpus.dev) == 2002
    assert len(corpus.test) == 2077

    # clean up data directory
    shutil.rmtree(Path(flair.cache_root) / "datasets" / "ud_english")
Example No. 23
def init(tasks_base_path) -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('turian')
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
Example No. 24
def test_download_load_data(tasks_base_path):
    # get training, test and dev data for full English UD corpus from web
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)

    assert len(list(corpus.train())) == 12543
    assert len(list(corpus.dev())) == 2002
    assert len(list(corpus.test())) == 2077

    # clean up data directory
    shutil.rmtree(
        Path(flair.file_utils.CACHE_ROOT) / 'datasets' / 'ud_english')
Example No. 25
def init(tasks_base_path) -> Tuple[TaggedCorpus, TextRegressor, RegressorTrainer]:
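    # helper: build corpus, text regressor and its dedicated trainer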
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION,
                                            tasks_base_path)

    glove_embedding: WordEmbeddings = WordEmbeddings("glove")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextRegressor(document_embeddings, Dictionary(), False)

    trainer = RegressorTrainer(model, corpus)

    return corpus, model, trainer
Example No. 26
def test_find_learning_rate(results_base_path, tasks_base_path):
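    # run a short learning-rate range test (5 iterations) instead of a full training run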
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('glove')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False)
    optimizer = SGD
    trainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.find_learning_rate(results_base_path, iterations=5)
    shutil.rmtree(results_base_path)
Example No. 27
def test_train_charlm_changed_cache_load_use_tagger(
    results_base_path, tasks_base_path
):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary("ner")

    # make a temporary cache directory that we remove afterwards
    cache_dir = results_base_path / "cache"
    os.makedirs(cache_dir, exist_ok=True)
    embeddings = FlairEmbeddings("news-forward-fast", cache_directory=cache_dir)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        EvaluationMetric.MACRO_ACCURACY,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    # remove the cache directory
    shutil.rmtree(cache_dir)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 28
def train(data_folder, model_output_folder):

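    # 1. get the corpus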
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.CONLL_03, base_path=data_folder)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')

    # init multilingual BERT
    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        flair_forward_embedding, flair_backward_embedding, bert_embedding
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)
    # 6. initialize trainer
    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(model_output_folder, mini_batch_size=256, max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_output_folder + '/loss.tsv')
    plotter.plot_weights(model_output_folder + '/weights.txt')
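
A minimal invocation of this helper might look like the following; both paths are placeholders for illustration, not values from the source:

train('resources/tasks', 'resources/taggers/example-ner')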
Example No. 29
def test_train_optimizer_arguments(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    optimizer: Optimizer = AdamW

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)

    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
        weight_decay=1e-3,
    )

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example No. 30
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH).downsample(0.05)
    tag_dictionary = corpus.make_tag_dictionary("pos")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="pos",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        test_mode=True,
    )

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
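
Note: the snippets above are test functions excerpted from a larger suite, so they omit their imports. For the flair 0.4-era API they target, a plausible common prelude is sketched below; module paths shifted between flair versions, and a few names used above (TextRegressor, RegressorTrainer, AdamW, Corpus) moved between modules, so treat this as an assumption rather than a definitive header.

import os
import shutil
from pathlib import Path
from typing import List, Tuple

import flair
from flair.data import Sentence, Dictionary, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import (WordEmbeddings, FlairEmbeddings, BertEmbeddings,
                              StackedEmbeddings, TokenEmbeddings,
                              DocumentLSTMEmbeddings, DocumentRNNEmbeddings)
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.hyperparameter import (SearchSpace, Parameter,
                                  SequenceTaggerParamSelector,
                                  TextClassifierParamSelector)
from hyperopt import hp
from torch.optim import SGD, Optimizer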