Example #1
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # initial training run; checkpoint=True makes the trainer write checkpoint.pt
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # reload the saved trainer state and resume training from the checkpoint
    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
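
These snippets come from test modules, so their module-level imports are not shown. For the flair 0.4.x-era API they exercise, the imports would look roughly like this (a sketch; adjust to your installed flair version):

import shutil

import flair.datasets
from flair.data import MultiCorpus
from flair.embeddings import (DocumentRNNEmbeddings, FlairEmbeddings,
                              TokenEmbeddings, WordEmbeddings)
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer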
Example #2
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast",
                                                  use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    checkpoint = TextClassifier.load_checkpoint(results_base_path /
                                                "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #3
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast',
                                                  use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #4
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", "SequenceTagger", corpus
    )
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
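
Note the two generations of the checkpoint API across these examples. Older flair releases (Examples 3 and 4 here, and Example #8 below) pass the checkpoint path and the model class name directly to the trainer, while later 0.4.x releases (Examples 1 and 2) first load a checkpoint object through the model class. Side by side:

# older API: checkpoint path plus model type name
trainer = ModelTrainer.load_from_checkpoint(
    results_base_path / "checkpoint.pt", "SequenceTagger", corpus
)

# newer 0.4.x API: load the checkpoint through the model class first
checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)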
Example #5
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(data_folder=(tasks_base_path /
                                                        'fashion'),
                                           column_format={
                                               0: 'text',
                                               2: 'ner',
                                           })
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('turian')
    model = SequenceTagger(hidden_size=64,
                           embeddings=embeddings,
                           tag_dictionary=tag_dictionary,
                           tag_type='ner',
                           use_crf=False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    checkpoint = SequenceTagger.load_checkpoint(
        (results_base_path / 'checkpoint.pt'))
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
Example #6
def train(params, tagger, corpus):
    if tagger is not None:
        trainer = create_trainer(tagger, corpus)
    else:
        print("Resuming training")
        trainer = ModelTrainer.load_from_checkpoint(
            Path(os.path.join(params["model_output_dirpath"],
                              "checkpoint.pt")), 'SequenceTagger', corpus)

    # trainer.train('./models/tr_glove2_word2vec_embedding_150_epochs_0.15_lr', learning_rate=0.15, mini_batch_size=16, max_epochs=150, checkpoint=True)
    trainer.train(params["model_output_dirpath"],
                  learning_rate=params["learning_rate"],
                  mini_batch_size=params["mini_batch_size"],
                  max_epochs=params["max_epochs"],
                  patience=3,
                  anneal_factor=0.5,
                  anneal_against_train_loss=True,
                  anneal_with_restarts=True,
                  checkpoint=True,
                  embeddings_in_memory=True)
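
Here `create_trainer` and `params` come from the surrounding project and are not shown. A hypothetical `params` dict covering the keys this function reads, with values mirroring the commented-out call above, might look like this (names and paths are placeholders, not part of the original):

params = {
    "model_output_dirpath": "./models/ner-tagger",  # directory where checkpoint.pt lands
    "learning_rate": 0.15,
    "mini_batch_size": 16,
    "max_epochs": 150,
}

train(params, tagger, corpus)  # fresh run: builds a trainer from the tagger
train(params, None, corpus)    # later run: tagger=None resumes from checkpoint.pt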
Example #7
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus((tasks_base_path / 'imdb'))
    label_dict = corpus.make_label_dictionary()
    embeddings = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentRNNEmbeddings([embeddings], 128, 1, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    checkpoint = TextClassifier.load_checkpoint(
        (results_base_path / 'checkpoint.pt'))
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
Example #8
def test_train_resume_sequence_tagging_training(results_base_path,
                                                tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary(u'ner')
    embeddings = WordEmbeddings(u'glove')
    model = SequenceTagger(hidden_size=64,
                           embeddings=embeddings,
                           tag_dictionary=tag_dictionary,
                           tag_type=u'ner',
                           use_crf=False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)
    trainer = ModelTrainer.load_from_checkpoint(
        (results_base_path / u'checkpoint.pt'), u'SequenceTagger', corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  test_mode=True,
                  checkpoint=True)
    shutil.rmtree(results_base_path)
Example #9
    # (snippet begins mid-statement: the corpus construction preceding these
    # lines was cut off in the source listing; only its trailing arguments remain)
                          test_file='test.txt',
                          dev_file='dev.txt',
                          **options['corpus'])
    print(corpus)

    # what tag to predict
    tag_type = 'ner'
    # make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # embeddings
    if len(args.embeddings_list) == 1:
        embeddings = FlairEmbeddings(args.embeddings_list[0])
    else:
        embeddings = StackedEmbeddings(
            [FlairEmbeddings(lm) for lm in args.embeddings_list])
    # initialize tagger
    tagger = SequenceTagger(embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            **options['sequences_tagger'])
    # initialize trainer
    if args.continue_training:
        checkpoint = tagger.load_checkpoint(args.train_path + 'checkpoint.pt')
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    else:
        trainer = ModelTrainer(tagger,
                               corpus,
                               use_tensorboard=args.tensorboard)
    # training
    trainer.train(args.train_path, **options['training'])
Example #10
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        base_path=file_path,
        #EvaluationMetric.MACRO_F1_SCORE,
        max_epochs=n_epochs,
        checkpoint=True)

    # Plot curves and store weights and losses
    plotter = Plotter()
    # file_path is a Path, so join with the / operator (Path + str raises TypeError)
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
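
A hypothetical invocation of this helper (the directory, filenames, and epoch count below are placeholders):

from pathlib import Path

trainer(
    file_path=Path("data/sentiment"),                # dataset directory
    filenames=("train.txt", "dev.txt", "test.txt"),  # train/dev/test files inside it
    checkpoint="",      # pass a path to an existing checkpoint.pt to resume instead
    stack="glove",      # "glove", "elmo", or "bert"; anything else stacks nothing
    n_epochs=10,
)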
Example #11
from pathlib import Path
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
from typing import List
import flair, torch

flair.device = torch.device('cpu')

columns = {0: 'text', 1: 'ner'}
data_folder = '../'
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")

trainer = ModelTrainer.load_from_checkpoint(
    Path('./models/example-ner-tr-embedding/checkpoint.pt'), 'SequenceTagger',
    corpus)

trainer.train('./models/example-ner-tr-embedding-continued',
              learning_rate=0.15,
              mini_batch_size=32,
              max_epochs=150,
              checkpoint=True)
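
After training completes, flair writes final-model.pt to the base path, and the tagger can be loaded for prediction. A minimal sketch using the same old-style API as the script above (the example sentence is a placeholder; newer flair versions load file paths via SequenceTagger.load instead of load_from_file):

from flair.data import Sentence
from flair.models import SequenceTagger

# load the model produced by the resumed training run
tagger = SequenceTagger.load_from_file(
    './models/example-ner-tr-embedding-continued/final-model.pt')

sentence = Sentence('Bu akşam da geliyorum .')
tagger.predict(sentence)
print(sentence.to_tagged_string())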