Esempio n. 1
0
def train():
    """Train a single-label text classifier on GloVe + Flair embeddings.

    The corpus is read from ``sst_folder`` and the training output is written
    to ``model_path``; both names are defined outside this function.
    """
    # NOTE(review): the training split is read from 'sst_dev.csv' -- confirm
    # that this file name is intended.
    corpus: Corpus = ClassificationCorpus(
        sst_folder,
        test_file='test.csv',
        dev_file='dev.csv',
        train_file='sst_dev.csv',
    )
    label_dict = corpus.make_label_dictionary()

    # Candidate token embeddings: GloVe plus forward/backward Flair models.
    glove = WordEmbeddings('glove')
    candidates = [
        glove,
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    # Keep only truthy entries so that an optional embedding may be left out.
    word_embeddings = [embedding for embedding in candidates if embedding]

    # The document embedding is built from the list of token embeddings.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
    )

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(model_path, max_epochs=10, train_with_dev=False)
    def train():
        """Train a GloVe + Flair text classifier on the data under ``./data``.

        The model output is written to ``./models``.
        """
        # The corpus files are in FastText format; only train and test files
        # are passed explicitly.
        data_root = Path('./')
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            data_root,
            test_file='./data/test.txt',
            train_file='./data/train.txt',
        )

        # GloVe word embeddings combined with Flair contextual string
        # embeddings.
        glove = WordEmbeddings('glove')
        flair_forward = FlairEmbeddings('news-forward-fast')
        flair_backward = FlairEmbeddings('news-backward-fast')
        word_embeddings = [glove, flair_forward, flair_backward]

        # An LSTM-based document embedding combines the token embeddings.
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        label_dictionary = corpus.make_label_dictionary()
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False,
        )

        ModelTrainer(classifier, corpus).train('./models', max_epochs=10)
Esempio n. 3
0
def optimize_lr():
    """Run the trainer's learning-rate finder and plot the resulting curve.

    The corpus and label dictionary come from ``load_corpus()``, which is
    defined outside this function.
    """
    corpus, label_dictionary = load_corpus()

    # Token embeddings: GloVe plus forward/backward Flair models.
    glove = WordEmbeddings('glove')
    flair_forward = FlairEmbeddings('news-forward')
    flair_backward = FlairEmbeddings('news-backward')
    token_embeddings = [glove, flair_forward, flair_backward]

    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
        bidirectional=True,
    )
    classifier = TextClassifier(
        doc_embeddings,
        label_dictionary=label_dictionary,
        multi_label=False,
    )
    trainer = ModelTrainer(classifier, corpus)

    # Find the learning rate; the result is handed to the plotter below.
    lr_curve = trainer.find_learning_rate(
        'resources/classifiers/', 'learning_rate.tsv')

    # Plot the learning-rate finder curve.
    from flair.visual.training_curves import Plotter
    Plotter().plot_learning_rate(lr_curve)
Esempio n. 4
0
def run_splits(word_embeddings, embeddings_name):
    """Train one classifier per data split (splits 1 through 5).

    :param word_embeddings: list of token embeddings handed to
        ``DocumentLSTMEmbeddings`` for every split.
    :param embeddings_name: name of the sub-folder (inside each split folder)
        that receives the training output.
    """
    banner = '##########'
    for split_no in range(1, 6):
        print(banner)
        print('Split', str(split_no))
        print(banner)

        data_folder = '<path_to_splits>/split_{}/'.format(split_no)
        corpus = ClassificationCorpus(
            data_folder,
            test_file='test.csv',
            dev_file='dev.csv',
            train_file='train.csv',
        )

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        label_dictionary = corpus.make_label_dictionary()
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False,
        )

        output_dir = data_folder + '/' + embeddings_name
        ModelTrainer(classifier, corpus).train(output_dir, max_epochs=150)
Esempio n. 5
0
    def train(self,
              learning_rate: float = 0.1,
              mini_batch_size: int = 16,
              anneal_factor: float = 0.5,
              patience: int = 5,
              max_epochs: int = 10):
        """
        Build the corpus and train a GloVe document-pool text classifier.

        Calls ``self.make_corpus()``, loads ``train.txt``/``dev.txt``/``test.txt``
        from ``self.output_data_path`` and trains into ``self.model_path``.

        :param learning_rate: forwarded to ``ModelTrainer.train``
        :param mini_batch_size: forwarded to ``ModelTrainer.train``
        :param anneal_factor: forwarded to ``ModelTrainer.train``
        :param patience: forwarded to ``ModelTrainer.train``
        :param max_epochs: forwarded to ``ModelTrainer.train``
        :return: None
        """
        self.make_corpus()
        corpus = ClassificationCorpus(
            self.output_data_path,
            train_file='train.txt',
            dev_file='dev.txt',
            test_file='test.txt',
        )
        label_dictionary = corpus.make_label_dictionary()

        glove = WordEmbeddings('glove')
        pooled_embeddings = DocumentPoolEmbeddings([glove])
        classifier = TextClassifier(pooled_embeddings,
                                    label_dictionary=label_dictionary)

        # Collect the hyper-parameters once and pass them through unchanged.
        training_options = {
            'learning_rate': learning_rate,
            'mini_batch_size': mini_batch_size,
            'anneal_factor': anneal_factor,
            'patience': patience,
            'max_epochs': max_epochs,
        }
        ModelTrainer(classifier, corpus).train(self.model_path,
                                               **training_options)
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    """Train a Flair-embedding classifier (cache disabled), predict, reload."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    flair_embedding: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [flair_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(results_base_path, max_epochs=2, test_mode=True)

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 7
0
def test_train_charlm_load_use_classifier():
    """Train a CharLM-based classifier for two epochs, predict, then reload it.

    The model is trained into ``./results``; afterwards the saved
    ``final-model.pt`` is loaded and used on a normal sentence, a mixed batch
    and an empty sentence. The results directory is removed at the end.
    """
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    # Every predicted label must carry a value and a float score in [0, 1].
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    # Load the saved model exactly once, after the prediction loop. Loading it
    # inside the loop would reload the file on every iteration and leave
    # `loaded_model` unbound whenever the loop body never runs.
    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
Esempio n. 8
0
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train a Flair-embedding classifier, predict with it, reload it."""
    corpus = NLPTaskDataFetcher.load_corpus(u'imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    flair_embedding = FlairEmbeddings(u'news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings(
        [flair_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MACRO_F1_SCORE,
        max_epochs=2,
        test_mode=True,
    )

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence(u'Berlin is a really nice city.')
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    model_file = results_base_path / u'final-model.pt'
    loaded_model = TextClassifier.load_from_file(model_file)

    sentence = Sentence(u'I love Berlin')
    sentence_empty = Sentence(u'       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
def train():
    """Train a GloVe + RNN text classifier on ``SENTEVAL_SST_GRANULAR``.

    The training output is written to ``resources/taggers/trec``.
    """
    # SST-5 corpus and its label dictionary.
    corpus: Corpus = SENTEVAL_SST_GRANULAR()
    label_dict = corpus.make_label_dictionary()

    # GloVe only (used for testing), fed into an RNN document embedding.
    glove = WordEmbeddings('glove')
    document_embeddings = DocumentRNNEmbeddings([glove], hidden_size=256)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)

    # Hyper-parameters handed to the trainer unchanged.
    training_options = {
        'learning_rate': 0.1,
        'mini_batch_size': 32,
        'anneal_factor': 0.5,
        'patience': 5,
        'embeddings_storage_mode': 'gpu',
        'max_epochs': 15,
    }
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('resources/taggers/trec', **training_options)
Esempio n. 10
0
def test_train_charlm_nocache_load_use_classifier(results_base_path,
                                                  tasks_base_path):
    """Train a Flair-embedding classifier (cache disabled), predict, reload."""
    imdb_folder = tasks_base_path / "imdb"
    corpus = flair.datasets.ClassificationCorpus(imdb_folder)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast",
                                                 use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(
        results_base_path, max_epochs=2, shuffle=False)

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 11
0
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    """Train with ``ImbalancedClassificationDatasetSampler``, predict, reload.

    ``document_embeddings`` is taken from outside this function.
    """
    imdb_folder = tasks_base_path / "imdb"
    corpus = flair.datasets.ClassificationCorpus(imdb_folder)
    label_dict = corpus.make_label_dictionary()

    model: TextClassifier = TextClassifier(
        document_embeddings, label_dict, multi_label=False)

    training_options = {
        "max_epochs": 2,
        "shuffle": False,
        "sampler": ImbalancedClassificationDatasetSampler,
    }
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # Drop the training objects before loading the saved model.
    del trainer
    del model
    del corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # Remove the results directory.
    shutil.rmtree(results_base_path)
    del loaded_model
Esempio n. 12
0
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    """Train with checkpointing, then resume training from ``checkpoint.pt``."""
    imdb_folder = tasks_base_path / "imdb"
    corpus = flair.datasets.ClassificationCorpus(imdb_folder)
    label_dict = corpus.make_label_dictionary()

    # `document_embeddings` is not built here; it comes from outside this
    # function.
    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    # The same options are used for the initial and the resumed run.
    training_options = {"max_epochs": 2, "shuffle": False, "checkpoint": True}

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # Discard the first trainer/model and continue from the checkpoint.
    del trainer
    del model
    checkpoint_file = results_base_path / "checkpoint.pt"
    trainer = ModelTrainer.load_checkpoint(checkpoint_file, corpus)
    trainer.train(results_base_path, **training_options)

    # Remove the results directory.
    shutil.rmtree(results_base_path)
    del trainer
Esempio n. 13
0
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    """Train a multi-label classifier, check its predictions, reload it."""
    # The corpus is loaded directly from the "multi_class" task folder.
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class"
    )
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    training_options = {
        "mini_batch_size": 1,
        "max_epochs": 100,
        "test_mode": True,
        "checkpoint": False,
    }
    ModelTrainer(model, corpus).train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        **training_options
    )

    # First pass: every label has a value and a float score in [0, 1].
    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # Second pass on a fresh sentence: both expected labels must be present.
    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 14
0
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train a Flair-embedding classifier, predict with it, reload it."""
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(
        results_base_path,
        EvaluationMetric.MACRO_F1_SCORE,
        max_epochs=2,
        test_mode=True,
    )

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    model_file = results_base_path / "final-model.pt"
    loaded_model = TextClassifier.load_from_file(model_file)

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 15
0
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    """Train with checkpointing, then resume training from ``checkpoint.pt``."""
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast',
                                                  use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    # The same options are used for the initial and the resumed run.
    training_options = {
        'max_epochs': 2,
        'test_mode': True,
        'checkpoint': True,
    }

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # Rebuild the trainer from the checkpoint and train again.
    checkpoint_file = results_base_path / 'checkpoint.pt'
    trainer = ModelTrainer.load_from_checkpoint(
        checkpoint_file, 'TextClassifier', corpus)
    trainer.train(results_base_path, **training_options)

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 16
0
def test_train_resume_classifier(results_base_path, tasks_base_path):
    """Train for two epochs, then resume from the checkpoint up to epoch 4.

    ``document_embeddings`` is taken from outside this function. Resuming is
    expected to emit a ``UserWarning``.
    """
    label_type = "topic"
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type=label_type)
    label_dict = corpus.make_label_dictionary(label_type=label_type)

    model = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
        label_type=label_type,
    )

    # Initial run: two epochs with checkpointing enabled.
    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    del model

    # Load the checkpointed model and continue training until epoch 4.
    checkpoint_file = results_base_path / "checkpoint.pt"
    checkpoint_model = TextClassifier.load(checkpoint_file)
    with pytest.warns(UserWarning):
        trainer.resume(model=checkpoint_model, max_epochs=4)

    del trainer
Esempio n. 17
0
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    """Train a "topic" classifier, predict with it, then reload the saved model.

    ``document_embeddings`` is taken from outside this function.
    """
    label_type = "topic"
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type=label_type)
    label_dict = corpus.make_label_dictionary(label_type=label_type)

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type=label_type,
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    # Labels are read from the sentence after predicting; each needs a value
    # and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    model.predict(sentence)
    for predicted_label in sentence.labels:
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    # Drop the training objects before loading the saved model.
    del trainer
    del model
    del corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model
Esempio n. 18
0
def train_model(data_dir, max_epochs):
    """Train a GloVe + RNN text classifier, reporting progress via ``st.write``.

    :param data_dir: folder handed to ``ClassificationCorpus``.
    :param max_epochs: forwarded to ``ModelTrainer.train``.
    """
    # Step 1: corpus and label dictionary.
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    # Step 2: embeddings (GloVe only).
    st.write('Load and create Embeddings for text data...')
    glove = WordEmbeddings('glove')
    document_embeddings = DocumentRNNEmbeddings(
        [glove],
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    st.write('Done')

    # Step 3: classifier and training; output goes to 'model-saves'.
    st.write('Preparing')
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    training_options = {
        'learning_rate': 0.1,
        'mini_batch_size': 32,
        'anneal_factor': 0.5,
        'patience': 8,
        'max_epochs': max_epochs,
        'checkpoint': True,
    }
    ModelTrainer(classifier, corpus).train('model-saves', **training_options)
    st.write('Model Training Finished!')
Esempio n. 19
0
def test_train_load_use_classifier_with_prob(results_base_path,
                                             tasks_base_path):
    """Train a classifier and predict with ``multi_class_prob=True``."""
    imdb_folder = tasks_base_path / "imdb"
    corpus = flair.datasets.ClassificationCorpus(imdb_folder)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    ModelTrainer(model, corpus).train(
        results_base_path, max_epochs=2, shuffle=False)

    # Every predicted label must carry a value and a float score in [0, 1].
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence, multi_class_prob=True):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # The saved model must load and cope with single, mixed and empty input.
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # Remove the results directory.
    shutil.rmtree(results_base_path)
Esempio n. 20
0
def test_train_resume_text_classification_training(results_base_path,
                                                   tasks_base_path):
    """Train with checkpointing, then resume training from ``checkpoint.pt``."""
    imdb_folder = tasks_base_path / "imdb"
    corpus = flair.datasets.ClassificationCorpus(imdb_folder)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast",
                                                  use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    # The same options are used for the initial and the resumed run.
    training_options = {"max_epochs": 2, "shuffle": False, "checkpoint": True}

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # Load the checkpoint and rebuild the trainer from it.
    checkpoint_file = results_base_path / "checkpoint.pt"
    checkpoint = TextClassifier.load_checkpoint(checkpoint_file)
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, **training_options)

    # Remove the results directory.
    shutil.rmtree(results_base_path)
def test_train_resume_classifier(results_base_path, tasks_base_path):
    """Train a "topic" classifier with checkpointing, then resume from it.

    ``document_embeddings`` is taken from outside this function.
    """
    label_type = "topic"
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type=label_type)
    label_dict = corpus.make_label_dictionary(label_type=label_type)

    model = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
        label_type=label_type,
    )

    # The same options are used for the initial and the resumed run.
    training_options = {"max_epochs": 2, "shuffle": False, "checkpoint": True}

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # Discard the first trainer/model and continue from the checkpoint.
    del trainer
    del model
    checkpoint_file = results_base_path / "checkpoint.pt"
    trainer = ModelTrainer.load_checkpoint(checkpoint_file, corpus)
    trainer.train(results_base_path, **training_options)

    # Remove the results directory.
    shutil.rmtree(results_base_path)
    del trainer
Esempio n. 22
0
def train(args):
    """Jointly train a sequence tagger and a text classifier.

    Builds a ``ClassColumnCorpus`` from ``args``, creates a CRF
    ``SequenceTagger`` and a ``TextClassifier`` over the same embeddings,
    trains both through ``JointModelTrainer`` and logs the elapsed time.

    :param args: options object; the attributes read here include the data
        location/format settings, model hyper-parameters and trainer settings.
    """
    start_time = time.time()

    # Map column index -> column name.
    column_format = dict(enumerate(args.data_columns))
    corpus: Corpus = ClassColumnCorpus(
        args.data_dir,
        column_format,
        train_file=args.train_file,
        dev_file=args.dev_file,
        comment_symbol=args.comment_symbol,
        label_symbol=args.label_symbol,
    )

    # The last data column is used as the tag type.
    tag_type = args.data_columns[-1]
    tag_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    label_dict = corpus.make_label_dictionary()
    vocab = corpus.make_vocab_dictionary().get_items()
    embeddings = utils.init_embeddings(vocab, args)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=args.hidden_size,
        embeddings=embeddings,
        tag_dictionary=tag_dict,
        tag_type=tag_type,
        column_format=column_format,
        use_crf=True,
        use_attn=args.use_attn,
        attn_type=args.attn_type,
        num_heads=args.num_heads,
        scaling=args.scaling,
        pooling_operation=args.pooling_operation,
        use_sent_query=args.use_sent_query,
    )

    # The classifier uses an RNN document embedding over the same embeddings.
    document_embeddings = DocumentRNNEmbeddings(
        [embeddings],
        hidden_size=args.hidden_size,
    )
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)

    utils.init_joint_models(tagger, classifier, args)

    optimizer = utils.optim_method(args.optim)
    trainer: JointModelTrainer = JointModelTrainer(
        tagger, classifier, corpus, optimizer)

    trainer.train(
        args.model_dir,
        mini_batch_size=args.mini_batch_size,
        max_epochs=args.max_epochs,
        anneal_factor=args.anneal_factor,
        learning_rate=args.learning_rate,
        patience=args.patience,
        min_learning_rate=args.min_learning_rate,
        embeddings_storage_mode=args.embeddings_storage_mode,
        gamma=args.gamma,
    )

    elapsed_minutes = (time.time() - start_time) / 60
    logger.info("End of training: time %.1f min", elapsed_minutes)
Esempio n. 23
0
def test_text_classifier_transformer_finetune(results_base_path,
                                              tasks_base_path):
    """Fine-tune a ``distilbert-base-uncased`` classifier on a trivial corpus.

    After fine-tuning, the reloaded model must predict "Berlin" for the probe
    sentence and reach a micro-average F1 of 1.0 on the corpus test split.
    """
    flair.set_seed(123)

    label_type = "city"
    corpus_folder = (tasks_base_path / "trivial" /
                     "trivial_text_classification_single")
    corpus = ClassificationCorpus(corpus_folder, label_type=label_type)
    label_dict = corpus.make_label_dictionary(label_type=label_type)

    transformer_embeddings = TransformerDocumentEmbeddings(
        "distilbert-base-uncased")
    model: TextClassifier = TextClassifier(
        document_embeddings=transformer_embeddings,
        label_dictionary=label_dict,
        label_type=label_type,
        multi_label=False,
    )

    fine_tune_options = {
        "mini_batch_size": 2,
        "max_epochs": 10,
        "shuffle": True,
        "learning_rate": 0.5e-5,
        "num_workers": 2,
    }
    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(results_base_path, **fine_tune_options)

    # The in-memory model must cope with single, mixed and empty input.
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # Load the saved model.
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # Check that the loaded model predicts the correct label.
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for predicted_label in sentence.labels:
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float
        values.append(predicted_label.value)

    assert "Berlin" in values

    # The loaded model must have fit the training data perfectly.
    result: Result = loaded_model.evaluate(corpus.test,
                                           gold_label_type=label_type)
    micro_average = result.classification_report["micro avg"]
    assert micro_average["f1-score"] == 1.0

    del loaded_model
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):
    """Train a multi-label "topic" classifier, check predictions, reload it.

    ``document_embeddings`` is taken from outside this function. Training
    also uses the dev and test splits.
    """
    label_type = "topic"
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "multi_class", label_type=label_type)
    label_dict = corpus.make_label_dictionary(label_type=label_type)

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type=label_type,
        multi_label=True,
    )

    training_options = {
        "mini_batch_size": 1,
        "max_epochs": 100,
        "shuffle": False,
        "checkpoint": False,
        "train_with_test": True,
        "train_with_dev": True,
    }
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, **training_options)

    # First pass: every label has a value and a float score in [0, 1].
    sentence = Sentence("apple tv")
    model.predict(sentence)
    for predicted_label in sentence.labels:
        print(predicted_label)
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    # Second pass on a fresh sentence: both expected labels must be present.
    sentence = Sentence("apple tv")
    model.predict(sentence)

    assert "apple" in sentence.get_label_names()
    assert "tv" in sentence.get_label_names()

    for predicted_label in sentence.labels:
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    # Drop the training objects before loading the saved model.
    del trainer
    del model
    del corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # Remove the results directory.
    shutil.rmtree(results_base_path)
    del loaded_model
Esempio n. 25
0
    def __init__(
            self,
            task_name: str,
            label_dictionary: Dictionary,
            label_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TARSClassifier.

        Wraps a binary ('False'/'True') TextClassifier built on transformer
        document embeddings and registers the first task.

        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param label_type: label type of the task; passed on, together with
        task_name and label_dictionary, to add_and_switch_to_new_task
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc. An already constructed
        TransformerDocumentEmbeddings instance is used as-is.
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive labels against a sentence during training. Defaults to 2 negative
        labels for each positive label. The model would sample all the negative labels
        if None is passed. That slows down the training considerably.
        :param prefix: stored on the instance as self.prefix; it is not used
        further inside this constructor
        :param tagger_args: extra keyword arguments forwarded unchanged to the
        underlying TextClassifier (presumably options such as multi_label,
        multi_label_threshold or beta -- confirm against the TextClassifier
        signature)
        """
        super(TARSClassifier, self).__init__()

        # Imported locally rather than at module level.
        from flair.embeddings import TransformerDocumentEmbeddings

        # Anything that is not already a TransformerDocumentEmbeddings is
        # treated as a model name and wrapped in fine-tunable embeddings.
        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                       fine_tune=True,
                                                       layers='-1',
                                                       layer_mean=False,
                                                       )

        # prepare TARS dictionary: two items, 'False' and 'True', no unknown item
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('False')
        tars_dictionary.add_item('True')

        # initialize the underlying TextClassifier over the binary dictionary;
        # self.static_label_type is not set in this method and must be
        # defined elsewhere on the class
        self.tars_model = TextClassifier(document_embeddings=embeddings,
                                         label_dictionary=tars_dictionary,
                                         label_type=self.static_label_type,
                                         **tagger_args,
                                         )

        # transformer separator: the tokenizer's sep token, followed by its
        # bos token when the tokenizer has one set
        # NOTE(review): self.tars_embeddings is not assigned in this method;
        # presumably it is defined elsewhere on the class and exposes the
        # embeddings of self.tars_model -- confirm.
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)
Esempio n. 26
0
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):
    """Train, query and reload a multi-label text classifier end to end.

    A GloVe + LSTM document classifier is trained on the ``multi_class``
    task data. Every predicted label must carry a value and a float score
    in [0, 1]. The saved model is then reloaded and run on a regular and a
    whitespace-only sentence.

    :param results_base_path: path handed to the trainer; ``final-model.pt``
        is reloaded from it and the directory is removed at the end
    :param tasks_base_path: directory containing the ``multi_class`` folder
    """

    def check_labels(predicted_sentence):
        # each label needs a value and a plain float score within [0, 1]
        for label in predicted_sentence.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # build corpus and label dictionary
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    # small unidirectional LSTM over GloVe embeddings
    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    lstm_options = dict(hidden_size=32,
                        reproject_words=False,
                        bidirectional=False)
    document_embeddings = DocumentLSTMEmbeddings(
        embeddings=[glove_embedding], **lstm_options)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    train_options = dict(max_epochs=100, test_mode=True, checkpoint=False)
    ModelTrainer(model, corpus).train(results_base_path,
                                      EvaluationMetric.MICRO_F1_SCORE,
                                      **train_options)

    # first pass: only the label invariants are checked
    sentence = Sentence('apple tv')
    for predicted in model.predict(sentence):
        check_labels(predicted)

    # second pass: both expected label names must be present as well
    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()
        check_labels(predicted)

    # the persisted model must load and cope with empty input
    loaded_model = TextClassifier.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        loaded_model.predict(batch)

    # clean up results directory
    shutil.rmtree(results_base_path)
Esempio n. 27
0
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Check that a multi-label classifier fits the trivial ``city`` task.

    The model is trained on ``trivial/trivial_text_classification_multi``,
    saved, reloaded, and must then predict both "Berlin" and "pizza" for a
    sentence mentioning both, and reach a micro-average F1 of 1.0 on the
    test split.

    :param results_base_path: path handed to the trainer; ``final-model.pt``
        is loaded back from it
    :param tasks_base_path: directory containing the ``trivial`` task data
    """
    flair.set_seed(123)

    data_folder = (tasks_base_path / "trivial"
                   / "trivial_text_classification_multi")
    corpus = ClassificationCorpus(data_folder, label_type="city")
    label_dict = corpus.make_label_dictionary(label_type="city")

    pooled_embeddings = DocumentPoolEmbeddings([turian_embeddings],
                                               fine_tune_mode="linear")
    model: TextClassifier = TextClassifier(
        document_embeddings=pooled_embeddings,
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

    ModelTrainer(model, corpus).train(results_base_path,
                                      mini_batch_size=2,
                                      max_epochs=50,
                                      shuffle=True)

    # the freshly trained model must handle single, batched and empty input
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")
    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        model.predict(batch)

    # reload the persisted model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check that the reloaded model predicts the correct labels
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")
    loaded_model.predict([sentence, sentence_double])

    predicted_labels = sentence_double.labels
    for label in predicted_labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
    values = [label.value for label in predicted_labels]

    assert "Berlin" in values
    assert "pizza" in values

    # the reloaded model must have fit the training data perfectly
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Esempio n. 28
0
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    """Split ``df`` 70/20/10 by id, train a Flair classifier and save it.

    The ids in ``df['id']`` are split in their existing order: the first
    70% go to train, the next 20% to test and the last 10% to dev. Each
    split is written as a headerless, tab-separated file inside a temporary
    directory, which is also used as the trainer's output folder. The
    trained classifier is finally saved to ``classifierFileName``.

    :param df: DataFrame with an ``id`` column plus the columns in ``columns``
    :param columns: columns written to the split files, in this order
        (presumably label first, then text, as the corpus loader expects
        -- verify against the caller)
    :param trainNameCsv: file name of the train split inside the temp dir
    :param testNameCsv: file name of the test split inside the temp dir
    :param devNameCsv: file name of the dev split inside the temp dir
    :param classifierFileName: destination path of the saved classifier
    """
    import os

    ids = df['id'].tolist()

    n_samples = len(ids)
    train_end = int(n_samples * 0.7)
    test_end = int(n_samples * 0.9)

    splits = (
        (trainNameCsv, ids[:train_end]),
        (testNameCsv, ids[train_end:test_end]),
        (devNameCsv, ids[test_end:]),
    )

    with TemporaryDirectory() as temp_dir:
        split_paths = []
        for file_name, split_ids in splits:
            # Join with a real path separator: plain concatenation put the
            # files *next to* temp_dir (e.g. "/tmp/tmpabctrain.csv"), where
            # they were never cleaned up. Leading separators are stripped so
            # names such as "/train.csv" keep resolving inside temp_dir.
            csv_path = os.path.join(temp_dir, file_name.lstrip('/\\'))
            df[df['id'].isin(split_ids)].to_csv(csv_path,
                                                columns=columns,
                                                sep='\t',
                                                index=False,
                                                header=False)
            split_paths.append(csv_path)
        train_csv, test_csv, dev_csv = split_paths

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir,
            train_file=train_csv,
            test_file=test_csv,
            dev_file=dev_csv)

        # GloVe word embeddings stacked with fast Flair string embeddings
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)

        # training artefacts go to temp_dir and vanish with it
        trainer.train(temp_dir, max_epochs=50)

        # persist the classifier before the temp dir is removed
        classifier.save(classifierFileName)
Esempio n. 29
0
def main(args):
    """Train a GloVe-based sentence classifier and plot its training curves.

    The corpus is read from the first entry of the parsed ``data_dir``
    option (``train.txt`` / ``dev.txt`` / ``test.txt``), and the model and
    logs are written to
    ``resources/classifiers/sentence-classification/glove``.

    NOTE(review): the ``args`` parameter is immediately replaced by
    ``parser.parse_args()``, so whatever the caller passes in is ignored.

    :param args: unused, see the note above
    """
    args = parser.parse_args()

    # 1. load the corpus from its three split files
    split_files = {
        'train_file': 'train.txt',
        'dev_file': 'dev.txt',
        'test_file': 'test.txt',
    }
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0], **split_files)

    # 2. derive the label dictionary from the corpus
    label_dict = corpus.make_label_dictionary()

    # 3. word embeddings; FlairEmbeddings('news-forward'),
    #    FlairEmbeddings('news-backward') or ELMoEmbeddings() can be added
    #    to this list for state-of-the-art results
    word_embeddings = [WordEmbeddings('glove')]

    # 4. LSTM document embedding on top of the word embeddings
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=128,
                                                 reproject_words=True,
                                                 reproject_words_dimension=64)

    # 5. single-label text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. trainer for the classifier
    trainer = ModelTrainer(classifier, corpus)

    # 7. run the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    training_options = dict(learning_rate=0.1,
                            mini_batch_size=32,
                            anneal_factor=0.5,
                            patience=5,
                            max_epochs=100)
    trainer.train(model_out, **training_options)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    loss_file = join(model_out, 'loss.tsv')
    plotter.plot_training_curves(loss_file)
    weights_file = join(model_out, 'weights.txt')
    plotter.plot_weights(weights_file)
Esempio n. 30
0
def test_labels_to_indices(tasks_base_path):
    """Check that ``_labels_to_indices`` agrees with the label dictionary.

    For every training sentence of the ``ag_news`` corpus, the index
    produced by the model must equal the dictionary index of the
    sentence's first label.

    :param tasks_base_path: directory containing the ``ag_news`` data folder
    """
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news")
    label_dict = corpus.make_label_dictionary()
    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    train_set = corpus.train
    result = model._labels_to_indices(train_set)

    for position in range(len(train_set)):
        # only the first label of each sentence is compared
        gold_value = train_set[position].labels[0].value
        expected = label_dict.get_idx_for_item(gold_value)

        assert expected == result[position].item()