Example 1
from flair.datasets import ClassificationCorpus
from flair.models import TARSClassifier


def test_init_tars_and_switch(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb")

    # create a TARS classifier
    tars = TARSClassifier(task_name='2_CLASS',
                          label_dictionary=corpus.make_label_dictionary(label_type='class'),
                          label_type='class')

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 2

    # switch to task with only one label
    tars.add_and_switch_to_new_task('1_CLASS', 'one class', "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 1

    # switch to task with three labels provided as list
    tars.add_and_switch_to_new_task('3_CLASS', ['list 1', 'list 2', 'list 3'], "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 3

    # switch to task with four labels provided as set
    tars.add_and_switch_to_new_task('4_CLASS', {'set 1', 'set 2', 'set 3', 'set 4'}, "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 4

    # switch to task with two labels provided as Dictionary
    tars.add_and_switch_to_new_task('2_CLASS_AGAIN', corpus.make_label_dictionary(label_type='class'), "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 2
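The test above registers several label sets on one model; a pretrained TARS model can also classify against ad-hoc labels without any registered task. A minimal sketch, assuming the downloadable "tars-base" checkpoint is available:

from flair.data import Sentence
from flair.models import TARSClassifier

# load the pretrained zero-shot model (assumption: the "tars-base" checkpoint can be downloaded)
tars = TARSClassifier.load('tars-base')

# zero-shot prediction with candidate labels supplied on the fly
sentence = Sentence("I am so glad you liked it!")
tars.predict_zero_shot(sentence, ["happy", "sad"])
print(sentence.labels)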
Example 2
from flair.datasets import ClassificationCorpus
from flair.embeddings import DocumentLSTMEmbeddings  # older Flair releases; newer ones use DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def run_splits(word_embeddings, embeddings_name):
    for i in range(1, 6):
        print('##########')
        print('Split', str(i))
        print('##########')

        data_folder = '<path_to_splits>/split_' + str(i) + '/'
        corpus = ClassificationCorpus(data_folder,
                                      test_file='test.csv',
                                      dev_file='dev.csv',
                                      train_file='train.csv')

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + embeddings_name, max_epochs=150)
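A hypothetical invocation of the helper above, stacking only GloVe embeddings (the '<path_to_splits>' placeholder inside run_splits still has to be filled in):

from flair.embeddings import WordEmbeddings

run_splits([WordEmbeddings('glove')], 'glove')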
Example 3
    # (method of a training-pipeline class; assumes the flair imports at module level)
    def train(self,
              learning_rate: float = 0.1,
              mini_batch_size: int = 16,
              anneal_factor: float = 0.5,
              patience: int = 5,
              max_epochs: int = 10):
        """Build the corpus via make_corpus() and train a GloVe +
        document-pool text classifier, saving artifacts to self.model_path.
        """
        self.make_corpus()
        corpus = ClassificationCorpus(self.output_data_path,
                                      train_file='train.txt',
                                      dev_file='dev.txt',
                                      test_file='test.txt')

        label_dictionary = corpus.make_label_dictionary()

        embeddings = [WordEmbeddings('glove')]
        document_pool = DocumentPoolEmbeddings(embeddings)
        classifier = TextClassifier(document_pool,
                                    label_dictionary=label_dictionary)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(
            self.model_path,
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            anneal_factor=anneal_factor,
            patience=patience,
            max_epochs=max_epochs,
        )
Example 4
from flair.data import Sentence
from flair.datasets import ClassificationCorpus
from flair.models import TARSClassifier
from flair.trainers import ModelTrainer


def test_train_tars(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # create a TARS classifier
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # switch to a new task (TARS can do multiple tasks so you must define one)
    tars.add_and_switch_to_new_task(task_name="question 2_CLASS",
                                    label_dictionary=corpus.make_label_dictionary(label_type='class'),
                                    label_type='class',
                                    )

    # initialize the text classifier trainer
    trainer = ModelTrainer(tars, corpus)

    # start the training
    trainer.train(base_path='resources/taggers/trec',  # path to store the model artifacts
                  learning_rate=0.02,  # use very small learning rate
                  mini_batch_size=1,
                  # mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
                  max_epochs=1,  # terminate after 1 epoch
                  )

    sentence = Sentence("This is great!")
    tars.predict(sentence)
Example 5
from flair.data import Sentence
from flair.datasets import ClassificationCorpus
from flair.models import TARSClassifier
from flair.trainers import ModelTrainer


def test_train_tars(tasks_base_path, results_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # create a TARS classifier
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # switch to a new task (TARS can do multiple tasks so you must define one)
    tars.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=corpus.make_label_dictionary(label_type="class"),
        label_type="class",
    )

    # initialize the text classifier trainer
    trainer = ModelTrainer(tars, corpus)

    # start the training
    trainer.train(
        base_path=results_base_path,
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=1,
    )

    sentence = Sentence("This is great!")
    tars.predict(sentence)
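By default the trainer writes final-model.pt into the given base path, so the trained TARS model can be reloaded afterwards; a minimal sketch:

from flair.data import Sentence
from flair.models import TARSClassifier

loaded_tars = TARSClassifier.load(results_base_path / "final-model.pt")
loaded_tars.predict(Sentence("This is terrible!"))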
Example 6
import streamlit as st

from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def train_model(data_dir, max_epochs):
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    word_embeddings = [
        WordEmbeddings('glove'),
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256)
    st.write('Done')

    st.write('Preparing')
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('model-saves',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=8,
                  max_epochs=max_epochs,
                  checkpoint=True)
    st.write('Model Training Finished!')
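Since training stores its artifacts under 'model-saves', the app can later reload the model for inference. A minimal sketch, assuming the corpus has a dev split (best-model.pt is only written in that case):

from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load('model-saves/best-model.pt')
sentence = Sentence('The plot was gripping from start to finish.')
classifier.predict(sentence)
st.write(sentence.labels)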
Example 7
import flair
from flair.data import Sentence
from flair.datasets import ClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import Result


def test_text_classifier_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings(
            "distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example 8
import flair
from flair.data import Sentence
from flair.datasets import ClassificationCorpus
from flair.embeddings import DocumentPoolEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import Result

# `turian_embeddings` is assumed to be defined at module level
# (in Flair's test suite it is a small WordEmbeddings instance).

def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings],
                                                   fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  mini_batch_size=2,
                  max_epochs=50,
                  shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")

    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example 9
import torch

import flair
from flair.datasets import ClassificationCorpus
from flair.embeddings import DocumentRNNEmbeddings, ELMoEmbeddings, FlairEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):
    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir,
                                  train_file=train,
                                  dev_file=dev,
                                  test_file=test,
                                  in_memory=False)

    label_dict = corpus.make_label_dictionary()

    optional_embedding = ELMoEmbeddings('original')

    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    #
    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)
Example 10
    # (method of a model-wrapper class; __path_to_base__, os and Path are module-level names)
    def _train_model(self):
        # type: () -> None
        corpus = ClassificationCorpus(
            Path(__path_to_base__),
            test_file=os.path.basename(self.path_to_test),
            dev_file=os.path.basename(self.path_to_dev),
            train_file=os.path.basename(self.path_to_train))
        word_embeddings = [
            ELMoEmbeddings('original'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(__path_to_base__, max_epochs=10)
Example 11
import flair
from flair.datasets import ClassificationCorpus
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    flair_embeddings = FlairEmbeddings("news-forward-fast")

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([flair_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=2, max_epochs=1, shuffle=True)

    del model
    train_log_file = results_base_path / "training.log"
    assert train_log_file.exists()
    lines = train_log_file.read_text(encoding="utf-8").split("\n")
    expected_substrings = [
        "Device: ",
        "Corpus: ",
        "Parameters:",
        "- learning_rate: ",
        "- patience: ",
        "Embeddings storage mode:",
        "epoch 1 - iter",
        "EPOCH 1 done: loss",
        "Results:",
    ]
    for expected_substring in expected_substrings:
        assert any(expected_substring in line for line in lines), expected_substring
Example 12
# reconstructed imports (the snippet's original import block was truncated)
from pathlib import Path

from hyperopt import hp

from flair.datasets import ClassificationCorpus
from flair.embeddings import BertEmbeddings
from flair.hyperparameter.param_selection import (
    Parameter,
    SearchSpace,
    OptimizationValue,
)

if __name__ == "__main__":
    data_folder = Path("..", "classification", "data", "downsampled", "flair")
    for c in ["dramen", "romane", "zeitung", "wikipedia"]:
        test_file = f"{c}-downsampled-test-flair.txt"
        dev_file = f"{c}-downsampled-val-flair.txt"
        train_file = f"{c}-downsampled-train-flair.txt"

        corpus = ClassificationCorpus(data_folder,
                                      test_file=test_file,
                                      dev_file=dev_file,
                                      train_file=train_file)

        label_dict = corpus.make_label_dictionary()

        search_space = SearchSpace()
        search_space.add(
            Parameter.EMBEDDINGS,
            hp.choice,
            options=[[BertEmbeddings("bert-base-german-cased")]],
        )
        search_space.add(Parameter.HIDDEN_SIZE,
                         hp.choice,
                         options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
        search_space.add(Parameter.LEARNING_RATE,
                         hp.choice,
                         options=[0.05, 0.1, 0.15, 0.2])
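
        # Continuation sketch, not part of the original snippet: the code above only
        # defines the search space. With the same (older) flair.hyperparameter API,
        # the search itself would be launched roughly like this; the output directory,
        # epoch count, and evaluation budget below are placeholder values.
        from flair.hyperparameter.param_selection import TextClassifierParamSelector

        param_selector = TextClassifierParamSelector(
            corpus,
            False,                        # multi_label
            f"resources/hyperopt/{c}",    # hypothetical output directory
            "lstm",                       # document embedding type
            max_epochs=10,
            training_runs=1,
            optimization_value=OptimizationValue.DEV_SCORE,
        )
        param_selector.optimize(search_space, max_evals=10)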
Example 13
from pathlib import Path
from typing import Tuple


def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack GloVe, BERT, or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
Example 14
# `embedding`, `corpus` and `PATH` are assumed to be defined in earlier cells of
# the notebook this snippet was extracted from.
word_embeddings = [
    embedding,
    # FlairEmbeddings('news-forward', use_cache=True),
    # FlairEmbeddings('news-backward', use_cache=True),
]

# apply document LSTM to the stacked embeddings
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    # hidden_size=512,
    # reproject_words=True,
    # reproject_words_dimension=256,
)

# build model
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
trainer = ModelTrainer(classifier, corpus)

# specify parameters and train model
trainer.train(PATH / 'models/', max_epochs=3, checkpoint=True, learning_rate=1e-1)

classifier = TextClassifier.load('/content/drive/My Drive/emnlp/models/best-model.pt')

# Dev Set Prediction
dev_folder = ""                  # fill in the dataset paths; adjust these variables accordingly
dev_template_labels_file = ""
task_SLC_output_file = ""