Example 1
def train_model(data_dir, max_epochs):
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Loading and creating embeddings for the text data...')
    word_embeddings = [
        WordEmbeddings('glove'),
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256)
    st.write('Done')

    st.write('Preparing classifier and trainer...')
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('model-saves',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=8,
                  max_epochs=max_epochs,
                  checkpoint=True)
    st.write('Model Training Finished!')
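For reference, an excerpt like this omits its import block; the names it relies on would come from roughly the following modules (paths as in Flair 0.8-era releases, with Streamlit imported as st; treat this as a sketch, not part of the original):

import streamlit as st
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer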
Example 2
def test_init_tars_and_switch(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb")

    # create a TARS classifier
    tars = TARSClassifier(task_name='2_CLASS',
                          label_dictionary=corpus.make_label_dictionary(label_type='class'),
                          label_type='class')

    # check if right number of classes
    assert (len(tars.get_current_label_dictionary()) == 2)

    # switch to task with only one label
    tars.add_and_switch_to_new_task('1_CLASS', 'one class', "testlabel")

    # check if right number of classes
    assert (len(tars.get_current_label_dictionary()) == 1)

    # switch to task with three labels provided as list
    tars.add_and_switch_to_new_task('3_CLASS', ['list 1', 'list 2', 'list 3'], "testlabel")

    # check if right number of classes
    assert (len(tars.get_current_label_dictionary()) == 3)

    # switch to task with four labels provided as set
    tars.add_and_switch_to_new_task('4_CLASS', {'set 1', 'set 2', 'set 3', 'set 4'}, "testlabel")

    # check if right number of classes
    assert (len(tars.get_current_label_dictionary()) == 4)

    # switch to task with two labels provided as Dictionary
    tars.add_and_switch_to_new_task('2_CLASS_AGAIN', corpus.make_label_dictionary(label_type='class'), "testlabel")

    # check if right number of classes
    assert (len(tars.get_current_label_dictionary()) == 2)
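A usage note, not part of the original test: once several tasks have been registered, a TARS model can switch back to any of them by its registered name.

# minimal sketch, reusing the tars object from the test above
tars.switch_to_task('2_CLASS')
assert len(tars.get_current_label_dictionary()) == 2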
Example 3
    def train(self,
              learning_rate: float = 0.1,
              mini_batch_size: int = 16,
              anneal_factor: float = 0.5,
              patience: int = 5,
              max_epochs: int = 10):
        """

        :return:
        """
        self.make_corpus()
        corpus = ClassificationCorpus(self.output_data_path,
                                      train_file='train.txt',
                                      dev_file='dev.txt',
                                      test_file='test.txt')

        label_dictionary = corpus.make_label_dictionary()

        embeddings = [WordEmbeddings('glove')]
        document_pool = DocumentPoolEmbeddings(embeddings)
        classifier = TextClassifier(document_pool,
                                    label_dictionary=label_dictionary)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(
            self.model_path,
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            anneal_factor=anneal_factor,
            patience=patience,
            max_epochs=max_epochs,
        )
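ClassificationCorpus expects FastText-formatted files: one document per line, prefixed with __label__<class>. The train.txt, dev.txt, and test.txt files referenced above would therefore look roughly like this (content is illustrative):

__label__positive the film was a delight from start to finish
__label__negative two hours I will never get back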
Example 4
def run_splits(word_embeddings, embeddings_name):
    for i in range(1, 6):
        print('##########')
        print('Split', str(i))
        print('##########')

        data_folder = '<path_to_splits>/split_' + str(i) + '/'
        corpus = ClassificationCorpus(data_folder,
                                      test_file='test.csv',
                                      dev_file='dev.csv',
                                      train_file='train.csv')

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + embeddings_name, max_epochs=150)
Example 5
def train():
    corpus: Corpus = ClassificationCorpus(sst_folder,
                                          test_file='test.csv',
                                          dev_file='dev.csv',
                                          train_file='sst_dev.csv')

    label_dict = corpus.make_label_dictionary()
    stacked_embedding = WordEmbeddings('glove')

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(model_path, max_epochs=10, train_with_dev=False)
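Once training finishes, the stored model can be loaded back for inference; a minimal sketch, assuming model_path is the same directory passed to trainer.train above:

from flair.data import Sentence

classifier = TextClassifier.load(f'{model_path}/final-model.pt')
sentence = Sentence('A gorgeous, witty, seductive movie.')
classifier.predict(sentence)
print(sentence.labels)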
Example 6
def test_train_tars(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # create a TARS classifier
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # switch to a new task (TARS can do multiple tasks so you must define one)
    tars.add_and_switch_to_new_task(task_name="question 2_CLASS",
                                    label_dictionary=corpus.make_label_dictionary(label_type='class'),
                                    label_type='class',
                                    )

    # initialize the text classifier trainer
    trainer = ModelTrainer(tars, corpus)

    # start the training
    trainer.train(base_path='resources/taggers/trec',  # path to store the model artifacts
                  learning_rate=0.02,  # use very small learning rate
                  mini_batch_size=1,
                  # mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
                  max_epochs=1,  # terminate after 1 epoch
                  )

    sentence = Sentence("This is great!")
    tars.predict(sentence)
Example 7
def test_train_tars(tasks_base_path, results_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # create a TARS classifier
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # switch to a new task (TARS can do multiple tasks so you must define one)
    tars.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=corpus.make_label_dictionary(label_type="class"),
        label_type="class",
    )

    # initialize the text classifier trainer
    trainer = ModelTrainer(tars, corpus)

    # start the training
    trainer.train(
        base_path=results_base_path,
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=1,
    )

    sentence = Sentence("This is great!")
    tars.predict(sentence)
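TARS models can also be queried zero-shot, against label names they were never trained on; a minimal sketch using Flair's predict_zero_shot (the candidate labels here are illustrative):

sentence = Sentence('I am so glad you called.')
tars.predict_zero_shot(sentence, ['happy', 'sad'])
print(sentence.labels)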
Example 8
def predict(args):
    """Predict."""
    model = TextClassifier.load(os.path.join(args.model_dir, args.model_file))
    logger.info(f'Model: "{model}"')

    if args.one_per_line:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
        )
    else:
        assert args.label_symbol is not None
        corpus: Corpus = FlyClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )

    logger.info("Saving to %s", args.output_file)

    start_time = time.time()
    with io.open(args.output_file, "w", encoding="utf-8", errors="ignore") as fout:
        for sentence in corpus.test:
            model.predict(sentence)
            if sentence.labels:
                top = sentence.labels[0]
                fout.write(f"{top.value} {top.score:.4f}\n")
                fout.flush()

    logger.info("End of prediction: time %.1f min",
                (time.time() - start_time) / 60)
Example 9
def test_text_classifier_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings(
            "distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example 10
def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings],
                                                   fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  mini_batch_size=2,
                  max_epochs=50,
                  shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")

    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example 11
def train(args):
    """Train."""
    start_time = time.time()
    if args.one_per_line:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
        )
    else:
        assert args.label_symbol is not None
        corpus: Corpus = FlyClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )

    label_dict = corpus.make_label_dictionary()
    vocab = corpus.make_vocab_dictionary().get_items()
    embeddings = utils.init_embeddings(vocab, args)

    document_embeddings = DocumentRNNEmbeddings(
        [embeddings],
        hidden_size=args.hidden_size,
        use_attn=args.use_attn,
        num_heads=args.num_heads,
        scaling=args.scaling,
        pooling_operation=args.pooling_operation,
        use_sent_query=args.use_sent_query,
    )

    model = TextClassifier(document_embeddings, label_dictionary=label_dict)

    utils.init_model(model, args)

    trainer: ModelTrainer = ModelTrainer(model, corpus,
                                         utils.optim_method(args.optim))

    trainer.train(
        args.model_dir,
        mini_batch_size=args.mini_batch_size,
        max_epochs=args.max_epochs,
        anneal_factor=args.anneal_factor,
        learning_rate=args.learning_rate,
        patience=args.patience,
        min_learning_rate=args.min_learning_rate,
        embeddings_storage_mode=args.embeddings_storage_mode,
    )

    logger.info("End of training: time %.1f min",
                (time.time() - start_time) / 60)
Example 12
def load_corpus():
    label_dictionary: Dictionary = Dictionary(add_unk=False)
    label_dictionary.multi_label = False

    label_dictionary.add_item('0')
    label_dictionary.add_item('1')

    # this is the folder in which train, test and dev files reside
    data_folder = 'datasets/constrained_classification/k16'

    # load corpus containing training, test and dev data
    corpus: Corpus = ClassificationCorpus(data_folder,
                                          dev_file='fasttext.valid',
                                          train_file='fasttext.train')

    return corpus, label_dictionary
Example 13
def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):

    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir,
                                  train_file=train,
                                  dev_file=dev,
                                  test_file=test,
                                  in_memory=False)

    label_dict = corpus.make_label_dictionary()

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')

    optional_embedding = ELMoEmbeddings('original')

    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    #
    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)
Example 14
    def __init__(self,
                 path: Union[Path, str],
                 column_name_map: dict = None,
                 corpus: Corpus = None,
                 **corpus_params):
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()

        self.path = path
        if corpus:
            self.corpus = corpus
        else:
            if column_name_map:
                self.corpus = CSVClassificationCorpus(self.path,
                                                      column_name_map,
                                                      **corpus_params)
            else:
                self.corpus = ClassificationCorpus(self.path, **corpus_params)
        self.sentences = self.corpus.get_all_sentences()
        print(self.corpus)
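When a column_name_map is supplied, the wrapper above builds a CSVClassificationCorpus, which reads delimited files instead of FastText-style lines; a minimal sketch of such a map, following the pattern in Flair's documentation (folder name and column indices are illustrative):

# text lives in column 4 and the label in column 1 of a headered TSV
column_name_map = {4: 'text', 1: 'label_topic'}
corpus = CSVClassificationCorpus('data_folder', column_name_map,
                                 skip_header=True, delimiter='\t')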
Example 15
    def _train_model(self):
        # type: () -> None
        corpus = ClassificationCorpus(
            Path(__path_to_base__),
            test_file=os.path.basename(self.path_to_test),
            dev_file=os.path.basename(self.path_to_dev),
            train_file=os.path.basename(self.path_to_train))
        word_embeddings = [
            ELMoEmbeddings('original'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(__path_to_base__, max_epochs=10)
Example 16
def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    flair_embeddings = FlairEmbeddings("news-forward-fast")

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([flair_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=2, max_epochs=1, shuffle=True)

    del model
    train_log_file = results_base_path / "training.log"
    assert train_log_file.exists()
    lines = train_log_file.read_text(encoding="utf-8").split("\n")
    expected_substrings = [
        "Device: ",
        "Corpus: ",
        "Parameters:",
        "- learning_rate: ",
        "- patience: ",
        "Embeddings storage mode:",
        "epoch 1 - iter",
        "EPOCH 1 done: loss",
        "Results:",
    ]
    for expected_substring in expected_substrings:
        assert any(expected_substring in line for line in lines), expected_substring
Example 17
def test_init_tars_and_switch(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb")

    # create a TARS classifier
    tars = TARSClassifier(
        task_name="2_CLASS",
        label_dictionary=corpus.make_label_dictionary(label_type="class"),
        label_type="class",
    )

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 2

    # switch to task with only one label
    tars.add_and_switch_to_new_task("1_CLASS", "one class", "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 1

    # switch to task with three labels provided as list
    tars.add_and_switch_to_new_task("3_CLASS", ["list 1", "list 2", "list 3"], "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 3

    # switch to task with four labels provided as set
    tars.add_and_switch_to_new_task("4_CLASS", {"set 1", "set 2", "set 3", "set 4"}, "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 4

    # switch to task with two labels provided as Dictionary
    tars.add_and_switch_to_new_task("2_CLASS_AGAIN", corpus.make_label_dictionary(label_type="class"), "testlabel")

    # check if right number of classes
    assert len(tars.get_current_label_dictionary()) == 2
Example 18
from pathlib import Path

from hyperopt import hp
from flair.datasets import ClassificationCorpus
from flair.embeddings import BertEmbeddings
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.hyperparameter.param_selection import (
    TextClassifierParamSelector,
    OptimizationValue,
)

if __name__ == "__main__":
    data_folder = Path("..", "classification", "data", "downsampled", "flair")
    for c in ["dramen", "romane", "zeitung", "wikipedia"]:
        test_file = f"{c}-downsampled-test-flair.txt"
        dev_file = f"{c}-downsampled-val-flair.txt"
        train_file = f"{c}-downsampled-train-flair.txt"

        corpus = ClassificationCorpus(data_folder,
                                      test_file=test_file,
                                      dev_file=dev_file,
                                      train_file=train_file)

        label_dict = corpus.make_label_dictionary()

        search_space = SearchSpace()
        search_space.add(
            Parameter.EMBEDDINGS,
            hp.choice,
            options=[[BertEmbeddings("bert-base-german-cased")]],
        )
        search_space.add(Parameter.HIDDEN_SIZE,
                         hp.choice,
                         options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
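        # The excerpt stops before launching the search; with Flair's
        # hyperparameter API it would continue roughly as follows
        # (base_path, max_epochs, and max_evals are illustrative assumptions)
        param_selector = TextClassifierParamSelector(
            corpus,
            False,             # multi_label
            f'hyperopt-{c}',   # base_path for result logs
            'lstm',            # document embedding type
            max_epochs=10,
            training_runs=3,
            optimization_value=OptimizationValue.DEV_SCORE,
        )
        param_selector.optimize(search_space, max_evals=100)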
Example 19
def train(
    review_category,
    params,
    update_model=False,
    learning_rate=0.01,
    embeddings_storage_mode='gpu',
    checkpoint=True,
    batch_growth_annealing=True,
    weight_decay=1e-4,
    shuffle=True,
    train_with_dev=True,
    mini_batch_size=2,
    maxi_batch_size=128,
    anneal_factor=0.5,
    patience=2,
    max_epochs=150,
):
    review_category = str(review_category)
    print('loading training corpus from %s' % params.data_folder)
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                                          train_file=review_category + '_train.txt',
                                          test_file=review_category + '_test.txt',
                                          dev_file=review_category + '_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ', label_dict)
    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...'%(params.transformer_pretrain_lm))
        # 3. initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings = [
            WordEmbeddings('glove'),
            # character embeddings
            CharacterEmbeddings(),
            # flair embeddings
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            # RoBERTaEmbeddings(),
            # XLNetEmbeddings()
        ]
        # Can choose between many RNN types (GRU by default; change via the rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # continue training from a saved checkpoint
        checkpoint_path = params.checkpoint_dir + '/%s/checkpoint.pt' % review_category
        print('loading checkpoint from %s' % checkpoint_path)
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)
    ####### training the model
    print("training the review_category: %s model ..." % review_category)
    try:
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs)
    except Exception:
        # fall back to gradient accumulation if the full mini-batch does not fit in memory
        print('chunking batch ... by %d' % params.mini_batch_chunk_size)
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      mini_batch_chunk_size=params.mini_batch_chunk_size)
Example 20
labelDf['Label'] = '__label__' + labelDf['Label'].astype(str)

labelDf = labelDf.sample(frac=1)
labelDf.iloc[0:int(len(labelDf) * 0.8)].to_csv('data/train.csv',
                                               sep='\t',
                                               index=False,
                                               header=False)
labelDf.iloc[int(len(labelDf) * 0.8):int(len(labelDf) * 0.9)].to_csv(
    'data/test.csv', sep='\t', index=False, header=False)
labelDf.iloc[int(len(labelDf) * 0.9):].to_csv('data/dev.csv',
                                              sep='\t',
                                              index=False,
                                              header=False)

corpus = ClassificationCorpus(Path('data/'),
                              test_file='test.csv',
                              dev_file='dev.csv',
                              train_file='train.csv')
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]
document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            hidden_size=512,
                                            reproject_words=True,
                                            reproject_words_dimension=256)
classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)
trainer.find_learning_rate('model/', 'learning_rate.tsv')
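find_learning_rate performs a learning-rate sweep and writes the losses to the given TSV instead of training a final model; the curve can then be inspected with Flair's Plotter. A minimal sketch, assuming the older flair.visual API used elsewhere in these examples:

from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_learning_rate('model/learning_rate.tsv')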
Example 21
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
Example 22
def train_classifier(pre_trained_model,
                     layer,
                     lr,
                     batch_size,
                     pooling_sub_token,
                     epochs,
                     hidden_size,
                     word_level=False,
                     task='text_classification'):
    # corpus = NLPTaskDataFetcher.load_classification_corpus(data_folder='label_embs_2/', test_file='test.csv', train_file='train.csv', dev_file='dev.csv')
    if not word_level:
        document_embeddings = TransformerDocumentEmbeddings(pre_trained_model,
                                                            fine_tune=True)
    else:
        token_embeddings = TransformerWordEmbeddings(
            pre_trained_model,
            layers=layer,
            pooling_operation=pooling_sub_token,
            fine_tune=True)

    #text classification
    if task == 'text_classification':
        corpus: Corpus = ClassificationCorpus(data_folder=dataset_folder,
                                              test_file='test.txt',
                                              dev_file='dev.txt',
                                              train_file='train.txt')
        label_dict = corpus.make_label_dictionary()
        # text classification needs the document-level embeddings built above;
        # the word-level branch applies only to sequence labelling
        classifier = TextClassifier(document_embeddings=document_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)
        # trainer = ModelTrainer(model=classifier, corpus=corpus, optimizer=SGD)
    #sequence labelling
    elif task == 'sequence_labelling':
        columns = {0: 'text', 1: 'tag'}
        corpus: Corpus = ColumnCorpus(dataset_folder,
                                      columns,
                                      train_file='train.txt',
                                      test_file='test.txt',
                                      dev_file='dev.txt')
        token_tag_dictionary = corpus.make_tag_dictionary(tag_type=columns[1])
        embedding_types = [
            TransformerWordEmbeddings(pre_trained_model,
                                      layers=layer,
                                      pooling_operation=pooling_sub_token,
                                      fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        classifier: SequenceTagger = SequenceTagger(
            hidden_size=hidden_size,
            embeddings=embeddings,
            tag_dictionary=token_tag_dictionary,
            tag_type=columns[1],
            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(model=classifier,
                                         corpus=corpus,
                                         optimizer=SGD)
    trainer.train(dest_folder + '/{}-output'.format(task),
                  learning_rate=lr,
                  mini_batch_size=batch_size,
                  max_epochs=epochs)
Example 23
label_mapping = {'propaganda': 1, 'non-propaganda': 0}  # label encoding
df['labels'] = df['labels'].apply(lambda x: label_mapping[x])

df = pd.concat([df, df[df['labels'] == 1].copy()], axis=0).reset_index(drop=True)  # oversampling

# convert to flair-readable format
data = df[['labels', 'sentence']].rename(columns={'labels': "label", 'sentence': "text"})
data['label'] = '__label__' + data['label'].astype(str)

# train-test split
data.iloc[0:int(len(data) * 0.8)].to_csv(PATH / 'flair/train.csv', sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.8):].to_csv(PATH / 'flair/test.csv', sep='\t', index=False, header=False)


corpus = ClassificationCorpus(Path('/content/drive/My Drive/emnlp/flair/'),
                              test_file='test.csv',
                              dev_file='test.csv',
                              train_file='train.csv')

print(corpus.obtain_statistics())


## use any pretrained stacked embedding from the FLAIR Framework
# embedding = RoBERTaEmbeddings()
# embedding = BertEmbeddings('bert-base-uncased')
embedding = ELMoEmbeddings('small')

#stack them with other embeddings
word_embeddings = [
    embedding,
    # FlairEmbeddings('news-forward', use_cache=True),
    # FlairEmbeddings('news-backward', use_cache=True),
]
Example 24
test_data = utils.mgdb.read_mongo('raw_data_test')
test_data.to_csv(path.join(data_folder, 'test.txt'),
                 sep=' ',
                 index=False,
                 header=False,
                 columns=['label', 'text'])

dev_data = utils.mgdb.read_mongo('raw_data_dev')
dev_data.to_csv(path.join(data_folder, 'dev.txt'),
                sep=' ',
                index=False,
                header=False,
                columns=['label', 'text'])

#%%
corpus: Corpus = ClassificationCorpus('data/splitted_data')
if len(corpus.train) == 0 or len(corpus.test) == 0:
    raise Exception('Creating corpus failed')

#%%
word_embeddings = [WordEmbeddings('glove')]

document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

label_dict = corpus.make_label_dictionary()
Example 25
                                               articles_train,
                                               train=True)
    tokens_valid, labels_valid = return_annotated_articles(
        params["dev_labels_file"], articles_valid, train=True)
    tokens_test, labels_test = return_annotated_articles(
        params["test_labels_file"], articles_test, train=True)

    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'train.txt',
                               tokens, labels)
    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'dev.txt',
                               tokens_valid, labels_valid)
    write_to_file_flair_corpus(params['data_bert_format_dir'] + 'test.txt',
                               tokens_test, labels_test)
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus = ClassificationCorpus(params['data_bert_format_dir'],
                                  train_file='train.txt',
                                  test_file='test.txt',
                                  dev_file='dev.txt')

    corpus.filter_empty_sentences()
    print(corpus)

    label_dictionary = corpus.make_label_dictionary()

    print(label_dictionary)

    flat_labels = [item for sublist in labels for item in sublist]
    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(flat_labels),
                                         y=flat_labels)
    unique_labels = np.unique(flat_labels)
    weights = {}
    for i in range(len(unique_labels)):
Example 26
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

data_folder = 'content_folder'
corpus: Corpus = ClassificationCorpus(data_folder)

label_dict = corpus.make_label_dictionary()
word_embeddings = [WordEmbeddings('glove')]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)
trainer.train('/content/data',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)
Example 27
        dev_data['preprocessed'] = tweet_preprocessing.preprocess_data(
            dev_data['content'], 'embedding')
        if B_TEST_PHASE is True:
            test_data['preprocessed'] = tweet_preprocessing.preprocess_data(
                test_data['content'], 'embedding')

        utils.csv2ftx(train_data.content, train_data.sentiment, S_DATASET,
                      'train', 'flair')
        utils.csv2ftx(dev_data.content, dev_data.sentiment, S_DATASET, 'dev',
                      'flair')
        utils.csv2ftx(test_data.content, test_data.sentiment, S_DATASET,
                      'test', 'flair')

        corpus: Corpus = ClassificationCorpus(
            '../dataset/flair/',
            train_file='intertass_{}_train.txt'.format(S_DATASET),
            dev_file='intertass_{}_dev.txt'.format(S_DATASET),
            test_file='intertass_{}_test.txt'.format(S_DATASET))

        # class_weights = compute_class_weight('balanced', [0, 1, 2, 3], y=train_data.sentiment)
        # dict_weights = dict()
        # for i, label in enumerate(class_weights):
        #     dict_weights.update({str(label): class_weights[i]})

        # word_embeddings = [BertEmbeddings('bert-base-multilingual-cased')]
        word_embeddings = [
            BertEmbeddings('dccuchile/bert-base-spanish-wwm-cased')
        ]

        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,