def _build_word_embeddings(method):
    """Return the list of token embeddings selected by ``method``.

    Embedding objects are only constructed for the requested method, so
    resources belonging to the other options are never loaded.

    Args:
        method: One of ``'glove'``, ``'flair'``, ``'cui_svd'``,
            ``'cui_proj'``, ``'mimic'``, ``'cui2vec'`` or ``'mimic_lm'``.

    Returns:
        A list of embedding objects to hand to ``DocumentLSTMEmbeddings``.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    if method == 'glove':
        return [WordEmbeddings('glove')]
    if method == 'flair':
        return [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]
    if method == 'cui_svd':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy')),
        ]
    if method == 'cui_proj':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim')),
        ]
    if method == 'mimic':
        return [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim'),
        ]
    if method == 'cui2vec':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                )),
        ]
    if method == 'mimic_lm':
        return [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt'),
        ]
    # ValueError is still an Exception, so existing broad handlers keep working.
    raise ValueError(
        "Received option for method %s that cannot be interpreted." %
        (method))


def main(args):
    """Train and evaluate a document classifier with k-fold cross-validation.

    The data file is read as a text-classification corpus, split into
    ``args.num_folds`` contiguous folds, and for every fold a fresh
    ``TextClassifier`` is trained in a temporary directory. Within each
    fold the first 90% of the non-test sentences are used for training and
    the remaining 10% as the dev set. Per-fold and mean test scores are
    printed.

    Args:
        args: NOTE(review): this value is currently ignored -- it is
            overwritten by ``parser.parse_args()``, which reads the process
            command line. Kept as-is because the caller's contract is not
            visible from here. The parsed namespace must provide
            ``method``, ``data_file`` and ``num_folds``.

    Raises:
        ValueError: If ``args.method`` is not a recognised embedding option.
    """
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    word_embeddings = _build_word_embeddings(args.method)

    multi = 'bg' in args.data_file
    if multi:
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))

    # 1. get the corpus
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    num_folds = args.num_folds
    # No random_state here: without shuffle=True a seed has no effect, and
    # recent scikit-learn versions reject that combination outright. The
    # folds stay contiguous and deterministic, as before.
    kf = KFold(n_splits=num_folds)

    # 4. iterate over folds:
    fold_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(corpus.train),
                                                     start=1):
        # 4a. build this fold's train/dev/test corpus
        # Plain list indexing keeps the sentence objects away from NumPy,
        # which may otherwise try to unpack sequence-like items into a
        # multi-dimensional array.
        split_traindev = [corpus.train[i] for i in train_index]
        train_dev_splitpoint = int(0.9 * len(split_traindev))
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]

        split_test = [corpus.train[i] for i in test_index]
        split_corpus = TaggedCorpus(split_train,
                                    dev=split_dev,
                                    test=split_test)

        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training; model files are discarded once the fold finishes
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)

            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)

        fold_acc = results['test_score']
        fold_scores.append(fold_acc)
        print(f"Finished fold {fold} with accuracy {fold_acc}")

    # KFold yields exactly num_folds splits, so this is the mean fold score.
    total_acc = sum(fold_scores) / num_folds

    print("Finished with total cross-fold accuracy of %f" % (total_acc))
# Example #2
# 0
from typing import List

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

# 1. read the pre-processed train / dev / test files (in that order) and
#    bundle them into a single corpus
sentences_train: List[Sentence]
sentences_dev: List[Sentence]
sentences_test: List[Sentence]
sentences_train, sentences_dev, sentences_test = (
    NLPTaskDataFetcher.read_text_classification_file(split_file)
    for split_file in (
        'training.preprocessed.txt',
        'dev.preprocessed.txt',
        'test.preprocessed.txt',
    )
)
corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. derive the label dictionary from the corpus
label_dict = corpus.make_label_dictionary()

# 3. assemble the word-level embeddings: one WordEmbeddings model followed by
#    the forward and backward CharLMEmbeddings models
word_embeddings = [WordEmbeddings('de-fasttext')]
word_embeddings.extend(
    CharLMEmbeddings(lm_name)
    for lm_name in ('german-forward', 'german-backward')
)

# 4. wrap the word embeddings in an LSTM-based document embedding
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_states=32,
)

# 5. build the single-label text classifier on top of the document embedding
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    multi_label=False,
)

# 6. set up the trainer for this classifier and corpus
trainer = TextClassifierTrainer(classifier, corpus, label_dict)