def _build_word_embeddings(method):
    """Return the list of token embeddings selected by ``method``.

    Args:
        method: One of ``'glove'``, ``'flair'``, ``'cui_svd'``, ``'cui_proj'``,
            ``'mimic'``, ``'cui2vec'`` or ``'mimic_lm'``.

    Returns:
        A list of embedding objects to hand to the document embedding.

    Raises:
        ValueError: If ``method`` is not one of the options above.
    """
    if method == 'glove':
        return [WordEmbeddings('glove')]
    if method == 'flair':
        return [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]
    if method == 'cui_svd':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings('resources/embeddings/cui2vec100.npy')),
        ]
    if method == 'cui_proj':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_projected_100-100.gensim')),
        ]
    if method == 'mimic':
        return [
            WordEmbeddings(
                'resources/embeddings/mimic3_mixed_embeddings100.gensim'),
        ]
    if method == 'cui2vec':
        return [
            BackOffEmbeddings(
                WordEmbeddings('glove'),
                WordEmbeddings(
                    'resources/embeddings/cui2vec_combined_glove_100dim.gensim'
                )),
        ]
    if method == 'mimic_lm':
        return [
            WordEmbeddings('glove'),
            FlairEmbeddings('resources/taggers/mimic-forward/best-lm.pt'),
            FlairEmbeddings('resources/taggers/mimic-backward/best-lm.pt'),
        ]
    raise ValueError(
        "Received option for method %s that cannot be interpreted." %
        (method))


def main(args):
    """Cross-validate a text classifier over the sentences in one data file.

    The data file is read into a single corpus, split into ``num_folds``
    folds, and a fresh classifier is trained for every fold. Within a fold
    the first 90% of the non-test sentences are used for training and the
    remaining 10% as the dev set. The per-fold ``test_score`` values are
    averaged and printed.

    Args:
        args: Ignored; the options actually used are read with
            ``parser.parse_args()`` (``method``, ``data_file``,
            ``num_folds``).

    Raises:
        ValueError: If ``args.method`` names an unknown embedding option.
    """
    # NOTE(review): the incoming ``args`` is overwritten here, exactly as in
    # the original code; callers are not visible, so this is left unchanged.
    args = parser.parse_args()

    # 0. Make a list of word embeddings
    word_embeddings = _build_word_embeddings(args.method)

    multi = 'bg' in args.data_file
    if multi:
        print(
            "Running in multiple label setting because 'bg' was in the data file name %s"
            % (args.data_file))

    # 1. get the corpus
    sents: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        args.data_file)
    corpus = TaggedCorpus(sents, None, None)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. split the training data into folds
    num_folds = args.num_folds
    seed = 719
    # A seed only takes effect when shuffling is enabled; recent scikit-learn
    # releases reject ``random_state`` combined with ``shuffle=False``.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    # Materialise the sentences once and select by plain list indexing rather
    # than converting the sentence list to a NumPy array on every fold.
    all_sentences = list(corpus.train)

    # 4. iterate over folds:
    total_acc = 0
    for fold, (train_index, test_index) in enumerate(
            kf.split(all_sentences), start=1):
        # 4a. build the train/dev/test corpus for this fold
        split_traindev = [all_sentences[i] for i in train_index]
        traindev_size = len(split_traindev)
        train_dev_splitpoint = int(0.9 * traindev_size)
        split_train = split_traindev[:train_dev_splitpoint]
        split_dev = split_traindev[train_dev_splitpoint:]
        split_test = [all_sentences[i] for i in test_index]
        split_corpus = TaggedCorpus(split_train,
                                    dev=split_dev,
                                    test=split_test)
        print("After split, size of splits: train=%d, dev=%d, test=%d" %
              (len(split_train), len(split_dev), len(split_test)))

        # 4b. do training; the model files are written to a temporary
        # directory that is removed when the fold finishes
        with tempfile.TemporaryDirectory() as model_dir:
            # init document embedding by passing list of word embeddings
            document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
                word_embeddings,
                hidden_size=128,
                reproject_words=True,
                reproject_words_dimension=64,
            )
            classifier = TextClassifier(document_embeddings,
                                        label_dictionary=label_dict,
                                        multi_label=multi)
            trainer = ModelTrainer(classifier, split_corpus)
            results = trainer.train(model_dir,
                                    embeddings_in_memory=False,
                                    learning_rate=0.1,
                                    mini_batch_size=128,
                                    anneal_factor=0.5,
                                    patience=5,
                                    max_epochs=100)

        fold_acc = results['test_score']
        total_acc += fold_acc
        print(f"Finished fold {fold} with accuracy {fold_acc}")

    total_acc /= num_folds
    print("Finished with total cross-fold accuracy of %f" % (total_acc))
from typing import List
from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

# 1. read the train / dev / test files (in that order) and bundle them
#    into a single corpus
sentences_train: List[Sentence]
sentences_dev: List[Sentence]
sentences_test: List[Sentence]
sentences_train, sentences_dev, sentences_test = (
    NLPTaskDataFetcher.read_text_classification_file(data_path)
    for data_path in (
        'training.preprocessed.txt',
        'dev.preprocessed.txt',
        'test.preprocessed.txt',
    )
)
corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary from the corpus
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings: one word-level embedding followed by
#    the forward and backward character language-model embeddings
word_embeddings = [WordEmbeddings('de-fasttext')] + [
    CharLMEmbeddings(lm_name)
    for lm_name in ('german-forward', 'german-backward')
]

# 4. init document embedding by passing the list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_states=32,
)

# 5. create the (single-label) text classifier
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    multi_label=False,
)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)