from adaptnlp import EasyDocumentEmbeddings, SequenceClassifierTrainer
from flair.datasets import TREC_6


def test_sequence_classifier_trainer():
    corpus = TREC_6()

    # Instantiate the AdaptNLP easy document embeddings module, which can take in a
    # variable number of embeddings to make `Stacked Embeddings`.
    # You may also use custom Transformers LM models by specifying the path to the language model
    doc_embeddings = EasyDocumentEmbeddings("bert-base-cased", methods=["rnn"])

    # Instantiate the Sequence Classifier Trainer by loading in the data,
    # the data column map, and the embeddings as an encoder
    trainer = SequenceClassifierTrainer(
        corpus=corpus,
        encoder=doc_embeddings,
        column_name_map={0: "text", 1: "label"},
    )
    assert trainer is not None
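# Continuing from the trainer instantiated above, training is typically started
# with the trainer's train() method. A minimal sketch, assuming the keyword names
# (output_dir, learning_rate, mini_batch_size, max_epochs) match older AdaptNLP
# releases; check the signature of the version you have installed.
trainer.train(
    output_dir="model_output",
    learning_rate=0.01,
    mini_batch_size=32,
    max_epochs=5,
)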
def fine_tune(self):
    if isinstance(self.document_embedding, TransformerDocumentEmbeddings):
        corpus = TREC_6()
        label_dict = corpus.make_label_dictionary()
        classifier = TextClassifier(self.document_embedding, label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # 6. start the training
        trainer.train(
            'resources/taggers/trec',
            learning_rate=3e-5,       # use very small learning rate
            mini_batch_size=16,
            mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
            max_epochs=5,             # terminate after 5 epochs
        )
    else:
        raise UserWarning("No fine-tuning implemented for this embedding type")
from flair.data import Corpus, Sentence
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 1. get the corpus
corpus: Corpus = TREC_6()

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [
    WordEmbeddings('glove'),
    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
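# 6. initialize the text classifier trainer and start training.
# A minimal sketch of the remaining step; the output base path and the
# hyperparameter values below are illustrative assumptions, not part of the
# snippet above.
trainer = ModelTrainer(classifier, corpus)
trainer.train(
    'resources/taggers/trec6-classifier',  # assumed output directory
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
)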
from FlairParamOptimizer import search_strategies, search_spaces, orchestrator
import FlairParamOptimizer.parameter_listings.parameters_for_user_input as param
from flair.datasets import TREC_6
from torch.optim import SGD, Adam

# 1.) Define your corpus
corpus = TREC_6()

# 2.) Create a search space and a search strategy
search_space = search_spaces.TextClassifierSearchSpace()
search_strategy = search_strategies.GridSearch()

# 3.) Depending on your task, add the respective parameters you want to optimize over
search_space.add_budget(param.Budget.TIME_IN_H, 24)
search_space.add_evaluation_metric(param.EvaluationMetric.MICRO_F1_SCORE)
search_space.add_optimization_value(param.OptimizationValue.DEV_SCORE)
search_space.add_max_epochs_per_training_run(15)

# Depending on your downstream task, add embeddings and specify these with the respective parameters below
search_space.add_parameter(param.ModelTrainer.LEARNING_RATE, options=[0.01, 0.05, 0.1])
search_space.add_parameter(param.ModelTrainer.MINI_BATCH_SIZE, options=[16, 32, 64])
search_space.add_parameter(param.ModelTrainer.ANNEAL_FACTOR, options=[0.25, 0.5])
search_space.add_parameter(param.ModelTrainer.OPTIMIZER, options=[SGD, Adam])
search_space.add_parameter(param.Optimizer.WEIGHT_DECAY, options=[1e-2, 0])

# Define parameters for the document RNN embeddings
search_space.add_parameter(param.DocumentRNNEmbeddings.HIDDEN_SIZE, options=[128, 256, 512])
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence
from flair.data import MultiCorpus
from flair.datasets import TREC_6
from flair.models import TARSClassifier
from flair.trainers import ModelTrainer

# 1. define label names in natural language since some datasets come with cryptic sets of labels
label_name_map = {
    'ENTY': 'question about entity',
    'DESC': 'question about description',
    'ABBR': 'question about abbreviation',
    'HUM': 'question about person',
    'NUM': 'question about number',
    'LOC': 'question about location',
}

corpus = TREC_6(label_name_map=label_name_map)
print(corpus)

corpus = corpus.downsample(0.1)
print(corpus)

label_dictionary = corpus.make_label_dictionary()
print(label_dictionary)

tagger = TARSClassifier(label_dictionary=label_dictionary, label_type="label", task_name="TEST_CLASS")

trainer = ModelTrainer(tagger, corpus)
trainer.train(
    base_path='resources/taggers/tars',
    learning_rate=0.01,
    mini_batch_size=16,
    mini_batch_chunk_size=4,
)
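# After training, the TARS classifier above can be used for prediction.
# A minimal sketch; the example sentence is illustrative and not part of the
# original snippet.
sentence = Sentence("How many states are in the US ?")
tagger.predict(sentence)
print(sentence)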