def _train(self, corpus: Corpus, params: dict, base_path: Path, max_epochs: int, optimization_value: str):
    label_dict = corpus.make_label_dictionary()

    # clear cached embeddings so every evaluation run starts fresh
    for sent in corpus.get_all_sentences():
        sent.clear_embeddings()

    model = self._set_up_model(params, label_dict)

    # split the sampled params into train() kwargs and ModelTrainer kwargs
    training_parameters = {
        key: value for key, value in params.items() if key in TRAINING_PARAMETERS
    }
    model_trainer_parameters = {
        key: value for key, value in params.items()
        if key in MODEL_TRAINER_PARAMETERS and key != 'model'
    }

    trainer: ModelTrainer = ModelTrainer(model, corpus, **model_trainer_parameters)

    results = trainer.train(base_path,
                            max_epochs=max_epochs,
                            param_selection_mode=True,
                            **training_parameters)

    if optimization_value == "score":
        result = results['test_score']
    else:
        result = results['dev_loss_history'][-1]

    return {'result': result, 'params': params}
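# For context: TRAINING_PARAMETERS and MODEL_TRAINER_PARAMETERS are whitelists
# defined elsewhere in the module. A purely illustrative sketch of the shapes
# involved (the stand-in set contents and params values below are assumptions):
import torch

TRAINING_PARAMETERS = {"learning_rate", "mini_batch_size", "anneal_factor", "patience"}
MODEL_TRAINER_PARAMETERS = {"optimizer", "model"}

params = {
    "learning_rate": 0.1,          # routed to trainer.train(...)
    "mini_batch_size": 32,         # routed to trainer.train(...)
    "optimizer": torch.optim.SGD,  # routed to the ModelTrainer constructor
}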
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])

    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])
    label_dict = corpus.make_label_dictionary()

    assert len(label_dict) == 2
    assert '<unk>' not in label_dict.get_items()
    assert 'class_1' in label_dict.get_items()
    assert 'class_2' in label_dict.get_items()
def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero-shot learning model.

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    print("Zero shot")

    # 1. Load our pre-trained TARS model for English
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models", "tars-base.pt"))

    # 2. Wrap the tweets and labels as Flair sentence datasets
    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    # 3. Make the model aware of the desired set of labels from the new corpus
    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. Train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )
    print("DONE TRAINING")

    # 5. Reload the final model written to base_path and predict on the validation set
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')
    val_tweets["pred"] = val_tweets.apply(predict_few_shot, args=(tars,), axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1 if x == "positive" else -1)

    # 6. Write the predictions in the submission format
    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)
    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
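# create_sentences is defined elsewhere; a minimal sketch of what it
# presumably does, assuming the text lives in a 'tweet' column and the label
# in the 'output' column set above (column name and label strings are assumptions):
from flair.data import Sentence

def create_sentences(row):
    # map the numeric label to the string labels used by the task
    label = 'positive' if row['output'] == 1 else 'negative'
    return Sentence(row['tweet']).add_label('POSITIVE_NEGATIVE', label)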
def fit(self, corpus: Corpus, model_path: str):
    # create a TARS classifier for the task, with labels taken from the corpus
    self.model = TARSClassifier(
        task_name="ChemicalUnderstanding",
        label_dictionary=corpus.make_label_dictionary(),
    )
    trainer = ModelTrainer(self.model, corpus)
    trainer.train(
        base_path=model_path,
        learning_rate=0.02,
        mini_batch_size=16,
        mini_batch_chunk_size=4,
        max_epochs=10,
    )
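# A minimal, hypothetical call site for fit(), assuming a tiny hand-built
# corpus; the owning class name (ChemicalModel) and the label values are stand-ins:
from flair.data import Corpus, Sentence
from flair.datasets import SentenceDataset

train = SentenceDataset([
    Sentence('benzene is an aromatic hydrocarbon').add_label('class', 'aromatic'),
    Sentence('toluene contains a benzene ring').add_label('class', 'aromatic'),
    Sentence('hexane is a straight-chain alkane').add_label('class', 'aliphatic'),
    Sentence('propane is a simple alkane').add_label('class', 'aliphatic'),
])
corpus = Corpus(train=train)  # dev/test are sampled from train when missing

model = ChemicalModel()  # hypothetical owner of fit()
model.fit(corpus, model_path='resources/chemical_understanding')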
def train_classifier_model(self, corpus: Corpus,
                           document_embeddings: DocumentRNNEmbeddings,
                           model_params: dict = None):
    try:
        label_dict = corpus.make_label_dictionary()

        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

        # initialize the text classifier trainer
        trainer = ModelTrainer(classifier, corpus)

        # fall back to the global defaults when no parameters are given
        if model_params is None:
            learning_rate = gv.learning_rate
            mini_batch_size = gv.mini_batch_size
            anneal_factor = gv.anneal_factor
            patience = gv.patience
            max_epochs = gv.max_epochs
        else:
            learning_rate = model_params["learning_rate"]
            mini_batch_size = model_params["mini_batch_size"]
            anneal_factor = model_params["anneal_factor"]
            patience = model_params["patience"]
            max_epochs = model_params["max_epochs"]

        # start the training
        self.select_training_device()
        trainer.train(self.model_filepath,
                      learning_rate=learning_rate,
                      mini_batch_size=mini_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs)
    except Exception as e:
        gv.logger.error(e)
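# The expected shape of model_params follows from the keys read above; the
# values are illustrative, and clf is a hypothetical instance of the class
# owning train_classifier_model:
model_params = {
    "learning_rate": 0.1,
    "mini_batch_size": 32,
    "anneal_factor": 0.5,  # factor by which the learning rate is annealed
    "patience": 3,         # epochs without improvement before annealing
    "max_epochs": 20,
}
clf.train_classifier_model(corpus, document_embeddings, model_params)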
# (fragment) this snippet begins inside the few-shot sampling loop; the opening
# of the truncated condition is reconstructed from the decrement below
if tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] > 0:
    corpus_sents.append(sent)
    tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] -= 1
    sent_picked = True

print("sents for training: " + str(len(corpus_sents)))
print("amount of items in dict: " + str(len(tag_dictionary.item2idx)))

# build a training corpus from the sampled sentences, keeping the original
# dev and test splits
training_dataset = SentenceDataset(corpus_sents)
training_corpus = Corpus(train=training_dataset,
                         dev=corpus_small.dev,
                         test=corpus_small.test,
                         sample_missing_splits=False)

trainer = ModelTrainer(tagger, training_corpus, optimizer=torch.optim.AdamW)

tag_dictionary = training_corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("fewshot-moviecomplex-simple-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)

trainer.train(
    base_path='resources/v3/fewshot-moviecomplex-simple-to-conll3-k' + str(k),
    learning_rate=5.0e-5,
    mini_batch_size=32,
    mini_batch_chunk_size=None,
    max_epochs=10,
    weight_decay=0.,
    embeddings_storage_mode="none",
    scheduler=OneCycleLR,
)

# evaluation
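# The evaluation step is not shown; a plausible continuation using Flair's
# evaluate() (0.9-style signature; the result's field names vary across releases):
result = tagger.evaluate(
    training_corpus.test,
    gold_label_type=tag_type,
    mini_batch_size=32,
)
print(result.detailed_results)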
# (fragment) the opening of the training SentenceDataset was truncated; these
# are its remaining labelled examples
    Sentence('We had a 1 hour call with Denise').add_label('contact_type', 'call'),
    Sentence('Had a quick call to discuss the offer').add_label('contact_type', 'call'),
    Sentence('I was on skype with Paul all day').add_label('contact_type', 'call'),
    Sentence('I have set up a meeting tomorrow').add_label('contact_type', 'meeting'),
    Sentence('I emailed the latest report').add_label('contact_type', 'email'),
    Sentence('I sent an email to Mark').add_label('contact_type', 'email'),
    Sentence('I will email those files to you').add_label('contact_type', 'email'),
    Sentence('phoned Jeremy').add_label('contact_type', 'call'),
    Sentence('emailed Jeremy').add_label('contact_type', 'email'),
    Sentence('called Jeremy').add_label('contact_type', 'call'),
    Sentence('meet with Jeremy').add_label('contact_type', 'meeting'),
    Sentence('meeting w Deborah').add_label('contact_type', 'meeting'),
    Sentence('the client scheduled a meeting at their office').add_label('contact_type', 'meeting'),
    Sentence('had a meeting with client and agreed to continue over email').add_label('contact_type', 'meeting'),
    Sentence('had a call with client and to arrange a meeting tomorrow').add_label('contact_type', 'call'),
])

# make a corpus with train and test splits
corpus = Corpus(train=train, test=test)

# load the base TARS model and register the new labelling task
tars = TARSClassifier.load('tars-base')
tars.add_and_switch_to_new_task("contact_type",
                                label_dictionary=corpus.make_label_dictionary())

trainer = ModelTrainer(tars, corpus)
trainer.train(base_path='../pretrained/contact_type_model/',
              learning_rate=0.02,
              mini_batch_size=1,
              max_epochs=15,
              train_with_dev=True,
              )
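# Once training finishes, the fine-tuned model can be reloaded for inference.
# A minimal sketch, assuming Flair's default 'final-model.pt' artifact under
# the base_path used above (the example sentence is illustrative):
from flair.data import Sentence
from flair.models.text_classification_model import TARSClassifier

tars = TARSClassifier.load('../pretrained/contact_type_model/final-model.pt')

sentence = Sentence('jumped on a quick skype with the supplier')
tars.predict(sentence)
print(sentence.labels)  # expected to contain one of: call, email, meeting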
class FlairTARS(Classifier):
    '''Flair TARS few-shot training.

    It makes use of the meaningful category labels found in classes.txt.
    Slow: the base pretrained model (tars-base) is too heavy for our needs.
    Embeddings: 30522 x 768, 24 layers, 12 heads, 110M params.
    Flair works on top of PyTorch.
    In principle this classifier should be able to perform very well with
    ~3 samples per class (see the TARS paper).
    '''

    pretrained_model_name = 'tars-base'

    def prepare_resources(self):
        # turn off INFO and DEBUG logging
        import flair  # KEEP THIS IMPORT HERE! (it initialises the 'flair' logger)
        import logging
        logger = logging.getLogger('flair')
        logger.setLevel(logging.WARNING)
        if self.seed:
            flair.set_seed(self.seed)

    def train(self):
        from flair.data import Corpus
        from flair.datasets import SentenceDataset
        from flair.data import Sentence

        self.classes = utils.read_class_titles(settings.CAT_DEPTH)
        self.classes['NOCAT'] = 'NOCAT'

        train = SentenceDataset([
            Sentence(row['titlen']).add_label('law_topic', self.classes[row['cat1']])
            for i, row in self.df_train.iterrows()
        ])

        # make a corpus with train and dev splits
        self.corpus = Corpus(train=train, dev=train)

        # 1. load base TARS
        tars = self._load_pretrained_model()

        # 2. make the model aware of the desired set of labels from the new corpus
        tars.add_and_switch_to_new_task(
            "LAW_TOPIC",
            label_dictionary=self.corpus.make_label_dictionary())

        # 3. initialize the text classifier trainer with your corpus
        from flair.trainers import ModelTrainer
        trainer = ModelTrainer(tars, self.corpus)

        # 4. train model
        path = settings.WORKING_DIR
        trainer.train(
            base_path=path,  # path to store the model artifacts
            learning_rate=5e-2,  # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
            mini_batch_size=settings.MINIBATCH,
            # mini_batch_chunk_size=1,
            mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
            max_epochs=settings.EPOCHS,
            train_with_dev=False,
            save_final_model=False,
            param_selection_mode=True,  # True to avoid model saves
            shuffle=False,  # already done
        )

        # from flair.models.text_classification_model import TARSClassifier
        # self.model = TARSClassifier.load(
        #     os.path.join(path, 'best-model.pt')
        # )
        self.model = tars

    def predict(self, string):
        from flair.data import Sentence

        # prepare a test sentence
        sentence = Sentence(string)

        ret = ['NOCAT', 1.0]

        # predict among the classes registered for the task
        self.model.predict(sentence)
        if len(sentence.labels):
            label = sentence.labels[0]
            ret = [self.classes.get_key_from_val(label.value), label.score]

        return str(ret[0]), ret[1]

    def _predict_zero(self, string):
        '''Abandoned; 0-shot predictions were too poor.'''
        from flair.data import Sentence

        # prepare a test sentence
        sentence = Sentence(string)

        # default to the last class with full confidence
        ret = [len(self.classes) - 1, 1.0]

        # predict zero-shot over the descriptive class names
        self._get_tars().predict_zero_shot(sentence, self.classes)
        if len(sentence.labels):
            label = sentence.labels[0]
            ret = [self.classes.get_key_from_val(label.value), label.score]

        return str(ret[0]), ret[1]

    def get_internal_dimension(self):
        # return self.model.document_embeddings.embedding_length
        return None

    def _load_pretrained_model(self):
        from flair.models.text_classification_model import TARSClassifier
        # load our pre-trained TARS model for English.
        # Note that this must be reloaded before each training, as it is
        # modified during training.
        return TARSClassifier.load(self.get_pretrained_model_name())
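# How this class is driven depends on the surrounding Classifier base class,
# which is not shown; a hypothetical end-to-end use (constructor arguments and
# the example title are assumptions):
clf = FlairTARS()          # base-class construction not shown
clf.prepare_resources()    # silence Flair logging; set the seed if one is set
clf.train()                # few-shot fine-tuning on self.df_train
cat, score = clf.predict('An Act to amend the law relating to fisheries')
print(cat, score)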
# (fragment) the opening of the training SentenceDataset was truncated; these
# are its remaining labelled examples
    Sentence('How much do data engineers make').add_label(label_name, money),
    Sentence('Canadian tech salaries').add_label(label_name, money),
    Sentence('Vulnerability found in a popular crypto wallet').add_label(label_name, crypto),
    Sentence('Ethereum smart contracts are useless').add_label(label_name, crypto),
])

# make a corpus with train and test split
corpus = Corpus(train=train, test=test)

from flair.trainers import ModelTrainer

# 2. make the model aware of the desired set of labels from the new corpus
tars.add_and_switch_to_new_task(
    "HN_MONEYTALK",
    label_dictionary=corpus.make_label_dictionary())

# 3. initialize the text classifier trainer with your corpus
trainer = ModelTrainer(tars, corpus)

# 4. train model
trainer.train(
    base_path='resources/taggers/hn_moneytalk',  # path to store the model artifacts
    learning_rate=0.02,  # use very small learning rate
    mini_batch_size=1,   # small mini-batch size since corpus is tiny
    max_epochs=10,       # terminate after 10 epochs
    train_with_dev=True,
)

# 1. Load few-shot TARS model
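# The trailing comment above marks where the fine-tuned model would be
# reloaded; a plausible continuation, assuming Flair's default
# 'final-model.pt' artifact under the base_path used for training:
from flair.data import Sentence
from flair.models.text_classification_model import TARSClassifier

tars = TARSClassifier.load('resources/taggers/hn_moneytalk/final-model.pt')

sentence = Sentence('Bitcoin hits a new all-time high')
tars.predict(sentence)
print(sentence.labels)  # expected to carry the money/crypto label with a score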