def train(self,
          base_path: str,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          max_epochs: int = 50,
          anneal_factor: float = 0.5,
          patience: int = 5,
          train_with_dev: bool = False,
          embeddings_in_memory: bool = False,
          checkpoint: bool = False,
          save_final_model: bool = True,
          anneal_with_restarts: bool = False,
          eval_on_train: bool = True):
    """
    Trains a text classification model using the training data of the corpus.

    Runs SGD with a ReduceLROnPlateau schedule, appends one row per epoch to
    ``loss.tsv`` in ``base_path``, and finally evaluates on the test split
    (using ``best-model.pt`` when that file exists). Training stops early
    once the learning rate drops below 0.001. A KeyboardInterrupt (Ctrl + C)
    ends training and saves the current model as ``final-model.pt``.

    :param base_path: the directory to which any results should be written to
    :param learning_rate: the learning rate
    :param mini_batch_size: the mini batch size
    :param max_epochs: the maximum number of epochs to train
    :param anneal_factor: learning rate will be decreased by this factor
    :param patience: number of 'bad' epochs before learning rate gets decreased
    :param train_with_dev: boolean indicating, if the dev data set should be used for training or not
    :param embeddings_in_memory: boolean indicating, if embeddings should be kept in memory or not
    :param checkpoint: boolean indicating, whether the model should be saved after every epoch or not
    :param save_final_model: boolean indicating, whether the final model should be saved or not
    :param anneal_with_restarts: boolean indicating, whether the best model should be reloaded once the learning rate
    changed or not
    :param eval_on_train: boolean value indicating, if evaluation metrics should be calculated on training data set
    or not
    """
    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write(
            'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
            .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                    Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    # anneal on the (minimised) train loss when there is no held-out dev set,
    # otherwise on the (maximised) dev f-score
    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler: ReduceLROnPlateau = ReduceLROnPlateau(
        optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode)

    # Work on a private copy: extending or shuffling the corpus' own list
    # would permanently alter it (e.g. dev data would be appended again on
    # every further call with train_with_dev=True).
    train_data = list(self.corpus.train)
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    best_model_path = base_path + "/best-model.pt"
    final_model_path = base_path + "/final-model.pt"

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        previous_learning_rate = learning_rate

        for epoch in range(max_epochs):
            log.info('-' * 100)

            bad_epochs = scheduler.num_bad_epochs
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts and \
                    os.path.exists(best_model_path):
                log.info('Resetting to best model ...')
                # load_from_file returns a new model (see its use after the
                # training loop). Copy its weights into the existing model
                # instead of rebinding self.model, because the optimizer
                # holds references to the current model's parameters.
                self.model.load_state_dict(
                    self.model.load_from_file(best_model_path).state_dict())

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.001:
                log.info('Learning rate too small - quitting training!')
                break

            if not self.test_mode:
                random.shuffle(train_data)

            self.model.train()

            batches = [
                train_data[start:start + mini_batch_size]
                for start in range(0, len(train_data), mini_batch_size)
            ]

            current_loss: float = 0
            seen_sentences = 0
            # log progress roughly ten times per epoch
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                scores = self.model.forward(batch)
                loss = self.model.calculate_loss(scores, batch)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                seen_sentences += len(batch)
                current_loss += loss.item()

                clear_embeddings(
                    batch,
                    also_clear_word_embeddings=not embeddings_in_memory)

                if batch_no % modulo == 0:
                    log.info(
                        "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                            epoch + 1, batch_no, len(batches),
                            current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(
                        self.model.state_dict(), iteration)

            current_loss /= len(train_data)

            self.model.eval()

            # if checkpoint is enabled, save model at each epoch
            if checkpoint:
                self.model.save(base_path + "/checkpoint.pt")

            log.info('-' * 100)
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, bad_epochs))

            dev_metric = train_metric = None
            dev_loss = '_'

            train_loss = current_loss
            if eval_on_train:
                train_metric, train_loss = self._calculate_evaluation_results_for(
                    'TRAIN', self.corpus.train, embeddings_in_memory,
                    mini_batch_size)

            if not train_with_dev:
                dev_metric, dev_loss = self._calculate_evaluation_results_for(
                    'DEV', self.corpus.dev, embeddings_in_memory,
                    mini_batch_size)

            with open(loss_txt, 'a') as f:
                if train_metric is not None:
                    train_metric_str = train_metric.to_tsv()
                else:
                    train_metric_str = Metric.to_empty_tsv()
                if dev_metric is not None:
                    dev_metric_str = dev_metric.to_tsv()
                else:
                    dev_metric_str = Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), train_loss,
                    train_metric_str, dev_loss, dev_metric_str, '_',
                    Metric.to_empty_tsv()))

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                # The dev score is only needed (and only available) here.
                # Reading train_metric in the train_with_dev case crashed
                # when eval_on_train was False.
                dev_score = dev_metric.f_score()
                scheduler.step(dev_score)

                # remember best model based on dev evaluation score
                if dev_score == scheduler.best:
                    self.model.save(best_model_path)

        if save_final_model:
            self.model.save(final_model_path)

        log.info('-' * 100)
        log.info('Testing using best model ...')

        self.model.eval()

        if os.path.exists(best_model_path):
            self.model = TextClassifier.load_from_file(best_model_path)

        test_metric, _ = self.evaluate(
            self.corpus.test,
            mini_batch_size=mini_batch_size,
            eval_class_metrics=True,
            embeddings_in_memory=embeddings_in_memory,
            metric_name='TEST')

        test_metric.print()
        self.model.train()

        log.info('-' * 100)

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        # Use the model's own save() so final-model.pt has the same format
        # as on the normal path above.
        self.model.save(final_model_path)
        log.info('Done.')
def train(self,
          base_path: str,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          max_epochs: int = 100,
          anneal_factor: float = 0.5,
          patience: int = 4,
          train_with_dev: bool = False,
          embeddings_in_memory: bool = True,
          checkpoint: bool = False,
          save_final_model: bool = True,
          anneal_with_restarts: bool = False,
          ):
    """
    Trains the tagging model using the training data of the corpus.

    Runs SGD with a ReduceLROnPlateau schedule, evaluates on the dev split
    (unless it is used for training) and on the test split after every
    epoch, and appends one row per epoch to ``loss.tsv`` in ``base_path``.
    Accuracy is used as evaluation method for the tag types 'pos' and
    'upos', F1 otherwise. Training stops early once the learning rate drops
    below 0.001. A KeyboardInterrupt (Ctrl + C) ends training and saves the
    current model as ``final-model.pt``.

    :param base_path: the directory to which any results should be written to
    :param learning_rate: the initial learning rate
    :param mini_batch_size: the mini batch size
    :param max_epochs: the maximum number of epochs to train
    :param anneal_factor: learning rate will be decreased by this factor
    :param patience: number of 'bad' epochs before learning rate gets decreased
    :param train_with_dev: boolean indicating, if the dev data set should be used for training or not
    :param embeddings_in_memory: boolean indicating, if embeddings should be kept in memory or not
    :param checkpoint: boolean indicating, whether the model should be saved after every epoch or not
    :param save_final_model: boolean indicating, whether the final model should be saved or not
    :param anneal_with_restarts: boolean indicating, whether the best model should be reloaded once the learning rate
    changed or not
    """
    evaluation_method = 'F1'
    if self.model.tag_type in ['pos', 'upos']:
        evaluation_method = 'accuracy'
    log.info('Evaluation method: {}'.format(evaluation_method))

    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write(
            'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
            .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                    Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    # annealing scheduler: minimise train loss when there is no held-out dev
    # set, otherwise maximise the dev score
    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler = ReduceLROnPlateau(optimizer, factor=anneal_factor,
                                  patience=patience, mode=anneal_mode,
                                  verbose=True)

    # Work on a private copy: extending or shuffling the corpus' own list
    # would permanently alter it (e.g. dev data would be appended again on
    # every further call with train_with_dev=True).
    train_data = list(self.corpus.train)
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    best_model_path = base_path + "/best-model.pt"
    final_model_path = base_path + "/final-model.pt"
    metric_line = "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        previous_learning_rate = learning_rate

        for epoch in range(0, max_epochs):
            log.info('-' * 100)

            bad_epochs = scheduler.num_bad_epochs
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts and \
                    os.path.exists(best_model_path):
                log.info('resetting to best model')
                restored = self.model.load_from_file(best_model_path)
                # NOTE(review): load_from_file is not visible here. If it
                # returns a freshly loaded model (rather than loading in
                # place), its weights must be copied into the live model,
                # whose parameters the optimizer references - confirm.
                if restored is not None:
                    self.model.load_state_dict(restored.state_dict())

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.001:
                log.info('learning rate too small - quitting training!')
                break

            if not self.test_mode:
                random.shuffle(train_data)

            batches = [
                train_data[start:start + mini_batch_size]
                for start in range(0, len(train_data), mini_batch_size)
            ]

            self.model.train()

            current_loss: float = 0
            seen_sentences = 0
            # log progress roughly ten times per epoch
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                optimizer.zero_grad()

                # compute the loss, gradients, and update the parameters
                loss = self.model.neg_log_likelihood(batch)

                current_loss += loss.item()
                seen_sentences += len(batch)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                if not embeddings_in_memory:
                    self.clear_embeddings_in_batch(batch)

                if batch_no % modulo == 0:
                    log.info(
                        "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                            epoch + 1, batch_no, len(batches),
                            current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(
                        self.model.state_dict(), iteration)

            current_loss /= len(train_data)

            # switch to eval mode
            self.model.eval()

            # if checkpointing is enabled, save model at each epoch
            if checkpoint:
                self.model.save(base_path + "/checkpoint.pt")

            log.info('-' * 100)

            dev_score = dev_metric = None
            if not train_with_dev:
                dev_score, dev_metric = self.evaluate(
                    self.corpus.dev, base_path,
                    evaluation_method=evaluation_method,
                    embeddings_in_memory=embeddings_in_memory)

            _, test_metric = self.evaluate(
                self.corpus.test, base_path,
                evaluation_method=evaluation_method,
                embeddings_in_memory=embeddings_in_memory)

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                scheduler.step(dev_score)

            # logging info
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, bad_epochs))
            if not train_with_dev:
                log.info(metric_line.format(
                    'DEV', dev_metric.f_score(), dev_metric.accuracy(),
                    dev_metric._tp, dev_metric._fp, dev_metric._fn,
                    dev_metric._tn))
            log.info(metric_line.format(
                'TEST', test_metric.f_score(), test_metric.accuracy(),
                test_metric._tp, test_metric._fp, test_metric._fn,
                test_metric._tn))

            with open(loss_txt, 'a') as f:
                if dev_metric is not None:
                    dev_metric_str = dev_metric.to_tsv()
                else:
                    dev_metric_str = Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), '_',
                    Metric.to_empty_tsv(), '_', dev_metric_str, '_',
                    test_metric.to_tsv()))

            # if we use dev data, remember best model based on dev evaluation score
            if not train_with_dev and dev_score == scheduler.best:
                self.model.save(best_model_path)

        # save the model as it is after the last epoch, if requested
        if save_final_model:
            self.model.save(final_model_path)

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        self.model.save(final_model_path)
        log.info('Done.')
from __future__ import absolute_import