def __init__(
    self,
    corpus: Corpus,
    base_path: Union[str, Path],
    max_epochs: int,
    evaluation_metric: EvaluationMetric,
    training_runs: int,
    optimization_value: OptimizationValue,
):
    if type(base_path) is str:
        base_path = Path(base_path)
    self.corpus = corpus
    self.max_epochs = max_epochs
    self.base_path = base_path
    self.evaluation_metric = evaluation_metric
    self.run = 1
    self.training_runs = training_runs
    self.optimization_value = optimization_value
    self.param_selection_file = init_output_file(base_path, 'param_selection.txt')
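# The constructor above depends on an `init_output_file` helper. A minimal
# sketch of what such a utility presumably does, assuming it only has to
# create the base directory and start a fresh, empty file (the real flair
# helper may behave differently):
from pathlib import Path


def init_output_file_sketch(base_path: Path, file_name: str) -> Path:
    """Create base_path if needed and return the path of a fresh, empty file."""
    base_path = Path(base_path)
    base_path.mkdir(parents=True, exist_ok=True)
    output_file = base_path / file_name
    output_file.open("w", encoding="utf-8").close()  # truncate / create the file
    return output_file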
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, eval_mini_batch_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, min_learning_rate: float = 0.0001, train_with_dev: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 6, sampler=None, **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved :param learning_rate: Initial learning rate :param mini_batch_size: Size of mini-batches during training :param eval_mini_batch_size: Size of mini-batches during evaluation :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param train_with_dev: If True, training is performed using both train+dev data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing parameter selection. :param num_workers: Number of workers in your data loader. :param sampler: You can pass a data sampler here for special sampling of data. :param kwargs: Other arguments for the Optimizer :return: """ if self.use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" 
) log_line(log) self.use_tensorboard = False pass if eval_mini_batch_size is None: eval_mini_batch_size = mini_batch_size # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_handler = add_file_handler(log, base_path / "training.log") log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = (True if (not param_selection_mode and self.corpus.test and monitor_test) else False) log_dev = True if not train_with_dev else False # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") weight_extractor = WeightExtractor(base_path) optimizer: torch.optim.Optimizer = self.optimizer( self.model.parameters(), lr=learning_rate, **kwargs) if self.optimizer_state is not None: optimizer.load_state_dict(self.optimizer_state) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" scheduler: ReduceLROnPlateau = ReduceLROnPlateau( optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) if self.scheduler_state is not None: scheduler.load_state_dict(self.scheduler_state) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) if sampler is not None: sampler = sampler(train_data) shuffle = False dev_score_history = [] dev_loss_history = [] train_loss_history = [] # At any point you can hit Ctrl + C to break out of training early. 
try: previous_learning_rate = learning_rate for epoch in range(0 + self.epoch, max_epochs + self.epoch): log_line(log) # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] # reload last best model if annealing with restarts is enabled if (learning_rate != previous_learning_rate and anneal_with_restarts and (base_path / "best-model.pt").exists()): log.info("resetting to best model") self.model.load(base_path / "best-model.pt") previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < min_learning_rate: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) self.model.train() train_loss: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches for batch_no, batch in enumerate(batch_loader): loss = self.model.forward_loss(batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode) if batch_no % modulo == 0: log.info( f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss " f"{train_loss / seen_batches:.8f}") iteration = epoch * total_number_of_batches + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f}" ) if self.use_tensorboard: writer.add_scalar("train_loss", train_loss, epoch + 1) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss # evaluate on train / dev / test split depending on training settings result_line: str = "" if log_train: train_eval_result, train_loss = self.model.evaluate( DataLoader( self.corpus.train, batch_size=eval_mini_batch_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, ) result_line += f"\t{train_eval_result.log_line}" # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode) if log_dev: dev_eval_result, dev_loss = self.model.evaluate( DataLoader( self.corpus.dev, batch_size=eval_mini_batch_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, ) result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}" log.info( f"DEV : loss {dev_loss} - score {dev_eval_result.main_score}" ) # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss) current_score = dev_eval_result.main_score # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("dev_loss", dev_loss, epoch + 1) writer.add_scalar("dev_score", dev_eval_result.main_score, epoch + 1) if log_test: test_eval_result, test_loss = self.model.evaluate( DataLoader( self.corpus.test, batch_size=eval_mini_batch_size, num_workers=num_workers, ), base_path / "test.tsv", embeddings_storage_mode=embeddings_storage_mode, ) result_line += 
f"\t{test_loss}\t{test_eval_result.log_line}" log.info( f"TEST : loss {test_loss} - score {test_eval_result.main_score}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("test_loss", test_loss, epoch + 1) writer.add_scalar("test_score", test_eval_result.main_score, epoch + 1) # determine learning rate annealing through scheduler scheduler.step(current_score) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group["lr"] if new_learning_rate != previous_learning_rate: bad_epochs = patience + 1 # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") # output log file with open(loss_txt, "a") as f: # make headers on first epoch if epoch == 0: f.write( f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS" ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join( train_eval_result.log_header.split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join( dev_eval_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join( test_eval_result.log_header.split("\t"))) f.write( f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if checkpoint is enable, save model at each epoch if checkpoint and not param_selection_mode: self.model.save_checkpoint( base_path / "checkpoint.pt", optimizer.state_dict(), scheduler.state_dict(), epoch + 1, train_loss, ) # if we use dev data, remember best model based on dev evaluation score if (not train_with_dev and not param_selection_mode and current_score == scheduler.best): self.model.save(base_path / "best-model.pt") # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if self.use_tensorboard: writer.close() if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present if self.corpus.test: final_score = self.final_test(base_path, eval_mini_batch_size, num_workers) else: final_score = 0 log.info("Test data not provided setting final score to 0") log.removeHandler(log_handler) if self.use_tensorboard: writer.close() return { "test_score": final_score, "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
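# train() above returns a plain dict with the final test score and the
# per-epoch dev/train histories. A small sketch of how a caller might pick
# the best epoch from that dict; the `result` argument is hypothetical and
# stands for the return value of trainer.train(...):
def summarize_training_result(result: dict) -> dict:
    dev_scores = result.get("dev_score_history", [])
    best_epoch = (
        max(range(len(dev_scores)), key=dev_scores.__getitem__) if dev_scores else None
    )
    return {
        "test_score": result.get("test_score"),
        # epochs are logged 1-based in the training log above
        "best_dev_epoch": None if best_epoch is None else best_epoch + 1,
        "best_dev_score": None if best_epoch is None else dev_scores[best_epoch],
        "final_train_loss": result["train_loss_history"][-1]
        if result.get("train_loss_history")
        else None,
    }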
def find_learning_rate(self,
                       base_path: Union[Path, str],
                       file_name: str = 'learning_rate.tsv',
                       start_learning_rate: float = 1e-7,
                       end_learning_rate: float = 10,
                       iterations: int = 100,
                       mini_batch_size: int = 32,
                       stop_early: bool = True,
                       smoothing_factor: float = 0.98,
                       **kwargs) -> Path:
    best_loss = None
    moving_avg_loss = 0

    # cast string to Path
    if type(base_path) is str:
        base_path = Path(base_path)
    learning_rate_tsv = init_output_file(base_path, file_name)

    with open(learning_rate_tsv, 'a') as f:
        f.write('ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n')

    optimizer = self.optimizer(self.model.parameters(), lr=start_learning_rate, **kwargs)

    train_data = self.corpus.train
    random.shuffle(train_data)
    batches = [
        train_data[x:x + mini_batch_size]
        for x in range(0, len(train_data), mini_batch_size)
    ][:iterations]

    scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

    model_state = self.model.state_dict()
    model_device = next(self.model.parameters()).device
    self.model.train()

    for itr, batch in enumerate(batches):
        loss = self.model.forward_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
        optimizer.step()
        scheduler.step()

        learning_rate = scheduler.get_lr()[0]
        loss_item = loss.item()
        if itr == 0:
            best_loss = loss_item
        else:
            if smoothing_factor > 0:
                moving_avg_loss = smoothing_factor * moving_avg_loss + (
                    1 - smoothing_factor) * loss_item
                loss_item = moving_avg_loss / (1 - smoothing_factor ** (itr + 1))
            if loss_item < best_loss:
                # keep the best (smoothed) loss as a float for the divergence check
                best_loss = loss_item

        if stop_early and (loss_item > 4 * best_loss or torch.isnan(loss)):
            log_line(log)
            log.info('loss diverged - stopping early!')
            break

        with open(learning_rate_tsv, 'a') as f:
            f.write(
                f'{itr}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate}\t{loss_item}\n'
            )

    # restore the original weights and device
    self.model.load_state_dict(model_state)
    self.model.to(model_device)

    log_line(log)
    log.info(f'learning rate finder finished - plot {learning_rate_tsv}')
    log_line(log)

    return Path(learning_rate_tsv)
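# The learning-rate finder above combines two pieces of arithmetic: an
# exponential ramp of the learning rate (what ExpAnnealLR presumably does)
# and a bias-corrected exponential moving average of the loss. A standalone
# sketch of both, assuming the ramp is geometric from start_lr to end_lr
# over `iterations` steps; the actual ExpAnnealLR class may differ in detail:
def exp_anneal_lr(step: int, start_lr: float, end_lr: float, iterations: int) -> float:
    """Learning rate after `step` steps of a geometric ramp from start_lr to end_lr."""
    return start_lr * (end_lr / start_lr) ** (step / iterations)


def smoothed_loss(raw_losses, smoothing_factor: float = 0.98):
    """Bias-corrected exponential moving average, as used for the stopping check above."""
    moving_avg, smoothed = 0.0, []
    for i, loss in enumerate(raw_losses):
        moving_avg = smoothing_factor * moving_avg + (1 - smoothing_factor) * loss
        smoothed.append(moving_avg / (1 - smoothing_factor ** (i + 1)))
    return smoothed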
def train(self, base_path: Union[Path, str], evaluation_metric: EvaluationMetric = EvaluationMetric. MICRO_F1_SCORE, learning_rate: float = 0.1, mini_batch_size: int = 32, eval_mini_batch_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, anneal_against_train_loss: bool = True, train_with_dev: bool = False, monitor_train: bool = False, embeddings_in_memory: bool = True, checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, test_mode: bool = False, param_selection_mode: bool = False, **kwargs) -> dict: if eval_mini_batch_size is None: eval_mini_batch_size = mini_batch_size # cast string to Path if type(base_path) is str: base_path = Path(base_path) add_file_handler(log, base_path / 'training.log') log_line(log) log.info(f'Evaluation method: {evaluation_metric.name}') if not param_selection_mode: loss_txt = init_output_file(base_path, 'loss.tsv') with open(loss_txt, 'a') as f: f.write( f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}\tDEV_LOSS\t{Metric.tsv_header("DEV")}' f'\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n') weight_extractor = WeightExtractor(base_path) optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if self.optimizer_state is not None: optimizer.load_state_dict(self.optimizer_state) # annealing scheduler anneal_mode = 'min' if anneal_against_train_loss else 'max' if isinstance(optimizer, (AdamW, SGDW)): scheduler = ReduceLRWDOnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True) else: scheduler = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True) if self.scheduler_state is not None: scheduler.load_state_dict(self.scheduler_state) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) dev_score_history = [] dev_loss_history = [] train_loss_history = [] # At any point you can hit Ctrl + C to break out of training early. 
try: previous_learning_rate = learning_rate for epoch in range(0 + self.epoch, max_epochs + self.epoch): log_line(log) try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: learning_rate = group['lr'] # reload last best model if annealing with restarts is enabled if learning_rate != previous_learning_rate and anneal_with_restarts and \ (base_path / 'best-model.pt').exists(): log.info('resetting to best model') self.model.load_from_file(base_path / 'best-model.pt') previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < 0.0001: log_line(log) log.info('learning rate too small - quitting training!') log_line(log) break if not test_mode: random.shuffle(train_data) batches = [ train_data[x:x + mini_batch_size] for x in range(0, len(train_data), mini_batch_size) ] self.model.train() train_loss: float = 0 seen_sentences = 0 modulo = max(1, int(len(batches) / 10)) for batch_no, batch in enumerate(batches): loss = self.model.forward_loss(batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_sentences += len(batch) train_loss += loss.item() clear_embeddings( batch, also_clear_word_embeddings=not embeddings_in_memory) if batch_no % modulo == 0: log.info( f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss ' f'{train_loss / seen_sentences:.8f}') iteration = epoch * len(batches) + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= len(train_data) self.model.eval() log_line(log) log.info( f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}' ) dev_metric = None dev_loss = '_' train_metric = None if monitor_train: train_metric, train_loss = self._calculate_evaluation_results_for( 'TRAIN', self.corpus.train, evaluation_metric, embeddings_in_memory, eval_mini_batch_size) if not train_with_dev: dev_metric, dev_loss = self._calculate_evaluation_results_for( 'DEV', self.corpus.dev, evaluation_metric, embeddings_in_memory, eval_mini_batch_size) if not param_selection_mode: test_metric, test_loss = self._calculate_evaluation_results_for( 'TEST', self.corpus.test, evaluation_metric, embeddings_in_memory, eval_mini_batch_size, base_path / 'test.tsv') if not param_selection_mode: with open(loss_txt, 'a') as f: train_metric_str = train_metric.to_tsv( ) if train_metric is not None else Metric.to_empty_tsv( ) dev_metric_str = dev_metric.to_tsv( ) if dev_metric is not None else Metric.to_empty_tsv() test_metric_str = test_metric.to_tsv( ) if test_metric is not None else Metric.to_empty_tsv( ) f.write( f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t' f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n' ) # calculate scores using dev data if available dev_score = 0. 
if not train_with_dev: if evaluation_metric == EvaluationMetric.MACRO_ACCURACY: dev_score = dev_metric.macro_avg_accuracy() elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY: dev_score = dev_metric.micro_avg_accuracy() elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE: dev_score = dev_metric.macro_avg_f_score() else: dev_score = dev_metric.micro_avg_f_score() # append dev score to score history dev_score_history.append(dev_score) dev_loss_history.append(dev_loss.item()) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss if anneal_against_train_loss else dev_score scheduler.step(current_score) train_loss_history.append(train_loss) # if checkpoint is enable, save model at each epoch if checkpoint and not param_selection_mode: self.model.save_checkpoint(base_path / 'checkpoint.pt', optimizer.state_dict(), scheduler.state_dict(), epoch + 1, train_loss) # if we use dev data, remember best model based on dev evaluation score if not train_with_dev and not param_selection_mode and current_score == scheduler.best: self.model.save(base_path / 'best-model.pt') # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / 'final-model.pt') except KeyboardInterrupt: log_line(log) log.info('Exiting from training early.') if not param_selection_mode: log.info('Saving model ...') self.model.save(base_path / 'final-model.pt') log.info('Done.') # test best model on test data final_score = self.final_test(base_path, embeddings_in_memory, evaluation_metric, eval_mini_batch_size) return { 'test_score': final_score, 'dev_score_history': dev_score_history, 'train_loss_history': train_loss_history, 'dev_loss_history': dev_loss_history }
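# The dev score above is chosen by an if/elif chain over EvaluationMetric.
# A compact, table-driven equivalent of that chain; `metric` is assumed to
# expose the same four methods used above, and EvaluationMetric is the enum
# already imported in this module:
def select_dev_score(metric, evaluation_metric: EvaluationMetric) -> float:
    dispatch = {
        EvaluationMetric.MACRO_ACCURACY: metric.macro_avg_accuracy,
        EvaluationMetric.MICRO_ACCURACY: metric.micro_avg_accuracy,
        EvaluationMetric.MACRO_F1_SCORE: metric.macro_avg_f_score,
    }
    # anything else falls back to micro-averaged F1, matching the else branch above
    return dispatch.get(evaluation_metric, metric.micro_avg_f_score)()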
def train( self, base_path: Union[Path, str], evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE, learning_rate: float = 0.1, mini_batch_size: int = 32, eval_mini_batch_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, train_with_dev: bool = False, monitor_train: bool = False, embeddings_in_memory: bool = True, checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 8, valid_with_misspellings: bool = True, **kwargs, ) -> dict: if eval_mini_batch_size is None: eval_mini_batch_size = mini_batch_size # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_handler = add_file_handler(log, base_path / "training.log") log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log.info(f' - valid_with_misspellings: "{valid_with_misspellings}"') log.info("Model:") log.info(f' - hidden_size: "{self.model.hidden_size}"') log.info(f' - train_mode: "{self.model.train_mode}"') log.info(f' - alpha: "{self.model.alpha}"') log.info(f' - misspell_mode: "{self.model.misspell_mode}"') log.info(f' - misspelling_rate: "{self.model.misspelling_rate_train}"') log.info(f' - cmx_file: "{self.model.cmx_file_train}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Evaluation method: {evaluation_metric.name}") # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = True if (not param_selection_mode and self.corpus.test) else False log_dev = True if not train_with_dev else False log_test = not log_dev eval_misspelling_rate = 0.05 log_suffix = lambda prefix, rate, cm, mode: f"{prefix} (misspell: cmx={cm})" if mode == MisspellingMode.ConfusionMatrixBased else f"{prefix} (misspell: rate={rate})" loss_txt = init_output_file(base_path, "loss.tsv") with open(loss_txt, "a") as f: f.write(f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS") dummy_result, _ = self.model.evaluate( [Sentence("d", labels=["0.1"])], eval_mini_batch_size, embeddings_in_memory, ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join(dummy_result.log_header.split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dummy_result.log_header.split("\t"))) if valid_with_misspellings: suffix = log_suffix('DEV', eval_misspelling_rate, self.model.cmx_file_train, self.model.misspell_mode) f.write(f"\t{suffix}" + f"_LOSS\t{suffix})_" + f"\t{suffix}_".join( dummy_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(dummy_result.log_header.split("\t"))) if valid_with_misspellings: suffix = log_suffix('TEST', eval_misspelling_rate, self.model.cmx_file_train, self.model.misspell_mode) f.write(f"\t{suffix}" + f"_LOSS\t{suffix})_" + f"\t{suffix}_".join( dummy_result.log_header.split("\t"))) weight_extractor = WeightExtractor(base_path) optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if self.optimizer_state is not None: optimizer.load_state_dict(self.optimizer_state) # minimize training loss if 
training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" if isinstance(optimizer, (AdamW, SGDW)): scheduler = ReduceLRWDOnPlateau( optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) else: scheduler = ReduceLROnPlateau( optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) if self.scheduler_state is not None: scheduler.load_state_dict(self.scheduler_state) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) dev_clean_score_history = [] dev_noisy_score_history = [] dev_clean_loss_history = [] dev_noisy_loss_history = [] train_loss_history = [] complete_data = ConcatDataset( [self.corpus.train, self.corpus.dev, self.corpus.test]) char_vocab = make_char_vocab(complete_data) log.info( f"Vocabulary of the corpus (#{len(char_vocab)}): {char_vocab}") if self.model.misspell_mode == MisspellingMode.ConfusionMatrixBased: cmx, lut = load_confusion_matrix(self.model.cmx_file_train) cmx, lut = filter_cmx(cmx, lut, char_vocab) else: cmx, lut = None, {} loss_params = {} loss_params["verbose"] = False loss_params["char_vocab"] = char_vocab loss_params["cmx"] = cmx loss_params["lut"] = lut loss_params["embeddings_in_memory"] = embeddings_in_memory # At any point you can hit Ctrl + C to break out of training early. try: previous_learning_rate = learning_rate for epoch in range(0 + self.epoch, max_epochs + self.epoch): log_line(log) try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: learning_rate = group["lr"] # reload last best model if annealing with restarts is enabled if (learning_rate != previous_learning_rate and anneal_with_restarts and (base_path / "best-model.pt").exists()): log.info("resetting to best model") self.model.load(base_path / "best-model.pt") previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < 0.0001: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, ) self.model.train() train_loss: float = 0 train_auxilary_losses = {} seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) for batch_no, batch in enumerate(batch_loader): loss, auxilary_losses = self.model.forward_loss( batch, params=loss_params) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() for k, v in auxilary_losses.items(): train_auxilary_losses[k] = train_auxilary_losses.get( k, 0) + v clear_embeddings( batch, also_clear_word_embeddings=not embeddings_in_memory) if batch_no % modulo == 0: msg = f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss {train_loss / seen_batches:.6f}" # note: this is the loss accumulated in the current epoch divided by the number of already seen batches if len(train_auxilary_losses) > 0: aux_losses_str = " ".join([ f"{key}={value / seen_batches:.6f}" for (key, value) in train_auxilary_losses.items() ]) msg += f" ({aux_losses_str})" log.info(msg) iteration = epoch * total_number_of_batches + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches for k, v in 
auxilary_losses.items(): train_auxilary_losses[k] /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {epoch + 1} done: loss {train_loss:.6f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}" ) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss with open(loss_txt, "a") as f: f.write( f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) if log_train: train_eval_result, train_loss = self.model.evaluate( self.corpus.train, eval_mini_batch_size, embeddings_in_memory, num_workers=num_workers, ) f.write(f"\t{train_eval_result.log_line}") if log_dev: dev_eval_result_clean, dev_loss_clean = self.model.evaluate( self.corpus.dev, eval_mini_batch_size, embeddings_in_memory, num_workers=num_workers, ) f.write( f"\t{dev_loss_clean}\t{dev_eval_result_clean.log_line}" ) log.info( f"DEV : loss {dev_loss_clean:.6f} - score {dev_eval_result_clean.main_score:.4f}" ) # calculate scores using dev data if available # append dev score to score history dev_clean_score_history.append( dev_eval_result_clean.main_score) dev_clean_loss_history.append(dev_loss_clean) if valid_with_misspellings: # evaluate on misspellings dev_eval_result_noisy, dev_loss_noisy = self.model.evaluate( self.corpus.dev, eval_mini_batch_size, embeddings_in_memory, num_workers=num_workers, eval_mode=EvalMode.Misspellings, misspell_mode=self.model.misspell_mode, char_vocab=char_vocab, cmx=cmx, lut=lut, misspelling_rate=eval_misspelling_rate, ) f.write( f"\t{dev_loss_noisy}\t{dev_eval_result_noisy.log_line}" ) log.info( f"{log_suffix('DEV', eval_misspelling_rate, self.model.cmx_file_train, self.model.misspell_mode)}" + f" : loss {dev_loss_noisy:.6f} - score {dev_eval_result_noisy.main_score:.4f}" ) # calculate scores using dev data if available # append dev score to score history dev_noisy_score_history.append( dev_eval_result_noisy) dev_noisy_loss_history.append(dev_loss_noisy) current_score = ( dev_eval_result_clean.main_score + dev_eval_result_noisy.main_score) / 2.0 else: current_score = dev_eval_result_clean.main_score if log_test: test_eval_result_clean, test_loss_clean = self.model.evaluate( self.corpus.test, eval_mini_batch_size, embeddings_in_memory, base_path / f"test.tsv", num_workers=num_workers, ) f.write( f"\t{test_loss_clean}\t{test_eval_result_clean.log_line}" ) log.info( f"TEST : loss {test_loss_clean:.6f} - score {test_eval_result_clean.main_score:.4f}" ) if valid_with_misspellings: # evaluate on misspellings test_eval_result_noisy, test_loss_noisy = self.model.evaluate( self.corpus.test, eval_mini_batch_size, embeddings_in_memory, base_path / f"test.tsv", num_workers=num_workers, eval_mode=EvalMode.Misspellings, misspell_mode=self.model.misspell_mode, char_vocab=char_vocab, cmx=cmx, lut=lut, misspelling_rate=eval_misspelling_rate, ) f.write( f"\t{test_loss_noisy}\t{test_eval_result_noisy.log_line}" ) log.info( f"{log_suffix('TEST', eval_misspelling_rate, self.model.cmx_file_train, self.model.misspell_mode)}" + f" : loss {test_loss_noisy:.6f} - score {test_eval_result_noisy.main_score:.4f}" #f"TEST (misspell, rate={eval_misspelling_rate}) : loss {test_loss_noisy:.6f} - score {test_eval_result_noisy.main_score:.4f}" ) scheduler.step(current_score) train_loss_history.append(train_loss) # if checkpoint is enable, save model at each epoch if checkpoint and not param_selection_mode: self.model.save_checkpoint( base_path / "checkpoint.pt", optimizer.state_dict(), scheduler.state_dict(), epoch + 1, train_loss, ) 
# if we use dev data, remember best model based on dev evaluation score if (not train_with_dev and not param_selection_mode and current_score == scheduler.best): log.info("'best-model.pt' saved.") self.model.save(base_path / "best-model.pt") # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present if self.corpus.test: final_score_clean = self.final_test( base_path, embeddings_in_memory, evaluation_metric, eval_mini_batch_size, num_workers, ) final_score_noisy = self.final_test( base_path, embeddings_in_memory, evaluation_metric, eval_mini_batch_size, num_workers, eval_mode=EvalMode.Misspellings, misspell_mode=self.model.misspell_mode, misspelling_rate=eval_misspelling_rate, char_vocab=char_vocab, cmx=cmx, lut=lut, ) else: final_score_clean, final_score_noisy = 0, 0 log.info("Test data not provided setting final score to 0") log.removeHandler(log_handler) return { "test_score_clean": final_score_clean, "test_score_noisy": final_score_noisy, "dev_clean_score_history": dev_clean_score_history, "dev_noisy_score_history": dev_noisy_score_history, "train_loss_history": train_loss_history, "dev_clean_loss_history": dev_clean_loss_history, "dev_noisy_loss_history": dev_noisy_loss_history, }
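# When valid_with_misspellings is enabled, the loop above anneals and selects
# models on the mean of the clean and the noisy dev score. A tiny sketch of
# that selection rule, plus a deliberately simplified random character
# substitution used only for illustration (the real noising above goes
# through the model's misspelling modes and the corpus char_vocab):
import random


def combined_dev_score(clean_score: float, noisy_score: float, use_noisy: bool = True) -> float:
    return (clean_score + noisy_score) / 2.0 if use_noisy else clean_score


def random_misspell(text: str, rate: float = 0.05,
                    alphabet: str = "abcdefghijklmnopqrstuvwxyz") -> str:
    """Replace roughly `rate` of the alphabetic characters with random letters."""
    chars = [
        random.choice(alphabet) if c.isalpha() and random.random() < rate else c
        for c in text
    ]
    return "".join(chars)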
def find_learning_rate(
    self,
    base_path: Union[Path, str],
    file_name: str = "learning_rate.tsv",
    start_learning_rate: float = 1e-7,
    end_learning_rate: float = 10,
    iterations: int = 100,
    mini_batch_size: int = 32,
    stop_early: bool = True,
    smoothing_factor: float = 0.98,
    **kwargs,
) -> Path:
    best_loss = None
    moving_avg_loss = 0

    # cast string to Path
    if type(base_path) is str:
        base_path = Path(base_path)
    learning_rate_tsv = init_output_file(base_path, file_name)

    with open(learning_rate_tsv, "a") as f:
        f.write("ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n")

    optimizer = self.optimizer(self.model.parameters(), lr=start_learning_rate, **kwargs)

    train_data = self.corpus.train

    scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

    model_state = self.model.state_dict()
    self.model.train()

    step = 0
    while step < iterations:
        batch_loader = DataLoader(train_data, batch_size=mini_batch_size, shuffle=True)
        for batch in batch_loader:
            step += 1

            # forward pass
            loss = self.model.forward_loss(batch)

            # update optimizer and scheduler
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
            optimizer.step()
            scheduler.step(step)

            learning_rate = scheduler.get_lr()[0]
            loss_item = loss.item()
            if step == 1:
                best_loss = loss_item
            else:
                if smoothing_factor > 0:
                    moving_avg_loss = (smoothing_factor * moving_avg_loss
                                       + (1 - smoothing_factor) * loss_item)
                    loss_item = moving_avg_loss / (1 - smoothing_factor ** (step + 1))
                if loss_item < best_loss:
                    # keep the best (smoothed) loss as a float for the divergence check
                    best_loss = loss_item

            if step > iterations:
                break

            if stop_early and (loss_item > 4 * best_loss or torch.isnan(loss)):
                log_line(log)
                log.info("loss diverged - stopping early!")
                step = iterations
                break

            with open(str(learning_rate_tsv), "a") as f:
                f.write(
                    f"{step}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate}\t{loss_item}\n"
                )

    # restore the original weights and device
    self.model.load_state_dict(model_state)
    self.model.to(flair.device)

    log_line(log)
    log.info(f"learning rate finder finished - plot {learning_rate_tsv}")
    log_line(log)

    return Path(learning_rate_tsv)
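# find_learning_rate() writes a TSV with ITERATION, TIMESTAMP, LEARNING_RATE
# and TRAIN_LOSS columns. A sketch of one common way to consume it: pick the
# learning rate at the smallest recorded loss and back off by a constant
# factor (the factor of 10 is a rule of thumb, not something the code above
# prescribes):
import csv
from pathlib import Path


def suggest_learning_rate(learning_rate_tsv: Path, backoff: float = 10.0) -> float:
    with open(learning_rate_tsv, newline="") as f:
        rows = list(csv.DictReader(f, delimiter="\t"))
    best_row = min(rows, key=lambda r: float(r["TRAIN_LOSS"]))
    return float(best_row["LEARNING_RATE"]) / backoff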
def train(self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 50, anneal_factor: float = 0.5, patience: int = 5, train_with_dev: bool = False, embeddings_in_memory: bool = False, checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, eval_on_train: bool = True): """ Trains a text classification model using the training data of the corpus. :param base_path: the directory to which any results should be written to :param learning_rate: the learning rate :param mini_batch_size: the mini batch size :param max_epochs: the maximum number of epochs to train :param anneal_factor: learning rate will be decreased by this factor :param patience: number of 'bad' epochs before learning rate gets decreased :param train_with_dev: boolean indicating, if the dev data set should be used for training or not :param embeddings_in_memory: boolean indicating, if embeddings should be kept in memory or not :param checkpoint: boolean indicating, whether the model should be save after every epoch or not :param save_final_model: boolean indicating, whether the final model should be saved or not :param anneal_with_restarts: boolean indicating, whether the best model should be reloaded once the learning rate changed or not :param eval_on_train: boolean value indicating, if evaluation metrics should be calculated on training data set or not """ loss_txt = init_output_file(base_path, 'loss.tsv') with open(loss_txt, 'a') as f: f.write( 'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n' .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'), Metric.tsv_header('TEST'))) weight_extractor = WeightExtractor(base_path) optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate) anneal_mode = 'min' if train_with_dev else 'max' scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) # At any point you can hit Ctrl + C to break out of training early. 
try: previous_learning_rate = learning_rate for epoch in range(max_epochs): log.info('-' * 100) bad_epochs = scheduler.num_bad_epochs for group in optimizer.param_groups: learning_rate = group['lr'] # reload last best model if annealing with restarts is enabled if learning_rate != previous_learning_rate and anneal_with_restarts and \ os.path.exists(base_path + "/best-model.pt"): log.info('Resetting to best model ...') self.model.load_from_file(base_path + "/best-model.pt") previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < 0.001: log.info('Learning rate too small - quitting training!') break if not self.test_mode: random.shuffle(train_data) self.model.train() batches = [ self.corpus.train[x:x + mini_batch_size] for x in range(0, len(self.corpus.train), mini_batch_size) ] current_loss: float = 0 seen_sentences = 0 modulo = max(1, int(len(batches) / 10)) for batch_no, batch in enumerate(batches): scores = self.model.forward(batch) loss = self.model.calculate_loss(scores, batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_sentences += len(batch) current_loss += loss.item() clear_embeddings( batch, also_clear_word_embeddings=not embeddings_in_memory) if batch_no % modulo == 0: log.info( "epoch {0} - iter {1}/{2} - loss {3:.8f}".format( epoch + 1, batch_no, len(batches), current_loss / seen_sentences)) iteration = epoch * len(batches) + batch_no weight_extractor.extract_weights( self.model.state_dict(), iteration) current_loss /= len(train_data) self.model.eval() # if checkpoint is enable, save model at each epoch if checkpoint: self.model.save(base_path + "/checkpoint.pt") log.info('-' * 100) log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format( epoch + 1, learning_rate, bad_epochs)) dev_metric = train_metric = None dev_loss = '_' train_loss = current_loss if eval_on_train: train_metric, train_loss = self._calculate_evaluation_results_for( 'TRAIN', self.corpus.train, embeddings_in_memory, mini_batch_size) if not train_with_dev: dev_metric, dev_loss = self._calculate_evaluation_results_for( 'DEV', self.corpus.dev, embeddings_in_memory, mini_batch_size) with open(loss_txt, 'a') as f: train_metric_str = train_metric.to_tsv( ) if train_metric is not None else Metric.to_empty_tsv() dev_metric_str = dev_metric.to_tsv( ) if dev_metric is not None else Metric.to_empty_tsv() f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( epoch, datetime.datetime.now(), train_loss, train_metric_str, dev_loss, dev_metric_str, '_', Metric.to_empty_tsv())) # anneal against train loss if training with dev, otherwise anneal against dev score scheduler.step( current_loss) if train_with_dev else scheduler.step( dev_metric.f_score()) current_score = dev_metric.f_score( ) if not train_with_dev else train_metric.f_score() # if we use dev data, remember best model based on dev evaluation score if not train_with_dev and current_score == scheduler.best: self.model.save(base_path + "/best-model.pt") if save_final_model: self.model.save(base_path + "/final-model.pt") log.info('-' * 100) log.info('Testing using best model ...') self.model.eval() if os.path.exists(base_path + "/best-model.pt"): self.model = TextClassifier.load_from_file(base_path + "/best-model.pt") test_metric, test_loss = self.evaluate( self.corpus.test, mini_batch_size=mini_batch_size, eval_class_metrics=True, embeddings_in_memory=embeddings_in_memory, metric_name='TEST') test_metric.print() self.model.train() 
        log.info('-' * 100)

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        with open(base_path + "/final-model.pt", 'wb') as model_save_file:
            torch.save(self.model, model_save_file, pickle_protocol=4)
        log.info('Done.')
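# The interrupt handler above pickles the entire model object with
# torch.save. A matching load sketch, assuming the checkpoint was written
# exactly as above; note that unpickling a whole model requires the class
# definitions (e.g. TextClassifier) to be importable at load time:
import torch


def load_final_model(base_path: str, map_location: str = "cpu"):
    with open(base_path + "/final-model.pt", "rb") as model_file:
        return torch.load(model_file, map_location=map_location)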
def train(self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 2, save_model: bool = True, embeddings_in_memory: bool = True, train_with_dev: bool = False): """ Trains the model using the training data of the corpus. :param base_path: the directory to which any results should be written to :param learning_rate: the learning rate :param mini_batch_size: the mini batch size :param max_epochs: the maximum number of epochs to train :param save_model: boolean value indicating, whether the model should be saved or not :param embeddings_in_memory: boolean value indicating, if embeddings should be kept in memory or not :param train_with_dev: boolean value indicating, if the dev data set should be used for training or not """ loss_txt = init_output_file(base_path, 'loss.txt') with open(loss_txt, 'a') as f: f.write( 'EPOCH\tITERATION\tDEV_LOSS\tTRAIN_LOSS\tDEV_F_SCORE\tTRAIN_F_SCORE\tDEV_ACC\tTRAIN_ACC\n' ) weights_txt = init_output_file(base_path, 'weights.txt') weights_index = defaultdict(lambda: defaultdict(lambda: list())) optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate) anneal_mode = 'min' if train_with_dev else 'max' scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) # At any point you can hit Ctrl + C to break out of training early. try: # record overall best dev scores and best loss best_score = 0 for epoch in range(max_epochs): print('-' * 100) if not self.test_mode: random.shuffle(train_data) batches = [ train_data[x:x + mini_batch_size] for x in range(0, len(train_data), mini_batch_size) ] current_loss: float = 0 seen_sentences = 0 modulo = max(1, int(len(batches) / 10)) self.model.train() for batch_no, batch in enumerate(batches): scores = self.model.forward(batch) loss = self.model.calculate_loss(scores, batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_sentences += len(batch) current_loss += loss.item() if not embeddings_in_memory: clear_embeddings(batch) if batch_no % modulo == 0: print("epoch {0} - iter {1}/{2} - loss {3:.8f}".format( epoch + 1, batch_no, len(batches), current_loss / seen_sentences)) iteration = epoch * len(batches) + batch_no self._extract_weigths(iteration, weights_index, weights_txt) current_loss /= len(train_data) # IMPORTANT: Switch to eval mode self.model.eval() print('-' * 100) train_metrics, train_loss = self.evaluate( self.corpus.train, mini_batch_size=mini_batch_size, embeddings_in_memory=embeddings_in_memory) train_f_score = train_metrics['MICRO_AVG'].f_score() train_acc = train_metrics['MICRO_AVG'].accuracy() print( "{0:<7} epoch {1} - loss {2:.8f} - f-score {3:.4f} - acc {4:.4f}" .format('TRAIN:', epoch, train_loss, train_f_score, train_acc)) dev_f_score = dev_acc = dev_loss = 0 if not train_with_dev: dev_metrics, dev_loss = self.evaluate( self.corpus.dev, mini_batch_size=mini_batch_size, embeddings_in_memory=embeddings_in_memory) dev_f_score = dev_metrics['MICRO_AVG'].f_score() dev_acc = dev_metrics['MICRO_AVG'].accuracy() print( "{0:<7} epoch {1} - loss {2:.8f} - f-score {3:.4f} - acc {4:.4f}" .format('DEV:', epoch, dev_loss, dev_f_score, dev_acc)) with open(loss_txt, 'a') as f: f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( epoch, epoch * len(batches), 
dev_loss, train_loss, dev_f_score, train_f_score, dev_acc, train_acc)) # IMPORTANT: Switch back to train mode self.model.train() # anneal against train loss if training with dev, otherwise anneal against dev score scheduler.step( current_loss) if train_with_dev else scheduler.step( dev_f_score) is_best_model_so_far: bool = False current_score = dev_f_score if not train_with_dev else train_f_score if current_score > best_score: best_score = current_score is_best_model_so_far = True if is_best_model_so_far: if save_model: self.model.save(base_path + "/model.pt") self.model.save(base_path + "/final-model.pt") if save_model: self.model = TextClassifier.load_from_file(base_path + "/model.pt") print('-' * 100) print('testing...') test_metrics, test_loss = self.evaluate( self.corpus.test, mini_batch_size=mini_batch_size, eval_class_metrics=True, embeddings_in_memory=embeddings_in_memory) for metric in test_metrics.values(): metric.print() print('-' * 100) except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') print('saving model') with open(base_path + "/final-model.pt", 'wb') as model_save_file: torch.save(self.model, model_save_file, pickle_protocol=4) model_save_file.close() print('done')
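# Both classifier training loops above build mini-batches by slicing the
# (shuffled) training list in steps of mini_batch_size. The same pattern as
# a small reusable helper:
from typing import List, Sequence, TypeVar

T = TypeVar("T")


def make_batches(data: Sequence[T], mini_batch_size: int) -> List[Sequence[T]]:
    """Split `data` into consecutive chunks of at most `mini_batch_size` items."""
    return [data[x:x + mini_batch_size] for x in range(0, len(data), mini_batch_size)]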
def train( self, base_path: Union[Path, str], evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE, learning_rate: float = 0.1, mini_batch_size: int = 32, eval_mini_batch_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, train_with_dev: bool = False, monitor_train: bool = False, embeddings_in_memory: bool = True, checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 8, **kwargs, ) -> dict: if eval_mini_batch_size is None: eval_mini_batch_size = mini_batch_size log.info(f'Model training base path: "{base_path}"') # cast string to Path if type(base_path) is str: base_path = Path(base_path) add_file_handler(log, base_path / "training.log") log_line(log) log.info(f"Evaluation method: {evaluation_metric.name}") # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = True if (not param_selection_mode and self.corpus.test) else False log_dev = True if not train_with_dev else False loss_txt = init_output_file(base_path, "loss.tsv") with open(loss_txt, "a") as f: f.write(f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS") dummy_result, _ = self.model.evaluate( [Sentence("d", labels=["0.1"])], eval_mini_batch_size, embeddings_in_memory, ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join(dummy_result.log_header.split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dummy_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(dummy_result.log_header.split("\t"))) weight_extractor = WeightExtractor(base_path) optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if self.optimizer_state is not None: optimizer.load_state_dict(self.optimizer_state) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" if isinstance(optimizer, (AdamW, SGDW)): scheduler = ReduceLRWDOnPlateau( optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) else: scheduler = ReduceLROnPlateau( optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) if self.scheduler_state is not None: scheduler.load_state_dict(self.scheduler_state) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) dev_score_history = [] dev_loss_history = [] train_loss_history = [] # At any point you can hit Ctrl + C to break out of training early. 
try: previous_learning_rate = learning_rate for epoch in range(0 + self.epoch, max_epochs + self.epoch): log_line(log) try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: learning_rate = group["lr"] # reload last best model if annealing with restarts is enabled if (learning_rate != previous_learning_rate and anneal_with_restarts and (base_path / "best-model.pt").exists()): log.info("resetting to best model") self.model.load(base_path / "best-model.pt") previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < 0.0001: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, ) self.model.train() train_loss: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) for batch_no, batch in enumerate(batch_loader): loss = self.model.forward_loss(batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() clear_embeddings( batch, also_clear_word_embeddings=not embeddings_in_memory) if batch_no % modulo == 0: log.info( f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss " f"{train_loss / seen_batches:.8f}") iteration = epoch * total_number_of_batches + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}" ) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss with open(loss_txt, "a") as f: f.write( f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) if log_train: train_eval_result, train_loss = self.model.evaluate( self.corpus.train, eval_mini_batch_size, embeddings_in_memory, num_workers=num_workers, ) f.write(f"\t{train_eval_result.log_line}") if log_dev: dev_eval_result, dev_loss = self.model.evaluate( self.corpus.dev, eval_mini_batch_size, embeddings_in_memory, num_workers=num_workers, ) f.write(f"\t{dev_loss}\t{dev_eval_result.log_line}") log.info( f"DEV : loss {dev_loss} - score {dev_eval_result.main_score}" ) # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss) current_score = dev_eval_result.main_score if log_test: test_eval_result, test_loss = self.model.evaluate( self.corpus.test, eval_mini_batch_size, embeddings_in_memory, base_path / "test.tsv", num_workers=num_workers, ) f.write(f"\t{test_loss}\t{test_eval_result.log_line}") log.info( f"TEST : loss {test_loss} - score {test_eval_result.main_score}" ) scheduler.step(current_score) train_loss_history.append(train_loss) # if checkpoint is enable, save model at each epoch if checkpoint and not param_selection_mode: self.model.save_checkpoint( base_path / "checkpoint.pt", optimizer.state_dict(), scheduler.state_dict(), epoch + 1, train_loss, ) # if we use dev data, remember best model based on dev evaluation score if (not train_with_dev and not param_selection_mode and current_score == scheduler.best): self.model.save(base_path / "best-model.pt") # if we do not use dev data 
for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present if self.corpus.test: final_score = self.final_test( base_path, embeddings_in_memory, evaluation_metric, eval_mini_batch_size, num_workers, ) else: final_score = 0 log.info("Test data not provided setting final score to 0") return { "test_score": final_score, "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
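# Every train() variant above repeats the same update: compute the forward
# loss, zero the gradients, backpropagate, clip gradients to a norm of 5.0,
# then step the optimizer. A self-contained PyTorch illustration of that
# step on a dummy regression model (the model and data here are placeholders,
# not part of flair):
import torch


def training_step_demo() -> float:
    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    inputs, targets = torch.randn(32, 4), torch.randn(32, 1)

    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)  # same clip value as above
    optimizer.step()
    return loss.item()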
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, mini_batch_chunk_size: int = None, max_epochs: int = 100, scheduler=AnnealOnPlateau, anneal_factor: float = 0.5, patience: int = 3, initial_extra_patience=0, min_learning_rate: float = 0.0001, train_with_dev: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, anneal_with_prestarts: bool = False, batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", eval_on_train_fraction=0., eval_on_train_shuffle=False, valid_with_misspellings: bool = True, corpus_name: str = "", **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved :param learning_rate: Initial learning rate :param mini_batch_size: Size of mini-batches during training :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param train_with_dev: If True, training is performed using both train+dev data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing parameter selection. :param num_workers: Number of workers in your data loader. :param sampler: You can pass a data sampler here for special sampling of data. :param eval_on_train_fraction: the fraction of train data to do the evaluation on, if 0. the evaluation is not performed on fraction of training data, if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch :param valid_with_misspellings: use a combination of the original loss and the loss computed using the misspelled sentences for validation :param kwargs: Other arguments for the Optimizer :return: """ if self.use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" 
) log_line(log) self.use_tensorboard = False pass if use_amp: if sys.version_info < (3, 0): raise RuntimeError( "Apex currently only supports Python 3. Aborting.") if amp is None: raise RuntimeError( "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training.") if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size if learning_rate < min_learning_rate: min_learning_rate = learning_rate / 10 initial_learning_rate = learning_rate # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_handler = add_file_handler(log, base_path / "training.log") log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') log.info(f' - mixed precision training: "{use_amp}"') log.info(f' - valid_with_misspellings: "{valid_with_misspellings}"') log.info("Model:") log.info(f' - hidden_size: "{self.model.hidden_size}"') log.info(f' - train_mode: "{self.model.train_mode}"') log.info(f' - misspell_mode: "{self.model.misspell_mode}"') log.info(f' - alpha: "{self.model.alpha}"') log.info(f' - beta: "{self.model.beta}"') if self.model.misspell_mode == MisspellingMode.Seq2Seq: log.info(f' - errgen_model: "{self.model.errgen_model_train}"') log.info(f' - errgen_mode: "{self.model.errgen_mode_train}"') from pysia.utils import is_generation_mode, is_correction_mode if is_generation_mode(self.model.errgen_mode_train): log.info(f' - errgen_temp: "{self.model.errgen_temp_train}"') log.info(f' - errgen_topk: "{self.model.errgen_topk_train}"') elif is_correction_mode(self.model.errgen_mode_train): log.info(f' - errgen_nbest: "{self.model.errgen_nbest_train}"') log.info( f' - errgen_beam_size: "{self.model.errgen_beam_size_train}"' ) elif self.model.misspell_mode in [MisspellingMode.Random]: log.info( f' - misspelling_rate: "{self.model.misspelling_rate_train}"') elif self.model.misspell_mode in [ MisspellingMode.ConfusionMatrixBased ]: log.info(f' - cmx_file: "{self.model.cmx_file_train}"') elif self.model.misspell_mode in [MisspellingMode.Typos]: log.info(f' - typos_file: "{self.model.typos_file_train}"') log.info( f' - misspelling_rate: "{self.model.misspelling_rate_train}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = (True if (not param_selection_mode and self.corpus.test and monitor_test) else False) log_dev = True if not train_with_dev else False log_train_part = (True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False) if log_train_part: train_part_size = (len( self.corpus.dev) if eval_on_train_fraction == "dev" else int( len(self.corpus.train) * eval_on_train_fraction)) assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) 
log_test = not log_dev eval_misspelling_rate = 0.05 eval_misspelling_mode = MisspellingMode.Random log_suffix = lambda prefix, rate, cm, mode: f"{prefix} (misspell: cmx={cm})" if mode == MisspellingMode.ConfusionMatrixBased else f"{prefix} (misspell: rate={rate})" # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") weight_extractor = WeightExtractor(base_path) optimizer: torch.optim.Optimizer = self.optimizer( self.model.parameters(), lr=learning_rate, **kwargs) if use_amp: self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" lr_scheduler = scheduler( optimizer, factor=anneal_factor, patience=patience, initial_extra_patience=initial_extra_patience, mode=anneal_mode, verbose=True, ) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) # initialize sampler if provided if sampler is not None: # init with default values if only class is provided if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from sampler.set_dataset(train_data) shuffle = False dev_clean_score_history = [] dev_noisy_score_history = [] dev_clean_loss_history = [] dev_noisy_loss_history = [] train_loss_history = [] micro_batch_size = mini_batch_chunk_size complete_data = ConcatDataset( [self.corpus.train, self.corpus.dev, self.corpus.test]) char_vocab = make_char_vocab(complete_data) log.info( f"Vocabulary of the corpus (#{len(char_vocab)}): {char_vocab}") cmx, lut, typos = None, {}, {} if self.model.misspell_mode == MisspellingMode.ConfusionMatrixBased: cmx, lut = load_confusion_matrix(self.model.cmx_file_train) cmx, lut = filter_cmx(cmx, lut, char_vocab) elif self.model.misspell_mode == MisspellingMode.Typos: typos = load_typos(self.model.typos_file_train, char_vocab, False) if self.model.misspell_mode == MisspellingMode.Seq2Seq: translator, opt = init_translator( self.model.errgen_model_train, self.model.errgen_mode_train, log, temp=self.model.errgen_temp_train, topk=self.model.errgen_topk_train, nbest=self.model.errgen_nbest_train, beam_size=self.model.errgen_beam_size_train, shard_size=20000, batch_size=256, verbose=True) else: translator, opt = None, None loss_params = {} loss_params["verbose"] = False loss_params["char_vocab"] = char_vocab loss_params["cmx"] = cmx loss_params["lut"] = lut loss_params["typos"] = typos loss_params["translator"] = translator loss_params["opt"] = opt loss_params["translation_mode"] = self.model.errgen_mode_train loss_params["embeddings_storage_mode"] = embeddings_storage_mode if self.model.train_mode == TrainingMode.Combined and self.model.beta > 0.0: batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) sum_sent_len, cnt_sent = 0, 0 for batch_no, batch in enumerate(batch_loader): for sent in batch: sum_sent_len += len(sent) cnt_sent += len(batch) mean_tokens_per_batch = float(sum_sent_len) / float(cnt_sent) loss_params["mean_tokens_per_batch"] = mean_tokens_per_batch log.info(f"mean_tokens_per_batch = {mean_tokens_per_batch:.4f}") # At any point you can hit Ctrl + C to break out of training early. 
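# A minimal, self-contained sketch of the mean_tokens_per_batch estimate computed
# above (plain token lists stand in for batches of flair Sentences; the names and
# values below are illustrative only, not part of the trainer):
#
#     batches = [[["a", "b", "c"], ["d", "e"]], [["f", "g", "h", "i"]]]
#     sum_sent_len = sum(len(sent) for batch in batches for sent in batch)   # 9 tokens
#     cnt_sent = sum(len(batch) for batch in batches)                        # 3 sentences
#     mean_tokens_per_batch = float(sum_sent_len) / float(cnt_sent)          # 3.0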
try: previous_learning_rate = learning_rate for self.epoch in range(self.epoch + 1, max_epochs + 1): log_line(log) if anneal_with_prestarts: last_epoch_model_state_dict = copy.deepcopy( self.model.state_dict()) if eval_on_train_shuffle: train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if learning_rate != previous_learning_rate and batch_growth_annealing: mini_batch_size *= 2 # reload last best model if annealing with restarts is enabled if ((anneal_with_restarts or anneal_with_prestarts) and learning_rate != previous_learning_rate and (base_path / "best-model.pt").exists()): if anneal_with_restarts: log.info("resetting to best model") self.model.load_state_dict( self.model.load(base_path / "best-model.pt").state_dict()) if anneal_with_prestarts: log.info("resetting to pre-best model") self.model.load_state_dict( self.model.load(base_path / "pre-best-model.pt").state_dict()) previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < min_learning_rate: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) self.model.train() train_loss: float = 0 train_auxilary_losses = {} seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches batch_time = 0 for batch_no, batch in enumerate(batch_loader): start_time = time.time() # zero the gradients on the model and optimizer self.model.zero_grad() optimizer.zero_grad() # if necessary, make batch_steps batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [ batch[x:x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] # forward and backward for batch for batch_step in batch_steps: # forward pass loss, auxilary_losses = self.model.forward_loss( batch_step, params=loss_params) # Backward if use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() for k, v in auxilary_losses.items(): train_auxilary_losses[k] = train_auxilary_losses.get( k, 0) + v # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode) batch_time += time.time() - start_time if seen_batches % modulo == 0: msg = f"epoch {self.epoch} - iter {seen_batches}/{total_number_of_batches} - loss {train_loss / seen_batches:.6f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}" # note: this is the loss accumulated in the current epoch divided by the number of already seen batches if len(train_auxilary_losses) > 0: accuracies = [ (key, value) for (key, value) in train_auxilary_losses.items() if key.startswith("acc_") ] counts = [ (key, value) for (key, value) in train_auxilary_losses.items() if key.startswith("cnt_") or key.startswith("sum_") ] losses = [ (key, value) for (key, value) in train_auxilary_losses.items() if key.startswith("loss_") ] aux_losses_str = "" if len(losses) > 0: aux_losses_str = " ".join([ f"{key}={value / seen_batches:.6f}" for 
(key, value) in losses ]) if len(accuracies) > 0: if len(aux_losses_str) > 0: aux_losses_str += " " aux_losses_str += " ".join([ f"{key}={value / seen_batches:.2f}%" for (key, value) in accuracies ]) if len(counts) > 0: if len(aux_losses_str) > 0: aux_losses_str += " " aux_losses_str += " ".join([ f"{key}={value / seen_batches:.2f}" for (key, value) in counts ]) msg += f" ({aux_losses_str})" log.info(msg) batch_time = 0 iteration = self.epoch * total_number_of_batches + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches for k, v in auxilary_losses.items(): train_auxilary_losses[k] /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {self.epoch} done: loss {train_loss:.4f} - lr {learning_rate:.4f}" ) if self.use_tensorboard: writer.add_scalar("train_loss", train_loss, self.epoch) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss # evaluate on train / dev / test split depending on training settings result_line: str = "" if log_train: train_eval_result, train_loss = self.model.evaluate( DataLoader( self.corpus.train, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, eval_dict_name=corpus_name, ) result_line += f"\t{train_eval_result.log_line}" # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode) if log_train_part: train_part_eval_result, train_part_loss = self.model.evaluate( DataLoader( train_part, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, eval_dict_name=corpus_name, ) result_line += ( f"\t{train_part_loss}\t{train_part_eval_result.log_line}" ) log.info( f"TRAIN_SPLIT : loss {train_part_loss} - score {round(train_part_eval_result.main_score, 4)}" ) if log_dev: dev_eval_result_clean, dev_loss_clean = self.model.evaluate( DataLoader( self.corpus.dev, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, eval_dict_name=corpus_name, ) result_line += f"\t{dev_loss_clean}\t{dev_eval_result_clean.log_line}" log.info( f"DEV : loss {dev_loss_clean} - score {round(dev_eval_result_clean.main_score, 4)}" ) # calculate scores using dev data if available # append dev score to score history dev_clean_score_history.append( dev_eval_result_clean.main_score) dev_clean_loss_history.append(dev_loss_clean.item()) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("dev_clean_loss", dev_loss_clean, self.epoch) writer.add_scalar("dev_clean_score", dev_eval_result_clean.main_score, self.epoch) # evaluate on misspellings if valid_with_misspellings: dev_eval_result_noisy, dev_loss_noisy = self.model.evaluate( DataLoader( self.corpus.dev, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), embeddings_storage_mode=embeddings_storage_mode, eval_mode=EvalMode.Misspellings, misspell_mode=eval_misspelling_mode, char_vocab=char_vocab, cmx=cmx, lut=lut, typos=typos, misspelling_rate=eval_misspelling_rate, eval_dict_name=corpus_name, ) result_line += f"\t{dev_loss_noisy}\t{dev_eval_result_noisy.log_line}" log.info( f"{log_suffix('DEV', eval_misspelling_rate, '', eval_misspelling_mode)}" + f" : loss {dev_loss_noisy} - score {round(dev_eval_result_noisy.main_score, 4)}" ) # 
calculate scores using dev data if available # append dev score to score history dev_noisy_score_history.append(dev_eval_result_noisy) dev_noisy_loss_history.append(dev_loss_noisy.item()) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("dev_noisy_loss", dev_loss_noisy, self.epoch) writer.add_scalar("dev_noisy_score", dev_eval_result_noisy.main_score, self.epoch) if valid_with_misspellings: current_score = ( dev_eval_result_clean.main_score + dev_eval_result_noisy.main_score) / 2.0 dev_loss = (dev_loss_clean + dev_loss_noisy) / 2.0 else: current_score = dev_eval_result_clean.main_score dev_loss = dev_loss_clean # else: current_score = train_loss if log_test: test_eval_result_clean, test_loss_clean = self.model.evaluate( DataLoader( self.corpus.test, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), base_path / "test.tsv", embeddings_storage_mode=embeddings_storage_mode, eval_dict_name=corpus_name, ) result_line += f"\t{test_loss_clean}\t{test_eval_result_clean.log_line}" log.info( f"TEST : loss {test_loss_clean} - score {round(test_eval_result_clean.main_score, 4)}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("test_clean_loss", test_loss_clean, self.epoch) writer.add_scalar("test_clean_score", test_eval_result_clean.main_score, self.epoch) if valid_with_misspellings: # evaluate on misspellings test_eval_result_noisy, test_loss_noisy = self.model.evaluate( DataLoader( self.corpus.test, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), base_path / f"test.tsv", embeddings_storage_mode=embeddings_storage_mode, eval_mode=EvalMode.Misspellings, misspell_mode=eval_misspelling_mode, char_vocab=char_vocab, cmx=cmx, lut=lut, typos=typos, misspelling_rate=eval_misspelling_rate, eval_dict_name=corpus_name, ) result_line += f"\t{test_loss_noisy}\t{test_eval_result_noisy.log_line}" log.info( f"{log_suffix('TEST', eval_misspelling_rate, '', eval_misspelling_mode)}" + f" : loss {test_loss_noisy} - score {round(test_eval_result_noisy.main_score, 4)}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("test_noisy_loss", test_loss_noisy, self.epoch) writer.add_scalar( "test_noisy_score", test_eval_result_noisy.main_score, self.epoch) # determine learning rate annealing through scheduler. 
Use auxiliary metric for AnnealOnPlateau if not train_with_dev and isinstance(lr_scheduler, AnnealOnPlateau): lr_scheduler.step(current_score, dev_loss) else: lr_scheduler.step(current_score) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = lr_scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group["lr"] if new_learning_rate != previous_learning_rate: bad_epochs = patience + 1 if previous_learning_rate == initial_learning_rate: bad_epochs += initial_extra_patience # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") # output log file with open(loss_txt, "a") as f: # make headers on first epoch if self.epoch == 1: f.write( f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS" ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join( train_eval_result.log_header.split("\t"))) if log_train_part: f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" + "\tTRAIN_PART_".join( train_part_eval_result.log_header. split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join( dev_eval_result_clean.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join( test_eval_result_clean.log_header.split("\t"))) f.write( f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if checkpoint is enabled, save model at each epoch if checkpoint and not param_selection_mode: self.save_checkpoint(base_path / "checkpoint.pt") # if we use dev data, remember best model based on dev evaluation score if ((not train_with_dev or anneal_with_restarts or anneal_with_prestarts) and not param_selection_mode and current_score == lr_scheduler.best and bad_epochs == 0): log.info("saving best model") self.model.save(base_path / "best-model.pt") if anneal_with_prestarts: current_state_dict = self.model.state_dict() self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if self.use_tensorboard: writer.close() if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present if self.corpus.test: final_score_clean = self.final_test(base_path, mini_batch_chunk_size, num_workers, embeddings_storage_mode, corpus_name=corpus_name) final_score_noisy = self.final_test( base_path, mini_batch_chunk_size, num_workers, embeddings_storage_mode, eval_mode=EvalMode.Misspellings, misspell_mode=eval_misspelling_mode, misspelling_rate=eval_misspelling_rate, char_vocab=char_vocab, cmx=cmx, lut=lut, typos=typos, corpus_name=corpus_name) else: final_score_clean, final_score_noisy = 0, 0 log.info("Test data not provided setting final score to 0") log.removeHandler(log_handler) if self.use_tensorboard: writer.close() return { "test_score_clean": final_score_clean, "test_score_noisy": final_score_noisy, "dev_clean_score_history": dev_clean_score_history, "dev_noisy_score_history": dev_noisy_score_history, "train_loss_history": train_loss_history, "dev_clean_loss_history": dev_clean_loss_history, "dev_noisy_loss_history": dev_noisy_loss_history, }
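# Usage sketch for the train() method above. The trainer construction is an
# assumption (the class, model and corpus names below are hypothetical, not taken
# from this file); the keyword arguments are the ones defined in the signature above.
#
#     trainer = ModelTrainer(tagger, corpus)        # hypothetical trainer/tagger/corpus
#     results = trainer.train(
#         "resources/taggers/noisy-ner",            # base_path for logs and models
#         learning_rate=0.1,
#         mini_batch_size=32,
#         max_epochs=100,
#         valid_with_misspellings=True,             # average clean and noisy dev scores
#         eval_on_train_fraction="dev",             # also evaluate on a dev-sized train slice
#         corpus_name="my_corpus",                  # hypothetical name used for eval dicts
#     )
#     print(results["test_score_clean"], results["test_score_noisy"])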
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, mini_batch_chunk_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, min_learning_rate: float = 0.0001, train_with_dev: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", eval_on_train_fraction=0.0, eval_on_train_shuffle=False, gamma: float = 1.0, **kwargs, ) -> dict: if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_line(log) log.info(f'Model1: "{self.model1}"') log.info(f'Model2: "{self.model2}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") loss_txt = init_output_file(base_path, "loss.tsv") learning_rate1 = learning_rate2 = learning_rate optimizer1: torch.optim.Optimizer = self.optimizer( self.model1.parameters(), lr=learning_rate1, **kwargs ) optimizer2: torch.optim.Optimizer = self.optimizer( self.model2.parameters(), lr=learning_rate2, **kwargs ) anneal_mode = "min" if train_with_dev else "max" scheduler1: ReduceLROnPlateau = ReduceLROnPlateau( optimizer1, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) scheduler2: ReduceLROnPlateau = ReduceLROnPlateau( optimizer2, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True, ) train_data = self.corpus.train dev_score_history = [] dev_loss_history = [] train_loss_history = [] try: previous_learning_rate1 = learning_rate1 # previous_learning_rate2 = learning_rate2 for self.epoch in range(self.epoch + 1, max_epochs + 1): log_line(log) # get new learning rate for group in optimizer1.param_groups: learning_rate1 = group["lr"] for group in optimizer2.param_groups: learning_rate2 = group["lr"] previous_learning_rate1 = learning_rate1 # previous_learning_rate2 = learning_rate2 # stop training if learning rate becomes too small if learning_rate1 < min_learning_rate: log_line(log) log.info("learning rate (1) too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) self.model1.train() self.model2.train() train_loss: float = 0 train_loss1: float = 0 train_loss2: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches batch_time = 0 for batch_no, batch in enumerate(batch_loader): start_time = time.time() # zero the gradients on the model and optimizer self.model1.zero_grad() self.model2.zero_grad() optimizer1.zero_grad() optimizer2.zero_grad() loss1 = self.model1.forward_loss(batch) loss2 = self.model2.forward_loss(batch, self.model1.encoder_final) loss = loss1 + gamma * loss2 loss.backward() torch.nn.utils.clip_grad_norm_(self.model1.parameters(), 5.0) torch.nn.utils.clip_grad_norm_(self.model2.parameters(), 5.0) optimizer1.step() optimizer2.step() seen_batches += 1 train_loss += loss.item() train_loss1 += loss1.item() train_loss2 += 
loss2.item() store_embeddings(batch, embeddings_storage_mode) batch_time += time.time() - start_time if batch_no % modulo == 0: log.info( f"epoch {self.epoch} - iter {batch_no}/{total_number_of_batches} - " f"loss {train_loss / seen_batches:.8f} - " f"samples/sec: {mini_batch_size * modulo / batch_time:.2f}" ) batch_time = 0 train_loss /= seen_batches train_loss1 /= seen_batches train_loss2 /= seen_batches self.model1.eval() self.model2.eval() log_line(log) log.info( f"EPOCH {self.epoch} done: " f"loss {train_loss:.5f} - " f"loss1 {train_loss1:.5f} - " f"loss2 {train_loss2:.5f} - " f"gamma {gamma:.2f} - " f"lr1 {learning_rate1:.5f} - " f"lr2 {learning_rate2:.5f}" ) result_line: str = "" dev_eval_result, dev_loss = self.model1.evaluate( DataLoader( self.corpus.dev, batch_size=mini_batch_chunk_size, num_workers=num_workers, ), embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}" log.info(f"DEV : loss {dev_loss} - score {dev_eval_result.main_score}") # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss) current_score = dev_eval_result.main_score # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) # determine learning rate annealing through scheduler scheduler1.step(current_score) scheduler2.step(current_score) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = scheduler1.num_bad_epochs except: bad_epochs = 0 for group in optimizer1.param_groups: new_learning_rate1 = group["lr"] if new_learning_rate1 != previous_learning_rate1: bad_epochs = patience + 1 # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") # output log file with open(loss_txt, "a") as f: # make headers on first epoch if self.epoch == 1: f.write( f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS" ) f.write( "\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t")) ) f.write( f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if we use dev data, remember best model based on dev evaluation score if current_score == scheduler1.best: log.info(f"BEST SO FAR: {scheduler1.best}") self.model1.save(base_path / "best-model1.pt") self.model2.save(base_path / "best-model2.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") return { "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
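# Minimal sketch of the joint optimisation pattern used by the two-model train()
# above: two models, two optimizers, one combined loss weighted by gamma. The toy
# modules and data are illustrative; only the update pattern mirrors the trainer.
#
#     import torch
#
#     model1, model2 = torch.nn.Linear(4, 2), torch.nn.Linear(2, 1)
#     opt1 = torch.optim.SGD(model1.parameters(), lr=0.1)
#     opt2 = torch.optim.SGD(model2.parameters(), lr=0.1)
#     gamma = 1.0
#     x, y1, y2 = torch.randn(8, 4), torch.randn(8, 2), torch.randn(8, 1)
#
#     h = model1(x)
#     loss1 = torch.nn.functional.mse_loss(h, y1)
#     loss2 = torch.nn.functional.mse_loss(model2(h), y2)
#     loss = loss1 + gamma * loss2                  # gradients flow into both models
#
#     opt1.zero_grad(); opt2.zero_grad()
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model1.parameters(), 5.0)
#     torch.nn.utils.clip_grad_norm_(model2.parameters(), 5.0)
#     opt1.step(); opt2.step()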
def train(self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 2, save_model: bool = True, embeddings_in_memory: bool = True, train_with_dev: bool = False, use_tensorboard: bool = False): """ Trains the model using the training data of the corpus. :param base_path: the directory to which any results should be written to :param learning_rate: the learning rate :param mini_batch_size: the mini batch size :param max_epochs: the maximum number of epochs to train :param save_model: boolean value indicating, whether the model should be saved or not :param embeddings_in_memory: boolean value indicating, if embeddings should be kept in memory or not :param train_with_dev: boolean value indicating, if the dev data set should be used for training or not """ if use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" ) log_line(log) self.use_tensorboard = False pass loss_txt = init_output_file_in(base_path, 'loss.tsv') training_log = init_output_file_in(base_path, 'training_log.txt') with open(loss_txt, 'a') as f: f.write( f"EPOCH\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\tDEV_LOSS\tDEV_PRECISION\tDEV_RECALL\tDEV_F1\tDEV_ACC\n" #f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\tDEV_LOSS\tDEV_PRECISION\tDEV_RECALL\tDEV_F1\tTRAIN_PRECISION\tTRAIN_RECALL\tTRAIN_F1\tDEV_ACC\tTRAIN_ACC\n" ) f.close() # with open(loss_txt, 'a') as f: # f.write('EPOCH\tITERATION\tDEV_LOSS\tTRAIN_LOSS\tDEV_F_SCORE\tTRAIN_F_SCORE\tDEV_ACC\tTRAIN_ACC\n') weights_txt = init_output_file(base_path, 'weights.txt') weights_index = defaultdict(lambda: defaultdict(lambda: list())) optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate) anneal_mode = 'min' if train_with_dev else 'max' scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) # At any point you can hit Ctrl + C to break out of training early. 
try: # record overall best dev scores and best loss best_score = 0 for epoch in range(max_epochs): print('-' * 100) if not self.test_mode: random.shuffle(train_data) batches = [ train_data[x:x + mini_batch_size] for x in range(0, len(train_data), mini_batch_size) ] current_loss: float = 0 seen_sentences = 0 modulo = max(1, int(len(batches) / 10)) self.model.train() for batch_no, batch in enumerate(batches): scores = self.model.forward(batch) loss = self.model.calculate_loss(scores, batch) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_sentences += len(batch) current_loss += loss.item() if not embeddings_in_memory: clear_embeddings(batch) if batch_no % modulo == 0: print("epoch {0} - iter {1}/{2} - loss {3:.8f}".format( epoch + 1, batch_no, len(batches), current_loss / seen_sentences)) iteration = epoch * len(batches) + batch_no self._extract_weigths(iteration, weights_index, weights_txt) current_loss /= len(train_data) # IMPORTANT: Switch to eval mode self.model.eval() print('-' * 100) #train_metrics, train_loss = self.evaluate(self.corpus.train, mini_batch_size=mini_batch_size, # embeddings_in_memory=embeddings_in_memory) #train_f_score = train_metrics['MICRO_AVG'].f_score() #train_acc = train_metrics['MICRO_AVG'].accuracy() #print("{0:<7} epoch {1} - loss {2:.8f} - f-score {3:.4f} - acc {4:.4f}".format( # 'TRAIN:', epoch, train_loss, train_f_score, train_acc)) dev_presicion = dev_recall = dev_f_score = dev_acc = dev_loss = 0 if not train_with_dev: dev_metrics, dev_loss = self.evaluate( self.corpus.dev, mini_batch_size=mini_batch_size, embeddings_in_memory=embeddings_in_memory) dev_precision = dev_metrics['MICRO_AVG'].precision() dev_recall = dev_metrics['MICRO_AVG'].recall() dev_f_score = dev_metrics['MICRO_AVG'].f_score() dev_acc = dev_metrics['MICRO_AVG'].accuracy() print( "{0:<7} epoch {1} - loss {2:.8f} - f-score {3:.4f} - acc {4:.4f}" .format('DEV:', epoch, dev_loss, dev_f_score, dev_acc)) with open(loss_txt, 'a') as f: f.write( f"{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate:.4f}\t{current_loss}\t{dev_loss}\t{dev_presicion}\t{dev_recall}\t{dev_f_score}\t{dev_acc}\n" ) f.close() if use_tensorboard: writer.add_scalar("dev_loss", dev_loss, epoch) writer.add_scalar("dev_score", dev_f_score, epoch) writer.add_scalar("train_loss", current_loss, epoch) #writer.add_scalar("train_score", train_f_score, epoch) # f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( # epoch, epoch * len(batches), dev_loss, train_loss, dev_f_score, train_f_score, dev_acc, train_acc)) # IMPORTANT: Switch back to train mode self.model.train() # anneal against train loss if training with dev, otherwise anneal against dev score scheduler.step( current_loss) if train_with_dev else scheduler.step( dev_f_score) is_best_model_so_far: bool = False current_score = dev_f_score if not train_with_dev else train_f_score if current_score > best_score: best_score = current_score is_best_model_so_far = True if is_best_model_so_far: if save_model: self.model.save(base_path + "/model.pt") self.model.save(base_path + "/final-model.pt") if save_model: self.model = TextClassifier.load_from_file(base_path + "/model.pt") print('-' * 100) print('testing...') test_metrics, test_loss = self.evaluate( self.corpus.test, mini_batch_size=mini_batch_size, eval_class_metrics=True, embeddings_in_memory=embeddings_in_memory) for metric in test_metrics.values(): metric.print() with open(training_log, 'a') as c: for metric in test_metrics.values(): 
print(metric.name + "\t" + "False-Negative: " + str(metric._fn) + "\t" + "False-Positive: " + str(metric._fp) + "\t" + "True-Negative: " + str(metric._tn) + "\t" + "True-Positive: " + str(metric._tp)) c.write(metric.name + "\t" + "False-Negative: " + str(metric._fn) + "\t" + "False-Positive: " + str(metric._fp) + "\t" + "True-Negative: " + str(metric._tn) + "\t" + "True-Positive: " + str(metric._tp) + "\n") c.close() print('-' * 100) if use_tensorboard: writer.close() except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') print('saving model') with open(base_path + "/final-model.pt", 'wb') as model_save_file: torch.save(self.model, model_save_file, pickle_protocol=4) model_save_file.close() print('done')
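# Minimal sketch of the manual shuffling / mini-batching used in the epoch loop
# above (integers stand in for sentences; the list comprehension is the same
# slicing pattern as in the trainer):
#
#     import random
#
#     train_data = list(range(10))
#     mini_batch_size = 3
#     random.shuffle(train_data)
#     batches = [train_data[x:x + mini_batch_size]
#                for x in range(0, len(train_data), mini_batch_size)]
#     # -> four batches: three of size 3 and a final one of size 1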
from pathlib import Path
def find_learning_rate(self,
                       base_path: Union[Path, str],
                       file_name: str = 'learning_rate.tsv',
                       start_learning_rate: float = 1e-07,
                       end_learning_rate: float = 10,
                       iterations: int = 100,
                       mini_batch_size: int = 32,
                       stop_early: bool = True,
                       smoothing_factor: float = 0.98,
                       **kwargs) -> Path:
    best_loss = None
    moving_avg_loss = 0

    # cast string to Path and prepare the output file
    if type(base_path) is str:
        base_path = Path(base_path)
    learning_rate_tsv = init_output_file(base_path, file_name)
    with open(learning_rate_tsv, 'a') as f:
        f.write('ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n')

    optimizer = self.optimizer(self.model.parameters(), lr=start_learning_rate, **kwargs)

    train_data = self.corpus.train
    batch_loader = DataLoader(train_data, batch_size=mini_batch_size, shuffle=True)

    # exponentially anneal the learning rate from start to end over the given iterations
    scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

    # remember the initial model state so it can be restored after the search
    model_state = self.model.state_dict()
    model_device = next(self.model.parameters()).device

    self.model.train()

    for itr, batch in enumerate(batch_loader):
        loss = self.model.forward_loss(batch)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
        optimizer.step()
        scheduler.step(1)
        learning_rate = scheduler.get_lr()[0]

        loss_item = loss.item()
        if itr == 0:
            best_loss = loss_item
        else:
            # exponentially smoothed (and bias-corrected) loss
            if smoothing_factor > 0:
                moving_avg_loss = (smoothing_factor * moving_avg_loss
                                   + (1 - smoothing_factor) * loss_item)
                loss_item = moving_avg_loss / (1 - smoothing_factor ** (itr + 1))
            if loss_item < best_loss:
                best_loss = loss_item

        # stop once the loss clearly diverges
        if stop_early and (loss_item > 4 * best_loss or torch.isnan(loss)):
            log_line(log)
            log.info('loss diverged - stopping early!')
            break

        if itr > iterations:
            break

        with open(str(learning_rate_tsv), 'a') as f:
            f.write(f'{itr}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate}\t{loss_item}\n')

    # restore the original model state and device
    self.model.load_state_dict(model_state)
    self.model.to(model_device)

    log_line(log)
    log.info(f'learning rate finder finished - plot {learning_rate_tsv}')
    log_line(log)

    return Path(learning_rate_tsv)
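# Minimal sketch of the loss smoothing and divergence check performed in
# find_learning_rate() above (the toy loss values are arbitrary; the formulas
# mirror the loop body):
#
#     smoothing_factor = 0.98
#     moving_avg_loss, best_loss = 0.0, None
#     for itr, raw_loss in enumerate([2.0, 1.9, 1.8, 20.0]):
#         loss_item = raw_loss
#         if itr == 0:
#             best_loss = loss_item
#         else:
#             moving_avg_loss = (smoothing_factor * moving_avg_loss
#                                + (1 - smoothing_factor) * loss_item)
#             loss_item = moving_avg_loss / (1 - smoothing_factor ** (itr + 1))
#             best_loss = min(best_loss, loss_item)
#         if loss_item > 4 * best_loss:
#             print("loss diverged - stopping early!")
#             break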
def train(self, base_path: Union[(Path, str)], learning_rate: float = 0.1, mini_batch_size: int = 32, eval_mini_batch_size: int = None, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 3, min_learning_rate: float = 0.0001, train_with_dev: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = 'cpu', checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = 'O1', **kwargs) -> dict: "\n Trains any class that implements the flair.nn.Model interface.\n :param base_path: Main path to which all output during training is logged and models are saved\n :param learning_rate: Initial learning rate\n :param mini_batch_size: Size of mini-batches during training\n :param eval_mini_batch_size: Size of mini-batches during evaluation\n :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.\n :param anneal_factor: The factor by which the learning rate is annealed\n :param patience: Patience is the number of epochs with no improvement the Trainer waits\n until annealing the learning rate\n :param min_learning_rate: If the learning rate falls below this threshold, training terminates\n :param train_with_dev: If True, training is performed using both train+dev data\n :param monitor_train: If True, training data is evaluated at end of each epoch\n :param monitor_test: If True, test data is evaluated at end of each epoch\n :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),\n 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)\n :param checkpoint: If True, a full checkpoint is saved at end of each epoch\n :param save_final_model: If True, final model is saved\n :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate\n :param shuffle: If True, data is shuffled during training\n :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing\n parameter selection.\n :param num_workers: Number of workers in your data loader.\n :param sampler: You can pass a data sampler here for special sampling of data.\n :param kwargs: Other arguments for the Optimizer\n :return:\n " if self.use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( 'ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!' ) log_line(log) self.use_tensorboard = False pass if use_amp: if (sys.version_info < (3, 0)): raise RuntimeError( 'Apex currently only supports Python 3. Aborting.') if (amp is None): raise RuntimeError( 'Failed to import apex. Please install apex from https://www.github.com/nvidia/apex to enable mixed-precision training.' 
) if (eval_mini_batch_size is None): eval_mini_batch_size = mini_batch_size if (type(base_path) is str): base_path = Path(base_path) log_handler = add_file_handler(log, (base_path / 'training.log')) log_line(log) log.info(''.join(['Model: "', '{}'.format(self.model), '"'])) log_line(log) log.info(''.join(['Corpus: "', '{}'.format(self.corpus), '"'])) log_line(log) log.info('Parameters:') log.info(''.join( [' - learning_rate: "', '{}'.format(learning_rate), '"'])) log.info(''.join( [' - mini_batch_size: "', '{}'.format(mini_batch_size), '"'])) log.info(''.join([' - patience: "', '{}'.format(patience), '"'])) log.info(''.join( [' - anneal_factor: "', '{}'.format(anneal_factor), '"'])) log.info(''.join([' - max_epochs: "', '{}'.format(max_epochs), '"'])) log.info(''.join([' - shuffle: "', '{}'.format(shuffle), '"'])) log.info(''.join( [' - train_with_dev: "', '{}'.format(train_with_dev), '"'])) log_line(log) log.info(''.join( ['Model training base path: "', '{}'.format(base_path), '"'])) log_line(log) log.info(''.join(['Device: ', '{}'.format(flair.device)])) log_line(log) log.info(''.join([ 'Embeddings storage mode: ', '{}'.format(embeddings_storage_mode) ])) log_train = (True if monitor_train else False) log_test = (True if ((not param_selection_mode) and self.corpus.test and monitor_test) else False) log_dev = (True if (not train_with_dev) else False) loss_txt = init_output_file(base_path, 'loss.tsv') weight_extractor = WeightExtractor(base_path) optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if (self.optimizer_state is not None): optimizer.load_state_dict(self.optimizer_state) if use_amp: (self.model, optimizer) = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) anneal_mode = ('min' if train_with_dev else 'max') scheduler = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True) if (self.scheduler_state is not None): scheduler.load_state_dict(self.scheduler_state) train_data = self.corpus.train if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) if (sampler is not None): sampler = sampler(train_data) shuffle = False dev_score_history = [] dev_loss_history = [] train_loss_history = [] try: previous_learning_rate = learning_rate for epoch in range((0 + self.epoch), (max_epochs + self.epoch)): log_line(log) for group in optimizer.param_groups: learning_rate = group['lr'] if ((learning_rate != previous_learning_rate) and anneal_with_restarts and (base_path / 'best-model.pt').exists()): log.info('resetting to best model') self.model.load((base_path / 'best-model.pt')) previous_learning_rate = learning_rate if (learning_rate < min_learning_rate): log_line(log) log.info('learning rate too small - quitting training!') log_line(log) break batch_loader = DataLoader(train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler) self.model.train() train_loss = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int((total_number_of_batches / 10))) batch_time = 0 for (batch_no, batch) in enumerate(batch_loader): start_time = time.time() loss = self.model.forward_loss(batch) optimizer.zero_grad() if use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() store_embeddings(batch, embeddings_storage_mode) batch_time += (time.time() - start_time) if 
((batch_no % modulo) == 0): log.info(''.join([ 'epoch ', '{}'.format((epoch + 1)), ' - iter ', '{}'.format(batch_no), '/', '{}'.format(total_number_of_batches), ' - loss ', '{:.8f}'.format((train_loss / seen_batches)), ' - samples/sec: ', '{:.2f}'.format( ((mini_batch_size * modulo) / batch_time)) ])) batch_time = 0 iteration = ((epoch * total_number_of_batches) + batch_no) if (not param_selection_mode): weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches self.model.eval() log_line(log) log.info(''.join([ 'EPOCH ', '{}'.format((epoch + 1)), ' done: loss ', '{:.4f}'.format(train_loss), ' - lr ', '{:.4f}'.format(learning_rate) ])) if self.use_tensorboard: writer.add_scalar('train_loss', train_loss, (epoch + 1)) current_score = train_loss result_line = '' if log_train: (train_eval_result, train_loss) = self.model.evaluate( DataLoader(self.corpus.train, batch_size=eval_mini_batch_size, num_workers=num_workers), embeddings_storage_mode=embeddings_storage_mode) result_line += ''.join( ['\t', '{}'.format(train_eval_result.log_line)]) store_embeddings(self.corpus.train, embeddings_storage_mode) if log_dev: (dev_eval_result, dev_loss) = self.model.evaluate( DataLoader(self.corpus.dev, batch_size=eval_mini_batch_size, num_workers=num_workers), embeddings_storage_mode=embeddings_storage_mode) result_line += ''.join([ '\t', '{}'.format(dev_loss), '\t', '{}'.format(dev_eval_result.log_line) ]) log.info(''.join([ 'DEV : loss ', '{}'.format(dev_loss), ' - score ', '{}'.format(dev_eval_result.main_score) ])) dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss) current_score = dev_eval_result.main_score store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar('dev_loss', dev_loss, (epoch + 1)) writer.add_scalar('dev_score', dev_eval_result.main_score, (epoch + 1)) if log_test: (test_eval_result, test_loss) = self.model.evaluate( DataLoader(self.corpus.test, batch_size=eval_mini_batch_size, num_workers=num_workers), (base_path / 'test.tsv'), embeddings_storage_mode=embeddings_storage_mode) result_line += ''.join([ '\t', '{}'.format(test_loss), '\t', '{}'.format(test_eval_result.log_line) ]) log.info(''.join([ 'TEST : loss ', '{}'.format(test_loss), ' - score ', '{}'.format(test_eval_result.main_score) ])) store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar('test_loss', test_loss, (epoch + 1)) writer.add_scalar('test_score', test_eval_result.main_score, (epoch + 1)) scheduler.step(current_score) train_loss_history.append(train_loss) try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group['lr'] if (new_learning_rate != previous_learning_rate): bad_epochs = (patience + 1) log.info(''.join( ['BAD EPOCHS (no improvement): ', '{}'.format(bad_epochs)])) with open(loss_txt, 'a') as f: if (epoch == 0): f.write( 'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS' ) if log_train: f.write(('\tTRAIN_' + '\tTRAIN_'.join( train_eval_result.log_header.split('\t')))) if log_dev: f.write(('\tDEV_LOSS\tDEV_' + '\tDEV_'.join( dev_eval_result.log_header.split('\t')))) if log_test: f.write(('\tTEST_LOSS\tTEST_' + '\tTEST_'.join( test_eval_result.log_header.split('\t')))) f.write(''.join([ '\n', '{}'.format(epoch), '\t', '{:%H:%M:%S}'.format(datetime.datetime.now()), '\t', '{}'.format(bad_epochs), '\t', '{:.4f}'.format(learning_rate), '\t', '{}'.format(train_loss) ])) 
f.write(result_line) if (checkpoint and (not param_selection_mode)): self.model.save_checkpoint((base_path / 'checkpoint.pt'), optimizer.state_dict(), scheduler.state_dict(), (epoch + 1), train_loss) if ((not train_with_dev) and (not param_selection_mode) and (current_score == scheduler.best)): self.model.save((base_path / 'best-model.pt')) if (save_final_model and (not param_selection_mode)): self.model.save((base_path / 'final-model.pt')) except KeyboardInterrupt: log_line(log) log.info('Exiting from training early.') if self.use_tensorboard: writer.close() if (not param_selection_mode): log.info('Saving model ...') self.model.save((base_path / 'final-model.pt')) log.info('Done.') if self.corpus.test: final_score = self.final_test(base_path, eval_mini_batch_size, num_workers) else: final_score = 0 log.info('Test data not provided setting final score to 0') log.removeHandler(log_handler) if self.use_tensorboard: writer.close() return { 'test_score': final_score, 'dev_score_history': dev_score_history, 'train_loss_history': train_loss_history, 'dev_loss_history': dev_loss_history, }
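# Minimal sketch of the annealing policy used in the train() method above:
# ReduceLROnPlateau in "max" mode tracks the dev score and multiplies the learning
# rate by `factor` once no improvement is seen for more than `patience` epochs
# (the dummy model and score values below are illustrative only):
#
#     import torch
#     from torch.optim.lr_scheduler import ReduceLROnPlateau
#
#     model = torch.nn.Linear(4, 2)
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
#     train_with_dev = False
#     anneal_mode = "min" if train_with_dev else "max"   # maximise dev score
#     scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3, mode=anneal_mode)
#     for dev_score in [0.80, 0.81, 0.81, 0.81, 0.81, 0.81]:
#         scheduler.step(dev_score)
#     print(optimizer.param_groups[0]["lr"])             # 0.05 after the plateau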
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, mini_batch_chunk_size: Optional[int] = None, max_epochs: int = 100, train_with_dev: bool = False, train_with_test: bool = False, monitor_train: bool = False, monitor_test: bool = False, main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'), scheduler=AnnealOnPlateau, anneal_factor: float = 0.5, patience: int = 3, min_learning_rate: float = 0.0001, initial_extra_patience: int = 0, optimizer: torch.optim.Optimizer = SGD, cycle_momentum: bool = False, warmup_fraction: float = 0.1, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, anneal_with_prestarts: bool = False, anneal_against_dev_loss: bool = False, batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, write_weights: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", eval_on_train_fraction: float = 0.0, eval_on_train_shuffle: bool = False, save_model_each_k_epochs: int = 0, tensorboard_comment: str = '', use_swa: bool = False, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, create_file_logs: bool = True, create_loss_file: bool = True, epoch: int = 0, use_tensorboard: bool = False, tensorboard_log_dir=None, metrics_for_tensorboard=[], optimizer_state_dict: Optional = None, scheduler_state_dict: Optional = None, save_optimizer_state: bool = False, **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved :param learning_rate: Initial learning rate (or max, if scheduler is OneCycleLR) :param mini_batch_size: Size of mini-batches during training :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. :param scheduler: The learning rate scheduler to use :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param cycle_momentum: If scheduler is OneCycleLR, whether the scheduler should cycle also the momentum :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param warmup_fraction: Fraction of warmup steps if the scheduler is LinearSchedulerWithWarmup :param train_with_dev: If True, the data from dev split is added to the training data :param train_with_test: If True, the data from test split is added to the training data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training :param param_selection_mode: If True, testing is performed against dev data. 
Use this mode when doing parameter selection. :param num_workers: Number of workers in your data loader. :param sampler: You can pass a data sampler here for special sampling of data. :param eval_on_train_fraction: the fraction of train data to do the evaluation on, if 0. the evaluation is not performed on fraction of training data, if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will be saved each 5 epochs. Default is 0 which means no model saving. :param main_evaluation_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model :param tensorboard_comment: Comment to use for tensorboard logging :param create_file_logs: If True, the logs will also be stored in a file 'training.log' in the model folder :param create_loss_file: If True, the loss will be writen to a file 'loss.tsv' in the model folder :param optimizer: The optimizer to use (typically SGD or Adam) :param epoch: The starting epoch (normally 0 but could be higher if you continue training model) :param use_tensorboard: If True, writes out tensorboard information :param tensorboard_log_dir: Directory into which tensorboard log files will be written :param metrics_for_tensorboard: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example :param kwargs: Other arguments for the Optimizer :return: """ # create a model card for this model with Flair and PyTorch version model_card = {'flair_version': flair.__version__, 'pytorch_version': torch.__version__} # also record Transformers version if library is loaded try: import transformers model_card['transformers_version'] = transformers.__version__ except: pass # remember all parameters used in train() call local_variables = locals() training_parameters = {} for parameter in signature(self.train).parameters: training_parameters[parameter] = local_variables[parameter] model_card['training_parameters'] = training_parameters # add model card to model self.model.model_card = model_card if use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter if tensorboard_log_dir is not None and not os.path.exists(tensorboard_log_dir): os.mkdir(tensorboard_log_dir) writer = SummaryWriter(log_dir=tensorboard_log_dir, comment=tensorboard_comment) log.info(f"tensorboard logging path is {tensorboard_log_dir}") except: log_line(log) log.warning("ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!") log_line(log) use_tensorboard = False pass if use_amp: if sys.version_info < (3, 0): raise RuntimeError("Apex currently only supports Python 3. Aborting.") if amp is None: raise RuntimeError( "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training." 
) if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size if learning_rate < min_learning_rate: min_learning_rate = learning_rate / 10 initial_learning_rate = learning_rate # cast string to Path if type(base_path) is str: base_path = Path(base_path) base_path.mkdir(exist_ok=True, parents=True) if create_file_logs: log_handler = add_file_handler(log, base_path / "training.log") else: log_handler = None log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") if isinstance(self.model, SequenceTagger) and self.model.weight_dict and self.model.use_crf: log_line(log) log.warning(f'WARNING: Specified class weights will not take effect when using CRF') # check for previously saved best models in the current training folder and delete them self.check_for_and_delete_previous_best_models(base_path) # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = True if (not param_selection_mode and self.corpus.test and monitor_test) else False log_dev = False if train_with_dev or not self.corpus.dev else True log_train_part = True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False if log_train_part: train_part_size = len(self.corpus.dev) if eval_on_train_fraction == "dev" \ else int(len(self.corpus.train) * eval_on_train_fraction) assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") if create_loss_file else None weight_extractor = WeightExtractor(base_path) # if optimizer class is passed, instantiate: if inspect.isclass(optimizer): optimizer: torch.optim.Optimizer = optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if use_swa: import torchcontrib optimizer = torchcontrib.optim.SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=learning_rate) if use_amp: self.model, optimizer = amp.initialize( self.model, optimizer, opt_level=amp_opt_level ) # load existing optimizer state dictionary if it exists if optimizer_state_dict: optimizer.load_state_dict(optimizer_state_dict) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max" best_validation_score = 100000000000 if train_with_dev or anneal_against_dev_loss else 0. 
dataset_size = len(self.corpus.train) if train_with_dev: dataset_size += len(self.corpus.dev) # if scheduler is passed as a class, instantiate if inspect.isclass(scheduler): if scheduler == OneCycleLR: scheduler = OneCycleLR(optimizer, max_lr=learning_rate, steps_per_epoch=dataset_size // mini_batch_size + 1, epochs=max_epochs - epoch, # if we load a checkpoint, we have already trained for epoch pct_start=0.0, cycle_momentum=cycle_momentum) elif scheduler == LinearSchedulerWithWarmup: steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size num_train_steps = int(steps_per_epoch * max_epochs) num_warmup_steps = int(num_train_steps * warmup_fraction) scheduler = LinearSchedulerWithWarmup(optimizer, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps) else: scheduler = scheduler( optimizer, factor=anneal_factor, patience=patience, initial_extra_patience=initial_extra_patience, mode=anneal_mode, verbose=True, ) # load existing scheduler state dictionary if it exists if scheduler_state_dict: scheduler.load_state_dict(scheduler_state_dict) # update optimizer and scheduler in model card model_card['training_parameters']['optimizer'] = optimizer model_card['training_parameters']['scheduler'] = scheduler if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: raise ValueError("Batch growth with OneCycle policy is not implemented.") train_data = self.corpus.train # if training also uses dev/train data, include in training set if train_with_dev or train_with_test: parts = [self.corpus.train] if train_with_dev: parts.append(self.corpus.dev) if train_with_test: parts.append(self.corpus.test) train_data = ConcatDataset(parts) # initialize sampler if provided if sampler is not None: # init with default values if only class is provided if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from sampler.set_dataset(train_data) shuffle = False dev_score_history = [] dev_loss_history = [] train_loss_history = [] micro_batch_size = mini_batch_chunk_size # At any point you can hit Ctrl + C to break out of training early. 
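# Illustration of the step arithmetic in the LinearSchedulerWithWarmup branch
# above, with arbitrary example numbers (dataset_size=1000, mini_batch_size=32,
# max_epochs=10, warmup_fraction=0.1):
#
#     steps_per_epoch  = (1000 + 32 - 1) / 32        # 32.21875 (ceil-style division)
#     num_train_steps  = int(steps_per_epoch * 10)   # 322
#     num_warmup_steps = int(num_train_steps * 0.1)  # 32 warmup steps; the rest decay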
try: previous_learning_rate = learning_rate momentum = 0 for group in optimizer.param_groups: if "momentum" in group: momentum = group["momentum"] for epoch in range(epoch + 1, max_epochs + 1): log_line(log) # update epoch in model card self.model.model_card['training_parameters']['epoch'] = epoch if anneal_with_prestarts: last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) if eval_on_train_shuffle: train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if learning_rate != previous_learning_rate and batch_growth_annealing: mini_batch_size *= 2 # reload last best model if annealing with restarts is enabled if ( (anneal_with_restarts or anneal_with_prestarts) and learning_rate != previous_learning_rate and os.path.exists(base_path / "best-model.pt") ): if anneal_with_restarts: log.info("resetting to best model") self.model.load_state_dict( self.model.load(base_path / "best-model.pt").state_dict() ) if anneal_with_prestarts: log.info("resetting to pre-best model") self.model.load_state_dict( self.model.load(base_path / "pre-best-model.pt").state_dict() ) previous_learning_rate = learning_rate if use_tensorboard: writer.add_scalar("learning_rate", learning_rate, epoch) # stop training if learning rate becomes too small if ((not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and learning_rate < min_learning_rate)): log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle if epoch > 1 else False, # never shuffle the first epoch num_workers=num_workers, sampler=sampler, ) self.model.train() train_loss: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches batch_time = 0 average_over = 0 for batch_no, batch in enumerate(batch_loader): start_time = time.time() # zero the gradients on the model and optimizer self.model.zero_grad() optimizer.zero_grad() # if necessary, make batch_steps batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [batch[x: x + micro_batch_size] for x in range(0, len(batch), micro_batch_size)] # forward and backward for batch for batch_step in batch_steps: # forward pass loss = self.model.forward_loss(batch_step) if isinstance(loss, Tuple): average_over += loss[1] loss = loss[0] # Backward if use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() # do the scheduler step if one-cycle or linear decay if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): scheduler.step() # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if "momentum" in group: momentum = group["momentum"] if "betas" in group: momentum, _ = group["betas"] seen_batches += 1 # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode) batch_time += time.time() - start_time if seen_batches % modulo == 0: momentum_info = f' - momentum: {momentum:.4f}' if cycle_momentum else '' intermittent_loss = train_loss / 
average_over if average_over > 0 else train_loss / seen_batches log.info( f"epoch {epoch} - iter {seen_batches}/{total_number_of_batches} - loss " f"{intermittent_loss:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}" f" - lr: {learning_rate:.6f}{momentum_info}" ) batch_time = 0 iteration = epoch * total_number_of_batches + batch_no if not param_selection_mode and write_weights: weight_extractor.extract_weights(self.model.state_dict(), iteration) if average_over != 0: train_loss /= average_over self.model.eval() log_line(log) log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}") if use_tensorboard: writer.add_scalar("train_loss", train_loss, epoch) # evaluate on train / dev / test split depending on training settings result_line: str = "" if log_train: train_eval_result = self.model.evaluate( self.corpus.train, gold_label_type=self.model.label_type, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, main_evaluation_metric=main_evaluation_metric, gold_label_dictionary=gold_label_dictionary_for_eval, ) result_line += f"\t{train_eval_result.log_line}" # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode) if log_train_part: train_part_eval_result = self.model.evaluate( train_part, gold_label_type=self.model.label_type, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, main_evaluation_metric=main_evaluation_metric, gold_label_dictionary=gold_label_dictionary_for_eval, ) result_line += f"\t{train_part_eval_result.loss}\t{train_part_eval_result.log_line}" log.info( f"TRAIN_SPLIT : loss {train_part_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]}) {round(train_part_eval_result.main_score, 4)}" ) if use_tensorboard: for (metric_class_avg_type, metric_type) in metrics_for_tensorboard: writer.add_scalar( f"train_{metric_class_avg_type}_{metric_type}", train_part_eval_result.classification_report[metric_class_avg_type][metric_type], epoch ) if log_dev: dev_eval_result = self.model.evaluate( self.corpus.dev, gold_label_type=self.model.label_type, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, out_path=base_path / "dev.tsv", embedding_storage_mode=embeddings_storage_mode, main_evaluation_metric=main_evaluation_metric, gold_label_dictionary=gold_label_dictionary_for_eval, ) result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}" log.info( f"DEV : loss {dev_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]}) {round(dev_eval_result.main_score, 4)}" ) # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_eval_result.loss) dev_score = dev_eval_result.main_score # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if use_tensorboard: writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) for (metric_class_avg_type, metric_type) in metrics_for_tensorboard: writer.add_scalar( f"dev_{metric_class_avg_type}_{metric_type}", dev_eval_result.classification_report[metric_class_avg_type][metric_type], epoch ) if log_test: test_eval_result = self.model.evaluate( self.corpus.test, gold_label_type=self.model.label_type, 
mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, out_path=base_path / "test.tsv", embedding_storage_mode=embeddings_storage_mode, main_evaluation_metric=main_evaluation_metric, gold_label_dictionary=gold_label_dictionary_for_eval, ) result_line += f"\t{test_eval_result.loss}\t{test_eval_result.log_line}" log.info( f"TEST : loss {test_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]}) {round(test_eval_result.main_score, 4)}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if use_tensorboard: writer.add_scalar("test_loss", test_eval_result.loss, epoch) writer.add_scalar("test_score", test_eval_result.main_score, epoch) for (metric_class_avg_type, metric_type) in metrics_for_tensorboard: writer.add_scalar( f"test_{metric_class_avg_type}_{metric_type}", test_eval_result.classification_report[metric_class_avg_type][metric_type], epoch ) # determine if this is the best model or if we need to anneal current_epoch_has_best_model_so_far = False # default mode: anneal against dev score if not train_with_dev and not anneal_against_dev_loss: if dev_score > best_validation_score: current_epoch_has_best_model_so_far = True best_validation_score = dev_score if isinstance(scheduler, AnnealOnPlateau): scheduler.step(dev_score, dev_eval_result.loss) # alternative: anneal against dev loss if not train_with_dev and anneal_against_dev_loss: if dev_eval_result.loss < best_validation_score: current_epoch_has_best_model_so_far = True best_validation_score = dev_eval_result.loss if isinstance(scheduler, AnnealOnPlateau): scheduler.step(dev_eval_result.loss) # alternative: anneal against train loss if train_with_dev: if train_loss < best_validation_score: current_epoch_has_best_model_so_far = True best_validation_score = train_loss if isinstance(scheduler, AnnealOnPlateau): scheduler.step(train_loss) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group["lr"] if new_learning_rate != previous_learning_rate: bad_epochs = patience + 1 if previous_learning_rate == initial_learning_rate: bad_epochs += initial_extra_patience # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") if create_loss_file: # output log file with open(loss_txt, "a") as f: # make headers on first epoch if epoch == 1: f.write(f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS") if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) if log_train_part: f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" + "\tTRAIN_PART_".join( train_part_eval_result.log_header.split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t"))) f.write( f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if checkpoint is enabled, save model at each epoch if checkpoint and not param_selection_mode: self.model.save(base_path / "checkpoint.pt", checkpoint=True) # Check whether to save best model if ( (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) and not param_selection_mode and current_epoch_has_best_model_so_far and not use_final_model_for_eval ): log.info("saving best model") self.model.save(base_path / 
"best-model.pt", checkpoint=save_optimizer_state) if anneal_with_prestarts: current_state_dict = self.model.state_dict() self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) if save_model_each_k_epochs > 0 and not epoch % save_model_each_k_epochs: print("saving model of current epoch") model_name = "model_epoch_" + str(epoch) + ".pt" self.model.save(base_path / model_name, checkpoint=save_optimizer_state) if use_swa: optimizer.swap_swa_sgd() # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if use_tensorboard: writer.close() if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) log.info("Done.") # test best model if test data is present if self.corpus.test and not train_with_test: final_score = self.final_test( base_path=base_path, eval_mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, main_evaluation_metric=main_evaluation_metric, gold_label_dictionary_for_eval=gold_label_dictionary_for_eval, ) else: final_score = 0 log.info("Test data not provided setting final score to 0") if create_file_logs: log_handler.close() log.removeHandler(log_handler) if use_tensorboard: writer.close() return { "test_score": final_score, "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, mini_batch_chunk_size: int = None, max_epochs: int = 100, scheduler=AnnealOnPlateau, cycle_momentum: bool = False, anneal_factor: float = 0.5, patience: int = 3, initial_extra_patience=0, min_learning_rate: float = 0.0001, train_with_dev: bool = False, train_with_test: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, anneal_with_prestarts: bool = False, batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, write_weights: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", eval_on_train_fraction=0.0, eval_on_train_shuffle=False, save_model_at_each_epoch=False, **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved :param learning_rate: Initial learning rate (or max, if scheduler is OneCycleLR) :param mini_batch_size: Size of mini-batches during training :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. :param scheduler: The learning rate scheduler to use :param cycle_momentum: If scheduler is OneCycleLR, whether the scheduler should cycle also the momentum :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param train_with_dev: If True, training is performed using both train+dev data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing parameter selection. :param num_workers: Number of workers in your data loader. :param sampler: You can pass a data sampler here for special sampling of data. :param eval_on_train_fraction: the fraction of train data to do the evaluation on, if 0. 
the evaluation is not performed on fraction of training data, if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved :param kwargs: Other arguments for the Optimizer :return: """ if self.use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" ) log_line(log) self.use_tensorboard = False pass if use_amp: if sys.version_info < (3, 0): raise RuntimeError( "Apex currently only supports Python 3. Aborting.") if amp is None: raise RuntimeError( "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training.") if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size if learning_rate < min_learning_rate: min_learning_rate = learning_rate / 10 initial_learning_rate = learning_rate # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_handler = add_file_handler(log, base_path / "training.log") log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") if isinstance(self.model, SequenceTagger ) and self.model.weight_dict and self.model.use_crf: log_line(log) log.warning( f'WARNING: Specified class weights will not take effect when using CRF' ) # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = (True if (not param_selection_mode and self.corpus.test and monitor_test) else False) log_dev = False if train_with_dev or not self.corpus.dev else True log_train_part = (True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False) if log_train_part: train_part_size = (len( self.corpus.dev) if eval_on_train_fraction == "dev" else int( len(self.corpus.train) * eval_on_train_fraction)) assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") weight_extractor = WeightExtractor(base_path) optimizer: torch.optim.Optimizer = self.optimizer( self.model.parameters(), lr=learning_rate, **kwargs) if use_amp: self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" if scheduler == OneCycleLR: dataset_size = len(self.corpus.train) if train_with_dev: dataset_size += 
len(self.corpus.dev) lr_scheduler = OneCycleLR( optimizer, max_lr=learning_rate, steps_per_epoch=dataset_size // mini_batch_size + 1, epochs=max_epochs - self. epoch, # if we load a checkpoint, we have already trained for self.epoch pct_start=0.0, cycle_momentum=cycle_momentum) else: lr_scheduler = scheduler( optimizer, factor=anneal_factor, patience=patience, initial_extra_patience=initial_extra_patience, mode=anneal_mode, verbose=True, ) if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing): raise ValueError( "Batch growth with OneCycle policy is not implemented.") train_data = self.corpus.train # if training also uses dev/train data, include in training set if train_with_dev or train_with_test: parts = [self.corpus.train] if train_with_dev: parts.append(self.corpus.dev) if train_with_test: parts.append(self.corpus.test) train_data = ConcatDataset(parts) # initialize sampler if provided if sampler is not None: # init with default values if only class is provided if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from sampler.set_dataset(train_data) shuffle = False dev_score_history = [] dev_loss_history = [] train_loss_history = [] micro_batch_size = mini_batch_chunk_size # At any point you can hit Ctrl + C to break out of training early. try: previous_learning_rate = learning_rate momentum = 0 for group in optimizer.param_groups: if "momentum" in group: momentum = group["momentum"] for self.epoch in range(self.epoch + 1, max_epochs + 1): log_line(log) if anneal_with_prestarts: last_epoch_model_state_dict = copy.deepcopy( self.model.state_dict()) if eval_on_train_shuffle: train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if learning_rate != previous_learning_rate and batch_growth_annealing: mini_batch_size *= 2 # reload last best model if annealing with restarts is enabled if ((anneal_with_restarts or anneal_with_prestarts) and learning_rate != previous_learning_rate and (base_path / "best-model.pt").exists()): if anneal_with_restarts: log.info("resetting to best model") self.model.load_state_dict( self.model.load(base_path / "best-model.pt").state_dict()) if anneal_with_prestarts: log.info("resetting to pre-best model") self.model.load_state_dict( self.model.load(base_path / "pre-best-model.pt").state_dict()) previous_learning_rate = learning_rate # stop training if learning rate becomes too small if (not isinstance(lr_scheduler, OneCycleLR) ) and learning_rate < min_learning_rate: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) self.model.train() train_loss: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches batch_time = 0 for batch_no, batch in enumerate(batch_loader): start_time = time.time() # zero the gradients on the model and optimizer self.model.zero_grad() optimizer.zero_grad() # if necessary, make batch_steps batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [ batch[x:x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] # forward and backward for batch for batch_step in 
batch_steps: # forward pass loss = self.model.forward_loss(batch_step) # Backward if use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() # do the scheduler step if one-cycle if isinstance(lr_scheduler, OneCycleLR): lr_scheduler.step() # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if "momentum" in group: momentum = group["momentum"] seen_batches += 1 train_loss += loss.item() # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode) batch_time += time.time() - start_time if seen_batches % modulo == 0: momentum_info = f' - momentum: {momentum:.4f}' if cycle_momentum else '' log.info( f"epoch {self.epoch} - iter {seen_batches}/{total_number_of_batches} - loss " f"{train_loss / seen_batches:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}" f" - lr: {learning_rate:.6f}{momentum_info}") batch_time = 0 iteration = self.epoch * total_number_of_batches + batch_no if not param_selection_mode and write_weights: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {self.epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}" ) if self.use_tensorboard: writer.add_scalar("train_loss", train_loss, self.epoch) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss # evaluate on train / dev / test split depending on training settings result_line: str = "" if log_train: train_eval_result, train_loss = self.model.evaluate( self.corpus.train, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{train_eval_result.log_line}" # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode) if log_train_part: train_part_eval_result, train_part_loss = self.model.evaluate( train_part, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, ) result_line += ( f"\t{train_part_loss}\t{train_part_eval_result.log_line}" ) log.info( f"TRAIN_SPLIT : loss {train_part_loss} - score {round(train_part_eval_result.main_score, 4)}" ) if log_dev: dev_eval_result, dev_loss = self.model.evaluate( self.corpus.dev, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, out_path=base_path / "dev.tsv", embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}" log.info( f"DEV : loss {dev_loss} - score {round(dev_eval_result.main_score, 4)}" ) # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss.item()) current_score = dev_eval_result.main_score # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("dev_loss", dev_loss, self.epoch) writer.add_scalar("dev_score", dev_eval_result.main_score, self.epoch) if log_test: test_eval_result, test_loss = self.model.evaluate( self.corpus.test, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, out_path=base_path / "test.tsv", 
embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{test_loss}\t{test_eval_result.log_line}" log.info( f"TEST : loss {test_loss} - score {round(test_eval_result.main_score, 4)}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("test_loss", test_loss, self.epoch) writer.add_scalar("test_score", test_eval_result.main_score, self.epoch) # determine learning rate annealing through scheduler. Use auxiliary metric for AnnealOnPlateau if log_dev and isinstance(lr_scheduler, AnnealOnPlateau): lr_scheduler.step(current_score, dev_loss) elif not isinstance(lr_scheduler, OneCycleLR): lr_scheduler.step(current_score) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = lr_scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group["lr"] if new_learning_rate != previous_learning_rate: bad_epochs = patience + 1 if previous_learning_rate == initial_learning_rate: bad_epochs += initial_extra_patience # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") # output log file with open(loss_txt, "a") as f: # make headers on first epoch if self.epoch == 1: f.write( f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS" ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join( train_eval_result.log_header.split("\t"))) if log_train_part: f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" + "\tTRAIN_PART_".join( train_part_eval_result.log_header. split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join( dev_eval_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join( test_eval_result.log_header.split("\t"))) f.write( f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if checkpoint is enabled, save model at each epoch if checkpoint and not param_selection_mode: self.save_checkpoint(base_path / "checkpoint.pt") # if we use dev data, remember best model based on dev evaluation score if ((not train_with_dev or anneal_with_restarts or anneal_with_prestarts) and not param_selection_mode and not isinstance(lr_scheduler, OneCycleLR) and current_score == lr_scheduler.best and bad_epochs == 0): print("saving best model") self.model.save(base_path / "best-model.pt") if anneal_with_prestarts: current_state_dict = self.model.state_dict() self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) if save_model_at_each_epoch: print("saving model of current epoch") model_name = "model_epoch_" + str(self.epoch) + ".pt" self.model.save(base_path / model_name) # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if self.use_tensorboard: writer.close() if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present if self.corpus.test and not train_with_test: final_score = self.final_test(base_path, mini_batch_chunk_size, num_workers) else: final_score = 0 log.info("Test data not provided setting final score to 0") log.removeHandler(log_handler) if self.use_tensorboard: writer.close() return { 
"test_score": final_score, "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
def find_learning_rate(
    self,
    base_path: Union[Path, str],
    optimizer,
    mini_batch_size: int = 32,
    start_learning_rate: float = 1e-7,
    end_learning_rate: float = 10,
    iterations: int = 1000,
    stop_early: bool = True,
    file_name: str = "learning_rate.tsv",
    **kwargs,
) -> Path:
    import statistics

    best_loss = None

    # cast string to Path
    if type(base_path) is str:
        base_path = Path(base_path)
    base_path.mkdir(exist_ok=True, parents=True)
    learning_rate_tsv = init_output_file(base_path, file_name)

    # header must match the five columns written per row below
    with open(learning_rate_tsv, "a") as f:
        f.write("ITERATION\tLEARNING_RATE\tTRAIN_LOSS\tAVG_LOSS\tLOSS_DROP\n")

    optimizer = optimizer(self.model.parameters(), lr=start_learning_rate, **kwargs)

    train_data = self.corpus.train

    scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

    model_state = self.model.state_dict()
    self.model.train()

    step = 0

    loss_list = []
    average_loss_list = []

    while step < iterations:
        batch_loader = DataLoader(train_data, batch_size=mini_batch_size, shuffle=True)
        for batch in batch_loader:
            step += 1

            # forward pass
            loss = self.model.forward_loss(batch)
            if isinstance(loss, Tuple):
                loss = loss[0]

            # update optimizer and scheduler
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()

            learning_rate = scheduler.get_lr()[0]

            # append current loss to list of losses for all iterations
            loss_list.append(loss.item())

            # compute averaged loss
            moving_avg_loss = statistics.mean(loss_list)
            average_loss_list.append(moving_avg_loss)

            if len(average_loss_list) > 10:
                drop = average_loss_list[-10] - moving_avg_loss
            else:
                drop = 0.

            if best_loss is None or moving_avg_loss < best_loss:
                best_loss = moving_avg_loss

            if step > iterations:
                break

            if stop_early and (moving_avg_loss > 4 * best_loss or torch.isnan(loss)):
                log_line(log)
                log.info("loss diverged - stopping early!")
                step = iterations
                break

            with open(str(learning_rate_tsv), "a") as f:
                f.write(f"{step}\t{learning_rate}\t{loss.item()}\t{moving_avg_loss}\t{drop}\n")

    self.model.load_state_dict(model_state)
    self.model.to(flair.device)

    log_line(log)
    log.info(f"learning rate finder finished - plot {learning_rate_tsv}")
    log_line(log)

    return Path(learning_rate_tsv)
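# ExpAnnealLR is Flair-specific; this hedged sketch rebuilds the same exponential
# learning-rate sweep with LambdaLR from stock PyTorch. Model, data and loss are
# placeholders; only the sweep mechanics mirror find_learning_rate above.
def _sketch_learning_rate_sweep():
    import torch
    from torch.optim.lr_scheduler import LambdaLR

    model = torch.nn.Linear(10, 2)
    loss_fn = torch.nn.CrossEntropyLoss()

    start_lr, end_lr, iterations = 1e-7, 10.0, 100
    optimizer = torch.optim.SGD(model.parameters(), lr=start_lr)

    # multiply the learning rate by a constant factor each step so it reaches end_lr
    # exactly after `iterations` steps
    gamma = (end_lr / start_lr) ** (1.0 / iterations)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda step: gamma ** step)

    records = []
    for step in range(1, iterations + 1):
        x = torch.randn(32, 10)
        y = torch.randint(0, 2, (32,))
        loss = loss_fn(model(x), y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        records.append((step, optimizer.param_groups[0]["lr"], loss.item()))

    # inspect the (lr, loss) curve and pick a rate a bit below where the loss diverges
    for step, lr, loss_value in records[:5]:
        print(f"{step}\t{lr:.2e}\t{loss_value:.4f}")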
def train( self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 4, train_with_dev: bool = False, embeddings_in_memory: bool = True, checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, ): evaluation_method = 'F1' if self.model.tag_type in ['pos', 'upos']: evaluation_method = 'accuracy' log.info('Evaluation method: {}'.format(evaluation_method)) loss_txt = init_output_file(base_path, 'loss.tsv') with open(loss_txt, 'a') as f: f.write( 'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n' .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'), Metric.tsv_header('TEST'))) weight_extractor = WeightExtractor(base_path) optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate) # annealing scheduler anneal_mode = 'min' if train_with_dev else 'max' scheduler = ReduceLROnPlateau(optimizer, factor=anneal_factor, patience=patience, mode=anneal_mode, verbose=True) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) # At any point you can hit Ctrl + C to break out of training early. try: previous_learning_rate = learning_rate for epoch in range(0, max_epochs): log.info('-' * 100) bad_epochs = scheduler.num_bad_epochs for group in optimizer.param_groups: learning_rate = group['lr'] # reload last best model if annealing with restarts is enabled if learning_rate != previous_learning_rate and anneal_with_restarts and \ os.path.exists(base_path + "/best-model.pt"): log.info('resetting to best model') self.model.load_from_file(base_path + "/best-model.pt") previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < 0.001: log.info('learning rate too small - quitting training!') break if not self.test_mode: random.shuffle(train_data) batches = [ train_data[x:x + mini_batch_size] for x in range(0, len(train_data), mini_batch_size) ] self.model.train() current_loss: float = 0 seen_sentences = 0 modulo = max(1, int(len(batches) / 10)) for batch_no, batch in enumerate(batches): batch: List[Sentence] = batch optimizer.zero_grad() # Step 4. 
Compute the loss, gradients, and update the parameters by calling optimizer.step() loss = self.model.neg_log_likelihood(batch) current_loss += loss.item() seen_sentences += len(batch) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() if not embeddings_in_memory: self.clear_embeddings_in_batch(batch) if batch_no % modulo == 0: log.info( "epoch {0} - iter {1}/{2} - loss {3:.8f}".format( epoch + 1, batch_no, len(batches), current_loss / seen_sentences)) iteration = epoch * len(batches) + batch_no weight_extractor.extract_weights( self.model.state_dict(), iteration) current_loss /= len(train_data) # switch to eval mode self.model.eval() # if checkpointing is enable, save model at each epoch if checkpoint: self.model.save(base_path + "/checkpoint.pt") log.info('-' * 100) dev_score = dev_metric = None if not train_with_dev: dev_score, dev_metric = self.evaluate( self.corpus.dev, base_path, evaluation_method=evaluation_method, embeddings_in_memory=embeddings_in_memory) test_score, test_metric = self.evaluate( self.corpus.test, base_path, evaluation_method=evaluation_method, embeddings_in_memory=embeddings_in_memory) # anneal against train loss if training with dev, otherwise anneal against dev score scheduler.step( current_loss) if train_with_dev else scheduler.step( dev_score) # logging info log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format( epoch + 1, learning_rate, bad_epochs)) if not train_with_dev: log.info( "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}" .format('DEV', dev_metric.f_score(), dev_metric.accuracy(), dev_metric._tp, dev_metric._fp, dev_metric._fn, dev_metric._tn)) log.info( "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}" .format('TEST', test_metric.f_score(), test_metric.accuracy(), test_metric._tp, test_metric._fp, test_metric._fn, test_metric._tn)) with open(loss_txt, 'a') as f: dev_metric_str = dev_metric.to_tsv( ) if dev_metric is not None else Metric.to_empty_tsv() f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( epoch, datetime.datetime.now(), '_', Metric.to_empty_tsv(), '_', dev_metric_str, '_', test_metric.to_tsv())) # if we use dev data, remember best model based on dev evaluation score if not train_with_dev and dev_score == scheduler.best: self.model.save(base_path + "/best-model.pt") # if we do not use dev data for model selection, save final model if save_final_model: self.model.save(base_path + "/final-model.pt") except KeyboardInterrupt: log.info('-' * 100) log.info('Exiting from training early.') log.info('Saving model ...') self.model.save(base_path + "/final-model.pt") log.info('Done.')
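# Hedged sketch of the "anneal with restarts" behaviour in the method above: when the
# plateau scheduler lowers the learning rate, the best checkpoint saved so far is
# reloaded and training continues from it. Paths, scores and the model are stand-ins.
def _sketch_anneal_with_restarts():
    from pathlib import Path
    import torch
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    base_path = Path("resources/taggers/example")    # hypothetical output directory
    base_path.mkdir(parents=True, exist_ok=True)

    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=1)

    previous_lr = optimizer.param_groups[0]["lr"]
    fake_dev_scores = [0.60, 0.65, 0.64, 0.63, 0.66, 0.65, 0.64]

    for dev_score in fake_dev_scores:
        scheduler.step(dev_score)
        lr = optimizer.param_groups[0]["lr"]

        # reload last best model if the scheduler has just annealed the learning rate
        if lr != previous_lr and (base_path / "best-model.pt").exists():
            print("resetting to best model")
            model.load_state_dict(torch.load(base_path / "best-model.pt"))
        previous_lr = lr

        # remember the best model based on the dev evaluation score, as above
        if dev_score == scheduler.best:
            torch.save(model.state_dict(), base_path / "best-model.pt")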
def prepare_data(
    self,
    base_path: Union[Path, str],
    learning_rate: float = 0.1,
    mini_batch_size: int = 32,
    eval_mini_batch_size: int = None,
    anneal_factor: float = 0.5,
    patience: int = 3,
    min_learning_rate: float = 0.0001,
    train_with_dev: bool = False,
    monitor_train: bool = False,
    monitor_test: bool = False,
    embedding_storage_mode: str = "cpu",
    checkpoint: bool = False,
    save_final_model: bool = True,
    anneal_with_restarts: bool = False,
    shuffle: bool = True,
    param_selection_mode: bool = False,
    num_workers: int = 6,
    **kwargs,
):
    """
    Prepares a training run for any class that implements the flair.nn.Model interface.
    :param base_path: Main path to which all output during training is logged and models are saved
    :param learning_rate: Initial learning rate
    :param mini_batch_size: Size of mini-batches during training
    :param eval_mini_batch_size: Size of mini-batches during evaluation
    :param anneal_factor: The factor by which the learning rate is annealed
    :param patience: Patience is the number of epochs with no improvement the Trainer waits
     until annealing the learning rate
    :param min_learning_rate: If the learning rate falls below this threshold, training terminates
    :param train_with_dev: If True, training is performed using both train+dev data
    :param monitor_train: If True, training data is evaluated at end of each epoch
    :param monitor_test: If True, test data is evaluated at end of each epoch
    :param embedding_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
     'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
    :param checkpoint: If True, a full checkpoint is saved at end of each epoch
    :param save_final_model: If True, final model is saved
    :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate
    :param shuffle: If True, data is shuffled during training
    :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing
     parameter selection.
    :param num_workers: Number of workers in your data loader.
    :param kwargs: Other arguments for the Optimizer
    """
    self.shuffle = shuffle
    self.embedding_storage_mode = embedding_storage_mode
    self.checkpoint = checkpoint
    self.save_final_model = save_final_model
    self.anneal_with_restarts = anneal_with_restarts
    self.num_workers = num_workers
    self.mini_batch_size = mini_batch_size
    if eval_mini_batch_size is None:
        self.eval_mini_batch_size = mini_batch_size
    else:
        self.eval_mini_batch_size = eval_mini_batch_size

    # cast string to Path
    self.base_path = Path(base_path) if type(base_path) is str else base_path

    self.log_handler = add_file_handler(log, self.base_path / "training.log")

    if self.display_name is not None:
        log_line(log)
        log.info(f'Model: {self.display_name}')
    log_line(log)
    log.info(f'Model: "{self.model}"')
    log_line(log)
    log.info(f'Corpus: "{self.corpus}"')
    log_line(log)
    log.info("Parameters:")
    log.info(f' - learning_rate: "{learning_rate}"')
    log.info(f' - mini_batch_size: "{mini_batch_size}"')
    log.info(f' - patience: "{patience}"')
    log.info(f' - anneal_factor: "{anneal_factor}"')
    log.info(f' - max_epochs: "{self.max_epochs}"')
    log.info(f' - shuffle: "{shuffle}"')
    log.info(f' - train_with_dev: "{train_with_dev}"')
    log_line(log)
    log.info(f'Model training base path: "{base_path}"')
    log_line(log)
    log.info(f"Device: {flair.device}")
    log_line(log)
    log.info(f"Embedding storage mode: {embedding_storage_mode}")

    # determine what splits (train, dev, test) to evaluate and log
    self.monitor_train = monitor_train
    self.monitor_test = monitor_test
    self.param_selection_mode = param_selection_mode
    self.train_with_dev = train_with_dev

    self.log_train = True if self.monitor_train else False
    self.log_test = (True if (not self.param_selection_mode and self.corpus.test
                              and self.monitor_test) else False)
    self.log_dev = True if not self.train_with_dev else False

    # prepare loss logging file and set up header
    self.loss_txt = init_output_file(self.base_path, "loss.tsv")

    self.weight_extractor = WeightExtractor(self.base_path)

    self.learning_rate = learning_rate
    self.min_learning_rate = min_learning_rate
    self.previous_learning_rate = learning_rate

    self.optimizer: torch.optim.Optimizer = self.optimizer_type(
        self.model.parameters(), lr=self.learning_rate, **kwargs)
    if self.optimizer_state is not None:
        self.optimizer.load_state_dict(self.optimizer_state)

    # minimize training loss if training with dev data, else maximize dev score
    self.anneal_mode = "min" if self.train_with_dev else "max"
    self.anneal_factor = anneal_factor
    self.patience = patience

    self.scheduler: ReduceLROnPlateau = ReduceLROnPlateau(
        self.optimizer,
        factor=self.anneal_factor,
        patience=self.patience,
        mode=self.anneal_mode,
        verbose=True,
    )
    if self.scheduler_state is not None:
        self.scheduler.load_state_dict(self.scheduler_state)

    self.train_data = self.corpus.train

    # if training also uses dev data, include in training set
    if self.train_with_dev:
        self.train_data = ConcatDataset([self.corpus.train, self.corpus.dev])

    self.dev_score_history = []
    self.dev_loss_history = []
    self.train_loss_history = []
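# Minimal sketch (standard-library logging only) of what add_file_handler and
# log.removeHandler accomplish around a training run: mirror the console log into
# base_path/training.log for the duration of training, then detach the handler.
# The logger name and output directory are assumptions for illustration.
def _sketch_training_log_file():
    import logging
    from pathlib import Path

    log = logging.getLogger("flair")                 # assumed logger name; any logger works
    log.setLevel(logging.INFO)

    base_path = Path("resources/taggers/example")    # hypothetical output directory
    base_path.mkdir(parents=True, exist_ok=True)

    file_handler = logging.FileHandler(base_path / "training.log", mode="w", encoding="utf-8")
    file_handler.setFormatter(logging.Formatter("%(asctime)-15s %(message)s"))
    log.addHandler(file_handler)

    try:
        log.info("Parameters:")
        log.info(' - learning_rate: "0.1"')          # ... training would happen here ...
    finally:
        # detach and close the handler so the log file is released after training
        file_handler.close()
        log.removeHandler(file_handler)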