def test_save_checkpoint_calls_torch_save(self, mock_open, mock_dill, mock_torch):
    """Check that ``Checkpoint.save`` writes the trainer state, model and vocabularies.

    The three mock arguments are injected by patch decorators that are
    outside this block. The test expects exactly two ``torch.save`` calls
    (trainer state and model), two file opens (input and output vocabulary
    files) and two ``dill.dump`` calls (one per vocabulary).
    """
    epoch = 5
    step = 10
    optim = mock.Mock()
    state_dict = {'epoch': epoch, 'step': step, 'optimizer': optim}

    mock_model = mock.Mock()
    mock_vocab = mock.Mock()
    # MagicMock supports __enter__/__exit__, which the final assertion
    # below relies on (the dump target is the context-manager value).
    mock_open.return_value = mock.MagicMock()

    chk_point = Checkpoint(model=mock_model, optimizer=optim,
                           epoch=epoch, step=step,
                           input_vocab=mock_vocab, output_vocab=mock_vocab)
    path = chk_point.save(self._get_experiment_dir())

    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(2, mock_torch.save.call_count)
    mock_torch.save.assert_any_call(
        state_dict,
        os.path.join(chk_point.path, Checkpoint.TRAINER_STATE_NAME))
    mock_torch.save.assert_any_call(
        mock_model,
        os.path.join(chk_point.path, Checkpoint.MODEL_NAME))

    self.assertEqual(2, mock_open.call_count)
    mock_open.assert_any_call(
        os.path.join(path, Checkpoint.INPUT_VOCAB_FILE), ANY)
    mock_open.assert_any_call(
        os.path.join(path, Checkpoint.OUTPUT_VOCAB_FILE), ANY)

    self.assertEqual(2, mock_dill.dump.call_count)
    mock_dill.dump.assert_any_call(
        mock_vocab, mock_open.return_value.__enter__.return_value)
def load_model_from_checkpoint(opt, src, tgt):
    """Restore a model and its vocabularies from a stored checkpoint.

    The checkpoint location is ``opt.output_dir`` joined with
    ``opt.load_checkpoint``. The loaded vocabularies are attached to the
    ``src`` and ``tgt`` fields, and the EOS/SOS ids of ``tgt`` are looked up
    again in the restored output vocabulary.

    Args:
        opt: options object providing ``output_dir`` and ``load_checkpoint``.
        src: source field; its ``vocab`` attribute is overwritten.
        tgt: target field; its ``vocab``, ``eos_id`` and ``sos_id``
            attributes are overwritten.

    Returns:
        tuple: ``(model, input_vocab, output_vocab)`` taken from the checkpoint.
    """
    ckpt_location = os.path.join(opt.output_dir, opt.load_checkpoint)
    logging.info("loading checkpoint from {}".format(ckpt_location))

    restored = Checkpoint.load(ckpt_location)
    model = restored.model

    source_vocab = restored.input_vocab
    src.vocab = source_vocab

    target_vocab = restored.output_vocab
    tgt.vocab = target_vocab

    # Special-symbol ids must match the restored vocabulary.
    tgt.eos_id = tgt.vocab.stoi[tgt.SYM_EOS]
    tgt.sos_id = tgt.vocab.stoi[tgt.SYM_SOS]

    return model, source_vocab, target_vocab
def test_load(self, mock_open, mock_dill, mock_torch):
    """Check that ``Checkpoint.load`` restores state, optimizer and vocabularies.

    The three mock arguments are injected by patch decorators that are
    outside this block. ``torch.load`` is stubbed to return the trainer
    state dict on its first call and a stand-in object on its second;
    ``dill.load`` always returns the same dummy vocabulary.
    """
    dummy_vocabulary = mock.Mock()
    mock_optimizer = mock.Mock()
    torch_dict = {"optimizer": mock_optimizer, "epoch": 5, "step": 10}

    mock_open.return_value = mock.MagicMock()
    # Consumed in call order: first value is the trainer state, second is
    # presumably the model (assumes load reads the state first; verify
    # against Checkpoint.load).
    mock_torch.load.side_effect = [torch_dict, mock.MagicMock()]
    mock_dill.load.return_value = dummy_vocabulary

    loaded_chk_point = Checkpoint.load("mock_checkpoint_path")

    mock_torch.load.assert_any_call(
        os.path.join('mock_checkpoint_path', Checkpoint.TRAINER_STATE_NAME))
    mock_torch.load.assert_any_call(
        os.path.join("mock_checkpoint_path", Checkpoint.MODEL_NAME))

    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(loaded_chk_point.epoch, torch_dict['epoch'])
    self.assertEqual(loaded_chk_point.optimizer, torch_dict['optimizer'])
    self.assertEqual(loaded_chk_point.step, torch_dict['step'])
    self.assertEqual(loaded_chk_point.input_vocab, dummy_vocabulary)
    self.assertEqual(loaded_chk_point.output_vocab, dummy_vocabulary)
def load_models_from_paths(paths: list, src, tgt):
    """Load every model named in ``paths`` and attach the vocabularies to the fields.

    Each path is loaded with ``Checkpoint.load`` and its model is collected.
    The vocabularies are taken from the last checkpoint loaded and assigned
    to ``src`` and ``tgt``; the EOS/SOS ids of ``tgt`` are looked up again
    in that output vocabulary.

    Args:
        paths: iterable of checkpoint paths; must yield at least one path.
        src: source field; its ``vocab`` attribute is overwritten.
        tgt: target field; its ``vocab``, ``eos_id`` and ``sos_id``
            attributes are overwritten.

    Returns:
        tuple: ``(models, input_vocab, output_vocab)`` where ``models`` is a
        list with one model per path, in input order.

    Raises:
        ValueError: if ``paths`` yields no checkpoint path.
    """
    models = []
    checkpoint = None

    for path in paths:
        checkpoint = Checkpoint.load(path)
        models.append(checkpoint.model)

    if checkpoint is None:
        # Without this guard an empty input fails later with an obscure
        # UnboundLocalError on `checkpoint`.
        raise ValueError("paths must contain at least one checkpoint path")

    # Build vocab once, from the last checkpoint that was loaded.
    input_vocab = checkpoint.input_vocab
    src.vocab = input_vocab
    output_vocab = checkpoint.output_vocab
    tgt.vocab = output_vocab
    tgt.eos_id = tgt.vocab.stoi[tgt.SYM_EOS]
    tgt.sos_id = tgt.vocab.stoi[tgt.SYM_SOS]

    return models, input_vocab, output_vocab
# Sanity-check the option combination before doing any work.
if opt.use_attention_loss and opt.attention_method == 'hard':
    # NOTE(review): the standard argparse.ArgumentParser offers `error` but
    # no `warning` method; if `parser` is a plain ArgumentParser this call
    # raises AttributeError. TODO confirm the type of `parser`.
    parser.warning(
        "Did you mean to use attention loss in combination with hard attention method?"
    )

# Select the requested CUDA device when CUDA is available.
if torch.cuda.is_available():
    logging.info("Cuda device set to %i" % opt.cuda_device)
    torch.cuda.set_device(opt.cuda_device)

#################################################################################
# load model

# os.path.join with a single argument returns that argument unchanged, so the
# message simply shows opt.checkpoint_path.
logging.info("loading checkpoint from {}".format(
    os.path.join(opt.checkpoint_path)))
checkpoint = Checkpoint.load(opt.checkpoint_path)
seq2seq = checkpoint.model
input_vocab = checkpoint.input_vocab
output_vocab = checkpoint.output_vocab

############################################################################
# Prepare dataset and loss
src = SourceField()
# `output_eos_used` is defined outside this fragment.
tgt = TargetField(output_eos_used)

tabular_data_fields = [('src', src), ('tgt', tgt)]

# An extra attention column is read when attention loss or hard attention
# is requested.
if opt.use_attention_loss or opt.attention_method == 'hard':
    attn = AttentionField(use_vocab=False, ignore_index=IGNORE_INDEX)
    tabular_data_fields.append(('attn', attn))
def test_path_error(self):
    """Reading ``path`` on a checkpoint that was built but not saved raises ``LookupError``."""
    unsaved = Checkpoint(None, None, None, None, None, None)
    with self.assertRaises(LookupError):
        unsaved.path
tgt_len = len(vars(m[0])['tgt']) - 1 # -1 for SOS attn_len = len(vars( m[0])['attn']) - 1 # -1 for preprended ignore_index if attn_len != tgt_len: raise Exception( "Length of output sequence does not equal length of attention sequence in monitor data." ) ################################################################################# # prepare model if opt.load_checkpoint is not None: logging.info("loading checkpoint from {}".format( os.path.join(opt.output_dir, opt.load_checkpoint))) checkpoint_path = os.path.join(opt.output_dir, opt.load_checkpoint) checkpoint = Checkpoint.load(checkpoint_path) seq2seq = checkpoint.model input_vocab = checkpoint.input_vocab src.vocab = input_vocab output_vocab = checkpoint.output_vocab tgt.vocab = output_vocab tgt.eos_id = tgt.vocab.stoi[tgt.SYM_EOS] tgt.sos_id = tgt.vocab.stoi[tgt.SYM_SOS] else: # build vocabulary src.build_vocab(train, max_size=opt.src_vocab) tgt.build_vocab(train, max_size=opt.tgt_vocab) input_vocab = src.vocab
def train(self, model, data, dev_data,
          num_epochs=5, resume_training=False, monitor_data=None,
          optimizer=None, teacher_forcing_ratio=0,
          custom_callbacks=None, learning_rate=0.001,
          checkpoint_path=None, top_k=5,
          losses=None, loss_weights=None, metrics=None,
          random_seed=None, checkpoint_every=100, print_every=100):
    """ Run training for a given model.

    Args:
        model (machine.models): model to run training on; if
            `resume_training=True` it is replaced by the model loaded
            from `checkpoint_path`.
        data (torchtext.data.Iterator): torchtext iterator object to train on
        dev_data (torchtext.data.Iterator): dev/validation set iterator
            Note: must not pass in the train iterator here as this gets
            evaluated during training (in between batches).
            If you want to evaluate on the full train set during training,
            make two iterators and pass the second one here.
        num_epochs (int, optional): number of epochs to run (default 5)
        resume_training (bool, optional): resume training from the
            checkpoint at `checkpoint_path` (default False)
        monitor_data (dict, optional): extra evaluation sets forwarded to
            `_train_epoches` (default: empty dict). The same note as for
            `dev_data` applies: do not pass the train iterator itself.
        optimizer (str, optional): name of the optimizer to build; one of
            'adam', 'adagrad', 'adadelta', 'adamax', 'rmsprop', 'sgd'
            (default None, which selects Adam). Ignored when resuming,
            where the checkpoint's optimizer is used instead.
        teacher_forcing_ratio (float, optional): teacher forcing ratio
            (default 0)
        custom_callbacks (list, optional): custom callbacks appended after
            the built-in Logger, ModelCheckpoint and History callbacks
            (default: none)
        learning_rate (float, optional): learning rate used when a new
            optimizer is built (default 0.001)
        checkpoint_path (str, optional): path to load the checkpoint from
            when training is resumed
        top_k (int, optional): passed to ModelCheckpoint; how many models
            should be stored during training (default 5)
        losses (list, optional): list of machine.loss.Loss objects for
            training (default: a new `[NLLLoss()]` per call)
        loss_weights (list, optional): forwarded to `set_local_parameters`
            together with `losses` (default None)
        metrics (list, optional): list of machine.metric.metric objects to
            be computed during evaluation (default: empty list)
        random_seed (int, optional): forwarded to `set_local_parameters`
            (default None)
        checkpoint_every (int, optional): checkpointing interval,
            forwarded to `set_local_parameters` (default: 100)
        print_every (int, optional): logging interval, forwarded to
            `set_local_parameters` (default: 100)

    Returns:
        tuple: `(model, logs)`, the trained model and the logs returned by
        `_train_epoches`.

    Raises:
        KeyError: if `optimizer` is not one of the supported names.
    """
    # Build fresh containers per call. Defaults in the signature would be
    # created once at definition time and shared by every call, which
    # matters in particular for the loss objects handed to the trainer.
    if monitor_data is None:
        monitor_data = {}
    if custom_callbacks is None:
        custom_callbacks = []
    if losses is None:
        losses = [NLLLoss()]
    if metrics is None:
        metrics = []

    self.set_local_parameters(random_seed, losses, metrics,
                              loss_weights, checkpoint_every, print_every)

    # If training is set to resume
    if resume_training:
        resume_checkpoint = Checkpoint.load(checkpoint_path)
        model = resume_checkpoint.model
        self.model = model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set optimizing parameters properly: rebuild the
        # inner optimizer of the same class around the loaded model's
        # parameters, reusing the settings of the first parameter group.
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            self.model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step

    else:
        start_epoch = 1
        step = 0
        self.model = model

        def get_optim(optim_name):
            optims = {'adam': optim.Adam, 'adagrad': optim.Adagrad,
                      'adadelta': optim.Adadelta, 'adamax': optim.Adamax,
                      'rmsprop': optim.RMSprop, 'sgd': optim.SGD,
                      None: optim.Adam}
            return optims[optim_name]

        self.optimizer = Optimizer(
            get_optim(optimizer)(self.model.parameters(), lr=learning_rate),
            max_grad_norm=5)

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    callbacks = CallbackContainer(
        self,
        [Logger(), ModelCheckpoint(top_k=top_k), History()]
        + list(custom_callbacks))

    logs = self._train_epoches(data, num_epochs,
                               start_epoch, step, dev_data=dev_data,
                               monitor_data=monitor_data,
                               callbacks=callbacks,
                               teacher_forcing_ratio=teacher_forcing_ratio)

    return self.model, logs
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step, dev_data=None, monitor_data=[], teacher_forcing_ratio=0, top_k=5):
    """Run the epoch/batch training loop with periodic logging and checkpointing.

    Before training, the model is evaluated once and saved, so at least one
    checkpoint exists. During training, progress is logged roughly every
    ``self.print_every`` steps and the model is evaluated every
    ``self.checkpoint_every`` steps (and at the final step); it is saved
    when its loss beats the worst of the ``top_k`` best losses kept so far.

    Args:
        data: training dataset; passed to ``BucketIterator`` and used for
            its ``fields`` (vocabularies) and for train-set evaluation.
        model: model to train.
        n_epochs (int): last epoch number to run (inclusive).
        start_epoch (int): epoch number to start from.
        start_step (int): global step to start from; batches already seen
            in the starting epoch are skipped.
        dev_data (optional): validation dataset; when falsy, ``data`` is
            used for checkpoint selection.
        monitor_data (optional): iterated by key and indexed by that key,
            so presumably a mapping of name -> dataset; the default ``[]``
            simply yields nothing. NOTE(review): mutable default; it is not
            mutated in this function.
        teacher_forcing_ratio (float, optional): forwarded to
            ``_train_batch``.
        top_k (int, optional): number of best checkpoints to keep.

    Returns:
        The ``Log`` object that collected the periodic evaluation results.
    """
    log = self.logger

    print_loss_total = defaultdict(float)  # Reset every print_every
    epoch_loss_total = defaultdict(float)  # Reset every epoch
    epoch_loss_avg = defaultdict(float)
    print_loss_avg = defaultdict(float)

    # -1 is passed as the device when CUDA is unavailable.
    iterator_device = torch.cuda.current_device() if torch.cuda.is_available() else -1
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=iterator_device, repeat=False)

    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0

    # store initial model to be sure at least one model is stored
    val_data = dev_data or data
    losses, metrics = self.evaluator.evaluate(model, val_data, self.get_batch_data)

    total_loss, log_msg, model_name = self.get_losses(
        losses, metrics, step)
    log.info(log_msg)

    logs = Log()
    # All top_k slots start at the initial loss; only slot 0 has a real
    # checkpoint on disk, the others are placeholders (None).
    loss_best = top_k * [total_loss]
    best_checkpoints = top_k * [None]
    best_checkpoints[0] = model_name

    Checkpoint(
        model=model, optimizer=self.optimizer,
        epoch=start_epoch, step=start_step,
        input_vocab=data.fields[machine.src_field_name].vocab,
        output_vocab=data.fields[machine.tgt_field_name].vocab).save(
            self.expt_dir, name=model_name)

    for epoch in range(start_epoch, n_epochs + 1):
        log.info("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()

        # consuming seen batches from previous training
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1

            input_variables, input_lengths, target_variables = self.get_batch_data(
                batch)

            losses = self._train_batch(input_variables, input_lengths.tolist(), target_variables, model, teacher_forcing_ratio)

            # Record average loss
            for loss in losses:
                name = loss.log_name
                print_loss_total[name] += loss.get_loss()
                epoch_loss_total[name] += loss.get_loss()

            # print log info according to print_every parm
            # The strict `>` means nothing is printed while step_elapsed has
            # not yet exceeded print_every, i.e. the first window is skipped.
            if step % self.print_every == 0 and step_elapsed > self.print_every:
                for loss in losses:
                    name = loss.log_name
                    print_loss_avg[
                        name] = print_loss_total[name] / self.print_every
                    print_loss_total[name] = 0

                m_logs = {}
                train_losses, train_metrics = self.evaluator.evaluate(
                    model, data, self.get_batch_data)
                train_loss, train_log_msg, model_name = self.get_losses(
                    train_losses, train_metrics, step)
                logs.write_to_log('Train', train_losses, train_metrics, step)
                logs.update_step(step)

                m_logs['Train'] = train_log_msg

                # compute vals for all monitored sets
                # NOTE(review): this rebinds `losses`, `metrics`,
                # `total_loss`, `log_msg` and `model_name`, which are also
                # used further down.
                for m_data in monitor_data:
                    losses, metrics = self.evaluator.evaluate(
                        model, monitor_data[m_data], self.get_batch_data)
                    total_loss, log_msg, model_name = self.get_losses(
                        losses, metrics, step)
                    m_logs[m_data] = log_msg
                    logs.write_to_log(m_data, losses, metrics, step)

                all_losses = ' '.join([
                    '%s:\t %s\n' % (os.path.basename(name), m_logs[name])
                    for name in m_logs
                ])

                log_msg = 'Progress %d%%, %s' % (step / total_steps * 100,
                                                 all_losses)

                log.info(log_msg)

            # check if new model should be saved
            if step % self.checkpoint_every == 0 or step == total_steps:
                # compute dev loss
                losses, metrics = self.evaluator.evaluate(
                    model, val_data, self.get_batch_data)
                total_loss, log_msg, model_name = self.get_losses(
                    losses, metrics, step)

                # Replace the worst of the kept checkpoints if this one is
                # better.
                max_eval_loss = max(loss_best)
                if total_loss < max_eval_loss:
                    index_max = loss_best.index(max_eval_loss)
                    # rm prev model
                    if best_checkpoints[index_max] is not None:
                        shutil.rmtree(
                            os.path.join(self.expt_dir, best_checkpoints[index_max]))
                    best_checkpoints[index_max] = model_name
                    loss_best[index_max] = total_loss

                    # save model
                    Checkpoint(model=model,
                               optimizer=self.optimizer,
                               epoch=epoch, step=step,
                               input_vocab=data.fields[
                                   machine.src_field_name].vocab,
                               output_vocab=data.fields[
                                   machine.tgt_field_name].vocab).save(
                                       self.expt_dir, name=model_name)

        # Nothing has been trained yet (all batches were skipped).
        if step_elapsed == 0: continue

        # NOTE(review): `losses` here is whatever was bound last, either the
        # training losses of the final batch or evaluation losses from the
        # logging/checkpoint sections above. Confirm this is intended.
        for loss in losses:
            epoch_loss_avg[
                loss.log_name] = epoch_loss_total[loss.log_name] / min(
                    steps_per_epoch, step - start_step)
            epoch_loss_total[loss.log_name] = 0

        if dev_data is not None:
            losses, metrics = self.evaluator.evaluate(
                model, dev_data, self.get_batch_data)
            loss_total, log_, model_name = self.get_losses(
                losses, metrics, step)

            self.optimizer.update(loss_total, epoch)  # TODO check if this makes sense!
            # NOTE(review): `log_msg` is the most recent message assigned
            # above, not an epoch summary built here.
            log_msg += ", Dev set: " + log_
            model.train(mode=True)
        else:
            # NOTE(review): passes the whole per-loss dict, not one number.
            self.optimizer.update(epoch_loss_avg, epoch)  # TODO check if this makes sense!

        log.info(log_msg)

    return logs
def train(self, model, data,
          num_epochs=5, resume=False, dev_data=None,
          monitor_data=None, optimizer=None,
          teacher_forcing_ratio=0, learning_rate=0.001,
          checkpoint_path=None, top_k=5):
    """ Run training for a given model.

    Args:
        model (machine.models): model to run training on; if `resume=True`
            it is replaced by the model loaded from `checkpoint_path`.
        data (machine.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from the checkpoint at
            `checkpoint_path` (default False)
        dev_data (machine.dataset.dataset.Dataset, optional): dev Dataset
            (default None)
        monitor_data (dict, optional): extra evaluation sets forwarded to
            `_train_epoches` (default: empty dict)
        optimizer (str, optional): name of the optimizer to build; one of
            'adam', 'adagrad', 'adadelta', 'adamax', 'rmsprop', 'sgd'
            (default None, which selects Adam). Ignored when resuming,
            where the checkpoint's optimizer is used instead.
        teacher_forcing_ratio (float, optional): teacher forcing ratio
            (default 0)
        learning_rate (float, optional): learning rate used when a new
            optimizer is built (default 0.001)
        checkpoint_path (str, optional): path to load the checkpoint from
            when training is resumed
        top_k (int, optional): how many models should be stored during
            training (default 5)

    Returns:
        tuple: `(model, logs)`, the trained model and the logs returned by
        `_train_epoches`.

    Raises:
        KeyError: if `optimizer` is not one of the supported names.
    """
    # Build a fresh dict per call rather than sharing one default object
    # created at definition time.
    if monitor_data is None:
        monitor_data = {}

    # If training is set to resume
    if resume:
        resume_checkpoint = Checkpoint.load(checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set optimizing parameters properly: rebuild the
        # inner optimizer of the same class around the loaded model's
        # parameters, reusing the settings of the first parameter group.
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(
            model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0

        def get_optim(optim_name):
            optims = {
                'adam': optim.Adam,
                'adagrad': optim.Adagrad,
                'adadelta': optim.Adadelta,
                'adamax': optim.Adamax,
                'rmsprop': optim.RMSprop,
                'sgd': optim.SGD,
                None: optim.Adam
            }
            return optims[optim_name]

        self.optimizer = Optimizer(
            get_optim(optimizer)(model.parameters(), lr=learning_rate),
            max_grad_norm=5)

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    logs = self._train_epoches(data, model, num_epochs,
                               start_epoch, step, dev_data=dev_data,
                               monitor_data=monitor_data,
                               teacher_forcing_ratio=teacher_forcing_ratio,
                               top_k=top_k)
    return model, logs