Example #1
0
 def save_checkpoint(self, filename, extra_state):
     """Write the current training state to a checkpoint file.

     Only the process for which ``distributed_utils.is_master(self.args)``
     is true performs the save; every other process returns without
     doing anything.

     Args:
         filename: destination handed to ``checkpoint_utils.save_state``.
         extra_state: mapping of additional state to store; it is updated
             in place with a ``"metrics"`` entry before being saved.
     """
     if not distributed_utils.is_master(self.args):
         return  # only save one checkpoint

     extra_state["metrics"] = metrics.state_dict()

     # Collected in the exact positional order save_state receives them.
     payload = (
         filename,
         self.args,
         self.get_model().state_dict(),
         self.get_criterion(),
         self.optimizer,
         self.lr_scheduler,
         self.get_num_updates(),
         self._optim_history,
         extra_state,
     )
     checkpoint_utils.save_state(*payload)
Example #2
0
 def save_checkpoint(self, filename, extra_state):
     """Write the current training state to a checkpoint file.

     The save happens only when ``self.is_data_parallel_master`` is
     truthy; otherwise the call returns immediately and ``extra_state``
     is left untouched.

     Args:
         filename: destination handed to ``checkpoint_utils.save_state``.
         extra_state: mapping of additional state to store; it is updated
             in place with ``"metrics"`` and ``"previous_training_time"``
             entries before being saved.
     """
     if not self.is_data_parallel_master:
         return  # only save one checkpoint

     extra_state["metrics"] = metrics.state_dict()
     extra_state["previous_training_time"] = self.cumulative_training_time()

     # Collected in the exact positional order save_state receives them.
     payload = (
         filename,
         self.args,
         self.get_model().state_dict(),
         self.get_criterion(),
         self.optimizer,
         self.lr_scheduler,
         self.get_num_updates(),
         self._optim_history,
         extra_state,
     )
     checkpoint_utils.save_state(*payload)
Example #3
0
 def save_checkpoint(self, filename, extra_state):
     """Write the current training state to a checkpoint file.

     The save happens only when ``self.is_data_parallel_master`` is
     truthy; otherwise the call returns immediately, nothing is logged
     and ``extra_state`` is left untouched.

     Args:
         filename: destination handed to ``checkpoint_utils.save_state``.
         extra_state: mapping of additional state to store; it is updated
             in place with ``"metrics"`` and ``"previous_training_time"``
             entries before being saved.
     """
     if not self.is_data_parallel_master:
         return  # only save one checkpoint

     logger.info(f"Saving checkpoint to {filename}")

     extra_state["metrics"] = metrics.state_dict()
     extra_state["previous_training_time"] = self.cumulative_training_time()

     # Collected in the exact positional order save_state receives them.
     payload = (
         filename,
         self.cfg,
         self.model.state_dict(),
         self.criterion,
         self.optimizer,
         self.lr_scheduler,
         self.get_num_updates(),
         self._optim_history,
         extra_state,
     )
     checkpoint_utils.save_state(*payload)

     logger.info(f"Finished saving checkpoint to {filename}")