Example #1
0
def save_checkpoint_atomic(trainer, final_filename, extra_state):
    """Save a checkpoint via a temporary file, then copy it into place.

    The trainer writes only to ``final_filename + ".tmp"``; once that write
    has returned, the temporary file is copied over ``final_filename`` and
    then removed. Whether the copy step itself is atomic depends on
    ``PathManager.copy`` and is not established here.

    Args:
        trainer: Object providing ``save_checkpoint(filename, extra_state)``.
        final_filename: Destination path; must support ``+ ".tmp"``
            (i.e. a string).
        extra_state: Forwarded unchanged to ``trainer.save_checkpoint``.

    Raises:
        AssertionError: If ``PathManager.copy`` returns a falsy value. The
            temporary file is left in place in that case.
    """
    temp_filename = final_filename + ".tmp"
    trainer.save_checkpoint(temp_filename, extra_state)
    # TODO(T56266125): Use mv() instead of copy() + rm() after it's added to
    # PathManager.
    copied = PathManager.copy(temp_filename, final_filename, overwrite=True)
    if not copied:
        # Raised explicitly instead of using an ``assert`` statement: under
        # ``python -O`` an assert (including the copy call inside it) would be
        # stripped, and the rm() below would then delete the only copy of the
        # checkpoint. AssertionError is kept so existing callers see the same
        # exception type and message.
        raise AssertionError(
            f"Failed to copy {temp_filename} to {final_filename}"
        )
    PathManager.rm(temp_filename)
Example #2
0
 def _remove_checkpoint(self, checkpoint_to_remove: Optional[str]):
     if checkpoint_to_remove:
         self.log_if_verbose(
             f"| Preparing to remove old checkpoint {checkpoint_to_remove}."
         )
         try:
             PathManager.rm(checkpoint_to_remove)
             self.log_if_verbose(
                 f"| Finished removing old checkpoint {checkpoint_to_remove}."
             )
         except FileNotFoundError:
             print(
                 f"| Unable to find old checkpoint {checkpoint_to_remove} for removal",
                 flush=True,
             )
Example #3
0
 def _remove_checkpoint(self, checkpoint_to_remove: Optional[str]):
     if checkpoint_to_remove:
         self.log_if_verbose(
             f"| Preparing to remove old checkpoint {checkpoint_to_remove}."
         )
         try:
             PathManager.rm(checkpoint_to_remove)
             self.log_if_verbose(
                 f"| Finished removing old checkpoint {checkpoint_to_remove}."
             )
         except OSError as e:
             print(
                 f"| Failed to remove old checkpoint {checkpoint_to_remove} "
                 f"- exception: {e}",
                 flush=True,
             )