def save_checkpoint_atomic(trainer, final_filename, extra_state):
    """Wrapper around trainer.save_checkpoint to make save atomic.

    The checkpoint is first written to ``final_filename + ".tmp"``, then
    copied over ``final_filename`` (overwriting it), and finally the
    temporary file is removed.

    Args:
        trainer: object exposing ``save_checkpoint(filename, extra_state)``.
        final_filename: destination path of the checkpoint.
        extra_state: forwarded unchanged to ``trainer.save_checkpoint``.

    Raises:
        AssertionError: if ``PathManager.copy`` returns a falsy value.
    """
    temp_filename = final_filename + ".tmp"
    trainer.save_checkpoint(temp_filename, extra_state)
    # TODO(T56266125): Use mv() instead of copy() + rm() after it's added to
    # PathManager.
    # The copy is deliberately NOT wrapped in an `assert` statement: asserts
    # are stripped under `python -O`, which would skip the copy entirely and
    # then delete the only copy of the checkpoint below.
    copied = PathManager.copy(temp_filename, final_filename, overwrite=True)
    if not copied:
        # Same exception type the former `assert` raised, so callers that
        # handle it keep working.
        raise AssertionError(
            f"Failed to copy {temp_filename} to {final_filename}"
        )
    PathManager.rm(temp_filename)
def _remove_checkpoint(self, checkpoint_to_remove: Optional[str]): if checkpoint_to_remove: self.log_if_verbose( f"| Preparing to remove old checkpoint {checkpoint_to_remove}." ) try: PathManager.rm(checkpoint_to_remove) self.log_if_verbose( f"| Finished removing old checkpoint {checkpoint_to_remove}." ) except FileNotFoundError: print( f"| Unable to find old checkpoint {checkpoint_to_remove} for removal", flush=True, )
def _remove_checkpoint(self, checkpoint_to_remove: Optional[str]): if checkpoint_to_remove: self.log_if_verbose( f"| Preparing to remove old checkpoint {checkpoint_to_remove}." ) try: PathManager.rm(checkpoint_to_remove) self.log_if_verbose( f"| Finished removing old checkpoint {checkpoint_to_remove}." ) except OSError as e: print( f"| Failed to remove old checkpoint {checkpoint_to_remove} " f"- exception: {e}", flush=True, )