def to_directory(self, path: Optional[str] = None) -> str:
    """Materialize the checkpoint data inside a directory.

    Args:
        path: Directory the data should be written to. When omitted, the
            directory returned by ``self._get_temporary_checkpoint_dir()``
            is used instead.

    Returns:
        str: Normalized path of the directory holding the checkpoint data.

    Raises:
        RuntimeError: If another holder of the directory lock was expected
            to populate the directory, but it does not exist once that
            lock has been released.
    """
    is_temporary = path is None
    if is_temporary:
        path = self._get_temporary_checkpoint_dir()

    path = os.path.normpath(path)
    lock_file = f"{path}.lock"

    # A deletion lock is only requested for directories we picked ourselves.
    _make_dir(path, acquire_del_lock=is_temporary)

    try:
        # With a timeout of 0 the lock is attempted exactly once; if it
        # cannot be acquired, a TimeoutError is raised.
        with TempFileLock(lock_file, timeout=0):
            self._to_directory(path)
    except TimeoutError:
        # The directory is already locked by someone else: wait until the
        # lock is released, but do not write anything ourselves.
        with TempFileLock(lock_file, timeout=-1):
            pass

        if not os.path.exists(path):
            raise RuntimeError(
                f"Checkpoint directory {path} does not exist, "
                "even though it should have been created by "
                "another process. Please raise an issue on GitHub: "
                "https://github.com/ray-project/ray/issues")

    return path
def _copy_dir(source_dir: str, target_dir: str, *, _retry: bool = True) -> None:
    """Copy ``source_dir`` to ``target_dir`` with shutil, under a file lock.

    Args:
        source_dir: Directory to copy from.
        target_dir: Destination directory. ``_delete_path_unsafe`` is called
            on it before the copy starts.
        _retry: Internal flag. When the lock was held by someone else and the
            target is missing afterwards, the copy is attempted one more
            time if this is True.

    Raises:
        RuntimeError: If the target directory is still missing after the
            single retry has been used up.
    """
    target_dir = os.path.normpath(target_dir)
    lock_file = f"{target_dir}.lock"

    try:
        # With a timeout of 0 the lock is attempted exactly once; if it
        # cannot be acquired, a TimeoutError is raised.
        with TempFileLock(lock_file, timeout=0):
            _delete_path_unsafe(target_dir)
            shutil.copytree(source_dir, target_dir)
    except TimeoutError:
        # Somebody else holds the lock: just wait until it is released.
        with TempFileLock(lock_file):
            pass

        if os.path.exists(target_dir):
            return

        # The lock may have been held for a deletion, in which case the
        # directory has to be recreated.
        if not _retry:
            raise RuntimeError(
                f"Target directory {target_dir} does not exist "
                "and couldn't be recreated. "
                "Please raise an issue on GitHub: "
                "https://github.com/ray-project/ray/issues"
            )
        _copy_dir(source_dir, target_dir, _retry=False)
def _unpack_dir(stream: io.BytesIO, target_dir: str, *, _retry: bool = True) -> None:
    """Extract a tar archive held in ``stream`` into ``target_dir``.

    Args:
        stream: In-memory tar archive. It is rewound to position 0 before
            it is read.
        target_dir: Directory the archive is extracted into.
        _retry: Internal flag. When the lock was held by someone else and the
            target is missing afterwards, extraction is attempted one more
            time if this is True.

    Raises:
        RuntimeError: If the target directory is still missing after the
            single retry has been used up.
    """
    target_dir = os.path.normpath(target_dir)
    lock_file = f"{target_dir}.lock"
    stream.seek(0)

    try:
        # With a timeout of 0 the lock is attempted exactly once; if it
        # cannot be acquired, a TimeoutError is raised.
        with TempFileLock(lock_file, timeout=0):
            # NOTE(review): extractall() trusts the member paths stored in
            # the archive -- confirm that only trusted streams reach here.
            with tarfile.open(fileobj=stream) as archive:
                archive.extractall(target_dir)
    except TimeoutError:
        # Somebody else holds the lock: just wait until it is released.
        with TempFileLock(lock_file):
            pass

        if os.path.exists(target_dir):
            return

        # The lock may have been held for a deletion, in which case the
        # directory has to be recreated.
        if not _retry:
            raise RuntimeError(
                f"Target directory {target_dir} does not exist "
                "and couldn't be recreated. "
                "Please raise an issue on GitHub: "
                "https://github.com/ray-project/ray/issues"
            )
        _unpack_dir(stream, target_dir, _retry=False)
def download_from_uri(uri: str, local_path: str, filelock: bool = True):
    """Copy the files behind ``uri`` to ``local_path`` via pyarrow.

    Args:
        uri: Remote location; resolved into a filesystem and a path by
            ``get_fs_and_path``.
        local_path: Local destination handed to ``pyarrow.fs.copy_files``.
        filelock: If True, hold a ``TempFileLock`` on
            ``"<normalized local_path>.lock"`` for the duration of the copy.

    Raises:
        ValueError: If no filesystem could be resolved for ``uri``.
    """
    _assert_pyarrow_installed()

    fs, bucket_path = get_fs_and_path(uri)
    if not fs:
        raise ValueError(
            f"Could not download from URI: "
            f"URI `{uri}` is not a valid or supported cloud target. "
            f"Hint: {fs_hint(uri)}")

    if not filelock:
        # Caller opted out of locking: copy straight away.
        pyarrow.fs.copy_files(bucket_path, local_path, source_filesystem=fs)
        return

    lock_file = f"{os.path.normpath(local_path)}.lock"
    with TempFileLock(lock_file):
        pyarrow.fs.copy_files(bucket_path, local_path, source_filesystem=fs)
def as_directory(self) -> Iterator[str]:
    """Return checkpoint directory path in a context.

    This function makes checkpoint data available as a directory while
    avoiding unnecessary copies and left-over temporary data.

    If the checkpoint is already a directory checkpoint, it will return
    the existing path. If it is not, it will create a temporary directory,
    which will be deleted after the context is exited.

    If the checkpoint has been created from an object reference, the
    directory name will be constant and equal to the object reference ID.
    This allows for multiple processes to use the same files for improved
    performance. The directory will be deleted after exiting the context
    only if no other processes are using it.
    In any other case, a new temporary directory will be created with each
    call to ``as_directory``.

    Users should treat the returned checkpoint directory as read-only and
    avoid changing any data within it, as it might get deleted when exiting
    the context.

    The cleanup of a temporary directory also runs when the code using the
    directory raises an exception, so that neither the deletion lock file
    nor the directory itself is left behind in that case.

    Example:

        with checkpoint.as_directory() as checkpoint_dir:
            # Do some read-only processing of files within checkpoint_dir
            pass

        # At this point, if a temporary directory was created, it will have
        # been deleted.

    Yields:
        str: Path of the directory containing the checkpoint data.
    """
    if self._local_path:
        # Already backed by a directory: hand it out, nothing to clean up.
        yield self._local_path
        return

    temp_dir = self.to_directory()
    del_lock_path = _get_del_lock_path(temp_dir)

    try:
        yield temp_dir
    finally:
        # Cleanup. Runs on normal exit as well as when the consumer raised,
        # otherwise our deletion lock would keep the directory alive forever.
        try:
            os.remove(del_lock_path)
        except Exception:
            logger.warning(
                f"Could not remove {del_lock_path} deletion file lock. "
                f"Traceback:\n{traceback.format_exc()}")

        # In the edge case (process crash before del lock file is removed),
        # we do not remove the directory at all.
        # Since it's in /tmp, this is not that big of a deal.

        # Check if any deletion lock files are remaining; if so, the
        # directory is left in place for whoever owns them.
        temp_dir_path = Path(temp_dir)
        remaining_del_locks = temp_dir_path.parent.glob(
            _get_del_lock_path(temp_dir_path.name, "*"))
        if not any(remaining_del_locks):
            try:
                # Timeout 0 means there will be only one attempt to acquire
                # the file lock. If it cannot be acquired, a TimeoutError
                # will be thrown.
                with TempFileLock(f"{temp_dir}.lock", timeout=0):
                    shutil.rmtree(temp_dir, ignore_errors=True)
            except TimeoutError:
                # Deliberate best effort: the directory is locked by someone
                # else right now, so it is left alone.
                pass
def _delete_path(target_path: str) -> bool:
    """Delete a path (file or directory) while holding its file lock.

    Args:
        target_path: Path to delete; it is normalized before use.

    Returns:
        bool: The result of ``_delete_path_unsafe`` for the normalized path.
    """
    normalized_path = os.path.normpath(target_path)
    lock_file = f"{normalized_path}.lock"

    # Block until the lock is ours, then delegate to the unlocked variant.
    with TempFileLock(lock_file):
        deleted = _delete_path_unsafe(normalized_path)
    return deleted