def test_links(tmpdir):
    """Symlinks to a file are reported as links and read back like the target."""
    base = str(tmpdir)
    target, first_link, second_link = (
        os.path.join(base, name) for name in ("target", "link1", "link2")
    )
    payload = b"my target data"
    with open(target, "wb") as handle:
        handle.write(payload)

    links = (first_link, second_link)
    for link in links:
        os.symlink(target, link)

    fs = LocalFileSystem()

    # The target is a plain file; each symlink is reported with type "link".
    expected_types = [
        (target, "file"),
        (first_link, "link"),
        (second_link, "link"),
    ]
    for path, kind in expected_types:
        assert fs.info(path)["type"] == kind

    # Every entry, link or not, reports the length of the target's contents.
    for path, _ in expected_types:
        assert fs.info(path)["size"] == len(payload)

    # Opening through either link yields the target's bytes.
    for link in links:
        with fsspec.open(link, "rb") as stream:
            assert stream.read() == payload
def test_links(tmpdir):
    """Symlinks to a file are reported as links and read back like the target.

    The test is marked as an expected failure when symlink creation raises
    ``OSError`` and ``WIN`` is truthy.
    """
    root = str(tmpdir)
    target = os.path.join(root, "target")
    links = [os.path.join(root, name) for name in ("link1", "link2")]
    payload = b"my target data"

    with open(target, "wb") as out:
        out.write(payload)

    try:
        for link in links:
            os.symlink(target, link)
    except OSError:
        if not WIN:
            raise
        pytest.xfail("Ran on win without admin permissions")

    fs = LocalFileSystem()
    everything = [target, *links]

    # Types first: plain file for the target, "link" for each symlink.
    assert fs.info(target)["type"] == "file"
    for link in links:
        assert fs.info(link)["type"] == "link"

    # Sizes next: all three report the length of the written payload.
    for path in everything:
        assert fs.info(path)["size"] == len(payload)

    # Reading through each link returns the target's contents.
    for link in links:
        opened = fsspec.open(link, "rb")
        with opened as reader:
            assert reader.read() == payload
def test_file_ops(tmpdir):
    """Walk a file through touch, write, copy, move and removal on LocalFileSystem."""
    root = str(tmpdir)
    missing = f"{root}/nofile"
    original = f"{root}/afile"
    duplicate = f"{root}/afile2"
    relocated = f"{root}/afile3"

    fs = LocalFileSystem()

    with pytest.raises(FileNotFoundError):
        fs.info(missing)

    fs.touch(original)
    key_before = fs.ukey(original)
    assert original in fs.ls(root)

    with fs.open(original, "wb") as sink:
        sink.write(b"data")
    key_after = fs.ukey(original)
    # The key must differ because the file content changed.
    assert key_before != key_after

    fs.copy(original, duplicate)
    assert duplicate in fs.ls(root)

    fs.move(original, relocated)
    assert not fs.exists(original)

    fs.rm(relocated, recursive=True)
    assert not fs.exists(relocated)

    # Finally remove the whole temporary directory.
    fs.rm(root, recursive=True)
    assert not fs.exists(root)
def test_links(tmpdir):
    """Two symlinks to one file: both are typed 'link', sized and read like the target."""
    workdir = str(tmpdir)
    paths = {
        name: os.path.join(workdir, name)
        for name in ('target', 'link1', 'link2')
    }
    target = paths['target']
    link_paths = [paths['link1'], paths['link2']]
    content = b'my target data'

    with open(target, 'wb') as writer:
        writer.write(content)
    for link in link_paths:
        os.symlink(target, link)

    fs = LocalFileSystem()

    # Type checks, in target / link1 / link2 order.
    assert fs.info(target)['type'] == 'file'
    for link in link_paths:
        assert fs.info(link)['type'] == 'link'

    # Size checks: each path reports the number of bytes written to the target.
    for path in paths.values():
        assert fs.info(path)['size'] == len(content)

    # Content checks through each link.
    for link in link_paths:
        with fsspec.open(link, 'rb') as reader:
            assert reader.read() == content
def test_file_ops(tmpdir):
    """Exercise info/touch/ukey/ls/open/copy/move/rm/exists on LocalFileSystem."""
    top = str(tmpdir)

    def under(name):
        # Build a '/'-joined path below the temporary directory.
        return top + '/' + name

    fs = LocalFileSystem()

    # A path that was never created must raise.
    with pytest.raises(FileNotFoundError):
        fs.info(under('nofile'))

    afile = under('afile')
    fs.touch(afile)
    first_key = fs.ukey(afile)
    assert afile in fs.ls(top)

    with fs.open(afile, 'wb') as out:
        out.write(b'data')
    second_key = fs.ukey(afile)
    assert first_key != second_key  # because file changed

    afile2 = under('afile2')
    fs.copy(afile, afile2)
    assert afile2 in fs.ls(top)

    afile3 = under('afile3')
    fs.move(afile, afile3)
    assert not fs.exists(afile)

    fs.rm(afile3, recursive=True)
    assert not fs.exists(afile3)

    fs.rm(top, recursive=True)
    assert not fs.exists(top)
def test_linked_directories(tmpdir):
    """Symlinked directories report type "directory" and a truthy "islink" flag.

    The test is marked as an expected failure when symlink creation raises
    ``OSError`` and ``WIN`` is truthy.
    """
    base = str(tmpdir)
    real_dir = os.path.join(base, "target")
    linked_dirs = [os.path.join(base, name) for name in ("link1", "link2")]

    os.makedirs(real_dir)
    try:
        for link in linked_dirs:
            os.symlink(real_dir, link)
    except OSError:
        if not WIN:
            raise
        pytest.xfail("Ran on win without admin permissions")

    fs = LocalFileSystem()

    # The real directory and both links are all reported as directories.
    for path in [real_dir, *linked_dirs]:
        assert fs.info(path)["type"] == "directory"

    # Only the symlinks carry a truthy "islink" entry.
    assert not fs.info(real_dir)["islink"]
    for link in linked_dirs:
        assert fs.info(link)["islink"]
def test_file_ops(tmpdir):
    """Exercise the basic file operations of ``LocalFileSystem`` under *tmpdir*.

    Covers: ``info`` on a missing path, ``touch``/``ukey``/``ls``, writing
    through ``open``, ``copy``, ``move``, ``cp`` into a nested destination,
    and the ``rm`` / ``rm_file`` variants (single path, list of paths,
    directory).
    """
    tmpdir = make_path_posix(str(tmpdir))
    fs = LocalFileSystem(auto_mkdir=True)

    with pytest.raises(FileNotFoundError):
        fs.info(tmpdir + "/nofile")

    fs.touch(tmpdir + "/afile")
    i1 = fs.ukey(tmpdir + "/afile")
    assert tmpdir + "/afile" in fs.ls(tmpdir)

    with fs.open(tmpdir + "/afile", "wb") as f:
        f.write(b"data")
    i2 = fs.ukey(tmpdir + "/afile")
    assert i1 != i2  # because file changed

    fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
    assert tmpdir + "/afile2" in fs.ls(tmpdir)

    fs.move(tmpdir + "/afile", tmpdir + "/afile3")
    assert not fs.exists(tmpdir + "/afile")

    # The destination's parent directories do not exist yet.
    # NOTE(review): presumably this relies on auto_mkdir=True above -- confirm.
    fs.cp(tmpdir + "/afile3", tmpdir + "/deeply/nested/file")
    assert fs.exists(tmpdir + "/deeply/nested/file")

    fs.rm(tmpdir + "/afile3", recursive=True)
    assert not fs.exists(tmpdir + "/afile3")

    files = [tmpdir + "/afile4", tmpdir + "/afile5"]
    # Plain loop: the calls are made for their side effect only, so there is
    # no reason to build (and discard) a list of return values.
    for path in files:
        fs.touch(path)

    # rm_file takes a single path; rm accepts a list of paths.
    with pytest.raises(TypeError):
        fs.rm_file(files)
    fs.rm(files)
    assert all(not fs.exists(f) for f in files)

    fs.touch(tmpdir + "/afile6")
    fs.rm_file(tmpdir + "/afile6")
    assert not fs.exists(tmpdir + "/afile6")

    # IsADirectoryError raised on Linux, PermissionError on Windows
    with pytest.raises((IsADirectoryError, PermissionError)):
        fs.rm_file(tmpdir)

    fs.rm(tmpdir, recursive=True)
    assert not fs.exists(tmpdir)
class LocalFileSystem(FileSystem):
    """Filesystem implementation that operates on local paths.

    Most operations are thin wrappers around ``os`` / ``os.path`` or around
    helper functions (``remove``, ``makedirs``, ``move``, ``copyfile``,
    ``copy_fobj_to_file``, ``tmp_fname``, ``System``) defined elsewhere in
    the project. ``info`` is forwarded to an fsspec ``LocalFileSystem``
    instance stored in ``self.fs``.
    """

    sep = os.sep

    scheme = Schemes.LOCAL
    PARAM_CHECKSUM = "md5"
    PARAM_PATH = "path"
    TRAVERSE_PREFIX_LEN = 2

    def __init__(self, **config):
        """Initialise the base class with *config* and create the fsspec delegate."""
        from fsspec.implementations.local import LocalFileSystem as LocalFS

        super().__init__(**config)
        self.fs = LocalFS()

    @staticmethod
    def open(path, mode="r", encoding=None, **kwargs):
        """Open *path* with the builtin ``open``; extra ``kwargs`` are ignored."""
        return open(path, mode=mode, encoding=encoding)

    def exists(self, path) -> bool:
        """Return True if *path* exists, without following a final symlink."""
        # TODO: replace this with os.path.exists once the problem is fixed on
        # the fsspec https://github.com/intake/filesystem_spec/issues/742
        return os.path.lexists(path)

    def checksum(self, path) -> str:
        """Return a decimal string derived from the inode, mtime and size of *path*."""
        from fsspec.utils import tokenize

        st = os.stat(path)
        # tokenize() yields a hex digest; it is re-encoded as a decimal string.
        return str(int(tokenize([st.st_ino, st.st_mtime, st.st_size]), 16))

    def isfile(self, path) -> bool:
        """Return ``os.path.isfile(path)``."""
        return os.path.isfile(path)

    def isdir(self, path) -> bool:
        """Return ``os.path.isdir(path)``."""
        return os.path.isdir(path)

    def iscopy(self, path):
        """Return True if *path* is neither a symlink nor a hardlink per ``System``."""
        return not (System.is_symlink(path) or System.is_hardlink(path))

    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Directory fs generator.

        See `os.walk` for the docs. Differences:
        - no support for symlinks

        Each yielded root is passed through ``os.path.normpath``. Extra
        ``kwargs`` are accepted but not used.
        """
        for root, dirs, files in os.walk(top, topdown=topdown, onerror=onerror):
            yield os.path.normpath(root), dirs, files

    def find(self, path, prefix=None):
        """Yield the path of every file found by ``walk`` under *path*.

        ``prefix`` is accepted but not used by this implementation.
        """
        for root, _, files in self.walk(path):
            for file in files:
                # NOTE: os.path.join is ~5.5 times slower
                yield f"{root}{os.sep}{file}"

    def is_empty(self, path):
        """Return True for a zero-size file or a directory with no entries."""
        if self.isfile(path) and os.path.getsize(path) == 0:
            return True

        if self.isdir(path) and len(os.listdir(path)) == 0:
            return True

        return False

    def remove(self, path):
        """Remove *path* via the project's ``remove`` helper."""
        remove(path)

    def makedirs(self, path, **kwargs):
        """Create *path* via the ``makedirs`` helper; ``exist_ok`` defaults to True."""
        makedirs(path, exist_ok=kwargs.pop("exist_ok", True))

    def move(self, from_info, to_info):
        """Move *from_info* to *to_info*, creating the destination's parent first."""
        self.makedirs(self.path.parent(to_info))
        move(from_info, to_info)

    def copy(self, from_info, to_info):
        """Copy *from_info* to *to_info* through a temporary sibling file.

        The data is written to a temporary name in the destination's parent
        and then renamed onto *to_info*. On any exception the temporary file
        is removed and the exception is re-raised.
        """
        tmp_info = self.path.join(self.path.parent(to_info), tmp_fname(""))
        try:
            copyfile(from_info, tmp_info)
            os.rename(tmp_info, to_info)
        except Exception:
            self.remove(tmp_info)
            raise

    def upload_fobj(self, fobj, to_info, **kwargs):
        """Write file object *fobj* to *to_info* through a temporary sibling file.

        The destination's parent is created first. On any exception the
        temporary file is removed and the exception is re-raised.
        """
        self.makedirs(self.path.parent(to_info))
        tmp_info = self.path.join(self.path.parent(to_info), tmp_fname(""))
        try:
            copy_fobj_to_file(fobj, tmp_info)
            os.rename(tmp_info, to_info)
        except Exception:
            self.remove(tmp_info)
            raise

    @staticmethod
    def symlink(from_info, to_info):
        """Delegate to ``System.symlink(from_info, to_info)``."""
        System.symlink(from_info, to_info)

    @staticmethod
    def is_symlink(path):
        """Return ``System.is_symlink(path)``."""
        return System.is_symlink(path)

    def hardlink(self, from_info, to_info):
        """Hardlink *from_info* to *to_info*, except for empty source files.

        For a zero-size source an empty file is created at *to_info* instead
        of a link (see the comment below for why).
        """
        # If there are a lot of empty files (which happens a lot in datasets),
        # and the cache type is `hardlink`, we might reach link limits and
        # will get something like: `too many links error`
        #
        # This is because all those empty files will have the same hash
        # (i.e. 68b329da9893e34099c7d8ad5cb9c940), therefore, they will be
        # linked to the same file in the cache.
        #
        # From https://en.wikipedia.org/wiki/Hard_link
        #   * ext4 limits the number of hard links on a file to 65,000
        #   * Windows with NTFS has a limit of 1024 hard links on a file
        #
        # That's why we simply create an empty file rather than a link.
        if self.getsize(from_info) == 0:
            self.open(to_info, "w").close()

            logger.debug("Created empty file: %s -> %s", from_info, to_info)
            return

        System.hardlink(from_info, to_info)

    @staticmethod
    def is_hardlink(path):
        """Return ``System.is_hardlink(path)``."""
        return System.is_hardlink(path)

    def reflink(self, from_info, to_info):
        """Delegate to ``System.reflink(from_info, to_info)``."""
        System.reflink(from_info, to_info)

    def info(self, path):
        """Return the fsspec delegate's ``info`` for *path*."""
        return self.fs.info(path)

    def put_file(self, from_file, to_info, callback=DEFAULT_CALLBACK, **kwargs):
        """Copy local *from_file* to *to_info* through a temporary sibling file.

        The destination's parent is created first; the data is copied to a
        temporary name in that directory and then moved onto *to_info* with
        ``os.replace``. On any exception the temporary file is removed and
        the exception is re-raised, matching ``copy`` and ``upload_fobj``.
        """
        parent = self.path.parent(to_info)
        makedirs(parent, exist_ok=True)
        tmp_file = self.path.join(parent, tmp_fname())
        try:
            copyfile(from_file, tmp_file, callback=callback)
            os.replace(tmp_file, to_info)
        except Exception:
            # Do not leave a partial temporary file next to the destination.
            # NOTE(review): same cleanup call as in copy(); assumes remove()
            # tolerates a temp file that was never created -- confirm.
            self.remove(tmp_file)
            raise

    def get_file(self, from_info, to_file, callback=DEFAULT_CALLBACK, **kwargs):
        """Copy *from_info* to local *to_file* via the ``copyfile`` helper."""
        copyfile(from_info, to_file, callback=callback)
class FsspecLocalFileSystem(AbstractFileSystem):
    """fsspec-style filesystem that operates on local paths.

    Most operations are thin wrappers around ``os`` / ``os.path`` or around
    helper functions (``remove``, ``makedirs``, ``move``, ``copyfile``,
    ``tmp_fname``, ``System``) defined elsewhere in the project. ``info``
    and ``ls`` are forwarded to an fsspec ``LocalFileSystem`` instance stored
    in ``self.fs``.
    """

    sep = os.sep

    def __init__(self, *args, **kwargs):
        """Initialise the base class and create the fsspec delegate."""
        from fsspec.implementations.local import LocalFileSystem as LocalFS

        super().__init__(*args, **kwargs)
        self.fs = LocalFS()

    def makedirs(self, path, exist_ok=False):
        """Create *path* via the project's ``makedirs`` helper."""
        makedirs(path, exist_ok=exist_ok)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory *path*.

        Raises ``FileExistsError`` if *path* already exists. With
        ``create_parents`` the ``makedirs`` method is used; otherwise
        ``os.mkdir`` is called with the remaining ``kwargs``.
        """
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def lexists(self, path, **kwargs):
        """Return ``os.path.lexists(path)``; extra ``kwargs`` are ignored."""
        return os.path.lexists(path)

    def exists(self, path, **kwargs):
        """Return True if *path* exists, without following a final symlink."""
        # TODO: replace this with os.path.exists once the problem is fixed on
        # the fsspec https://github.com/intake/filesystem_spec/issues/742
        return os.path.lexists(path)

    def checksum(self, path) -> str:
        """Return a decimal string derived from the inode, mtime and size of *path*."""
        from fsspec.utils import tokenize

        st = os.stat(path)
        # tokenize() yields a hex digest; it is re-encoded as a decimal string.
        return str(int(tokenize([st.st_ino, st.st_mtime, st.st_size]), 16))

    def info(self, path, **kwargs):
        """Return the fsspec delegate's ``info`` for *path*; ``kwargs`` are ignored."""
        return self.fs.info(path)

    def ls(self, path, **kwargs):
        """Return the fsspec delegate's ``ls`` for *path*, forwarding ``kwargs``."""
        return self.fs.ls(path, **kwargs)

    def isfile(self, path) -> bool:
        """Return ``os.path.isfile(path)``."""
        return os.path.isfile(path)

    def isdir(self, path) -> bool:
        """Return ``os.path.isdir(path)``."""
        return os.path.isdir(path)

    def walk(self, path, maxdepth=None, topdown=True, **kwargs):
        """Directory fs generator.

        See `os.walk` for the docs. Differences:
        - no support for symlinks

        Each yielded root is passed through ``os.path.normpath``.
        ``maxdepth`` and extra ``kwargs`` are accepted but not used.
        """
        for root, dirs, files in os.walk(path, topdown=topdown):
            yield os.path.normpath(root), dirs, files

    def find(self, path, **kwargs):
        """Yield the path of every file found by ``walk`` under *path*."""
        for root, _, files in self.walk(path, **kwargs):
            for file in files:
                # NOTE: os.path.join is ~5.5 times slower
                yield f"{root}{os.sep}{file}"

    @classmethod
    def _parent(cls, path):
        """Return ``os.path.dirname(path)``."""
        return os.path.dirname(path)

    def put_file(self, lpath, rpath, callback=None, **kwargs):
        """Copy *lpath* to *rpath* through a temporary sibling file.

        The destination's parent is created first; the data is copied to a
        temporary name in that directory and then moved onto *rpath* with
        ``os.replace``. On any exception the temporary file is removed and
        the exception is re-raised, matching ``copy``.
        """
        parent = self._parent(rpath)
        makedirs(parent, exist_ok=True)
        tmp_file = os.path.join(parent, tmp_fname())
        try:
            copyfile(lpath, tmp_file, callback=callback)
            os.replace(tmp_file, rpath)
        except Exception:
            # Do not leave a partial temporary file next to the destination.
            # NOTE(review): same cleanup call as in copy(); assumes remove()
            # tolerates a temp file that was never created -- confirm.
            self.rm_file(tmp_file)
            raise

    def get_file(self, rpath, lpath, callback=None, **kwargs):
        """Copy *rpath* to *lpath* via the ``copyfile`` helper."""
        copyfile(rpath, lpath, callback=callback)

    def mv(self, path1, path2, **kwargs):
        """Move *path1* to *path2*, creating the destination's parent first."""
        self.makedirs(self._parent(path2), exist_ok=True)
        move(path1, path2)

    def rmdir(self, path):
        """Remove directory *path* with ``os.rmdir``."""
        os.rmdir(path)

    def rm_file(self, path):
        """Remove *path* via the project's ``remove`` helper."""
        remove(path)

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove *path* via the ``remove`` helper.

        ``recursive`` and ``maxdepth`` are accepted but not used here.
        """
        remove(path)

    def copy(self, path1, path2, recursive=False, on_error=None, **kwargs):
        """Copy *path1* to *path2* through a temporary sibling file.

        The data is written to a temporary name in the destination's parent
        and then renamed onto *path2*. On any exception the temporary file is
        removed and the exception is re-raised. ``recursive``, ``on_error``
        and extra ``kwargs`` are accepted but not used.
        """
        tmp_info = os.path.join(self._parent(path2), tmp_fname(""))
        try:
            copyfile(path1, tmp_info)
            os.rename(tmp_info, path2)
        except Exception:
            self.rm_file(tmp_info)
            raise

    def open(self, path, mode="r", encoding=None, **kwargs):
        """Open *path* with the builtin ``open``; extra ``kwargs`` are ignored."""
        return open(path, mode=mode, encoding=encoding)

    def symlink(self, path1, path2):
        """Return ``System.symlink(path1, path2)``."""
        return System.symlink(path1, path2)

    @staticmethod
    def is_symlink(path):
        """Return ``System.is_symlink(path)``."""
        return System.is_symlink(path)

    @staticmethod
    def is_hardlink(path):
        """Return ``System.is_hardlink(path)``."""
        return System.is_hardlink(path)

    def hardlink(self, path1, path2):
        """Hardlink *path1* to *path2*, except for empty source files.

        For a zero-size source an empty file is created at *path2* instead of
        a link (see the comment below for why) and ``None`` is returned;
        otherwise the result of ``System.hardlink`` is returned.
        """
        # If there are a lot of empty files (which happens a lot in datasets),
        # and the cache type is `hardlink`, we might reach link limits and
        # will get something like: `too many links error`
        #
        # This is because all those empty files will have the same hash
        # (i.e. 68b329da9893e34099c7d8ad5cb9c940), therefore, they will be
        # linked to the same file in the cache.
        #
        # From https://en.wikipedia.org/wiki/Hard_link
        #   * ext4 limits the number of hard links on a file to 65,000
        #   * Windows with NTFS has a limit of 1024 hard links on a file
        #
        # That's why we simply create an empty file rather than a link.
        if self.size(path1) == 0:
            self.open(path2, "w").close()

            logger.debug("Created empty file: %s -> %s", path1, path2)
            return

        return System.hardlink(path1, path2)

    def reflink(self, path1, path2):
        """Return ``System.reflink(path1, path2)``."""
        return System.reflink(path1, path2)