def py_fsspec_s3fs(request, s3_connection, s3_server):
    """Yield a fixture dict for an s3fs filesystem wrapped as a PyFileSystem.

    The test is skipped when ``s3fs`` cannot be imported, or when running on
    Python < 3.7 with s3fs >= 0.5. A bucket directory is created before the
    yield and deleted again afterwards.

    Args:
        request: Unused here; kept for the fixture signature.
        s3_connection: Unpacked as ``(host, port, access_key, secret_key)``.
        s3_server: Unused here; kept for the fixture signature.

    Yields:
        dict with keys ``fs``, ``pathfn``, ``allow_move_dir`` and
        ``allow_append_to_file``.
    """
    s3fs = pytest.importorskip("s3fs")

    # Short-circuit keeps the s3fs version comparison off modern interpreters.
    running_old_python = sys.version_info < (3, 7)
    if running_old_python and Version(s3fs.__version__) >= Version("0.5"):
        pytest.skip("s3fs>=0.5 version is async and requires Python >= 3.7")

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'
    endpoint = 'http://{}:{}'.format(host, port)

    raw_s3 = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={'endpoint_url': endpoint},
    )
    fs = PyFileSystem(FSSpecHandler(raw_s3))
    fs.create_dir(bucket)

    fixture_config = {
        'fs': fs,
        # pathfn prefixes a relative name with the bucket directory.
        'pathfn': bucket.__add__,
        'allow_move_dir': False,
        'allow_append_to_file': True,
    }
    yield fixture_config

    # Teardown: remove the bucket created above.
    fs.delete_dir(bucket)
def _resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: "pyarrow.fs.FileSystem" = None,
) -> Tuple[List[str], "pyarrow.fs.FileSystem"]:
    """
    Resolves and normalizes all provided paths and settles on a single
    filesystem that is used for every one of them.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, the filesystem resolved for the
            first path is adopted and reused for the remaining paths. An
            fsspec filesystem is wrapped in a pyarrow ``PyFileSystem``.

    Returns:
        A tuple of (list of normalized paths, filesystem).

    Raises:
        ValueError: If ``paths`` is neither a string nor a list of strings,
            or is an empty list.
        TypeError: If ``filesystem`` is neither a pyarrow filesystem nor an
            fsspec filesystem (or fsspec is not installed).
    """
    import pyarrow as pa
    from pyarrow.fs import (
        FileSystem,
        PyFileSystem,
        FSSpecHandler,
        _resolve_filesystem_and_path,
    )

    # --- Validate / canonicalize ``paths`` into a non-empty list of str. ---
    if isinstance(paths, str):
        paths = [paths]
    else:
        if not isinstance(paths, list) or not all(
            isinstance(p, str) for p in paths
        ):
            raise ValueError("paths must be a path string or a list of path strings.")
        if len(paths) == 0:
            raise ValueError("Must provide at least one path.")

    # --- Accept fsspec filesystems by wrapping them for pyarrow. ---
    if filesystem and not isinstance(filesystem, FileSystem):
        type_error_text = (
            f"The filesystem passed must either conform to "
            f"pyarrow.fs.FileSystem, or "
            f"fsspec.spec.AbstractFileSystem. The provided "
            f"filesystem was: {filesystem}"
        )
        try:
            import fsspec
        except ModuleNotFoundError:
            # Not a pyarrow filesystem, and without fsspec installed it
            # cannot be an fsspec filesystem either.
            raise TypeError(type_error_text)
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(type_error_text)
        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    # --- Resolve each path against the (possibly still unknown) filesystem. ---
    normalized_paths = []
    for raw_path in paths:
        try:
            inferred_fs, fs_path = _resolve_filesystem_and_path(
                raw_path, filesystem
            )
        except pa.lib.ArrowInvalid as exc:
            if "Cannot parse URI" not in str(exc):
                raise
            # Retry with the URL-encoded path, then decode the result again.
            inferred_fs, fs_path = _resolve_filesystem_and_path(
                _encode_url(raw_path), filesystem
            )
            fs_path = _decode_url(fs_path)

        if filesystem is not None:
            # presumably strips the scheme prefix; see _unwrap_protocol
            fs_path = _unwrap_protocol(fs_path)
        else:
            filesystem = inferred_fs

        normalized_paths.append(filesystem.normalize_path(fs_path))

    return normalized_paths, filesystem
# Script section: demonstrates several ways of reading a CSV stored inside
# the zip archive at /tmp/dl.zip.
# NOTE(review): assumes /tmp/dl.zip already exists and contains dummy1.csv
# (presumably produced earlier in the script) — TODO confirm.
describe_all_csvs_in_zips(fs)

# 1) Open the archive itself through fsspec and list its members with the
#    standard-library zipfile module.
dlf = fsspec.open("/tmp/dl.zip")
with dlf as f:
    zipf = zipfile.ZipFile(f)
    print(zipf.infolist())
# NOTE(review): explicit close after the with-block; likely redundant since
# the context manager has already exited — confirm before removing.
dlf.close()

# 2) Read a single member in text mode ("rt") using a chained
#    "zip://<member>::<archive>" URL.
d1f = fsspec.open("zip://dummy1.csv::/tmp/dl.zip", "rt")
with d1f as f:
    print(f.read())

# Disabled experiment: same chained read, but with the archive on GitHub.
#d1f = fsspec.open("zip://dummy1.csv::github://tiagoantao:python-performance@/08-persistence/sec1-fsspec/dummy.zip")
#with d1f as f:
#    print(pd.read_csv(f))

# 3) Expose the archive as a filesystem, wrap it for pyarrow and parse the
#    member from an Arrow input stream.
# NOTE(review): `csv` is presumably pyarrow.csv here (it is handed an Arrow
# input stream) — verify against the file's imports.
zfs = ZipFileSystem("/tmp/dl.zip")
arrow_fs = PyFileSystem(FSSpecHandler(zfs))
my_csv = csv.read_csv(arrow_fs.open_input_stream("dummy1.csv"))
print(my_csv)

# Disabled experiments kept for reference.
#with fsspec.open("zip:local.zip/dummy1.csv") as f:
#    pd.read_csv(f)
##fsa = fsspec.get_mapper("github://*****:*****@")
##print(fsa)
## fs = fsspec.open("git_https.py")
## with fs as f:
##     print(f)
def test_py_open_append_stream():
    """Open an append stream on a DummyHandler-backed PyFileSystem and write to it."""
    dummy_fs = PyFileSystem(DummyHandler())
    with dummy_fs.open_append_stream("somefile") as stream:
        stream.write(b"data")
def test_py_filesystem_ops():
    """Run directory and file operations through a DummyHandler-backed PyFileSystem.

    Every call is expected to succeed except ``create_dir("foobar")``, which
    must raise ``IOError``.
    """
    dummy_fs = PyFileSystem(DummyHandler())

    # Directory creation, in both recursive modes.
    dummy_fs.create_dir("recursive", recursive=True)
    dummy_fs.create_dir("non-recursive", recursive=False)

    # The handler is expected to reject this particular path.
    with pytest.raises(IOError):
        dummy_fs.create_dir("foobar")

    # Deletions.
    dummy_fs.delete_dir("delete_dir")
    dummy_fs.delete_dir_contents("delete_dir_contents")
    dummy_fs.delete_file("delete_file")

    # Move and copy.
    dummy_fs.move("move_from", "move_to")
    dummy_fs.copy_file("copy_file_from", "copy_file_to")
def httpfs_from_config():
    """Return a PyFileSystem wrapping a default-constructed HTTPFileSystem.

    NOTE: no configuration is read here; HTTPFileSystem is built with no
    arguments.
    """
    http_fs = HTTPFileSystem()
    handler = FSSpecHandler(http_fs)
    return PyFileSystem(handler)
def read_remote_parquet(path: str):
    """Read Parquet data at ``path`` via a PyFileSystem-wrapped filesystem.

    ``get_fs_and_path`` supplies the filesystem and the path to read. That
    filesystem is wrapped in ``FSSpecHandler``/``PyFileSystem`` and handed
    to ``read_parquet``.

    Args:
        path: Location of the Parquet data, as accepted by ``get_fs_and_path``.

    Returns:
        Whatever ``read_parquet`` returns for the resolved path.
    """
    source_fs, resolved_path = get_fs_and_path(path)
    arrow_fs = PyFileSystem(FSSpecHandler(source_fs))
    return read_parquet(resolved_path, filesystem=arrow_fs)
def _resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: "pyarrow.fs.FileSystem" = None,
) -> Tuple[List[str], "pyarrow.fs.FileSystem"]:
    """
    Resolves and normalizes all provided paths and settles on a single
    filesystem that is used for every one of them. HTTP(S) paths fall back
    to fsspec's HTTPFileSystem when pyarrow does not recognize the scheme.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, the filesystem resolved for the
            first path is adopted and reused for the remaining paths. An
            fsspec filesystem is wrapped in a pyarrow ``PyFileSystem``.

    Returns:
        A tuple of (list of normalized paths, filesystem).

    Raises:
        ValueError: If ``paths`` is neither a string nor a list of strings,
            or is an empty list.
        TypeError: If ``filesystem`` is neither a pyarrow filesystem nor an
            fsspec filesystem (or fsspec is not installed).
        ImportError: If an HTTP(S) path needs fsspec and it is not installed.
    """
    import pyarrow as pa
    from pyarrow.fs import (
        FileSystem,
        FSSpecHandler,
        PyFileSystem,
        _resolve_filesystem_and_path,
    )

    # --- Validate / canonicalize ``paths`` into a non-empty list of str. ---
    if isinstance(paths, str):
        paths = [paths]
    else:
        if not isinstance(paths, list) or not all(
            isinstance(p, str) for p in paths
        ):
            raise ValueError(
                "paths must be a path string or a list of path strings.")
        if len(paths) == 0:
            raise ValueError("Must provide at least one path.")

    # Whether the scheme prefix should be removed from resolved paths.
    need_unwrap_path_protocol = True

    # --- Accept fsspec filesystems by wrapping them for pyarrow. ---
    if filesystem and not isinstance(filesystem, FileSystem):
        type_error_text = (f"The filesystem passed must either conform to "
                           f"pyarrow.fs.FileSystem, or "
                           f"fsspec.spec.AbstractFileSystem. The provided "
                           f"filesystem was: {filesystem}")
        try:
            import fsspec
            from fsspec.implementations.http import HTTPFileSystem
        except ModuleNotFoundError:
            # Not a pyarrow filesystem, and without fsspec installed it
            # cannot be an fsspec filesystem either.
            raise TypeError(type_error_text) from None
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(type_error_text) from None
        if isinstance(filesystem, HTTPFileSystem):
            # fsspec's HTTPFileSystem expects full paths including the
            # protocol/scheme, unlike the pyarrow.fs.FileSystem
            # implementations, so the scheme must be left in place.
            need_unwrap_path_protocol = False
        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    # --- Resolve each path against the (possibly still unknown) filesystem. ---
    normalized_paths = []
    for raw_path in paths:
        candidate = _resolve_example_path(raw_path)
        try:
            inferred_fs, fs_path = _resolve_filesystem_and_path(
                candidate, filesystem)
        except pa.lib.ArrowInvalid as exc:
            reason = str(exc)
            if "Cannot parse URI" in reason:
                # Retry with the URL-encoded path, then decode the result.
                inferred_fs, fs_path = _resolve_filesystem_and_path(
                    _encode_url(candidate), filesystem)
                fs_path = _decode_url(fs_path)
            elif "Unrecognized filesystem type in URI" in reason:
                scheme = urllib.parse.urlparse(
                    candidate, allow_fragments=False).scheme
                if scheme not in ("http", "https"):
                    raise
                # pyarrow could not resolve an HTTP(S) path: fall back to
                # fsspec's HTTPFileSystem, which requires fsspec.
                try:
                    from fsspec.implementations.http import HTTPFileSystem
                except ModuleNotFoundError:
                    raise ImportError(
                        "Please install fsspec to read files from HTTP."
                    ) from None
                inferred_fs = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
                fs_path = candidate
                need_unwrap_path_protocol = False
            else:
                raise

        if filesystem is None:
            filesystem = inferred_fs
        elif need_unwrap_path_protocol:
            fs_path = _unwrap_protocol(fs_path)

        normalized_paths.append(filesystem.normalize_path(fs_path))

    return normalized_paths, filesystem