def xgetsize(path, use_auth_token: Optional[Union[str, bool]] = None) -> int:
    """Extend `os.path.getsize` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`int`, optional
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.path.getsize(path)
    else:
        if rest_hops and fsspec.get_fs_token_paths(rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers": get_authentication_headers_for_url(rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
        size = fs.size(main_hop)
        if size is None:
            # use xopen instead of fs.open to make data fetching more robust
            with xopen(path, use_auth_token=use_auth_token) as f:
                size = len(f.read())
        return size
def xlistdir(path: str, use_auth_token: Optional[Union[str, bool]] = None) -> List[str]:
    """Extend `os.listdir` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`list` of :obj:`str`
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.listdir(path)
    else:
        # listing files inside a zip in a private repo requires authentication
        if rest_hops and fsspec.get_fs_token_paths(rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers": get_authentication_headers_for_url(rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
        objects = fs.listdir(main_hop.split("://")[1])
        return [os.path.basename(obj["name"]) for obj in objects]
def xisdir(path, use_auth_token: Optional[Union[str, bool]] = None) -> bool:
    """Extend `os.path.isdir` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`bool`
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.path.isdir(path)
    else:
        if rest_hops and fsspec.get_fs_token_paths(rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers": get_authentication_headers_for_url(rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
        return fs.isdir(main_hop)
def xglob(urlpath, *, recursive=False, use_auth_token: Optional[Union[str, bool]] = None):
    """Extend `glob.glob` function to support remote files.

    Args:
        urlpath (:obj:`str`): URL path with shell-style wildcard patterns.
        recursive (:obj:`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
            directories or subdirectories.

    Returns:
        :obj:`list` of :obj:`str`
    """
    main_hop, *rest_hops = urlpath.split("::")
    if is_local_path(main_hop):
        return glob.glob(main_hop, recursive=recursive)
    else:
        # globbing inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            urlpath = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching,
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(main_hop)
        return ["::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops) for globbed_path in globbed_paths]
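# Quick local check (temp directory, illustrative names; not part of the code
# above) of the point made in the comments: `fs.glob` expands character classes
# like "[0-9]" and matches directories as well as files.
import pathlib
import tempfile

import fsspec

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as d:
        pathlib.Path(d, "part0").mkdir()
        pathlib.Path(d, "part1.txt").touch()
        fs = fsspec.filesystem("file")
        # Matches both the directory "part0" and the file "part1.txt".
        print(fs.glob(f"{d}/part[0-9]*"))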
def filesystem(request):
    path = request.param["path"]
    storage_options = request.param["storage_options"]
    storage_backend = request.param.get("backend", "pandas")
    fs, _, paths = fsspec.get_fs_token_paths(path, storage_options=storage_options)
    return {"fs": fs, "location": paths[0], "backend": storage_backend}
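# The function above reads `request.param`, so it is presumably a parametrized
# pytest fixture. A minimal sketch of how such a fixture could be declared and
# consumed, using fsspec's in-memory filesystem; the names and params here are
# illustrative, not the project's actual configuration.
import fsspec
import pytest


@pytest.fixture(params=[{"path": "memory://test_dir", "storage_options": {}, "backend": "pandas"}])
def filesystem_demo(request):
    fs, _, paths = fsspec.get_fs_token_paths(
        request.param["path"], storage_options=request.param["storage_options"]
    )
    return {"fs": fs, "location": paths[0], "backend": request.param.get("backend", "pandas")}


def test_filesystem_demo(filesystem_demo):
    assert filesystem_demo["fs"].protocol == "memory"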
def _get_reference_state(
    time: str,
    reference_dir: str,
    communicator: fv3gfs.util.CubedSphereCommunicator,
    only_names: Iterable[str],
    tracer_metadata: Mapping,
):
    label = _time_to_label(time)
    dirname = os.path.join(reference_dir, label)

    localdir = "download"
    if MPI.COMM_WORLD.rank == 0:
        fs = fsspec.get_fs_token_paths(dirname)[0]
        fs.get(dirname, localdir, recursive=True)

    # need this for synchronization
    MPI.COMM_WORLD.barrier()
    state = fv3gfs.util.open_restart(
        localdir,
        communicator,
        label=label,
        only_names=only_names,
        tracer_properties=tracer_metadata,
    )

    # clean up the local directory
    # wait for other processes to finish using the data
    MPI.COMM_WORLD.barrier()
    if MPI.COMM_WORLD.rank == 0:
        shutil.rmtree(localdir)

    return _to_state_dataarrays(state)
def xwalk(urlpath, use_auth_token: Optional[Union[str, bool]] = None):
    """Extend `os.walk` function to support remote files.

    Args:
        urlpath (:obj:`str`): URL root path.
        use_auth_token (:obj:`bool` or :obj:`str`, optional): Whether to use a token (or the token itself) to
            authenticate on the Hugging Face Hub for private remote files.

    Yields:
        :obj:`tuple`: 3-tuple (dirpath, dirnames, filenames).
    """
    main_hop, *rest_hops = urlpath.split("::")
    if is_local_path(main_hop):
        # `return os.walk(...)` inside a generator would yield nothing, so delegate instead
        yield from os.walk(main_hop)
    else:
        # walking inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            urlpath = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
        for dirpath, dirnames, filenames in fs.walk(main_hop):
            yield "::".join([f"{fs.protocol}://{dirpath}"] + rest_hops), dirnames, filenames
def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs):
    """
    Return either a filepath string to data, or an open file object to the
    output filesystem

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, ByteIO
        Path to data or the data itself.
    mode : str
        Mode in which file is opened

    Returns
    -------
    filepath_or_buffer : str,
        Filepath string or buffer of data
    """
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options", {})
        path_or_data = os.path.expanduser(path_or_data)
        fs, _, _ = fsspec.get_fs_token_paths(
            path_or_data, mode=mode or "w", storage_options=storage_options
        )

        if not _is_local_filesystem(fs):
            filepath_or_buffer = fsspec.open(
                path_or_data, mode=mode or "w", **(storage_options)
            )
            return filepath_or_buffer

    return path_or_data
def xpathglob(path, pattern, use_auth_token: Optional[Union[str, bool]] = None):
    """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.

    Args:
        path (:obj:`~pathlib.Path`): Calling Path instance.
        pattern (:obj:`str`): Pattern that resulting paths must match.

    Yields:
        :obj:`~pathlib.Path`
    """
    posix_path = _as_posix(path)
    main_hop, *rest_hops = posix_path.split("::")
    if is_local_path(main_hop):
        yield from Path(main_hop).glob(pattern)
    else:
        # globbing inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            posix_path = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern), storage_options=storage_options)
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching,
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(xjoin(main_hop, pattern))
        for globbed_path in globbed_paths:
            yield type(path)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops))
def get_fs_path(
    urlpath_or_path: esa_safe.PathType,
    fs: T.Optional[fsspec.AbstractFileSystem] = None,
    storage_options: T.Optional[T.Dict[str, T.Any]] = None,
) -> T.Tuple[fsspec.AbstractFileSystem, str]:
    if fs is not None and storage_options is not None:
        raise TypeError("only one of 'fs' and 'storage_options' can be not None")

    if fs is None:
        fs, _, paths = fsspec.get_fs_token_paths(urlpath_or_path, storage_options=storage_options)
        if len(paths) == 0:
            raise ValueError(f"file or object not found {urlpath_or_path!r}")
        elif len(paths) > 1:
            raise ValueError(f"multiple files or objects found {urlpath_or_path!r}")
        path = paths[0]
    else:
        path = str(urlpath_or_path)

    if fs.isdir(path):
        path = os.path.join(path, "manifest.safe")

    return fs, path
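# Standalone illustration (temp files, hypothetical names; not part of the code
# above) of why get_fs_path guards against multiple resolved paths:
# fsspec.get_fs_token_paths expands glob characters in the urlpath, so a
# wildcard can resolve to more than one path.
import pathlib
import tempfile

import fsspec

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as d:
        pathlib.Path(d, "a.safe").touch()
        pathlib.Path(d, "b.safe").touch()
        fs, _, paths = fsspec.get_fs_token_paths(f"{d}/*.safe")
        print(len(paths))  # 2 -> get_fs_path would raise "multiple files or objects found"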
def href_exists(href: str) -> bool:
    """Returns true if the asset exists.

    Uses fsspec and its `exists` method:
    https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists.
    """
    fs, _, paths = fsspec.get_fs_token_paths(href)
    return bool(paths and fs.exists(paths[0]))
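# Quick usage sketch for href_exists, using fsspec's in-memory filesystem so it
# runs without network access; the "memory://" paths are only for illustration.
import fsspec

if __name__ == "__main__":
    with fsspec.open("memory://demo/item.json", "w") as f:
        f.write("{}")
    print(href_exists("memory://demo/item.json"))     # True
    print(href_exists("memory://demo/missing.json"))  # False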
def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path):
    compressed_file_paths = {"zip": zip_jsonl_path, "gzip": jsonl_gz_path}
    compressed_file_path = compressed_file_paths[protocol]
    member_file_path = "dataset.jsonl"
    path = f"{protocol}://{member_file_path}::{compressed_file_path}"
    fs, *_ = fsspec.get_fs_token_paths(path)
    assert fs.isfile(member_file_path)
    assert not fs.isfile("non_existing_" + member_file_path)
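# Self-contained sketch of what the zip fixture above presumably provides: a
# zip archive containing "dataset.jsonl". The helper and file contents here are
# illustrative, not the project's actual fixtures.
import tempfile
import zipfile

import fsspec


def make_zip_jsonl(tmp_dir: str) -> str:
    zip_path = f"{tmp_dir}/dataset.jsonl.zip"
    with zipfile.ZipFile(zip_path, "w") as zf:
        zf.writestr("dataset.jsonl", '{"col_1": 0}\n{"col_1": 1}\n')
    return zip_path


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_path = make_zip_jsonl(tmp_dir)
        fs, *_ = fsspec.get_fs_token_paths(f"zip://dataset.jsonl::{zip_path}")
        assert fs.isfile("dataset.jsonl")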
def _fs(self, name=None):
    fs, fs_token, paths = fsspec.get_fs_token_paths(
        self.url,
        storage_options=self._clean_dict(self.storage_options),
    )
    if name:
        feature_path = posixpath.join(paths[0], "feature", name)
    else:
        feature_path = posixpath.join(paths[0], "feature")
    return fs, feature_path
def from_directory(url: str) -> "ComputedDiagnosticsList":
    """Open a directory of computed diagnostics

    Args:
        url: URL to a directory containing rundirs as subdirectories. Each
            rundir contains diags.nc, metrics.json, and .mp4 files.
    """
    fs, _, _ = fsspec.get_fs_token_paths(url)
    return ComputedDiagnosticsList(detect_folders(url, fs))
def _CreateDeepDirectoryStructure(self, top_directory):
    """Creates a reasonably deep structure of subdirectories with files.

    Args:
        top_directory: The file:// path of the top level directory in
            which to create the directory structure.
    """
    # Add a few subdirectories.
    directory_names = (
        # An empty directory.
        "foo",
        # A directory with an events file (and a text file).
        "bar",
        # A deeper directory with events files.
        "bar/baz",
        # A non-empty subdir that lacks event files (should be ignored).
        "bar/quux",
        # This 3-level deep set of subdirectories tests logic that replaces
        # the full glob string with an absolute path prefix if there is
        # only 1 subdirectory in the final mapping.
        "quuz/garply",
        "quuz/garply/corge",
        "quuz/garply/grault",
        # A directory that lacks events files, but contains a subdirectory
        # with events files (first level should be ignored, second level
        # should be included).
        "waldo",
        "waldo/fred",
    )
    for directory_name in directory_names:
        path = posixpath.join(top_directory, directory_name)
        fs, _, paths = fsspec.get_fs_token_paths(path)
        fs.makedirs(paths[0])

    # Add a few files to the directory.
    file_names = (
        "a.tfevents.1",
        "model.ckpt",
        "bar/b.tfevents.1",
        "bar/red_herring.txt",
        "bar/baz/c.tfevents.1",
        "bar/baz/d.tfevents.1",
        "bar/quux/some_flume_output.txt",
        "bar/quux/some_more_flume_output.txt",
        "quuz/e.tfevents.1",
        "quuz/garply/f.tfevents.1",
        "quuz/garply/corge/g.tfevents.1",
        "quuz/garply/grault/h.tfevents.1",
        "waldo/fred/i.tfevents.1",
    )
    for file_name in file_names:
        with fsspec.open(posixpath.join(top_directory, file_name), "wb") as f:
            f.write(b"")
def load(cls, path, client=None):
    """Load up a saved workflow object from disk

    Parameters
    ----------
    path: str
        The path to load the workflow from
    client: distributed.Client, optional
        The Dask distributed client to use for multi-gpu processing and multi-node processing

    Returns
    -------
    Workflow
    """
    # avoid a circular import getting the version
    from nvtabular import __version__ as nvt_version

    fs = fsspec.get_fs_token_paths(path)[0]

    # check version information from the metadata blob, and warn if we have a mismatch
    meta = json.load(fs.open(fs.sep.join([path, "metadata.json"])))

    def parse_version(version):
        return version.split(".")[:2]

    def check_version(stored, current, name):
        if parse_version(stored) != parse_version(current):
            warnings.warn(
                f"Loading workflow generated with {name} version {stored} "
                f"- but we are running {name} {current}. This might cause issues"
            )

    # make sure we don't have any major/minor version conflicts between the stored workflow
    # and the current environment
    lib = cudf if cudf else pd
    versions = meta["versions"]
    check_version(versions["nvtabular"], nvt_version, "nvtabular")
    check_version(versions["python"], sys.version, "python")

    if lib.__name__ in versions:
        check_version(versions[lib.__name__], lib.__version__, lib.__name__)
    else:
        expected = "GPU" if "cudf" in versions else "CPU"
        warnings.warn(f"Loading workflow generated on {expected}")

    # load up the workflow object using cloudpickle
    workflow = cloudpickle.load(fs.open(fs.sep.join([path, "workflow.pkl"]), "rb"))
    workflow.client = client

    # we might have been copied since saving, update all the stat ops
    # with the new path to their storage locations
    for stat in _get_stat_ops([workflow.output_node]):
        stat.op.set_storage_path(path, copy=False)

    return workflow
def get_filepath_or_buffer(path_or_data, compression, iotypes=(BytesIO), **kwargs):
    """Return either a filepath string to data, or a memory buffer of data.
    If filepath, then the source filepath is expanded to user's environment.
    If buffer, then data is returned in-memory as bytes or a ByteIO object.

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, ByteIO
        Path to data or the data itself.
    compression : str
        Type of compression algorithm for the content
    iotypes : (), default (BytesIO)
        Object type to exclude from file-like check

    Returns
    -------
    filepath_or_buffer : str, bytes, BytesIO
        Filepath string or in-memory buffer of data
    compression : str
        Type of compression algorithm for the content
    """
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        # fsspec does not expanduser so handle here
        path_or_data = os.path.expanduser(path_or_data)

        fs, _, paths = fsspec.get_fs_token_paths(
            path_or_data, mode="rb", storage_options=storage_options
        )
        if len(paths) == 0:
            raise IOError(f"{path_or_data} could not be resolved to any files")
        elif len(paths) > 1:
            warnings.warn(
                f"`path_or_data` resolved to more than 1 file. "
                f"Only the first file {paths[0]} will be read.",
                UserWarning,
            )

        if _is_local_filesystem(fs):
            # Doing this as `read_json` accepts a json string
            # path_or_data need not be a filepath like string
            if os.path.exists(paths[0]):
                path_or_data = paths[0]
        else:
            with fs.open(paths[0]) as f:
                path_or_data = BytesIO(f.read())

    elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
        if isinstance(path_or_data, TextIOWrapper):
            path_or_data = path_or_data.buffer
        path_or_data = BytesIO(path_or_data.read())

    return path_or_data, compression
def xopen(file, mode="r", *args, **kwargs):
    """
    This function extends the builtin `open` function to support remote files using fsspec.

    It also has a retry mechanism in case connection fails.
    The args and kwargs are passed to fsspec.open, except `use_auth_token` which is used for queries
    to private repos on huggingface.co
    """
    if fsspec.get_fs_token_paths(file)[0].protocol == "https":
        kwargs["headers"] = get_authentication_headers_for_url(
            file, use_auth_token=kwargs.pop("use_auth_token", None)
        )
    file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
    _add_retries_to_file_obj_read_method(file_obj)
    return file_obj
def _create_file(self, name: str) -> fsspec.core.OpenFile:
    """Open the file that should hold the serialized contents for the table.

    Raises:
        TableExists if the underlying file already exists.
    """
    filename = self._get_filename(name, self._instance_id)
    fs, _, _ = fsspec.get_fs_token_paths(filename)
    if fs.exists(filename):
        raise TableExists(
            f'{filename} containing serialized data for table {name} already exists.'
        )
    return fsspec.open(filename, "wb")
def md5_blocks_fs(path, blocksize=1024 * 2048) -> str:
    fs, token, paths = get_fs_token_paths(path)
    if fs.isdir(path):
        DBG(f'Item is a directory and will not be hashed. {str(path)}')
        return
    try:
        hasher = md5()
        with fs.open(path, 'rb') as file:
            block = file.read(blocksize)
            while len(block) > 0:
                hasher.update(block)
                block = file.read(blocksize)
        return hasher.hexdigest()
    except Exception as error:
        logger.warning(f'Error trying to hash item: {str(path)}\nError:\n{error}')
        return
def append_segment(rundir: str, destination: str, segment_label: str, no_copy: bool):
    """Append local RUNDIR to possibly existing output at DESTINATION

    Zarr stores will be appended to in place, while all other files will be saved to
    DESTINATION/artifacts/SEGMENT_LABEL.
    """
    if not segment_label:
        segment_label = _get_initial_timestamp(rundir)

    fs, _, _ = fsspec.get_fs_token_paths(destination)

    with tempfile.TemporaryDirectory() as d_in:
        if no_copy:
            tmp_rundir = rundir
        else:
            # this copy is necessary to not destroy the input RUNDIR. Ideally,
            # append_segment could operate without making a copy or affecting RUNDIR.
            tmp_rundir = shutil.copytree(rundir, os.path.join(d_in, "rundir"))
        files = os.listdir(tmp_rundir)

        # Write a temporary artifacts dir to avoid conflict if prognostic run
        # already created an output dir named 'artifacts'
        tmp_artifacts_dir = _create_tmp_artifacts_dir(tmp_rundir, segment_label)

        for file_ in files:
            tmp_rundir_file = os.path.join(tmp_rundir, file_)
            logger.info(f"Processing {tmp_rundir_file}")
            if file_.endswith(".zarr"):
                destination_file = os.path.join(destination, file_)
                logger.info(f"Appending {tmp_rundir_file} to {destination_file}")
                append_zarr_along_time(tmp_rundir_file, destination_file, fs)
                # remove temporary local copy so not uploaded twice
                shutil.rmtree(tmp_rundir_file)
            else:
                renamed_file = os.path.join(tmp_artifacts_dir, file_)
                os.rename(tmp_rundir_file, renamed_file)

        os.rename(os.path.dirname(tmp_artifacts_dir), os.path.join(tmp_rundir, "artifacts"))

        logger.info(f"Uploading non-zarr files from {tmp_rundir} to {destination}")
        upload_dir(tmp_rundir, destination)
def is_directory(path_or_data, **kwargs):
    """Returns True if the provided filepath is a directory"""
    path_or_data = fsspec.utils.stringify_path(path_or_data)
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        path_or_data = os.path.expanduser(path_or_data)
        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode="rb", storage_options=storage_options
            )
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return True
            else:
                raise e

        return fs.isdir(path_or_data)

    return False
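# Quick demonstration (made-up scheme, not part of the code above) of the
# ValueError these handlers catch: fsspec raises "Protocol not known: ..."
# for an unregistered URL scheme.
import fsspec

if __name__ == "__main__":
    try:
        fsspec.get_fs_token_paths("unknownproto://bucket/file")
    except ValueError as e:
        print(e)  # Protocol not known: unknownproto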
def dump(url: str, variable: str, info: bool):
    fs, _, _ = fsspec.get_fs_token_paths(url)
    if not fs.exists(url):
        raise click.ClickException(f"No file or directory at {url}")

    m = fs.get_mapper(url)
    consolidated = _metadata_is_consolidated(m)
    object_, object_is_xarray = _open_with_xarray_or_zarr(m, consolidated)

    if variable is not None:
        if info:
            raise click.ClickException("Cannot use both '-v' and '-i' options")
        object_ = object_[variable]
        if not object_is_xarray:
            object_ = object_.info

    if object_is_xarray and info:
        object_.info()
    else:
        print(object_)
def ensure_single_filepath_or_buffer(path_or_data, **kwargs):
    """Return False if `path_or_data` resolves to multiple filepaths or buffers"""
    path_or_data = fsspec.utils.stringify_path(path_or_data)
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        path_or_data = os.path.expanduser(path_or_data)
        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode="rb", storage_options=storage_options
            )
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return True
            else:
                raise e

        if len(paths) > 1:
            return False
    elif isinstance(path_or_data, (list, tuple)) and len(path_or_data) > 1:
        return False

    return True
def read_directory_schema(rundir: str) -> DirectorySchema:
    """Read schema from a directory

    Does not support recursive directories or files other than zarr

    See Also:
        dump_directory_schema_to_disk
    """
    import fsspec

    fs, _, (path_with_no_prefix,) = fsspec.get_fs_token_paths(rundir)
    files = fs.ls(path_with_no_prefix)
    read_schema = compose(read_schema_from_zarr, zarr.open_group, fs.get_mapper)
    return {
        os.path.relpath(file, path_with_no_prefix): read_schema(file)
        for file in files
        if file.endswith(".zarr")
    }
def save(self, path):
    """Save this workflow to disk

    Parameters
    ----------
    path: str
        The path to save the workflow to
    """
    # avoid a circular import getting the version
    from nvtabular import __version__ as nvt_version

    fs = fsspec.get_fs_token_paths(path)[0]

    fs.makedirs(path, exist_ok=True)

    # point all stat ops to store intermediate output (parquet etc) at the path
    # this lets us easily bundle
    for stat in _get_stat_ops([self.output_node]):
        stat.op.set_storage_path(path, copy=True)

    # generate a file of all versions used to generate this bundle
    lib = cudf if cudf else pd
    with fs.open(fs.sep.join([path, "metadata.json"]), "w") as o:
        json.dump(
            {
                "versions": {
                    "nvtabular": nvt_version,
                    lib.__name__: lib.__version__,
                    "python": sys.version,
                },
                "generated_timestamp": int(time.time()),
            },
            o,
        )

    # dump out the full workflow (graph/stats/operators etc) using cloudpickle
    with fs.open(fs.sep.join([path, "workflow.pkl"]), "wb") as o:
        cloudpickle.dump(self, o)
def xpathglob(path, pattern):
    """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.

    Args:
        path (:obj:`~pathlib.Path`): Calling Path instance.
        pattern (:obj:`str`): Pattern that resulting paths must match.

    Yields:
        :obj:`~pathlib.Path`
    """
    posix_path = _as_posix(path)
    main_hop, *rest_hops = posix_path.split("::")
    if is_local_path(main_hop):
        yield from Path(main_hop).glob(pattern)
    else:
        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern))
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching,
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(xjoin(main_hop, pattern))
        for globbed_path in globbed_paths:
            yield type(path)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops))
def append_zarr_along_time(
    source_path: str, target_path: str, fs: fsspec.AbstractFileSystem, dim: str = "time"
):
    """Append local zarr store at source_path to zarr store at target_path along time.

    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide length of time
            dimension for zarr stores at source_path.

    Warning:
        The zarr store at source_path will be modified in place.
    """
    merged_time = _get_merged_time_coordinate(source_path, target_path, dim, fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])
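# A self-contained illustration (synthetic data, local temp store) of the
# append-along-time idea. It uses xarray's append_dim for brevity rather than
# the raw zarr manipulation done by append_zarr_along_time above.
import tempfile

import numpy as np
import xarray as xr

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as d:
        store = f"{d}/out.zarr"
        xr.Dataset({"a": ("time", np.arange(3))}, coords={"time": np.arange(3)}).to_zarr(store)
        xr.Dataset({"a": ("time", np.arange(3, 6))}, coords={"time": np.arange(3, 6)}).to_zarr(
            store, mode="a", append_dim="time"
        )
        print(xr.open_zarr(store).time.values)  # [0 1 2 3 4 5]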
    with ThreadPoolExecutor(max_workers=12) as pool:
        values = pool.map(maybe_get, urls)

    metadata_with_nan = dict(zip(keys_to_get, values))
    metadata = {key: val for key, val in metadata_with_nan.items() if val is not None}
    return {"zarr_consolidated_format": 1, "metadata": metadata}


def consolidate_metadata(fs, root):
    if root.rstrip("/").endswith(".zarr"):
        logger.info(f"Consolidating metadata of {root}")
        meta = _get_metadata_fs(fs, root)
        with fs.open(os.path.join(root, ".zmetadata"), "wb") as f:
            f.write(json_dumps(meta))
    elif fs.isdir(root):
        logger.info(f"Recursing {root}")
        dirs = fs.ls(root)
        for dir_ in dirs:
            consolidate_metadata(fs, dir_)


if __name__ == "__main__":
    url = sys.argv[1]
    fs, _, roots = fsspec.get_fs_token_paths(url)
    root = roots[0]
    consolidate_metadata(fs, root)
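# For a single zarr store, zarr's built-in consolidation writes the same
# ".zmetadata" key; the script above adds the recursion over a directory of
# stores. A minimal sketch (the url argument is a placeholder):
import fsspec
import zarr


def consolidate_one_store(url: str) -> None:
    # Works for local or remote stores that fsspec can map.
    zarr.consolidate_metadata(fsspec.get_mapper(url))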
def get_filepath_or_buffer(
    path_or_data,
    compression,
    mode="rb",
    iotypes=(BytesIO),
    **kwargs,
):
    """Return either a filepath string to data, or a memory buffer of data.
    If filepath, then the source filepath is expanded to user's environment.
    If buffer, then data is returned in-memory as bytes or a ByteIO object.

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, ByteIO
        Path to data or the data itself.
    compression : str
        Type of compression algorithm for the content
    mode : str
        Mode in which file is opened
    iotypes : (), default (BytesIO)
        Object type to exclude from file-like check

    Returns
    -------
    filepath_or_buffer : str, bytes, BytesIO, list
        Filepath string or in-memory buffer of data or a
        list of Filepath strings or in-memory buffers of data.
    compression : str
        Type of compression algorithm for the content
    """
    path_or_data = fsspec.utils.stringify_path(path_or_data)

    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        # fsspec does not expanduser so handle here
        path_or_data = os.path.expanduser(path_or_data)

        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode=mode, storage_options=storage_options
            )
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return path_or_data, compression
            else:
                raise e

        if len(paths) == 0:
            raise FileNotFoundError(f"{path_or_data} could not be resolved to any files")

        if _is_local_filesystem(fs):
            # Doing this as `read_json` accepts a json string
            # path_or_data need not be a filepath like string
            if os.path.exists(paths[0]):
                path_or_data = paths if len(paths) > 1 else paths[0]
        else:
            path_or_data = [BytesIO(fs.open(fpath).read()) for fpath in paths]
            if len(path_or_data) == 1:
                path_or_data = path_or_data[0]

    elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
        if isinstance(path_or_data, TextIOWrapper):
            path_or_data = path_or_data.buffer
        path_or_data = BytesIO(path_or_data.read())

    return path_or_data, compression