Example #1
def xgetsize(path, use_auth_token: Optional[Union[str, bool]] = None) -> int:
    """Extend `os.path.getsize` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`int`
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.path.getsize(path)
    else:
        if rest_hops and fsspec.get_fs_token_paths(
                rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers":
                    get_authentication_headers_for_url(
                        rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path,
                                           storage_options=storage_options)
        size = fs.size(main_hop)
        if size is None:
            # use xopen instead of fs.open to make data fetching more robust
            with xopen(path, use_auth_token=use_auth_token) as f:
                size = len(f.read())
        return size
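A quick usage sketch for the function above. The archive URL is made up; the chained "zip://…::https://…" form is the same fsspec chaining shown in Example #12.

local_size = xgetsize("/tmp/data/train.jsonl")               # local path, falls back to os.path.getsize
remote_size = xgetsize(
    "zip://train.jsonl::https://example.com/archive.zip",    # hypothetical file inside a remote zip
    use_auth_token=True,                                      # or a token string for private repos
)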
Example #2
def xlistdir(path: str,
             use_auth_token: Optional[Union[str, bool]] = None) -> List[str]:
    """Extend `os.listdir` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`list` of :obj:`str`
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.listdir(path)
    else:
        # listing files inside a zip in a private repo requires authentication
        if rest_hops and fsspec.get_fs_token_paths(
                rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers":
                    get_authentication_headers_for_url(
                        rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path,
                                           storage_options=storage_options)
        objects = fs.listdir(main_hop.split("://")[1])
        return [os.path.basename(obj["name"]) for obj in objects]
Example #3
def xisdir(path, use_auth_token: Optional[Union[str, bool]] = None) -> bool:
    """Extend `os.path.isdir` function to support remote files.

    Args:
        path (:obj:`str`): URL path.

    Returns:
        :obj:`bool`
    """
    main_hop, *rest_hops = path.split("::")
    if is_local_path(main_hop):
        return os.path.isdir(path)
    else:
        if rest_hops and fsspec.get_fs_token_paths(
                rest_hops[0])[0].protocol == "https":
            storage_options = {
                "https": {
                    "headers":
                    get_authentication_headers_for_url(
                        rest_hops[0], use_auth_token=use_auth_token)
                }
            }
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(path,
                                           storage_options=storage_options)
        return fs.isdir(main_hop)
Example #4
def xglob(urlpath, *, recursive=False, use_auth_token: Optional[Union[str, bool]] = None):
    """Extend `glob.glob` function to support remote files.

    Args:
        urlpath (:obj:`str`): URL path with shell-style wildcard patterns.
        recursive (:obj:`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
            directories or subdirectories.

    Returns:
        :obj:`list` of :obj:`str`
    """
    main_hop, *rest_hops = urlpath.split("::")
    if is_local_path(main_hop):
        return glob.glob(main_hop, recursive=recursive)
    else:
        # globbing inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            urlpath = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(main_hop)
        return ["::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops) for globbed_path in globbed_paths]
Example #5
def filesystem(request):
    path = request.param["path"]
    storage_options = request.param["storage_options"]
    storage_backend = request.param.get("backend", "pandas")

    fs, _, paths = fsspec.get_fs_token_paths(path, storage_options=storage_options)
    return {"fs": fs, "location": paths[0], "backend": storage_backend}
Example #6
def _get_reference_state(
    time: str,
    reference_dir: str,
    communicator: fv3gfs.util.CubedSphereCommunicator,
    only_names: Iterable[str],
    tracer_metadata: Mapping,
):
    label = _time_to_label(time)
    dirname = os.path.join(reference_dir, label)

    localdir = "download"

    if MPI.COMM_WORLD.rank == 0:
        fs = fsspec.get_fs_token_paths(dirname)[0]
        fs.get(dirname, localdir, recursive=True)

    # need this for synchronization
    MPI.COMM_WORLD.barrier()

    state = fv3gfs.util.open_restart(
        localdir,
        communicator,
        label=label,
        only_names=only_names,
        tracer_properties=tracer_metadata,
    )

    # clean up the local directory
    # wait for other processes to finish using the data
    MPI.COMM_WORLD.barrier()
    if MPI.COMM_WORLD.rank == 0:
        shutil.rmtree(localdir)

    return _to_state_dataarrays(state)
Example #7
def xwalk(urlpath, use_auth_token: Optional[Union[str, bool]] = None):
    """Extend `os.walk` function to support remote files.

    Args:
        urlpath (:obj:`str`): URL root path.
        use_auth_token (:obj:`bool` or :obj:`str`, optional): Whether to use the locally stored token, or an explicit
            token, to authenticate on the Hugging Face Hub for private remote files.

    Yields:
        :obj:`tuple`: 3-tuple (dirpath, dirnames, filenames).
    """
    main_hop, *rest_hops = urlpath.split("::")
    if is_local_path(main_hop):
        return os.walk(main_hop)
    else:
        # walking inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://")
                          or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(
                url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            urlpath = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(urlpath,
                                           storage_options=storage_options)
        for dirpath, dirnames, filenames in fs.walk(main_hop):
            yield "::".join([f"{fs.protocol}://{dirpath}"] +
                            rest_hops), dirnames, filenames
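A usage sketch for xwalk over the root of a remote zip archive (hypothetical URL); every yielded dirpath keeps the "::" chaining, so members can be re-opened with xopen.

for dirpath, dirnames, filenames in xwalk("zip://::https://example.com/archive.zip",
                                          use_auth_token=True):
    print(dirpath, filenames)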
Example #8
def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs):
    """
    Return either a filepath string to data,
    or an open file object to the output filesystem

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, BytesIO
        Path to data or the data itself.
    mode : str
        Mode in which file is opened

    Returns
    -------
    filepath_or_buffer : str or fsspec.core.OpenFile
        Filepath string or open file object for the output filesystem
    """
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options", {})
        path_or_data = os.path.expanduser(path_or_data)
        fs, _, _ = fsspec.get_fs_token_paths(path_or_data,
                                             mode=mode or "w",
                                             storage_options=storage_options)

        if not _is_local_filesystem(fs):
            filepath_or_buffer = fsspec.open(path_or_data,
                                             mode=mode or "w",
                                             **(storage_options))
            return filepath_or_buffer

    return path_or_data
Example #9
def xpathglob(path, pattern, use_auth_token: Optional[Union[str, bool]] = None):
    """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

    Args:
        path (:obj:`~pathlib.Path`): Calling Path instance.
        pattern (:obj:`str`): Pattern that resulting paths must match.

    Yields:
        :obj:`~pathlib.Path`
    """
    posix_path = _as_posix(path)
    main_hop, *rest_hops = posix_path.split("::")
    if is_local_path(main_hop):
        yield from Path(main_hop).glob(pattern)
    else:
        # globbing inside a zip in a private repo requires authentication
        if rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
            url = rest_hops[0]
            url, kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
            storage_options = {"https": kwargs}
            posix_path = "::".join([main_hop, url, *rest_hops[1:]])
        else:
            storage_options = None
        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern), storage_options=storage_options)
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(xjoin(main_hop, pattern))
        for globbed_path in globbed_paths:
            yield type(path)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops))
Example #10
def get_fs_path(
    urlpath_or_path: esa_safe.PathType,
    fs: T.Optional[fsspec.AbstractFileSystem] = None,
    storage_options: T.Optional[T.Dict[str, T.Any]] = None,
) -> T.Tuple[fsspec.AbstractFileSystem, str]:
    if fs is not None and storage_options is not None:
        raise TypeError(
            "only one of 'fs' and 'storage_options' can be not None")

    if fs is None:
        fs, _, paths = fsspec.get_fs_token_paths(
            urlpath_or_path, storage_options=storage_options)
        if len(paths) == 0:
            raise ValueError(f"file or object not found {urlpath_or_path!r}")
        elif len(paths) > 1:
            raise ValueError(
                f"multiple files or objects found {urlpath_or_path!r}")
        path = paths[0]
    else:
        path = str(urlpath_or_path)

    if fs.isdir(path):
        path = os.path.join(path, "manifest.safe")

    return fs, path
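An illustrative call of get_fs_path; the bucket and product name are made up, and the s3 protocol assumes s3fs is installed. When the input resolves to a directory, the returned path points at its manifest.safe.

fs, path = get_fs_path(
    "s3://example-bucket/S1A_IW_GRDH_example.SAFE",
    storage_options={"anon": True},
)
with fs.open(path, "rb") as f:
    header = f.read(64)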
Example #11
def href_exists(href: str) -> bool:
    """Returns true if the asset exists.

    Uses fsspec and its `exists` method:
    https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists.
    """
    fs, _, paths = fsspec.get_fs_token_paths(href)
    return bool(paths and fs.exists(paths[0]))
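A quick sketch of how href_exists behaves; remote protocols additionally need the matching fsspec backend installed, and the missing path below is made up.

assert href_exists(__file__)                              # this local file exists
assert not href_exists("definitely_missing_asset.tif")   # made-up local path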
Example #12
def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path):
    compressed_file_paths = {"zip": zip_jsonl_path, "gzip": jsonl_gz_path}
    compressed_file_path = compressed_file_paths[protocol]
    member_file_path = "dataset.jsonl"
    path = f"{protocol}://{member_file_path}::{compressed_file_path}"
    fs, *_ = fsspec.get_fs_token_paths(path)
    assert fs.isfile(member_file_path)
    assert not fs.isfile("non_existing_" + member_file_path)
Example #13
    def _fs(self, name=None):
        fs, fs_token, paths = fsspec.get_fs_token_paths(
            self.url,
            storage_options=self._clean_dict(self.storage_options),
        )
        if name:
            feature_path = posixpath.join(paths[0], "feature", name)
        else:
            feature_path = posixpath.join(paths[0], "feature")
        return fs, feature_path
Example #14
    def from_directory(url: str) -> "ComputedDiagnosticsList":
        """Open a directory of computed diagnostics

        Args:
            url: URL to a directory containing rundirs as subdirectories. Each
                rundir contains diags.nc, metrics.json, and .mp4 files.
        """
        fs, _, _ = fsspec.get_fs_token_paths(url)
        return ComputedDiagnosticsList(detect_folders(url, fs))
Example #15
    def _CreateDeepDirectoryStructure(self, top_directory):
        """Creates a reasonable deep structure of subdirectories with files.

        Args:
          top_directory: The file:// path of the top level directory in
            which to create the directory structure.
        """

        # Add a few subdirectories.
        directory_names = (
            # An empty directory.
            "foo",
            # A directory with an events file (and a text file).
            "bar",
            # A deeper directory with events files.
            "bar/baz",
            # A non-empty subdir that lacks event files (should be ignored).
            "bar/quux",
            # This 3-level deep set of subdirectories tests logic that replaces
            # the full glob string with an absolute path prefix if there is
            # only 1 subdirectory in the final mapping.
            "quuz/garply",
            "quuz/garply/corge",
            "quuz/garply/grault",
            # A directory that lacks events files, but contains a subdirectory
            # with events files (first level should be ignored, second level
            # should be included).
            "waldo",
            "waldo/fred",
        )
        for directory_name in directory_names:
            path = posixpath.join(top_directory, directory_name)
            fs, _, paths = fsspec.get_fs_token_paths(path)
            fs.makedirs(paths[0])

        # Add a few files to the directory.
        file_names = (
            "a.tfevents.1",
            "model.ckpt",
            "bar/b.tfevents.1",
            "bar/red_herring.txt",
            "bar/baz/c.tfevents.1",
            "bar/baz/d.tfevents.1",
            "bar/quux/some_flume_output.txt",
            "bar/quux/some_more_flume_output.txt",
            "quuz/e.tfevents.1",
            "quuz/garply/f.tfevents.1",
            "quuz/garply/corge/g.tfevents.1",
            "quuz/garply/grault/h.tfevents.1",
            "waldo/fred/i.tfevents.1",
        )
        for file_name in file_names:
            with fsspec.open(posixpath.join(top_directory, file_name),
                             "wb") as f:
                f.write(b"")
Example #16
    def load(cls, path, client=None):
        """Load up a saved workflow object from disk

        Parameters
        ----------
        path: str
            The path to load the workflow from
        client: distributed.Client, optional
            The Dask distributed client to use for multi-gpu processing and multi-node processing

        Returns
        -------
            Workflow
        """
        # avoid a circular import getting the version
        from nvtabular import __version__ as nvt_version

        fs = fsspec.get_fs_token_paths(path)[0]

        # check version information from the metadata blob, and warn if we have a mismatch
        meta = json.load(fs.open(fs.sep.join([path, "metadata.json"])))

        def parse_version(version):
            return version.split(".")[:2]

        def check_version(stored, current, name):
            if parse_version(stored) != parse_version(current):
                warnings.warn(
                    f"Loading workflow generated with {name} version {stored} "
                    f"- but we are running {name} {current}. This might cause issues"
                )

        # make sure we don't have any major/minor version conflicts between the stored workflow
        # and the current environment
        lib = cudf if cudf else pd
        versions = meta["versions"]
        check_version(versions["nvtabular"], nvt_version, "nvtabular")
        check_version(versions["python"], sys.version, "python")

        if lib.__name__ in versions:
            check_version(versions[lib.__name__], lib.__version__, lib.__name__)
        else:
            expected = "GPU" if "cudf" in versions else "CPU"
            warnings.warn(f"Loading workflow generated on {expected}")

        # load up the workflow object
        workflow = cloudpickle.load(fs.open(fs.sep.join([path, "workflow.pkl"]), "rb"))
        workflow.client = client

        # we might have been copied since saving, update all the stat ops
        # with the new path to their storage locations
        for stat in _get_stat_ops([workflow.output_node]):
            stat.op.set_storage_path(path, copy=False)

        return workflow
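An illustrative save/load round trip; the path is made up, `workflow` stands for an existing nvtabular Workflow instance, and the matching save() method is shown in Example #26 below.

workflow.save("/tmp/example_workflow")
restored = Workflow.load("/tmp/example_workflow")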
Example #17
def get_filepath_or_buffer(path_or_data,
                           compression,
                           iotypes=(BytesIO),
                           **kwargs):
    """Return either a filepath string to data, or a memory buffer of data.
    If filepath, then the source filepath is expanded to the user's environment.
    If buffer, then data is returned in-memory as bytes or a BytesIO object.

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, BytesIO
        Path to data or the data itself.
    compression : str
        Type of compression algorithm for the content
    iotypes : (), default (BytesIO)
        Object type to exclude from file-like check

    Returns
    -------
    filepath_or_buffer : str, bytes, BytesIO
        Filepath string or in-memory buffer of data
    compression : str
        Type of compression algorithm for the content
    """
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        # fsspec does not expanduser so handle here
        path_or_data = os.path.expanduser(path_or_data)
        fs, _, paths = fsspec.get_fs_token_paths(
            path_or_data, mode="rb", storage_options=storage_options)
        if len(paths) == 0:
            raise IOError(f"{path_or_data} could not be resolved to any files")
        elif len(paths) > 1:
            warnings.warn(
                f"`path_or_data` resolved to more than 1 file. "
                f"Only the first file {paths[0]} will be read.",
                UserWarning,
            )

        if _is_local_filesystem(fs):
            # Do this because `read_json` accepts a json string, so
            # path_or_data need not be a filepath-like string
            if os.path.exists(paths[0]):
                path_or_data = paths[0]
        else:
            with fs.open(paths[0]) as f:
                path_or_data = BytesIO(f.read())

    elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
        if isinstance(path_or_data, TextIOWrapper):
            path_or_data = path_or_data.buffer
        path_or_data = BytesIO(path_or_data.read())

    return path_or_data, compression
Example #18
def xopen(file, mode="r", *args, **kwargs):
    """
    This function extends the builtin `open` function to support remote files using fsspec.

    It also has a retry mechanism in case connection fails.
    The args and kwargs are passed to `fsspec.open`, except `use_auth_token`, which is used for queries to private repos on huggingface.co.
    """
    if fsspec.get_fs_token_paths(file)[0].protocol == "https":
        kwargs["headers"] = get_authentication_headers_for_url(
            file, use_auth_token=kwargs.pop("use_auth_token", None))
    file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
    _add_retries_to_file_obj_read_method(file_obj)
    return file_obj
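A short usage sketch with a hypothetical URL; the returned file object behaves like a normal binary stream, with retry-wrapped reads.

with xopen("https://example.com/data/train.jsonl", "rb", use_auth_token=True) as f:
    first_line = f.readline()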
Example #19
    def _create_file(self, name: str) -> fsspec.core.OpenFile:
        """Open the file that should hold the serialized contentes for the table.

        Raises:
            TableExists if the underlying file already exists.
        """
        filename = self._get_filename(name, self._instance_id)
        fs, _, _ = fsspec.get_fs_token_paths(filename)
        if fs.exists(filename):
            raise TableExists(
                f'{filename} containing serialized data for table {name} already exists.'
            )
        return fsspec.open(filename, "wb")
Example #20
File: file.py  Project: brl0/bripy
def md5_blocks_fs(path, blocksize=1024 * 2048) -> str:
    fs, token, paths = get_fs_token_paths(path)
    if fs.isdir(path):
        DBG(f'Item is a directory and will not be hashed.  {str(path)}')
        return
    try:
        hasher = md5()
        with fs.open(path, 'rb') as file:
            block = file.read(blocksize)
            while len(block) > 0:
                hasher.update(block)
                block = file.read(blocksize)
        return hasher.hexdigest()
    except Exception as error:
        logger.warning(
            f'Error trying to hash item: {str(path)}\nError:\n{error}')
        return
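A minimal usage sketch; any fsspec-compatible URL works the same way as a local path.

digest = md5_blocks_fs(__file__)     # hash this very file
print(digest)                        # 32-character hex digest, or None for directories and read errors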
Example #21
def append_segment(rundir: str, destination: str, segment_label: str,
                   no_copy: bool):
    """Append local RUNDIR to possibly existing output at DESTINATION
    
    Zarr stores will be appended to in place, while all other files will be saved to
    DESTINATION/artifacts/SEGMENT_LABEL.
    """
    if not segment_label:
        segment_label = _get_initial_timestamp(rundir)

    fs, _, _ = fsspec.get_fs_token_paths(destination)

    with tempfile.TemporaryDirectory() as d_in:
        if no_copy:
            tmp_rundir = rundir
        else:
            # this copy is necessary to not destroy the input RUNDIR. Ideally,
            # append_segment could operate without making a copy or affecting RUNDIR.
            tmp_rundir = shutil.copytree(rundir, os.path.join(d_in, "rundir"))
        files = os.listdir(tmp_rundir)

        # Write a temporary artifacts dir to avoid conflict if prognostic run
        # already created an output dir named 'artifacts'
        tmp_artifacts_dir = _create_tmp_artifacts_dir(tmp_rundir,
                                                      segment_label)

        for file_ in files:
            tmp_rundir_file = os.path.join(tmp_rundir, file_)
            logger.info(f"Processing {tmp_rundir_file}")
            if file_.endswith(".zarr"):
                destination_file = os.path.join(destination, file_)
                logger.info(
                    f"Appending {tmp_rundir_file} to {destination_file}")
                append_zarr_along_time(tmp_rundir_file, destination_file, fs)
                # remove temporary local copy so not uploaded twice
                shutil.rmtree(tmp_rundir_file)
            else:
                renamed_file = os.path.join(tmp_artifacts_dir, file_)
                os.rename(tmp_rundir_file, renamed_file)
        os.rename(os.path.dirname(tmp_artifacts_dir),
                  os.path.join(tmp_rundir, "artifacts"))
        logger.info(
            f"Uploading non-zarr files from {tmp_rundir} to {destination}")
        upload_dir(tmp_rundir, destination)
Example #22
def is_directory(path_or_data, **kwargs):
    """Returns True if the provided filepath is a directory
    """
    path_or_data = fsspec.utils.stringify_path(path_or_data)
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        path_or_data = os.path.expanduser(path_or_data)
        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode="rb", storage_options=storage_options)
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return True
            else:
                raise e

        return fs.isdir(path_or_data)

    return False
Example #23
def dump(url: str, variable: str, info: bool):
    fs, _, _ = fsspec.get_fs_token_paths(url)
    if not fs.exists(url):
        raise click.ClickException(f"No file or directory at {url}")

    m = fs.get_mapper(url)
    consolidated = _metadata_is_consolidated(m)
    object_, object_is_xarray = _open_with_xarray_or_zarr(m, consolidated)

    if variable is not None:
        if info:
            raise click.ClickException("Cannot use both '-v' and '-i' options")
        object_ = object_[variable]

    if not object_is_xarray:
        object_ = object_.info

    if object_is_xarray and info:
        object_.info()
    else:
        print(object_)
Example #24
def ensure_single_filepath_or_buffer(path_or_data, **kwargs):
    """Return False if `path_or_data` resolves to multiple filepaths or buffers
    """
    path_or_data = fsspec.utils.stringify_path(path_or_data)
    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        path_or_data = os.path.expanduser(path_or_data)
        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode="rb", storage_options=storage_options)
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return True
            else:
                raise e

        if len(paths) > 1:
            return False
    elif isinstance(path_or_data, (list, tuple)) and len(path_or_data) > 1:
        return False

    return True
Example #25
def read_directory_schema(rundir: str) -> DirectorySchema:
    """Read schema from a directory

    Does not support recursive directories or files other than zarr

    See Also:

        dump_directory_schema_to_disk

    """
    import fsspec

    fs, _, (path_with_no_prefix, ) = fsspec.get_fs_token_paths(rundir)
    files = fs.ls(path_with_no_prefix)

    read_schema = compose(read_schema_from_zarr, zarr.open_group,
                          fs.get_mapper)

    return {
        os.path.relpath(file, path_with_no_prefix): read_schema(file)
        for file in files if file.endswith(".zarr")
    }
Example #26
    def save(self, path):
        """Save this workflow to disk

        Parameters
        ----------
        path: str
            The path to save the workflow to
        """
        # avoid a circular import getting the version
        from nvtabular import __version__ as nvt_version

        fs = fsspec.get_fs_token_paths(path)[0]

        fs.makedirs(path, exist_ok=True)

        # point all stat ops to store intermediate output (parquet etc) at the path
        # this lets us easily bundle the stat outputs together with the saved workflow
        for stat in _get_stat_ops([self.output_node]):
            stat.op.set_storage_path(path, copy=True)

        # generate a file of all versions used to generate this bundle
        lib = cudf if cudf else pd
        with fs.open(fs.sep.join([path, "metadata.json"]), "w") as o:
            json.dump(
                {
                    "versions": {
                        "nvtabular": nvt_version,
                        lib.__name__: lib.__version__,
                        "python": sys.version,
                    },
                    "generated_timestamp": int(time.time()),
                },
                o,
            )

        # dump out the full workflow (graph/stats/operators etc) using cloudpickle
        with fs.open(fs.sep.join([path, "workflow.pkl"]), "wb") as o:
            cloudpickle.dump(self, o)
Example #27
def xpathglob(path, pattern):
    """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

    Args:
        path (:obj:`~pathlib.Path`): Calling Path instance.
        pattern (:obj:`str`): Pattern that resulting paths must match.

    Yields:
        :obj:`~pathlib.Path`
    """
    posix_path = _as_posix(path)
    main_hop, *rest_hops = posix_path.split("::")
    if is_local_path(main_hop):
        yield from Path(main_hop).glob(pattern)
    else:
        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern))
        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
        globbed_paths = fs.glob(xjoin(main_hop, pattern))
        for globbed_path in globbed_paths:
            yield type(path)("::".join([f"{fs.protocol}://{globbed_path}"] +
                                       rest_hops))
Example #28
def append_zarr_along_time(source_path: str,
                           target_path: str,
                           fs: fsspec.AbstractFileSystem,
                           dim: str = "time"):
    """Append local zarr store at source_path to zarr store at target_path along time.
    
    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide length of time
            dimension for zarr stores at source_path.

    Warning:
        The zarr store at source_path will be modified in place.
    """

    merged_time = _get_merged_time_coordinate(source_path, target_path, dim,
                                              fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])
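An illustrative call, mirroring how Example #21 drives this helper; the bucket and store names are made up.

import fsspec

fs, _, _ = fsspec.get_fs_token_paths("gs://example-bucket/run-output")
append_zarr_along_time(
    "rundir/diags.zarr",                             # local source store (modified in place)
    "gs://example-bucket/run-output/diags.zarr",     # target store to append to
    fs,
)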
Example #29
    with ThreadPoolExecutor(max_workers=12) as pool:
        values = pool.map(maybe_get, urls)

    metadata_with_nan = dict(zip(keys_to_get, values))
    metadata = {
        key: val
        for key, val in metadata_with_nan.items() if val is not None
    }
    return {"zarr_consolidated_format": 1, "metadata": metadata}


def consolidate_metadata(fs, root):
    if root.rstrip("/").endswith(".zarr"):
        logger.info(f"Consolidating metadata of {root}")
        meta = _get_metadata_fs(fs, root)

        with fs.open(os.path.join(root, ".zmetadata"), "wb") as f:
            f.write(json_dumps(meta))
    elif fs.isdir(root):
        logger.info(f"Recursing {root}")
        dirs = fs.ls(root)
        for dir_ in dirs:
            consolidate_metadata(fs, dir_)


if __name__ == "__main__":
    url = sys.argv[1]
    fs, _, roots = fsspec.get_fs_token_paths(url)
    root = roots[0]
    consolidate_metadata(fs, root)
Example #30
def get_filepath_or_buffer(
        path_or_data,
        compression,
        mode="rb",
        iotypes=(BytesIO),
        **kwargs,
):
    """Return either a filepath string to data, or a memory buffer of data.
    If filepath, then the source filepath is expanded to the user's environment.
    If buffer, then data is returned in-memory as bytes or a BytesIO object.

    Parameters
    ----------
    path_or_data : str, file-like object, bytes, BytesIO
        Path to data or the data itself.
    compression : str
        Type of compression algorithm for the content
    mode : str
        Mode in which file is opened
    iotypes : (), default (BytesIO)
        Object type to exclude from file-like check

    Returns
    -------
    filepath_or_buffer : str, bytes, BytesIO, list
        Filepath string or in-memory buffer of data or a
        list of Filepath strings or in-memory buffers of data.
    compression : str
        Type of compression algorithm for the content
    """
    path_or_data = fsspec.utils.stringify_path(path_or_data)

    if isinstance(path_or_data, str):
        storage_options = kwargs.get("storage_options")
        # fsspec does not expanduser so handle here
        path_or_data = os.path.expanduser(path_or_data)

        try:
            fs, _, paths = fsspec.get_fs_token_paths(
                path_or_data, mode=mode, storage_options=storage_options)
        except ValueError as e:
            if str(e).startswith("Protocol not known"):
                return path_or_data, compression
            else:
                raise e

        if len(paths) == 0:
            raise FileNotFoundError(
                f"{path_or_data} could not be resolved to any files")

        if _is_local_filesystem(fs):
            # Do this because `read_json` accepts a json string, so
            # path_or_data need not be a filepath-like string
            if os.path.exists(paths[0]):
                path_or_data = paths if len(paths) > 1 else paths[0]

        else:
            path_or_data = [BytesIO(fs.open(fpath).read()) for fpath in paths]
            if len(path_or_data) == 1:
                path_or_data = path_or_data[0]

    elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
        if isinstance(path_or_data, TextIOWrapper):
            path_or_data = path_or_data.buffer
        path_or_data = BytesIO(path_or_data.read())

    return path_or_data, compression
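An illustrative call of the helper above (the glob is made up): local matches come back as filepaths, remote matches as BytesIO buffers, and multiple matches as a list.

data, compression = get_filepath_or_buffer("/tmp/data/part-*.csv", compression=None)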