Example #1
    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)

        if not path.startswith(self.target_protocol):
            store_path = self.target_protocol + "://" + path
        else:
            store_path = path
        path = self.fs._strip_protocol(store_path)
        if "r" not in mode:
            return LocalTempFile(self, path, mode=mode)
        fn = self._check_file(path)
        if fn:
            return open(fn, mode)

        sha = hash_name(path, self.same_names)
        fn = os.path.join(self.storage[-1], sha)
        logger.debug("Copying %s to local cache" % path)
        kwargs["mode"] = mode

        self._mkcache()
        with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
            if isinstance(f, AbstractBufferedFile):
                # want no type of caching if just downloading whole thing
                f.cache = BaseCache(0, f.cache.fetcher, f.size)
            if getattr(f, "blocksize", 0) and f.size:
                # opportunity to parallelise here (if not compressed)
                if self.compression:
                    comp = (
                        infer_compression(path)
                        if self.compression == "infer"
                        else self.compression
                    )
                    f = compr[comp](f, mode="rb")
                data = True
                while data:
                    data = f.read(f.blocksize)
                    f2.write(data)
            else:
                # this only applies to HTTP, should instead use streaming
                if self.compression:
                    comp = (
                        infer_compression(path)
                        if self.compression == "infer"
                        else self.compression
                    )
                    f = compr[comp](f, mode="rb")
                f2.write(f.read())
        return self._open(path, mode)
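The cache implementations above all rely on the same two-step lookup: fsspec.utils.infer_compression maps a filename suffix to a codec name, and fsspec.compression.compr maps that name to a callable that wraps a raw byte stream. A minimal standalone sketch of that pattern (the file name example.csv.gz is only an illustration):

import gzip

from fsspec.compression import compr
from fsspec.utils import infer_compression

# Write a small gzip file to stand in for the remote object a cache would copy.
with open("example.csv.gz", "wb") as f:
    f.write(gzip.compress(b"a,b\n1,2\n"))

comp = infer_compression("example.csv.gz")    # "gzip", inferred from the suffix
with open("example.csv.gz", "rb") as raw:
    stream = compr[comp](raw, mode="rb")      # same lookup as compr[comp](f, mode="rb") above
    print(stream.read())                      # b"a,b\n1,2\n"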
Example #2
    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)

        if not path.startswith(self.target_protocol):
            store_path = self.target_protocol + "://" + path
        else:
            store_path = path
        path = self.fs._strip_protocol(store_path)
        if "r" not in mode:
            return self.fs._open(path, mode=mode, **kwargs)
        detail, fn = self._check_file(store_path)
        if detail:
            hash, blocks = detail["fn"], detail["blocks"]
            if blocks is True:
                logger.debug("Opening local copy of %s" % path)
                return open(fn, mode)
            else:
                raise ValueError(
                    "Attempt to open partially cached file %s"
                    "as a wholly cached file" % path
                )
        else:
            hash = hash_name(path, self.same_names)
            fn = os.path.join(self.storage[-1], hash)
            blocks = True
            detail = {
                "fn": hash,
                "blocks": blocks,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self.cached_files[-1][store_path] = detail
            logger.debug("Copying %s to local cache" % path)
        kwargs["mode"] = mode

        # call the target filesystem's open
        # TODO: why not just use fs.get ??
        f = self.fs._open(path, **kwargs)
        if self.compression:
            comp = (
                infer_compression(path)
                if self.compression == "infer"
                else self.compression
            )
            f = compr[comp](f, mode="rb")
        with open(fn, "wb") as f2:
            if isinstance(f, AbstractBufferedFile):
                # want no type of caching if just downloading whole thing
                f.cache = BaseCache(0, f.cache.fetcher, f.size)
            if getattr(f, "blocksize", 0) and f.size:
                # opportunity to parallelise here
                data = True
                while data:
                    data = f.read(f.blocksize)
                    f2.write(data)
            else:
                # this only applies to HTTP, should instead use streaming
                f2.write(f.read())
        self.save_cache()
        return self._open(path, mode)
Example #3
    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)

        if "r" not in mode:
            return LocalTempFile(self, path, mode=mode)
        fn = self._check_file(path)
        if fn:
            return open(fn, mode)

        sha = self.hash_name(path, self.same_names)
        fn = os.path.join(self.storage[-1], sha)
        logger.debug("Copying %s to local cache" % path)
        kwargs["mode"] = mode

        self._mkcache()
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                comp = (infer_compression(path)
                        if self.compression == "infer" else self.compression)
                f = compr[comp](f, mode="rb")
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get(path, fn)
        return self._open(path, mode)
Example #4
    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)
        if "r" not in mode:
            return self.fs._open(path, mode=mode, **kwargs)
        detail = self._check_file(path)
        if detail:
            detail, fn = detail
            _, blocks = detail["fn"], detail["blocks"]
            if blocks is True:
                logger.debug("Opening local copy of %s" % path)

                # In order to support downstream filesystems to be able to
                # infer the compression from the original filename, like
                # the `TarFileSystem`, let's extend the `io.BufferedReader`
                # fileobject protocol by adding a dedicated attribute
                # `original`.
                f = open(fn, mode)
                f.original = detail.get("original")
                return f
            else:
                raise ValueError(
                    "Attempt to open partially cached file %s"
                    "as a wholly cached file" % path
                )
        else:
            fn = self._make_local_details(path)
        kwargs["mode"] = mode

        # call the target filesystem's open
        self._mkcache()
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                comp = (
                    infer_compression(path)
                    if self.compression == "infer"
                    else self.compression
                )
                f = compr[comp](f, mode="rb")
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2 ** 20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get(path, fn)
        self.save_cache()
        return self._open(path, mode)
Example #5
    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)
        if "r" not in mode:
            return self.fs._open(path, mode=mode, **kwargs)
        detail = self._check_file(path)
        if detail:
            detail, fn = detail
            hash, blocks = detail["fn"], detail["blocks"]
            if blocks is True:
                logger.debug("Opening local copy of %s" % path)
                return open(fn, mode)
            else:
                raise ValueError("Attempt to open partially cached file %s"
                                 "as a wholly cached file" % path)
        else:
            hash = self.hash_name(path, self.same_names)
            fn = os.path.join(self.storage[-1], hash)
            detail = {
                "fn": hash,
                "blocks": True,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self.cached_files[-1][path] = detail
            logger.debug("Copying %s to local cache" % path)
        kwargs["mode"] = mode

        # call the target filesystem's open
        self._mkcache()
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                comp = (infer_compression(path)
                        if self.compression == "infer" else self.compression)
                f = compr[comp](f, mode="rb")
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get(path, fn)
        self.save_cache()
        return self._open(path, mode)
Example #6
def _byte_block_counts(
    urlpath,
    blocksize,
    lineterminator=None,
    compression="infer",
    storage_options=None,
    **kwargs,
):
    """Return a list of paths and block counts.

    Logic copied from dask.bytes.read_bytes
    """

    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"

    if compression == "infer":
        paths = get_fs_token_paths(urlpath,
                                   mode="rb",
                                   storage_options=storage_options)[2]
        compression = infer_compression(paths[0])

    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        blocksize = None

    b_out = read_bytes(
        urlpath,
        delimiter=lineterminator.encode(),
        blocksize=blocksize,
        sample=False,
        compression=compression,
        include_path=True,
        **(storage_options or {}),
    )
    _, values, paths = b_out

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    return paths, [len(v) for v in values]
Example #7
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs
    ):
        """Wrap the target _open

        If the whole file exists in the cache, just open it locally and
        return that.

        Otherwise, open the file on the target FS, and make it have a mmap
        cache pointing to the location which we determine, in our cache.
        The ``blocks`` instance is shared, so as the mmap cache instance
        updates, so does the entry in our ``cached_files`` attribute.
        We monkey-patch this file, so that when it closes, we call
        ``close_and_update`` to save the state of the blocks.
        """
        path = self._strip_protocol(path)

        if not path.startswith(self.target_protocol):
            store_path = self.target_protocol + "://" + path
        else:
            store_path = path
        path = self.fs._strip_protocol(store_path)
        if "r" not in mode:
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs
            )
        detail, fn = self._check_file(store_path)
        if detail:
            # file is in cache
            hash, blocks = detail["fn"], detail["blocks"]
            if blocks is True:
                # stored file is complete
                logger.debug("Opening local copy of %s" % path)
                return open(fn, mode)
            # TODO: action where partial file exists in read-only cache
            logger.debug("Opening partially cached copy of %s" % path)
        else:
            hash = hash_name(path, self.same_names)
            fn = os.path.join(self.storage[-1], hash)
            blocks = set()
            detail = {
                "fn": hash,
                "blocks": blocks,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self.cached_files[-1][store_path] = detail
            logger.debug("Creating local sparse file for %s" % path)

        # call the target filesystem's open
        f = self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            cache_type=None,
            **kwargs
        )
        if self.compression:
            comp = (
                infer_compression(path)
                if self.compression == "infer"
                else self.compression
            )
            f = compr[comp](f, mode="rb")
        if "blocksize" in detail:
            if detail["blocksize"] != f.blocksize:
                raise ValueError(
                    "Cached file must be reopened with same block"
                    "size as original (old: %i, new %i)"
                    "" % (detail["blocksize"], f.blocksize)
                )
        else:
            detail["blocksize"] = f.blocksize
        f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
        close = f.close
        f.close = lambda: self.close_and_update(f, close)
        self.save_cache()
        return f
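The docstring in Example #7 explains the mechanism: a file opened for reading on the target filesystem gets an MMapCache backed by a local sparse file, and the blocks set it shares with cached_files is saved when the file closes. A hedged sketch of driving this through the public fsspec entry point (the URL and cache directory are placeholders; "blockcache" selects this sparse, block-level cache, while "filecache" selects the whole-file variant from the earlier examples):

import fsspec

fs = fsspec.filesystem(
    "blockcache",
    target_protocol="https",              # protocol of the remote filesystem
    cache_storage="/tmp/fsspec-cache",    # hypothetical local cache directory
)
# The first read fetches only the blocks actually touched; reopening with the
# same block size reuses the sparse local copy.
with fs.open("https://example.com/data.csv") as f:   # placeholder URL
    header = f.read(1024)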
Example #8
def test_infer_compression():
    assert infer_compression('fn.zip') == 'zip'
    assert infer_compression('fn.gz') == 'gzip'
    assert infer_compression('fn.unknown') is None
Example #9
def read_pandas(
    reader,
    urlpath,
    blocksize="default",
    lineterminator=None,
    compression="infer",
    sample=256000,
    enforce=False,
    assume_missing=False,
    storage_options=None,
    include_path_column=False,
    **kwargs,
):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"
    if include_path_column and isinstance(include_path_column, bool):
        include_path_column = "path"
    if "index" in kwargs or "index_col" in kwargs:
        raise ValueError("Keywords 'index' and 'index_col' not supported. "
                         "Use dd.{0}(...).set_index('my-index') "
                         "instead".format(reader_name))
    for kw in ["iterator", "chunksize"]:
        if kw in kwargs:
            raise ValueError("{0} not supported for dd.{1}".format(
                kw, reader_name))
    if kwargs.get("nrows", None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get("skiprows"), int):
        skiprows = lastskiprow = firstrow = kwargs.get("skiprows")
    elif kwargs.get("skiprows") is None:
        skiprows = lastskiprow = firstrow = 0
    else:
        # When skiprows is a list, we expect more than max(skiprows) to
        # be included in the sample. This means that [0,2] will work well,
        # but [0, 440] might not work.
        skiprows = set(kwargs.get("skiprows"))
        lastskiprow = max(skiprows)
        # find the firstrow that is not skipped, for use as header
        firstrow = min(set(range(len(skiprows) + 1)) - set(skiprows))
    if isinstance(kwargs.get("header"), list):
        raise TypeError(
            "List of header rows not supported for dd.{0}".format(reader_name))
    if isinstance(kwargs.get("converters"), dict) and include_path_column:
        path_converter = kwargs.get("converters").get(include_path_column,
                                                      None)
    else:
        path_converter = None

    # If compression is "infer", inspect the (first) path suffix and
    # set the proper compression option if the suffix is recognized.
    if compression == "infer":
        # Translate the input urlpath to a simple path list
        paths = get_fs_token_paths(urlpath,
                                   mode="rb",
                                   storage_options=storage_options)[2]

        # Infer compression from first path
        compression = infer_compression(paths[0])

    if blocksize == "default":
        blocksize = AUTO_BLOCKSIZE
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        # NONE of the compressions should use chunking
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in compr:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)
    if blocksize and sample and blocksize < sample and lastskiprow != 0:
        warn("Unexpected behavior can result from passing skiprows when\n"
             "blocksize is smaller than sample size.\n"
             "Setting ``sample=blocksize``")
        sample = blocksize
    b_lineterminator = lineterminator.encode()
    b_out = read_bytes(
        urlpath,
        delimiter=b_lineterminator,
        blocksize=blocksize,
        sample=sample,
        compression=compression,
        include_path=include_path_column,
        **(storage_options or {}),
    )

    if include_path_column:
        b_sample, values, paths = b_out
        path = (include_path_column, path_converter)
    else:
        b_sample, values = b_out
        path = None

    if not isinstance(values[0], (tuple, list)):
        values = [values]
    # If we have not sampled, then use the first row of the first values
    # as a representative sample.
    if b_sample is False and len(values[0]):
        b_sample = values[0][0].compute()

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    names = kwargs.get("names", None)
    header = kwargs.get("header", "infer" if names is None else None)
    need = 1 if header is None else 2
    parts = b_sample.split(b_lineterminator, lastskiprow + need)
    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if sample is not False and nparts < lastskiprow + need and len(
            b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    header = b"" if header is None else parts[firstrow] + b_lineterminator

    # Use sample to infer dtypes and check for presence of include_path_column
    head = reader(BytesIO(b_sample), **kwargs)
    if include_path_column and (include_path_column in head.columns):
        raise ValueError("Files already contain the column name: %s, so the "
                         "path column cannot use this name. Please set "
                         "`include_path_column` to a unique name." %
                         include_path_column)

    specified_dtypes = kwargs.get("dtype", {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    values = [[dsk.dask.values() for dsk in block] for block in values]

    return text_blocks_to_pandas(
        reader,
        values,
        header,
        head,
        kwargs,
        enforce=enforce,
        specified_dtypes=specified_dtypes,
        path=path,
        blocksize=blocksize,
    )
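read_pandas is the engine behind the public dask.dataframe readers; when compression is inferred from the file suffix it forces blocksize=None, so each compressed file becomes a single partition. A hedged usage sketch (the glob is a placeholder):

import dask.dataframe as dd

# ".gz" makes compression="infer" resolve to gzip, which cannot be split,
# so passing blocksize=None avoids the warning emitted above.
df = dd.read_csv("2015-*.csv.gz", blocksize=None)
print(df.npartitions)   # one partition per matched file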
Example #10
def read_bytes(
    urlpath,
    delimiter=None,
    not_zero=False,
    blocksize="128 MiB",
    sample="10 kiB",
    compression=None,
    include_path=False,
    **kwargs,
):
    """Given a path or paths, return delayed objects that read from those paths.

    The path may be a filename like ``'2015-01-01.csv'`` or a globstring
    like ``'2015-*-*.csv'``.

    The path may be preceded by a protocol, like ``s3://`` or ``hdfs://`` if
    those libraries are installed.

    This cleanly breaks data by a delimiter if given, so that block boundaries
    start directly after a delimiter and end on the delimiter.

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    delimiter : bytes
        An optional delimiter, like ``b'\\n'`` on which to split blocks of
        bytes.
    not_zero : bool
        Force seek of start-of-file delimiter, discarding header.
    blocksize : int, str
        Chunk size in bytes, defaults to "128 MiB"
    compression : string or None
        String like 'gzip' or 'xz'.  Must support efficient random access.
    sample : int, string, or boolean
        Whether or not to return a header sample.
        Values can be ``False`` for "no sample requested",
        or an integer or string value like ``2**20`` or ``"1 MiB"``
    include_path : bool
        Whether or not to include the path with the bytes representing a particular file.
        Default is False.
    **kwargs : dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> sample, blocks = read_bytes('2015-*-*.csv', delimiter=b'\\n')  # doctest: +SKIP
    >>> sample, blocks = read_bytes('s3://bucket/2015-*-*.csv', delimiter=b'\\n')  # doctest: +SKIP
    >>> sample, paths, blocks = read_bytes('2015-*-*.csv', include_path=True)  # doctest: +SKIP

    Returns
    -------
    sample : bytes
        The sample header
    blocks : list of lists of ``dask.Delayed``
        Each list corresponds to a file, and each delayed object computes to a
        block of bytes from that file.
    paths : list of strings, only included if include_path is True
        List of same length as blocks, where each item is the path to the file
        represented in the corresponding block.

    """
    if not isinstance(urlpath, (str, list, tuple, os.PathLike)):
        raise TypeError("Path should be a string, os.PathLike, list or tuple")

    fs, fs_token, paths = get_fs_token_paths(urlpath,
                                             mode="rb",
                                             storage_options=kwargs)

    if len(paths) == 0:
        raise OSError("%s resolved to no files" % urlpath)

    if blocksize is not None:
        if isinstance(blocksize, str):
            blocksize = parse_bytes(blocksize)
        if not is_integer(blocksize):
            raise TypeError("blocksize must be an integer")
        blocksize = int(blocksize)

    if blocksize is None:
        offsets = [[0]] * len(paths)
        lengths = [[None]] * len(paths)
    else:
        offsets = []
        lengths = []
        for path in paths:
            if compression == "infer":
                comp = infer_compression(path)
            else:
                comp = compression
            if comp is not None:
                raise ValueError(
                    "Cannot do chunked reads on compressed files. "
                    "To read, set blocksize=None")
            size = fs.info(path)["size"]
            if size is None:
                raise ValueError(
                    "Backing filesystem couldn't determine file size, cannot "
                    "do chunked reads. To read, set blocksize=None.")
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            if not_zero:
                off[0] = 1
                length[0] -= 1
            offsets.append(off)
            lengths.append(length)

    delayed_read = delayed(read_block_from_file)

    out = []
    for path, offset, length in zip(paths, offsets, lengths):
        token = tokenize(fs_token, delimiter, path, fs.ukey(path), compression,
                         offset)
        keys = [f"read-block-{o}-{token}" for o in offset]
        values = [
            delayed_read(
                OpenFile(fs, path, compression=compression),
                o,
                l,
                delimiter,
                dask_key_name=key,
            ) for o, key, l in zip(offset, keys, length)
        ]
        out.append(values)

    if sample:
        if sample is True:
            sample = "10 kiB"  # backwards compatibility
        if isinstance(sample, str):
            sample = parse_bytes(sample)
        with OpenFile(fs, paths[0], compression=compression) as f:
            # read block without seek (because we start at zero)
            if delimiter is None:
                sample = f.read(sample)
            else:
                sample_buff = f.read(sample)
                while True:
                    new = f.read(sample)
                    if not new:
                        break
                    if delimiter in new:
                        sample_buff = (sample_buff +
                                       new.split(delimiter, 1)[0] + delimiter)
                        break
                    sample_buff = sample_buff + new
                sample = sample_buff
    if include_path:
        return sample, out, paths
    return sample, out
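Since chunked reads are refused for compressed inputs, read_bytes is typically called with blocksize=None when compression is inferred; each file then yields a single delayed block. A hedged sketch (the glob is a placeholder):

from dask.bytes import read_bytes

# compression="infer" resolves the codec per path via infer_compression;
# blocksize=None is required because gzip streams cannot be split.
sample, blocks = read_bytes(
    "logs-*.csv.gz",
    delimiter=b"\n",
    blocksize=None,
    compression="infer",
)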
Example #11
def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)

    if isinstance(path, list):
        filenames = path
    elif isinstance(path, str):
        filenames = sorted(glob(path))
    elif hasattr(path, "__fspath__"):
        filenames = sorted(glob(path.__fspath__()))
    else:
        raise TypeError(f"Path type not understood:{type(path)}")

    if not filenames:
        msg = f"A file in: {filenames} does not exist."
        raise FileNotFoundError(msg)

    name = "read-csv-" + tokenize(path, tokenize, **
                                  kwargs)  # TODO: get last modified time

    compression = kwargs.get("compression", "infer")

    if compression == "infer":
        # Infer compression from first path by default
        compression = infer_compression(filenames[0])

    if compression and chunksize:
        # compressed CSVs reading must read the entire file
        kwargs.pop("byte_range", None)
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``chunksize=None to remove this message``\n"
             "Setting ``chunksize=(size of file)``" % compression)
        chunksize = None

    if chunksize is None:
        return read_csv_without_chunksize(path, **kwargs)

    # Let dask.dataframe generate meta
    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
    kwargs1 = kwargs.copy()
    usecols = kwargs1.pop("usecols", None)
    dtype = kwargs1.pop("dtype", None)
    meta = dask_reader(filenames[0], **kwargs1)._meta
    names = meta.columns
    if usecols or dtype:
        # Regenerate meta with original kwargs if
        # `usecols` or `dtype` was specified
        meta = dask_reader(filenames[0], **kwargs)._meta

    dsk = {}
    i = 0
    dtypes = meta.dtypes.values

    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            kwargs2["byte_range"] = (
                start,
                chunksize,
            )  # specify which chunk of the file we care about
            if start != 0:
                kwargs2["names"] = names  # no header in the middle of the file
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)

            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #12
def test_infer_custom_compression():
    """Inferred compression gets values from fsspec.compression.compr."""
    assert infer_compression("fn.zip") == "zip"
    assert infer_compression("fn.gz") == "gzip"
    assert infer_compression("fn.unknown") is None
    assert infer_compression("fn.test_custom") is None
    assert infer_compression("fn.tst") is None

    register_compression("test_custom", lambda f, **kwargs: f, "tst")

    try:
        assert infer_compression("fn.zip") == "zip"
        assert infer_compression("fn.gz") == "gzip"
        assert infer_compression("fn.unknown") is None
        assert infer_compression("fn.test_custom") is None
        assert infer_compression("fn.tst") == "test_custom"

        # Duplicate registration in name or extension raises a value error.
        with pytest.raises(ValueError):
            register_compression("test_custom", lambda f, **kwargs: f, "tst")

        with pytest.raises(ValueError):
            register_compression("test_conflicting", lambda f, **kwargs: f, "tst")
        assert "test_conflicting" not in compr

        # ...but can be forced.
        register_compression(
            "test_conflicting", lambda f, **kwargs: f, "tst", force=True
        )
        assert infer_compression("fn.zip") == "zip"
        assert infer_compression("fn.gz") == "gzip"
        assert infer_compression("fn.unknown") is None
        assert infer_compression("fn.test_custom") is None
        assert infer_compression("fn.tst") == "test_conflicting"

    finally:
        del compr["test_custom"]
        del compr["test_conflicting"]
        del compressions["tst"]
Example #13
def test_lzma_compression_name():
    pytest.importorskip("lzma")
    assert infer_compression("fn.xz") == "xz"
Example #14
    def __init__(self,
                 fo="",
                 index_store=None,
                 storage_options=None,
                 compression=None,
                 **kwargs):
        super().__init__(**kwargs)

        if isinstance(fo, str):
            fo = fsspec.open(fo, **(storage_options or {})).open()

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Amended io.BufferedReader or similar.
                # This uses a "protocol extension" where original filenames are
                # propagated to archive-like filesystems in order to let them
                # infer the right compression appropriately.
                if hasattr(fo, "original"):
                    name = fo.original

                # fsspec.LocalFileOpener
                elif hasattr(fo, "path"):
                    name = fo.path

                # io.BufferedReader
                elif hasattr(fo, "name"):
                    name = fo.name

                # fsspec.AbstractFileSystem
                elif hasattr(fo, "info"):
                    name = fo.info()["name"]

            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(
                    f"Inferred compression {compression} from file name {name}"
                )

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "r:gz",
            #  but would seeking to an offset in the file still work then?
            fo = compr[compression](fo)

        self._fo_ref = fo
        weakref.finalize(self, fo.close)
        self.fo = fo.__enter__()  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()
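The constructor above works hardest when it only has a file object: it walks through original, path, name and info() to recover a filename it can hand to infer_compression. A hedged end-to-end sketch (archive and member names are illustrative) that exercises the inference from a ".tar.gz" suffix:

import io
import tarfile

import fsspec

# Build a small gzipped archive so there is a ".tar.gz" name to infer from.
payload = b"hello from inside the tar\n"
info = tarfile.TarInfo("member.txt")
info.size = len(payload)
with tarfile.open("archive.tar.gz", "w:gz") as tar:
    tar.addfile(info, io.BytesIO(payload))

# The local opener exposes the original name, so compression=None is inferred
# as "gzip" and the stream is wrapped before tarfile reads it.
fs = fsspec.filesystem("tar", fo="archive.tar.gz")
with fs.open("member.txt") as f:
    print(f.read())   # b"hello from inside the tar\n"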
Example #15
def test_infer_uppercase_compression():
    assert infer_compression("fn.ZIP") == "zip"
    assert infer_compression("fn.GZ") == "gzip"
    assert infer_compression("fn.UNKNOWN") is None
    assert infer_compression("fn.TEST_UPPERCASE") is None
    assert infer_compression("fn.TEST") is None