Example 1
def test_openfile_open(m):
    of = OpenFile(m, "somepath", mode="wt")
    f = of.open()
    f.write("hello")
    assert m.size("somepath") == 0  # no flush yet
    del of
    assert m.size("somepath") == 0  # still no flush
    f.close()
    assert m.size("somepath") == 5
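For context, OpenFile is more commonly driven as a context manager than by calling open()/close() by hand; entering the block opens the buffer and leaving it flushes and closes. A minimal sketch, assuming fsspec's in-memory filesystem (the path is illustrative only):

import fsspec
from fsspec.core import OpenFile

m = fsspec.filesystem("memory")
with OpenFile(m, "/somepath", mode="wt") as f:
    f.write("hello")
assert m.size("/somepath") == 5  # flushed and visible once the block exits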
Example 2
def test_openfile_api(m):
    m.open("somepath", "wb").write(b"data")
    of = OpenFile(m, "somepath")
    assert str(of) == "<OpenFile 'somepath'>"
    f = of.open()
    assert f.read() == b"data"
    f.close()
    with OpenFile(m, "somepath", mode="rt") as f:
        assert f.read() == "data"
Example 3
def test_openfile_api(m):
    m.open('somepath', 'wb').write(b'data')
    of = OpenFile(m, 'somepath')
    assert str(of) == "<OpenFile 'somepath'>"
    f = of.open()
    assert f.read() == b'data'
    f.close()
    with OpenFile(m, 'somepath', mode='rt') as f:
        assert f.read() == 'data'
Example 4
def test_compressions(fmt, mode, tmpdir):
    fn = os.path.join(tmpdir, '.tmp.getsize')
    fs = LocalFileSystem()
    f = OpenFile(fs, fn, compression=fmt, mode='wb')
    data = b'Long line of readily compressible text'
    with f as fo:
        fo.write(data)
    if fmt is None:
        assert fs.size(fn) == len(data)
    else:
        assert fs.size(fn) != len(data)

    f = OpenFile(fs, fn, compression=fmt, mode=mode)
    with f as fo:
        if mode == 'rb':
            assert fo.read() == data
        else:
            assert fo.read() == data.decode()
Example 5
def test_compressions(fmt, mode, tmpdir):
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, ".tmp.getsize")
    fs = LocalFileSystem()
    f = OpenFile(fs, fn, compression=fmt, mode="wb")
    data = b"Long line of readily compressible text"
    with f as fo:
        fo.write(data)
    if fmt is None:
        assert fs.size(fn) == len(data)
    else:
        assert fs.size(fn) != len(data)

    f = OpenFile(fs, fn, compression=fmt, mode=mode)
    with f as fo:
        if mode == "rb":
            assert fo.read() == data
        else:
            assert fo.read() == data.decode()
Example 6
def test_compressions(fmt, mode, tmpdir):
    if fmt == "zip" and sys.version_info < (3, 6):
        pytest.xfail("zip compression requires python3.6 or higher")

    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, ".tmp.getsize")
    fs = LocalFileSystem()
    f = OpenFile(fs, fn, compression=fmt, mode="wb")
    data = b"Long line of readily compressible text"
    with f as fo:
        fo.write(data)
    if fmt is None:
        assert fs.size(fn) == len(data)
    else:
        assert fs.size(fn) != len(data)

    f = OpenFile(fs, fn, compression=fmt, mode=mode)
    with f as fo:
        if mode == "rb":
            assert fo.read() == data
        else:
            assert fo.read() == data.decode()
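Outside the test harness, the same compressed round trip looks like this. A minimal sketch, assuming fsspec's LocalFileSystem and its built-in "gzip" codec (the file name is illustrative only):

import os
import tempfile

from fsspec.core import OpenFile
from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
fn = os.path.join(tempfile.mkdtemp(), "example.gz")
data = b"Long line of readily compressible text"

# Write through the gzip codec, then read the bytes back transparently.
with OpenFile(fs, fn, mode="wb", compression="gzip") as f:
    f.write(data)
with OpenFile(fs, fn, mode="rb", compression="gzip") as f:
    assert f.read() == data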
Example 7
def test_filesystem_cached(recipe, tmpdir):
    """
    Run tests through a real, cached, fsspec filesystem implementation.
    Here: `TarFileSystem` over `WholeFileCacheFileSystem` over `LocalFileSystem`.
    """

    filename = os.path.join(tmpdir, f'temp{recipe["suffix"]}')

    # Create a filesystem from test fixture.
    fs = fsspec.filesystem("file")
    f = OpenFile(fs, filename, mode="wb")

    with temptar(archive_data, mode=recipe["mode"],
                 suffix=recipe["suffix"]) as tf:
        with f as fo:
            fo.write(open(tf, "rb").read())

    # Verify that the tar archive has the correct compression.
    with open(filename, "rb") as raw:
        assert raw.read()[:10].startswith(recipe["magic"])

    # Access cached filesystem.
    cachedir = tempfile.mkdtemp()
    filesystem = WholeFileCacheFileSystem(fs=fs, cache_storage=cachedir)

    # Verify the cache is empty beforehand.
    assert os.listdir(cachedir) == []

    # Verify content of a sample file.
    with filesystem.open(filename) as resource:
        tarfs = fsspec.filesystem("tar", fo=resource)
        assert tarfs.cat("b") == b"hello"

    # Verify the cache is populated afterwards.
    assert len(os.listdir(cachedir)) == 2

    # Verify that the cache is empty after clearing it.
    filesystem.clear_cache()
    assert os.listdir(cachedir) == []

    filesystem.clear_cache()
    shutil.rmtree(cachedir)
Example 8
def test_filesystem_direct(recipe, tmpdir):
    """
    Run tests through a real fsspec filesystem implementation.
    Here: `LocalFileSystem`.
    """

    filename = os.path.join(tmpdir, f'temp{recipe["suffix"]}')

    fs = fsspec.filesystem("file")
    f = OpenFile(fs, filename, mode="wb")

    with temptar(archive_data, mode=recipe["mode"],
                 suffix=recipe["suffix"]) as tf:
        with f as fo:
            fo.write(open(tf, "rb").read())

    # Verify that the tar archive has the correct compression.
    with open(filename, "rb") as raw:
        assert raw.read()[:10].startswith(recipe["magic"])

    # Verify content of a sample file.
    with fs.open(filename) as resource:
        tarfs = fsspec.filesystem("tar", fo=resource)
        assert tarfs.cat("b") == b"hello"
Example 9
def test_not_found():
    fn = "not-a-file"
    fs = LocalFileSystem()
    with pytest.raises((FileNotFoundError, OSError)):
        with OpenFile(fs, fn, mode="rb"):
            pass
Example 10
def test_not_found():
    fn = 'not-a-file'
    fs = LocalFileSystem()
    with pytest.raises((FileNotFoundError, OSError)) as e:
        with OpenFile(fs, fn, mode='rb'):
            pass
Example 11
def read_bytes(
    urlpath,
    delimiter=None,
    not_zero=False,
    blocksize="128 MiB",
    sample="10 kiB",
    compression=None,
    include_path=False,
    **kwargs,
):
    """Given a path or paths, return delayed objects that read from those paths.

    The path may be a filename like ``'2015-01-01.csv'`` or a globstring
    like ``'2015-*-*.csv'``.

    The path may be preceded by a protocol, like ``s3://`` or ``hdfs://`` if
    those libraries are installed.

    This cleanly breaks data by a delimiter if given, so that block boundaries
    start directly after a delimiter and end on the delimiter.

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    delimiter : bytes
        An optional delimiter, like ``b'\\n'`` on which to split blocks of
        bytes.
    not_zero : bool
        Force seek of start-of-file delimiter, discarding header.
    blocksize : int, str
        Chunk size in bytes, defaults to "128 MiB"
    compression : string or None
        String like 'gzip' or 'xz'.  Must support efficient random access.
    sample : int, string, or boolean
        Whether or not to return a header sample.
        Values can be ``False`` for "no sample requested", or an integer
        or string value like ``2**20`` or ``"1 MiB"``.
    include_path : bool
        Whether or not to include the path with the bytes representing a particular file.
        Default is False.
    **kwargs : dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> sample, blocks = read_bytes('2015-*-*.csv', delimiter=b'\\n')  # doctest: +SKIP
    >>> sample, blocks = read_bytes('s3://bucket/2015-*-*.csv', delimiter=b'\\n')  # doctest: +SKIP
    >>> sample, paths, blocks = read_bytes('2015-*-*.csv', include_path=True)  # doctest: +SKIP

    Returns
    -------
    sample : bytes
        The sample header
    blocks : list of lists of ``dask.Delayed``
        Each list corresponds to a file, and each delayed object computes to a
        block of bytes from that file.
    paths : list of strings, only included if include_path is True
        List of same length as blocks, where each item is the path to the file
        represented in the corresponding block.

    """
    if not isinstance(urlpath, (str, list, tuple, os.PathLike)):
        raise TypeError("Path should be a string, os.PathLike, list or tuple")

    fs, fs_token, paths = get_fs_token_paths(urlpath,
                                             mode="rb",
                                             storage_options=kwargs)

    if len(paths) == 0:
        raise OSError("%s resolved to no files" % urlpath)

    if blocksize is not None:
        if isinstance(blocksize, str):
            blocksize = parse_bytes(blocksize)
        if not is_integer(blocksize):
            raise TypeError("blocksize must be an integer")
        blocksize = int(blocksize)

    if blocksize is None:
        offsets = [[0]] * len(paths)
        lengths = [[None]] * len(paths)
    else:
        offsets = []
        lengths = []
        for path in paths:
            if compression == "infer":
                comp = infer_compression(path)
            else:
                comp = compression
            if comp is not None:
                raise ValueError(
                    "Cannot do chunked reads on compressed files. "
                    "To read, set blocksize=None")
            size = fs.info(path)["size"]
            if size is None:
                raise ValueError(
                    "Backing filesystem couldn't determine file size, cannot "
                    "do chunked reads. To read, set blocksize=None.")
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            if not_zero:
                off[0] = 1
                length[0] -= 1
            offsets.append(off)
            lengths.append(length)

    delayed_read = delayed(read_block_from_file)

    out = []
    for path, offset, length in zip(paths, offsets, lengths):
        token = tokenize(fs_token, delimiter, path, fs.ukey(path), compression,
                         offset)
        keys = [f"read-block-{o}-{token}" for o in offset]
        values = [
            delayed_read(
                OpenFile(fs, path, compression=compression),
                o,
                l,
                delimiter,
                dask_key_name=key,
            ) for o, key, l in zip(offset, keys, length)
        ]
        out.append(values)

    if sample:
        if sample is True:
            sample = "10 kiB"  # backwards compatibility
        if isinstance(sample, str):
            sample = parse_bytes(sample)
        with OpenFile(fs, paths[0], compression=compression) as f:
            # read block without seek (because we start at zero)
            if delimiter is None:
                sample = f.read(sample)
            else:
                sample_buff = f.read(sample)
                while True:
                    new = f.read(sample)
                    if not new:
                        break
                    if delimiter in new:
                        sample_buff = (sample_buff +
                                       new.split(delimiter, 1)[0] + delimiter)
                        break
                    sample_buff = sample_buff + new
                sample = sample_buff
    if include_path:
        return sample, out, paths
    return sample, out
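A short usage sketch of the function above, assuming it is dask.bytes.read_bytes and that matching CSV files exist locally (the glob pattern and blocksize are illustrative only):

import dask
from dask.bytes import read_bytes

# Each inner list holds one delayed block per file; the delimiter keeps block
# boundaries aligned on newlines, as described in the docstring.
sample, blocks = read_bytes("data/2015-*-*.csv", delimiter=b"\n", blocksize="16 MiB")
print(sample[:80])                      # raw header sample from the first file
first_file = dask.compute(*blocks[0])   # tuple of byte blocks for the first file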
Example 12
def read_avro(urlpath,
              blocksize=100000000,
              storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask import compute, delayed
    from dask.bag import from_delayed
    from dask.utils import import_required

    import_required(
        "fastavro",
        "fastavro is a required dependency for using bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode="rb", storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head["sync"]
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ["read-avro-%s-%s" % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
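A short usage sketch, assuming the function above is dask.bag.read_avro and that fastavro is installed (the path is illustrative only):

import dask.bag as db

# blocksize=None takes the open_files branch above: one partition per file.
bag = db.read_avro("data/*.avro", blocksize=None)
records = bag.take(5)  # first few decoded records as Python dicts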
Example 13
def open_head(fs, path, compression):
    """Open a file just to read its head and size"""
    with OpenFile(fs, path, compression=compression) as f:
        head = read_header(f)
    size = fs.info(path)["size"]
    return head, size