Example #1
def test_modification_time_open_files():
    with s3_context('compress', files):
        a = open_files('s3://compress/test/accounts.*')
        b = open_files('s3://compress/test/accounts.*')

        assert [aa._key for aa in a] == [bb._key for bb in b]

    with s3_context('compress', valmap(double, files)):
        c = open_files('s3://compress/test/accounts.*')

    assert [aa._key for aa in a] != [cc._key for cc in c]
Example #2
def test_simple(dir_server):
    root = 'http://localhost:8999/'
    fn = files[0]
    f = open_files(root + fn)[0]
    with f as f:
        data = f.read()
    assert data == open(os.path.join(dir_server, fn), 'rb').read()
Example #3
def test_files(s3):
    myfiles = open_files('s3://' + test_bucket_name + '/test/accounts.*')
    assert len(myfiles) == len(files)
    for lazy_file, path in zip(myfiles, sorted(files)):
        with lazy_file as f:
            data = f.read()
            assert data == files[path]
Example #4
def test_open_files():
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        for lazy_file, data_file in zip(myfiles, sorted(files)):
            with lazy_file as f:
                x = f.read()
                assert x == files[data_file]
Example #5
def test_registered_open_files(s3):
    myfiles = open_files('s3://%s/test/accounts.*.json' % test_bucket_name)
    assert len(myfiles) == len(files)
    data = []
    for file in myfiles:
        with file as f:
            data.append(f.read())
    assert list(data) == [files[k] for k in sorted(files)]
Example #6
def test_compression_binary(fmt):
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k] for k in sorted(files)]
Example #7
def test_open_files(s3, mode):
    myfiles = open_files('s3://' + test_bucket_name + '/test/accounts.*',
                         mode=mode)
    assert len(myfiles) == len(files)
    for lazy_file, path in zip(myfiles, sorted(files)):
        with lazy_file as f:
            data = f.read()
            sol = files[path]
            assert data == (sol if mode == 'rb' else sol.decode())
Example #8
def test_open_files_write(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    fils = open_files(paths, mode='wb')
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)
    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
Example #9
def test_fetch_range_with_headers(dir_server):
    # https://github.com/dask/dask/issues/4479
    root = 'http://localhost:8999/'
    fn = files[0]
    headers = {'Date': 'Wed, 21 Oct 2015 07:28:00 GMT'}
    f = open_files(root + fn, headers=headers)[0]
    with f as f:
        data = f.read(length=1) + f.read(length=-1)
    assert data == open(os.path.join(dir_server, fn), 'rb').read()
Example #10
def test_py2_local_bytes(tmpdir):
    fn = str(tmpdir / 'myfile.txt.gz')
    with gzip.open(fn, mode='wb') as f:
        f.write(b'hello\nworld')

    files = open_files(fn, compression='gzip', mode='rt')

    with files[0] as f:
        assert all(isinstance(line, unicode) for line in f)
Example #11
def test_registered_open_files():
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k] for k in sorted(files)]
Example #12
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
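
A minimal usage sketch for the read_avro variant above, assuming fastavro is installed; the local glob and S3 bucket below are hypothetical:

import dask.bag as db

# One partition per file (no chunking).
bag = db.read_avro('data/records.*.avro', blocksize=None)

# Split large files into ~64 MB chunks, aligned on Avro sync markers.
bag = db.read_avro('s3://example-bucket/records.*.avro',
                   blocksize=64000000,
                   storage_options={'anon': True})

print(bag.take(2))  # a couple of records, as plain dicts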
Example #13
def test_open_files_text_mode(encoding):
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*', mode='rt', encoding=encoding)
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode(encoding)
                              for k in sorted(files)]
Example #14
def test_errors(dir_server):
    f = open_files('http://localhost:8999/doesnotexist')[0]
    with pytest.raises(requests.exceptions.RequestException):
        with f:
            pass
    f = open_files('http://nohost/')[0]
    with pytest.raises(requests.exceptions.RequestException):
        with f:
            pass
    root = 'http://localhost:8999/'
    fn = files[0]
    f = open_files(root + fn, mode='wb')[0]
    with pytest.raises(NotImplementedError):
        with f:
            pass
    f = open_files(root + fn)[0]
    with f as f:
        with pytest.raises(ValueError):
            f.seek(-1)
Example #15
def test_ops_blocksize(dir_server):
    root = 'http://localhost:8999/'
    fn = files[0]
    f = open_files(root + fn, block_size=2)[0]
    data = open(os.path.join(dir_server, fn), 'rb').read()
    with f as f:
        # it's OK to read the whole file
        assert f.read() == data
        # and now the file magically has a size
        assert f.size == len(data)

    # note that if we reuse f from above, because it is tokenized, we get
    # the same open file - where is this cached?
    fn = files[1]
    f = open_files(root + fn, block_size=2)[0]
    with f as f:
        # fails because we want only 12 bytes
        with pytest.raises(ValueError):
            assert f.read(10) == data[:10]
Example #16
def test_open_files_compression(mode, fmt):
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', mode=mode, compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        sol = [files[k] for k in sorted(files)]
        if mode == 'rt':
            sol = [b.decode() for b in sol]
        assert list(data) == sol
Example #17
def test_pickability_of_lazy_files(tmpdir):
    tmpdir = str(tmpdir)
    cloudpickle = pytest.importorskip('cloudpickle')

    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert type(f.fs) == type(f2.fs)
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
Example #18
def test_open_files_write(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]

    files = open_files(path, num=len(data), mode='wb')
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)

    (results,) = dask.compute(list(concat(vals)))
    assert data == results
Example #19
def test_ops(dir_server, block_size):
    root = 'http://localhost:8999/'
    fn = files[0]
    f = open_files(root + fn)[0]
    data = open(os.path.join(dir_server, fn), 'rb').read()
    with f as f:
        # these pass because the default block size covers the whole file
        assert f.read(10) == data[:10]
        f.seek(0)
        assert f.read(10) == data[:10]
        assert f.read(10) == data[10:20]
        f.seek(-10, 2)
        assert f.read() == data[-10:]
Example #20
def test_loc(dir_server):
    root = 'http://localhost:8999/'
    fn = files[0]
    f = open_files(root + fn)[0]
    expected = open(os.path.join(dir_server, fn), 'rb').read()
    with f as f:
        data = f.read(2)
        assert data == expected[:2]
        assert f.loc == 2
        f.seek(0)
        data = f.read(3)
        assert data == expected[:3]
        f.seek(1, 1)
        assert f.loc == 4
Example #21
def test_open_files_write(tmpdir, compression_opener):
    compression, opener = compression_opener
    tmpdir = str(tmpdir)
    files = open_files(tmpdir, num=2, mode='wb', compression=compression)
    assert len(files) == 2
    assert {f.mode for f in files} == {'wb'}
    for fil in files:
        with fil as f:
            f.write(b'000')
    files = sorted(os.listdir(tmpdir))
    assert files == ['0.part', '1.part']

    with opener(os.path.join(tmpdir, files[0]), 'rb') as f:
        d = f.read()
    assert d == b'000'
Example #22
def to_avro(b, filename, schema, name_function=None, storage_options=None,
            codec='null', sync_interval=16000, metadata=None, compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
            {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename, 'wb', name_function=name_function,
                       num=b.npartitions, **storage_options)
    name = 'to-avro-' + uuid.uuid4().hex
    dsk = {(name, i): (_write_avro_part, (b.name, i), f, schema, codec,
                       sync_interval, metadata)
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()
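
The compute=False path returns delayed objects instead of writing immediately; a brief sketch, reusing the bag b and schema from the docstring example above:

import dask

delayed_parts = b.to_avro('my-data.*.avro', schema, compute=False)
# Nothing is written yet; trigger the writes whenever convenient,
# possibly alongside other delayed work.
dask.compute(*delayed_parts)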
Example #23
def to_avro(b,
            filename,
            schema,
            name_function=None,
            storage_options=None,
            codec="null",
            sync_interval=16000,
            metadata=None,
            compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
            {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files

    import_required(
        "fastavro", "fastavro is a required dependency for using "
        "bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename,
                       "wb",
                       name_function=name_function,
                       num=b.npartitions,
                       **storage_options)
    name = "to-avro-" + uuid.uuid4().hex
    dsk = {(name, i): (
        _write_avro_part,
        (b.name, i),
        f,
        schema,
        codec,
        sync_interval,
        metadata,
    )
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()
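
A short sketch of the name_function hook mentioned in the docstring: it maps each partition index to the string substituted for the "*" in the filename. The zero-padding helper is hypothetical; b and schema are as in the docstring example:

def zero_pad(i):
    # 0 -> '0000', 1 -> '0001', ...
    return '%04d' % i

b.to_avro('my-data.*.avro', schema, name_function=zero_pad)
# writes my-data.0000.avro, my-data.0001.avro, ...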
Example #24
def read_avro(urlpath,
              blocksize=100000000,
              storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, get_fs_token_paths, OpenFile, tokenize
    from dask.bag import from_delayed

    import_required(
        "fastavro", "fastavro is a required dependency for using "
        "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode="rb", storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head["sync"]
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ["read-avro-%s-%s" % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
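
Per the note that compression should only be combined with blocksize=None, a minimal sketch for gzip-compressed Avro files; the glob is hypothetical:

import dask.bag as db

# Compressed files cannot be split on sync markers, so each file becomes
# exactly one partition.
bag = db.read_avro('logs/*.avro.gz', blocksize=None, compression='gzip')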
Example #25
def test_files(dir_server):
    root = "http://localhost:8999/"
    fs = open_files([root + f for f in files])
    for f, f2 in zip(fs, files):
        with f as f:
            assert f.read() == open(os.path.join(dir_server, f2), "rb").read()
Example #26
def test_files(dir_server):
    root = 'http://localhost:8999/'
    fs = open_files([root + f for f in files])
    for f, f2 in zip(fs, files):
        with f as f:
            assert f.read() == open(os.path.join(dir_server, f2), 'rb').read()
Example #27
def test_open_files_write(tmpdir):
    tmpdir = str(tmpdir)
    files = open_files([os.path.join(tmpdir, 'test1'),
                        os.path.join(tmpdir, 'test2')], mode='wb')
    assert len(files) == 2
    assert files[0].mode == 'wb'
Example #28
def test_open_glob(dir_server):
    root = "http://localhost:8999/"
    fs = open_files(root + "/*")
    assert fs[0].path == "http://localhost:8999/a"
    assert fs[1].path == "http://localhost:8999/b"
Example #29
def test_open_glob(dir_server):
    root = 'http://localhost:8999/'
    fs = open_files(root + '/*')
    assert fs[0].path == 'http://localhost:8999/a'
    assert fs[1].path == 'http://localhost:8999/b'
Example #30
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
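
For reference, the chunking arithmetic in the blocksize branch above just slices each file into fixed-size windows; a standalone illustration with made-up numbers:

blocksize = 100
size = 250  # hypothetical file size in bytes

offsets = list(range(0, size, blocksize))  # [0, 100, 200]
lengths = [blocksize] * len(offsets)       # [100, 100, 100]
# The last window runs past the end of the file; the delimiter-aware chunk
# reader simply stops at end-of-file, so the overshoot is harmless.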