def test_read_bytes_blocksize_on_large_data():
    _, L = read_bytes('s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv',
                      blocksize=None, anon=True)
    assert len(L) == 1

    _, L = read_bytes('s3://dask-data/nyc-taxi/2014/*.csv',
                      blocksize=None, anon=True)
    assert len(L) == 12
def read_header(fo):
    """Extract an avro file's header.

    Parameters
    ----------
    fo: file-like
        Must be opened in bytes mode, e.g., an ``io.BytesIO``.

    Returns
    -------
    dict representing the header
    """
    assert fo.read(len(MAGIC)) == MAGIC, 'Magic avro bytes missing'
    meta = {}
    out = {'meta': meta}
    while True:
        n_keys = read_long(fo)
        if n_keys == 0:
            break
        for i in range(n_keys):
            # ignore dtype mapping for bag version
            read_bytes(fo)  # schema keys
            read_bytes(fo)  # schema values
    out['sync'] = fo.read(SYNC_SIZE)
    out['header_size'] = fo.tell()
    fo.seek(0)
    out['head_bytes'] = fo.read(out['header_size'])
    return out
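# Hedged usage sketch for read_header (not part of the original source). It
# assumes fastavro is installed and that the module-level helpers referenced
# above (MAGIC, SYNC_SIZE, read_long, and the avro-level read_bytes) are in
# scope; the function and record names here are illustrative only.
def example_read_header():
    import io
    import fastavro

    schema = {
        "name": "Example",
        "type": "record",
        "fields": [{"name": "value", "type": "int"}],
    }
    buf = io.BytesIO()
    # Write a tiny avro file entirely in memory, then inspect its header.
    fastavro.writer(buf, schema, [{"value": i} for i in range(3)])
    buf.seek(0)

    header = read_header(buf)
    assert header["head_bytes"].startswith(b"Obj")  # avro magic bytes
    assert len(header["sync"]) == 16                # 16-byte sync marker
    return header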
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n')
    _, values2 = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo')
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=d)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                   blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def test_read_bytes_blocksize_on_large_data(s3_with_yellow_tripdata):
    _, L = read_bytes(
        's3://{}/nyc-taxi/2015/yellow_tripdata_2015-01.csv'.format(
            test_bucket_name),
        blocksize=None, anon=True)
    assert len(L) == 1

    _, L = read_bytes('s3://{}/nyc-taxi/2014/*.csv'.format(test_bucket_name),
                      blocksize=None, anon=True)
    assert len(L) == 12
def test_read_bytes_sample_delimiter(s3):
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.*',
                                sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.1.json',
                                sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.1.json',
                                sample=2, delimiter=b'\n')
    assert sample.endswith(b'\n')
def test_with_paths():
    pathlib = pytest.importorskip('pathlib')
    with filetexts(files, mode='b'):
        url = pathlib.Path('./.test.accounts.*')
        sample, values = read_bytes(url, blocksize=None)
        assert sum(map(len, values)) == len(files)
    with pytest.raises(OSError):
        # relative path doesn't work
        url = pathlib.Path('file://.test.accounts.*')
        read_bytes(url, blocksize=None)
def test_read_bytes_blocksize_float():
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=5.0)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)

        with pytest.raises(TypeError):
            read_bytes('.test.account*', blocksize=5.5)
def test_modification_time_read_bytes():
    with s3_context('compress', files):
        _, a = read_bytes('s3://compress/test/accounts.*')
        _, b = read_bytes('s3://compress/test/accounts.*')
        assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context('compress', valmap(double, files)):
        _, c = read_bytes('s3://compress/test/accounts.*')

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
def test_read_bytes_sample_delimiter():
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=2, delimiter=b'\n')
        assert sample.endswith(b'\n')
def test_deterministic_key_names(hdfs):
    data = b'abc\n' * int(1e3)
    fn = '%s/file' % basedir
    with hdfs.open(fn, 'wb', replication=1) as fil:
        fil.write(data)

    _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c', sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_read_bytes_sample_delimiter():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", sample=80, delimiter=b"\n")
        assert sample.endswith(b"\n")
        sample, values = read_bytes(".test.accounts.1.json", sample=80, delimiter=b"\n")
        assert sample.endswith(b"\n")
        sample, values = read_bytes(".test.accounts.1.json", sample=2, delimiter=b"\n")
        assert sample.endswith(b"\n")
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_read_bytes_blocksize_on_large_data(s3_with_yellow_tripdata, s3so):
    _, L = read_bytes(
        "s3://{}/nyc-taxi/2015/yellow_tripdata_2015-01.csv".format(
            test_bucket_name),
        blocksize=None, anon=True, **s3so)
    assert len(L) == 1

    _, L = read_bytes("s3://{}/nyc-taxi/2014/*.csv".format(test_bucket_name),
                      blocksize=None, anon=True, **s3so)
    assert len(L) == 12
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode="b"):
        sample, vals = read_bytes(".test.account*", blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b"\n")
        testlines = b"".join(files.values()).split(b"\n")
        assert set(ourlines) == set(testlines)
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)
def test_read_bytes(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'a' * int(1e8)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes('hdfs://' + fn)
        assert sample[:5] == b'aaaaa'
        assert len(values[0]) == len(blocks)

        while not s.host_restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values[0]} == set(s.host_restrictions)
        assert {v.key for v in values[0]} == set(s.loose_restrictions)

        futures = c.compute(values[0])
        results = yield c._gather(futures)
        assert b''.join(results) == data
        assert s.host_restrictions
def test_read_bytes_sample_delimiter(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*",
                                sample=80, delimiter=b"\n")
    assert sample.endswith(b"\n")
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.1.json",
                                sample=80, delimiter=b"\n")
    assert sample.endswith(b"\n")
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.1.json",
                                sample=2, delimiter=b"\n")
    assert sample.endswith(b"\n")
def test_write_bytes(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    values = [delayed(v) for v in files.values()]
    out = core.write_bytes(values, paths)
    compute(*out)

    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_names():
    with filetexts(files, mode='b'):
        _, a = read_bytes('.test.accounts.*')
        _, b = read_bytes('.test.accounts.*')
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, 'ab') as f:
                f.write(b'x')

        _, c = read_bytes('.test.accounts.*')
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def test_names():
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def test_compression(s3, fmt, blocksize):
    with s3_context('compress', valmap(compress[fmt], files)):
        sample, values = read_bytes('s3://compress/test/accounts.*',
                                    compression=fmt, blocksize=blocksize)
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b'\n')

        results = compute(*concat(values))
        assert b''.join(results) == b''.join([files[k] for k in sorted(files)])
def test_open_files_write(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    fils = open_files(paths, mode='wb')
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)

    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_open_files_write(s3):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb")
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)

    sample, values = read_bytes("s3://" + test_bucket_name + "/more/test/accounts.*")
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_write_bytes(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]
    values = write_bytes([delayed(d) for d in data], path)
    dask.compute(values)

    assert len(hdfs.ls(basedir)) == 5

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes(
        "s3://" + test_bucket_name + "/test/account*", blocksize=blocksize
    )
    assert list(map(len, vals)) == [(len(v) // blocksize + 1) for v in files.values()]

    results = compute(*concat(vals))
    assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

    ourlines = b"".join(results).split(b"\n")
    testlines = b"".join(files.values()).split(b"\n")
    assert set(ourlines) == set(testlines)
def test_compression(s3, fmt, blocksize):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                )
            return
        sample, values = read_bytes(
            "s3://compress/test/accounts.*", compression=fmt, blocksize=blocksize
        )
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_read_bytes_URL(hdfs):
    nfiles = 10
    data = b'a' * int(1e3)

    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    path = 'hdfs://localhost:8020%s/file.*' % basedir
    sample, values = read_bytes(path)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
def test_read_bytes_block():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            assert list(map(len, vals)) == [(len(v) // bs + 1) for v in files.values()]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
def test_compression(s3, fmt, blocksize):
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                )
            return
        sample, values = read_bytes("s3://compress/test/accounts.*",
                                    compression=fmt, blocksize=blocksize)
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(s['address'], loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)
            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs:///tmp/test/file.*')
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_compression(fmt, blocksize):
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode='b'):
        sample, values = read_bytes('.test.accounts.*.json',
                                    blocksize=blocksize, delimiter=b'\n',
                                    compression=fmt)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b'\n')

        results = compute(*concat(values))
        assert (b''.join(results) ==
                b''.join([files[k] for k in sorted(files)]))
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes(hdfs):
    nfiles = 10
    data = b"a" * int(1e3)

    for fn in ["%s/file.%d" % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, "wb", replication=1) as f:
            f.write(data)

    sample, values = read_bytes("hdfs://%s/file.*" % basedir)
    (results,) = dask.compute(values)
    assert [b"".join(r) for r in results] == nfiles * [data]
def test_read_bytes(s3):
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.*')
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b'\n')

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], 'dask')

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def test_read_bytes(hdfs):
    nfiles = 10
    data = b'a' * int(1e3)

    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    sample, values = read_bytes('hdfs://%s/file.*' % basedir)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
def test_open_files_write(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]

    files = open_files(path, num=len(data), mode='wb')
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes('s3://' + test_bucket_name + '/test/account*',
                         blocksize=blocksize)
    assert (list(map(len, vals)) ==
            [(len(v) // blocksize + 1) for v in files.values()])

    results = compute(*concat(vals))
    assert (sum(len(r) for r in results) ==
            sum(len(v) for v in files.values()))

    ourlines = b"".join(results).split(b'\n')
    testlines = b"".join(files.values()).split(b'\n')
    assert set(ourlines) == set(testlines)
def test_read_bytes(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*")
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b"\n")

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], "dask")

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def test_write_bytes_2(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        path = 'hdfs://%s/' % basedir
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path)
        futures = c.compute(out)
        results = yield c._gather(futures)
        assert len(hdfs.ls(basedir)) == 5

        sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
        futures = c.compute(list(concat(vals)))
        results = yield c._gather(futures)
        assert data == results
def test_read_bytes():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*")
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], "dask")

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes():
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b'\n')

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], 'dask')

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes_block():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes('.test.account*', blocksize=bs)
            assert (list(map(len, vals)) ==
                    [(len(v) // bs + 1) for v in files.values()])

            results = compute(*concat(vals))
            assert (sum(len(r) for r in results) ==
                    sum(len(v) for v in files.values()))

            ourlines = b"".join(results).split(b'\n')
            testlines = b"".join(files.values()).split(b'\n')
            assert set(ourlines) == set(testlines)
def test_read_bytes_delimited(s3, blocksize, s3so):
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"\n",
        **s3so
    )
    _, values2 = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"foo",
        **s3so
    )
    assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b"\n") for r in res)
    ourlines = b"".join(res).split(b"\n")
    testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
    assert ourlines == testlines

    # delimiter not at the end
    d = b"}"
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=d,
        **s3so
    )
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b"}") for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"foo")
            assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read a set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        Passed to the backend file system.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed

    import_required(
        'fastavro',
        "fastavro is a required dependency for using bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read a set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        Passed to the backend file system.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
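# Hedged usage sketch for read_avro (not part of the original source). It
# assumes dask, dask.bag and fastavro are installed and that the helpers
# read_avro relies on (open_head, read_chunk, read_file) are importable in the
# same module; paths and names below are illustrative only.
def example_read_avro():
    import os
    import tempfile
    import fastavro

    schema = {
        "name": "Example",
        "type": "record",
        "fields": [{"name": "value", "type": "int"}],
    }
    records = [{"value": i} for i in range(10)]

    # Write a small avro file to a temporary directory.
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "example.avro")
    with open(path, "wb") as f:
        fastavro.writer(f, schema, records)

    # blocksize=None yields one partition per file; an integer blocksize
    # would instead split each file on its avro sync marker.
    bag = read_avro(path, blocksize=None)
    assert bag.count().compute() == len(records)
    return bag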
def test_read_bytes_big_file(hdfs):
    fn = '%s/file' % basedir

    # Write 100 MB file
    nblocks = int(1e3)
    blocksize = int(1e5)
    data = b'a' * blocksize
    with hdfs.open(fn, 'wb', replication=1) as f:
        for i in range(nblocks):
            f.write(data)

    sample, values = read_bytes('hdfs://' + fn, blocksize=blocksize)

    assert sample[:5] == b'aaaaa'
    assert len(values[0]) == nblocks

    (results,) = dask.compute(values[0])
    assert sum(map(len, results)) == nblocks * blocksize
    for r in results:
        assert set(r.decode('utf-8')) == {'a'}