def test_read_bytes_blocksize_on_large_data():
    _, L = read_bytes('s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv',
                      blocksize=None, anon=True)
    assert len(L) == 1

    _, L = read_bytes('s3://dask-data/nyc-taxi/2014/*.csv',
                      blocksize=None, anon=True)
    assert len(L) == 12
def read_header(fo):
    """Extract an avro file's header.

    Parameters
    ----------
    fo: file-like
        Must be opened in bytes mode, e.g., an ``io.BytesIO``.

    Returns
    -------
    dict representing the header
    """
    assert fo.read(len(MAGIC)) == MAGIC, 'Magic avro bytes missing'
    meta = {}
    out = {'meta': meta}
    while True:
        n_keys = read_long(fo)
        if n_keys == 0:
            break
        for i in range(n_keys):
            # ignore dtype mapping for bag version
            read_bytes(fo)  # schema keys
            read_bytes(fo)  # schema values
    out['sync'] = fo.read(SYNC_SIZE)
    out['header_size'] = fo.tell()
    fo.seek(0)
    out['head_bytes'] = fo.read(out['header_size'])
    return out
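# Hedged usage sketch for read_header (not part of the original source). It
# assumes fastavro is installed and that the module-level helpers referenced
# above (MAGIC, SYNC_SIZE, read_long, and the avro-level read_bytes) are in
# scope; the function and record names here are illustrative only.
def example_read_header():
    import io
    import fastavro

    schema = {
        "name": "Example",
        "type": "record",
        "fields": [{"name": "value", "type": "int"}],
    }
    buf = io.BytesIO()
    # Write a tiny avro file entirely in memory, then inspect its header.
    fastavro.writer(buf, schema, [{"value": i} for i in range(3)])
    buf.seek(0)

    header = read_header(buf)
    assert header["head_bytes"].startswith(b"Obj")  # avro magic bytes
    assert len(header["sync"]) == 16                # 16-byte sync marker
    return header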
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n')
    _, values2 = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo')
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes('s3://' + test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=d)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                   blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def test_read_bytes_blocksize_on_large_data(s3_with_yellow_tripdata):
    _, L = read_bytes(
        's3://{}/nyc-taxi/2015/yellow_tripdata_2015-01.csv'.format(
            test_bucket_name),
        blocksize=None, anon=True)
    assert len(L) == 1

    _, L = read_bytes('s3://{}/nyc-taxi/2014/*.csv'.format(test_bucket_name),
                      blocksize=None, anon=True)
    assert len(L) == 12
def test_read_bytes_sample_delimiter(s3):
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.*',
                                sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.1.json',
                                sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.1.json',
                                sample=2, delimiter=b'\n')
    assert sample.endswith(b'\n')
def test_with_paths():
    pathlib = pytest.importorskip('pathlib')
    with filetexts(files, mode='b'):
        url = pathlib.Path('./.test.accounts.*')
        sample, values = read_bytes(url, blocksize=None)
        assert sum(map(len, values)) == len(files)
    with pytest.raises(OSError):
        # relative path doesn't work
        url = pathlib.Path('file://.test.accounts.*')
        read_bytes(url, blocksize=None)
def test_read_bytes_blocksize_float():
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=5.0)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)

        with pytest.raises(TypeError):
            read_bytes('.test.account*', blocksize=5.5)
def test_modification_time_read_bytes():
    with s3_context('compress', files):
        _, a = read_bytes('s3://compress/test/accounts.*')
        _, b = read_bytes('s3://compress/test/accounts.*')
        assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context('compress', valmap(double, files)):
        _, c = read_bytes('s3://compress/test/accounts.*')

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
def test_read_bytes_sample_delimiter():
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=2, delimiter=b'\n')
        assert sample.endswith(b'\n')
def test_deterministic_key_names(hdfs):
    data = b'abc\n' * int(1e3)
    fn = '%s/file' % basedir
    with hdfs.open(fn, 'wb', replication=1) as fil:
        fil.write(data)

    _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c', sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_read_bytes_sample_delimiter():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", sample=80, delimiter=b"\n")
        assert sample.endswith(b"\n")
        sample, values = read_bytes(".test.accounts.1.json", sample=80, delimiter=b"\n")
        assert sample.endswith(b"\n")
        sample, values = read_bytes(".test.accounts.1.json", sample=2, delimiter=b"\n")
        assert sample.endswith(b"\n")
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_read_bytes_blocksize_on_large_data(s3_with_yellow_tripdata, s3so):
    _, L = read_bytes(
        "s3://{}/nyc-taxi/2015/yellow_tripdata_2015-01.csv".format(
            test_bucket_name),
        blocksize=None, anon=True, **s3so)
    assert len(L) == 1

    _, L = read_bytes("s3://{}/nyc-taxi/2014/*.csv".format(test_bucket_name),
                      blocksize=None, anon=True, **s3so)
    assert len(L) == 12
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode="b"):
        sample, vals = read_bytes(".test.account*", blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b"\n")
        testlines = b"".join(files.values()).split(b"\n")
        assert set(ourlines) == set(testlines)
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)
def test_read_bytes(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'a' * int(1e8)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes('hdfs://' + fn)
        assert sample[:5] == b'aaaaa'
        assert len(values[0]) == len(blocks)

        while not s.host_restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values[0]} == set(s.host_restrictions)
        assert {v.key for v in values[0]} == set(s.loose_restrictions)

        futures = c.compute(values[0])
        results = yield c._gather(futures)
        assert b''.join(results) == data
        assert s.host_restrictions
def test_read_bytes_sample_delimiter(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*",
                                sample=80, delimiter=b"\n")
    assert sample.endswith(b"\n")
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.1.json",
                                sample=80, delimiter=b"\n")
    assert sample.endswith(b"\n")
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.1.json",
                                sample=2, delimiter=b"\n")
    assert sample.endswith(b"\n")
def test_write_bytes(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    values = [delayed(v) for v in files.values()]
    out = core.write_bytes(values, paths)
    compute(*out)

    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_names():
    with filetexts(files, mode='b'):
        _, a = read_bytes('.test.accounts.*')
        _, b = read_bytes('.test.accounts.*')
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, 'ab') as f:
                f.write(b'x')

        _, c = read_bytes('.test.accounts.*')
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def test_names():
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def test_compression(s3, fmt, blocksize):
    with s3_context('compress', valmap(compress[fmt], files)):
        sample, values = read_bytes('s3://compress/test/accounts.*',
                                    compression=fmt, blocksize=blocksize)
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b'\n')

        results = compute(*concat(values))
        assert b''.join(results) == b''.join([files[k] for k in sorted(files)])
def test_open_files_write(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    fils = open_files(paths, mode='wb')
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)

    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_open_files_write(s3):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb")
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)

    sample, values = read_bytes("s3://" + test_bucket_name + "/more/test/accounts.*")
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_write_bytes(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]
    values = write_bytes([delayed(d) for d in data], path)
    dask.compute(values)

    assert len(hdfs.ls(basedir)) == 5

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes(
        "s3://" + test_bucket_name + "/test/account*", blocksize=blocksize
    )
    assert list(map(len, vals)) == [(len(v) // blocksize + 1) for v in files.values()]

    results = compute(*concat(vals))
    assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

    ourlines = b"".join(results).split(b"\n")
    testlines = b"".join(files.values()).split(b"\n")
    assert set(ourlines) == set(testlines)
def test_compression(s3, fmt, blocksize):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                )
            return
        sample, values = read_bytes(
            "s3://compress/test/accounts.*", compression=fmt, blocksize=blocksize
        )
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_read_bytes_URL(hdfs):
    nfiles = 10
    data = b'a' * int(1e3)

    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    path = 'hdfs://localhost:8020%s/file.*' % basedir
    sample, values = read_bytes(path)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
def test_read_bytes_block():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            assert list(map(len, vals)) == [(len(v) // bs + 1) for v in files.values()]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
def test_compression(s3, fmt, blocksize):
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                )
            return
        sample, values = read_bytes("s3://compress/test/accounts.*",
                                    compression=fmt, blocksize=blocksize)
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(s['address'], loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)
            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs:///tmp/test/file.*')
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_compression(fmt, blocksize):
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode='b'):
        sample, values = read_bytes('.test.accounts.*.json',
                                    blocksize=blocksize, delimiter=b'\n',
                                    compression=fmt)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b'\n')

        results = compute(*concat(values))
        assert (b''.join(results) ==
                b''.join([files[k] for k in sorted(files)]))
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes(hdfs):
    nfiles = 10
    data = b"a" * int(1e3)

    for fn in ["%s/file.%d" % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, "wb", replication=1) as f:
            f.write(data)

    sample, values = read_bytes("hdfs://%s/file.*" % basedir)
    (results,) = dask.compute(values)
    assert [b"".join(r) for r in results] == nfiles * [data]
def test_read_bytes(s3):
    sample, values = read_bytes('s3://' + test_bucket_name + '/test/accounts.*')
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b'\n')

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], 'dask')

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def test_read_bytes(hdfs):
    nfiles = 10
    data = b'a' * int(1e3)

    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    sample, values = read_bytes('hdfs://%s/file.*' % basedir)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
def test_open_files_write(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]

    files = open_files(path, num=len(data), mode='wb')
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes('s3://' + test_bucket_name + '/test/account*',
                         blocksize=blocksize)
    assert (list(map(len, vals)) ==
            [(len(v) // blocksize + 1) for v in files.values()])

    results = compute(*concat(vals))
    assert (sum(len(r) for r in results) ==
            sum(len(v) for v in files.values()))

    ourlines = b"".join(results).split(b'\n')
    testlines = b"".join(files.values()).split(b'\n')
    assert set(ourlines) == set(testlines)
def test_read_bytes(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*")
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b"\n")

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], "dask")

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def test_write_bytes_2(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        path = 'hdfs://%s/' % basedir
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path)
        futures = c.compute(out)
        results = yield c._gather(futures)
        assert len(hdfs.ls(basedir)) == 5

        sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
        futures = c.compute(list(concat(vals)))
        results = yield c._gather(futures)
        assert data == results
def test_read_bytes():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*")
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], "dask")

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes():
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b'\n')

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], 'dask')

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_read_bytes_block():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes('.test.account*', blocksize=bs)
            assert (list(map(len, vals)) ==
                    [(len(v) // bs + 1) for v in files.values()])

            results = compute(*concat(vals))
            assert (sum(len(r) for r in results) ==
                    sum(len(v) for v in files.values()))

            ourlines = b"".join(results).split(b'\n')
            testlines = b"".join(files.values()).split(b'\n')
            assert set(ourlines) == set(testlines)
def test_read_bytes_delimited(s3, blocksize, s3so):
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"\n",
        **s3so
    )
    _, values2 = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"foo",
        **s3so
    )
    assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b"\n") for r in res)
    ourlines = b"".join(res).split(b"\n")
    testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
    assert ourlines == testlines

    # delimiter not at the end
    d = b"}"
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=d,
        **s3so
    )
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b"}") for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"foo")
            assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read a set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        Passed to the backend file system.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed

    import_required(
        'fastavro',
        "fastavro is a required dependency for using bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read a set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        Passed to the backend file system.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
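# Hedged usage sketch for read_avro (not part of the original source). It
# assumes dask, dask.bag and fastavro are installed and that the helpers
# read_avro relies on (open_head, read_chunk, read_file) are importable in the
# same module; paths and names below are illustrative only.
def example_read_avro():
    import os
    import tempfile
    import fastavro

    schema = {
        "name": "Example",
        "type": "record",
        "fields": [{"name": "value", "type": "int"}],
    }
    records = [{"value": i} for i in range(10)]

    # Write a small avro file to a temporary directory.
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "example.avro")
    with open(path, "wb") as f:
        fastavro.writer(f, schema, records)

    # blocksize=None yields one partition per file; an integer blocksize
    # would instead split each file on its avro sync marker.
    bag = read_avro(path, blocksize=None)
    assert bag.count().compute() == len(records)
    return bag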
def test_read_bytes_big_file(hdfs):
    fn = '%s/file' % basedir

    # Write 100 MB file
    nblocks = int(1e3)
    blocksize = int(1e5)
    data = b'a' * blocksize
    with hdfs.open(fn, 'wb', replication=1) as f:
        for i in range(nblocks):
            f.write(data)

    sample, values = read_bytes('hdfs://' + fn, blocksize=blocksize)

    assert sample[:5] == b'aaaaa'
    assert len(values[0]) == nblocks

    (results,) = dask.compute(values[0])
    assert sum(map(len, results)) == nblocks * blocksize
    for r in results:
        assert set(r.decode('utf-8')) == {'a'}