def test_read_bytes_delimited():
    """Delimiter-aware splitting yields blocks that reassemble the files
    exactly, at every blocksize tried."""
    with filetexts(files, mode='b'):
        for blocksize in (5, 15, 45, 1500):
            _, newline_parts = read_bytes('.test.accounts*',
                                          blocksize=blocksize,
                                          delimiter=b'\n')
            _, foo_parts = read_bytes('.test.accounts*',
                                      blocksize=blocksize,
                                      delimiter=b'foo')
            # A different delimiter changes where blocks split, so the
            # generated task keys must differ too.
            newline_keys = [part.key for part in concat(newline_parts)]
            foo_keys = [part.key for part in concat(foo_parts)]
            assert newline_keys != foo_keys

            blocks = [blk for blk in compute(*concat(newline_parts)) if blk]
            assert all(blk.endswith(b'\n') for blk in blocks)
            ourlines = b''.join(blocks).split(b'\n')
            testlines = b''.join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # Now a delimiter that does not terminate the files: b'}'.
            _, brace_parts = read_bytes('.test.accounts*',
                                        blocksize=blocksize,
                                        delimiter=b'}')
            blocks = [blk for blk in compute(*concat(brace_parts)) if blk]
            # All blocks should end in b'}' except at EOF (hence the -2;
            # presumably two files' trailing pieces — matches the fixture).
            assert sum(blk.endswith(b'}') for blk in blocks) == len(blocks) - 2
            ours = b''.join(blocks)
            test = b''.join(files[k] for k in sorted(files))
            assert ours == test
def test_read_bytes_sample_delimiter():
    """The returned sample always ends cleanly on the requested delimiter,
    whatever the glob pattern or requested sample size."""
    with filetexts(files, mode='b'):
        cases = [
            ('.test.accounts.*', 80),
            ('.test.accounts.1.json', 80),
            # Even a tiny sample request is extended to the next delimiter.
            ('.test.accounts.1.json', 2),
        ]
        for pattern, size in cases:
            sample, values = read_bytes(pattern, sample=size, delimiter=b'\n')
            assert sample.endswith(b'\n')
def test_registered_read_bytes():
    """read_bytes imported straight from dask.bytes.core round-trips the
    fixture files (the local import is the point of the test)."""
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, parts = read_bytes('.test.accounts.*')
        blocks = compute(*concat(parts))
        assert set(blocks) == set(files.values())
def test_names():
    """Task keys are stable across identical reads but change when the
    underlying files are modified."""
    def current_keys():
        # Read the glob and flatten out the per-file delayed values' keys.
        _, parts = read_bytes('.test.accounts.*')
        return [part._key for part in concat(parts)]

    with filetexts(files, mode='b'):
        first = current_keys()
        second = current_keys()
        assert first == second
        # Sleep so the files' modification times visibly change, then
        # append a byte to every file.
        sleep(1)
        for fn in files:
            with open(fn, 'ab') as f:
                f.write(b'x')
        assert current_keys() != first
def test_compression(fmt, blocksize):
    """Compressed fixture files round-trip through read_bytes when the
    matching ``compression=`` format is supplied.

    NOTE(review): an identical ``test_compression`` is defined again later
    in this file and shadows this one — consider removing the duplicate.
    """
    compress = compression.compress[fmt]
    compressed_files = valmap(compress, files)
    with filetexts(compressed_files, mode='b'):
        sample, parts = read_bytes('.test.accounts.*.json',
                                   blocksize=blocksize, delimiter=b'\n',
                                   compression=fmt)
        # The sample is decompressed: it starts like the original file.
        first = files[sorted(files)[0]]
        assert sample[:5] == first[:5]
        blocks = compute(*concat(parts))
        expected = b''.join(files[k] for k in sorted(files))
        assert b''.join(blocks) == expected
def test_read_bytes():
    """Basic contract of read_bytes: a bytes sample plus a nested
    list-of-lists of dask values covering every file."""
    with filetexts(files, mode='b'):
        sample, parts = read_bytes('.test.accounts.*')

        assert isinstance(sample, bytes)
        # The sample comes from the first file in sorted order.
        assert sample[:5] == files[sorted(files)[0]][:5]

        assert isinstance(parts, (list, tuple))
        assert isinstance(parts[0], (list, tuple))
        # Each entry is a dask-graph-carrying (delayed-like) object.
        assert hasattr(parts[0][0], 'dask')
        # At least one block per file.
        assert sum(len(p) for p in parts) >= len(files)

        blocks = compute(*concat(parts))
        assert set(blocks) == set(files.values())
def test_compression(fmt, blocksize):
    """Compressed fixture files round-trip through read_bytes when the
    matching ``compression=`` format is supplied."""
    compress = compression.compress[fmt]
    with filetexts(valmap(compress, files), mode='b'):
        sample, parts = read_bytes('.test.accounts.*.json',
                                   blocksize=blocksize, delimiter=b'\n',
                                   compression=fmt)
        # The sample is decompressed: it starts like the original file.
        assert sample[:5] == files[sorted(files)[0]][:5]
        blocks = compute(*concat(parts))
        assert b''.join(blocks) == b''.join(files[k] for k in sorted(files))
def test_read_bytes_block():
    """With a plain blocksize (no delimiter) every file is cut into
    ``len // bs + 1`` pieces and the bytes are fully covered.

    NOTE(review): an identical ``test_read_bytes_block`` is defined again
    later in this file and shadows this one — consider removing the
    duplicate.
    """
    with filetexts(files, mode='b'):
        for blocksize in (5, 15, 45, 1500):
            sample, chunks = read_bytes('.test.account*', blocksize=blocksize)
            expected_counts = [len(v) // blocksize + 1 for v in files.values()]
            assert [len(c) for c in chunks] == expected_counts

            blocks = compute(*concat(chunks))
            total_read = sum(len(blk) for blk in blocks)
            assert total_read == sum(len(v) for v in files.values())

            # Blocks may split lines, so compare as sets of lines.
            ourlines = b''.join(blocks).split(b'\n')
            testlines = b''.join(files.values()).split(b'\n')
            assert set(ourlines) == set(testlines)
def test_read_bytes_block():
    """With a plain blocksize (no delimiter) every file is cut into
    ``len // bs + 1`` pieces and the bytes are fully covered."""
    with filetexts(files, mode='b'):
        for blocksize in (5, 15, 45, 1500):
            sample, chunks = read_bytes('.test.account*', blocksize=blocksize)
            assert ([len(c) for c in chunks] ==
                    [len(v) // blocksize + 1 for v in files.values()])

            blocks = compute(*concat(chunks))
            assert (sum(len(blk) for blk in blocks) ==
                    sum(len(v) for v in files.values()))

            # Blocks may split lines, so compare as sets of lines.
            ourlines = b''.join(blocks).split(b'\n')
            testlines = b''.join(files.values()).split(b'\n')
            assert set(ourlines) == set(testlines)
def test_read_bytes_blocksize_none():
    """``blocksize=None`` produces exactly one block per file."""
    with filetexts(files, mode='b'):
        sample, parts = read_bytes('.test.accounts.*', blocksize=None)
        assert sum(len(p) for p in parts) == len(files)
def test_not_found():
    """read_bytes on a missing path raises FileNotFoundError whose message
    names the offending path."""
    fn = 'not-a-file'
    with pytest.raises(FileNotFoundError) as e:
        read_bytes(fn)
    # Fix: check the raised exception itself. ``str(e)`` stringifies the
    # pytest ExceptionInfo wrapper (a traceback-entry repr), which is not
    # guaranteed to contain the message; ``e.value`` is the exception.
    assert fn in str(e.value)