Example #1
0
def test_read_bytes_blocksize_on_large_data():
    """With blocksize=None, each matched file produces exactly one value."""
    _, single = read_bytes('dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv',
                           blocksize=None)
    assert len(single) == 1

    # Twelve monthly files -> twelve entries, one per file.
    _, monthly = read_bytes('dask-data/nyc-taxi/2014/*.csv', blocksize=None)
    assert len(monthly) == 12
Example #2
0
def test_read_bytes_sample_delimiter(s3):
    """The returned sample is always truncated at a delimiter boundary."""
    cases = [
        (test_bucket_name + "/test/accounts.*", 80),
        (test_bucket_name + "/test/accounts.1.json", 80),
        # Even a sample smaller than one line is extended to the delimiter.
        (test_bucket_name + "/test/accounts.1.json", 2),
    ]
    for path, size in cases:
        sample, values = read_bytes(path, s3=s3, sample=size, delimiter=b"\n")
        assert sample.endswith(b"\n")
Example #3
0
def test_read_bytes_delimited(s3, blocksize):
    """Different delimiters give different tasks, but the concatenated
    chunks always reconstruct the original file contents exactly."""
    _, newline_vals = read_bytes(test_bucket_name+'/test/accounts*',
                                 blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, foo_vals = read_bytes(test_bucket_name+'/test/accounts*',
                             blocksize=blocksize, delimiter=b'foo', s3=s3)
    # Delimiter participates in the task key, so the keys must differ.
    assert ([a.key for a in concat(newline_vals)] !=
            [b.key for b in concat(foo_vals)])

    chunks = [c for c in compute(*concat(newline_vals)) if c]
    assert all(c.endswith(b'\n') for c in chunks)
    expected = b"".join(files[k] for k in sorted(files))
    assert b''.join(chunks).split(b'\n') == expected.split(b'\n')

    # delimiter not at the end
    d = b'}'
    _, brace_vals = read_bytes(test_bucket_name+'/test/accounts*',
                               blocksize=blocksize, delimiter=d, s3=s3)
    chunks = [c for c in compute(*concat(brace_vals)) if c]
    # All should end in } except EOF
    assert sum(c.endswith(b'}') for c in chunks) == len(chunks) - 2
    assert b"".join(chunks) == b"".join(files[v] for v in sorted(files))
Example #4
0
def test_modification_time_read_bytes():
    """Task keys stay stable while the data is unchanged, and change
    once the underlying files are rewritten."""
    with s3_context('compress', files) as s3:
        _, first = read_bytes('compress/test/accounts.*', s3=s3)
        _, second = read_bytes('compress/test/accounts.*', s3=s3)

        # Same data -> identical keys on repeated reads.
        assert [v._key for v in concat(first)] == [v._key for v in concat(second)]

    # Rewriting the files with different contents must invalidate the keys.
    with s3_context('compress', valmap(double, files)) as s3:
        _, rewritten = read_bytes('compress/test/accounts.*', s3=s3)

    assert [v._key for v in concat(first)] != [v._key for v in concat(rewritten)]
Example #5
0
def test_registered(s3):
    """The generic read_bytes dispatches s3:// URLs to the S3 backend."""
    # Import from the dispatching module on purpose: this test exercises
    # protocol registration, not the S3-specific implementation directly.
    from dask.bytes.core import read_bytes

    sample, values = read_bytes("s3://%s/test/accounts.*.json" % test_bucket_name,
                                s3=s3)

    assert set(compute(*concat(values))) == set(files.values())
Example #6
0
def test_compression(s3, fmt, blocksize):
    """Compressed objects are transparently decompressed on read."""
    with s3_context('compress', valmap(compress[fmt], files)) as s3:
        sample, values = read_bytes('compress/test/accounts.*', s3=s3,
                                    compression=fmt, blocksize=blocksize)
        # The sample is already decompressed plaintext.
        first = sorted(files)[0]
        assert sample.startswith(files[first][:10])

        parts = compute(*concat(values))
        assert b''.join(parts) == b''.join([files[k] for k in sorted(files)])
Example #7
0
def test_write_bytes(s3):
    """Round trip: write delayed byte values to S3, then read them back."""
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    writes = core.write_bytes([delayed(v) for v in files.values()], paths, s3=s3)
    compute(*writes)

    sample, values = read_bytes(test_bucket_name + "/more/test/accounts.*", s3=s3)
    assert set(list(files.values())) == set(compute(*concat(values)))
Example #8
0
def test_write_bytes(s3):
    """Round trip: write raw byte values to S3, then read them back."""
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    writes = core.write_bytes(list(files.values()), paths, s3=s3)
    compute(*writes)

    sample, values = read_bytes(test_bucket_name+'/more/test/accounts.*', s3=s3)
    assert set(list(files.values())) == set(compute(*concat(values)))
Example #9
0
def test_read_bytes_block(s3, blocksize):
    """Each file is split into ceil(size / blocksize) blocks with no data lost."""
    _, vals = read_bytes(test_bucket_name + "/test/account*", blocksize=blocksize, s3=s3)
    assert [len(blocks) for blocks in vals] == [len(v) // blocksize + 1
                                               for v in files.values()]

    blocks = compute(*concat(vals))
    # Total bytes are preserved across the split.
    assert sum(len(b) for b in blocks) == sum(len(v) for v in files.values())

    assert (set(b"".join(blocks).split(b"\n")) ==
            set(b"".join(files.values()).split(b"\n")))
Example #10
0
def test_read_bytes(s3):
    """Basic contract: a bytes sample plus a nested list of delayed values."""
    sample, values = read_bytes(test_bucket_name+'/test/accounts.*', s3=s3)

    # The sample comes from the first file (sorted order).
    assert isinstance(sample, bytes)
    first = sorted(files)[0]
    assert sample[:5] == files[first][:5]

    # values is a sequence of sequences of dask objects, one inner
    # sequence per matched file.
    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], 'dask')
    assert sum(len(inner) for inner in values) >= len(files)

    assert set(compute(*concat(values))) == set(files.values())
Example #11
0
def test_read_bytes_blocksize_none(s3):
    """blocksize=None disables splitting: one value per matched file."""
    _, values = read_bytes(test_bucket_name+'/test/accounts.*', blocksize=None,
            s3=s3)
    assert sum(len(inner) for inner in values) == len(files)
Example #12
0
def test_read_bytes_non_existing_glob(s3):
    """A glob matching nothing raises IOError rather than returning empty."""
    missing = test_bucket_name+'/non-existing/*'
    with pytest.raises(IOError):
        read_bytes(missing, s3=s3)