def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

    yield e._shutdown()
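# The avro tests in this file reference module-level `data` and `avro_bytes`
# fixtures that are not shown in this section. A minimal sketch of how they
# could be built, assuming fastavro is available; the schema and record count
# here are hypothetical, not the original fixtures:
import io

import fastavro

schema = {'name': 'Test', 'type': 'record',
          'fields': [{'name': 'x', 'type': 'int'}]}
data = [{'x': i} for i in range(100)]  # records the tests slice with [:5] / [-5:]

_buf = io.BytesIO()
fastavro.writer(_buf, schema, data)
avro_bytes = _buf.getvalue()  # a complete Avro container file, as bytes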
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n', lazy=False)
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

        yield e._shutdown()
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv', lineterminator='\n',
                                   header=True, collection=False, lazy=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv', lineterminator='\n',
                              header=True, collection=True, lazy=False)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()
def test_read_bytes(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'a' * int(1e8)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes('hdfs://' + fn)
        assert sample[:5] == b'aaaaa'
        assert len(values[0]) == len(blocks)

        while not s.host_restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values[0]} == set(s.host_restrictions)
        assert {v.key for v in values[0]} == set(s.loose_restrictions)

        futures = c.compute(values[0])
        results = yield c._gather(futures)
        assert b''.join(results) == data
        assert s.host_restrictions
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n', collection=False,
                                     header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n', collection=True,
                                 header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(s['address'], loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n', collection=False,
                                     header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n', collection=True,
                                 header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test__read_text(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())
        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())
        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = read_text('/tmp/test/text.*.txt', collection=True, lazy=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(b.str.strip().str.split().map(len))
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = read_text('/tmp/test/other.txt', collection=True, lazy=False)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = read_text('/tmp/test/text.*.txt', collection=False, lazy=False)
        assert all(isinstance(x, Future) for x in L)

        L = read_text('/tmp/test/text.*.txt', collection=False, lazy=True)
        assert all(isinstance(x, Value) for x in L)
def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:  # binary mode: avro_bytes is bytes
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)
def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())
        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())
        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)

        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().flatten())
        result = yield future
        assert result == ['a', 'b', 'c', 'd']
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes(fn, hdfs=hdfs)
        assert sample[:5] == b'aaaaa'
        assert len(values) == len(blocks)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values} == set(s.restrictions)
        assert {v.key for v in values} == set(s.loose_restrictions)

        futures = e.compute(values)
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:  # binary mode: data is bytes
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)

        assert sum(map(len, dfs2)) == n * 2 - 1
def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())
        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())
        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)

        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                      lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', names=['amount', 'name'],
                         lineterminator='\n')
        assert list(df.columns) == ['amount', 'name']
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                b = read_text('/tmp/test/*.txt', lazy=False)
                assert list(b.str.upper()) == ['HELLO', 'WORLD']
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs:///tmp/test/*.txt')
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                         names=['amount', 'name'], lineterminator='\n')
        assert list(df.columns) == ['amount', 'name']
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(s['address'], loop=loop) as e:
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = read_text('/tmp/test/text.1.txt').map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs:///tmp/test/text.1.txt').map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)

        assert result == [{"x": 1}, {"x": 2}]
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                df = dd.read_csv('hdfs://%s/*.csv' % basedir, collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
def test_read_csv(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir, lineterminator='\n')
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5
def test__read_text_unicode(e, s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5
def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', lineterminator='\n')
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(s['address'], loop=loop) as e:
                df = dd.read_csv('hdfs://%s/*.csv' % basedir, collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                df = dd.read_csv('hdfs:///tmp/test/*.csv', collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(s['address'], loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_csv_with_names(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = yield _read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']

        yield e._shutdown()
def test_get_block_locations_nested():
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6
def test_read_bytes_sync(loop):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e3)
        for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
            with hdfs.open(fn, 'wb', replication=1) as f:
                f.write(data)

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*')
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
def test_get_block_locations_nested():
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)
            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_read_bytes_sync(loop):
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)
            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*', lazy=False)
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lineterminator='\n', lazy=False)
        assert df._known_dtype
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)
            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs:///tmp/test/file.*')
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
def test_write_bytes_2(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        path = 'hdfs://%s/' % basedir
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path)
        futures = c.compute(out)
        results = yield c._gather(futures)
        assert len(hdfs.ls(basedir)) == 5

        sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
        futures = c.compute(list(concat(vals)))
        results = yield c._gather(futures)
        assert data == results
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'abc\n' * int(1e3)
        fn = '/tmp/test/file'
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        x = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        y = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        z = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'c')

        assert [f.key for f in x] == [f.key for f in y]
        assert [f.key for f in x] != [f.key for f in z]
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir, lineterminator='\n')
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)
        assert result == 1 + 2 + 3 + 4
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', lineterminator='\n')
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4
def test_avro_sync(loop):
    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}
    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:  # binary mode: avro_bytes is bytes
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro')
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert L[0][:5] == data[:5]
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lazy=True, lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4
def test_get_block_locations_nested():
    with make_hdfs() as (hdfs, basedir):
        data = b'a'

        for i in range(3):
            hdfs.mkdir('%s/data-%d' % (basedir, i))
            for j in range(2):
                fn = '%s/data-%d/file-%d.csv' % (basedir, i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = [hdfs.get_block_locations(fn)
             for fn in hdfs.glob('%s/*/*.csv' % basedir)]
        L = list(concat(L))
        assert len(L) == 6
def test_get_block_locations():
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)  # todo: reduce block size to speed up test
        fn_1 = '/tmp/test/file1'
        fn_2 = '/tmp/test/file2'
        with hdfs.open(fn_1, 'wb', replication=1) as f:
            f.write(data)
        with hdfs.open(fn_2, 'wb', replication=1) as f:
            f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert L == (get_block_locations(hdfs, fn_1) +
                     get_block_locations(hdfs, fn_2))
        assert L[0]['filename'] == L[1]['filename'] == fn_1
        assert L[2]['filename'] == L[3]['filename'] == fn_2
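# The `get_block_locations` helper exercised above comes from the old
# distributed.hdfs module and is not shown here. A rough sketch of its
# behavior, inferred from these tests and assuming hdfs3's HDFileSystem API
# (`info`, `walk`, `get_block_locations`); this implementation is hypothetical:
def get_block_locations(hdfs, path):
    # Expand a directory into the files beneath it; keep a single file as-is
    if hdfs.info(path)['kind'] == 'directory':
        filenames = sorted(fn for fn in hdfs.walk(path)
                           if hdfs.info(fn)['kind'] == 'file')
    else:
        filenames = [path]
    # One record per HDFS block, tagged with the file it came from, which is
    # what the filename assertions above rely on
    return [dict(block, filename=fn)
            for fn in filenames
            for block in hdfs.get_block_locations(fn)]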
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:  # binary mode: bytes payload
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True,
                             lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
def test_write_bytes(e, s, a, b):
    from dask.bytes.core import write_bytes, read_bytes
    with make_hdfs() as hdfs:
        path = 'hdfs:///tmp/test/'
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path, hdfs=hdfs)
        futures = e.compute(out)
        results = yield e._gather(futures)
        assert len(hdfs.ls('/tmp/test/')) == 5

        sample, vals = read_bytes('hdfs:///tmp/test/*.part', hdfs=hdfs,
                                  lazy=True)
        futures = e.compute(vals)
        results = yield e._gather(futures)
        assert data == results
def test_avro_sync(loop):
    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}
    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro', lazy=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert L[0][:5] == data[:5]
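# All of these tests lean on a `make_hdfs` context manager that provides a
# clean HDFS working directory. Newer variants unpack it as `(hdfs, basedir)`;
# older ones receive the bare filesystem handle and hard-code '/tmp/test'.
# A minimal sketch of the newer form, assuming hdfs3 and a namenode on
# localhost:8020 (the host, port, and directory naming are assumptions):
from contextlib import contextmanager
import uuid

@contextmanager
def make_hdfs(host='localhost', port=8020):
    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(host=host, port=port)
    basedir = '/tmp/test-hdfs-%s' % uuid.uuid4().hex[:8]
    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
    hdfs.mkdir(basedir)
    try:
        yield hdfs, basedir
    finally:
        # Always clean up, even if the test body raised
        if hdfs.exists(basedir):
            hdfs.rm(basedir, recursive=True)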