def test_read_csv_sync(loop):
    """Synchronous API: read_csv against a local cluster, lazy and eager.

    Both modes must produce a dask DataFrame whose ``amount`` column sums
    to the known fixture total.
    """
    dd = pytest.importorskip('dask.dataframe')
    expected_total = 100 + 200 + 300 + 400 + 500 + 600
    with cluster() as (scheduler, [worker_a, worker_b]):
        with Executor(('127.0.0.1', scheduler['port']), loop=loop) as executor:
            # Lazy collection: computation happens only when requested
            # through the executor.
            lazy_frame = read_csv('distributed-test/csv/2015/', lazy=True)
            assert isinstance(lazy_frame, dd.DataFrame)
            assert list(lazy_frame.columns) == ['name', 'amount', 'id']
            total_future = executor.compute(lazy_frame.amount.sum())
            assert total_future.result() == expected_total

            # Eager collection: data is loaded up front and compute()
            # can be called directly on the collection.
            eager_frame = read_csv('distributed-test/csv/2015/', lazy=False)
            assert eager_frame.amount.sum().compute() == expected_total
def test_read_csv_gzip(e, s, a, b):
    """read_csv on gzip-compressed CSV files yields a working dask DataFrame.

    Runs as a coroutine-style test: ``e`` is the executor, ``s`` the
    scheduler, ``a``/``b`` workers.
    """
    dd = pytest.importorskip('dask.dataframe')
    # The original code bound this to an unused local (`s3 = ...`).  The
    # constructor call is kept because s3fs caches filesystem instances, so
    # this presumably primes the anonymous S3 connection that read_csv
    # reuses -- TODO confirm; the unused binding itself is dropped.
    S3FileSystem(anon=True)
    df = read_csv('distributed-test/csv/gzip/', compression='gzip')
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']
    f = e.compute(df.amount.sum())
    result = yield f._result()
    assert result == (100 + 200 + 300 + 400 + 500 + 600)
def test_read_csv(e, s, a, b):
    """Exercise read_csv across its lazy/eager and collection/futures modes.

    Covers: lazy collection (no scheduler tasks created), eager collection,
    ``collection=False`` returning Futures (eager) or Delayed objects (lazy),
    and a small ``blocksize`` producing extra partitions.
    """
    dd = pytest.importorskip('dask.dataframe')
    # The original code bound this to an unused local (`s3 = ...`).  The
    # constructor call is kept because s3fs caches filesystem instances, so
    # this presumably primes the anonymous S3 connection -- TODO confirm;
    # the unused binding itself is dropped.
    S3FileSystem(anon=True)

    # Lazy collection: after a short pause, no tasks should have reached
    # the scheduler.
    df = read_csv('distributed-test/csv/2015/*', lazy=True,
                  storage_options={'anon': True})
    yield gen.sleep(0.1)
    assert not s.tasks
    assert isinstance(df, dd.DataFrame)

    # Default (eager) collection: columns and sum match the fixture data.
    df = read_csv('distributed-test/csv/2015/*',
                  storage_options={'anon': True})
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']
    f = e.compute(df.amount.sum())
    result = yield f._result()
    assert result == (100 + 200 + 300 + 400 + 500 + 600)

    # collection=False, eager: one Future per input file.
    futures = read_csv('distributed-test/csv/2015/*', collection=False,
                       lazy=False, storage_options={'anon': True})
    assert len(futures) == 3
    assert all(isinstance(f, Future) for f in futures)
    results = yield e._gather(futures)
    assert results[0].id.sum() == 1 + 2 + 3
    assert results[1].id.sum() == 0
    assert results[2].id.sum() == 4 + 5 + 6

    # collection=False, lazy: one Delayed per input file.
    values = read_csv('distributed-test/csv/2015/*', collection=False,
                      lazy=True, storage_options={'anon': True})
    assert len(values) == 3
    assert all(isinstance(v, Delayed) for v in values)

    # A small blocksize must split files into more partitions than the
    # default read produced.
    df2 = read_csv('distributed-test/csv/2015/*', collection=True,
                   lazy=True, blocksize=20, storage_options={'anon': True})
    assert df2.npartitions > df.npartitions
    result = yield e.compute(df2.id.sum())._result()
    assert result == 1 + 2 + 3 + 4 + 5 + 6

    # Same small blocksize, eager mode.
    df2 = read_csv('distributed-test/csv/2015/*', collection=True,
                   lazy=False, blocksize=20, storage_options={'anon': True})
    f = e.compute(df2.amount.sum())
    result = yield f._result()
    assert result == (100 + 200 + 300 + 400 + 500 + 600)