Example no. 1
0
def test_read_csv_sync(loop):
    """Synchronous-API smoke test: read CSVs both lazily and eagerly."""
    dd = pytest.importorskip('dask.dataframe')
    expected_total = 100 + 200 + 300 + 400 + 500 + 600
    with cluster() as (scheduler, [worker_a, worker_b]):
        with Executor(('127.0.0.1', scheduler['port']), loop=loop) as executor:
            # Lazy read: returns an uncomputed dask DataFrame.
            lazy_frame = read_csv('distributed-test/csv/2015/', lazy=True)
            assert isinstance(lazy_frame, dd.DataFrame)
            assert list(lazy_frame.columns) == ['name', 'amount', 'id']
            future = executor.compute(lazy_frame.amount.sum())
            assert future.result() == expected_total

            # Eager read: data is loaded on the cluster immediately.
            eager_frame = read_csv('distributed-test/csv/2015/', lazy=False)
            assert eager_frame.amount.sum().compute() == expected_total
Example no. 2
0
def test_read_csv_sync(loop):
    """Synchronous-API smoke test: read CSVs both lazily and eagerly."""
    dd = pytest.importorskip('dask.dataframe')
    total = 100 + 200 + 300 + 400 + 500 + 600
    with cluster() as (sched, [w1, w2]):
        with Executor(('127.0.0.1', sched['port']), loop=loop) as exc:
            # Lazy variant hands back an uncomputed dask DataFrame.
            frame = read_csv('distributed-test/csv/2015/', lazy=True)
            assert isinstance(frame, dd.DataFrame)
            assert list(frame.columns) == ['name', 'amount', 'id']
            assert exc.compute(frame.amount.sum()).result() == total

            # Eager variant materializes right away; .compute() still works.
            frame = read_csv('distributed-test/csv/2015/', lazy=False)
            assert frame.amount.sum().compute() == total
Example no. 3
0
def test_read_csv_gzip(e, s, a, b):
    """Read gzip-compressed CSVs from S3 and verify the summed column."""
    dd = pytest.importorskip('dask.dataframe')
    # NOTE(review): `s3` is never referenced again — presumably created for
    # its anonymous-connection side effect; confirm before removing.
    s3 = S3FileSystem(anon=True)

    frame = read_csv('distributed-test/csv/gzip/', compression='gzip')
    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ['name', 'amount', 'id']
    total = yield e.compute(frame.amount.sum())._result()
    assert total == (100 + 200 + 300 + 400 + 500 + 600)
Example no. 4
0
def test_read_csv_gzip(e, s, a, b):
    """Read gzip-compressed CSVs from S3 and verify the summed column."""
    dd = pytest.importorskip('dask.dataframe')
    # NOTE(review): handle appears unused after construction — presumably
    # kept for its connection side effect; verify before deleting.
    s3 = S3FileSystem(anon=True)

    expected = 100 + 200 + 300 + 400 + 500 + 600
    df = read_csv('distributed-test/csv/gzip/', compression='gzip')
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']
    fut = e.compute(df.amount.sum())
    got = yield fut._result()
    assert got == expected
Example no. 5
0
def test_read_csv(e, s, a, b):
    """End-to-end read_csv coverage: lazy vs. eager, collection vs. parts."""
    dd = pytest.importorskip('dask.dataframe')
    # NOTE(review): `s3` is never referenced again — presumably constructed
    # for its anonymous-connection side effect; confirm before removing.
    s3 = S3FileSystem(anon=True)

    path = 'distributed-test/csv/2015/*'
    amount_total = 100 + 200 + 300 + 400 + 500 + 600

    # Lazy collection: nothing should be scheduled on the cluster yet.
    lazy_frame = read_csv(path, lazy=True, storage_options={'anon': True})
    yield gen.sleep(0.1)
    assert not s.tasks
    assert isinstance(lazy_frame, dd.DataFrame)

    # Default (eager) collection.
    frame = read_csv(path, storage_options={'anon': True})
    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ['name', 'amount', 'id']

    total = yield e.compute(frame.amount.sum())._result()
    assert total == amount_total

    # collection=False, lazy=False -> one Future per input file.
    futures = read_csv(path, collection=False, lazy=False,
                       storage_options={'anon': True})
    assert len(futures) == 3
    assert all(isinstance(f, Future) for f in futures)
    parts = yield e._gather(futures)
    assert parts[0].id.sum() == 1 + 2 + 3
    assert parts[1].id.sum() == 0
    assert parts[2].id.sum() == 4 + 5 + 6

    # collection=False, lazy=True -> one Delayed per input file.
    delayeds = read_csv(path, collection=False, lazy=True,
                        storage_options={'anon': True})
    assert len(delayeds) == 3
    assert all(isinstance(v, Delayed) for v in delayeds)

    # Small blocksize -> strictly more partitions than the default read.
    small_lazy = read_csv(path, collection=True, lazy=True, blocksize=20,
                          storage_options={'anon': True})
    assert small_lazy.npartitions > frame.npartitions
    id_total = yield e.compute(small_lazy.id.sum())._result()
    assert id_total == 1 + 2 + 3 + 4 + 5 + 6

    # Same small blocksize, eager this time.
    small_eager = read_csv(path, collection=True, lazy=False, blocksize=20,
                           storage_options={'anon': True})
    total = yield e.compute(small_eager.amount.sum())._result()
    assert total == amount_total
Example no. 6
0
def test_read_csv(e, s, a, b):
    """Exercise every read_csv mode: lazy/eager x collection/individual parts."""
    dd = pytest.importorskip('dask.dataframe')
    # NOTE(review): handle unused after construction — presumably kept for
    # its connection side effect; verify before deleting.
    s3 = S3FileSystem(anon=True)

    glob = 'distributed-test/csv/2015/*'
    sum_amount = 100 + 200 + 300 + 400 + 500 + 600

    # Lazy: scheduler should stay idle until something is computed.
    df_lazy = read_csv(glob, lazy=True, storage_options={'anon': True})
    yield gen.sleep(0.1)
    assert not s.tasks
    assert isinstance(df_lazy, dd.DataFrame)

    # Eager default path.
    df = read_csv(glob, storage_options={'anon': True})
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']

    fut = e.compute(df.amount.sum())
    got = yield fut._result()
    assert got == sum_amount

    # Non-collection, eager: a list of Futures, one per file.
    futures = read_csv(glob, collection=False, lazy=False,
                       storage_options={'anon': True})
    assert len(futures) == 3
    assert all(isinstance(f, Future) for f in futures)
    gathered = yield e._gather(futures)
    assert gathered[0].id.sum() == 1 + 2 + 3
    assert gathered[1].id.sum() == 0
    assert gathered[2].id.sum() == 4 + 5 + 6

    # Non-collection, lazy: a list of Delayed values, one per file.
    pieces = read_csv(glob, collection=False, lazy=True,
                      storage_options={'anon': True})
    assert len(pieces) == 3
    assert all(isinstance(v, Delayed) for v in pieces)

    # Tiny blocksize splits into more partitions than the default read.
    df_small = read_csv(glob, collection=True, lazy=True, blocksize=20,
                        storage_options={'anon': True})
    assert df_small.npartitions > df.npartitions
    got = yield e.compute(df_small.id.sum())._result()
    assert got == 1 + 2 + 3 + 4 + 5 + 6

    # Tiny blocksize again, eager this time.
    df_small = read_csv(glob, collection=True, lazy=False, blocksize=20,
                        storage_options={'anon': True})
    fut = e.compute(df_small.amount.sum())
    got = yield fut._result()
    assert got == sum_amount