Esempio n. 1
0
def test_bytes_read_csv_with_header():
    """Parse a CSV payload whose first line is passed separately as the header.

    The ``'2014-01-01.csv'`` entry of ``files`` is split at its first newline;
    the first line is handed to ``bytes_read_csv`` as the header argument and
    the remainder as the data bytes.
    """
    raw = files['2014-01-01.csv']
    # maxsplit=1: only the first line is peeled off, the rest stays intact.
    header_line, body = raw.split(b'\n', 1)

    frame = bytes_read_csv(body, header_line, {})

    assert list(frame.columns) == ['name', 'amount', 'id']
    assert len(frame) == 3
    # The three rows are expected to carry ids 1, 2 and 3.
    assert frame.id.sum() == 1 + 2 + 3
Esempio n. 2
0
def test_bytes_read_csv_with_header():
    """Check ``bytes_read_csv`` when the header line is supplied on its own.

    The first line of ``files['2014-01-01.csv']`` is separated from the rest
    and passed as the second argument; the resulting frame must still have
    all three columns and all three rows.
    """
    content = files['2014-01-01.csv']
    pieces = content.split(b'\n', 1)
    # Unpacking requires exactly two pieces: the header line and the data.
    first_line, remainder = pieces

    parsed = bytes_read_csv(remainder, first_line, {})

    assert list(parsed.columns) == ['name', 'amount', 'id']
    assert len(parsed) == 3
    assert parsed.id.sum() == 1 + 2 + 3
Esempio n. 3
0
def test_kwargs(e, s, a, b):
    """Check that a ``usecols`` entry in kwargs restricts the columns read.

    Every entry of ``files`` (in sorted key order) is scattered through ``e``,
    each scattered item is wrapped in its own single-element list, and
    ``read_csv`` is called with ``lazy=False, collection=True``.

    ``header`` is not defined inside this function; it is resolved from the
    enclosing scope.
    """
    payloads = [files[name] for name in sorted(files)]
    scattered = yield e._scatter(payloads)
    # read_csv receives a list of lists: one inner list per scattered item.
    nested = [[item] for item in scattered]

    reader_kwargs = {'usecols': ['name', 'id']}
    head = bytes_read_csv(files['2014-01-01.csv'], '', reader_kwargs)

    df = read_csv(nested, header, head, reader_kwargs,
                  lazy=False, collection=True)
    assert list(df.columns) == ['name', 'id']

    # The computed result must expose the same columns as the collection.
    computed = yield e.compute(df)._result()
    assert (computed.columns == df.columns).all()
Esempio n. 4
0
def test_kwargs(e, s, a, b):
    """Verify that keyword arguments such as ``usecols`` are honoured.

    The values of ``files`` are scattered via ``e._scatter``, nested one per
    inner list, and read with ``read_csv(..., lazy=False, collection=True)``.
    Both the collection and its computed result must have only the
    ``name`` and ``id`` columns.

    Note: ``header`` comes from outside this function (it is not assigned
    here).
    """
    ordered_keys = sorted(files)
    raw_blocks = [files[key] for key in ordered_keys]
    remote_blocks = yield e._scatter(raw_blocks)
    wrapped_blocks = [[remote] for remote in remote_blocks]

    options = {'usecols': ['name', 'id']}
    head = bytes_read_csv(files['2014-01-01.csv'], '', options)

    df = read_csv(wrapped_blocks, header, head, options, lazy=False,
                  collection=True)
    assert list(df.columns) == ['name', 'id']

    outcome = yield e.compute(df)._result()
    assert (outcome.columns == df.columns).all()
Esempio n. 5
0
def test_read_csv(e, s, a, b):
    """Exercise ``read_csv`` across its ``lazy`` / ``collection`` modes.

    The checks run twice: once on the raw values of ``files`` and once on
    gzip-compressed copies (with ``compression='gzip'`` passed in kwargs).
    For each pass:

    * ``lazy=True, collection=True`` must give a ``dd.DataFrame`` with the
      three expected columns and leave ``len(s.tasks)`` unchanged;
    * ``lazy=True, collection=False`` must give a list of three objects that
      each have a ``dask`` attribute;
    * ``lazy=False, collection=False`` must give three ``Future`` objects
      whose gathered results have the expected ``id`` sums.

    ``header`` is not assigned in this function; it is resolved from the
    enclosing scope.
    """
    # Named ``raw_blocks`` rather than ``bytes`` so the builtin is not shadowed.
    raw_blocks = [files[name] for name in sorted(files)]
    gz_blocks = [gzip_compress(block) for block in raw_blocks]
    head = bytes_read_csv(files['2014-01-01.csv'], '', {})

    for _blocks, compression in [(raw_blocks, None), (gz_blocks, 'gzip')]:
        blocks = yield e._scatter(_blocks)
        blocks = [[block] for block in blocks]
        kwargs = {'compression': compression}

        # Building the lazy collection should not change s.tasks, even after
        # a short wait.
        ntasks = len(s.tasks)
        df = read_csv(blocks, header, head, kwargs, lazy=True, collection=True)
        assert isinstance(df, dd.DataFrame)
        assert list(df.columns) == ['name', 'amount', 'id']
        yield gen.sleep(0.1)
        assert len(s.tasks) == ntasks

        values = read_csv(blocks,
                          header,
                          head,
                          kwargs,
                          lazy=True,
                          collection=False)
        assert isinstance(values, list)
        assert len(values) == 3
        assert all(hasattr(item, 'dask') for item in values)

        total = e.compute(df.amount.sum())
        result = yield total._result()
        assert result == (100 + 200 + 300 + 400 + 500 + 600)

        futures = read_csv(blocks,
                           header,
                           head,
                           kwargs,
                           lazy=False,
                           collection=False)
        assert len(futures) == 3
        assert all(isinstance(fut, Future) for fut in futures)
        results = yield e._gather(futures)
        assert results[0].id.sum() == 1 + 2 + 3
        # The second block is expected to contribute no ids at all.
        assert results[1].id.sum() == 0
        assert results[2].id.sum() == 4 + 5 + 6
Esempio n. 6
0
def test_read_csv(e, s, a, b):
    """Check ``read_csv`` for plain and gzip input in every output mode.

    Two passes are made, one over the raw values of ``files`` and one over
    gzip-compressed copies (kwargs carries the matching ``compression``).
    Each pass asserts that:

    * the lazy collection is a ``dd.DataFrame`` with columns
      ``name, amount, id`` and that creating it leaves ``len(s.tasks)`` as
      it was;
    * the lazy non-collection result is a list of three items that each
      have a ``dask`` attribute;
    * the summed ``amount`` column equals 100 + ... + 600;
    * the non-lazy non-collection result is three ``Future`` objects whose
      gathered frames have the expected ``id`` sums.

    ``header`` is taken from the enclosing scope; it is not defined here.
    """
    # Avoid the name ``bytes`` for this list: it would hide the builtin type.
    plain_blocks = [files[key] for key in sorted(files)]
    compressed_blocks = [gzip_compress(chunk) for chunk in plain_blocks]
    head = bytes_read_csv(files['2014-01-01.csv'], '', {})

    cases = [(plain_blocks, None), (compressed_blocks, 'gzip')]
    for _blocks, compression in cases:
        blocks = yield e._scatter(_blocks)
        blocks = [[chunk] for chunk in blocks]
        kwargs = {'compression': compression}

        # Snapshot the task count so we can confirm the lazy call adds none.
        ntasks = len(s.tasks)
        df = read_csv(blocks, header, head, kwargs, lazy=True, collection=True)
        assert isinstance(df, dd.DataFrame)
        assert list(df.columns) == ['name', 'amount', 'id']
        yield gen.sleep(0.1)
        assert len(s.tasks) == ntasks

        values = read_csv(blocks, header, head, kwargs, lazy=True,
                          collection=False)
        assert isinstance(values, list)
        assert len(values) == 3
        assert all(hasattr(item, 'dask') for item in values)

        amount_sum = e.compute(df.amount.sum())
        result = yield amount_sum._result()
        assert result == (100 + 200 + 300 + 400 + 500 + 600)

        futures = read_csv(blocks, header, head, kwargs, lazy=False,
                           collection=False)
        assert len(futures) == 3
        assert all(isinstance(future, Future) for future in futures)
        results = yield e._gather(futures)
        assert results[0].id.sum() == 1 + 2 + 3
        assert results[1].id.sum() == 0
        assert results[2].id.sum() == 4 + 5 + 6
Esempio n. 7
0
def test_bytes_read_csv_kwargs():
    """A ``usecols`` entry in the kwargs dict limits the resulting columns."""
    options = {'usecols': ['name', 'id']}
    frame = bytes_read_csv(files['2014-01-01.csv'], '', options)

    # Only the two requested columns should be present, in this order.
    assert list(frame.columns) == ['name', 'id']
Esempio n. 8
0
def test_bytes_read_csv():
    """Parse ``files['2014-01-01.csv']`` with an empty second argument.

    ``bytes_read_csv`` is called with ``''`` and an empty kwargs dict; the
    result must have three columns, three rows, and ids summing to 1 + 2 + 3.
    """
    payload = files['2014-01-01.csv']
    frame = bytes_read_csv(payload, '', {})

    column_names = list(frame.columns)
    assert column_names == ['name', 'amount', 'id']
    assert len(frame) == 3
    assert frame.id.sum() == 1 + 2 + 3
Esempio n. 9
0
def test_bytes_read_csv_kwargs():
    """Check that ``usecols`` passed through kwargs selects the columns kept.

    Reads ``files['2014-01-01.csv']`` asking for ``name`` and ``id`` only.
    """
    parsed = bytes_read_csv(
        files['2014-01-01.csv'],
        '',
        {'usecols': ['name', 'id']},
    )
    assert list(parsed.columns) == ['name', 'id']
Esempio n. 10
0
def test_bytes_read_csv():
    """Read one entry of ``files`` and check its columns, length and ids.

    The call passes ``''`` as the second argument and ``{}`` as kwargs.
    """
    parsed = bytes_read_csv(files['2014-01-01.csv'], '', {})

    assert list(parsed.columns) == ['name', 'amount', 'id']
    # Three data rows, carrying ids 1, 2 and 3.
    assert len(parsed) == 3
    assert parsed.id.sum() == 1 + 2 + 3