Example #1
0
def test_enforce_dtypes():
    """Check that every computed block has the same dtypes as ``head``.

    Two identical "files" are supplied, each split into two byte blocks.
    Only the first block of each file starts with the ``aa,bb`` header row;
    the second block holds integer-looking values for both columns.
    """
    # Rows must be comma-separated so pandas infers ``aa`` as integer and
    # ``bb`` as float; a row such as ``2.2.0`` would be parsed as a single
    # field and turn ``aa`` into an object column.
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    # The sample frame whose dtypes act as the reference.
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {}, collection=False)
    dfs = compute(*dfs)
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)
Example #2
0
def test_blocked():
    """Read files that were split into two-line blocks and compare results.

    Each entry of ``files`` is split on newlines and regrouped two lines at
    a time. The blocks are read once with no extra keyword arguments and
    once restricted to the ``name`` and ``id`` columns; both results are
    compared with ``expected`` after resetting the index, ignoring dtypes.
    """
    blocks = [
        [b'\n'.join(pair)
         for pair in partition_all(2, files[name].split(b'\n'))]
        for name in sorted(files)
    ]

    # Full read: every column.
    full = read_csv_from_bytes(blocks, header, expected.head(), {})
    full_result = full.compute().reset_index(drop=True)
    eq(full_result, expected.reset_index(drop=True), check_dtype=False)

    # Restricted read: only the columns listed in ``usecols``.
    expected2 = expected[['name', 'id']]
    read_kwargs = {'usecols': ['name', 'id']}
    narrowed = read_csv_from_bytes(blocks, header, expected2.head(),
                                   read_kwargs)
    narrowed_result = narrowed.compute().reset_index(drop=True)
    eq(narrowed_result, expected2.reset_index(drop=True), check_dtype=False)
Example #3
0
def test_read_csv_simple():
    """Exercise ``read_csv_from_bytes`` in collection and list modes.

    With ``collection=True`` the result must be a ``dd.DataFrame`` with the
    columns ``name``, ``amount`` and ``id``. With ``collection=False`` it
    must be a list of three items that each expose a ``dask`` attribute.
    The ``amount`` column of the collection must sum to 2100.
    """
    kwargs = {}
    # One single-block "file" per entry in ``files``, in sorted key order.
    blocks = [[files[name]] for name in sorted(files)]
    head = bytes_read_csv(files['2014-01-01.csv'], b'', {})

    frame = read_csv_from_bytes(blocks, header, head, kwargs,
                                collection=True)
    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ['name', 'amount', 'id']

    parts = read_csv_from_bytes(blocks, header, head, kwargs,
                                collection=False)
    assert isinstance(parts, list)
    assert len(parts) == 3
    for part in parts:
        assert hasattr(part, 'dask')

    total = frame.amount.sum().compute(get=get_sync)
    assert total == sum((100, 200, 300, 400, 500, 600))
Example #4
0
def test_enforce_dtypes():
    """Check dtype enforcement when ``enforce_dtypes=True`` is requested.

    Two identical "files" are supplied, each split into two byte blocks.
    Only the first block of each file starts with the ``aa,bb`` header row;
    the second block holds integer-looking values for both columns. After
    computing, every resulting frame must report the same dtypes as
    ``head``.
    """
    # Rows must be comma-separated so pandas infers ``aa`` as integer and
    # ``bb`` as float; a row such as ``2.2.0`` would be parsed as a single
    # field and turn ``aa`` into an object column.
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    # The sample frame whose dtypes act as the reference.
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {},
                              enforce_dtypes=True, collection=False)
    dfs = compute(*dfs)
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)
Example #5
0
def test_enforce_columns():
    """Expect ``ValueError`` when a file's header differs from ``head``.

    The first file's header is ``aa,bb`` while the second file's header is
    ``AA,bb``. With ``enforce=True`` the mismatch is expected to surface as
    a ``ValueError``, either while building the graph or while computing it.
    """
    # Data rows are well-formed two-field CSV so the header-name mismatch
    # (``aa`` vs ``AA``) is the only difference between the two files.
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'AA,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    with pytest.raises(ValueError):
        dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {},
                                  collection=False, enforce=True)
        compute(*dfs)
Example #6
0
def test_enforce_columns():
    """Expect ``ValueError`` when column names disagree across files.

    Both files are split into two byte blocks. The first file starts with
    the header ``aa,bb``; the second starts with ``AA,bb``. The sample frame
    ``head`` is parsed from the first file, and ``enforce=True`` is passed
    so that the differing header is expected to raise ``ValueError`` during
    graph construction or computation.
    """
    # Keep every data row as valid ``int,float`` CSV; only the header of
    # the second file is intentionally different.
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'AA,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    with pytest.raises(ValueError):
        dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {},
                                  collection=False, enforce=True)
        compute(*dfs)
Example #7
0
def test_kwargs():
    """Check that ``usecols`` passed via kwargs limits the columns read.

    The collection built from the blocks must expose exactly the columns
    ``name`` and ``id``, and the computed result must carry the same
    columns as the collection.
    """
    kwargs = {'usecols': ['name', 'id']}
    # One single-block "file" per entry in ``files``, in sorted key order.
    blocks = [[files[name]] for name in sorted(files)]
    head = bytes_read_csv(files['2014-01-01.csv'], b'', kwargs)

    frame = read_csv_from_bytes(blocks, header, head, kwargs,
                                collection=True)
    assert list(frame.columns) == ['name', 'id']

    computed = frame.compute()
    assert (computed.columns == frame.columns).all()