Example #1
def test_text_blocks_to_pandas_simple(reader, files):
    blocks = [[files[k]] for k in sorted(files)]
    kwargs = {}
    head = pandas_read_text(reader, files["2014-01-01.csv"], b"", {})
    header = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    df = text_blocks_to_pandas(reader,
                               blocks,
                               header,
                               head,
                               kwargs,
                               collection=True)
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ["name", "amount", "id"]

    values = text_blocks_to_pandas(reader,
                                   blocks,
                                   header,
                                   head,
                                   kwargs,
                                   collection=False)
    assert isinstance(values, list)
    assert len(values) == 3
    assert all(hasattr(item, "dask") for item in values)

    assert_eq(df.amount.sum(), 100 + 200 + 300 + 400 + 500 + 600)
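
These snippets are drawn from Dask's CSV test suite and depend on pytest fixtures that the listing does not show. Below is a minimal sketch of the imports and of plausible `reader`/`files` values; the module paths and the fixture data are assumptions reconstructed from the assertions above (three files, amounts summing to 2100), not the suite's exact definitions.

from io import BytesIO

import pandas as pd
import pytest
import dask
import dask.dataframe as dd
from dask.dataframe.io.csv import pandas_read_text, text_blocks_to_pandas
from dask.dataframe.utils import assert_eq
from toolz import partition_all

# Assumed stand-ins for the 'reader' and 'files' fixtures; Dask's suite
# parametrizes 'reader' over several pandas readers (read_csv, read_table, ...).
reader = pd.read_csv
files = {
    "2014-01-01.csv": b"name,amount,id\nAlice,100,1\nBob,200,2\nCharlie,300,3\n",
    "2014-01-02.csv": b"name,amount,id\n",
    "2014-01-03.csv": b"name,amount,id\nDennis,400,4\nEdith,500,5\nFrank,600,6\n",
}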
Example #2
def test_text_blocks_to_pandas_simple(reader, files):
    blocks = [[files[k]] for k in sorted(files)]
    kwargs = {}
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', {})
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    df = text_blocks_to_pandas(reader,
                               blocks,
                               header,
                               head,
                               kwargs,
                               collection=True)
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']

    values = text_blocks_to_pandas(reader,
                                   blocks,
                                   header,
                                   head,
                                   kwargs,
                                   collection=False)
    assert isinstance(values, list)
    assert len(values) == 3
    assert all(hasattr(item, 'dask') for item in values)

    # Older Dask spelled this as .compute(get=get_sync); the current API uses scheduler='sync'.
    result = df.amount.sum().compute(scheduler='sync')
    assert result == (100 + 200 + 300 + 400 + 500 + 600)
Example #3
def test_enforce_dtypes(reader, blocks):
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                collection=False)
    dfs = dask.compute(*dfs, scheduler='sync')
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)
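
The dtype- and column-enforcement tests use a separate `blocks` fixture: one entry per file, each a list of raw byte blocks, with the header row present only in a file's first block. The exact values below are an assumption (a representative shape consistent with the header=0 read above and with the b'a' -> b'A' header rewrite further down), not the fixture's real definition.

# Hypothetical stand-in for the 'blocks' fixture.
blocks = [
    [b"aa,bb\n1,1.0\n2,2.0", b"10,20\n30,40"],  # file 1: header block + data block
    [b"aa,bb\n1,1.0\n2,2.0", b"10,20\n30,40"],  # file 2: same layout
]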
Example #4
def test_text_blocks_to_pandas_blocked(reader, files):
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'
    blocks = []
    for k in sorted(files):
        b = files[k]
        lines = b.split(b'\n')
        blocks.append([b'\n'.join(bs) for bs in partition_all(2, lines)])

    df = text_blocks_to_pandas(reader, blocks, header, expected.head(), {})
    assert_eq(df.compute().reset_index(drop=True),
              expected.reset_index(drop=True), check_dtype=False)

    expected2 = expected[['name', 'id']]
    df = text_blocks_to_pandas(reader, blocks, header, expected2.head(),
                               {'usecols': ['name', 'id']})
    assert_eq(df.compute().reset_index(drop=True),
              expected2.reset_index(drop=True), check_dtype=False)
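
The blocked tests also reference a module-level `expected` frame that the listing omits. A plausible definition (an assumption, not shown in the source) is simply the per-file data concatenated with pandas:

# Assumed definition of the module-level 'expected' DataFrame used above.
expected = pd.concat([pd.read_csv(BytesIO(files[k])) for k in sorted(files)])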
Example #5
def test_enforce_columns(reader, blocks):
    # Replace second header with different column name
    blocks = [blocks[0], [blocks[1][0].replace(b"a", b"A"), blocks[1][1]]]
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b"\n")[0] + b"\n"
    with pytest.raises(ValueError):
        dfs = text_blocks_to_pandas(reader, blocks, header, head, {}, enforce=True)
        dask.compute(*dfs, scheduler="sync")
Example #6
def test_enforce_columns(reader, blocks):
    # Replace second header with different column name
    blocks = [blocks[0], [blocks[1][0].replace(b'a', b'A'), blocks[1][1]]]
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    with pytest.raises(ValueError):
        dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                    collection=False, enforce=True)
        dask.compute(*dfs, scheduler='sync')
Example #7
def test_text_blocks_to_pandas_simple(reader, files):
    blocks = [[files[k]] for k in sorted(files)]
    kwargs = {}
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', {})
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    df = text_blocks_to_pandas(reader, blocks, header, head, kwargs,
                               collection=True)
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']

    values = text_blocks_to_pandas(reader, blocks, header, head, kwargs,
                                   collection=False)
    assert isinstance(values, list)
    assert len(values) == 3
    assert all(hasattr(item, 'dask') for item in values)

    assert_eq(df.amount.sum(),
              100 + 200 + 300 + 400 + 500 + 600)
Example #8
def test_text_blocks_to_pandas_kwargs(reader, files):
    blocks = [files[k] for k in sorted(files)]
    blocks = [[b] for b in blocks]
    kwargs = {"usecols": ["name", "id"]}
    head = pandas_read_text(reader, files["2014-01-01.csv"], b"", kwargs)
    header = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    df = text_blocks_to_pandas(reader, blocks, header, head, kwargs, collection=True)
    assert list(df.columns) == ["name", "id"]
    result = df.compute()
    assert (result.columns == df.columns).all()
Example #9
def test_text_blocks_to_pandas_kwargs(reader, files):
    blocks = [files[k] for k in sorted(files)]
    blocks = [[b] for b in blocks]
    kwargs = {'usecols': ['name', 'id']}
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', kwargs)
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    df = text_blocks_to_pandas(reader, blocks, header, head, kwargs,
                               collection=True)
    assert list(df.columns) == ['name', 'id']
    result = df.compute()
    assert (result.columns == df.columns).all()
Example #10
def test_text_blocks_to_pandas_blocked(reader, files):
    header = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"
    blocks = []
    for k in sorted(files):
        b = files[k]
        lines = b.split(b"\n")
        blocks.append([b"\n".join(bs) for bs in partition_all(2, lines)])

    df = text_blocks_to_pandas(reader, blocks, header, expected.head(), {})
    assert_eq(
        df.compute().reset_index(drop=True),
        expected.reset_index(drop=True),
        check_dtype=False,
    )

    expected2 = expected[["name", "id"]]
    df = text_blocks_to_pandas(reader, blocks, header, expected2.head(),
                               {"usecols": ["name", "id"]})
    assert_eq(
        df.compute().reset_index(drop=True),
        expected2.reset_index(drop=True),
        check_dtype=False,
    )
Example #11
def test_enforce_dtypes(reader, blocks):
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b"\n")[0] + b"\n"
    dfs = text_blocks_to_pandas(reader, blocks, header, head, {})
    dfs = dask.compute(dfs, scheduler="sync")
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)