def test_text_blocks_to_pandas_simple(reader, files):
    """One block per file: check collection/list modes and the summed column."""
    data_blocks = [[files[name]] for name in sorted(files)]
    read_kwargs = {}
    sample = pandas_read_text(reader, files["2014-01-01.csv"], b"", {})
    first_line = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    # collection=True should hand back a dask DataFrame with the CSV columns.
    ddf = text_blocks_to_pandas(
        reader, data_blocks, first_line, sample, read_kwargs, collection=True
    )
    assert isinstance(ddf, dd.DataFrame)
    assert list(ddf.columns) == ["name", "amount", "id"]

    # collection=False should hand back one delayed-like object per block.
    parts = text_blocks_to_pandas(
        reader, data_blocks, first_line, sample, read_kwargs, collection=False
    )
    assert isinstance(parts, list)
    assert len(parts) == 3
    assert all(hasattr(part, "dask") for part in parts)

    assert_eq(ddf.amount.sum(), 100 + 200 + 300 + 400 + 500 + 600)
def test_text_blocks_to_pandas_simple(reader, files):
    """One block per file: check collection/list modes and the summed column.

    Fix: ``.compute(get=get_sync)`` uses the ``get=`` keyword that was removed
    from dask (replaced by ``scheduler=``); compute with the synchronous
    scheduler explicitly instead.
    """
    blocks = [[files[k]] for k in sorted(files)]
    kwargs = {}
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', {})
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    # collection=True should hand back a dask DataFrame with the CSV columns.
    df = text_blocks_to_pandas(reader, blocks, header, head, kwargs, collection=True)
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']

    # collection=False should hand back one delayed-like object per block.
    values = text_blocks_to_pandas(reader, blocks, header, head, kwargs, collection=False)
    assert isinstance(values, list)
    assert len(values) == 3
    assert all(hasattr(item, 'dask') for item in values)

    # scheduler='sync' is the supported replacement for the removed get= kwarg.
    result = df.amount.sum().compute(scheduler='sync')
    assert result == (100 + 200 + 300 + 400 + 500 + 600)
def test_enforce_dtypes(reader, blocks):
    """Every computed partition keeps the dtypes inferred from the head sample."""
    sample = reader(BytesIO(blocks[0][0]), header=0)
    first_line = blocks[0][0].split(b'\n')[0] + b'\n'
    parts = text_blocks_to_pandas(
        reader, blocks, first_line, sample, {}, collection=False
    )
    frames = dask.compute(*parts, scheduler='sync')
    expected_dtypes = sample.dtypes.to_dict()
    assert all(frame.dtypes.to_dict() == expected_dtypes for frame in frames)
def test_text_blocks_to_pandas_blocked(reader, files):
    """Files split into multi-line sub-blocks still round-trip to `expected`."""
    first_line = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    # Chop every file into blocks of two lines each.
    sub_blocks = []
    for name in sorted(files):
        lines = files[name].split(b'\n')
        sub_blocks.append([b'\n'.join(chunk) for chunk in partition_all(2, lines)])

    ddf = text_blocks_to_pandas(reader, sub_blocks, first_line, expected.head(), {})
    assert_eq(
        ddf.compute().reset_index(drop=True),
        expected.reset_index(drop=True),
        check_dtype=False,
    )

    # Restricting to a subset of columns via usecols should also round-trip.
    subset = expected[['name', 'id']]
    ddf = text_blocks_to_pandas(
        reader, sub_blocks, first_line, subset.head(), {'usecols': ['name', 'id']}
    )
    assert_eq(
        ddf.compute().reset_index(drop=True),
        subset.reset_index(drop=True),
        check_dtype=False,
    )
def test_enforce_columns(reader, blocks):
    """A mismatched header in a later block must raise under enforce=True.

    Fix: without ``collection=False``, ``text_blocks_to_pandas`` returns a
    dask collection, so ``dask.compute(*dfs)`` would unpack its columns rather
    than the per-block delayed objects. Pass ``collection=False`` so the
    subsequent compute evaluates each partition (matching the sibling variant
    of this test).
    """
    # Replace second header with different column name
    blocks = [blocks[0], [blocks[1][0].replace(b"a", b"A"), blocks[1][1]]]
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b"\n")[0] + b"\n"
    with pytest.raises(ValueError):
        dfs = text_blocks_to_pandas(
            reader, blocks, header, head, {}, collection=False, enforce=True
        )
        dask.compute(*dfs, scheduler="sync")
def test_enforce_columns(reader, blocks):
    """A mismatched header in a later block must raise under enforce=True."""
    # Rewrite the second block's header so its column names disagree.
    corrupted = [blocks[0], [blocks[1][0].replace(b'a', b'A'), blocks[1][1]]]
    sample = reader(BytesIO(corrupted[0][0]), header=0)
    first_line = corrupted[0][0].split(b'\n')[0] + b'\n'
    with pytest.raises(ValueError):
        parts = text_blocks_to_pandas(
            reader,
            corrupted,
            first_line,
            sample,
            {},
            collection=False,
            enforce=True,
        )
        dask.compute(*parts, scheduler='sync')
def test_text_blocks_to_pandas_simple(reader, files):
    """One block per file: check collection/list modes and the summed column."""
    blocks = [[files[key]] for key in sorted(files)]
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', {})
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'
    opts = {}

    # As a collection we expect a dask DataFrame with the CSV's columns.
    frame = text_blocks_to_pandas(reader, blocks, header, head, opts, collection=True)
    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ['name', 'amount', 'id']

    # As a plain list we expect one dask-backed object per input block.
    pieces = text_blocks_to_pandas(reader, blocks, header, head, opts, collection=False)
    assert isinstance(pieces, list)
    assert len(pieces) == 3
    for piece in pieces:
        assert hasattr(piece, 'dask')

    assert_eq(frame.amount.sum(), 100 + 200 + 300 + 400 + 500 + 600)
def test_text_blocks_to_pandas_kwargs(reader, files):
    """Reader kwargs (usecols) propagate to both the graph and the result."""
    data_blocks = [[files[name]] for name in sorted(files)]
    read_kwargs = {"usecols": ["name", "id"]}
    sample = pandas_read_text(reader, files["2014-01-01.csv"], b"", read_kwargs)
    first_line = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    ddf = text_blocks_to_pandas(
        reader, data_blocks, first_line, sample, read_kwargs, collection=True
    )
    assert list(ddf.columns) == ["name", "id"]
    # The computed frame must agree with the lazy frame's column selection.
    computed = ddf.compute()
    assert (computed.columns == ddf.columns).all()
def test_text_blocks_to_pandas_kwargs(reader, files):
    """Reader kwargs (usecols) propagate to both the graph and the result."""
    opts = {'usecols': ['name', 'id']}
    blocks = [[files[key]] for key in sorted(files)]
    head = pandas_read_text(reader, files['2014-01-01.csv'], b'', opts)
    header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'

    frame = text_blocks_to_pandas(reader, blocks, header, head, opts, collection=True)
    assert list(frame.columns) == ['name', 'id']
    # Column selection must survive computation.
    assert (frame.compute().columns == frame.columns).all()
def test_text_blocks_to_pandas_blocked(reader, files):
    """Files split into multi-line sub-blocks still round-trip to `expected`."""
    header = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    # Each file becomes a list of two-line byte chunks.
    blocks = [
        [b"\n".join(pair) for pair in partition_all(2, files[name].split(b"\n"))]
        for name in sorted(files)
    ]

    df = text_blocks_to_pandas(reader, blocks, header, expected.head(), {})
    assert_eq(
        df.compute().reset_index(drop=True),
        expected.reset_index(drop=True),
        check_dtype=False,
    )

    # usecols should narrow the result to the selected columns only.
    narrowed = expected[["name", "id"]]
    df = text_blocks_to_pandas(
        reader, blocks, header, narrowed.head(), {"usecols": ["name", "id"]}
    )
    assert_eq(
        df.compute().reset_index(drop=True),
        narrowed.reset_index(drop=True),
        check_dtype=False,
    )
def test_enforce_dtypes(reader, blocks):
    """Every computed partition keeps the dtypes inferred from the head sample.

    Fix: the original omitted ``collection=False`` and called
    ``dask.compute(dfs)`` (no star), so the result was a 1-tuple containing
    the computed collection rather than one pandas frame per partition —
    the per-partition dtype check never actually ran. Request the list of
    delayed partitions and compute them individually (matching the sibling
    variant of this test).
    """
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b"\n")[0] + b"\n"
    dfs = text_blocks_to_pandas(reader, blocks, header, head, {}, collection=False)
    # Star-unpack so each delayed partition is computed to its own frame.
    dfs = dask.compute(*dfs, scheduler="sync")
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)