def test_read_json_meta(orient, tmpdir): df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]}) df2 = df.assign(x=df.x + 0.5) lines = orient == "records" df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines) df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines) sol = pd.concat([df, df2]) meta = df2.iloc[:0] if orient == "values": # orient=values loses column names sol.columns = meta.columns = [0, 1] res = dd.read_json( str(tmpdir.join("fil*.json")), orient=orient, meta=meta, lines=lines ) assert_eq(res, sol) if orient == "records": # Also check chunked version res = dd.read_json( str(tmpdir.join("fil*.json")), orient=orient, meta=meta, lines=True, blocksize=50, ) assert_eq(res, sol, check_index=False)
def test_read_json_multiple_files_with_path_column(blocksize, tmpdir):
    fil1 = str(tmpdir.join("fil1.json")).replace(os.sep, "/")
    fil2 = str(tmpdir.join("fil2.json")).replace(os.sep, "/")
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    orient = "records"
    lines = True
    df.to_json(fil1, orient=orient, lines=lines)
    df2.to_json(fil2, orient=orient, lines=lines)
    path_dtype = pd.CategoricalDtype((fil1, fil2))
    df["path"] = pd.Series((fil1,) * len(df), dtype=path_dtype)
    df2["path"] = pd.Series((fil2,) * len(df2), dtype=path_dtype)
    sol = pd.concat([df, df2])
    res = dd.read_json(
        str(tmpdir.join("fil*.json")),
        orient=orient,
        lines=lines,
        include_path_column=True,
        blocksize=blocksize,
    )
    assert_eq(res, sol, check_index=False)

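# Illustrative sketch, not part of the original test suite: with
# ``include_path_column=True``, dd.read_json appends a categorical "path"
# column recording which input file each row came from.  The glob
# "data-*.json" is a placeholder assumption.
def _example_read_json_with_path_column(pattern="data-*.json"):
    import dask.dataframe as dd

    ddf = dd.read_json(
        pattern, orient="records", lines=True, include_path_column=True
    )
    # ddf["path"].dtype is a CategoricalDtype over the matched file paths
    return ddf
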
def test_tokenize_numpy_memmap_offset(tmpdir):
    # Test two different memmaps into the same numpy file
    fn = str(tmpdir.join("demo_data"))
    with open(fn, "wb") as f:
        f.write(b"ashekwicht")

    with open(fn, "rb") as f:
        mmap1 = np.memmap(f, dtype=np.uint8, mode="r", offset=0, shape=5)
        mmap2 = np.memmap(f, dtype=np.uint8, mode="r", offset=5, shape=5)

        assert tokenize(mmap1) != tokenize(mmap2)

        # also make sure that they tokenize correctly when taking sub-arrays
        sub1 = mmap1[1:-1]
        sub2 = mmap2[1:-1]

        assert tokenize(sub1) != tokenize(sub2)

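# Illustrative sketch, not part of the original test suite, of the property the
# memmap test relies on: dask.base.tokenize is deterministic, returning equal
# tokens for equal array contents and different tokens for different contents.
def _example_tokenize_determinism():
    import numpy as np
    from dask.base import tokenize

    a = np.arange(5, dtype=np.uint8)
    assert tokenize(a) == tokenize(a.copy())  # same data -> same token
    assert tokenize(a) != tokenize(a + 1)  # different data -> different token
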
def test_blockwise_dataframe_io(c, tmpdir, io, fuse, from_futures):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    # TODO: this configuration is flaky on osx in CI
    # See https://github.com/dask/dask/issues/8816
    if from_futures and sys.platform == "darwin":
        pytest.xfail("This test sometimes fails on osx in CI")

    df = pd.DataFrame({"x": [1, 2, 3] * 5, "y": range(15)})

    if from_futures:
        parts = [df.iloc[:5], df.iloc[5:10], df.iloc[10:15]]
        futs = c.scatter(parts)
        ddf0 = dd.from_delayed(futs, meta=parts[0])
    else:
        ddf0 = dd.from_pandas(df, npartitions=3)

    if io.startswith("parquet"):
        if io == "parquet-pyarrow":
            pytest.importorskip("pyarrow.parquet")
            engine = "pyarrow"
        else:
            pytest.importorskip("fastparquet")
            engine = "fastparquet"
        ddf0.to_parquet(str(tmpdir), engine=engine)
        ddf = dd.read_parquet(str(tmpdir), engine=engine)
    elif io == "csv":
        ddf0.to_csv(str(tmpdir), index=False)
        ddf = dd.read_csv(os.path.join(str(tmpdir), "*"))
    elif io == "hdf":
        pytest.importorskip("tables")
        fn = str(tmpdir.join("h5"))
        ddf0.to_hdf(fn, "/data*")
        ddf = dd.read_hdf(fn, "/data*")

    df = df[["x"]] + 10
    ddf = ddf[["x"]] + 10
    with dask.config.set({"optimization.fuse.active": fuse}):
        ddf.compute()
        dsk = dask.dataframe.optimize(ddf.dask, ddf.__dask_keys__())
        # dsk should not be a dict unless fuse is explicitly True
        assert isinstance(dsk, dict) == bool(fuse)

        dd.assert_eq(ddf, df, check_index=False)