def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({'x': range(5), 'y': ['a', 'b', 'c', 'd', 'e']})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == 'records'
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == 'values':
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(str(tmpdir.join("fil*.json")),
                       orient=orient,
                       meta=meta,
                       lines=lines)
    assert_eq(res, sol)

    if orient == 'records':
        # Also check chunked version
        res = dd.read_json(str(tmpdir.join("fil*.json")),
                           orient=orient,
                           meta=meta,
                           lines=True,
                           blocksize=50)
        assert_eq(res, sol, check_index=False)
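The orient="values" branch above exists because that layout serialises only the cell values. A minimal pandas-only sketch (independent of the test, assuming a recent pandas) of why the columns have to be relabelled to [0, 1]:

import io
import pandas as pd

df = pd.DataFrame({"x": [1, 2], "y": ["a", "b"]})
payload = df.to_json(orient="values")
print(payload)  # [[1,"a"],[2,"b"]] -- the "x"/"y" column names are gone
roundtrip = pd.read_json(io.StringIO(payload), orient="values")
print(list(roundtrip.columns))  # [0, 1], hence sol.columns = meta.columns = [0, 1]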
Example #2
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == "records"
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == "values":
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(
        str(tmpdir.join("fil*.json")), orient=orient, meta=meta, lines=lines
    )
    assert_eq(res, sol)

    if orient == "records":
        # Also check chunked version
        res = dd.read_json(
            str(tmpdir.join("fil*.json")),
            orient=orient,
            meta=meta,
            lines=True,
            blocksize=50,
        )
        assert_eq(res, sol, check_index=False)
Example #3
def test_blockwise_dataframe_io(c, tmpdir, io, fuse):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    df = pd.DataFrame({"x": [1, 2, 3] * 5, "y": range(15)})
    ddf0 = dd.from_pandas(df, npartitions=3)

    if io.startswith("parquet"):
        if io == "parquet-pyarrow":
            pytest.importorskip("pyarrow.parquet")
            engine = "pyarrow"
        else:
            pytest.importorskip("fastparquet")
            engine = "fastparquet"
        ddf0.to_parquet(str(tmpdir), engine=engine)
        ddf = dd.read_parquet(str(tmpdir), engine=engine)
    elif io == "csv":
        ddf0.to_csv(str(tmpdir), index=False)
        ddf = dd.read_csv(os.path.join(str(tmpdir), "*"))
    elif io == "hdf":
        pytest.importorskip("tables")
        fn = str(tmpdir.join("h5"))
        ddf0.to_hdf(fn, "/data*")
        ddf = dd.read_hdf(fn, "/data*")

    df = df[["x"]] + 10
    ddf = ddf[["x"]] + 10
    with dask.config.set({"optimization.fuse.active": fuse}):
        ddf.compute()
        dsk = dask.dataframe.optimize(ddf.dask, ddf.__dask_keys__())
        # dsk should not be a dict unless fuse is explicitly True
        assert isinstance(dsk, dict) == bool(fuse)
        dd.assert_eq(ddf, df, check_index=False)
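For reference, a short sketch (paths and file names are illustrative) of the CSV branch: dask writes one file per partition into the target directory, which is why the read-back uses a "*" glob:

import os
import tempfile

import dask.dataframe as dd
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3] * 5, "y": range(15)})
ddf = dd.from_pandas(df, npartitions=3)

with tempfile.TemporaryDirectory() as tmp:
    ddf.to_csv(tmp, index=False)                 # one CSV file per partition
    print(sorted(os.listdir(tmp)))               # three partition files
    back = dd.read_csv(os.path.join(tmp, "*"))   # glob matches all of them
    print(len(back.compute()))                   # 15 rows round-tripped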
Example #4
def test_tokenize_numpy_memmap_offset(tmpdir):
    # Test two different memmaps into the same numpy file
    fn = str(tmpdir.join("demo_data"))

    with open(fn, "wb") as f:
        f.write(b"ashekwicht")

    with open(fn, "rb") as f:
        mmap1 = np.memmap(f, dtype=np.uint8, mode="r", offset=0, shape=5)
        mmap2 = np.memmap(f, dtype=np.uint8, mode="r", offset=5, shape=5)

        assert tokenize(mmap1) != tokenize(mmap2)
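The assertion relies on dask's tokenize hashing the underlying data, so memmaps over different regions of the file hash differently. A quick sketch of that behaviour with plain arrays (not part of the test):

import numpy as np
from dask.base import tokenize

a = np.arange(5, dtype=np.uint8)
b = np.arange(5, dtype=np.uint8)
c = a + 1

assert tokenize(a) == tokenize(b)  # equal contents -> equal tokens
assert tokenize(a) != tokenize(c)  # different contents -> different tokens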
Example #5
def test_read_json_multiple_files_with_path_column(blocksize, tmpdir):
    fil1 = str(tmpdir.join("fil1.json")).replace(os.sep, "/")
    fil2 = str(tmpdir.join("fil2.json")).replace(os.sep, "/")
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    orient = "records"
    lines = True
    df.to_json(fil1, orient=orient, lines=lines)
    df2.to_json(fil2, orient=orient, lines=lines)
    # read_json is expected to add a categorical "path" column naming each row's source file
    path_dtype = pd.CategoricalDtype((fil1, fil2))
    df["path"] = pd.Series((fil1,) * len(df), dtype=path_dtype)
    df2["path"] = pd.Series((fil2,) * len(df2), dtype=path_dtype)
    sol = pd.concat([df, df2])
    res = dd.read_json(
        str(tmpdir.join("fil*.json")),
        orient=orient,
        lines=lines,
        include_path_column=True,
        blocksize=blocksize,
    )
    assert_eq(res, sol, check_index=False)
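A condensed sketch (file names are illustrative) of what include_path_column=True adds to the result: a categorical "path" column naming the file each row came from, matching the path_dtype the test builds by hand:

import os
import tempfile

import dask.dataframe as dd
import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    pd.DataFrame({"x": [1, 2]}).to_json(os.path.join(tmp, "a.json"), orient="records", lines=True)
    pd.DataFrame({"x": [3, 4]}).to_json(os.path.join(tmp, "b.json"), orient="records", lines=True)
    ddf = dd.read_json(os.path.join(tmp, "*.json"), orient="records", lines=True,
                       include_path_column=True)
    out = ddf.compute()
    print(out["path"].dtype)             # CategoricalDtype over the matched paths
    print(sorted(out["path"].unique()))  # the two source file paths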
Example #6
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({'x': range(5), 'y': ['a', 'b', 'c', 'd', 'e']})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == 'records'
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == 'values':
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(str(tmpdir.join("fil*.json")), orient=orient,
                       meta=meta, lines=lines)
    assert_eq(res, sol)

    if orient == 'records':
        # Also check chunked version
        res = dd.read_json(str(tmpdir.join("fil*.json")), orient=orient,
                           meta=meta, lines=True, blocksize=50)
        assert_eq(res, sol, check_index=False)
Example #7
def test_tokenize_numpy_memmap_offset(tmpdir):
    # Test two different memmaps into the same numpy file
    fn = str(tmpdir.join("demo_data"))

    with open(fn, "wb") as f:
        f.write(b"ashekwicht")

    with open(fn, "rb") as f:
        mmap1 = np.memmap(f, dtype=np.uint8, mode="r", offset=0, shape=5)
        mmap2 = np.memmap(f, dtype=np.uint8, mode="r", offset=5, shape=5)

        assert tokenize(mmap1) != tokenize(mmap2)
        # sub-array views taken at different offsets should also produce distinct tokens
        sub1 = mmap1[1:-1]
        sub2 = mmap2[1:-1]
        assert tokenize(sub1) != tokenize(sub2)
Example #8
def test_blockwise_dataframe_io(c, tmpdir, io, fuse, from_futures):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    # TODO: this configuration is flaky on osx in CI
    # See https://github.com/dask/dask/issues/8816
    if from_futures and sys.platform == "darwin":
        pytest.xfail("This test sometimes fails on osx in CI")

    df = pd.DataFrame({"x": [1, 2, 3] * 5, "y": range(15)})

    if from_futures:
        parts = [df.iloc[:5], df.iloc[5:10], df.iloc[10:15]]
        futs = c.scatter(parts)
        ddf0 = dd.from_delayed(futs, meta=parts[0])
    else:
        ddf0 = dd.from_pandas(df, npartitions=3)

    if io.startswith("parquet"):
        if io == "parquet-pyarrow":
            pytest.importorskip("pyarrow.parquet")
            engine = "pyarrow"
        else:
            pytest.importorskip("fastparquet")
            engine = "fastparquet"
        ddf0.to_parquet(str(tmpdir), engine=engine)
        ddf = dd.read_parquet(str(tmpdir), engine=engine)
    elif io == "csv":
        ddf0.to_csv(str(tmpdir), index=False)
        ddf = dd.read_csv(os.path.join(str(tmpdir), "*"))
    elif io == "hdf":
        pytest.importorskip("tables")
        fn = str(tmpdir.join("h5"))
        ddf0.to_hdf(fn, "/data*")
        ddf = dd.read_hdf(fn, "/data*")

    df = df[["x"]] + 10
    ddf = ddf[["x"]] + 10
    with dask.config.set({"optimization.fuse.active": fuse}):
        ddf.compute()
        dsk = dask.dataframe.optimize(ddf.dask, ddf.__dask_keys__())
        # dsk should not be a dict unless fuse is explicitly True
        assert isinstance(dsk, dict) == bool(fuse)

        dd.assert_eq(ddf, df, check_index=False)
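As an aside, dask.config.set used as a context manager (as in the fuse check above) scopes the override to the with block and restores the previous value afterwards; a minimal sketch:

import dask

print(dask.config.get("optimization.fuse.active", default=None))
with dask.config.set({"optimization.fuse.active": True}):
    assert dask.config.get("optimization.fuse.active") is True  # override visible inside
print(dask.config.get("optimization.fuse.active", default=None))  # previous value restored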