def test_parquet(s3, engine):
    dd = pytest.importorskip("dask.dataframe")
    from dask.dataframe._compat import tm

    lib = pytest.importorskip(engine)
    if engine == "pyarrow" and LooseVersion(lib.__version__) < "0.13.1":
        pytest.skip("pyarrow < 0.13.1 not supported for parquet")
    import numpy as np
    import pandas as pd

    url = "s3://%s/test.parquet" % test_bucket_name

    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice(["hello", "you", "people"], size=1000).astype("O"),
        },
        index=pd.Index(np.arange(1000), name="foo"),
    )
    df = dd.from_pandas(data, chunksize=500)
    df.to_parquet(url, engine=engine)

    files = [f.split("/")[-1] for f in s3.ls(url)]
    assert "_common_metadata" in files
    assert "part.0.parquet" in files

    df2 = dd.read_parquet(url, index="foo", engine=engine)
    assert len(df2.divisions) > 1

    tm.assert_frame_equal(data, df2.compute())
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    if check_divisions:
        assert_divisions(a, scheduler=scheduler)
        assert_divisions(b, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            at = type(np.asarray(a.divisions).tolist()[0])  # numpy to python
            bt = type(np.asarray(b.divisions).tolist()[0])  # scalar conversion
            assert at == bt, (at, bt)
    assert_sane_keynames(a)
    assert_sane_keynames(b)
    a = _check_dask(
        a, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler
    )
    b = _check_dask(
        b, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler
    )
    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()
    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(
            a, b, check_names=check_names, check_dtype=check_dtype, **kwargs
        )
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(
            a, b, check_names=check_names, check_dtype=check_dtype, **kwargs
        )
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
    else:
        if a == b:
            return True
        # Fall back to NaN-aware numeric comparison for scalars.
        if np.isnan(a):
            assert np.isnan(b)
        else:
            assert np.allclose(a, b)
    return True
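# A minimal usage sketch for assert_eq (illustrative only; it assumes the
# module-level imports used throughout this file, pandas as pd and
# dask.dataframe as dd, and that _check_dask/assert_divisions behave as in
# dask's test utilities). It exercises the three comparison paths above:
# DataFrame vs. DataFrame, Series vs. Series, and the scalar fallback that
# goes through np.allclose.
def example_assert_eq_usage():
    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4.0, 3.0, 2.0, 1.0]})
    ddf = dd.from_pandas(pdf, npartitions=2)
    assert_eq(ddf, pdf)           # DataFrame path: divisions, names, dtypes, values
    assert_eq(ddf.a, pdf.a)       # Series path
    assert_eq(ddf.b.sum(), 10.0)  # scalar path (the dask scalar is computed first)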
def test_getitem():
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)

    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df["X"])
    pytest.raises(KeyError, lambda: df[["A", "X"]])
    pytest.raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])
def test_hdf_globbing():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, "one.h5"), "/foo/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/bar/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/foo/data", format="table")

        with dask.config.set(scheduler="sync"):
            res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(
                os.path.join(tdir, "one.h5"), "/*/data", chunksize=2, start=1, stop=3
            )
            expected = pd.read_hdf(
                os.path.join(tdir, "one.h5"), "/foo/data", start=1, stop=3
            )
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, "two.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/foo/data", chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
def test_meta_duplicated():
    df = pd.DataFrame(columns=["A", "A", "B"])
    res = meta_nonempty(df)

    exp = pd.DataFrame(
        [["foo", "foo", "foo"], ["foo", "foo", "foo"]],
        index=["a", "b"],
        columns=["A", "A", "B"],
    )
    tm.assert_frame_equal(res, exp)
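# Hedged illustration of what test_meta_duplicated relies on: meta_nonempty
# takes an empty "meta" frame and returns a small dummy-valued frame with the
# same schema (object columns are filled with the placeholder string "foo",
# and the result has two rows), so schema-sensitive code paths can be
# exercised without real data. This is exactly what `exp` above encodes.
# Sketch only, under that assumption about the meta_nonempty helper.
def example_meta_nonempty():
    meta = pd.DataFrame({"x": pd.Series(dtype="i8"), "y": pd.Series(dtype="O")})
    dummy = meta_nonempty(meta)
    assert len(dummy) == 2
    assert (dummy.dtypes == meta.dtypes).all()  # schema is preserved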
def test_to_csv_gzip():
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile("csv") as fn:
            a.to_csv(fn, compression="gzip")
            result = pd.read_csv(fn, index_col=0, compression="gzip")
            tm.assert_frame_equal(result, df)
def test_parquet(s3, engine, s3so, metadata_file):
    import s3fs

    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
    np = pytest.importorskip("numpy")
    from dask.dataframe._compat import tm

    lib = pytest.importorskip(engine)
    if engine == "pyarrow" and LooseVersion(lib.__version__) < "0.13.1":
        pytest.skip("pyarrow < 0.13.1 not supported for parquet")
    if (
        engine == "pyarrow"
        and LooseVersion(lib.__version__) >= "2.0"
        and LooseVersion(lib.__version__) < "3.0"
        and LooseVersion(s3fs.__version__) > "0.5.0"
    ):
        pytest.skip("#7056 - new s3fs not supported before pyarrow 3.0")

    url = "s3://%s/test.parquet" % test_bucket_name

    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice(["hello", "you", "people"], size=1000).astype("O"),
        },
        index=pd.Index(np.arange(1000), name="foo"),
    )
    df = dd.from_pandas(data, chunksize=500)
    df.to_parquet(
        url, engine=engine, storage_options=s3so, write_metadata_file=metadata_file
    )

    files = [f.split("/")[-1] for f in s3.ls(url)]
    if metadata_file:
        assert "_common_metadata" in files
        assert "_metadata" in files
    assert "part.0.parquet" in files

    df2 = dd.read_parquet(
        url, index="foo", gather_statistics=True, engine=engine, storage_options=s3so
    )
    assert len(df2.divisions) > 1

    tm.assert_frame_equal(data, df2.compute())
def test_to_hdf_kwargs():
    pytest.importorskip("tables")
    df = pd.DataFrame({"A": ["a", "aaaa"]})
    ddf = dd.from_pandas(df, npartitions=2)
    with tmpfile("h5") as fn:
        ddf.to_hdf(fn, "foo4", format="table", min_itemsize=4)
        df2 = pd.read_hdf(fn, "foo4")
        tm.assert_frame_equal(df, df2)

    # test shorthand 't' for table
    with tmpfile("h5") as fn:
        ddf.to_hdf(fn, "foo4", format="t", min_itemsize=4)
        df2 = pd.read_hdf(fn, "foo4")
        tm.assert_frame_equal(df, df2)
def test_gh_2730():
    large = pd.DataFrame({"KEY": np.arange(0, 50000)})
    small = pd.DataFrame({"KEY": np.arange(25, 500)})

    dd_left = dd.from_pandas(small, npartitions=3)
    dd_right = dd.from_pandas(large, npartitions=257)

    with dask.config.set(shuffle="tasks", scheduler="sync"):
        dd_merged = dd_left.merge(dd_right, how="inner", on="KEY")
        result = dd_merged.compute()

    expected = large.merge(small, how="inner", on="KEY")

    # The task-based shuffle does not preserve row order, so sort before comparing.
    tm.assert_frame_equal(result.sort_values("KEY").reset_index(drop=True), expected)
def test_hdf_file_list():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, "one.h5"), "dataframe", format="table")
        df.iloc[2:].to_hdf(os.path.join(tdir, "two.h5"), "dataframe", format="table")

        with dask.config.set(scheduler="sync"):
            input_files = [os.path.join(tdir, "one.h5"), os.path.join(tdir, "two.h5")]
            res = dd.read_hdf(input_files, "dataframe")
            tm.assert_frame_equal(res.compute(), df)
def test_concat_unions_categoricals():
    # `frames` .. `frames6` are module-level fixtures defined elsewhere in this
    # file (not shown in this excerpt): lists of frames whose categorical
    # columns/indexes carry differing categories, paired with the pandas
    # equivalents that pd.concat should produce.

    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(
        _concat([i.y for i in frames]), pd.concat([i.y for i in frames2])
    )

    # Categorical Index
    tm.assert_index_equal(
        _concat([i.index for i in frames3]), pd.concat([i for i in frames4]).index
    )

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[["x", "z"]] for i in frames3]),
        pd.concat([i[["x", "z"]] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(
        _concat([i.z for i in frames3]), pd.concat([i.z for i in frames4])
    )

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(
        _concat([i.x for i in frames3]), pd.concat([i.x for i in frames4])
    )

    # MultiIndex with Categorical Index
    tm.assert_index_equal(
        _concat([i.index for i in frames5]), pd.concat([i for i in frames6]).index
    )

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
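# Hedged sketch of the core behavior exercised above: dask's _concat is
# expected to union unequal categoricals rather than falling back to object
# dtype. Illustrative only, under that assumption about the private _concat
# helper imported at module level.
def example_concat_unions_categoricals():
    s1 = pd.Series(pd.Categorical(["a", "b"], categories=["a", "b"]))
    s2 = pd.Series(pd.Categorical(["b", "c"], categories=["b", "c"]))
    out = _concat([s1, s2])
    assert out.dtype == "category"
    assert set(out.cat.categories) == {"a", "b", "c"}  # categories are unioned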
def test_to_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)

    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    with tmpfile("h5") as fn:
        a.x.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile("h5") as fn:
        r = a.to_hdf(fn, "/data", compute=False)
        r.compute()
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])
def test_from_pandas_dataframe():
    a = list("aaaaaaabbbbbbbbccccccc")
    df = pd.DataFrame(
        dict(a=a, b=np.random.randn(len(a))),
        index=pd.date_range(start="20120101", periods=len(a)),
    )
    ddf = dd.from_pandas(df, 3)
    assert len(ddf.dask) == 3
    assert len(ddf.divisions) == len(ddf.dask) + 1
    assert isinstance(ddf.divisions[0], type(df.index[0]))
    tm.assert_frame_equal(df, ddf.compute())

    ddf = dd.from_pandas(df, chunksize=8)
    msg = "Exactly one of npartitions and chunksize must be specified."
    with pytest.raises(ValueError) as err:
        dd.from_pandas(df, npartitions=2, chunksize=2)
    assert msg in str(err.value)
    with pytest.raises((ValueError, AssertionError)) as err:
        dd.from_pandas(df)
    assert msg in str(err.value)

    assert len(ddf.dask) == 3
    assert len(ddf.divisions) == len(ddf.dask) + 1
    assert isinstance(ddf.divisions[0], type(df.index[0]))
    tm.assert_frame_equal(df, ddf.compute())
def test_make_timeseries():
    df = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str}, freq="2D", partition_freq="6M"
    )

    assert df.divisions[0] == pd.Timestamp("2000-01-31", freq="6M")
    assert df.divisions[-1] == pd.Timestamp("2014-07-31", freq="6M")
    tm.assert_index_equal(df.columns, pd.Index(["A", "B", "C"]))
    assert df["A"].head().dtype == float
    assert df["B"].head().dtype == int
    assert df["C"].head().dtype == object
    assert df.index.name == "timestamp"
    assert df.head().index.name == df.index.name
    assert df.divisions == tuple(pd.date_range(start="2000", end="2015", freq="6M"))

    tm.assert_frame_equal(df.head(), df.head())

    a = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    b = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    c = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=456,
    )
    d = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="3M",
        seed=123,
    )
    e = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="1D",
        partition_freq="6M",
        seed=123,
    )
    # Same seed and inputs give identical data and graph names; changing the
    # seed, partition_freq, or freq must change both.
    tm.assert_frame_equal(a.head(), b.head())
    assert not (a.head(10) == c.head(10)).all().all()
    assert a._name == b._name
    assert a._name != c._name
    assert a._name != d._name
    assert a._name != e._name
def test_to_hdf_multiple_files():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {"x": list("abcdefghijklmnop"), "y": list(range(1, 17))},
        index=[float(i) for i in range(1, 17)],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        b.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)

    # saving to multiple files where first file is longer
    # https://github.com/dask/dask/issues/8023
    with tmpdir() as dn:
        fn1 = os.path.join(dn, "data_1.h5")
        fn2 = os.path.join(dn, "data_2.h5")
        b.to_hdf(fn1, "/data")
        a.to_hdf(fn2, "/data")
        out = dd.read_hdf([fn1, fn2], "/data")
        assert_eq(pd.concat([df16, df]), out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, "data_a.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, "data_aa.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df, out)
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {"x": list("abcdefghijklmnop"), "y": list(range(1, 17))},
        index=[float(i) for i in range(1, 17)],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)

    # Test getitem optimization: a column selection made directly after
    # read_hdf should be pushed down into the IO layer, so only the selected
    # columns are read (optimize_dataframe_getitem and DataFrameIOLayer are
    # module-level imports, not shown in this excerpt).
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")[["x"]]
        dsk = optimize_dataframe_getitem(out.dask, keys=out.__dask_keys__())
        read = [key for key in dsk.layers if key.startswith("read-hdf")][0]
        subgraph = dsk.layers[read]
        assert isinstance(subgraph, DataFrameIOLayer)
        assert subgraph.columns == ["x"]