def test_getitem():
    """Check that ``__getitem__`` on a dask DataFrame agrees with pandas.

    Covers selection by a single label, by a list of labels and by a boolean
    mask (including a mask that was repartitioned to different divisions),
    for both string and integer column labels.  After each selection the
    ``_meta`` of the result is compared with the same selection applied to
    the parent ``_meta``.  Missing labels are expected to raise on the dask
    collection exactly as they do on the pandas frame.
    """
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    # The mask does not have to share the frame's divisions.
    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # Missing labels: exercise the dask collection as well as the pandas
    # frame, otherwise only pandas' own behaviour would be verified here.
    for frame in (df, ddf):
        with pytest.raises(KeyError):
            frame["X"]
        with pytest.raises(KeyError):
            frame[["A", "X"]]
        with pytest.raises(AttributeError):
            frame.X

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    for frame in (df, ddf):
        with pytest.raises(KeyError):
            frame[8]
        with pytest.raises(KeyError):
            frame[[1, 8]]
def test_to_hdf():
    """Round-trip a dask DataFrame and Series through ``to_hdf``.

    Each case writes to a fresh temporary ``.h5`` file under the key
    ``/data``, reads it back with ``pd.read_hdf`` and compares the result
    with the source pandas object.  The test is skipped when the ``tables``
    package cannot be imported.
    """
    pytest.importorskip("tables")

    key = "/data"
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
        index=[1.0, 2.0, 3.0, 4.0],
    )

    # Frame built with npartitions=2.
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as path:
        a.to_hdf(path, key)
        loaded = pd.read_hdf(path, key)
        tm.assert_frame_equal(df, loaded[:])

    # A single column (Series) of that same frame.
    with tmpfile("h5") as path:
        a.x.to_hdf(path, key)
        loaded = pd.read_hdf(path, key)
        tm.assert_series_equal(df.x, loaded[:])

    # Frame built with npartitions=1.
    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as path:
        a.to_hdf(path, key)
        loaded = pd.read_hdf(path, key)
        tm.assert_frame_equal(df, loaded[:])

    # test compute = False
    with tmpfile("h5") as path:
        pending = a.to_hdf(path, key, compute=False)
        # The file is only read back after the returned object is computed.
        pending.compute()
        loaded = pd.read_hdf(path, key)
        tm.assert_frame_equal(df, loaded[:])
def test_object_missing_values():
    """Hashing the first three values must not depend on a trailing ``None``.

    The series is hashed as a whole and then sliced, and also sliced first
    and then hashed; both routes must give the same hashes.
    """
    # Check that the presence of missing values doesn't change how object dtype
    # is hashed.
    values = pd.Series(["a", "b", "c", None])

    hashed_then_sliced = hash_pandas_object(values).iloc[:3]
    sliced_then_hashed = hash_pandas_object(values.iloc[:3])

    tm.assert_series_equal(hashed_then_sliced, sliced_then_hashed)
def test_pivot_table_dtype():
    """Check the dtypes and values of ``dd.pivot_table`` with ``aggfunc="count"``.

    The result's ``dtypes`` must be ``float64`` for both categories of column
    ``B`` (indexed by a ``CategoricalIndex`` named ``"B"``), and the values
    must match the pandas pivot table cast to ``float64``.
    """
    pivot_kwargs = dict(index="A", columns="B", values="C", aggfunc="count")

    df = pd.DataFrame(
        {
            "A": list("AABB"),
            "B": pd.Categorical(list("ABAB")),
            "C": [1, 2, 3, 4],
        }
    )
    ddf = dd.from_pandas(df, 2)
    res = dd.pivot_table(ddf, **pivot_kwargs)

    # Expected dtypes: one float64 entry per category of "B".
    expected_dtypes = pd.Series(
        [np.float64] * 2,
        index=pd.CategoricalIndex(["A", "B"], name="B"),
    )
    tm.assert_series_equal(res.dtypes, expected_dtypes)

    # Expected values: the pandas result, cast to float64 before comparing.
    expected_values = pd.pivot_table(df, **pivot_kwargs).astype(np.float64)
    assert_eq(res, expected_values)
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    """Assert that ``a`` and ``b`` are equal; return ``True`` on success.

    Both operands are passed through ``assert_sane_keynames`` and
    ``_check_dask`` (presumably validating and materialising dask
    collections -- confirm against those helpers), converted with
    ``to_pandas()`` when they expose it, and then compared with the pandas
    testing assertion matching the type of ``a``.  Anything that is not a
    pandas DataFrame, Series or Index is compared with ``==``, falling back
    to a NaN check or ``np.allclose``.

    Parameters
    ----------
    a, b
        Objects to compare.
    check_names : bool
        Forwarded to ``_check_dask`` and to the frame/series assertions.
    check_dtype : bool
        Forwarded to ``_check_dask`` (as ``check_dtypes``), to the
        frame/series assertions, and used as ``exact`` for index comparison.
    check_divisions : bool
        When true, run ``assert_divisions`` on both operands and, if both
        have a ``divisions`` attribute, require the first division of each
        to have the same Python type.
    check_index : bool
        Passed to ``_maybe_sort``; when false, both indexes are dropped via
        ``reset_index(drop=True)`` before comparing.
    scheduler : str
        Forwarded to ``assert_divisions`` and ``_check_dask``.
    **kwargs
        Forwarded to the pandas testing assertion that is selected.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    if check_divisions:
        for operand in (a, b):
            assert_divisions(operand, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            # ``tolist`` converts numpy scalars to Python scalars so that
            # the two division types can be compared directly.
            a_type, b_type = (
                type(np.asarray(operand.divisions).tolist()[0])
                for operand in (a, b)
            )
            assert a_type == b_type, (a_type, b_type)

    for operand in (a, b):
        assert_sane_keynames(operand)

    a = _check_dask(
        a, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler
    )
    b = _check_dask(
        b, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler
    )

    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()

    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)

    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(
            a, b, check_names=check_names, check_dtype=check_dtype, **kwargs
        )
        return True
    if isinstance(a, pd.Series):
        tm.assert_series_equal(
            a, b, check_names=check_names, check_dtype=check_dtype, **kwargs
        )
        return True
    if isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
        return True

    # Remaining case: neither a frame, a series nor an index.
    if a == b:
        return True
    if np.isnan(a):
        # NaN never compares equal to itself, so it is matched explicitly.
        assert np.isnan(b)
    else:
        assert np.allclose(a, b)
    return True
def test_from_pandas_series():
    """Build a dask Series from a datetime-indexed pandas Series.

    The same checks run for a series created with a partition count of 3
    and for one created with ``chunksize=8``: the graph has three entries,
    ``divisions`` has one more element than the graph, the first division
    has the type of the first index value, and computing returns the
    original series.
    """
    n = 20
    s = pd.Series(
        np.random.randn(n),
        index=pd.date_range(start="20120101", periods=n),
    )

    def check(ds):
        # Shared assertions for both construction modes.
        graph_size = len(ds.dask)
        assert graph_size == 3
        assert len(ds.divisions) == graph_size + 1
        assert isinstance(ds.divisions[0], type(s.index[0]))
        tm.assert_series_equal(s, ds.compute())

    check(dd.from_pandas(s, 3))
    check(dd.from_pandas(s, chunksize=8))
def test_categorical_consistency():
    """Categoricals must hash like their underlying values.

    For string, integer and datetime data, the plain series, a categorical
    with categories in the original order and a categorical with the
    categories reversed must all produce the same hashes, with ``categorize``
    both on and off.
    """
    # Check that categoricals hash consistent with their values, not codes
    # This should work for categoricals of any dtype
    samples = [
        pd.Series(["a", "b", "c", "d"]),
        pd.Series([1000, 2000, 3000, 4000]),
        pd.Series(pd.date_range(0, periods=4)),
    ]
    for plain in samples:
        ordered_cat = plain.astype("category").cat.set_categories(plain)
        # Same values, but the categories (and hence the codes) are reversed.
        reversed_cat = ordered_cat.cat.set_categories(list(reversed(plain)))

        for categorize in (True, False):
            # These should all hash identically
            h_plain, h_ordered, h_reversed = (
                hash_pandas_object(series, categorize=categorize)
                for series in (plain, ordered_cat, reversed_cat)
            )
            tm.assert_series_equal(h_plain, h_ordered)
            tm.assert_series_equal(h_plain, h_reversed)
def test_concat_unions_categoricals():
    """Compare ``_concat`` with ``pd.concat`` on categorical inputs.

    ``frames`` through ``frames6`` are defined outside this function
    (presumably module-level fixtures pairing each ``_concat`` input with
    the pandas input that gives the expected result -- confirm at their
    definition).
    """
    # Categorical DataFrame, regular index
    result = _concat(frames)
    expected = pd.concat(frames2)
    tm.assert_frame_equal(result, expected)

    # Categorical Series, regular index
    result = _concat([frame.y for frame in frames])
    expected = pd.concat([frame.y for frame in frames2])
    tm.assert_series_equal(result, expected)

    # Categorical Index
    result = _concat([frame.index for frame in frames3])
    expected = pd.concat(list(frames4)).index
    tm.assert_index_equal(result, expected)

    # Categorical DataFrame, Categorical Index
    result = _concat(frames3)
    expected = pd.concat(frames4)
    tm.assert_frame_equal(result, expected)

    # Non-categorical DataFrame, Categorical Index
    columns = ["x", "z"]
    result = _concat([frame[columns] for frame in frames3])
    expected = pd.concat([frame[columns] for frame in frames4])
    tm.assert_frame_equal(result, expected)

    # Categorical Series, Categorical Index
    result = _concat([frame.z for frame in frames3])
    expected = pd.concat([frame.z for frame in frames4])
    tm.assert_series_equal(result, expected)

    # Non-categorical Series, Categorical Index
    result = _concat([frame.x for frame in frames3])
    expected = pd.concat([frame.x for frame in frames4])
    tm.assert_series_equal(result, expected)

    # MultiIndex with Categorical Index
    result = _concat([frame.index for frame in frames5])
    expected = pd.concat(list(frames6)).index
    tm.assert_index_equal(result, expected)

    # DataFrame, MultiIndex with CategoricalIndex
    result = _concat(frames5)
    expected = pd.concat(frames6)
    tm.assert_frame_equal(result, expected)