def test_unknown_categoricals():
    """Compare dask and pandas results on category columns declared without categories.

    A dask DataFrame is assembled by hand from the module-level ``frames``,
    with a meta that declares ``w`` and ``y`` as plain ``"category"``.  Every
    aggregation below must agree with the same operation on the computed
    pandas frame.
    """
    graph = {("unknown", idx): part for idx, part in enumerate(frames)}
    meta = make_meta_util(
        {"v": "object", "w": "category", "x": "i8", "y": "category", "z": "f8"},
        parent_meta=frames[0],
    )
    # NOTE(review): fourth positional argument is presumably the divisions
    # tuple (all unknown here) -- confirm against the dd.DataFrame signature.
    boundaries = [None] * 4
    ddf = dd.DataFrame(graph, "unknown", meta, boundaries)

    # Materialise once; this is the pandas reference for every check below.
    expected = ddf.compute()

    assert_eq(ddf.w.value_counts(), expected.w.value_counts())
    assert_eq(ddf.w.nunique(), expected.w.nunique())

    assert_eq(ddf.groupby(ddf.w).sum(), expected.groupby(expected.w).sum())
    assert_eq(
        ddf.groupby(ddf.w).y.nunique(),
        expected.groupby(expected.w).y.nunique(),
    )

    assert_eq(
        ddf.y.groupby(ddf.w).count(),
        expected.y.groupby(expected.w).count(),
    )
def test_pivot_table_errors():
    """Check the ValueError messages ``dd.pivot_table`` emits for invalid arguments.

    Covers list-valued ``index``/``columns``, nested-list ``values``,
    unsupported ``aggfunc`` values, a ``columns`` field whose meta is declared
    as bare ``"category"``, and a ``columns`` field that is not categorical.
    """

    def expect_value_error(frame, fragment, **pivot_kwargs):
        # Run the pivot and require the failure text to contain `fragment`.
        with pytest.raises(ValueError) as excinfo:
            dd.pivot_table(frame, **pivot_kwargs)
        assert fragment in str(excinfo.value)

    letters = list("abc")
    categorical_pdf = pd.DataFrame(
        {
            "A": np.random.choice(letters, size=10),
            "B": np.random.randn(10),
            "C": pd.Categorical(np.random.choice(letters, size=10)),
        }
    )
    ddf = dd.from_pandas(categorical_pdf, 2)

    # Each selector must be a plain column name (or a flat list for values).
    malformed_selectors = [
        (
            "'index' must be the name of an existing column",
            {"index": ["A"], "columns": "C", "values": "B"},
        ),
        (
            "'columns' must be the name of an existing column",
            {"index": "A", "columns": ["C"], "values": "B"},
        ),
        (
            "'values' must refer to an existing column or columns",
            {"index": "A", "columns": "C", "values": [["B"]]},
        ),
    ]
    for fragment, pivot_kwargs in malformed_selectors:
        expect_value_error(ddf, fragment, **pivot_kwargs)

    # Both a list aggfunc and an unrecognised name share one message.
    aggfunc_fragment = "aggfunc must be either 'mean', 'sum' or 'count'"
    for bad_aggfunc in (["sum"], "xx"):
        expect_value_error(
            ddf,
            aggfunc_fragment,
            index="A",
            columns="C",
            values="B",
            aggfunc=bad_aggfunc,
        )

    # unknown categories
    ddf._meta = make_meta_util(
        {"A": object, "B": float, "C": "category"},
        parent_meta=pd.DataFrame(),
    )
    expect_value_error(
        ddf,
        "'columns' must have known categories",
        index="A",
        columns="C",
        values=["B"],
    )

    # Same layout, but "C" is left as a plain (non-categorical) column.
    plain_pdf = pd.DataFrame(
        {
            "A": np.random.choice(letters, size=10),
            "B": np.random.randn(10),
            "C": np.random.choice(letters, size=10),
        }
    )
    ddf = dd.from_pandas(plain_pdf, 2)
    expect_value_error(
        ddf,
        "'columns' must be category dtype",
        index="A",
        columns="C",
        values="B",
    )
def test_get_dummies_errors():
    """Check that ``dd.get_dummies`` raises ``NotImplementedError`` on unsupported input.

    Two situations are exercised:

    * a non-categorical integer series;
    * a frame (and one of its columns) whose meta declares the columns as
      bare ``"category"`` via ``make_meta_util``.
    """
    # not Categorical
    # Setup stays outside the ``raises`` block so that only ``get_dummies``
    # itself can satisfy the expectation.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({"x": list("abcbc"), "y": list("bcbcb")})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta_util(
        {"x": "category", "y": "category"}, parent_meta=pd.DataFrame()
    )

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["x", "y"])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
("x", 0): pd.DataFrame({ "a": [1, 2, 3], "b": [1, 4, 7] }, index=[0, 1, 3]), ("x", 1): pd.DataFrame({ "a": [4, 5, 6], "b": [2, 5, 8] }, index=[5, 6, 8]), ("x", 2): pd.DataFrame({ "a": [7, 8, 9], "b": [3, 6, 9] }, index=[9, 9, 9]), } meta = make_meta_util({ "a": "i8", "b": "i8" }, index=pd.Index([], "i8"), parent_meta=pd.DataFrame()) d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9]) full = d.compute() CHECK_FREQ = {} if dd._compat.PANDAS_GT_110: CHECK_FREQ["check_freq"] = False shuffle_func = shuffle # conflicts with keyword argument @pytest.mark.parametrize("shuffle", ["disk", "tasks"]) def test_shuffle(shuffle): s = shuffle_func(d, d.b, shuffle=shuffle) assert isinstance(s, dd.DataFrame)
def test_make_meta():
    """Exercise ``make_meta_util`` over each kind of input used in this test.

    Inputs covered, in order: pandas DataFrame / Series / Index, a dask
    DataFrame, a dict spec, an iterable of ``(name, dtype)`` pairs, a single
    ``(name, dtype)`` tuple, specs with an explicit index, ``"category"``
    specs, NumPy and Python scalars, a ``pd.Timestamp``, dtype expressions,
    and finally ``None``, which must raise ``TypeError``.
    """
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]},
        index=[10, 20, 30],
    )

    # Pandas dataframe
    meta = make_meta_util(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta_util(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta_util(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta_util(ddf) is ddf._meta

    # Dict
    meta = make_meta_util(
        {"a": "i8", "b": "O", "c": "f8"}, parent_meta=pd.DataFrame()
    )
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta_util(
        [("a", "i8"), ("c", "f8"), ("b", "O")], parent_meta=pd.DataFrame()
    )
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    # Column order follows the spec, so align df's dtypes before comparing.
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta_util(("a", "i8"), parent_meta=pd.DataFrame())
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    meta = make_meta_util(
        {"a": "i8", "b": "i4"},
        index=pd.Int64Index([1, 2], name="foo"),
        parent_meta=pd.DataFrame(),
    )
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0
    meta = make_meta_util(
        ("a", "i8"),
        index=pd.Int64Index([1, 2], name="foo"),
        parent_meta=pd.DataFrame(),
    )
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta_util({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta_util(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta_util(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta_util(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta_util(x, parent_meta=df)
    assert meta is x

    # Dtype expressions
    meta = make_meta_util("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta_util(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta_util(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)

    # None is not a valid meta specification.  The context-manager form fails
    # the test by itself when nothing is raised, so no outer ``assert`` is
    # needed.
    with pytest.raises(TypeError):
        make_meta_util(None)