Ejemplo n.º 1
0
def test_concat_unions_categoricals():
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(_concat([i.y for i in frames]),
                           pd.concat([i.y for i in frames2]))

    # Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames3]),
                          pd.concat([i for i in frames4]).index)

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[["x", "z"]] for i in frames3]),
        pd.concat([i[["x", "z"]] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.z for i in frames3]),
                           pd.concat([i.z for i in frames4]))

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.x for i in frames3]),
                           pd.concat([i.x for i in frames4]))

    # MultiIndex with Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames5]),
                          pd.concat([i for i in frames6]).index)

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
Ejemplo n.º 2
0
def test_get_dummies(data):
    exp = pd.get_dummies(data)

    ddata = dd.from_pandas(data, 2)
    res = dd.get_dummies(ddata)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)
Ejemplo n.º 3
0
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    if check_divisions:
        assert_divisions(a, scheduler=scheduler)
        assert_divisions(b, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            at = type(np.asarray(a.divisions).tolist()[0])  # numpy to python
            bt = type(np.asarray(b.divisions).tolist()[0])  # scalar conversion
            assert at == bt, (at, bt)
    assert_sane_keynames(a)
    assert_sane_keynames(b)
    a = _check_dask(a,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    b = _check_dask(b,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()
    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(a,
                              b,
                              check_names=check_names,
                              check_dtype=check_dtype,
                              **kwargs)
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(a,
                               b,
                               check_names=check_names,
                               check_dtype=check_dtype,
                               **kwargs)
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
    else:
        if a == b:
            return True
        else:
            if np.isnan(a):
                assert np.isnan(b)
            else:
                assert np.allclose(a, b)
    return True
Ejemplo n.º 4
0
def test_categorical_known():
    text1 = normalize_text("""
    A,B
    a,a
    b,b
    a,a
    """)
    text2 = normalize_text("""
    A,B
    a,a
    b,b
    c,c
    """)
    dtype = pd.api.types.CategoricalDtype(["a", "b", "c"])
    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        result = dd.read_csv("foo.*.csv",
                             dtype={
                                 "A": "category",
                                 "B": "category"
                             })
        assert result.A.cat.known is False
        assert result.B.cat.known is False
        expected = pd.DataFrame(
            {
                "A":
                pd.Categorical(["a", "b", "a", "a", "b", "c"],
                               categories=dtype.categories),
                "B":
                pd.Categorical(["a", "b", "a", "a", "b", "c"],
                               categories=dtype.categories),
            },
            index=[0, 1, 2, 0, 1, 2],
        )
        assert_eq(result, expected)

        # Specify a dtype
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        tm.assert_index_equal(result.A.cat.categories, dtype.categories)
        assert result.A.cat.ordered is False
        assert_eq(result, expected)

        # ordered
        dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=True)
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        expected["A"] = expected["A"].cat.as_ordered()
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        assert result.A.cat.ordered is True

        assert_eq(result, expected)

        # Specify "unknown" categories
        result = dd.read_csv("foo.*.csv",
                             dtype=pd.api.types.CategoricalDtype())
        assert result.A.cat.known is False

        result = dd.read_csv("foo.*.csv", dtype="category")
        assert result.A.cat.known is False
Ejemplo n.º 5
0
def test_from_dask_array_compat_numpy_array_1d():

    x = da.ones(10, chunks=3)
    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name is None

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d1, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name is None

    d1 = dd.from_dask_array(x, columns="name")  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name == "name"

    d2 = dd.from_array(x.compute(), columns="name")  # numpy
    assert isinstance(d1, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name == "name"

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=["name"])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(["name"]))

    d2 = dd.from_array(x.compute(), columns=["name"])  # numpy
    assert isinstance(d1, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(["name"]))
Ejemplo n.º 6
0
    def test_unknown_categories(self, series):
        a, da = series
        assert da.cat.known
        da = da.cat.as_unknown()
        assert not da.cat.known

        with pytest.raises(NotImplementedError,
                           match="with unknown categories"):
            da.cat.categories
        with pytest.raises(NotImplementedError,
                           match="with unknown categories"):
            da.cat.codes
        # Also AttributeError so glob searching in IPython such as `da.cat.*?` works
        with pytest.raises(AttributeError, match="with unknown categories"):
            da.cat.categories
        with pytest.raises(AttributeError, match="with unknown categories"):
            da.cat.codes

        db = da.cat.set_categories(["a", "b", "c"])
        assert db.cat.known
        tm.assert_index_equal(db.cat.categories, get_cat(a).categories)
        assert_array_index_eq(db.cat.codes, get_cat(a).codes)

        db = da.cat.as_known()
        assert db.cat.known
        res = db.compute()
        tm.assert_index_equal(db.cat.categories, get_cat(res).categories)
        assert_array_index_eq(db.cat.codes, get_cat(res).codes)
Ejemplo n.º 7
0
def test_from_dask_array_struct_dtype():
    x = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    y = da.from_array(x, chunks=(1, ))
    df = dd.from_dask_array(y)
    tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
    assert_eq(df, pd.DataFrame(x))

    assert_eq(dd.from_dask_array(y, columns=["b", "a"]),
              pd.DataFrame(x, columns=["b", "a"]))
Ejemplo n.º 8
0
def test_from_dask_array_compat_numpy_array():
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)  # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d1, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=["a"])  # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=["a"])  # numpy

    d1 = dd.from_dask_array(x, columns=["a", "b", "c"])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(["a", "b", "c"]))

    d2 = dd.from_array(x.compute(), columns=["a", "b", "c"])  # numpy
    assert isinstance(d1, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(["a", "b", "c"]))
Ejemplo n.º 9
0
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype="category")
    exp = pd.get_dummies(s, prefix="X", prefix_sep="-")

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix="X", prefix_sep="-")
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(["X-1", "X-2", "X-3", "X-4"]))

    exp = pd.get_dummies(s, drop_first=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype="category")
    exp = pd.get_dummies(s)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))
Ejemplo n.º 10
0
def test_DataFrame_from_dask_array():
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ["a", "b", "c"])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(["a", "b", "c"]))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler="sync").values == x.compute(scheduler="sync")).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=["a", "b", "c"])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions
Ejemplo n.º 11
0
def test_from_array():
    x = np.arange(10 * 3).reshape(10, 3)
    d = dd.from_array(x, chunksize=4)
    assert isinstance(d, dd.DataFrame)
    tm.assert_index_equal(d.columns, pd.Index([0, 1, 2]))
    assert d.divisions == (0, 4, 8, 9)
    assert (d.compute().values == x).all()

    d = dd.from_array(x, chunksize=4, columns=list("abc"))
    assert isinstance(d, dd.DataFrame)
    tm.assert_index_equal(d.columns, pd.Index(["a", "b", "c"]))
    assert d.divisions == (0, 4, 8, 9)
    assert (d.compute().values == x).all()

    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
Ejemplo n.º 12
0
def test_meta_from_recarray():
    x = np.array([(i, i * 10) for i in range(10)],
                 dtype=[("a", np.float64), ("b", np.int64)])
    res = _meta_from_array(x)
    assert isinstance(res, pd.DataFrame)
    assert res["a"].dtype == np.float64
    assert res["b"].dtype == np.int64
    tm.assert_index_equal(res.columns, pd.Index(["a", "b"]))

    res = _meta_from_array(x, columns=["b", "a"])
    assert isinstance(res, pd.DataFrame)
    assert res["a"].dtype == np.float64
    assert res["b"].dtype == np.int64
    tm.assert_index_equal(res.columns, pd.Index(["b", "a"]))

    with pytest.raises(ValueError):
        _meta_from_array(x, columns=["a", "b", "c"])
Ejemplo n.º 13
0
def test_meta_from_1darray():
    x = np.array([1.0, 2.0, 3.0], dtype=np.float64)
    res = _meta_from_array(x)
    assert isinstance(res, pd.Series)
    assert res.dtype == np.float64

    x = np.array([1, 2, 3], dtype=np.object_)
    res = _meta_from_array(x, columns="x")
    assert isinstance(res, pd.Series)
    assert res.name == "x"
    assert res.dtype == np.object_

    x = np.array([1, 2, 3], dtype=np.object_)
    res = _meta_from_array(x, columns=["x"])
    assert isinstance(res, pd.DataFrame)
    assert res["x"].dtype == np.object_
    tm.assert_index_equal(res.columns, pd.Index(["x"]))

    with pytest.raises(ValueError):
        _meta_from_array(x, columns=["a", "b"])
Ejemplo n.º 14
0
    def test_unknown_categories(self, series):
        a, da = series
        assert da.cat.known
        da = da.cat.as_unknown()
        assert not da.cat.known

        with pytest.raises(NotImplementedError):
            da.cat.categories
        with pytest.raises(NotImplementedError):
            da.cat.codes

        db = da.cat.set_categories(["a", "b", "c"])
        assert db.cat.known
        tm.assert_index_equal(db.cat.categories, get_cat(a).categories)
        assert_array_index_eq(db.cat.codes, get_cat(a).codes)

        db = da.cat.as_known()
        assert db.cat.known
        res = db.compute()
        tm.assert_index_equal(db.cat.categories, get_cat(res).categories)
        assert_array_index_eq(db.cat.codes, get_cat(res).codes)
Ejemplo n.º 15
0
def test_meta_from_array():
    x = np.array([[1, 2], [3, 4]], dtype=np.int64)
    res = _meta_from_array(x)
    assert isinstance(res, pd.DataFrame)
    assert res[0].dtype == np.int64
    assert res[1].dtype == np.int64
    tm.assert_index_equal(res.columns, pd.Index([0, 1]))

    x = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float64)
    res = _meta_from_array(x, columns=["a", "b"])
    assert isinstance(res, pd.DataFrame)
    assert res["a"].dtype == np.float64
    assert res["b"].dtype == np.float64
    tm.assert_index_equal(res.columns, pd.Index(["a", "b"]))

    with pytest.raises(ValueError):
        _meta_from_array(x, columns=["a", "b", "c"])

    np.random.seed(42)
    x = np.random.rand(201, 2)
    x = dd.from_array(x, chunksize=50, columns=["a", "b"])
    assert len(x.divisions) == 6  # Should be 5 partitions and the end
Ejemplo n.º 16
0
def test_get_dummies_object():
    df = pd.DataFrame({
        "a": pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        "b": list("abcdabcd"),
        "c": pd.Categorical(list("abcdabcd")),
    })
    ddf = dd.from_pandas(df, 2)

    # Explicitly exclude object columns
    exp = pd.get_dummies(df, columns=["a", "c"])
    res = dd.get_dummies(ddf, columns=["a", "c"])
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.b)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["b"])
Ejemplo n.º 17
0
def test_make_timeseries():
    df = dd.demo.make_timeseries("2000",
                                 "2015", {
                                     "A": float,
                                     "B": int,
                                     "C": str
                                 },
                                 freq="2D",
                                 partition_freq="6M")

    assert df.divisions[0] == pd.Timestamp("2000-01-31", freq="6M")
    assert df.divisions[-1] == pd.Timestamp("2014-07-31", freq="6M")
    tm.assert_index_equal(df.columns, pd.Index(["A", "B", "C"]))
    assert df["A"].head().dtype == float
    assert df["B"].head().dtype == int
    assert df["C"].head().dtype == object
    assert df.index.name == "timestamp"
    assert df.head().index.name == df.index.name
    assert df.divisions == tuple(
        pd.date_range(start="2000", end="2015", freq="6M"))

    tm.assert_frame_equal(df.head(), df.head())

    a = dd.demo.make_timeseries(
        "2000",
        "2015",
        {
            "A": float,
            "B": int,
            "C": str
        },
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    b = dd.demo.make_timeseries(
        "2000",
        "2015",
        {
            "A": float,
            "B": int,
            "C": str
        },
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    c = dd.demo.make_timeseries(
        "2000",
        "2015",
        {
            "A": float,
            "B": int,
            "C": str
        },
        freq="2D",
        partition_freq="6M",
        seed=456,
    )
    d = dd.demo.make_timeseries(
        "2000",
        "2015",
        {
            "A": float,
            "B": int,
            "C": str
        },
        freq="2D",
        partition_freq="3M",
        seed=123,
    )
    e = dd.demo.make_timeseries(
        "2000",
        "2015",
        {
            "A": float,
            "B": int,
            "C": str
        },
        freq="1D",
        partition_freq="6M",
        seed=123,
    )
    tm.assert_frame_equal(a.head(), b.head())
    assert not (a.head(10) == c.head(10)).all().all()
    assert a._name == b._name
    assert a._name != c._name
    assert a._name != d._name
    assert a._name != e._name
Ejemplo n.º 18
0
def _check_dask(dsk,
                check_names=True,
                check_dtypes=True,
                result=None,
                scheduler=None):
    import dask.dataframe as dd

    if hasattr(dsk, "__dask_graph__"):
        graph = dsk.__dask_graph__()
        if hasattr(graph, "validate"):
            graph.validate()
        if result is None:
            result = dsk.compute(scheduler=scheduler)
        if isinstance(dsk, dd.Index):
            assert "Index" in type(result).__name__, type(result)
            # assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                assert dsk.name == result.name
                assert dsk._meta.name == result.name
                if isinstance(result, pd.MultiIndex):
                    assert result.names == dsk._meta.names
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
        elif isinstance(dsk, dd.Series):
            assert "Series" in type(result).__name__, type(result)
            assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                assert dsk.name == result.name, (dsk.name, result.name)
                assert dsk._meta.name == result.name
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
            _check_dask(
                dsk.index,
                check_names=check_names,
                check_dtypes=check_dtypes,
                result=result.index,
            )
        elif isinstance(dsk, dd.DataFrame):
            assert "DataFrame" in type(result).__name__, type(result)
            assert isinstance(dsk.columns, pd.Index), type(dsk.columns)
            assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                tm.assert_index_equal(dsk.columns, result.columns)
                tm.assert_index_equal(dsk._meta.columns, result.columns)
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
            _check_dask(
                dsk.index,
                check_names=check_names,
                check_dtypes=check_dtypes,
                result=result.index,
            )
        elif isinstance(dsk, dd.core.Scalar):
            assert np.isscalar(result) or isinstance(
                result, (pd.Timestamp, pd.Timedelta))
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
        else:
            msg = f"Unsupported dask instance {type(dsk)} found"
            raise AssertionError(msg)
        return result
    return dsk