Example #1
def test_getitem():
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df["X"])
    pytest.raises(KeyError, lambda: df[["A", "X"]])
    pytest.raises(AttributeError, lambda: df.X)

    # non-string (integer) column labels
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])
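
A note on context: these snippets are lifted from Dask's DataFrame test suite, so the module-level imports are not shown. The preamble below is an assumed minimal setup that should cover Example #1; later examples additionally need helpers such as `tmpfile` and `hash_pandas_object`, which are imported where the sketches further down use them. Note that `tm` was historically `pandas.util.testing` and is `pandas.testing` in current releases.

import numpy as np
import pandas as pd
import pytest

import dask.dataframe as dd
from dask.dataframe.utils import assert_eq  # Dask's comparison helper (shown in full in Example #5)
import pandas.testing as tm                 # older code imports pandas.util.testing as tm
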
Example #2
def test_to_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
        index=[1.0, 2.0, 3.0, 4.0],
    )
    a = dd.from_pandas(df, 2)

    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    with tmpfile("h5") as fn:
        a.x.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    # test compute=False (the write is deferred until explicitly computed)
    with tmpfile("h5") as fn:
        r = a.to_hdf(fn, "/data", compute=False)
        r.compute()
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])
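
With `compute=False`, `to_hdf` returns a lazy Dask object and nothing is written until it is computed, so several outputs can be built up and executed in one go with `dask.compute`. Below is a minimal sketch of that pattern, assuming PyTables is installed and using Dask's `tmpfile` context manager; the HDF key names are illustrative.

import dask
import dask.dataframe as dd
import pandas as pd
from dask.utils import tmpfile

df = pd.DataFrame(
    {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
    index=[1.0, 2.0, 3.0, 4.0],
)
a = dd.from_pandas(df, npartitions=2)

with tmpfile("h5") as fn1, tmpfile("h5") as fn2:
    # Build both write tasks lazily; no file I/O happens yet.
    frame_write = a.to_hdf(fn1, "/data", compute=False)
    series_write = a.x.to_hdf(fn2, "/data", compute=False)
    # Run both writes together; the sync scheduler keeps the sketch single-threaded.
    dask.compute(frame_write, series_write, scheduler="sync")
    print(pd.read_hdf(fn1, "/data"))
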
Example #3
def test_object_missing_values():
    # Check that the presence of missing values doesn't change how object dtype
    # is hashed.
    s = pd.Series(["a", "b", "c", None])
    h1 = hash_pandas_object(s).iloc[:3]
    h2 = hash_pandas_object(s.iloc[:3])
    tm.assert_series_equal(h1, h2)
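
Here `hash_pandas_object` hashes rows element-wise, so trimming trailing missing values cannot change the hashes of the rows that remain. The same invariant can be checked against pandas' public helper, `pd.util.hash_pandas_object`; a small sketch:

import pandas as pd
import pandas.testing as tm

s = pd.Series(["a", "b", "c", None])

# Hashes are computed per element (plus the index), so the trailing None
# has no effect on the hashes of the first three rows.
h1 = pd.util.hash_pandas_object(s).iloc[:3]
h2 = pd.util.hash_pandas_object(s.iloc[:3])
tm.assert_series_equal(h1, h2)
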
Example #4
def test_pivot_table_dtype():

    df = pd.DataFrame({
        "A": list("AABB"),
        "B": pd.Categorical(list("ABAB")),
        "C": [1, 2, 3, 4]
    })
    ddf = dd.from_pandas(df, 2)
    res = dd.pivot_table(ddf,
                         index="A",
                         columns="B",
                         values="C",
                         aggfunc="count")

    exp_index = pd.CategoricalIndex(["A", "B"], name="B")
    exp = pd.Series([np.float64] * 2, index=exp_index)
    tm.assert_series_equal(res.dtypes, exp)

    exp = pd.pivot_table(df,
                         index="A",
                         columns="B",
                         values="C",
                         aggfunc="count").astype(np.float64)

    assert_eq(res, exp)
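
The dtype check exists because Dask reports float64 dtypes for the lazily-built pivot (counts could contain NaN for empty cells), whereas the eager pandas computation can come back as integer counts when no cell is empty; that is why the expected frame is cast with `.astype(np.float64)` before comparing. A pandas-only sketch of that expected side, using the same data as the test:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"A": list("AABB"), "B": pd.Categorical(list("ABAB")), "C": [1, 2, 3, 4]}
)

exp = pd.pivot_table(df, index="A", columns="B", values="C", aggfunc="count")
# The columns come back as a CategoricalIndex named "B"; the counts are
# typically int64 here, so cast to float64 to line up with Dask's dtypes.
exp = exp.astype(np.float64)
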
Example #5
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    if check_divisions:
        assert_divisions(a, scheduler=scheduler)
        assert_divisions(b, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            at = type(np.asarray(a.divisions).tolist()[0])  # numpy to python
            bt = type(np.asarray(b.divisions).tolist()[0])  # scalar conversion
            assert at == bt, (at, bt)
    assert_sane_keynames(a)
    assert_sane_keynames(b)
    a = _check_dask(a,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    b = _check_dask(b,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()
    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(a,
                              b,
                              check_names=check_names,
                              check_dtype=check_dtype,
                              **kwargs)
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(a,
                               b,
                               check_names=check_names,
                               check_dtype=check_dtype,
                               **kwargs)
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
    else:
        if a == b:
            return True
        else:
            if np.isnan(a):
                assert np.isnan(b)
            else:
                assert np.allclose(a, b)
    return True
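
`assert_eq` is the general-purpose comparison helper used throughout these examples: it sanity-checks divisions and key names, computes any Dask collections (with the synchronous scheduler by default), and then dispatches to the matching `tm.assert_*_equal` call, or to a plain scalar comparison. A minimal usage sketch:

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq

pdf = pd.DataFrame({"a": range(6), "b": list("xyzxyz")})
ddf = dd.from_pandas(pdf, npartitions=3)

# Dask vs. pandas works for frames, series, and computed scalars alike.
assert_eq(ddf, pdf)
assert_eq(ddf.a, pdf.a)
assert_eq(ddf.a.sum(), pdf.a.sum())
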
Example #6
def test_from_pandas_series():
    n = 20
    s = pd.Series(np.random.randn(n), index=pd.date_range(start="20120101", periods=n))
    ds = dd.from_pandas(s, 3)
    assert len(ds.dask) == 3
    assert len(ds.divisions) == len(ds.dask) + 1
    assert isinstance(ds.divisions[0], type(s.index[0]))
    tm.assert_series_equal(s, ds.compute())

    ds = dd.from_pandas(s, chunksize=8)
    assert len(ds.dask) == 3
    assert len(ds.divisions) == len(ds.dask) + 1
    assert isinstance(ds.divisions[0], type(s.index[0]))
    tm.assert_series_equal(s, ds.compute())
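
`from_pandas` accepts either a target partition count or a `chunksize` (rows per partition); with 20 rows, `chunksize=8` also yields ceil(20 / 8) == 3 partitions, which is why both halves of the test expect three tasks and four division boundaries. A small sketch of inspecting that:

import numpy as np
import pandas as pd
import dask.dataframe as dd

s = pd.Series(np.random.randn(20), index=pd.date_range("20120101", periods=20))

ds = dd.from_pandas(s, npartitions=3)
print(ds.npartitions, len(ds.divisions))   # 3 partitions, 4 boundary timestamps

ds = dd.from_pandas(s, chunksize=8)
print(ds.npartitions, len(ds.divisions))   # again 3 and 4
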
Example #7
def test_categorical_consistency():
    # Check that categoricals hash consistently with their values, not their codes.
    # This should work for categoricals of any dtype.
    for s1 in [
            pd.Series(["a", "b", "c", "d"]),
            pd.Series([1000, 2000, 3000, 4000]),
            pd.Series(pd.date_range(0, periods=4)),
    ]:
        s2 = s1.astype("category").cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)
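
The hashing here is value-based: a categorical is hashed through its category values, so reordering or re-setting the categories changes the codes but not the hashes. A sketch of the same check with pandas' public helper; the `categorize` flag only toggles an internal optimisation for object arrays and should not affect the result:

import pandas as pd
import pandas.testing as tm

s1 = pd.Series(["a", "b", "c", "d"])
# Same values, but stored as a categorical whose category order (and codes) differ.
s2 = s1.astype("category").cat.set_categories(list(reversed(s1)))

for categorize in [True, False]:
    h1 = pd.util.hash_pandas_object(s1, categorize=categorize)
    h2 = pd.util.hash_pandas_object(s2, categorize=categorize)
    tm.assert_series_equal(h1, h2)
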
Example #8
def test_concat_unions_categoricals():
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(_concat([i.y for i in frames]),
                           pd.concat([i.y for i in frames2]))

    # Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames3]),
                          pd.concat([i for i in frames4]).index)

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[["x", "z"]] for i in frames3]),
        pd.concat([i[["x", "z"]] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.z for i in frames3]),
                           pd.concat([i.z for i in frames4]))

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.x for i in frames3]),
                           pd.concat([i.x for i in frames4]))

    # MultiIndex with Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames5]),
                          pd.concat([i for i in frames6]).index)

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
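
The `frames` through `frames6` fixtures are defined earlier in the test module and were dropped from this excerpt; judging from the assertions, each `_concat` input uses categoricals whose category sets differ between frames, while the paired `pd.concat` input is set up so that pandas produces the same, already-unioned result. `_concat` is Dask's internal partition-concatenation helper (private API, so its import path can vary between releases). A deliberately hypothetical sketch of the pattern with made-up two-frame fixtures:

import pandas as pd
import pandas.testing as tm
from dask.dataframe.core import _concat  # private; location may differ by Dask version

# Hypothetical stand-ins for the truncated fixtures: `y` is categorical with
# a different category set in each frame.
frames = [
    pd.DataFrame({"x": [1, 2], "y": pd.Categorical(["a", "b"])}),
    pd.DataFrame({"x": [3, 4], "y": pd.Categorical(["b", "c"])}, index=[2, 3]),
]
# The comparison frames already share the full category set, so pd.concat
# keeps `y` categorical instead of falling back to object dtype.
frames2 = [f.assign(y=f.y.cat.set_categories(["a", "b", "c"])) for f in frames]

# Dask's helper unions the differing categories itself, so the results match.
tm.assert_frame_equal(_concat(frames), pd.concat(frames2))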