Example #1
0
def test_custom_groupby():
    """Check result shapes when custom Python callables are applied per group.

    Covers the eager ``groupby(...).select(...).apply(...)`` path and the lazy
    ``groupby(...).agg([col(...).apply(...)])`` path.
    """
    df = DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})

    # Eager path: the callable sums each group.
    summed = df.groupby("A").select("B").apply(lambda x: x.sum())
    assert summed.shape == (2, 2)

    # Eager path: the callable wraps each group in a new, unnamed Series.
    wrapped = df.groupby("A").select("B").apply(
        lambda x: Series("", np.array(x)))
    assert wrapped.shape == (2, 2)

    df = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})

    # Lazy path: custom aggregation expression with an explicit output dtype.
    custom_sum = col("a").apply(lambda x: x.sum(), dtype_out=int)
    out = df.lazy().groupby("b").agg([custom_sum]).collect()
    assert out.shape == (3, 2)
Example #2
0
def test_groupby():
    """Check ``groupby(by=..., select=..., agg=...)`` against expected frames.

    Each case groups on column "a", aggregates one selected column, and
    compares the result with a frame whose aggregated column has an empty
    name.
    """
    df = DataFrame(
        {
            "a": ["a", "b", "a", "b", "b", "c"],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [6, 5, 4, 3, 2, 1],
        }
    )

    # (selected column, aggregation, expected values for groups "a", "b", "c")
    cases = [
        ("b", "sum", [4, 11, 6]),
        ("c", "sum", [10, 10, 1]),
        ("b", "min", [1, 2, 6]),
        ("b", "max", [3, 5, 6]),
        ("b", "mean", [2.0, (2 + 4 + 5) / 3, 6.0]),
    ]
    for column, agg, expected in cases:
        result = df.groupby(by="a", select=column, agg=agg)
        assert result.frame_equal(
            DataFrame({"a": ["a", "b", "c"], "": expected})
        ), "unexpected result for agg=%r on column %r" % (agg, column)

    # TODO: the comparison result is deliberately not asserted; it is false
    # because count is u32. The call is kept so the code path still runs.
    df.groupby(by="a", select="b", agg="count").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    )
Example #3
0
def test_groupby():
    """Exercise the eager groupby API on a small three-group frame.

    Covers column selection (``select`` and ``__getitem__``), the built-in
    aggregations, dict-style ``agg``, ``apply``, ``groups``, iteration,
    ``get_group`` and lazy expressions inside an eager ``agg``.
    """
    df = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    def matches(aggregated, values):
        # Sort by the key column and compare with the expected frame; the
        # aggregated column in the expected frame has an empty name.
        reference = DataFrame({"a": ["a", "b", "c"], "": values})
        return aggregated.sort(by="a").frame_equal(reference)

    # use __getitem__ to map to select
    assert matches(df.groupby("a")["b"].sum(), [4, 11, 6])

    assert matches(df.groupby("a").select("b").sum(), [4, 11, 6])
    assert matches(df.groupby("a").select("c").sum(), [10, 10, 1])
    assert matches(df.groupby("a").select("b").min(), [1, 2, 6])
    assert matches(df.groupby("a").select("b").max(), [3, 5, 6])
    assert matches(
        df.groupby("a").select("b").mean(), [2.0, (2 + 4 + 5) / 3, 6.0])
    assert matches(df.groupby("a").select("b").last(), [3, 5, 6])

    # check if it runs
    df.groupby("a").select("b").n_unique()
    df.groupby("a").select("b").quantile(0.3)
    df.groupby("a").select("b").agg_list()

    # Dict-style aggregation names its outputs "<column>_<aggregation>".
    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    # TODO: a comparison of
    #   df.groupby(by="a", select="b", agg="count")
    # with DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]}) is not checked
    # here: it is false because count is u32.

    # Custom function per group: the smallest per-group sum of "c" is 1.
    column_sums = df.groupby("a").apply(lambda group: group[["c"]].sum())
    assert column_sums.sort("c")["c"][0] == 1

    group_keys = df.groupby("a").groups().sort("a")["a"]
    assert group_keys.series_equal(Series(["a", "b", "c"]))

    # Iterating a groupby yields one sub-frame per group.
    for subdf in df.groupby("a"):
        if subdf["a"][0] != "b":
            continue
        assert subdf.shape == (3, 3)

    # (group key, expected number of rows); every group keeps all 3 columns.
    for key, n_rows in (("c", 1), ("b", 3), ("a", 2)):
        assert df.groupby("a").get_group(key).shape == (n_rows, 3)

    # Use lazy API in eager groupby
    lazy_agg = df.groupby("a").agg([pl.sum("b")])
    assert lazy_agg.shape == (3, 2)