def test_custom_groupby():
    """Run user-supplied Python callables through eager and lazy groupby."""
    eager_df = DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})

    # Eager path: a callable returning whatever ``.sum()`` yields for its input.
    summed = eager_df.groupby("A").select("B").apply(lambda values: values.sum())
    assert summed.shape == (2, 2)

    # Eager path: a callable that rebuilds a Series from a NumPy array.
    rebuilt = eager_df.groupby("A").select("B").apply(
        lambda values: Series("", np.array(values))
    )
    assert rebuilt.shape == (2, 2)

    # Lazy path: the custom callable is attached to a column expression and
    # only evaluated on ``collect()``.
    lazy_source = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    custom_sum = col("a").apply(lambda values: values.sum(), dtype_out=int)
    out = lazy_source.lazy().groupby("b").agg([custom_sum]).collect()
    assert out.shape == (3, 2)
def test_groupby():
    """Check keyword-style ``groupby(by=..., select=..., agg=...)`` results.

    NOTE(review): another ``test_groupby`` is defined further down in this
    file. If both really live in the same module, the later definition
    replaces this one and it is never collected -- confirm, and rename if this
    test is still meant to run.
    """
    df = DataFrame(
        {
            "a": ["a", "b", "a", "b", "b", "c"],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [6, 5, 4, 3, 2, 1],
        }
    )

    # (selected column, aggregation name, expected value for keys a, b, c)
    cases = [
        ("b", "sum", [4, 11, 6]),
        ("c", "sum", [10, 10, 1]),
        ("b", "min", [1, 2, 6]),
        ("b", "min", [1, 2, 6]),  # this check appears twice in the test
        ("b", "max", [3, 5, 6]),
        ("b", "mean", [2.0, (2 + 4 + 5) / 3, 6.0]),
    ]
    for column, aggregation, expected in cases:
        result = df.groupby(by="a", select=column, agg=aggregation)
        assert result.frame_equal(
            DataFrame({"a": ["a", "b", "c"], "": expected})
        )

    # TODO: is false because count is u32
    # The comparison is executed but deliberately not asserted.
    df.groupby(by="a", select="b", agg="count").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    )
def test_groupby():
    """Exercise eager groupby: selection, aggregations, apply, groups, iteration."""
    df = DataFrame(
        {
            "a": ["a", "b", "a", "b", "b", "c"],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [6, 5, 4, 3, 2, 1],
        }
    )

    def expected_frame(values):
        """Build the expected frame: key column "a" plus an unnamed value column."""
        return DataFrame({"a": ["a", "b", "c"], "": values})

    # use __getitem__ to map to select
    via_getitem = df.groupby("a")["b"].sum().sort(by="a")
    assert via_getitem.frame_equal(expected_frame([4, 11, 6]))

    # (selected column, aggregation method, expected value for keys a, b, c)
    cases = [
        ("b", "sum", [4, 11, 6]),
        ("c", "sum", [10, 10, 1]),
        ("b", "min", [1, 2, 6]),
        ("b", "max", [3, 5, 6]),
        ("b", "mean", [2.0, (2 + 4 + 5) / 3, 6.0]),
        ("b", "last", [3, 5, 6]),
    ]
    for column, method, values in cases:
        aggregated = getattr(df.groupby("a").select(column), method)()
        # Results are sorted by key before comparing with the expected frame.
        assert aggregated.sort(by="a").frame_equal(expected_frame(values))

    # check if it runs
    for method, args in (("n_unique", ()), ("quantile", (0.3,)), ("agg_list", ())):
        getattr(df.groupby("a").select("b"), method)(*args)

    # Dict-style aggregation: output columns are named "<column>_<agg>".
    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    for column_name in ("b_sum", "b_min"):
        assert column_name in gb_df.columns

    # TODO: a frame_equal check on the "count" aggregation is disabled because
    # count is u32.

    per_group_sum = df.groupby("a").apply(lambda group: group[["c"]].sum())
    assert per_group_sum.sort("c")["c"][0] == 1

    group_keys = df.groupby("a").groups().sort("a")["a"]
    assert group_keys.series_equal(Series(["a", "b", "c"]))

    # Iterating a groupby yields sub-frames; only the "b" group is checked.
    for subdf in df.groupby("a"):
        if subdf["a"][0] != "b":
            continue
        assert subdf.shape == (3, 3)

    for key, shape in (("c", (1, 3)), ("b", (3, 3)), ("a", (2, 3))):
        assert df.groupby("a").get_group(key).shape == shape

    # Use lazy API in eager groupby
    assert df.groupby("a").agg([pl.sum("b")]).shape == (3, 2)