def test_custom_groupby():
    """Check the output shape of custom per-group functions.

    The first two assertions call ``apply`` on a selected groupby column,
    once with a callback that returns ``x.sum()`` and once with a callback
    that wraps the group in a new ``Series``. The last assertion goes
    through ``df.lazy()`` and ``apply_groups`` inside ``agg``.
    """

    def group_total(x):
        return x.sum()

    def group_as_series(x):
        return Series("", np.array(x))

    letters = DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})

    summed = letters.groupby("A").select("B").apply(group_total)
    assert summed.shape == (2, 2)

    rewrapped = letters.groupby("A").select("B").apply(group_as_series)
    assert rewrapped.shape == (2, 2)

    numbers = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    lazy_query = (
        numbers.lazy()
        .groupby("b")
        .agg([col("a").apply_groups(lambda x: x.sum(), dtype_out=int)])
    )
    out = lazy_query.collect()
    # Column "b" holds three distinct keys, hence three output rows.
    assert out.shape == (3, 2)
def test_groupby():
    """Exercise the basic groupby aggregations on a small frame.

    Groups by column "a" and compares sum, min, max, mean and last with
    hand-computed frames, then smoke-tests ``n_unique``, ``quantile`` and
    ``agg_list``, and finally checks the column names produced by a
    dict-style ``agg``.

    Every aggregated frame is sorted by the key column before it is
    compared, so the assertions do not depend on the order in which the
    groupby emits its groups.
    """
    df = DataFrame(
        {
            "a": ["a", "b", "a", "b", "b", "c"],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [6, 5, 4, 3, 2, 1],
        }
    )

    assert (
        df.groupby("a")
        .select("b")
        .sum()
        .sort(by_column="a")
        .frame_equal(DataFrame({"a": ["a", "b", "c"], "": [4, 11, 6]}))
    )
    assert (
        df.groupby("a")
        .select("c")
        .sum()
        .sort(by_column="a")
        .frame_equal(DataFrame({"a": ["a", "b", "c"], "": [10, 10, 1]}))
    )
    assert (
        df.groupby("a")
        .select("b")
        .min()
        .sort(by_column="a")
        .frame_equal(DataFrame({"a": ["a", "b", "c"], "": [1, 2, 6]}))
    )
    assert (
        df.groupby("a")
        .select("b")
        .max()
        .sort(by_column="a")
        .frame_equal(DataFrame({"a": ["a", "b", "c"], "": [3, 5, 6]}))
    )
    assert (
        df.groupby("a")
        .select("b")
        .mean()
        .sort(by_column="a")
        .frame_equal(
            DataFrame({"a": ["a", "b", "c"], "": [2.0, (2 + 4 + 5) / 3, 6.0]})
        )
    )
    assert (
        df.groupby("a")
        .select("b")
        .last()
        .sort(by_column="a")
        .frame_equal(DataFrame({"a": ["a", "b", "c"], "": [3, 5, 6]}))
    )

    # Smoke tests: these only need to run without raising.
    df.groupby("a").select("b").n_unique()
    df.groupby("a").select("b").quantile(0.3)
    df.groupby("a").select("b").agg_list()

    # Dict-style aggregation names its outputs "<column>_<aggregation>".
    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns
def test_groupby():
    """Compare single-column groupby aggregations with expected frames.

    Groups by column "a" and checks sum, min, max, mean and last, then
    smoke-tests ``n_unique`` and ``quantile``.

    Each aggregated frame is sorted by the key column before the
    comparison, so the assertions do not rely on the order in which the
    groupby emits its groups.
    """
    df = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    def selected(column):
        # Build a fresh groupby selection for every aggregation.
        return df.groupby("a").select(column)

    def expected(values):
        # Expected frame: sorted keys plus the unnamed aggregate column.
        return DataFrame({"a": ["a", "b", "c"], "": values})

    assert selected("b").sum().sort(by_column="a").frame_equal(
        expected([4, 11, 6]))
    assert selected("c").sum().sort(by_column="a").frame_equal(
        expected([10, 10, 1]))
    assert selected("b").min().sort(by_column="a").frame_equal(
        expected([1, 2, 6]))
    assert selected("b").max().sort(by_column="a").frame_equal(
        expected([3, 5, 6]))
    assert selected("b").mean().sort(by_column="a").frame_equal(
        expected([2.0, (2 + 4 + 5) / 3, 6.0]))
    assert selected("b").last().sort(by_column="a").frame_equal(
        expected([3, 5, 6]))

    # Smoke tests: these only need to run without raising.
    selected("b").n_unique()
    selected("b").quantile(0.3)
def test_groupby():
    """Check groupby aggregations, dict-style ``agg`` and ``apply``.

    Groups by column "a"; every aggregated frame is sorted by that key
    before being compared with its hand-computed counterpart.
    """
    frame = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    # (selected column, aggregation method name, expected aggregate values)
    cases = (
        ("b", "sum", [4, 11, 6]),
        ("c", "sum", [10, 10, 1]),
        ("b", "min", [1, 2, 6]),
        ("b", "max", [3, 5, 6]),
        ("b", "mean", [2.0, (2 + 4 + 5) / 3, 6.0]),
        ("b", "last", [3, 5, 6]),
    )
    for column, method, values in cases:
        selection = frame.groupby("a").select(column)
        aggregated = getattr(selection, method)().sort(by_column="a")
        assert aggregated.frame_equal(
            DataFrame({"a": ["a", "b", "c"], "": values}))

    # Smoke tests: these only need to run without raising.
    frame.groupby("a").select("b").n_unique()
    frame.groupby("a").select("b").quantile(0.3)
    frame.groupby("a").select("b").agg_list()

    gb_df = frame.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    # TODO: is false because count is u32
    # df.groupby(by="a", select="b", agg="count").frame_equal(
    #     DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    # )

    per_group_totals = frame.groupby("a").apply(
        lambda group: group[["c"]].sum())
    assert per_group_totals.sort("c")["c"][0] == 1