def test_n():
    """Check n() on an ungrouped frame and on a frame grouped by ``g``."""
    data = tibble(x=[1, 2, 3], g=[1, 1, 2])

    # Ungrouped: a single count covering every row.
    total = data >> summarise(n=n())
    assert_iterable_equal(total.n, [3])

    # Grouped by g: one count per group.
    by_g = data >> group_by(f.g)
    per_group = by_g >> summarise(n=n())
    assert_iterable_equal(per_group.n, [2, 1])
def test_peels_off_a_single_layer_of_grouping():
    """Check the grouping variables left after one and two summarise() calls."""
    data = tibble(
        x=rep([1, 2, 3, 4], each=4),
        y=rep([1, 2], each=8),
        z=runif(16),
    )
    grouped = data >> group_by(f.x, f.y)

    # One summarise() leaves only "x" as a grouping variable.
    once = summarise(grouped)
    assert group_vars(once) == ["x"]

    # A second summarise() leaves no grouping variables.
    twice = summarise(summarise(grouped))
    assert group_vars(twice) == []
def test_cur_group_rows():
    """Check the row indices reported by cur_group_rows()."""
    data = tibble(x=c("b", "a", "b"), y=[1, 2, 3])

    # Grouped (sorted) frame: one list of row indices per group.
    by_x = data >> group_by(f.x, _sort=True)
    grouped_rows = by_x >> summarise(x=cur_group_rows()) >> pull()
    assert grouped_rows.values.tolist() == [[1], [0, 2]]

    # Plain data frame: a single list holding every row index.
    all_rows = data >> summarise(x=cur_group_rows()) >> pull()
    assert all_rows.values.tolist() == [[0, 1, 2]]
def test_recycling():
    """Check recycling of summarise() results of differing lengths."""
    # Lengths 1 and 2: the expected frame has two rows in both columns.
    compatible = tibble(x=1, y=2)
    recycled = compatible >> summarise(
        across(everything(), lambda col: rep(42, col))
    )
    expected = tibble(x=rep(42, 2), y=rep(42, 2))
    assert recycled.equals(expected)

    # Lengths 2 and 3: a ValueError is expected.
    incompatible = tibble(x=2, y=3)
    with pytest.raises(ValueError):
        incompatible >> summarise(
            across(everything(), lambda col: rep(42, col))
        )
def test_works_with_empty_data_frames():
    """Check summarise() with no expressions on frames without data."""
    # A zero-row frame with one column.
    no_rows = tibble(x=[])
    summary = summarise(no_rows)
    single_row = tibble(_rows=1)
    assert summary.equals(single_row)

    # A ten-row frame without columns is compared to the same target.
    no_cols = tibble(_rows=10)
    summary = summarise(no_cols)
    assert summary.equals(single_row)
def test_summarise_cols_inside_func():
    """Check column expressions passed through a registered function."""
    data = tibble(x=2, y=4, z=8)

    @register_func(None, context=None)
    def data_frame(**kwargs):
        # Thin wrapper: forward the keyword arguments to tibble().
        return tibble(**kwargs)

    via_func = data >> summarise(
        data_frame(x=f.x / f.y, y=f.y / f.y, z=f.z / f.y)
    )
    # Reference result: divide every column by y through across().
    via_across = data >> summarise(
        across(everything(), lambda col: col / data.y)
    )
    assert via_func.equals(via_across)
def test_works_with_grouped_empty_data_frames():
    """Check summarise() on empty frames that are grouped or rowwise."""
    empty = tibble(x=[])

    # group_by() variant.
    from_grouped = empty >> group_by(f.x) >> summarise(y=1)
    assert dim(from_grouped) == (0, 2)
    assert from_grouped.columns.tolist() == ["x", "y"]

    # rowwise() variant: "x" is still reported as a grouping variable.
    from_rowwise = empty >> rowwise(f.x) >> summarise(y=1)
    assert group_vars(from_rowwise) == ["x"]
    assert dim(from_rowwise) == (0, 2)
    assert from_rowwise.columns.tolist() == ["x", "y"]
def test_to_functions():
    """Check that extra keyword arguments of across() reach the functions."""
    data = tibble(x=c(1, NA))  # -> float

    # Single function with na_rm=True.
    single = data >> summarise(across(everything(), mean, na_rm=True))
    assert_frame_equal(single, tibble(x=1.0))

    # Dict of functions with na_rm=True.
    several = data >> summarise(
        across(everything(), dict(mean=mean, median=median), na_rm=True)
    )
    assert_frame_equal(several, tibble(x_mean=1.0, x_median=1.0))
def test_cur_group():
    """Check cur_group() on an ungrouped and on a grouped frame."""
    data = tibble(g=1, x=1)
    by_g = data >> group_by(f.g)

    # Ungrouped: a single NaN key is expected.
    plain_key = data >> summarise(key=cur_group()) >> pull(f.key)
    assert len(plain_key) == 1
    assert_iterable_equal(plain_key, [np.nan])

    # Grouped: the single key is expected to equal tibble(g=1).
    group_keys = by_g >> summarise(key=cur_group()) >> pull(f.key, to="list")
    assert len(group_keys) == 1
    assert group_keys[0].equals(tibble(g=1))
def test_cur_group_id():
    """Check cur_group_id() under summarise() and mutate()."""
    data = tibble(x=c("b", "a", "b"))

    # Ungrouped: a single id of 0.
    plain = data >> summarise(id=cur_group_id())
    assert_iterable_equal(plain.id, [0])

    by_x = data >> group_by(f.x, _sort=True)

    # Original note: "group_by not sorted" -- presumably the reason
    # _sort=True is requested above; confirm against group_by().
    summarised = by_x >> summarise(id=cur_group_id())
    assert_frame_equal(summarised, tibble(x=c("a", "b"), id=[0, 1]))

    # mutate() keeps the original row order, one id per row.
    mutated = by_x >> mutate(id=cur_group_id())
    assert_frame_equal(mutated, tibble(x=["b", "a", "b"], id=[1, 0, 1]))
def test_error_messages():
    """Check the ValueError messages raised by across() and c_across()."""
    bad_fns = "Argument `_fns` of across must be"
    outside_verb = "must only be used inside verbs"

    # A non-callable `_fns` argument (42).
    with pytest.raises(ValueError, match=bad_fns):
        tibble(x=1) >> summarise(res=across(where(is_numeric), 42))

    # Both helpers called outside of any verb.
    with pytest.raises(ValueError, match=outside_verb):
        across()

    with pytest.raises(ValueError, match=outside_verb):
        c_across()
def test_zero_row_dfs():
    """Check shape and grouping after verbs on a zero-row grouped frame."""

    def check_grouped_by_g(result, shape):
        # Shared expectations: given shape, grouped by "g", no group sizes.
        assert result.shape == shape
        assert group_vars(result) == ["g"]
        assert group_size(result) == []

    empty = tibble(a=[], b=[], g=[])
    dfg = group_by(empty, f.g, _drop=False)
    check_grouped_by_g(dfg, (0, 3))

    # summarise() is the only verb here whose result has no grouping vars.
    summarised = summarise(dfg, n=n())
    assert summarised.shape == (0, 2)
    assert group_vars(summarised) == []

    mutated = mutate(dfg, c=f.b + 1)
    check_grouped_by_g(mutated, (0, 4))

    filtered = filter(dfg, f.a == 100)
    check_grouped_by_g(filtered, (0, 3))

    arranged = arrange(dfg, f.a, f.g)
    check_grouped_by_g(arranged, (0, 3))

    # Selecting only "a" still yields two columns and grouping by "g".
    selected = select(dfg, f.a)
    check_grouped_by_g(selected, (0, 2))
def test_input_recycled():
    """Check recycling of scalar results against longer ones."""
    # Ungrouped: scalars x and z alongside the three-element y.
    actual = tibble() >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(x=1, y=[1, 2, 3], z=1)
    assert actual.equals(expected)

    by_a = group_by(tibble(a=[1, 2]), f.a)

    # Grouped: every group is expected to contribute three rows.
    actual = by_a >> summarise(x=1, y=[1, 2, 3], z=1)
    flat = tibble(
        a=rep([1, 2], each=3),
        x=1,
        y=rep([1, 2, 3], 2),
        z=1,
    )
    expected = flat >> group_by(f.a)
    assert_tibble_equal(actual, expected)

    # Result length depends on the group: seq_len(a).
    actual = by_a >> summarise(x=seq_len(f.a), y=1)
    flat = tibble(a=c(1, 2, 2), x=c(1, 1, 2), y=1)
    expected = flat >> group_by(f.a)
    # The original had a plain `.equals()` comparison here, commented out.
    assert_tibble_equal(actual, expected)
def test_one_group_for_NA():
    """Check that three NA values and 10..1 (twice) give eleven groups."""
    values = c(NA, NA, NA, range(10, 0, -1), range(10, 0, -1))
    raw_weights = c(20, 30, 40, range(1, 11), range(1, 11))
    weights = numpy.array(raw_weights) * 10

    # Ten distinct numbers plus NA counted as one value.
    assert n_distinct(values, na_rm=False) == 11

    counts = (
        tibble(x=values, w=weights)
        >> group_by(f.x)
        >> summarise(n=n())
    )
    assert nrow(counts) == 11
def test_allows_names():
    """Check a grouped summarise() with length() and quantile() results."""
    data = tibble(x=[1, 2, 3], y=letters[:3])
    summary = (
        data
        >> group_by(f.y)
        >> summarise(a=length(f.x), b=quantile(f.x, 0.5))
    )
    # One row per group, so each median equals that group's x.
    assert summary.b.tolist() == [1.0, 2.0, 3.0]
def test_summarise_maintains_drop():
    """Check group counts for factor groupings under `_drop`."""
    data = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )

    both_dropped = data >> group_by(f.f1, f.f2, _drop=True)
    assert n_groups(both_dropped) == 1
    assert group_by_drop_default(both_dropped)

    # Original note: DataFrame.groupby(..., observed=False) doesn't support
    # multiple categoricals, so the two-factor `_drop=False` case is disabled:
    #   res1 = df >> group_by(f.f1, f.f2, _drop=False)
    #   ng = n_groups(res1)
    #   assert ng == 12

    # Single factor, unused levels dropped: only the observed level remains.
    f1_dropped = data >> group_by(f.f1, _drop=True)
    assert n_groups(f1_dropped) == 1

    # Single factor, levels kept: one group per declared level.
    f1_kept = data >> group_by(f.f1, _drop=False)
    assert n_groups(f1_kept) == 3

    f2_kept = data >> group_by(f.f2, _drop=False)
    assert n_groups(f2_kept) == 4

    # After summarise() with _groups="drop_last" the drop flag still holds.
    summarised = both_dropped >> summarise(x=sum(f.x), _groups="drop_last")
    assert n_groups(summarised) == 1
    assert group_by_drop_default(summarised)
def test_0col_df_in_results_ignored():
    """Check that an empty tibble() result adds no columns."""
    keys_only = tibble(x=[1, 2])

    # Only the empty frame: the result equals the input.
    result = keys_only >> group_by(f.x) >> summarise(tibble())
    assert result.equals(keys_only)

    # Empty frame plus a named scalar.
    result = keys_only >> group_by(f.x) >> summarise(tibble(), y=65)
    expected = keys_only >> mutate(y=65)
    assert result.equals(expected)

    # Same two checks starting from a frame with an extra column y.
    with_y = tibble(x=[1, 2], y=[3, 4])
    result = with_y >> group_by(f.x) >> summarise(tibble())
    assert result.equals(keys_only)

    result = with_y >> group_by(f.x) >> summarise(tibble(), z=98)
    expected = keys_only >> mutate(z=98)
    assert result.equals(expected)
def test_rowwise_preserved_by_major_verbs():
    """Check result type and grouping variables of verbs on a rowwise frame."""

    def check_rowwise(result, expected_vars):
        # Result is a TibbleRowwise with the given grouping variables.
        assert isinstance(result, TibbleRowwise)
        assert group_vars(result) == expected_vars

    rf = rowwise(tibble(x=range(1, 6), y=range(5, 0, -1)), f.x)

    arranged = arrange(rf, f.y)
    check_rowwise(arranged, ["x"])

    filtered = filter(rf, f.x < 3)
    check_rowwise(filtered, ["x"])

    mutated = mutate(rf, x=f.x + 1)
    check_rowwise(mutated, ["x"])

    # Renaming the grouping column renames the grouping variable too.
    renamed = rename(rf, X=f.x)
    check_rowwise(renamed, ["X"])

    selected = select(rf, "x")
    check_rowwise(selected, ["x"])

    sliced = slice(rf, c(0, 0))
    check_rowwise(sliced, ["x"])

    # Except for summarise: checked against TibbleGrouped instead.
    summarised = summarise(rf, z=mean(f.x, f.y))
    assert isinstance(summarised, TibbleGrouped)
    assert group_vars(summarised) == ["x"]
def test_names_output():
    """Check output column names of across() for various `_fns`/`_names`."""

    def names_of(frame):
        # Column labels of a result as a plain list.
        return frame.columns.tolist()

    gf = tibble(x=1, y=2, z=3, s="") >> group_by(f.x)

    # No functions: names unchanged, or built from the `_names` template.
    assert names_of(gf >> summarise(across())) == ["x", "y", "z", "s"]
    assert names_of(gf >> summarise(across(_names="id_{_col}"))) == [
        "x", "id_y", "id_z", "id_s",
    ]

    # A single function.
    assert names_of(gf >> summarise(across(where(is_numeric), mean))) == [
        "x", "y", "z",
    ]
    assert names_of(
        gf >> summarise(across(where(is_numeric), mean, _names="mean_{_col}"))
    ) == ["x", "mean_y", "mean_z"]

    # A dict of functions: keys become suffixes.
    assert names_of(
        gf >> summarise(across(where(is_numeric), {"mean": mean, "sum": sum}))
    ) == ["x", "y_mean", "y_sum", "z_mean", "z_sum"]

    # Different from R's list
    assert names_of(
        gf >> summarise(across(where(is_numeric), {"mean": mean, 1: sum}))
    ) == ["x", "y_mean", "y_1", "z_mean", "z_1"]

    # Different from R's list
    assert names_of(
        gf >> summarise(across(where(is_numeric), {0: mean, "sum": sum}))
    ) == ["x", "y_0", "y_sum", "z_0", "z_sum"]

    # A list of functions: positions become suffixes.
    assert names_of(
        gf >> summarise(across(where(is_numeric), [mean, sum]))
    ) == ["x", "y_0", "y_1", "z_0", "z_1"]

    # The `{_fn1}` placeholder yields 1 and 2 here rather than 0 and 1.
    assert names_of(
        gf >> summarise(
            across(where(is_numeric), [mean, sum], _names="{_col}_{_fn1}")
        )
    ) == ["x", "y_1", "y_2", "z_1", "z_2"]

    # A custom template that puts the function name first.
    assert names_of(
        gf >> summarise(
            across(
                where(is_numeric),
                {"mean": mean, "sum": sum},
                _names="{_fn}_{_col}",
            )
        )
    ) == ["x", "mean_y", "sum_y", "mean_z", "sum_z"]
def test_pd_cat():
    """Check that pd_cat(...).categories gives all levels for each group."""
    letters_cat = Categorical(["a", "b"], categories=["a", "b", "c"])
    grouped = tibble(x=letters_cat) >> group_by(g=[1, 2])

    out = grouped >> summarise(lvls=pd_cat(f.x).categories)

    # Both groups report all three categories, including the unused "c".
    for row in (0, 1):
        assert_iterable_equal(out.lvls[row], ["a", "b", "c"])
def test_list_output_columns():
    """Check a list-valued summary column built with ``f.x.apply(list)``."""
    data = tibble(x=range(1, 11), g=rep([1, 2], each=5))
    by_g = data >> group_by(f.g)

    collected = by_g >> summarise(y=f.x.apply(list))

    # The first group's cell holds its five x values.
    assert_iterable_equal(collected.y[0], [1, 2, 3, 4, 5])
def test_cur_data_all():
    """Check cur_data() and cur_data_all() on plain and grouped frames."""

    def flat(cell):
        # Flatten a frame stored in a result cell into a plain list.
        return cell.values.flatten().tolist()

    data = tibble(x=c("b", "a", "b"), y=[1, 2, 3])
    by_x = data >> group_by(f.x, _sort=True)

    # Ungrouped: both helpers give back the whole frame.
    whole = data >> summarise(x=cur_data()) >> pull(f.x, to="list")
    assert whole[0].equals(data)

    whole = data >> summarise(x=cur_data_all()) >> pull(f.x, to="list")
    assert whole[0].equals(data)

    # Grouped, cur_data(): only y values appear in each group's cell.
    cells = by_x >> summarise(x=cur_data()) >> pull(f.x)
    assert flat(cells.values[0]) == [2]
    assert flat(cells.values[1]) == [1, 3]

    # Grouped, cur_data_all(): the x values appear as well.
    cells = by_x >> summarise(x=cur_data_all()) >> pull(f.x)
    assert flat(cells.values[0]) == ["a", 2]
    assert flat(cells.values[1]) == ["b", 1, "b", 3]
def test_correctly_reconstructs_groups():
    """Check the group row indices after summarising over two group levels."""
    data = tibble(x=[1, 2, 3, 4], g1=rep([1, 2], 2), g2=[1, 2, 3, 4])
    grouped = data >> group_by(f.g1, f.g2)

    summarised = grouped >> summarise(x=f.x + 1)

    # Different from dplyr, original df does not reorder.
    assert group_rows(summarised) == [[0, 2], [1, 3]]
def test_summarise_rowwise():
    """Check per-row result lengths of a rowwise summarise() with rnorm()."""
    params = tibble(
        sim=[1, 2, 3],
        n=[1, 2, 3],
        mean=[1, 2, 1],
        sd=[1, 4, 2],
    )
    by_sim = params >> rowwise(f.sim)

    out = by_sim >> summarise(z=rnorm(f.n, f.mean, f.sd))

    # Two columns: the rowwise key "sim" plus the new "z".
    assert len(out.columns) == 2

    # Row i drew n[i] values, so the cell lengths are 1, 2 and 3.
    for row, expected_len in enumerate((1, 2, 3)):
        assert len(out.z.obj.values[row]) == expected_len
def test_n_distinct_handles_in_na_rm():
    """Check n_distinct() with `na_rm` as literal, variable and expression."""
    d = tibble(x=c([1, 2, 3, 4], NA))
    yes = True
    no = False

    cases = (
        (True, [4]),           # literal: NA excluded
        (False, [5]),          # literal: NA counted
        (yes, [4]),            # variable holding True
        (no, [5]),             # variable holding False
        (True or True, [4]),   # boolean expression
    )
    for flag, expected in cases:
        counted = (
            d
            >> summarise(n=n_distinct(f.x, na_rm=flag))
            >> pull(to="list")
        )
        assert counted == expected
def test_n_distinct_works_with_str_col():
    """Check n_distinct() on a column addressed by name through ``f[col]``."""

    # A nested def replaces the original name-bound lambda (PEP 8, E731);
    # the call it makes is unchanged.
    def wrapper(data, col):
        return summarise(data, result=n_distinct(f[col], na_rm=True))

    df = tibble(x=[1, 1, 3, NA])
    out = wrapper(df, "x")

    # Two distinct non-NA values: 1 and 3.
    exp = tibble(result=2)
    assert out.equals(exp)
def test_mutate_does_not_loose_variables():
    """Check the columns after regrouping a summary by a computed column."""
    data = tibble(
        a=rep([1, 2, 3, 4], 2),
        b=rep([1, 2, 3, 4], each=2),
        x=runif(8),
    )
    grouped_ab = data >> group_by(f.a, f.b)
    summed = grouped_ab >> summarise(x=sum(f.x), _groups="drop_last")

    # Regroup by a new column derived from x.
    regrouped = summed >> group_by(quantile=ntile(f.x, 4))

    assert regrouped.columns.tolist() == ["a", "b", "x", "quantile"]
def test_can_be_before_group_by():
    """Check the column names of a group_by >> select >> summarise chain."""
    data = tibble(
        id=c(1, 1, 2, 2, 2, 3, 3, 4, 4, 5),
        year=c(2013, 2013, 2012, 2013, 2013, 2013, 2012, 2012, 2013, 2013),
        var1=rnorm(10),
    )

    aggregated = (
        data
        >> group_by(f.id, f.year)
        >> select(f.id, f.year, f.var1)
        >> summarise(var1=mean(f.var1))
    )

    assert_iterable_equal(names(aggregated), ["id", "year", "var1"])
def test_result_locations_aligned_with_column_names():
    """Check that each across() result lands under its column/function name."""
    data = tibble(x=[1, 2], y=["a", "b"])

    # Per column: its dtype under "<col>_cls", is_numeric under "<col>_type".
    checks = {"cls": lambda x: x.dtype, "type": is_numeric}
    actual = data >> summarise(across(everything(), checks))

    expected = tibble(
        x_cls=numpy.int64,
        x_type=True,
        y_cls=object,
        y_type=False,
    )
    assert_frame_equal(actual, expected)
def test_c_across():
    """Check c_across() with explicit columns and with no columns."""
    data = tibble(x=[1, 2], y=[3, 4])

    # Explicit columns: the "z" result is compared with the input frame.
    picked = data >> summarise(z=c_across([f.x, f.y]))
    assert_frame_equal(picked["z"], data)

    # what if no columns specified
    by_row = data >> rowwise(f.x)
    summed = by_row >> mutate(z=sum(c_across()))
    assert summed.z.obj.tolist() == [3, 4]