def test_rep_df():
    """Check rep() on a data frame: `each` raises, times/length recycle rows."""
    frame = tibble(x=f[:3])

    # Repeating a data frame with `each` is expected to be refused.
    with pytest.raises(ValueError):
        rep(frame, each=2)

    repeated = rep(frame, times=2, length=5)
    expected = tibble(x=[0, 1, 2, 0, 1])
    assert_frame_equal(repeated, expected)
def test_peels_off_a_single_layer_of_grouping():
    """Check that each summarise() call leaves one fewer grouping variable."""
    data = tibble(
        x=rep([1, 2, 3, 4], each=4),
        y=rep([1, 2], each=8),
        z=runif(16),
    )
    grouped = data >> group_by(f.x, f.y)

    # One summarise keeps only "x"; summarising again leaves no groups.
    assert group_vars(summarise(grouped)) == ["x"]
    assert group_vars(summarise(summarise(grouped))) == []
def test_dense_rank_with_groups():
    """Check dense_rank() on a plain column and on the same column grouped by y."""
    frame = tibble(x=rep(f[1:5], each=2), y=rep([1, 2], each=4))

    plain_ranks = dense_rank(frame.x)
    assert plain_ranks.tolist() == [1, 1, 2, 2, 3, 3, 4, 4]

    # After grouping, the expected ranks restart within each value of y.
    by_y = frame.groupby("y")
    grouped_ranks = dense_rank(by_y.x)
    assert grouped_ranks.tolist() == [1, 1, 2, 2, 1, 1, 2, 2]
def test_min_rank_with_groups():
    """Check min_rank() on a plain column and on the same column grouped by y."""
    frame = tibble(x=rep(f[1:5], each=2), y=rep([1, 2], each=4))

    plain_ranks = min_rank(frame.x)
    assert plain_ranks.tolist() == [1, 1, 3, 3, 5, 5, 7, 7]

    # After grouping, the expected ranks restart within each value of y.
    by_y = frame.groupby("y")
    grouped_ranks = min_rank(by_y.x)
    assert grouped_ranks.tolist() == [1, 1, 3, 3, 1, 1, 3, 3]
def test_mutate_does_not_loose_variables():
    """Check that regrouping a summarised frame keeps all earlier columns."""
    data = tibble(
        a=rep([1, 2, 3, 4], 2),
        b=rep([1, 2, 3, 4], each=2),
        x=runif(8),
    )

    by_ab = data >> group_by(f.a, f.b)
    by_a = by_ab >> summarise(x=sum(f.x), _groups="drop_last")
    by_a_quantile = by_a >> group_by(quantile=ntile(f.x, 4))

    # The new grouping column is expected to be appended after a, b and x.
    column_names = by_a_quantile.columns.tolist()
    assert column_names == ["a", "b", "x", "quantile"]
def test_recycling():
    """Check summarise() recycling of across() outputs with differing lengths."""
    # Lengths 1 and 2: the result is expected to have two rows in each column.
    compatible = tibble(x=1, y=2)
    summarised = compatible >> summarise(
        across(everything(), lambda col: rep(42, col))
    )
    expected = tibble(x=rep(42, 2), y=rep(42, 2))
    assert summarised.equals(expected)

    # Lengths 2 and 3: a ValueError is expected.
    incompatible = tibble(x=2, y=3)
    with pytest.raises(ValueError):
        incompatible >> summarise(
            across(everything(), lambda col: rep(42, col))
        )
def test_complex_expression_as_value():
    """Check that mutate() on grouped data accepts a sample() expression."""
    # https://stackoverflow.com/questions/30714810/
    # pandas-group-by-and-aggregate-column-1-with-condition-from-column-2
    users = tibble(
        user=rep(c("1", 2, 3, 4), each=5),
        cancel_date=rep(c(12, 5, 10, 11), each=5),
    )
    dat = users >> group_by(f.user)

    # Multiple sizes are not supported yet, i.e.
    # login=sample(f[1 : ], size=n(), replace=True)
    with_login = dat >> mutate(login=sample(f[1:], size=1, replace=True))

    assert nrow(with_login) == 20
def test_cume_dist_with_groups():
    """Check cume_dist() on a plain column and on the same column grouped by y."""
    frame = tibble(x=rep(f[1:5], each=2), y=rep([1, 2], each=4))

    plain_dist = cume_dist(frame.x)
    assert_iterable_equal(
        plain_dist,
        [0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 1.0, 1.0],
        approx=1e-3,
    )

    # After grouping, the expected distribution restarts within each y.
    by_y = frame.groupby("y")
    grouped_dist = cume_dist(by_y.x)
    assert_iterable_equal(
        grouped_dist,
        [0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0],
    )
def test_percent_rank_with_groups():
    """Check percent_rank() on a plain column and on the column grouped by y."""
    frame = tibble(x=rep(f[1:5], each=2), y=rep([1, 2], each=4))

    plain_ranks = percent_rank(frame.x)
    assert_iterable_equal(
        plain_ranks,
        [0.0, 0.0, 0.333, 0.333, 0.666, 0.666, 1.0, 1.0],
        approx=1e-3,
    )

    # After grouping, the expected ranks restart within each value of y.
    by_y = frame.groupby("y")
    grouped_ranks = percent_rank(by_y.x)
    assert_iterable_equal(
        grouped_ranks,
        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0],
    )
def test_arguments_to_sample_are_passed_along():
    """Check that slice_sample() honours `weight_by` and `replace`."""
    # Only the first row has a non-zero weight.
    weighted = tibble(x=range(1, 101), wt=c(1, rep(0, 99)))

    single = weighted >> slice_sample(n=1, weight_by=f.wt)
    assert single.x.tolist() == [1]

    # With replacement, the same row is expected to be drawn twice.
    pair = weighted >> slice_sample(n=2, weight_by=f.wt, replace=True)
    assert pair.x.tolist() == [1, 1]
def test_input_recycled():
    """Check that summarise() recycles scalars against longer outputs."""
    # Ungrouped: scalars x and z are expected to stretch to the length of y.
    summarised = tibble() >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(x=1, y=[1, 2, 3], z=1)
    assert summarised.equals(expected)

    # Grouped: the same recycling is expected within each group of `a`.
    gf = group_by(tibble(a=[1, 2]), f.a)
    summarised = gf >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(
        a=rep([1, 2], each=3),
        x=1,
        y=rep([1, 2, 3], 2),
        z=1,
    ) >> group_by(f.a)
    assert_tibble_equal(summarised, expected)

    # Output length varies per group (seq_len of the group key).
    summarised = gf >> summarise(x=seq_len(f.a), y=1)
    expected = tibble(a=c(1, 2, 2), x=c(1, 1, 2), y=1) >> group_by(f.a)
    assert_tibble_equal(summarised, expected)
def test_handles_simple_symbols():
    """Check filter() with column symbols, local values and forwarded conditions."""
    df = tibble(x=range(1, 5), test=rep(c(True, False), each=2))

    # These two calls are only exercised; their results are not inspected.
    res = filter(df, f.test)
    gdf = group_by(df, f.x)
    res = filter(gdf, f.test)

    def filter_by_local_mask(data):
        mask = c(True, True, False, False)
        return filter(data, mask)

    masked = filter_by_local_mask(df)
    assert masked.equals(df.iloc[:2, :])

    def filter_above_one(data, *args):
        lower = 1
        return filter(data, f.test, f.x > lower, *args)

    def filter_between(data, *args):
        upper = 4
        return filter_above_one(data, f.x < upper, *args)

    # Ungrouped input: plain columns come back.
    res = filter_between(df)
    assert res.x.tolist() == [2]
    assert res.test.tolist() == [True]

    # Grouped input: values are read through `.obj`.
    res = filter_between(gdf)
    assert res.x.obj.tolist() == [2]
    assert res.test.obj.tolist() == [True]
def test_list_output_columns():
    """Check that summarise() can produce a column whose cells are lists."""
    data = tibble(x=range(1, 11), g=rep([1, 2], each=5))
    grouped = data >> group_by(f.g)
    res = grouped >> summarise(y=f.x.apply(list))

    # The first cell is expected to hold the x values of the first group.
    first_cell = res.y[0]
    assert_iterable_equal(first_cell, [1, 2, 3, 4, 5])
def test_correctly_reconstructs_groups():
    """Check the group row indices after summarising a two-key grouping."""
    source = tibble(x=[1, 2, 3, 4], g1=rep([1, 2], 2), g2=[1, 2, 3, 4])
    summarised = (
        source
        >> group_by(f.g1, f.g2)
        >> summarise(x=f.x + 1)
    )

    # Different from dplyr, original df does not reorder.
    assert group_rows(summarised) == [[0, 2], [1, 3]]
def test_group_split_keeps_group_variables_by_default():
    """Check that group_split() pieces still carry the grouping column."""
    tbl = tibble(x=[1, 2, 3, 4], g=factor(rep(["a", "b"], each=2)))

    pieces = list(group_split(tbl, f.g))
    assert len(pieces) == 2

    # Each piece is compared with the matching rows of the full table;
    # the second slice is re-indexed from zero before comparing.
    first_expected = tbl.iloc[[0, 1], :]
    second_expected = tbl.iloc[[2, 3], :].reset_index(drop=True)
    assert pieces[0].equals(first_expected)
    assert pieces[1].equals(second_expected)
def test_slicex_on_grouped_data():
    """Check slice_min/slice_max/slice_sample on a frame grouped by g."""
    gf = tibble(g=rep([1, 2], each=3), x=seq(1, 6)) >> group_by(f.g)

    # Smallest x per group.
    lowest = gf >> slice_min(f.x)
    assert lowest.equals(tibble(g=[1, 2], x=[1, 4]))

    # Largest x per group.
    highest = gf >> slice_max(f.x)
    assert highest.equals(tibble(g=[1, 2], x=[3, 6]))

    # Default sampling is expected to give a 2 x 2 result.
    sampled = gf >> slice_sample()
    assert dim(sampled) == (2, 2)
def test_cache_key():
    """Check that two across() calls with different functions stay distinct."""
    grouped = tibble(g=rep([1, 2], each=2), a=range(1, 5)) >> group_by(f.g)

    # Same column selection, different functions (mean vs. max).
    via_across = grouped >> mutate(
        tibble(
            x=across(where(is_numeric), mean).a,
            y=across(where(is_numeric), max).a,
        )
    )
    direct = grouped >> mutate(x=mean(f.a), y=max(f.a))

    assert_frame_equal(via_across, direct)
def test_slice_handles_na():
    """Check the row counts slice() yields when the indices contain NA."""
    plain = tibble(x=[1, 2, 3])
    assert nrow(slice(plain, NA)) == 0
    assert nrow(slice(plain, c(1, NA))) == 1

    # Negated index combined with NA.
    remaining = plain >> slice(c(~c(1), NA)) >> nrow()
    assert remaining == 2

    # The same checks on a frame grouped by g.
    grouped = tibble(x=[1, 2, 3, 4], g=rep([1, 2], 2)) >> group_by(f.g)
    assert nrow(slice(grouped, c(1, NA))) == 2

    remaining = grouped >> slice(c(~c(1), NA)) >> nrow()
    assert remaining == 2
def test_group_modify_works_with_additional_arguments():
    """Check that group_modify() forwards extra keyword arguments to `_f`."""

    def myfun(x, y, foo):
        # Return a copy with a constant column named by `foo`.
        modified = x.copy()
        modified[foo] = 1
        return modified

    srcdata = tibble(A=rep([1, 2], each=3)) >> group_by(f.A)

    # Expected result: the source data plus a constant "bar" column.
    targetdata = srcdata.copy()
    targetdata["bar"] = 1

    modified = group_modify(srcdata, _f=myfun, foo="bar")
    assert_frame_equal(modified.reset_index(drop=True), targetdata)
def test_rep_sgb_param(caplog):
    """Check rep() when times/length/each are given as grouped columns."""
    grouped = tibble(
        x=[1, 1, 2, 2],
        times=[1, 2, 1, 2],
        length=[3, 4, 4, 3],
        each=[1, 1, 1, 1],
    ).group_by("x")

    # Grouped `times` passed positionally.
    result = rep([1, 2], grouped.times)
    assert_iterable_equal(result.obj, [1, 2, 2, 1, 2, 2])

    # Grouped `times` and `length`; a log message containing
    # "first element" is expected from this call.
    result = rep([1, 2], times=grouped.times, each=1, length=grouped.length)
    assert "first element" in caplog.text
    assert_iterable_equal(result.obj, [1, 2, 2, 1, 2, 2, 1])
    assert_iterable_equal(result.grouper.size(), [3, 4])

    small = tibble(x=[1, 2], each=[1, 1]).group_by("x")

    # Grouped input with grouped `each`.
    result = rep(small.x, each=small.each)
    assert_iterable_equal(result.obj, [1, 2])

    # Every parameter supplied as a grouped column.
    result = rep(small.x, times=small.each, length=small.each, each=small.each)
    assert_iterable_equal(result.obj, [1, 2])

    # Scalar input with grouped `each`.
    result = rep(3, each=small.each)
    assert_iterable_equal(result.obj, [3, 3])

    # Underlying (ungrouped) column with a plain `times`.
    result = rep(small.x.obj, 2)
    assert_iterable_equal(result, [1, 2, 1, 2])
def test_errors():
    """Check the exceptions arrange() raises for invalid inputs."""
    x = Series(1, name="x")

    # Duplicate column names.
    duplicated = tibble(x, x, _name_repair="minimal")
    with pytest.raises(NameNonUniqueError):
        duplicated >> arrange(f.x)

    single = tibble(x=x)

    # Column that does not exist.
    with pytest.raises(KeyError):
        single >> arrange(f.y)

    # Sort key longer than the frame.
    with pytest.raises(ValueError, match="Length of values"):
        single >> arrange(rep(f.x, 2))
def test_joins_preserve_groups():
    """Check that join results keep the grouping of the left-hand frame."""
    gf1 = tibble(a=[1, 2, 3]) >> group_by(f.a)
    gf2 = tibble(a=rep([1, 2, 3, 4], 2), b=1) >> group_by(f.b)

    joined = inner_join(gf1, gf2, by="a")
    assert group_vars(joined) == ["a"]

    joined = semi_join(gf1, gf2, by="a")
    assert group_vars(joined) == ["a"]

    # See comment in nest_join
    joined = nest_join(gf1, gf2, by="a")
    assert group_vars(joined) == ["a"]
def test_group_list_respects_empty_groups():
    """Check group_split.list() with an unused factor level, with/without drop."""
    tbl = tibble(
        x=[1, 2, 3, 4],
        g=factor(rep(["a", "b"], each=2), levels=["a", "b", "c"]),
    )

    # Default call: only the first two pieces are checked.
    pieces = group_split.list(tbl, f.g)
    assert pieces[0].equals(tbl.iloc[:2, :])
    assert pieces[1].equals(tbl.iloc[[2, 3], :].reset_index(drop=True))

    # _drop=False: a third, empty piece is expected for the unused level.
    pieces = group_split.list(tbl, f.g, _drop=False)
    assert pieces[0].equals(tbl.iloc[:2, :])
    assert pieces[1].equals(tbl.iloc[[2, 3], :].reset_index(drop=True))
    assert pieces[2].equals(tbl.iloc[[], :])
def test_slice_gives_correct_rows():
    """Check which rows slice() returns for plain and grouped frames."""
    labels = tibble(value=[f"row{i}" for i in range(1, 11)])

    picked = slice(labels, c(0, 1, 2))
    assert picked.value.tolist() == ["row1", "row2", "row3"]

    picked = slice(labels, c(3, 5, 8))
    assert picked.value.tolist() == ["row4", "row6", "row9"]

    # Two groups of five rows: indices are expected to apply per group.
    grouped = tibble(
        value=[f"row{i}" for i in range(1, 11)],
        group=rep([1, 2], each=5),
    ) >> group_by(f.group)

    picked = slice(grouped, f[:3])
    assert picked.value.obj.tolist() == [
        f"row{i}" for i in [1, 2, 3, 6, 7, 8]
    ]

    picked = slice(grouped, c(1, 3))
    assert picked.value.obj.tolist() == [f"row{i}" for i in [2, 4, 7, 9]]
def test_can_recycle_when_add_multiple_columns_of_len1():
    """Check that add_column() recycles several scalar columns to the row count."""
    base = tibble(a=[1, 2, 3])
    extended = add_column(base, b=4, c=5)

    expected = tibble(a=[1, 2, 3], b=rep(4, 3), c=rep(5, 3))
    assert_frame_equal(extended, expected)
def test_can_recycle_when_adding_a_column_of_len1():
    """Check that add_column() recycles a single scalar column to the row count."""
    base = tibble(a=[1, 2, 3])
    extended = add_column(base, b=4)

    expected = tibble(a=[1, 2, 3], b=rep(4, 3))
    assert_frame_equal(extended, expected)
def test_can_recycle_when_adding_columns():
    """Check add_column() with one scalar column and one full-length column."""
    base = tibble(a=[1, 2, 3])
    extended = add_column(base, b=4, c=[3, 2, 1])

    # Only the scalar `b` needs recycling; `c` already has three values.
    expected = tibble(a=[1, 2, 3], b=rep(4, 3), c=[3, 2, 1])
    assert_frame_equal(extended, expected)
def test_rep_error():
    """Check that rep() raises ValueError for the two argument mixes below."""
    # `times` vector shorter than the input.
    with pytest.raises(ValueError):
        rep(c(1, 2, 3), c(1, 2))

    # `times` vector combined with `each`.
    with pytest.raises(ValueError):
        rep(c(1, 2, 3), c(1, 2, 3), each=2)
def test_rep_grouped_df():
    """Check that rep() on a grouped frame returns a grouped frame."""
    grouped = tibble(x=f[:3], g=[1, 1, 2]).group_by("g")
    repeated = rep(grouped, 2, length=5)

    assert isinstance(repeated, TibbleGrouped)
    assert_iterable_equal(repeated.x.obj, [0, 1, 2, 0, 1])

    # The result is still expected to have two groups.
    n_groups = repeated._datar["grouped"].grouper.ngroups
    assert n_groups == 2
def test_rep(x, times, length, each, expected):
    """Check rep() output for one combination of arguments.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible in this chunk; confirm against the file.
    """
    actual = rep(x, times=times, length=length, each=each)
    assert_iterable_equal(actual, expected)