def test_cache_key():
    """Two across() calls with different functions must give different results.

    On a frame grouped by ``g``, a tibble built from ``across(..., mean).a``
    and ``across(..., max).a`` is passed to ``mutate()`` and compared with
    mutating by ``mean(f.a)`` / ``max(f.a)`` directly.
    """
    grouped = tibble(g=rep([1, 2], each=2), a=range(1, 5)) >> group_by(f.g)

    actual = grouped >> mutate(
        tibble(
            x=across(where(is_numeric), mean).a,
            y=across(where(is_numeric), max).a,
        )
    )
    expected = grouped >> mutate(x=mean(f.a), y=max(f.a))

    assert_frame_equal(actual, expected)
def test_with_group_id():
    """A registered function can combine cur_group_id() with across() results.

    ``switcher`` takes the first element of ``a`` when the group id is 0 and
    the second element of ``b`` otherwise; the resulting column must be
    ``[1, 4]``.
    """
    grouped = tibble(g=[1, 2], a=[1, 2], b=[3, 4]) >> group_by(f.g)

    # NOTE(review): `data` is not passed at the call site below; presumably the
    # pipeline supplies it -- confirm against register_func's contract.
    @register_func(context=None)
    def switcher(data, group_id, across_a, across_b):
        return group_id.apply(
            lambda gid: across_a.a.obj[0] if gid == 0 else across_b.b.obj[1]
        )

    result = grouped >> mutate(
        x=switcher(cur_group_id(), across(f.a), across(f.b))
    )

    assert result.x.obj.tolist() == [1, 4]
def test_works_sequentially():
    """across() inside mutate() sees columns created or replaced earlier in the same call."""
    frame = tibble(a=1)

    # The second across() also counts the freshly created `x`.
    actual = frame >> mutate(
        x=ncol(across(where(is_numeric))),
        y=ncol(across(where(is_numeric))),
    )
    expected = tibble(a=1, x=1, y=2)
    assert actual.equals(expected)

    # After `a` is replaced by a string there is no numeric column left.
    actual = frame >> mutate(a="x", y=ncol(across(where(is_numeric))))
    expected = tibble(a="x", y=0)
    assert actual.equals(expected)
def test_to_functions():
    """Extra keyword arguments of across() are forwarded to the function(s).

    Covers both a single function and a dict of named functions, each
    receiving ``na_rm=True``.
    """
    frame = tibble(x=c(1, NA))  # -> float

    actual = frame >> summarise(across(everything(), mean, na_rm=True))
    expected = tibble(x=1.0)
    assert_frame_equal(actual, expected)

    actual = frame >> summarise(
        across(everything(), dict(mean=mean, median=median), na_rm=True)
    )
    expected = tibble(x_mean=1.0, x_median=1.0)
    assert_frame_equal(actual, expected)
def test_use_env_var():
    """A local variable named like a column is used by value via all_of().

    The local ``y`` holds the string ``"x"`` while the frame also has a
    column called ``y``; selection must follow the variable's value.
    """
    # not a problem, since we use f.y
    frame = tibble(x=1.0, y=2.4)
    y = "x"

    summarised = frame >> summarise(across(all_of(y), mean))
    expected = tibble(x=1.0)
    assert summarised.equals(expected)

    mutated = frame >> mutate(across(all_of(y), mean))
    assert mutated.equals(frame)

    filtered = frame >> filter(if_all(all_of(y), lambda col: col < 2))
    assert filtered.equals(frame)
def test_auto_splicing():
    """distinct() accepts a data frame or an across() result in place of columns.

    NOTE(review): another function named ``test_auto_splicing`` appears later
    in this source; if both live in the same module the later definition
    shadows this one and this test is never collected -- confirm.
    """
    species = tibble(Species=iris.Species)

    by_column = iris >> distinct(f.Species)
    by_frame = iris >> distinct(species)
    assert by_column.equals(by_frame)

    by_across = iris >> distinct(across(f.Species))
    assert by_column.equals(by_across)

    rounded_then_distinct = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> distinct(f.Sepal_Length, f.Sepal_Width)
    )
    distinct_on_rounded = iris >> distinct(across(starts_with("Sepal"), round))
    assert rounded_then_distinct.equals(distinct_on_rounded)
def test_summarise_with_multiple_acrosses():
    """Several across() calls can be combined in a single summarize().

    See https://stackoverflow.com/questions/63200530/python-pandas-equivalent-to-dplyr-1-0-0-summarizeacross
    """
    actual = (
        mtcars
        >> group_by(f.cyl)
        >> summarize(
            across(ends_with("p"), sum),
            across(ends_with("t"), mean),
        )
    )
    expected = tibble(
        cyl=[6, 4, 8],
        disp=[1283.2, 1156.5, 4943.4],
        hp=[856, 909, 2929],
        drat=[3.585714, 4.070909, 3.229286],
        wt=[3.117143, 2.285727, 3.999214],
    )

    assert_tibble_equal(actual, expected)
def test_errors():
    """filter() rejects invalid conditions and named arguments.

    Non-boolean or wrongly sized conditions raise ``ValueError``; keyword
    arguments raise ``TypeError``. The final across() call is only expected
    to run without raising.
    """
    # wrong type
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(range(1, 10))
    with pytest.raises(ValueError):
        iris >> filter(range(1, 10))

    # wrong size
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> rowwise(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> filter([True, False])

    # wrong size in column
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> rowwise() >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        tibble(x=1) >> filter([True, False])

    # named inputs
    with pytest.raises(TypeError):
        mtcars >> filter(x=1)
    with pytest.raises(TypeError):
        mtcars >> filter(f.y > 2, z=3)
    with pytest.raises(TypeError):
        mtcars >> filter(True, x=1)

    # across() in filter() does not warn yet
    tibble(x=1, y=2) >> filter(across(everything(), lambda x: x > 0))
def test_names_output():
    """Output column names of across() for each form of ``_fns`` and ``_names``.

    Exercises: no function, a single function, dicts with string and integer
    keys, a plain list of functions, and explicit ``_names`` templates using
    ``{_col}``, ``{_fn}`` and ``{_fn1}``.
    """
    grouped = tibble(x=1, y=2, z=3, s="") >> group_by(f.x)

    # No function: columns pass through, optionally renamed by the template.
    result = grouped >> summarise(across())
    assert result.columns.tolist() == ["x", "y", "z", "s"]

    result = grouped >> summarise(across(_names="id_{_col}"))
    assert result.columns.tolist() == ["x", "id_y", "id_z", "id_s"]

    # Single function.
    result = grouped >> summarise(across(where(is_numeric), mean))
    assert result.columns.tolist() == ["x", "y", "z"]

    result = grouped >> summarise(
        across(where(is_numeric), mean, _names="mean_{_col}")
    )
    assert result.columns.tolist() == ["x", "mean_y", "mean_z"]

    # Dict of functions: keys become the suffix.
    result = grouped >> summarise(
        across(where(is_numeric), {"mean": mean, "sum": sum})
    )
    assert result.columns.tolist() == [
        "x", "y_mean", "y_sum", "z_mean", "z_sum"
    ]

    # Different from R's list
    result = grouped >> summarise(
        across(where(is_numeric), {"mean": mean, 1: sum})
    )
    assert result.columns.tolist() == ["x", "y_mean", "y_1", "z_mean", "z_1"]

    # Different from R's list
    result = grouped >> summarise(
        across(where(is_numeric), {0: mean, "sum": sum})
    )
    assert result.columns.tolist() == ["x", "y_0", "y_sum", "z_0", "z_sum"]

    # List of functions: 0-based position by default, 1-based with {_fn1}.
    result = grouped >> summarise(across(where(is_numeric), [mean, sum]))
    assert result.columns.tolist() == ["x", "y_0", "y_1", "z_0", "z_1"]

    result = grouped >> summarise(
        across(where(is_numeric), [mean, sum], _names="{_col}_{_fn1}")
    )
    assert result.columns.tolist() == ["x", "y_1", "y_2", "z_1", "z_2"]

    result = grouped >> summarise(
        across(
            where(is_numeric),
            {"mean": mean, "sum": sum},
            _names="{_fn}_{_col}",
        )
    )
    assert result.columns.tolist() == [
        "x", "mean_y", "sum_y", "mean_z", "sum_z"
    ]
def test_nb_fail():
    """Rounding a combined where()/negated-column selection keeps all 150 iris rows."""
    from datar.datasets import iris

    rounded = iris >> mutate(
        across(
            where(is_double) & ~c(f["Petal_Length"], f["Petal_Width"]),
            round,
        )
    )
    n_rows = rounded >> nrow()

    assert n_rows == 150
def test_auto_splicing():
    """group_by() accepts a data frame or an across() result in place of columns.

    NOTE(review): an earlier function in this source is also named
    ``test_auto_splicing``; if both live in the same module this definition
    replaces it and the earlier test is never collected -- confirm.
    """
    by_column = iris >> group_by(f.Species)
    by_frame = iris >> group_by(tibble(Species=iris.Species))
    assert by_column.equals(by_frame)

    by_column = iris >> group_by(f.Species)
    by_across = iris >> group_by(across(f.Species))
    assert by_column.equals(by_across)

    explicit = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Sepal_Length, f.Sepal_Width)
    )
    spliced = iris >> group_by(across(starts_with("Sepal"), round))
    assert explicit.equals(spliced)

    # across(character()), across(NULL) not supported

    # across() followed by a plain column
    explicit = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Sepal_Length, f.Sepal_Width, f.Species)
    )
    spliced = iris >> group_by(across(starts_with("Sepal"), round), f.Species)
    assert explicit.equals(spliced)

    # plain column followed by across()
    explicit = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Species, f.Sepal_Length, f.Sepal_Width)
    )
    spliced = iris >> group_by(f.Species, across(starts_with("Sepal"), round))
    assert explicit.equals(spliced)
def test_result_locations_aligned_with_column_names():
    """Each across() result lands under the name built from its column and function key."""
    frame = tibble(x=[1, 2], y=["a", "b"])
    expected = tibble(
        x_cls=numpy.int64,
        x_type=True,
        y_cls=object,
        y_type=False,
    )

    actual = frame >> summarise(
        across(
            everything(),
            {"cls": lambda col: col.dtype, "type": is_numeric},
        )
    )

    assert_frame_equal(actual, expected)
def test_across():
    """arrange() with across() sorts the same way as listing the columns explicitly."""
    frame = tibble(x=[1, 3, 2, 1], y=[4, 3, 2, 1])

    # All columns, ascending.
    actual = frame >> arrange(across())
    expected = frame >> arrange(f.x, f.y)
    assert actual.equals(expected)

    # All columns, each wrapped in desc().
    actual = frame >> arrange(across(None, desc))
    expected = frame >> arrange(desc(f.x), desc(f.y))
    assert actual.equals(expected)

    # Single-column selections.
    actual = frame >> arrange(across(f.x))
    expected = frame >> arrange(f.x)
    assert actual.equals(expected)

    actual = frame >> arrange(across(f.y))
    expected = frame >> arrange(f.y)
    assert actual.equals(expected)
def test_summarise_cols_inside_func():
    """summarise() with column expressions inside a registered function matches across()."""
    frame = tibble(x=2, y=4, z=8)

    @register_func(None, context=None)
    def data_frame(**kwargs):
        return tibble(**kwargs)

    actual = frame >> summarise(
        data_frame(x=f.x / f.y, y=f.y / f.y, z=f.z / f.y)
    )
    expected = frame >> summarise(
        across(everything(), lambda col: col / frame.y)
    )

    assert actual.equals(expected)
def test_mutate_cols_inside_func():
    """mutate() with column expressions inside a registered function matches across()."""
    frame = tibble(x=2, y=4, z=8)

    @register_func(None, context=None)
    def data_frame(**kwargs):
        return tibble(**kwargs)

    actual = frame >> mutate(
        data_frame(x=f.x / f.y, y=f.y / f.y, z=f.z / f.y)
    )
    # df.y does not work on grouped data
    expected = frame >> mutate(
        across(everything(), lambda col: col / frame.y)
    )

    assert actual.equals(expected)
def test_used_separately():
    """Two across() calls in separate mutate() arguments each use their own selection."""
    frame = tibble(a=1, b=2)

    actual = frame >> mutate(
        x=ncol(across(where(is_numeric))),
        y=ncol(across(f.a)),
    )
    expected = tibble(a=1, b=2, x=2, y=1)

    assert actual.equals(expected)
def test_cur_column():
    """cur_column() passed as a keyword to across() yields each column's own name."""
    frame = tibble(x=1, y=2, z=3)

    # The lambda ignores the column and returns the `y` keyword it receives.
    named = frame >> mutate(
        across(f[f.x :], (lambda x, y: y), y=cur_column())
    )

    assert named.values.tolist() == [["x", "y", "z"]]
def test_not_selecting_grouping_var():
    """across(everything()) on a grouped frame leaves out the grouping column."""
    frame = tibble(g=1, x=1)

    summarised = frame >> group_by(f.g) >> summarise(x=across(everything()))
    want = tibble(x=1)

    assert_frame_equal(summarised["x"], want)
def test_on_one_column():
    """mutate(across()) on a single-column frame returns an equal frame."""
    frame = tibble(x=1)

    result = frame >> mutate(across())

    assert result.equals(frame)
def test_keep_used_not_affected_by_across():
    """With ``_keep="unused"``, columns touched by across() are still kept."""
    frame = tibble(x=1, y=2, z=3, a="a", b="b", c="c")

    result = frame >> mutate(
        across(where(is_numeric), identity),
        _keep="unused",
    )

    assert result.columns.tolist() == frame.columns.tolist()
def test_cols_in_lambda():
    """A lambda given to across() may reference another column of the frame."""
    frame = tibble(x=1.0, y=2.0)

    result = frame >> mutate(across("x", lambda col: col / frame.y))

    assert result.x.tolist() == [0.5]
def test_empty_df():
    """mutate(across()) on an empty tibble returns an equal (empty) frame."""
    empty = tibble()

    result = empty >> mutate(across())

    assert result.equals(empty)
def test_kwargs():
    """Extra positional arguments of across() are forwarded to the function."""
    frame = tibble(x=c(1, 2))
    tail_n = lambda d, n: d.tail(n)

    # The trailing 1 is handed to tail_n as `n`.
    actual = frame >> summarise(across(f.x, tail_n, 1))
    expected = tibble(x=2)

    assert_frame_equal(actual, expected)
def test_reject_non_vectors():
    """across() raises ValueError when ``_fns`` is an arbitrary object."""
    with pytest.raises(
        ValueError,
        match="Argument `_fns` of across must be",
    ):
        tibble(x=1) >> summarise(across(where(is_numeric), object()))
def test_original_ordering():
    """Columns nested from across() follow the frame's original column order."""
    frame = tibble(a=1, b=2)

    result = frame >> mutate(a=2, x=across())

    assert result.columns.tolist() == ["a", "b", "x$a", "x$b"]
def test_used_twice():
    """Two across() calls inside one expression each use their own selection."""
    frame = tibble(a=1, b=2)

    actual = frame >> mutate(
        x=ncol(across(where(is_numeric))) + ncol(across(f.a))
    )
    expected = tibble(a=1, b=2, x=3)

    assert actual.equals(expected)
def test_implicit_mutate_operates_on_ungrouped_data():
    """Re-grouping a grouped frame via across() can select the old grouping column.

    The frame is first grouped by ``y`` and then re-grouped with
    ``across(any_of(c("y", "z")))``; the grouping variables must be
    ``["y", "z"]``.
    """
    grouped = tibble(x=c(1, 2), y=c(3, 4), z=c(5, 6)) >> group_by(f.y)
    grouped >>= group_by(across(any_of(c("y", "z"))))

    names = group_vars(grouped)

    assert names == ["y", "z"]