def test_ignores_null_empty():
    """bind_rows() ignores NULL, an empty tibble and a zero-row frame.

    A frame that has rows but no columns is not ignored: the result gains
    a row whose value for ``a`` is missing.
    """
    base = tibble(a=1)

    bound = base >> bind_rows(NULL)
    assert bound.equals(base)

    empty = tibble()
    bound = base >> bind_rows(empty)
    assert bound.equals(base)

    # Columns present, zero rows.
    zero_rows = base.iloc[[], :]
    bound = base >> bind_rows(zero_rows)
    assert bound.equals(base)

    # Rows present, zero columns: the extra row shows up last and its
    # `a` is missing, which fillna() makes visible.
    zero_cols = base.iloc[:, []]
    bound = base >> bind_rows(zero_cols)
    n_rows = bound >> nrow()
    assert n_rows == 2
    cell = bound.fillna(1234) >> get(1, f.a)
    assert cell == 1234

    # Same frames in the opposite order: the missing value comes first.
    bound = zero_cols >> bind_rows(base)
    n_rows = bound >> nrow()
    assert n_rows == 2
    cell = bound.fillna(888) >> get(0, f.a)
    assert cell == 888
def test_can_recycle_when_adding_rows():
    """add_row() repeats a scalar value for every row being added."""
    extended = add_row(iris, Sepal_Length=[-1, -2], Species="unknown")
    assert nrow(extended) == nrow(iris) + 2

    expected_lengths = iris.Sepal_Length.tolist() + [-1, -2]
    assert_iterable_equal(extended.Sepal_Length, expected_lengths)

    # The single "unknown" is expected once per added row.
    expected_species = iris.Species.tolist() + ["unknown"] * 2
    assert_iterable_equal(extended.Species, expected_species)
def test_can_add_row():
    """add_row() appends one row and leaves the column layout unchanged.

    Column ``c`` is not supplied, so its new entry is expected to be NA.
    """
    grown = add_row(df_all, a=4, b=3)

    assert grown.columns.tolist() == df_all.columns.tolist()
    assert nrow(grown) == nrow(df_all) + 1
    assert_iterable_equal(grown.a, [1.0, 2.5, NA, 4])
    assert_iterable_equal(grown.b, [1.0, 2.0, NA, 3.0])
    assert_iterable_equal(grown.c, [True, False, NA, NA])
def test_joins_matches_nas_by_default():
    """Missing keys on both sides are matched by inner_join/semi_join.

    Ported from the test "joins matches NAs by default (#892, #2033)".
    """
    left = tibble(x=c(None, 1))
    right = tibble(x=c(None, 2))

    # Only the missing key is shared, so exactly one row is expected.
    assert nrow(inner_join(left, right, by=f.x)) == 1
    assert nrow(semi_join(left, right, by=f.x)) == 1
def test_with_no_args_returns_nothing():
    """select() without any selection keeps all 32 rows and no columns."""
    picked = select(mtcars)
    assert ncol(picked) == 0
    assert nrow(picked) == 32

    # Splatting an empty mapping must behave like passing nothing at all.
    picked = select(mtcars, **{})
    assert ncol(picked) == 0
    assert nrow(picked) == 32
def test_grouped_filter_handles_indices():
    """A mutate() after a grouped filter() keeps rows, group rows and keys."""
    filtered = iris >> group_by(f.Species) >> filter(f.Sepal_Length > 5)
    mutated = filtered >> mutate(Petal=f.Petal_Width * f.Petal_Length)
    assert nrow(filtered) == nrow(mutated)

    rows_before = group_rows(filtered)
    rows_after = group_rows(mutated)
    assert rows_before == rows_after

    # NOTE(review): if `all` is the builtin here, iterating the comparison
    # result may only visit column labels -- confirm which `all` is in scope.
    assert all(group_keys(filtered) == group_keys(mutated))
def test_works_on_empty_data_frames():
    """mutate() accepts an empty tibble, with or without new columns."""
    empty = tibble()

    untouched = empty >> mutate()
    assert nrow(untouched) == 0
    assert len(untouched) == 0

    # Adding an empty column gives one column and still no rows.
    with_x = empty >> mutate(x=[])
    assert with_x.columns.tolist() == ["x"]
    assert nrow(with_x) == 0
    assert ncol(with_x) == 1
def test_contains():
    """filter() with is_element() keeps two rows, grouped or not."""
    data = tibble(a=c("a", "b", "ab"), g=c(1, 1, 2))

    kept = data >> filter(is_element(f.a, letters))
    n_kept = nrow(kept)
    assert n_kept == 2

    # Same predicate on the frame grouped by `g`.
    kept = data >> group_by(f.g) >> filter(is_element(f.a, letters))
    n_kept = nrow(kept)
    assert n_kept == 2
def test_min_and_max_ignore_nas():
    """slice_min()/slice_max() never select rows whose order value is NA."""
    data = tibble(id=range(1, 5), x=c(2, NA, 1, 2), y=[NA] * 4)

    # `x` has one NA (id 2), which must not appear in either result.
    smallest = data >> slice_min(f.x, n=2)
    assert smallest.id.tolist() == [3, 1, 4]

    # `y` is entirely NA, so nothing can be selected.
    n_smallest = data >> slice_min(f.y, n=2) >> nrow()
    assert n_smallest == 0

    largest = data >> slice_max(f.x, n=2)
    assert largest.id.tolist() == [1, 4]

    n_largest = data >> slice_max(f.y, n=2) >> nrow()
    assert n_largest == 0
def test_0_vars(df):
    """group_by() without variables yields only a ``_rows`` column.

    The first ``_rows`` entry is expected to list every row of ``iris``.
    The ``df`` argument is not used by the body.
    """
    gdata = group_data(group_by(iris))
    assert names(gdata) == ["_rows"]
    assert_iterable_equal(gdata._rows[0], range(nrow(iris)))

    # Splatting an empty mapping must give the same group data.
    gdata = group_data(group_by(iris, **{}))
    assert names(gdata) == ["_rows"]
    assert_iterable_equal(gdata._rows[0], range(nrow(iris)))
def test_slice_handles_na():
    """NA entries among slice() indices select nothing."""
    plain = tibble(x=[1, 2, 3])
    assert nrow(slice(plain, NA)) == 0
    assert nrow(slice(plain, c(1, NA))) == 1
    n_rows = plain >> slice(c(~c(1), NA)) >> nrow()
    assert n_rows == 2

    # Grouped frame with two groups of two rows each.
    grouped = tibble(x=[1, 2, 3, 4], g=rep([1, 2], 2)) >> group_by(f.g)
    assert nrow(slice(grouped, c(1, NA))) == 2
    n_rows = grouped >> slice(c(~c(1), NA)) >> nrow()
    assert n_rows == 2
def test_min_and_max_return_ties_by_default():
    """slice_min()/slice_max() keep tied rows unless with_ties=False."""
    data = tibble(x=c(1, 1, 1, 2, 2))

    # Default: every row tied for the extreme value is returned.
    n_min = data >> slice_min(f.x) >> nrow()
    assert n_min == 3
    n_max = data >> slice_max(f.x) >> nrow()
    assert n_max == 2

    # Ties disabled: a single row each.
    n_min = data >> slice_min(f.x, with_ties=False) >> nrow()
    assert n_min == 1
    n_max = data >> slice_max(f.x, with_ties=False) >> nrow()
    assert n_max == 1
def test_functions_silently_truncate_results():
    """Asking a slice helper for 6 of 5 rows returns the 5 available rows."""
    data = tibble(x=range(1, 6))

    count = data >> slice_head(n=6) >> nrow()
    assert count == 5

    count = data >> slice_tail(n=6) >> nrow()
    assert count == 5

    count = data >> slice_sample(n=6) >> nrow()
    assert count == 5

    count = data >> slice_min(f.x, n=6) >> nrow()
    assert count == 5

    count = data >> slice_max(f.x, n=6) >> nrow()
    assert count == 5
def test_rowid_to_column():
    """rowid_to_column() removes row names and adds an id column.

    The ids are expected to equal ``seq_len(nrow(mtcars)) - 1``, and using
    ``f.wt`` as the id column must raise a "duplicated" ValueError.
    Ported from the test "rowid_to_column keeps the tbl classes".
    """
    with_default_id = rowid_to_column(mtcars)
    assert not has_rownames(with_default_id)
    assert_iterable_equal(
        with_default_id.rowid,
        seq_len(nrow(mtcars)) - 1,
    )

    with pytest.raises(ValueError, match="duplicated"):
        rowid_to_column(mtcars, f.wt)

    # Same checks with an explicitly named id column.
    with_named_id = rowid_to_column(mtcars, "row_id")
    assert not has_rownames(with_named_id)
    assert_iterable_equal(
        with_named_id.row_id,
        seq_len(nrow(mtcars)) - 1,
    )

    with pytest.raises(ValueError, match="duplicated"):
        rowid_to_column(with_named_id, f.wt)
def test_one_group_for_NA():
    """All NA values count as one distinct value and form one group."""
    # Three NAs followed by 10..1 twice: ten real values plus NA.
    x = c(NA, NA, NA, range(10, 0, -1), range(10, 0, -1))
    w = numpy.array(c(20, 30, 40, range(1, 11), range(1, 11))) * 10

    assert n_distinct(x, na_rm=False) == 11

    summary = tibble(x=x, w=w) >> group_by(f.x) >> summarise(n=n())
    assert nrow(summary) == 11
def test_group_modify_makes_a_grouped_df():
    """group_modify() results carry group rows matching their contents."""
    # Two rows kept from each of the three `cyl` groups.
    top_two = group_by(mtcars, f.cyl) >> group_modify(lambda df: head(df, 2))
    assert nrow(top_two) == 6
    assert group_rows(top_two) == [[0, 1], [2, 3], [4, 5]]

    # Only one species survives the filter, so one tally row is expected.
    tallied = (
        iris
        >> group_by(f.Species)
        >> filter(f.Species == "setosa")
        >> group_modify(lambda df: tally(df))
    )
    assert nrow(tallied) == 1
    assert group_rows(tallied) == [[0]]

    tallied = (
        iris
        >> group_by(f.Species, _drop=False)
        >> filter(f.Species == "setosa")
        >> group_modify(lambda df: tally(df))
    )
    # NOTE: the alternative expectations `nrow(res) == 3` and
    # `group_rows(res) == [[0], [1], [2]]` were disabled in this test;
    # the assertions below are the ones that are enforced.
    assert nrow(tallied) == 1
    assert group_rows(tallied) == [[0]]
def test_complex():
    """bind_rows() stacks complex-valued columns in order."""
    first = tibble(r=[1 + 1j, 2 - 1j])
    second = tibble(r=[1 - 1j, 2 + 1j])

    stacked = first >> bind_rows(second)
    n_rows = stacked >> nrow()
    assert n_rows == 4
    assert stacked.r.tolist() == first.r.tolist() + second.r.tolist()
def test_hierachical_data():
    """bind_rows() accepts dicts, as a list or as separate arguments."""
    records = [dict(x=1, y="a"), dict(x=2, y="b")]

    bound = records >> bind_rows()
    n_rows = nrow(bound)
    assert n_rows == 2
    x_is_int = is_int(bound.x)
    assert x_is_int
    y_is_chr = is_character(bound.y)
    assert y_is_chr

    # One dict piped in, the other passed as an argument.
    bound = dict(x=1, y="a") >> bind_rows(dict(x=2, y="b"))
    n_rows = nrow(bound)
    assert n_rows == 2
    x_is_int = is_int(bound.x)
    assert x_is_int
    y_is_chr = is_character(bound.y)
    assert y_is_chr
def test_drop():
    """count() on a filtered frame grouped with _drop=True gives one row."""
    setosa_only = (
        iris
        >> filter(f.Species == "setosa")
        >> group_by(f.Species, _drop=True)
    )
    n_groups = setosa_only >> count() >> nrow()
    assert n_groups == 1
def test_nb_fail():
    """across() accepts where(...) combined with a negated column set."""
    from datar.datasets import iris

    # The selection is kept inline inside the verb call on purpose.
    rounded = iris >> mutate(
        across(
            where(is_double) & ~c(f["Petal_Length"], f["Petal_Width"]),
            round,
        )
    )
    n_rows = rounded >> nrow()
    assert n_rows == 150
def test_preserves_grouping():
    """mutate() keeps the grouping variables of a grouped frame."""
    grouped = group_by(tibble(x=[1, 2], y=2), f.x)

    # Overwriting the grouping column collapses the groups into one.
    same_x = mutate(grouped, x=1)
    assert group_vars(same_x) == ["x"]
    assert nrow(group_data(same_x)) == 1

    # Adding an unrelated column leaves the group data as it was.
    with_z = mutate(grouped, z=1)
    assert group_data(with_z).equals(group_data(grouped))
def test_slice_handles_numeric_input():
    """slice(0) matches filter(row_number() == 1), grouped or ungrouped."""
    by_cyl = mtcars >> arrange(f.cyl) >> group_by(f.cyl)

    first_per_group = by_cyl >> slice(0)
    assert nrow(first_per_group) == 3
    expected = by_cyl >> filter(row_number() == 1)
    assert_frame_equal(first_per_group, expected)

    # Ungrouped frame.
    sliced = mtcars >> slice(0) >> as_tibble()
    filtered = mtcars >> filter(row_number() == 1)
    assert_frame_equal(sliced, filtered)
def test_group_modify_map_want_functions_with_at_least_1_arg():
    """group_modify()/group_map() reject callables that take no argument."""
    first_row = lambda df: head(df, 1)
    by_species = iris >> group_by(f.Species)

    assert nrow(group_modify(by_species, first_row)) == 3
    assert len(list(group_map(by_species, first_row))) == 3

    no_args = lambda: 1
    with pytest.raises(TypeError):
        group_modify(by_species, no_args)

    with pytest.raises(TypeError):
        # The `.list` variant is used to force the function to execute.
        group_map.list(by_species, no_args)
def test_complex_expression_as_value():
    """mutate() on a grouped frame accepts sample(f[1:], ...) as a value."""
    # Scenario taken from:
    # https://stackoverflow.com/questions/30714810/
    # pandas-group-by-and-aggregate-column-1-with-condition-from-column-2
    grouped = (
        tibble(
            user=rep(c("1", 2, 3, 4), each=5),
            cancel_date=rep(c(12, 5, 10, 11), each=5),
        )
        >> group_by(f.user)
    )

    # Drawing several values per group is not supported yet, i.e.
    #   login=sample(f[1:], size=n(), replace=True)
    # so a single value is drawn instead.
    sampled = grouped >> mutate(
        login=sample(f[1:], size=1, replace=True),
    )
    assert nrow(sampled) == 20
def test_slice_family_on_rowwise_df():
    """Slice helpers on a rowwise frame return a rowwise frame of 5 rows.

    The exception checked first is ``slice_head(prop=0.1)``, which is
    expected to return no rows.
    """
    rw = tibble(x=f[1:6]) >> rowwise()

    sliced = rw >> slice_head(prop=0.1)
    assert sliced.shape[0] == 0

    sliced = rw >> slice([0, 1, 2])
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5

    sliced = rw >> slice_head(n=3)
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5

    sliced = rw >> slice_tail(n=3)
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5

    sliced = rw >> slice_min(f.x, n=3)
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5

    sliced = rw >> slice_max(f.x, n=3)
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5

    sliced = rw >> slice_sample(n=3)
    assert isinstance(sliced, TibbleRowwise)
    assert nrow(sliced) == 5
def test_slice_accepts_star_args():
    """slice() treats several positional indices like one combined vector."""
    separate = slice(mtcars, 1, 2)
    combined = slice(mtcars, [1, 2])
    assert separate.equals(combined)

    # First and last row, with the last index given via n().
    separate = slice(mtcars, 0, n() - 1)
    combined = slice(mtcars, c(0, nrow(mtcars) - 1))
    assert separate.equals(combined)

    # Same comparison on a frame grouped by `cyl`.
    by_cyl = mtcars >> group_by(f.cyl)
    separate = slice(by_cyl, 0, n() - 1)
    combined = slice(by_cyl, c(0, n() - 1))
    assert separate.equals(combined)
def test_nb_fail_c_across():
    """c_across() over a column range works inside a rowwise mutate()."""
    data = tibble(
        id=[1, 2, 3, 4],
        k=["a", "b", "c", "d"],
        w=runif(4),
        x=runif(4),
        y=runif(4),
        z=runif(4),
    )

    summarised = (
        data
        >> rowwise()
        >> mutate(
            sum=sum(c_across(f[f.w:])),
            sd=sd(c_across(f[f.w:])),
        )
    )

    assert isinstance(summarised, TibbleRowwise)
    assert nrow(summarised) == 4
def test_rowwise():
    """filter() on a rowwise frame can use a locally registered function."""

    @register_func(None)
    def grepl(a, b):
        # Element-wise "is `a` contained in `b`", indexed like `a`.
        return Series(
            [x in y for x, y in zip(a.obj, b.obj)],
            index=a.obj.index,
        )

    data = tibble(
        First=c("string1", "string2"),
        Second=c("Sentence with string1", "something"),
    )

    # Only the first row's `First` occurs in its `Second`.
    matched = data >> rowwise() >> filter(grepl(f.First, f.Second))
    assert nrow(matched) == 1

    expected = data >> slice(0)
    actual = matched >> ungroup()
    assert expected.equals(actual)
def test_proportion_computed_correctly():
    """prop=0.11 of ten rows selects exactly one row in each slice helper."""
    data = tibble(x=range(1, 11))

    picked = data >> slice_head(prop=0.11) >> nrow()
    assert picked == 1

    picked = data >> slice_tail(prop=0.11) >> nrow()
    assert picked == 1

    picked = data >> slice_sample(prop=0.11) >> nrow()
    assert picked == 1

    picked = data >> slice_min(f.x, prop=0.11) >> nrow()
    assert picked == 1

    picked = data >> slice_max(f.x, prop=0.11) >> nrow()
    assert picked == 1

    # The same holds with ties disabled.
    picked = data >> slice_max(f.x, prop=0.11, with_ties=False) >> nrow()
    assert picked == 1

    picked = data >> slice_min(f.x, prop=0.11, with_ties=False) >> nrow()
    assert picked == 1
def test_column_to_rownames(caplog):
    """column_to_rownames() undoes rownames_to_column() without logging.

    Also checks that a numeric column can become the row names, and that
    a frame that already has row names or a missing column is rejected.
    """
    var = "var"
    assert has_rownames(mtcars)

    # Round trip: row names -> column -> row names.
    as_column = rownames_to_column(mtcars, var)
    restored = column_to_rownames(as_column, var)
    assert caplog.text == ""
    assert has_rownames(restored)
    assert_iterable_equal(rownames(restored), rownames(mtcars))
    assert_frame_equal(restored, mtcars)
    # has_name is not a public API, so the check that `var` is gone
    # (expect_false(has_name(res, var))) is not ported.

    # Use a numeric column as the new row names.
    numbered = mtcars.copy()
    numbered["num"] = rev(seq_len(nrow(mtcars))) - 1
    as_column = rownames_to_column(numbered)
    restored = column_to_rownames(as_column, var="num")
    assert caplog.text == ""
    assert has_rownames(restored)
    assert_iterable_equal(rownames(restored), as_character(numbered.num))

    # `restored` already has row names.
    with pytest.raises(ValueError):
        column_to_rownames(restored)

    # "num2" is not a column of the frame.
    with pytest.raises(KeyError):
        column_to_rownames(rownames_to_column(numbered, var), "num2")