def test_summarise_maintains_drop():
    """Check group counts for factor keys under `_drop`, before and after
    summarise()."""
    data = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )

    grouped = data >> group_by(f.f1, f.f2, _drop=True)
    assert n_groups(grouped) == 1
    assert group_by_drop_default(grouped)

    # DataFrame.groupby(..., observed=False) doesn't support
    # multiple categoricals
    # res1 = df >> group_by(f.f1, f.f2, _drop=False)
    # ng = n_groups(res1)
    # assert ng == 12

    # Single factor key: one observed level when dropping, otherwise one
    # group per declared level.
    by_f1_dropped = data >> group_by(f.f1, _drop=True)
    assert n_groups(by_f1_dropped) == 1

    by_f1_kept = data >> group_by(f.f1, _drop=False)
    assert n_groups(by_f1_kept) == 3

    by_f2_kept = data >> group_by(f.f2, _drop=False)
    assert n_groups(by_f2_kept) == 4

    summarised = grouped >> summarise(x=sum(f.x), _groups="drop_last")
    assert n_groups(summarised) == 1
    assert group_by_drop_default(summarised)
def test_can_safely_add_to_factor_columns_everywhere():
    """add_row() on a factor column: appended, and inserted via `_before`."""
    # test_that("can safely add to factor columns everywhere (#296)", {
    base = tibble(a=factor(letters[:3]))

    # Empty rows: the expected column is still a factor, with NA inserted.
    appended = add_row(base)
    assert_frame_equal(appended, tibble(a=factor(c(letters[:3], NA))))

    prepended = add_row(base, _before=0)
    assert_frame_equal(prepended, tibble(a=factor(c(NA, letters[:3]))))

    inserted = add_row(base, _before=1)
    assert_frame_equal(inserted, tibble(a=factor(c("a", NA, letters[1:3]))))

    # A value outside the existing levels: the expected column is object dtype.
    appended = add_row(base, a="d")
    assert_frame_equal(appended, tibble(a=letters[:4], _dtypes=object))

    prepended = add_row(base, a="d", _before=0)
    assert_frame_equal(
        prepended, tibble(a=c("d", letters[:3]), _dtypes=object)
    )

    inserted = add_row(base, a="d", _before=1)
    assert_frame_equal(inserted, tibble(a=list("adbc"), _dtypes=object))
def test_add_passes_drop():
    """A second group_by(..., _add=True) keeps the `_drop` of the first."""
    data = tibble(
        f1=factor("b", levels=c("a", "b", "c")),
        f2=factor("g", levels=c("e", "f", "g")),
        x=48,
    )

    by_f1 = group_by(data, f.f1, _drop=True)
    by_both = group_by(by_f1, f.f2, _add=True)

    assert n_groups(by_both) == 1
    assert group_by_drop_default(by_both)
def test_keys_are_coerced_to_symmetric_type():
    """Join key dtype for factor/str and factor/factor key pairs."""
    factor_keyed = tibble(id=factor(c("a", "b")), var1="foo")
    string_keyed = tibble(id=c("a", "b"), var2="bar")

    # Mixed factor/string keys: not categorical, whichever side comes first.
    joined = inner_join(factor_keyed, string_keyed, by="id")
    assert joined.id.dtype.name != "category"

    joined = inner_join(string_keyed, factor_keyed, by="id")
    assert joined.id.dtype.name != "category"

    # Factor keys on both sides stay categorical.
    left = tibble(x=1, y=factor("a"))
    right = tibble(x=2, y=factor("b"))
    merged = full_join(left, right, by=["x", "y"])
    assert merged.y.dtype.name == "category"
def test_bind_factors():
    """bind_rows() on factor columns: categories are merged, NA is kept."""
    first = tibble(a=factor("a"))
    second = tibble(a=factor("b"))
    bound = first >> bind_rows(second)
    assert bound.a.cat.categories.tolist() == ["a", "b"]

    first = tibble(a=factor("a"))
    all_na = tibble(a=factor(NA))
    bound = first >> bind_rows(all_na)
    assert bound.a.cat.categories.tolist() == ["a"]
    assert bound.a.astype(object).fillna("NA").tolist() == ["a", "NA"]

    # Binding from a list of frames gives the same result.
    from_list = None >> bind_rows([first, all_na])
    assert_frame_equal(from_list, bound)
def test_tabulate():
    """tabulate() for a scalar and for a factor with an explicit bin count."""
    scalar_counts = tabulate(3)
    assert_iterable_equal(scalar_counts, [0, 0, 1])

    abc = factor(list("abc"))
    factor_counts = tabulate(abc, 3)
    assert_iterable_equal(factor_counts, [1, 1, 1])
def test_factor_to_chars():
    """Binding factor and string columns gives a non-factor column."""
    # we don't have warnings
    factor_df = tibble(a=factor("a"))
    string_df = tibble(a="b")

    bound = factor_df >> bind_rows(factor_df, string_df)

    assert not is_factor(bound.a)
def test_group_split_keeps_group_variables_by_default():
    """group_split() yields one chunk per group, grouping column included."""
    table_in = tibble(x=[1, 2, 3, 4], g=factor(rep(["a", "b"], each=2)))

    chunks = list(group_split(table_in, f.g))

    assert len(chunks) == 2
    first, second = chunks[0], chunks[1]
    assert first.equals(table_in.iloc[[0, 1], :])
    assert second.equals(table_in.iloc[[2, 3], :].reset_index(drop=True))
def test_desc():
    """desc() on a factor with NA, on integers and on strings."""
    fct = factor(c(letters[:3], NA), levels=letters[:3])
    assert_iterable_equal(desc(fct), [-0.0, -1.0, -2.0, NA])

    assert_iterable_equal(desc([1, 2, 3]), [-1, -2, -3])

    assert_iterable_equal(desc(["a", "b", "c"]), [-0.0, -1.0, -2.0])
def test_bind_empty_dfs():
    """Binding nothing gives a 0x0 frame; binding an empty frame is a no-op."""
    no_rows = bind_rows(None)
    assert dim(no_rows) == (0, 0)

    no_cols = bind_cols(None)
    assert dim(no_cols) == (0, 0)

    populated = tibble(x=factor([1, 2, 3]))
    empty = tibble()
    bound = populated >> bind_rows(empty)
    assert bound.x.tolist() == [1, 2, 3]
def test_errors():
    """bind_rows() rejects a non-string `_id` and a non-frame left operand."""
    first = tibble(x=[1, 2, 3])
    second = tibble(x=[4, 5, 6])
    with pytest.raises(ValueError):
        first >> bind_rows(second, _id=5)

    factor_df = tibble(a=factor("a"))
    number_df = tibble(a=1)
    factor_df >> bind_rows(number_df)  # no error, all converted to object

    with pytest.raises(ValueError):
        [1, 2] >> bind_rows()
def test_drop():
    """count() on a factor column with `_drop` on (default) and off.

    With `_drop=False` the unobserved levels are reported with a count of 0,
    both through count() directly and through group_by() followed by count().
    """
    df = tibble(f=factor("b", levels=c("a", "b", "c")))

    out = df >> count(f.f)
    assert out.n.tolist() == [1]

    out = df >> count(f.f, _drop=False)
    # note the order
    # assert out.n.tolist() == [0,1,0]
    assert out.n.tolist() == [1, 0, 0]

    # Use the builtin False, like every other `_drop=` call in this file.
    out = df >> group_by(f.f, _drop=False) >> count()
    assert out.n.obj.tolist() == [1, 0, 0]
def test_joins_maintains__drop():
    """Joins of frames grouped with `_drop=True` only count observed keys."""
    levels = c("a", "b", "c")

    left_data = tibble(f1=factor(c("a", "b"), levels=levels), x=[42, 43])
    left = group_by(left_data, f.f1, _drop=True)

    right_data = tibble(f1=factor(c("a"), levels=levels), y=1)
    right = group_by(right_data, f.f1, _drop=True)

    joined = left_join(left, right, by="f1")
    assert n_groups(joined) == 2

    right_data = tibble(f1=factor(c("a", "c"), levels=levels), y=[1, 2])
    right = group_by(right_data, f.f1, _drop=True)

    joined = full_join(left, right, by="f1")
    assert n_groups(joined) == 3
def test_group_list_respects_empty_groups():
    """group_split.list() emits an empty chunk for an unused level only when
    `_drop=False`."""
    table_in = tibble(
        x=[1, 2, 3, 4],
        g=factor(rep(["a", "b"], each=2), levels=["a", "b", "c"]),
    )
    head_rows = table_in.iloc[:2, :]
    tail_rows = table_in.iloc[[2, 3], :].reset_index(drop=True)

    chunks = group_split.list(table_in, f.g)
    assert chunks[0].equals(head_rows)
    assert chunks[1].equals(tail_rows)

    chunks = group_split.list(table_in, f.g, _drop=False)
    assert chunks[0].equals(head_rows)
    assert chunks[1].equals(tail_rows)
    # Level "c" has no rows, so its chunk is empty.
    assert chunks[2].equals(table_in.iloc[[], :])
def test_bind_na_cols():
    """Binding a factor column with an NA column, in both orders."""
    factor_df = tibble(x=factor(["foo", "bar"]))
    na_df = tibble(x=NA)

    # NA appended after the factor rows lands at position 2.
    bound = factor_df >> bind_rows(na_df)
    cell = bound >> get(2, f.x)
    assert_iterable_equal(is_na(cell), [True])

    # NA first: it lands at position 0 and the column is categorical.
    bound = na_df >> bind_rows(factor_df)
    cell = bound >> get(0, f.x)
    assert_iterable_equal(is_na(cell), [True])

    assert is_categorical(bound.x)
def test_table():
    """Exercise table() on vectors, data frames, factors, `dnn` and
    `exclude`."""
    # https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/table
    draws = rpois(100, 5)
    counts = table(draws)
    assert sum(counts.values.flatten()) == 100

    # -----------------
    with data_context(warpbreaks):
        crosstab = table(f.wool, f.tension)
    assert crosstab.columns.tolist() == ["H", "L", "M"]
    assert crosstab.index.tolist() == ["A", "B"]
    assert_iterable_equal(crosstab.values.flatten(), [9] * 6)

    crosstab = table(warpbreaks.loc[:, ["wool", "tension"]])
    assert crosstab.columns.tolist() == ["H", "L", "M"]
    assert crosstab.index.tolist() == ["A", "B"]
    assert_iterable_equal(crosstab.values.flatten(), [9] * 6)

    # -----------------
    crosstab = table(state_division, state_region)
    assert crosstab.loc["New England", "Northeast"] == 6

    # -----------------
    with data_context(airquality):
        breaks = quantile(f.Temp)
        binned = cut(f.Temp, breaks)
        crosstab = table(binned, f.Month)
    assert crosstab.iloc[0, 0] == 24

    # -----------------
    abc = letters[:3]
    crosstab = table(abc, sample(abc))
    assert sum(crosstab.values.flatten()) == 3

    # -----------------
    crosstab = table(abc, sample(abc), dnn=["x", "y"])
    assert crosstab.index.name == "x"
    assert crosstab.columns.name == "y"

    # -----------------
    values = c(NA, Inf, (1.0 / (i + 1) for i in range(3)))
    values = values * 10
    # tab = table(a)
    # assert_iterable_equal(tab.values.flatten(), [10] * 4)
    counts = table(values, exclude=None)
    assert_iterable_equal(counts.values.flatten(), [10] * 5)

    # ------------------
    fct = as_factor(rep(c("A", "B", "C"), 10))
    counts = table(fct)
    assert counts.shape == (1, 3)
    assert_iterable_equal(counts.values.flatten(), [10] * 3)

    counts = table(fct, exclude="B")
    assert counts.shape == (1, 2)
    assert_iterable_equal(counts.values.flatten(), [10] * 2)
    assert "B" not in counts.columns

    # -------------------
    wide = factor(
        rep(c("A", "B", "C"), 10),
        levels=c("A", "B", "C", "D", "E"),
    )
    counts = table(wide, exclude="B", dnn=["x"])
    assert_iterable_equal(counts.columns.to_list(), ["A", "C", "D", "E"])
    assert_iterable_equal(counts.values.flatten(), [10, 10, 0, 0])

    wide2 = factor(
        rep(c("A", "B", "C"), 10),
        levels=c("A", "B", "C", "D", "E"),
    )
    crosstab = table(wide, wide2, exclude="B")
    assert crosstab.shape == (4, 4)

    crosstab = table("abc", "cba", dnn="x")
    assert crosstab.shape == (3, 3)
    assert sum(crosstab.values.flatten()) == 3

    with data_context(airquality):
        crosstab = table(f.Ozone, f.Solar_R, exclude=None)
    assert "<NA>" in crosstab.columns
    assert "<NA>" in crosstab.index

    with pytest.raises(ValueError):
        table([NA_REPR, np.nan], exclude=None)

    counts = table(factor([1, np.nan]), exclude=1)
    assert counts.shape == (1, 1)
    assert_iterable_equal(counts[NA_REPR], [1])
def test_cat_ordered():
    """bind_rows() keeps the ordered flag of an ordered factor."""
    ordered_df = tibble(x=factor([1, 2, 3], ordered=True))
    stacked = bind_rows(ordered_df, ordered_df)
    assert stacked.x.cat.ordered
def test_group_split_respects__drop():
    """With `_drop=True` only the observed level produces a chunk."""
    # test_that("group_split() respects .drop", {
    data = tibble(f=factor(["b"], levels=list("abc")))
    chunks = data >> group_split.list(f.f, _drop=True)
    assert len(chunks) == 1
def test_group_split_can_discard_grouping_vars_by__keep_eqs_false():
    """`_keep=False` leaves only the non-grouping column in each chunk."""
    table_in = tibble(x=[1, 2, 3, 4], g=factor(rep(["a", "b"], each=2)))

    chunks = group_split.list(table_in, f.g, _keep=False)

    expected_first = table_in.iloc[:2, [0]]
    expected_second = table_in.iloc[[2, 3], [0]].reset_index(drop=True)
    assert chunks[0].equals(expected_first)
    assert chunks[1].equals(expected_second)
from datar.base import c, factor, letters, NA, identity, sum
from datar.dplyr import (
    n_distinct,
    summarise,
    group_by,
    pull,
)
from datar.tibble import tibble
from datar.datasets import iris
from ..conftest import assert_iterable_equal

# One column per value type; each column has two distinct values out of three.
df_var = tibble(
    l=c(True, False, False),
    i=c(1, 1, 2),
    # d = Sys.Date() + c(1, 1, 2),
    f=factor(letters[c(1, 1, 2)]),
    n=np.array(c(1, 1, 2)) + 0.5,
    # t = Sys.time() + c(1, 1, 2),
    c=letters[c(1, 1, 2)],
)


def test_n_disinct_gives_the_correct_results_on_iris():
    """n_distinct() per iris column equals the length of `unique()`."""
    expected = iris.apply(lambda column: len(column.unique()))
    actual = iris.apply(n_distinct)
    assert_iterable_equal(actual, expected)


def test_n_distinct_treats_na_correctly():
    """With `na_rm=False`, repeated NAs count as one distinct value."""
    # test_that("n_distinct treats NA correctly in the REALSXP case (#384)", {
    distinct = n_distinct(c(1.0, NA, NA), na_rm=False)
    assert distinct == 2
def plot(self):
    """Build a plotnine figure from ``self.data`` and pass it to ``save_plot``.

    The data is loaded into a DataFrame with ``self.datacols`` as columns
    (made unique first). If ``self.savedata`` is truthy the frame is also
    written to ``<self.outprefix>.csv``. The geom is selected by
    ``self.figtype``; when a third column exists it is mapped to fill/color.

    Returns:
        None. Returns early, without plotting, when there are no rows.

    Raises:
        ValueError: If ``self.figtype`` is not one of the handled types.
    """
    df = pandas.DataFrame(
        self.data,
        columns=self.datacols,
    )
    # NOTE(review): the context manager is assumed to wrap only the
    # make_unique() call — confirm against the original layout.
    with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
        df.columns = make_unique(df.columns.tolist())

    if self.savedata:
        datafile = self.outprefix + ".csv"
        logger.info(
            "[r]%s[/r]: Saving data to: %r",
            self.title,
            datafile,
            extra={"markup": True},
        )
        df.to_csv(datafile, index=False)

    if df.shape[0] == 0:
        logger.warning("No data points to plot")
        return

    aes_for_geom_fill = None
    aes_for_geom_color = None
    theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
    if df.shape[1] > 2:
        # A third column, when present, drives the fill/color grouping.
        aes_for_geom_fill = p9.aes(fill=df.columns[2])
        aes_for_geom_color = p9.aes(color=df.columns[2])

    plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
    if self.figtype == "scatter":
        plt = plt + p9.geom_point(aes_for_geom_color)
        theme_elems = None
    elif self.figtype == "line":
        # No geom is added for "line"; the base plot is passed on as is.
        pass
    elif self.figtype == "bar":
        plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
    elif self.figtype == "col":
        plt = plt + p9.geom_col(aes_for_geom_fill)
    elif self.figtype == "pie":
        logger.warning("Pie chart is not support by plotnine yet, "
                       "plotting bar chart instead.")
        col0 = df.iloc[:, 0]
        if df.shape[1] > 2:
            plt = plt + p9.geom_bar(
                p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                stat="identity"
                # aes_for_geom_fill,
                # x=df.Group,
                # y=col0,
                # label=paste0(round_(100 * col0 / sum_(col0), 1), "%"),
                # show_legend=False,
                # position=p9.position_adjust_text(),
            )
        else:
            col0 = factor(col0, levels=rev(unique(as_character(col0))))
            fills = rev(levels(col0))
            # Materialise the per-level counts: a bare map() iterator cannot
            # be divided and would be exhausted after its first use below.
            sums = pandas.Series([sum(col0 == level) for level in fills])
            plt = (
                p9.ggplot(df, p9.aes(x=df.columns[1]))
                + p9.geom_bar(p9.aes(fill=df.columns[0]))
                + p9.geom_label(
                    x=1,
                    y=cumsum(sums) - sums / 2,
                    label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                    show_legend=False,
                )
            )
        # NOTE(review): assumed to apply to both pie layouts — confirm
        # against the original indentation.
        theme_elems = p9.theme(
            axis_title_x=p9.element_blank(),
            axis_title_y=p9.element_blank(),
            axis_text_y=p9.element_blank(),
        )
    elif self.figtype == "violin":
        plt = plt + p9.geom_violin(aes_for_geom_fill)
    elif self.figtype == "boxplot":
        plt = plt + p9.geom_boxplot(aes_for_geom_fill)
    elif self.figtype in ("histogram", "density"):
        # Single-variable plots use only the first column on the x axis.
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        geom = getattr(p9, f"geom_{self.figtype}")
        if df.columns[1] != "ONE":
            plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
            theme_elems = None
        else:
            plt = plt + geom(alpha=0.6)
            theme_elems = p9.theme(legend_position="none")
    elif self.figtype == "freqpoly":
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        if df.columns[1] != "ONE":
            plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
        else:
            plt = plt + p9.geom_freqpoly()
        theme_elems = None
    else:
        raise ValueError(f"Unknown figure type: {self.figtype}")

    plt = plt + p9.ggtitle(self.title)
    self.save_plot(plt, theme_elems)
def test_can_deframe_3col_df_with_warning(caplog):
    """deframe() on three columns uses the first two and logs a warning."""
    three_cols = tibble(name=letters[:3], value=seq(3, 1), oops=[1, 2, 3])

    mapping = deframe(three_cols)

    assert mapping == {"a": 3, "b": 2, "c": 1}
    assert "one- or two-column" in caplog.text


# add_row -------------------------------------------------------------

# One column per value type, each ending in NA.
df_all = tibble(
    a=[1, 2.5, NA],
    b=[1, 2, NA],
    c=[True, False, NA],
    d=["a", "b", NA],
    e=factor(c("a", "b", NA)),
)


def test_can_add_row():
    """add_row() appends one row; columns not supplied are filled with NA."""
    extended = add_row(df_all, a=4, b=3)

    assert extended.columns.tolist() == df_all.columns.tolist()
    assert nrow(extended) == nrow(df_all) + 1
    assert_iterable_equal(extended.a, [1.0, 2.5, NA, 4])
    assert_iterable_equal(extended.b, [1.0, 2.0, NA, 3.0])
    assert_iterable_equal(extended.c, [True, False, NA, NA])


def test_add_empty_row_if_no_arguments():
    """add_row() without values still adds exactly one row."""
    extended = add_row(iris)
    assert nrow(extended) == nrow(iris) + 1