Beispiel #1
0
def test_summarise_maintains_drop():
    """Check group counts under ``_drop`` and that summarise() keeps it."""
    frame = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )
    by_both = frame >> group_by(f.f1, f.f2, _drop=True)
    group_count = n_groups(by_both)
    assert group_count == 1
    assert group_by_drop_default(by_both)

    # Grouping by both factors with _drop=False (12 groups expected) is
    # not exercised here: DataFrame.groupby(..., observed=False) doesn't
    # support multiple categoricals.

    # Single factor, unused levels dropped.
    by_one = frame >> group_by(f.f1, _drop=True)
    group_count = n_groups(by_one)
    assert group_count == 1

    # Single factor, unused levels kept: one group per level.
    by_one = frame >> group_by(f.f1, _drop=False)
    group_count = n_groups(by_one)
    assert group_count == 3

    by_one = frame >> group_by(f.f2, _drop=False)
    group_count = n_groups(by_one)
    assert group_count == 4

    # Summarising the two-factor grouping keeps the drop setting.
    summarised = by_both >> summarise(x=sum(f.x), _groups="drop_last")
    group_count = n_groups(summarised)
    assert group_count == 1
    assert group_by_drop_default(summarised)
Beispiel #2
0
def test_can_safely_add_to_factor_columns_everywhere():
    """add_row() on a factor column, at the end and at explicit positions.

    Ported from the R test "can safely add to factor columns everywhere
    (#296)".
    """
    base = tibble(a=factor(letters[:3]))

    # Empty rows: the expected column is still a factor, with NA inserted.
    result = add_row(base)
    expected = tibble(a=factor(c(letters[:3], NA)))
    assert_frame_equal(result, expected)

    result = add_row(base, _before=0)
    expected = tibble(a=factor(c(NA, letters[:3])))
    assert_frame_equal(result, expected)

    result = add_row(base, _before=1)
    expected = tibble(a=factor(c("a", NA, letters[1:3])))
    assert_frame_equal(result, expected)

    # Rows carrying a new value "d": the expected column has object dtype.
    result = add_row(base, a="d")
    expected = tibble(a=letters[:4], _dtypes=object)
    assert_frame_equal(result, expected)

    result = add_row(base, a="d", _before=0)
    expected = tibble(a=c("d", letters[:3]), _dtypes=object)
    assert_frame_equal(result, expected)

    result = add_row(base, a="d", _before=1)
    expected = tibble(a=list("adbc"), _dtypes=object)
    assert_frame_equal(result, expected)
Beispiel #3
0
def test_add_passes_drop():
    """Adding a grouping variable with ``_add=True`` keeps ``_drop=True``."""
    frame = tibble(
        f1=factor("b", levels=c("a", "b", "c")),
        f2=factor("g", levels=c("e", "f", "g")),
        x=48,
    )
    by_f1 = group_by(frame, f.f1, _drop=True)
    by_both = group_by(by_f1, f.f2, _add=True)
    group_count = n_groups(by_both)
    assert group_count == 1
    assert group_by_drop_default(by_both)
Beispiel #4
0
def test_keys_are_coerced_to_symmetric_type():
    """Join keys: factor + string is not category; factor + factor is."""
    factor_keyed = tibble(id=factor(c("a", "b")), var1="foo")
    string_keyed = tibble(id=c("a", "b"), var2="bar")

    # The result must not be categorical whichever side holds the factor.
    key_dtype = inner_join(factor_keyed, string_keyed, by="id").id.dtype.name
    assert key_dtype != "category"
    key_dtype = inner_join(string_keyed, factor_keyed, by="id").id.dtype.name
    assert key_dtype != "category"

    # Both sides are factors: the joined key stays categorical.
    left = tibble(x=1, y=factor("a"))
    right = tibble(x=2, y=factor("b"))
    joined = full_join(left, right, by=["x", "y"])
    assert joined.y.dtype.name == "category"
Beispiel #5
0
def test_bind_factors():
    """bind_rows() on factor columns: categories are merged, NA is kept out."""
    left = tibble(a=factor("a"))
    right = tibble(a=factor("b"))

    bound = left >> bind_rows(right)
    assert bound.a.cat.categories.tolist() == ["a", "b"]

    left = tibble(a=factor("a"))
    right = tibble(a=factor(NA))

    bound = left >> bind_rows(right)
    assert bound.a.cat.categories.tolist() == ["a"]
    assert bound.a.astype(object).fillna("NA").tolist() == ["a", "NA"]

    # Passing the frames as a list must give the same frame.
    from_list = None >> bind_rows([left, right])
    assert_frame_equal(from_list, bound)
Beispiel #6
0
def test_tabulate():
    """tabulate() on a scalar and on a three-level factor."""
    counts = tabulate(3)
    assert_iterable_equal(counts, [0, 0, 1])

    abc_factor = factor(list("abc"))
    counts = tabulate(abc_factor, 3)
    assert_iterable_equal(counts, [1, 1, 1])
Beispiel #7
0
def test_factor_to_chars():
    """Binding a factor column with a string column yields a non-factor."""
    # No warning is checked here ("we don't have warnings").
    factor_frame = tibble(a=factor("a"))
    string_frame = tibble(a="b")

    bound = factor_frame >> bind_rows(factor_frame, string_frame)
    still_factor = is_factor(bound.a)
    assert not still_factor
Beispiel #8
0
def test_group_split_keeps_group_variables_by_default():
    """group_split() yields one frame per group, grouping column included."""
    table_in = tibble(x=[1, 2, 3, 4], g=factor(rep(["a", "b"], each=2)))
    pieces_iter = group_split(table_in, f.g)
    pieces = list(pieces_iter)

    assert len(pieces) == 2
    assert pieces[0].equals(table_in.iloc[[0, 1], :])
    # The second piece is compared against rows 2-3 with a reset index.
    assert pieces[1].equals(table_in.iloc[[2, 3], :].reset_index(drop=True))
Beispiel #9
0
def test_desc():
    """desc() on a factor with NA, on numbers and on strings."""
    with_na = factor(c(letters[:3], NA), levels=letters[:3])
    negated = desc(with_na)
    assert_iterable_equal(negated, [-0.0, -1.0, -2.0, NA])

    negated = desc([1, 2, 3])
    assert_iterable_equal(negated, [-1, -2, -3])

    negated = desc(["a", "b", "c"])
    assert_iterable_equal(negated, [-0.0, -1.0, -2.0])
Beispiel #10
0
def test_bind_empty_dfs():
    """bind_rows()/bind_cols() with ``None`` or an empty frame."""
    bound = bind_rows(None)
    assert dim(bound) == (0, 0)

    bound = bind_cols(None)
    assert dim(bound) == (0, 0)

    # Binding an empty tibble leaves the values of the factor column as-is.
    filled = tibble(x=factor([1, 2, 3]))
    empty = tibble()
    bound = filled >> bind_rows(empty)
    assert bound.x.tolist() == [1, 2, 3]
Beispiel #11
0
def test_errors():
    """bind_rows() error cases, plus one mixed-type case that must not raise."""
    first = tibble(x=[1, 2, 3])
    second = tibble(x=[4, 5, 6])
    with pytest.raises(ValueError):
        first >> bind_rows(second, _id=5)

    # Factor column bound with a numeric column: no error, all converted
    # to object.
    first = tibble(a=factor("a"))
    second = tibble(a=1)
    first >> bind_rows(second)

    # A plain list piped into bind_rows() raises ValueError.
    with pytest.raises(ValueError):
        [1, 2] >> bind_rows()
Beispiel #12
0
def test_drop():
    """count() with and without ``_drop`` on a factor with unused levels."""
    frame = tibble(f=factor("b", levels=c("a", "b", "c")))
    counted = frame >> count(f.f)
    assert counted.n.tolist() == [1]

    counted = frame >> count(f.f, _drop=False)
    # Note the order: the observed level comes first, i.e. [1, 0, 0]
    # rather than the level order [0, 1, 0].
    assert counted.n.tolist() == [1, 0, 0]

    # The same counts are expected when _drop=FALSE is set on group_by().
    counted = frame >> group_by(f.f, _drop=FALSE) >> count()
    assert counted.n.obj.tolist() == [1, 0, 0]
Beispiel #13
0
def test_joins_maintains__drop():
    """Group counts after joining frames grouped with ``_drop=True``."""
    left = group_by(
        tibble(f1=factor(c("a", "b"), levels=c("a", "b", "c")), x=[42, 43]),
        f.f1,
        _drop=True,
    )

    right = group_by(
        tibble(f1=factor(c("a"), levels=c("a", "b", "c")), y=1),
        f.f1,
        _drop=True,
    )

    # Left join: the two groups of the left frame.
    joined = left_join(left, right, by="f1")
    assert n_groups(joined) == 2

    right = group_by(
        tibble(f1=factor(c("a", "c"), levels=c("a", "b", "c")), y=[1, 2]),
        f.f1,
        _drop=True,
    )
    # Full join: "a", "b" and "c" are all present.
    joined = full_join(left, right, by="f1")
    assert n_groups(joined) == 3
Beispiel #14
0
def test_group_list_respects_empty_groups():
    """group_split.list() with an unused factor level, with and without drop."""
    table_in = tibble(
        x=[1, 2, 3, 4],
        g=factor(rep(["a", "b"], each=2), levels=["a", "b", "c"]),
    )

    pieces = group_split.list(table_in, f.g)

    assert pieces[0].equals(table_in.iloc[:2, :])
    assert pieces[1].equals(table_in.iloc[[2, 3], :].reset_index(drop=True))

    # With _drop=False the unused level "c" adds an empty third piece.
    pieces = group_split.list(table_in, f.g, _drop=False)
    assert pieces[0].equals(table_in.iloc[:2, :])
    assert pieces[1].equals(table_in.iloc[[2, 3], :].reset_index(drop=True))
    assert pieces[2].equals(table_in.iloc[[], :])
Beispiel #15
0
def test_bind_na_cols():
    """Binding a factor column with an NA column, in both orders."""
    factor_frame = tibble(x=factor(["foo", "bar"]))
    na_frame = tibble(x=NA)

    # NA frame appended: the cell at row 2 is NA.
    bound = factor_frame >> bind_rows(na_frame)
    cell = bound >> get(2, f.x)
    flag = is_na(cell)
    assert_iterable_equal(flag, [True])

    # NA frame first: the cell at row 0 is NA.
    bound = na_frame >> bind_rows(factor_frame)
    cell = bound >> get(0, f.x)
    flag = is_na(cell)
    assert_iterable_equal(flag, [True])

    # The column of the second result is categorical.
    flag = is_categorical(bound.x)
    assert flag
Beispiel #16
0
def test_table():
    """Exercise table() on vectors, frames, factors, ``exclude`` and ``dnn``.

    Reference for the R function being mirrored:
    https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/table
    """
    z = rpois(100, 5)
    freq = table(z)
    assert sum(freq.values.flatten()) == 100

    # --- two columns referenced through a data context
    with data_context(warpbreaks) as _:
        counts = table(f.wool, f.tension)

    assert counts.columns.tolist() == ["H", "L", "M"]
    assert counts.index.tolist() == ["A", "B"]
    assert_iterable_equal(counts.values.flatten(), [9] * 6)

    # --- the same two columns passed as a data frame
    counts = table(warpbreaks.loc[:, ["wool", "tension"]])
    assert counts.columns.tolist() == ["H", "L", "M"]
    assert counts.index.tolist() == ["A", "B"]
    assert_iterable_equal(counts.values.flatten(), [9] * 6)

    # --- two stand-alone inputs
    counts = table(state_division, state_region)
    assert counts.loc["New England", "Northeast"] == 6

    # --- binned temperatures against month
    with data_context(airquality) as _:
        qt = quantile(f.Temp)
        ct = cut(f.Temp, qt)
        counts = table(ct, f.Month)

    assert counts.iloc[0, 0] == 24

    # --- a vector against a shuffled copy of itself
    a = letters[:3]
    counts = table(a, sample(a))
    assert sum(counts.values.flatten()) == 3

    # --- explicit dimension names
    counts = table(a, sample(a), dnn=["x", "y"])
    assert counts.index.name == "x"
    assert counts.columns.name == "y"

    # --- NA and Inf values, counted with exclude=None
    a = c(NA, Inf, (1.0 / (i + 1) for i in range(3)))
    a = a * 10
    # The default-exclude variant, table(a) with expected [10] * 4, is
    # left disabled.

    counts = table(a, exclude=None)
    assert_iterable_equal(counts.values.flatten(), [10] * 5)

    # --- factor input, with and without an excluded level
    b = as_factor(rep(c("A", "B", "C"), 10))
    counts = table(b)
    assert counts.shape == (1, 3)
    assert_iterable_equal(counts.values.flatten(), [10] * 3)

    counts = table(b, exclude="B")
    assert counts.shape == (1, 2)
    assert_iterable_equal(counts.values.flatten(), [10] * 2)
    assert "B" not in counts.columns

    # --- factor with unused levels: they show up with a zero count
    d = factor(rep(c("A", "B", "C"), 10), levels=c("A", "B", "C", "D", "E"))
    counts = table(d, exclude="B", dnn=["x"])
    assert_iterable_equal(counts.columns.to_list(), ["A", "C", "D", "E"])
    assert_iterable_equal(counts.values.flatten(), [10, 10, 0, 0])

    d2 = factor(rep(c("A", "B", "C"), 10), levels=c("A", "B", "C", "D", "E"))
    counts = table(d, d2, exclude="B")
    assert counts.shape == (4, 4)

    # --- plain strings as inputs
    counts = table("abc", "cba", dnn="x")
    assert counts.shape == (3, 3)
    assert sum(counts.values.flatten()) == 3

    # --- NA shows up as "<NA>" on both axes when nothing is excluded
    with data_context(airquality) as _:
        counts = table(f.Ozone, f.Solar_R, exclude=None)
    assert "<NA>" in counts.columns
    assert "<NA>" in counts.index

    # --- NA_REPR together with a real NaN raises ValueError
    with pytest.raises(ValueError):
        table([NA_REPR, np.nan], exclude=None)

    # --- excluding the only non-NA level leaves the NA column
    counts = table(factor([1, np.nan]), exclude=1)
    assert counts.shape == (1, 1)
    assert_iterable_equal(counts[NA_REPR], [1])
Beispiel #17
0
def test_cat_ordered():
    """bind_rows() keeps the ``ordered`` flag of an ordered factor column."""
    ordered_frame = tibble(x=factor([1, 2, 3], ordered=True))
    doubled = bind_rows(ordered_frame, ordered_frame)
    assert doubled.x.cat.ordered
Beispiel #18
0
def test_group_split_respects__drop():
    """group_split.list() with ``_drop=True`` yields a single chunk here.

    Ported from the R test "group_split() respects .drop".
    """
    single_level = tibble(f=factor(["b"], levels=list("abc")))
    pieces = single_level >> group_split.list(f.f, _drop=True)
    assert len(pieces) == 1
Beispiel #19
0
def test_group_split_can_discard_grouping_vars_by__keep_eqs_false():
    """With ``_keep=False`` the pieces contain only the first column."""
    table_in = tibble(x=[1, 2, 3, 4], g=factor(rep(["a", "b"], each=2)))
    pieces = group_split.list(table_in, f.g, _keep=False)

    # Column 0 is "x"; the grouping column "g" is left out.
    assert pieces[0].equals(table_in.iloc[:2, [0]])
    assert pieces[1].equals(table_in.iloc[[2, 3], [0]].reset_index(drop=True))
Beispiel #20
0
from datar.base import c, factor, letters, NA, identity, sum
from datar.dplyr import (
    n_distinct,
    summarise,
    group_by,
    pull,
)
from datar.tibble import tibble
from datar.datasets import iris
from ..conftest import assert_iterable_equal

# Module-level fixture with one column per kind of value: booleans (l),
# integers (i), a factor (f), an array offset by 0.5 (n) and plain
# `letters` entries (c). Every column follows the same 1, 1, 2 pattern.
df_var = tibble(
    l=c(True, False, False),
    i=c(1, 1, 2),
    # Date column, left disabled (R syntax):
    # d = Sys.Date() + c(1, 1, 2),
    f=factor(letters[c(1, 1, 2)]),
    n=np.array(c(1, 1, 2)) + 0.5,
    # Time column, left disabled (R syntax):
    # t = Sys.time() + c(1, 1, 2),
    c=letters[c(1, 1, 2)],
)


def test_n_disinct_gives_the_correct_results_on_iris():
    """n_distinct() per iris column equals the number of unique values."""
    actual = iris.apply(n_distinct)
    expected = iris.apply(lambda column: len(column.unique()))
    assert_iterable_equal(actual, expected)


def test_n_distinct_treats_na_correctly():
    """With ``na_rm=False`` repeated NAs count as one distinct value.

    Ported from the R test "n_distinct treats NA correctly in the REALSXP
    case (#384)".
    """
    assert n_distinct(c(1.0, NA, NA), na_rm=False) == 2
Beispiel #21
0
    def plot(self):
        """Build the figure for ``self.figtype`` with plotnine and save it.

        A DataFrame is created from ``self.data`` with ``self.datacols`` as
        column names (made unique first). When ``self.savedata`` is truthy
        the frame is written to ``<self.outprefix>.csv``. By default the
        first column is mapped to ``y`` and the second to ``x``; a third
        column, when present, drives fill/color. The finished plot and the
        theme elements are handed to ``self.save_plot``.

        Returns early, after logging a warning, when the frame has no rows.

        Raises:
            ValueError: If ``self.figtype`` is not one of the handled types.
        """
        df = pandas.DataFrame(
            self.data,
            columns=self.datacols,
        )
        with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
            df.columns = make_unique(df.columns.tolist())

        if self.savedata:
            datafile = self.outprefix + ".csv"
            logger.info(
                "[r]%s[/r]: Saving data to: %r",
                self.title,
                datafile,
                extra={"markup": True},
            )
            df.to_csv(datafile, index=False)

        if df.shape[0] == 0:
            logger.warning("No data points to plot")
            return

        aes_for_geom_fill = None
        aes_for_geom_color = None
        # Lines need an explicit group: with a discrete x every x value
        # would otherwise form its own one-point group and nothing is drawn.
        aes_for_geom_line = p9.aes(group=1)
        theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
        if df.shape[1] > 2:
            aes_for_geom_fill = p9.aes(fill=df.columns[2])
            aes_for_geom_color = p9.aes(color=df.columns[2])
            aes_for_geom_line = p9.aes(
                color=df.columns[2],
                group=df.columns[2],
            )
        plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
        if self.figtype == "scatter":
            plt = plt + p9.geom_point(aes_for_geom_color)
            theme_elems = None
        elif self.figtype == "line":
            # This branch used to be a bare `pass`, which left the plot
            # without any layer.
            plt = plt + p9.geom_line(aes_for_geom_line)
        elif self.figtype == "bar":
            plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
        elif self.figtype == "col":
            plt = plt + p9.geom_col(aes_for_geom_fill)
        elif self.figtype == "pie":
            logger.warning("Pie chart is not support by plotnine yet, "
                           "plotting bar chart instead.")
            col0 = df.iloc[:, 0]
            if df.shape[1] > 2:
                # One bar per value of the third column, height from col 0.
                plt = plt + p9.geom_bar(
                    p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                    stat="identity",
                )
            else:
                # Count bar filled by the first column, with a percentage
                # label per level.
                col0 = factor(col0, levels=rev(unique(as_character(col0))))
                fills = rev(levels(col0))
                sums = map(lambda x: sum(col0 == x), fills)
                plt = (p9.ggplot(df, p9.aes(x=df.columns[1])) +
                       p9.geom_bar(p9.aes(fill=df.columns[0])) + p9.geom_label(
                           x=1,
                           y=cumsum(sums) - sums / 2,
                           label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                           show_legend=False,
                       ))
                theme_elems = p9.theme(
                    axis_title_x=p9.element_blank(),
                    axis_title_y=p9.element_blank(),
                    axis_text_y=p9.element_blank(),
                )
        elif self.figtype == "violin":
            plt = plt + p9.geom_violin(aes_for_geom_fill)
        elif self.figtype == "boxplot":
            plt = plt + p9.geom_boxplot(aes_for_geom_fill)
        elif self.figtype in ("histogram", "density"):
            # Distribution plots use the first column as x; the second
            # column is used as fill unless it is named "ONE".
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            geom = getattr(p9, f"geom_{self.figtype}")
            if df.columns[1] != "ONE":
                plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
                theme_elems = None
            else:
                plt = plt + geom(alpha=0.6)
                theme_elems = p9.theme(legend_position="none")
        elif self.figtype == "freqpoly":
            plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
            if df.columns[1] != "ONE":
                plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
            else:
                plt = plt + p9.geom_freqpoly()
            theme_elems = None
        else:
            raise ValueError(f"Unknown figure type: {self.figtype}")

        plt = plt + p9.ggtitle(self.title)
        self.save_plot(plt, theme_elems)
Beispiel #22
0

def test_can_deframe_3col_df_with_warning(caplog):
    """deframe() on three columns uses the first two and logs a warning."""
    three_cols = tibble(name=letters[:3], value=seq(3, 1), oops=[1, 2, 3])
    mapping = deframe(three_cols)
    assert mapping == {"a": 3, "b": 2, "c": 1}
    assert "one- or two-column" in caplog.text


# add_row -------------------------------------------------------------

# Shared fixture for the add_row tests: five columns of different value
# kinds, each holding two values followed by an NA.
df_all = tibble(
    a=[1, 2.5, NA],  # floats
    b=[1, 2, NA],  # whole numbers
    c=[True, False, NA],  # booleans
    d=["a", "b", NA],  # strings
    e=factor(c("a", "b", NA)),  # factor
)


def test_can_add_row():
    """add_row() appends one row; columns not supplied are filled with NA."""
    extended = add_row(df_all, a=4, b=3)
    assert extended.columns.tolist() == df_all.columns.tolist()
    assert nrow(extended) == nrow(df_all) + 1
    # Supplied columns get the new values at the end ...
    assert_iterable_equal(extended.a, [1.0, 2.5, NA, 4])
    assert_iterable_equal(extended.b, [1.0, 2.0, NA, 3.0])
    # ... and a column that was not supplied ends with NA.
    assert_iterable_equal(extended.c, [True, False, NA, NA])


def test_add_empty_row_if_no_arguments():
    """add_row() without values still appends exactly one row."""
    extended = add_row(iris)
    assert nrow(extended) == nrow(iris) + 1