def test_n_distinct_handles_in_na_rm():
    """n_distinct() honours na_rm given as a literal, a variable or an expression."""
    d = tibble(x=c([1, 2, 3, 4], NA))
    yes = True
    no = False

    # (na_rm value, expected summary): NA counts as a distinct value
    # unless it is removed.
    cases = [
        (True, [4]),
        (False, [5]),
        (yes, [4]),
        (no, [5]),
        (True or True, [4]),
    ]
    for na_rm, expected in cases:
        out = d >> summarise(n=n_distinct(f.x, na_rm=na_rm)) >> pull(to="list")
        assert out == expected
def test_cur_data_all():
    """Compare cur_data() and cur_data_all() on plain and grouped frames."""
    df = tibble(x=c("b", "a", "b"), y=[1, 2, 3])
    gf = df >> group_by(f.x, _sort=True)

    def flat(cell):
        # Flatten one nested frame into a plain list of its values.
        return cell.values.flatten().tolist()

    # Ungrouped: both helpers hand back the whole frame.
    whole = df >> summarise(x=cur_data()) >> pull(f.x, to="list")
    assert whole[0].equals(df)

    whole = df >> summarise(x=cur_data_all()) >> pull(f.x, to="list")
    assert whole[0].equals(df)

    # Grouped: cur_data() yields only the y values of each group ...
    per_group = gf >> summarise(x=cur_data()) >> pull(f.x)
    assert flat(per_group.values[0]) == [2]
    assert flat(per_group.values[1]) == [1, 3]

    # ... while cur_data_all() also carries the grouping column.
    per_group = gf >> summarise(x=cur_data_all()) >> pull(f.x)
    assert flat(per_group.values[0]) == ["a", 2]
    assert flat(per_group.values[1]) == ["b", 1, "b", 3]
def test_proportion_computed_correctly():
    """prop=0.11 on a ten-row frame keeps exactly one row for each slice verb."""
    df = tibble(x=range(1, 11))

    n_kept = df >> slice_head(prop=0.11) >> nrow()
    assert n_kept == 1

    n_kept = df >> slice_tail(prop=0.11) >> nrow()
    assert n_kept == 1

    n_kept = df >> slice_sample(prop=0.11) >> nrow()
    assert n_kept == 1

    n_kept = df >> slice_min(f.x, prop=0.11) >> nrow()
    assert n_kept == 1

    n_kept = df >> slice_max(f.x, prop=0.11) >> nrow()
    assert n_kept == 1

    # Same checks with ties disabled.
    n_kept = df >> slice_max(f.x, prop=0.11, with_ties=False) >> nrow()
    assert n_kept == 1

    n_kept = df >> slice_min(f.x, prop=0.11, with_ties=False) >> nrow()
    assert n_kept == 1
def test_handles_passing_args():
    """filter() works with conditions built in different enclosing scopes."""
    df = tibble(x=range(1, 5))

    def ff(*args):
        upper = 4

        def passthrough(y):
            return y

        # Combine the caller's conditions with one that uses local names.
        return df >> filter(*args, passthrough(upper) > f.x)

    def g():
        lower = 2
        return ff(f.x > lower)

    filtered = g()
    assert filtered.x.tolist() == [3]

    # Rebinding df is seen by ff() through its closure, so the second call
    # filters the grouped frame.
    df >>= group_by(f.x)
    filtered = g()
    assert filtered.x.obj.tolist() == [3]
def test_across():
    """arrange(across(...)) matches arrange() called with the columns spelled out."""
    df = tibble(x=[1, 3, 2, 1], y=[4, 3, 2, 1])

    # All columns, ascending.
    actual = df >> arrange(across())
    wanted = df >> arrange(f.x, f.y)
    assert actual.equals(wanted)

    # All columns, each wrapped in desc().
    actual = df >> arrange(across(None, desc))
    wanted = df >> arrange(desc(f.x), desc(f.y))
    assert actual.equals(wanted)

    # A single selected column.
    actual = df >> arrange(across(f.x))
    wanted = df >> arrange(f.x)
    assert actual.equals(wanted)

    actual = df >> arrange(across(f.y))
    wanted = df >> arrange(f.y)
    assert actual.equals(wanted)
def test_dup_keyword_args():
    """Check mutate() with underscore-prefixed variants of one keyword name."""
    df = tibble(a=1)

    result = df >> mutate(_b=f.a + 1, b=f._b * 2)
    assert_tibble_equal(result, tibble(a=1, b=4))

    # order doesn't matter
    result = df >> mutate(b=f.a + 1, _b=f.b * 2)
    assert_tibble_equal(result, tibble(a=1, b=2, _b=4))

    # support >= 2 dups
    result = df >> mutate(__b=f.a + 1, _b=f.__b * 2, b=f._b / 4.0)
    assert_tibble_equal(result, tibble(a=1, b=1.0))

    # has to be consecutive
    result = df >> mutate(__b=f.a + 1, _b=f.__b * 2, b=f._b / 4.0)
    assert_tibble_equal(result, tibble(a=1, b=1.0))

    result = df >> mutate(__b=f.a + 1, _b=f.__b * 2)
    assert_tibble_equal(result, tibble(a=1, _b=4))

    result = df >> mutate(_b=f.a + 1)
    assert_tibble_equal(result, tibble(a=1, _b=2))
def test_head_tail():
    """head()/tail() default to six items, accept n, and reject a bare call."""
    df = tibble(x=range(20))

    # head
    top = df >> head()
    assert top.shape[0] == 6

    top = df >> head(3)
    assert top.shape[0] == 3

    top = list(range(10)) >> head()
    assert len(top) == 6

    # Called directly with only a number, no data is supplied.
    with pytest.raises(NotImplementedError):
        head(3)

    # tail
    bottom = df >> tail()
    assert bottom.shape[0] == 6

    bottom = df >> tail(3)
    assert bottom.shape[0] == 3

    bottom = list(range(10)) >> tail()
    assert len(bottom) == 6

    with pytest.raises(NotImplementedError):
        tail(3)
def _get_input_data(ch1, ch2, ch3, ch4):
    """Assemble the input table from four upstream channels.

    Rows of the metadata (obtained from ``ch1``) whose type column equals
    ``options.type_normal`` select the coverage files taken from ``ch2`` and
    ``ch3``; target/antitarget files come from ``ch4``.
    """
    metadf = _get_metadf(ch1)
    is_normal = metadf[options.type_col] == options.type_normal

    if sum(is_normal) == 0:
        # No matching rows: a single missing entry.
        covfiles = [None]
    else:
        # One list holding the selected files of both channels.
        covfiles = [
            ch2.outfile[is_normal].tolist() + ch3.outfile[is_normal].tolist()
        ]

    if "SampleSex" in metadf.columns:
        sample_sex = ",".join(metadf.SampleSex[is_normal])
    else:
        sample_sex = [None]

    return tibble(
        covfiles=covfiles,
        target_file=ch4.target_file,
        antitarget_file=ch4.antitarget_file,
        sample_sex=sample_sex,
    )
class MetabolicExprNormalization(Proc):
    """Normalize the expression data using deconvolution

    Requires:
        - name: r-scran
          check: |
            {{proc.lang}} <(echo "library(scran)")
    """
    # Upstream processes. The two channels given to `input_data` (ch1, ch2)
    # presumably arrive in this order -- confirm against the framework.
    requires = MetabolicPrepareSCE, MetabolicInputs
    input = "sceobj:file, configfile:file"
    # Output name is built from the input `sceobj` via the `stem0` filter.
    output = "outfile:file:{{in.sceobj | stem0}}.sce.RDS"
    # Pair the `outfile` column of ch1 with the `configfile` column of ch2.
    input_data = lambda ch1, ch2: tibble(
        sceobj=ch1.outfile,
        configfile=ch2.configfile,
    )
    envs = {"dropout": 0.75, "refexon": config.ref.refexon}
    lang = config.lang.rscript
    script = (
        "file://../scripts/scrna_metabolic/MetabolicExprNormalization.R")
def _get_input_data(ch1, ch2):
    """Assemble the input table from the metadata channel and ``ch2``.

    Rows of the metadata (obtained from ``ch1``) whose type column equals
    ``options.type_tumor`` are selected; each optional metadata column falls
    back to a single missing entry when it is absent.
    """
    metadf = _get_metadf(ch1)
    is_tumor = metadf[options.type_col] == options.type_tumor
    available = metadf.columns

    if "SnpVcf" in available:
        vcf = metadf.SnpVcf[is_tumor]
    else:
        vcf = [None]

    if "VcfSampleID" in available:
        sample_id = metadf.VcfSampleID[is_tumor]
    else:
        sample_id = [None]

    if "NormalID" in available:
        normal_id = metadf.NormalID[is_tumor]
    else:
        normal_id = [None]

    return tibble(
        chrfile=ch2.outfile,
        vcf=vcf,
        sample_id=sample_id,
        normal_id=normal_id,
    )
def test_get():
    """get() returns a scalar for scalar selectors and a frame otherwise."""
    df = tibble(x=2)
    df.index = ["a"]

    # No selector: the frame itself.
    picked = df >> get()
    assert_frame_equal(picked, df)

    # Scalar row and column, by position and by label.
    picked = df >> get(0, 0)
    assert picked == 2

    picked = df >> get("a", "x")
    assert picked == 2

    # List selectors, or only one of the two axes, keep the frame shape.
    picked = df >> get(["a"], ["x"])
    assert picked.equals(df)

    picked = df >> get("a")
    assert picked.equals(df)

    picked = df >> get(cols="x")
    assert picked.equals(df)
def test_0col_df_in_results_ignored():
    """An empty tibble passed to summarise() adds nothing to the result."""
    keys_only = tibble(x=[1, 2])

    # Frame that contains only the grouping column.
    summarised = keys_only >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys_only)

    summarised = keys_only >> group_by(f.x) >> summarise(tibble(), y=65)
    wanted = keys_only >> mutate(y=65)
    assert summarised.equals(wanted)

    # Frame with an extra column that the summary does not keep.
    with_extra = tibble(x=[1, 2], y=[3, 4])
    summarised = with_extra >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys_only)

    summarised = with_extra >> group_by(f.x) >> summarise(tibble(), z=98)
    wanted = keys_only >> mutate(z=98)
    assert summarised.equals(wanted)
def test_col_row_verbs():
    """Row-wise and column-wise medians, means, sums and sds, with and without NA."""
    wide = tribble(
        f.x, f.y, f.z,
        1, NA, 6,
        2, 4, 9,
        3, 6, 15,
    )

    # medians
    assert_iterable_equal(row_medians(wide), [NA, 4, 6])
    assert_iterable_equal(row_medians(wide, na_rm=True), [3.5, 4, 6])
    assert_iterable_equal(col_medians(wide), [2, NA, 9])
    assert_iterable_equal(col_medians(wide, na_rm=True), [2, 5, 9])

    # means
    assert_iterable_equal(row_means(wide), [NA, 5, 8])
    assert_iterable_equal(row_means(wide, na_rm=True), [3.5, 5, 8])
    assert_iterable_equal(col_means(wide), [2, NA, 10])
    assert_iterable_equal(col_means(wide, na_rm=True), [2, 5, 10])

    # sums
    assert_iterable_equal(row_sums(wide), [NA, 15, 24])
    assert_iterable_equal(row_sums(wide, na_rm=True), [7, 15, 24])
    assert_iterable_equal(col_sums(wide), [6, NA, 30])
    assert_iterable_equal(col_sums(wide, na_rm=True), [6, 10, 30])

    # standard deviations (compared approximately)
    sd_row1 = 3.605551275463989
    sd_row2 = 6.244997998398398
    sd_col_z = 4.58257569495584

    assert_iterable_equal(row_sds(wide), [NA, sd_row1, sd_row2], approx=True)
    assert_iterable_equal(
        row_sds(wide, na_rm=True),
        [3.5355339059327378, sd_row1, sd_row2],
        approx=True,
    )
    assert_iterable_equal(col_sds(wide), [1.0, NA, sd_col_z], approx=True)
    assert_iterable_equal(
        col_sds(wide, na_rm=True),
        [1.0, 1.4142135623730951, sd_col_z],
        approx=True,
    )

    # grouped
    grouped = tibble(x=[1, 1, 2, 2], y=[3, 4, 3, 4]).group_by('x')
    assert_iterable_equal(col_sums(grouped).y, [7, 7])
    assert_iterable_equal(col_means(grouped).y, [3.5, 3.5])
    assert_iterable_equal(col_medians(grouped).y, [3.5, 3.5])
    assert_iterable_equal(col_sds(grouped).y, [0.7071, 0.7071], approx=1e-3)
class MetabolicFeaturesIntraSubsets(Proc):
    """Intra-subset metabolic features - Enrichment analysis in details

    Requires:
        - name: r-parallel
          check: |
            {{proc.lang}} <(echo "library(parallel)")
        - name: r-scater
          check: |
            {{proc.lang}} <(echo "library(scater)")
        - name: r-fgsea
          check: |
            {{proc.lang}} <(echo "library(fgsea)")
    """
    # Upstream processes are declared only when the "intra-subset" option is
    # truthy.
    # NOTE(review): only `requires` is assumed to be guarded by this `if` --
    # confirm against the original layout.
    if options["intra-subset"]:
        requires = MetabolicExprNormalization, MetabolicInputs

    input = "sceobjs:files, gmtfile:file, configfile:file"
    # All `outfile` values of ch1 are wrapped into a single list entry;
    # gmtfile and configfile are taken from ch2.
    input_data = lambda ch1, ch2: tibble(
        sceobjs=[list(ch1.outfile)],
        gmtfile=ch2.gmtfile,
        configfile=ch2.configfile,
    )
    # Output directory is named after the stem of the input config file.
    output = "outdir:dir:{{in.configfile | stem}}.intras-pathwayfeatures"
    lang = config.lang.rscript
    order = 4
    # Values handed to the script; their meaning is defined by the R script
    # referenced below, not here.
    envs = {
        "ncores": config.misc.ncores,
        "fgsea": True,
        "prerank_method": "signal_to_noise",
        "top": 10,
    }
    script = (
        "file://../scripts/scrna_metabolic/MetabolicFeaturesIntraSubsets.R"
    )
    # Report template path, split over two adjacent string literals.
    plugin_opts = {
        "report": ("file://../reports/scrna_metabolic/"
                   "MetabolicFeaturesIntraSubsets.svelte")
    }
def test_transform_register():
    """Register type-specific implementations on a transform function.

    The registrations accumulate on ``double``, so the statement order below
    matters.
    """
    @func_factory(kind="transform", data_args="x")
    def double(x):
        return x * 2

    @double.register(DataFrame)
    def _(x):
        return x * 3

    # Series uses the generic implementation until one is registered for it.
    series = Series([2, 3])
    assert_iterable_equal(double(series), [4, 6])

    double.register(Series, lambda x: x * 4)
    assert_iterable_equal(double(series), [8, 12])

    # A tibble is dispatched to the DataFrame implementation.
    frame = tibble(a=[1, 3])
    assert_iterable_equal(double(frame).a, [3, 9])

    assert_iterable_equal(double([1, 4]), [4, 16])

    # register an available string func for transform
    double.register(SeriesGroupBy, "sum")
    grouped = Series([1, -2]).groupby([1, 2])
    summed = double(grouped)
    assert_iterable_equal(summed.obj, [1, -2])

    # seriesrowwise
    double.register(SeriesRowwise, lambda x: x + 1)
    grouped.is_rowwise = True
    bumped = double(grouped)
    assert_iterable_equal(bumped.obj, [2, -1])
    assert bumped.is_rowwise
def test_mixed_rows():
    """slice() with mixed positive, negative and inverted row selectors."""
    df = tibble(x=range(5))

    # order kept
    # 0   1   2   3   4
    #         -3      -1
    #             3
    picked = slice(df, c(-c(3, 1), 3))
    assert picked.x.tolist() == [2, 4, 3]

    # 0   1   2   3   4
    #             -2  -1
    #             3
    picked = slice(df, c(-f[1:3], 3))
    assert picked.x.tolist() == [4, 3, 3]

    # 0   1   2   3   4
    # 0       2
    #                 -1
    picked = slice(df, c(~c(0, 2), ~c(-1)))
    assert picked.x.tolist() == [1, 3]

    # Same kind of inverted selection, through the pipe.
    picked = df >> slice(c(~f[3:], ~c(1)))
    assert picked.x.tolist() == [0, 2]
def test_row_number_handles_empty_dfs():
    """Ranking helpers add their columns to an empty frame and keep it empty."""
    empty = tibble(a=[])

    ranked = empty >> mutate(
        row_number_0=row_number(),
        # row_number_a=row_number(f.a), # row_number doesn't support extra arg
        ntile=ntile(f.a, 2),
        min_rank=min_rank(f.a),
        percent_rank=percent_rank(f.a),
        dense_rank=dense_rank(f.a),
        cume_dist=cume_dist(f.a),
    )

    wanted_columns = [
        "a",
        "row_number_0",
        "ntile",
        "min_rank",
        "percent_rank",
        "dense_rank",
        "cume_dist",
    ]
    assert_iterable_equal(ranked.columns, wanted_columns)
    assert nrow(ranked) == 0
class MetabolicPrepareSCE(Proc):
    """Prepare SingleCellExperiment objects

    Requires:
        - name: r-scater
          check: |
            {{proc.lang}} <(echo "library(scater)")
        - name: r-seurat
          check: |
            {{proc.lang}} <(echo "library(Seurat)")
    """
    # Upstream processes. The two channels given to `input_data` (ch1, ch2)
    # presumably arrive in this order -- confirm against the framework.
    requires = MetabolicExprImputation, MetabolicInputs
    input = "impfiles:files, gmtfile:file"
    # The `outfile` column of ch1 goes through _group_imputed_files();
    # gmtfile is taken from ch2 unchanged.
    input_data = lambda ch1, ch2: tibble(
        impfiles=_group_imputed_files(ch1.outfile),
        gmtfile=ch2.gmtfile,
    )
    # Output name: stem of the first imputed file, cut at its first ".".
    output = (
        "outfile:file:"
        "{{in.impfiles | first | stem | split: '.' | first}}.sce.RDS")
    lang = config.lang.rscript
    envs = {"refexon": config.ref.refexon}
    script = "file://../scripts/scrna_metabolic/MetabolicPrepareSCE.R"
def test_arguments_to_select_dont_match_vars_select_arguments():
    """Keywords such as var/include/exclude are plain renames in select()."""
    df = tibble(a=1)

    # var=
    renamed = select(df, var=f.a)
    assert renamed.equals(tibble(var=1))

    renamed = select(group_by(df, f.a), var=f.a)
    wanted = group_by(tibble(var=1), f.var)
    assert renamed.equals(wanted)
    assert group_vars(renamed) == group_vars(wanted)

    # exclude= / include= on an ungrouped frame
    renamed = select(df, exclude=f.a)
    assert renamed.equals(tibble(exclude=1))

    renamed = select(df, include=f.a)
    assert renamed.equals(tibble(include=1))

    # exclude= / include= on a grouped frame: the grouping follows the rename
    renamed = select(group_by(df, f.a), exclude=f.a)
    wanted = group_by(tibble(exclude=1), f.exclude)
    assert renamed.equals(wanted)
    assert group_vars(renamed) == group_vars(wanted)

    renamed = select(group_by(df, f.a), include=f.a)
    wanted = group_by(tibble(include=1), f.include)
    assert renamed.equals(wanted)
    assert group_vars(renamed) == group_vars(wanted)
def rn(x):
    """Return a fixed tibble with column ``x`` = 1, 2, 3; the argument is not used."""
    values = [1, 2, 3]
    return tibble(x=values)
def test_sort_empty_df():
    """arrange() with no keys leaves an empty tibble unchanged."""
    empty = tibble()
    arranged = empty >> arrange()
    assert_tibble_equal(arranged, empty)
def test_incompatible_size_fill_with_NA():
    """bind_cols() pads the shorter frame with missing values."""
    three_rows = tibble(x=range(1, 4))
    one_row = tibble(y=range(1, 2))

    # Replace the padding with a sentinel so it can be compared directly.
    bound = three_rows >> bind_cols(one_row)
    filled = bound.fillna(100)

    assert filled.x.tolist() == [1, 2, 3]
    assert filled.y.tolist() == [1, 100, 100]
def test_transform_hooks():
    """Exercise pre/post hooks and the ``meta`` flag of ``register()``.

    Registrations accumulate on ``times`` and later ones replace earlier ones
    for the same type, so the statements below must stay in this order.
    """
    @func_factory(kind="transform", data_args="x")
    def times(x, t):
        return x * t

    # A non-callable `pre` together with func=None is rejected.
    with pytest.raises(ValueError):
        times.register(Series, meta=False, pre=1, func=None)

    # Hooks only (func=None): `pre` returns (data, args, kwargs) with t
    # negated; `post` adds t to the result.
    times.register(
        Series,
        func=None,
        pre=lambda x, t: (x, (-t, ), {}),
        post=lambda out, x, t: out + t,
    )

    x = Series([1, 2])
    out = times(x, -1)
    assert_iterable_equal(out, [2, 3])

    # A plain implementation for Series replaces the hook-based one.
    @times.register(Series, meta=False)
    def _(x, t):
        return x + t

    out = times(x, 10)
    assert_iterable_equal(out, [11, 12])

    @times.register(SeriesGroupBy, meta=True)
    def _(x, t):
        return x + 10

    x = Series([1, 2, 1, 2]).groupby([1, 1, 2, 2])
    out = times(x, 1)
    assert_iterable_equal(out.obj, [11, 12, 11, 12])

    # Hooks for grouped series: `pre` bumps t by one, `post` returns the
    # result untouched.
    times.register(
        SeriesGroupBy,
        func=None,
        pre=lambda x, t: (x, (t + 1, ), {}),
        post=lambda out, x, *args, **kwargs: out,
    )
    out = times(x, 1)
    assert_iterable_equal(out, [2, 4, 2, 4])

    # A `pre` hook that returns None.
    # NOTE(review): presumably this leaves the arguments unchanged -- confirm
    # against func_factory.
    times.register(
        Series,
        func=None,
        pre=lambda *args, **kwargs: None,
        post=lambda out, x, t: out + t,
    )
    x = Series([1, 2])
    out = times(x, 3)
    assert_iterable_equal(out, [4, 5])

    @times.register(DataFrame, meta=True)
    def _(x, t):
        return x**t

    x = tibble(a=[1, 2], b=[2, 3])
    out = times(x, 3)
    assert_iterable_equal(out.a, [1, 8])
    assert_iterable_equal(out.b, [8, 27])

    # TibbleGrouped
    # `pre` lowers t by one; `post` reorders the rows of the result.
    times.register(
        TibbleGrouped,
        func=None,
        pre=lambda x, t: (x, (t - 1, ), {}),
        post=lambda out, x, t: out.reindex([1, 0]),
    )
    x = x.group_by("a")
    out = times(x, 3)
    assert_iterable_equal(out.b, [6, 4])

    @times.register(
        TibbleGrouped,
        meta=False,
    )
    def _(x, t):
        out = x.transform(lambda d, t: d * t, 0, t - 1)
        # Overwrite one cell so this implementation's output is recognisable.
        out.iloc[0, 1] = 10
        return out

    # x = tibble(a=[1, 2], b=[2, 3]) # grouped by a
    out = times(x, 3)
    assert isinstance(out, TibbleGrouped)
    assert_iterable_equal(out.group_vars, ["a"])
    assert_iterable_equal(out.b.obj, [10, 6])
def test_complex_cols():
    """arrange() can order rows by a complex-valued column."""
    frame = tibble(x=[1, 2, 3], y=[3 + 2j, 2 + 2j, 1 + 2j])
    arranged = frame >> arrange(f.y)
    assert_iterable_equal(arranged.x, [3, 2, 1])
def test_update_grouping():
    """arrange() on a grouped frame stays grouped and refreshes the group rows."""
    frame = tibble(g=[2, 2, 1, 1], x=[1, 3, 2, 4])
    arranged = frame >> group_by(f.g) >> arrange(f.x)

    assert isinstance(arranged, TibbleGrouped)
    assert group_rows(arranged) == [[0, 2], [1, 3]]
def test_df_cols():
    """arrange() can order rows by a nested data-frame column."""
    frame = tibble(x=[1, 2, 3], y=tibble(z=[3, 2, 1]))
    wanted = tibble(x=[3, 2, 1], y=tibble(z=[1, 2, 3]))

    arranged = frame >> arrange(f.y)

    # Compare on a fresh index; only the row order is of interest here.
    assert arranged.reset_index(drop=True).equals(wanted)
def test_na_end():
    """Missing values sort last in both ascending and descending order."""
    frame = tibble(x=c(4, 3, NA))  # NA makes it float

    ascending = frame >> arrange(f.x)
    assert_iterable_equal(ascending.x, [3, 4, None])

    descending = frame >> arrange(desc(f.x))
    assert_iterable_equal(descending.x, [4, 3, None])
def test_reorder_cols():
    """bind_rows() keeps the column order of the first frame."""
    ordered = tibble(a=1, b=2, c=3, d=4, e=5, f=6)
    shuffled = ordered[sample(ordered.columns)]

    stacked = ordered >> bind_rows(shuffled)

    assert stacked.columns.tolist() == list("abcdef")
from datar.base import c, factor, letters, NA, identity, sum
from datar.dplyr import (
    n_distinct,
    summarise,
    group_by,
    pull,
)
from datar.tibble import tibble
from datar.datasets import iris
from ..conftest import assert_iterable_equal

# Fixture with one column per value type.
# NOTE(review): `np` is used below but no numpy import is visible in this
# span -- presumably it is imported earlier in the file; confirm.
df_var = tibble(
    l=c(True, False, False),
    i=c(1, 1, 2),
    # d = Sys.Date() + c(1, 1, 2),
    f=factor(letters[c(1, 1, 2)]),
    n=np.array(c(1, 1, 2)) + 0.5,
    # t = Sys.time() + c(1, 1, 2),
    c=letters[c(1, 1, 2)],
)


def test_n_disinct_gives_the_correct_results_on_iris():
    """n_distinct() per iris column equals the length of that column's unique()."""
    observed = iris.apply(n_distinct)
    expected = iris.apply(lambda column: len(column.unique()))
    assert_iterable_equal(observed, expected)


def test_n_distinct_treats_na_correctly():
    """Repeated NAs count as one distinct value when na_rm is False."""
    # test_that("n_distinct treats NA correctly in the REALSXP case (#384)", {
    count = n_distinct(c(1.0, NA, NA), na_rm=False)
    assert count == 2
def test_n_distinct_respects_data():
    """n_distinct() accepts a concrete column (df.x) inside summarise()."""
    frame = tibble(x=42)
    wanted = tibble(n=1)

    summarised = frame >> summarise(n=n_distinct(frame.x))

    assert summarised.equals(wanted)