def input_converter():
    """Return a local conversion context for Python -> rpy2 objects.

    The context combines the default converter with the pandas and numpy
    converters, in that order.
    """
    combined_rules = (
        default_converter
        + pandas2ri.converter
        + numpy2ri.converter
    )
    return localconverter(combined_rules)
def R2pd(self, rdf):
    """Convert the R object ``rdf`` to Python with the pandas converter active."""
    rules = RO.default_converter + pandas2ri.converter
    with localconverter(rules):
        return RO.conversion.rpy2py(rdf)
def numpy_conversion():
    """Yield once while the default + numpy conversion rules are active."""
    rules = robjects.default_converter + rpyn.converter
    with conversion.localconverter(rules):
        yield
def test_ri2pandas(self):
    """Row names of an R data.frame must match the converted pandas index."""
    r_frame = robjects.r('data.frame(a=1:2, '
                         ' row.names=c("a", "b"))')
    rules = default_converter + rpyp.converter
    with localconverter(rules) as conv:
        converted = conv.rpy2py(r_frame)
    name_pairs = zip(r_frame.rownames, converted.index)
    assert all(r_name == py_name for r_name, py_name in name_pairs)
def test_series_int(self, dtype):
    """A Series of ``range(5)`` built with ``dtype`` converts to IntSexpVector."""
    labels = ['a', 'b', 'c', 'd', 'e']
    series = pandas.core.series.Series(range(5), index=labels, dtype=dtype)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2rpy(series)
    assert isinstance(r_vector, rinterface.IntSexpVector)
def test_object2String_with_None(self):
    """An object Series of strings containing None converts to StrSexpVector."""
    values = [None, "a", "b", "c", "a"]
    obj_series = pandas.Series(values, dtype="O")
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2rpy(obj_series)
    assert isinstance(r_vector, rinterface.StrSexpVector)
def test_category2Factor(self):
    """A categorical Series converts to an R FactorVector."""
    values = ["a", "b", "c", "a"]
    cat_series = pandas.Series(values, dtype="category")
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_factor = robjects.conversion.py2rpy(cat_series)
    assert isinstance(r_factor, robjects.vectors.FactorVector)
def get_r_df(y_tr):
    """Convert ``y_tr`` to its rpy2 counterpart with the pandas converter active."""
    rules = ro.default_converter + pandas2ri.converter
    with localconverter(rules):
        return ro.conversion.py2rpy(y_tr)
def get_brms_data(dataset_name: str):
    """Fetch a dataset shipped with brms and wrap it in a pandas DataFrame."""
    rules = default_converter + pandas2ri.converter + numpy2ri.converter
    with localconverter(rules):
        fetched = rpackages.data(brms).fetch(dataset_name)
        return pd.DataFrame(fetched[dataset_name])
def glm_gam_poi(
    adata: AnnData,
    *,
    groupby: str,
    contrasts: Sequence[Tuple[Sequence[str], Sequence[str]]],
    cofactors: Sequence[str] = None,
    layer: Optional[str] = None,
    subsample_disp=2000,
    n_cores_per_job: int = 4,
    n_jobs: int = 4,
):
    """
    Perform DE analysis using glmGamPoi.

    Requires that an R installation and the following packages are available

        glmGamPoi
        BiocParallel
        RhpcBLASctl

    Install them with `conda install bioconductor-glmgampoi
    bioconductor-biocparallel r-rhpcblasctl`. The embedded R snippet also
    loads `dplyr`.

    Parameters
    ----------
    adata
        annotated data matrix
    groupby
        The column in adata.obs to test for DE
    contrasts
        List of tuples with tests to perform, e.g.
        `[('A', 'B'), (('A', 'B'), ('C', 'D','E'))]` which is equivalent to
        `[(('A', ), ('B', )), (('A', 'B'), ('C', 'D','E'))]`
    cofactors
        Additional columns to include into the model
    layer
        layer in adata that contains raw counts. If None, use `X`.
    subsample_disp
        Subsample cells to this number during estimation of overdispersion.
    n_cores_per_job
        Number of cores to run per job (including BLAS parallelization)
    n_jobs
        Number of tests to run in parallel.

    Returns
    -------
    The R object `de_res` (row-bound results of `glmGamPoi::test_de`, one
    chunk per contrast) converted with the numpy and pandas converters.

    Raises
    ------
    ImportError
        If rpy2 or one of the required R packages cannot be imported.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri, numpy2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2 import robjects as ro
    except ImportError as err:
        raise ImportError("glmGamPoi requires rpy2 to be installed.") from err

    try:
        base = importr("base")
        glm = importr("glmGamPoi")
        stats = importr("stats")
        blasctl = importr("RhpcBLASctl")
        bcparallel = importr("BiocParallel")
    except ImportError as err:
        raise ImportError(
            "GlmGamPoi requires a valid R installation with the following packages: "
            "glmGamPoi, BiocParallel, RhpcBLASctl") from err

    blasctl.blas_set_num_threads(n_cores_per_job)
    blasctl.omp_set_num_threads(n_cores_per_job)

    logging.info("Preparing R objects")

    # Remember the caller's column names: adata.obs has not been renamed, so
    # it has to be indexed with the raw names, not the normalised ones.
    raw_columns = [groupby] + ([] if cofactors is None else list(cofactors))

    # Define model formula. Joining all terms at once avoids the dangling
    # "+" the formula would otherwise end with when there are no cofactors.
    cofactors = [] if cofactors is None else list(_make_names(cofactors))
    groupby = _make_names([groupby])[0]
    model = "~ 0 + " + " + ".join([groupby] + cofactors)

    contrasts = _fix_contrasts(contrasts, groupby)
    bcparallel.register(bcparallel.MulticoreParam(n_jobs))

    # Work on a copy so the column assignments below do not write into a
    # slice of adata.obs.
    tmp_obs = adata.obs.loc[:, raw_columns].copy()
    # NOTE(review): assumes _make_names maps names element-wise, so the
    # renamed columns equal `[groupby] + cofactors` -- confirm.
    tmp_obs.columns = _make_names(tmp_obs.columns)
    for col in tmp_obs.columns:
        if not is_numeric_dtype(tmp_obs[col]):
            tmp_obs[col] = _make_names(tmp_obs[col])
    with localconverter(ro.default_converter + pandas2ri.converter):
        obs_r = ro.conversion.py2rpy(tmp_obs)

    # Genes in rows, cells in columns.
    expr = adata.X if layer is None else adata.layers[layer]
    expr = expr.T.toarray() if issparse(expr) else expr.T
    expr = pd.DataFrame(expr)
    expr.index = adata.var_names
    expr.columns = adata.obs_names
    with localconverter(ro.default_converter + pandas2ri.converter):
        expr_r = ro.conversion.py2rpy(expr)
    # convert as dataframe and then convert to matrix - didn't keep rownames otherwise.
    expr_r = base.as_matrix(expr_r)

    design = stats.model_matrix(stats.as_formula(model), data=obs_r)

    # Each contrast compares the mean of the second tuple against the mean
    # of the first one.
    contrasts = [
        f'({"+".join(b)}) / {len(b)} - ({"+".join(a)}) / {len(a)}'
        for a, b in contrasts
    ]

    logging.info("Fitting GLM")
    fit = glm.glm_gp(expr_r, design=design, subsample=subsample_disp)

    ro.globalenv["fit"] = fit
    ro.globalenv["contrasts"] = contrasts
    ro.r("""
        library(dplyr)
        de_res = BiocParallel::bplapply(contrasts, function(contrast) {
            glmGamPoi::test_de(fit, contrast) %>% mutate(comparision = contrast)
        }) %>% bind_rows()
        """)

    with localconverter(ro.default_converter + numpy2ri.converter +
                        pandas2ri.converter):
        return ro.conversion.rpy2py(ro.globalenv["de_res"])
def mast(
    adata: AnnData,
    *,
    groupby: str,
    groups: Union[Literal["all"], Sequence[str]],
    cofactors: Sequence[str] = None,
    layer: Optional[str] = None,
    n_cores_per_job: int = 4,
    n_jobs: int = 4,
):
    """
    Perform DE analysis using MAST.

    Requires that an R installation and the following packages are available

        MAST
        BiocParallel

    Install them with `conda install bioconductor-mast bioconductor-biocparallel`.
    The embedded R snippet also loads `dplyr`.

    Parameters
    ----------
    adata
        annotated data matrix. X must contain normalized and log-transformed values.
    groupby
        The column in adata.obs to test for DE
    groups
        Currently not used by this function: every value found in `groupby`
        is tested against the rest.
    cofactors
        Additional columns to include into the model
    layer
        layer in adata to use instead of `X`. If None, use `X`.
    n_cores_per_job
        Number of cores to run per job (set as `mc.cores` inside each job)
    n_jobs
        Number of tests to run in parallel.

    Returns
    -------
    The R object `de_res` converted with the pandas converter, with the
    `is_group_` prefix removed from its `comparison` column.

    Raises
    ------
    ImportError
        If rpy2, anndata2ri or one of the required R packages cannot be imported.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2 import robjects as ro
        import anndata2ri
    except ImportError as err:
        raise ImportError(
            "MAST requires rpy2 and anndata2ri to be installed.") from err

    try:
        mast_pkg = importr("MAST")
        bcparallel = importr("BiocParallel")
    except ImportError as err:
        raise ImportError(
            "MAST requires a valid R installation with the following packages: "
            "MAST, BiocParallel") from err

    bcparallel.register(bcparallel.MulticoreParam(n_jobs))

    logging.info("Preparing AnnData")
    tmp_adata = AnnData(
        X=adata.X if layer is None else adata.layers[layer],
        # Copy obs so the renaming and the helper columns added below can
        # never end up in the caller's adata.obs.
        obs=adata.obs.copy(),
        var=adata.var,
    )
    tmp_adata.obs.columns = _make_names(tmp_adata.obs.columns)
    # The obs columns have just been renamed, so the grouping column has to
    # be looked up under its normalised name from here on.
    groupby = _make_names([groupby])[0]
    tmp_adata.obs[groupby] = _make_names(tmp_adata.obs[groupby])

    # One boolean indicator column per group; each one is tested separately.
    contrasts = []
    for group in tmp_adata.obs[groupby].unique():
        indicator = f"is_group_{group}"
        contrasts.append(indicator)
        tmp_adata.obs[indicator] = tmp_adata.obs[groupby] == group

    logging.info("Preparing R objects")
    with localconverter(anndata2ri.converter):
        sce = ro.conversion.py2rpy(tmp_adata)
    sca = mast_pkg.SceToSingleCellAssay(sce)

    cofactor_formula = ("" if cofactors is None else "+ " +
                        " + ".join(_make_names(cofactors)))

    logging.info("Running MAST")
    ro.globalenv["cpus_per_thread"] = n_cores_per_job
    ro.globalenv["contrasts"] = contrasts
    ro.globalenv["cofactor_formula"] = cofactor_formula
    ro.globalenv["sca"] = sca
    ro.r("""
        library(dplyr)
        de_res = bplapply(contrasts, function(model_col) {
            op = options(mc.cores=cpus_per_thread)
            on.exit(options(op))
            contrast_to_test = paste0(model_col, "TRUE")
            fit = zlm(as.formula(paste0("~", model_col, cofactor_formula)), sca)
            res = summary(fit, doLRT=contrast_to_test)$datatable
            merge(
                res[contrast==contrast_to_test & component=='H', .(primerid, `Pr(>Chisq)`)], #P-vals
                res[contrast==contrast_to_test & component=='logFC', .(primerid, coef)],
                by='primerid'
            ) %>% mutate(comparison=model_col)
        }) %>% bind_rows()
        """)

    with localconverter(ro.default_converter + pandas2ri.converter):
        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])

    de_res["comparison"] = de_res["comparison"].str.replace("is_group_", "")
    return de_res
def edger(
    adata: AnnData,
    *,
    groupby: str,
    groups: Union[Literal["all"], Sequence[str]] = "all",
    cofactors: Sequence[str] = None,
    layer: Optional[str] = None,
    n_cores_per_job: int = 4,
    n_jobs: int = 4,
) -> pd.DataFrame:
    """
    Perform DE analysis using edgeR.

    Requires that an R installation and the following packages are available

        edgeR
        limma
        BiocParallel
        RhpcBLASctl

    Install them with `conda install bioconductor-edger
    bioconductor-biocparallel r-rhpcblasctl`. The embedded R snippet also
    loads `dplyr`.

    Parameters
    ----------
    adata
        annotated data matrix
    groupby
        The column in adata.obs to test for DE
    groups
        Subset of groups, e.g. `['g1', 'g2', 'g3']`, to which comparison
        shall be restricted, or `'all'` (default), for all groups.
    cofactors
        Additional columns to include into the model
    layer
        layer in adata that contains raw counts. If None, use `X`.
    n_cores_per_job
        Number of cores to run per job (including BLAS parallelization)
    n_jobs
        Number of tests to run in parallel.

    Returns
    -------
    DataFrame with differential expression results. The column
    `contrast_idx` holds the zero-based index of the contrast a row belongs to.

    Raises
    ------
    ImportError
        If rpy2 or one of the required R packages cannot be imported.
    ValueError
        If fewer than two groups remain after subsetting.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri, numpy2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2 import robjects as ro
    except ImportError as err:
        raise ImportError("edger requires rpy2 to be installed.") from err

    try:
        base = importr("base")
        edger_pkg = importr("edgeR")
        stats = importr("stats")
        limma = importr("limma")
        blasctl = importr("RhpcBLASctl")
        bcparallel = importr("BiocParallel")
    except ImportError as err:
        raise ImportError(
            "edgeR requires a valid R installation with the following packages: "
            "edgeR, limma, BiocParallel, RhpcBLASctl") from err

    # Set parallelism
    blasctl.blas_set_num_threads(n_cores_per_job)
    blasctl.omp_set_num_threads(n_cores_per_job)
    bcparallel.register(bcparallel.MulticoreParam(n_jobs))

    logging.info("Preparing R objects")

    # Subset on the caller's column name: at this point adata.obs has not
    # been renamed yet. The isinstance check keeps array-like `groups` from
    # being compared element-wise against "all".
    use_all_groups = isinstance(groups, str) and groups == "all"
    tmp_adata = (adata if use_all_groups else
                 adata[adata.obs[groupby].isin(groups), :]).copy()

    # Normalise names the same way the sibling DE functions do.
    cofactor_names = [] if cofactors is None else list(_make_names(cofactors))
    # No cofactors -> no trailing "+" in the formula.
    cofactor_formula = (f"+ {' + '.join(cofactor_names)}"
                        if cofactor_names else "")
    groupby = _make_names([groupby])[0]
    model = f"~ 0 + {groupby} {cofactor_formula}"

    tmp_adata.obs.columns = _make_names(tmp_adata.obs.columns)
    for col in tmp_adata.obs.columns:
        if not is_numeric_dtype(tmp_adata.obs[col]):
            tmp_adata.obs[col] = _make_names(tmp_adata.obs[col])

    groups = tmp_adata.obs[groupby].unique()
    if len(groups) < 2:
        raise ValueError("Need at least two groups to compare.")

    with localconverter(ro.default_converter + pandas2ri.converter):
        # The obs columns were renamed above, so select them by their
        # normalised names.
        obs_r = ro.conversion.py2rpy(
            tmp_adata.obs.loc[:, [groupby] + cofactor_names])
        # just need the index
        var_r = ro.conversion.py2rpy(
            pd.DataFrame({"gene_symbol": tmp_adata.var_names},
                         index=tmp_adata.var_names))

    # Genes in rows, cells in columns.
    expr = tmp_adata.X if layer is None else tmp_adata.layers[layer]
    expr = expr.T.toarray() if issparse(expr) else expr.T
    with localconverter(ro.default_converter + numpy2ri.converter):
        expr_r = ro.conversion.py2rpy(expr)

    design = stats.model_matrix(stats.as_formula(model), data=obs_r)
    dge = edger_pkg.DGEList(counts=expr_r, samples=obs_r, genes=var_r)

    def _rest_vs_group(group):
        # Mean of all other groups minus the group itself.
        others = "+".join(f"{groupby}{g}" for g in groups if g != group)
        return f"({others}) / {len(groups) - 1} - {groupby}{group}"

    contrasts_r = limma.makeContrasts(
        contrasts=[_rest_vs_group(group) for group in groups],
        levels=base.colnames(design),
    )

    logging.info("Calculating NormFactors")
    dge = edger_pkg.calcNormFactors(dge)

    logging.info("Estimating Dispersions")
    dge = edger_pkg.estimateDisp(dge, design=design)

    logging.info("Fitting linear model")
    fit = edger_pkg.glmQLFit(dge, design=design)

    ro.globalenv["fit"] = fit
    ro.globalenv["contrasts"] = contrasts_r
    ro.r("""
        library(dplyr)
        de_res = BiocParallel::bplapply(1:ncol(contrasts), function(i) {
            test = edgeR::glmQLFTest(fit, contrast=contrasts[, i])
            edgeR::topTags(test, n=Inf, adjust.method="BH")$table %>%
                mutate(contrast_idx = i - 1)
        }) %>% bind_rows()
        """)

    with localconverter(ro.default_converter + numpy2ri.converter +
                        pandas2ri.converter):
        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])

    # TODO fix this
    # de_res["group"] = [groups[i] for i in de_res["contrast_idx"]]
    return de_res
def test_dataframe_columnnames(self):
    """Column names containing spaces are kept by the pandas -> R conversion."""
    frame = pandas.DataFrame({'the one': [1, 2], 'the other': [3, 4]})
    # Convert to R
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_frame = robjects.conversion.py2rpy(frame)
    expected_names = ('the one', 'the other')
    assert tuple(r_frame.names) == expected_names
def test_series_obj_str(self, data, dtype):
    """A Series built from ``data`` with ``dtype`` converts to StrSexpVector."""
    labels = ['a', 'b', 'c']
    series = pandas.core.series.Series(data, index=labels, dtype=dtype)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2rpy(series)
    assert isinstance(r_vector, rinterface.StrSexpVector)
def pandas2R(df):
    """Convert ``df`` to an R object inside a local pandas conversion context.

    Uses ``localconverter`` rather than activating the pandas converter
    globally, as recommended by rpy2.
    """
    rules = robjects.default_converter + pandas2ri.converter
    with localconverter(rules):
        return robjects.conversion.py2rpy(df)
def r_to_py(object_):
    """Convert rpy2 DataFrames with the pandas converter; return anything else as is."""
    if not isinstance(object_, robjects.DataFrame):
        return object_
    with localconverter(pandas2ri.converter):
        return robjects.conversion.rpy2py(object_)
def test_series_obj_allnone(self):
    """A Series holding only None converts to BoolSexpVector."""
    labels = ['a', 'b', 'c']
    series = pandas.core.series.Series([None, None, None], index=labels)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2rpy(series)
    assert isinstance(r_vector, rinterface.BoolSexpVector)
def testObject2String(self):
    """An object Series of strings converts (via ``py2ro``) to a StrVector."""
    obj_series = pandas.Series(["a", "b", "c", "a"], dtype="O")
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        converted = robjects.conversion.py2ro(obj_series)
    self.assertEqual(robjects.vectors.StrVector, type(converted))
def test_orderedFactor2Category(self):
    """An ordered R factor converts to a pandas Categorical."""
    levels = ('a', 'b', 'a')
    r_factor = robjects.vectors.FactorVector(levels, ordered=True)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        converted = robjects.conversion.rpy2py(r_factor)
    assert isinstance(converted, pandas.Categorical)
def testOrderedFactor2Category(self):
    """An ordered R factor converts (via ``ri2py``) to a pandas Categorical."""
    levels = ('a', 'b', 'a')
    r_factor = robjects.vectors.FactorVector(levels, ordered=True)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        converted = robjects.conversion.ri2py(r_factor)
    self.assertEqual(pandas.Categorical, type(converted))
def test_categorywithNA2Factor(self):
    """A categorical Series with NaN converts to a factor whose NaN slot is NA."""
    values = ['a', 'b', 'c', numpy.nan]
    cat_series = pandas.Series(values, dtype='category')
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_factor = robjects.conversion.py2rpy(cat_series)
    assert isinstance(r_factor, robjects.vectors.FactorVector)
    # The NaN entry sits at position 3 of the input.
    assert r_factor[3] == rinterface.NA_Integer
def testCategory2Factor(self):
    """A categorical Series converts (via ``py2ro``) to a FactorVector."""
    cat_series = pandas.Series(["a", "b", "c", "a"], dtype="category")
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        converted = robjects.conversion.py2ro(cat_series)
    self.assertEqual(robjects.vectors.FactorVector, type(converted))
def test_series(self):
    """A Series of random floats converts to FloatSexpVector."""
    labels = ['a', 'b', 'c', 'd', 'e']
    series = pandas.core.series.Series(numpy.random.randn(5), index=labels)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2rpy(series)
    assert isinstance(r_vector, rinterface.FloatSexpVector)
def testSeries(self):
    """A Series of random floats converts (via ``py2ri``) to FloatSexpVector."""
    labels = ['a', 'b', 'c', 'd', 'e']
    series = pandas.core.series.Series(numpy.random.randn(5), index=labels)
    rules = default_converter + rpyp.converter
    with localconverter(rules):
        r_vector = robjects.conversion.py2ri(series)
    self.assertEqual(rinterface.FloatSexpVector, type(r_vector))
def R(self, line, cell=None, local_ns=None):
    """
    Execute code in R, optionally returning results to the Python runtime.

    In line mode, this will evaluate an expression and convert the returned
    value to a Python object.  The return value is determined by rpy2's
    behaviour of returning the result of evaluating the final expression.

    Multiple R expressions can be executed by joining them with
    semicolons::

        In [9]: %R X=c(1,4,5,7); sd(X); mean(X)
        Out[9]: array([ 4.25])

    In cell mode, this will run a block of R code. The resulting value
    is printed if it would be printed when evaluating the same code
    within a standard R REPL.

    Nothing is returned to python by default in cell mode::

        In [10]: %%R
           ....: Y = c(2,4,3,9)
           ....: summary(lm(Y~X))

        Call:
        lm(formula = Y ~ X)

        Residuals:
            1     2     3     4
         0.88 -0.24 -2.28  1.64

        Coefficients:
                    Estimate Std. Error t value Pr(>|t|)
        (Intercept)   0.0800     2.3000   0.035    0.975
        X             1.0400     0.4822   2.157    0.164

        Residual standard error: 2.088 on 2 degrees of freedom
        Multiple R-squared: 0.6993,Adjusted R-squared: 0.549
        F-statistic: 4.651 on 1 and 2 DF,  p-value: 0.1638

    In the notebook, plots are published as the output of the cell::

        %R plot(X, Y)

    will create a scatter plot of X vs Y.

    If cell is not None and line has some R code, it is prepended to
    the R code in cell.

    Objects can be passed back and forth between rpy2 and python via the
    -i -o flags in line::

        In [14]: Z = np.array([1,4,5,10])

        In [15]: %R -i Z mean(Z)
        Out[15]: array([ 5.])

        In [16]: %R -o W W=Z*mean(Z)
        Out[16]: array([  5.,  20.,  25.,  50.])

        In [17]: W
        Out[17]: array([  5.,  20.,  25.,  50.])

    The return value is determined by these rules:

    * If the cell is not None (i.e., has contents), the magic returns None.

    * If the final line results in a NULL value when evaluated
      by rpy2, then None is returned.

    * No attempt is made to convert the final value to a structured array.
      Use %Rget to push a structured array.

    * If the -n flag is present, there is no return value.

    * A trailing ';' will also result in no return value as the last
      value in the line is an empty string.
    """
    args = parse_argstring(self.R, line)

    # arguments 'code' in line are prepended to
    # the cell lines
    if cell is None:
        code = ''
        return_output = True
        line_mode = True
    else:
        code = cell
        return_output = False
        line_mode = False

    code = ' '.join(args.code) + code

    # if there is no local namespace then default to an empty dict
    if local_ns is None:
        local_ns = {}

    # Resolve the converter: local namespace first, then the user namespace.
    if args.converter is None:
        converter = self.converter
    else:
        try:
            converter = local_ns[args.converter]
        except KeyError:
            try:
                converter = self.shell.user_ns[args.converter]
            except KeyError:
                raise NameError(
                    "name '%s' is not defined" % args.converter
                )
        if not isinstance(converter, Converter):
            # Report the type of the offending object, not the type of the
            # `localconverter` helper.
            raise TypeError("'%s' must be a %s object (but it is a %s)."
                            % (args.converter, Converter, type(converter)))

    # Push the requested Python variables into R.
    if args.input:
        for input_name in ','.join(args.input).split(','):
            try:
                val = local_ns[input_name]
            except KeyError:
                try:
                    val = self.shell.user_ns[input_name]
                except KeyError:
                    raise NameError("name '%s' is not defined" % input_name)
            with localconverter(converter):
                ro.r.assign(input_name, val)

    if args.display:
        try:
            cell_display = local_ns[args.display]
        except KeyError:
            try:
                cell_display = self.shell.user_ns[args.display]
            except KeyError:
                raise NameError("name '%s' is not defined" % args.display)
    else:
        cell_display = CELL_DISPLAY_DEFAULT

    tmpd = self.setup_graphics(args)

    text_output = ''
    # Only the png/svg branch below assigns display_data; give it a value
    # up front so caching it later cannot fail for other devices.
    display_data = []
    try:
        if line_mode:
            for r_line in code.split(';'):
                text_result, result, visible = self.eval(r_line)
                text_output += text_result
            if text_result:
                # The last line printed something to the console so
                # we won't return it.
                return_output = False
        else:
            text_result, result, visible = self.eval(code)
            text_output += text_result
            if visible:
                with contextlib.ExitStack() as stack:
                    if self.cache_display_data:
                        stack.enter_context(
                            rpy2.rinterface_lib
                            .callbacks
                            .obj_in_module(rpy2.rinterface_lib
                                           .callbacks,
                                           'consolewrite_print',
                                           self.write_console_regular))
                    cell_display(result, args)
                    text_output += self.flush()

    except RInterpreterError as e:
        # TODO: Maybe we should make this red or something?
        print(e.stdout)
        if not e.stdout.endswith(e.err):
            print(e.err)
        raise
    finally:
        if self.device in ['png', 'svg']:
            ro.r('dev.off()')
        if text_output:
            displaypub.publish_display_data(
                data={'text/plain': text_output},
                source='RMagic.R')
        # publish the R images
        if self.device in ['png', 'svg']:
            display_data, md = self.publish_graphics(
                tmpd, args.isolate_svgs
            )

            for tag, disp_d in display_data:
                displaypub.publish_display_data(data=disp_d,
                                                source=tag,
                                                metadata=md)

        # kill the temporary directory - currently created only for "svg"
        # and "png" (else it's None)
        if tmpd:
            rmtree(tmpd)

    # Pull the requested R variables back into the IPython namespace.
    if args.output:
        with localconverter(converter):
            for output in ','.join(args.output).split(','):
                output_ipy = ro.globalenv.find(output)
                self.shell.push({output: output_ipy})

    # this will keep a reference to the display_data
    # which might be useful to other objects who happen to use
    # this method
    if self.cache_display_data:
        self.display_cache = display_data

    # We're in line mode and return_output is still True,
    # so return the converted result
    if return_output and not args.noreturn:
        if result is not ri.NULL:
            with localconverter(converter) as cv:
                res = cv.rpy2py(result)
            return res
def convert_pd_df_to_r(pd_df):
    """Convert ``pd_df`` to an R object with the pandas converter active."""
    rules = ro.default_converter + pandas2ri.converter
    with localconverter(rules):
        return ro.conversion.py2rpy(pd_df)
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# Handles to the R packages used below.
# NOTE(review): `importr`, `ro` and `pd` are not imported in this fragment;
# presumably they are imported earlier in the file -- confirm.
base = importr('base')
utils = importr('utils')

## only need to install once ##
# utils.install_packages('PlayerRatings')
# utils.chooseCRANmirror(ind=65)

pyPR = importr('PlayerRatings')

# Small example table with one row per game.
pd_df = pd.DataFrame({
    'Time Period': [1, 1, 1],
    'Player 1': [1, 2, 3],
    'Player 2': [2, 3, 1],
    'Result': [1, 0, 0]
})
print(pd_df)

# Convert the pandas frame to an R object inside a local conversion context.
with localconverter(ro.default_converter + pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(pd_df)
print(r_from_pd_df)

# Call PlayerRatings' `steph` on the converted frame.
sobj = pyPR.steph(r_from_pd_df, cval=8, hval=8)
#
# Define an R function `f` that runs prim.box on the first 1000 rows of the
# `quasiflow` data set and writes the resulting plot to rplot.jpg.
r('''
f <- function() {
    data(quasiflow)
    qf <- quasiflow[1:1000,1:3]
    qf.label <- quasiflow[1:1000,4]
    thr <- c(0.25, -0.3)
    qf.prim <- prim.box(x=qf, y=qf.label, threshold=thr, threshold.type=0)
    jpeg('rplot.jpg')
    plot(qf.prim)
    dev.off()
}
''')

from rpy2.robjects.conversion import localconverter
from rpy2.robjects import pandas2ri

# Convert `df` to an R object with the pandas converter active.
# NOTE(review): `df`, `response`, `robjects` and `pd` are defined outside
# this fragment -- confirm.
with localconverter(robjects.default_converter + pandas2ri.converter):
    qf = robjects.conversion.py2rpy(df)
print(qf)

# Histogram of the response values, computed by R's hist().
resp = robjects.FloatVector(response)
h=r.hist(resp)
# Convert the R result with the currently active converter and wrap it in a
# DataFrame for inspection.
hf = robjects.conversion.converter.rpy2py(h)
pf = pd.DataFrame(hf)
print(h)
print(pf)
print(pf.attrs)
#print(prim_response)
#thr = robjects.FloatVector([1.0,2.0])
#rprim = r['prim.box']
#prim_res = rprim(x=qf,y=prim_response,threshold=thr)
from scipy import stats

# The relative .RData paths below are resolved against this directory.
# NOTE(review): `os` and `np` are not imported in this fragment; presumably
# they are imported earlier in the file -- confirm.
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/Clim_GWAS_2"
)
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import pandas as pd
from rpy2.robjects import r
from rpy2.robjects import pandas2ri
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter

#Get the snp data
ro.r('load("AllPlantings_Corrected_SNPs_unique.RData")')  #Load the file
# Fetch the R object `snps` while the pandas converter is active.
with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_snps = r('snps')
# Drop the first column; the slice's end bound lies past the last column,
# so every remaining column is kept.
snps = pd_snps.iloc[:, 1:len(pd_snps.columns) + 1]
SNP_data = np.array(snps)

#get the phenotype data
ro.r('load("Phenotypes_dtbfixed.RData")')
# Fetch the R object `phenotypes` while the pandas converter is active.
with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_phenotypes = r('phenotypes')
# Only the second column (position 1) is used.
Pheno_data = np.array(
    pd_phenotypes.iloc[:, 1])  #Days-to-bolting phenotypes only

#include the kinship matrix
def conversion_rpy2py_local(conv_mod: ConversionModule,
                            dataset: Callable[[], Sexp]) -> Any:
    """Call ``dataset`` while default + ``conv_mod`` conversion rules are active."""
    # Needs default_converter to e.g. call `as` on a SummarizedExperiment:
    # Calling a R function returning a S4 object requires py2rpy[RS4], py2rpy[str], …
    rules = default_converter + conv_mod.converter
    with localconverter(rules):
        result = dataset()
    return result