def call_edgeR(self, df_counts: DataFrame) -> DataFrame:
    """
    Compute log-scaled, TMM-normalized CPM values for raw counts using edgeR.

    The sample annotation (library size, optional group, optional batch)
    is assembled in pandas, handed to R via ``mbf_r``/rpy2, wrapped into a
    ``DGEList`` and normalized with ``calcNormFactors``. The normalized
    object is turned into log CPM values (``cpm(..., log=TRUE,
    prior.count=5)``). If ``self.batch`` is set, ``removeBatchEffect`` is
    applied to those log values afterwards.

    Parameters
    ----------
    df_counts : DataFrame
        The dataframe containing the raw counts, one column per sample.

    Returns
    -------
    DataFrame
        Log CPM values after TMM normalization (batch-corrected when
        ``self.batch`` is given). The index is reset to a plain range and
        the columns carry the column labels of ``df_counts``.

    Notes
    -----
    NOTE(review): the ``group`` annotation is collected by iterating over
    ``self.samples_to_group`` itself, not over the columns of
    ``df_counts`` -- this presumably relies on both having the same
    order and length; confirm against the caller.
    """
    ro.r("library(edgeR)")
    ro.r("library(base)")

    original_columns = df_counts.columns

    # Per-sample annotation table; the column order is lib.size, group, batch.
    sample_info = {"lib.size": df_counts.sum(axis=0).values}
    group_lookup = self.samples_to_group
    if group_lookup is not None:
        sample_info["group"] = [group_lookup[name] for name in group_lookup]
    if self.batch is not None:
        sample_info["batch"] = self.batch
    sample_table = pd.DataFrame(sample_info)
    sample_table["lib.size"] = sample_table["lib.size"].astype(int)

    dge = ro.r("DGEList")(
        counts=mbf_r.convert_dataframe_to_r(df_counts),
        samples=mbf_r.convert_dataframe_to_r(sample_table),
    )
    # calcNormFactors defaults to TMM normalization.
    dge = ro.r("calcNormFactors")(dge)

    to_log_cpm = ro.r("function(y){ cpm(y, log=TRUE, prior.count=5) }")
    log_values = to_log_cpm(dge)

    # Batch removal is done on the log-transformed values on purpose:
    # removeBatchEffect apparently works better on log2 data.
    if self.batch is not None:
        r_batch = numpy2ri.py2rpy(np.array(self.batch))
        strip_batch = ro.r(
            " function(logtmm, batch) { tmm = removeBatchEffect(logtmm,batch=batch) } "
        )
        log_values = strip_batch(logtmm=log_values, batch=r_batch)

    r_frame = ro.r("data.frame")(log_values)
    result = mbf_r.convert_dataframe_from_r(r_frame).reset_index(drop=True)
    result.columns = original_columns
    return result
def edgeR_comparison(
    self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
):
    """Call edgeR exactTest comparing two (unpaired) groups.

    Parameters
    ----------
    df : DataFrame
        Raw counts; must contain every column named in ``columns_a`` and
        ``columns_b``.
    columns_a, columns_b : list of str
        Column names of the two groups. They are concatenated with ``+``,
        so both need to be lists.
    library_sizes : sequence, optional
        Explicit library sizes, one per selected column in
        ``columns_a + columns_b`` order. Defaults to the column sums of
        the selected counts.
    manual_dispersion_value : float
        Only used when both groups consist of a single sample, where no
        dispersion can be estimated. The squared value is passed to
        ``exactTest`` as the dispersion.

    Returns
    -------
    DataFrame
        The ``topTags`` table for all rows, requested unsorted
        (``sort.by="none"``), i.e. in the row order of ``df``.

    Notes
    -----
    Samples from ``columns_a`` are labelled ``"z"`` and samples from
    ``columns_b`` are labelled ``"x"``.
    NOTE(review): R sorts factor levels alphabetically, so ``"x"``
    (``columns_b``) is presumably the baseline and logFC reads as a
    relative to b -- confirm against the edgeR documentation.
    """
    import math

    import mbf_r
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri as numpy2ri

    ro.r("library(edgeR)")

    input_df = df[columns_a + columns_b]
    # R-safe placeholder column names.
    input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]

    if library_sizes is not None:  # pragma: no cover
        samples = pd.DataFrame({"lib.size": library_sizes})
    else:
        samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})

    # The labels must follow the column order of input_df (a first, then b).
    # The previous code used len(columns_b) for the first run of labels,
    # which put samples into the wrong group whenever the two groups
    # differed in size. The "z"/"x" naming looks like it inverts the
    # comparison, but it does not.
    samples.insert(0, "group", ["z"] * len(columns_a) + ["x"] * len(columns_b))

    r_counts = mbf_r.convert_dataframe_to_r(input_df)
    r_samples = mbf_r.convert_dataframe_to_r(samples)
    y = ro.r("DGEList")(
        counts=r_counts,
        samples=r_samples,
        **{
            "lib.size": ro.r("as.vector")(
                numpy2ri.py2rpy(np.array(samples["lib.size"]))
            )
        },
    )
    # apply TMM normalization
    y = ro.r("calcNormFactors")(y)

    if len(columns_a) == 1 and len(columns_b) == 1:  # pragma: no cover
        # Not currently used.
        #
        # Dispersions cannot be estimated without any replicates. There
        # are several inferior workarounds that still give something
        # semi-useful:
        #   1. Pick a reasonable dispersion value from "experience":
        #      0.4 for humans, 0.1 for genetically identical model
        #      organisms, 0.01 for technical replicates.
        #      This is what is done here.
        #   2. Estimate dispersions on a number of genes that you KNOW to
        #      be not differentially expressed.
        #   3. In case of multiple factor experiments, discard the least
        #      important factors and treat the samples as replicates.
        #   4. Just use logFC and forget about significance.
        e = ro.r("exactTest")(y, dispersion=math.pow(manual_dispersion_value, 2))
    else:
        z = ro.r("estimateDisp")(y, robust=True)
        e = ro.r("exactTest")(z)

    res = ro.r("topTags")(e, n=len(input_df), **{"sort.by": "none"})
    result = mbf_r.convert_dataframe_from_r(res[0])
    return result
def edgeR_comparison(
    self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
):
    """Paired two-group comparison using edgeR's GLM likelihood-ratio test.

    Sample ``i`` of ``columns_a`` is paired with sample ``i`` of
    ``columns_b``. The model ``~pairs+group`` is fitted with
    ``estimateDisp`` / ``glmFit`` and tested with ``glmLRT``.

    Parameters
    ----------
    df : DataFrame
        Raw counts containing all columns of both groups.
    columns_a, columns_b : list of str
        Column names of the two groups; both must have the same length.
    library_sizes : sequence, optional
        Explicit library sizes in ``columns_a + columns_b`` order.
        Defaults to the column sums of the selected counts.
    manual_dispersion_value : float
        Not used by this method.

    Returns
    -------
    DataFrame
        The ``topTags`` table for all rows, requested unsorted
        (``sort.by="none"``), i.e. in the row order of ``df``.

    Raises
    ------
    ValueError
        If the two groups differ in size.
    """
    import mbf_r
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri as numpy2ri

    n_pairs = len(columns_a)
    if n_pairs != len(columns_b):
        raise ValueError("paired requires equal length groups")

    ro.r("library(edgeR)")

    counts = df[columns_a + columns_b]
    counts.columns = ["X_%i" % i for i in range(len(counts.columns))]

    if library_sizes is None:
        sample_table = pd.DataFrame({"lib.size": counts.sum(axis=0)})
    else:  # pragma: no cover
        sample_table = pd.DataFrame({"lib.size": library_sizes})

    # remember, edgeR does b-a not a-b...
    # Both groups have n_pairs members (checked above).
    sample_table.insert(0, "group", ["z"] * n_pairs + ["y"] * n_pairs)
    # Pair ids run 0..n-1 for the first group and again 0..n-1 for the second.
    sample_table.insert(1, "pairs", [str(i) for i in range(n_pairs)] * 2)

    r_counts = mbf_r.convert_dataframe_to_r(counts)
    r_samples = mbf_r.convert_dataframe_to_r(sample_table)
    design = ro.r("model.matrix")(ro.r("~pairs+group"), data=r_samples)

    r_lib_sizes = ro.r("as.vector")(
        numpy2ri.py2rpy(np.array(sample_table["lib.size"]))
    )
    dge = ro.r("DGEList")(
        counts=r_counts,
        samples=r_samples,
        **{"lib.size": r_lib_sizes},
    )
    # apply TMM normalization
    dge = ro.r("calcNormFactors")(dge)
    dge = ro.r("estimateDisp")(dge, design, robust=True)
    fitted = ro.r("glmFit")(dge, design)
    test_result = ro.r("glmLRT")(fitted)

    tags = ro.r("topTags")(test_result, n=len(counts), **{"sort.by": "none"})
    return mbf_r.convert_dataframe_from_r(tags[0])
def call_DESeq2(self, count_data, samples, conditions):
    """Run DESeq2 on a count matrix and return its results table.

    Parameters
    ----------
    count_data : DataFrame
        Raw counts with one column per sample.
    samples : list
        Sample names, one per column of ``count_data``. They are only
        used while assembling the column data and are dropped again
        before the table is handed to R.
    conditions : list
        Condition label per sample; this is the factor that is
        contrasted on (design formula ``~ condition``).

    Returns
    -------
    DataFrame
        ``results(...)`` of the fitted DESeq2 object, converted through
        ``as.data.frame``.

    Notes
    -----
    The contrast is hard-coded as ``c("condition", "c", "base")``.
    NOTE(review): this presumably requires ``conditions`` to consist of
    the labels ``"c"`` and ``"base"`` -- confirm against the caller.
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri as numpy2ri
    import mbf_r

    # Flatten in row-major order and let R rebuild the matrix row by row.
    counts = np.array(count_data.values)
    n_rows, n_cols = counts.shape
    flat_counts = counts.reshape(counts.size)
    r_counts = robjects.r.matrix(
        numpy2ri.py2rpy(flat_counts), nrow=n_rows, ncol=n_cols, byrow=True
    )

    annotation = (
        pd.DataFrame({"sample": samples, "condition": conditions})
        .set_index("sample")
        .reset_index(drop=True)
    )
    design_formula = "~ condition"
    r_annotation = mbf_r.convert_dataframe_to_r(
        pd.DataFrame(annotation.to_dict("list"))
    )

    experiment = robjects.r("DESeqDataSetFromMatrix")(
        countData=r_counts,
        colData=r_annotation,
        design=robjects.Formula(design_formula),
    )
    experiment = robjects.r("DESeq")(experiment)

    contrast = robjects.r("c")("condition", "c", "base")
    r_results = robjects.r("results")(experiment, contrast=contrast)
    return mbf_r.convert_dataframe_from_r(robjects.r("as.data.frame")(r_results))
def test_categorical_ordered(self):
    """An ordered categorical column survives the pandas -> R -> pandas trip."""
    categories = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
    expected = pd.DataFrame({"a": categories})
    round_tripped = convert_dataframe_from_r(convert_dataframe_to_r(expected))
    # The frame coming back from R is expected to carry a string index.
    expected.index = list(map(str, expected.index))
    assert_frame_equal(expected, round_tripped)
def test_index_gets_dropped(self):
    """A named index comes back from the round trip without its name."""
    source = pd.DataFrame({"a": [0.1, 0.2, 0.3], 'idx': ['a', 'b', 'c']})
    expected = source.set_index('idx')
    round_tripped = convert_dataframe_from_r(convert_dataframe_to_r(expected))
    # Only the index name is expected to be lost.
    expected.index.name = None
    assert_frame_equal(expected, round_tripped)
def test_raise_on_funny(self):
    """Converting a column of tuples raises a ValueError naming the column."""
    unsupported = pd.DataFrame({"aaaa": [(1, 2), (3, 4)]})
    with pytest.raises(ValueError) as excinfo:
        convert_dataframe_to_r(unsupported)
    # Checked after the with-block so the assertion actually runs.
    message = str(excinfo.value)
    assert 'aaaa' in message
def test_simple(self):
    """A plain float column survives the pandas -> R -> pandas round trip."""
    expected = pd.DataFrame({"a": [0.1, 0.2, 0.3]})
    round_tripped = convert_dataframe_from_r(convert_dataframe_to_r(expected))
    # The frame coming back from R is expected to carry a string index.
    expected.index = list(map(str, expected.index))
    assert_frame_equal(expected, round_tripped)