def call_DESeq2(self, count_data, samples, conditions):
        """Run a DESeq2 analysis through rpy2 and return the results table.

        @count_data is a DataFrame of counts with 'samples' as the column names.
        @samples and @conditions are parallel lists; @conditions holds the
        factor that is contrasted on.

        The design formula is fixed to "~ condition" and the contrast that is
        extracted is condition level "c" against level "base".
        Returns the DESeq2 results converted to a pandas DataFrame.
        """
        import rpy2.robjects as robjects
        import rpy2.robjects.numpy2ri as numpy2ri
        import mbf_r

        r = robjects.r

        # Flatten the counts row by row; R rebuilds the matrix with byrow=True,
        # which restores the original orientation.
        counts = np.array(count_data.values)
        n_rows, n_cols = counts.shape
        flat_counts = counts.reshape(counts.size)
        r_counts = r.matrix(
            numpy2ri.py2rpy(flat_counts), nrow=n_rows, ncol=n_cols, byrow=True
        )

        # The sample names only serve as a temporary index; after the reset
        # just the "condition" column is handed over to R.
        sample_sheet = pd.DataFrame({"sample": samples, "condition": conditions})
        sample_sheet = sample_sheet.set_index("sample").reset_index(drop=True)
        r_col_data = mbf_r.convert_dataframe_to_r(
            pd.DataFrame(sample_sheet.to_dict("list"))
        )

        data_set = r("DESeqDataSetFromMatrix")(
            countData=r_counts,
            colData=r_col_data,
            design=robjects.Formula("~ condition"),
        )
        fitted = r("DESeq")(data_set)
        res = r("results")(fitted, contrast=r("c")("condition", "c", "base"))
        return mbf_r.convert_dataframe_from_r(r("as.data.frame")(res))
 def test_categorical_ordered(self):
     """Round-trip an ordered categorical column through R and back."""
     original = pd.DataFrame(
         {"a": pd.Categorical(["a", "b", "c", "a"], ordered=True)}
     )
     round_tripped = convert_dataframe_from_r(convert_dataframe_to_r(original))
     # the comparison expects string row labels on the frame coming back
     original.index = list(map(str, original.index))
     assert_frame_equal(original, round_tripped)
Example #3
0
    def call_edgeR(self, df_counts: DataFrame) -> DataFrame:
        """
        Compute log2 TMM-normalized CPM values for raw counts via edgeR.

        The counts and a per-sample table (library size, optional group and
        batch) are handed to edgeR's DGEList, normalization factors are
        estimated with calcNormFactors, and log-CPM values are derived with
        a prior count of 5. If ``self.batch`` is set, removeBatchEffect is
        applied to the log-transformed values afterwards.

        Parameters
        ----------
        df_counts : DataFrame
            The dataframe containing the raw counts, one column per sample.

        Returns
        -------
        DataFrame
            The normalized values with a fresh integer index and the same
            column labels as ``df_counts``.
        """
        ro.r("library(edgeR)")
        ro.r("library(base)")
        original_columns = df_counts.columns
        sample_info = {"lib.size": df_counts.sum(axis=0).values}
        if self.samples_to_group is not None:
            # NOTE(review): groups are emitted in the iteration order of
            # samples_to_group - presumably that matches the column order of
            # df_counts; confirm against the caller.
            sample_info["group"] = [
                self.samples_to_group[name] for name in self.samples_to_group
            ]
        if self.batch is not None:
            sample_info["batch"] = self.batch
        df_samples = pd.DataFrame(sample_info)
        df_samples["lib.size"] = df_samples["lib.size"].astype(int)

        r_counts = mbf_r.convert_dataframe_to_r(df_counts)
        r_samples = mbf_r.convert_dataframe_to_r(df_samples)
        dge = ro.r("DGEList")(counts=r_counts, samples=r_samples)
        # calcNormFactors defaults to TMM
        dge = ro.r("calcNormFactors")(dge)

        # apparently removeBatchEffects works better on log2-transformed values
        log_cpm = ro.r("""function(y){
                cpm(y, log=TRUE, prior.count=5)
                }""")
        logtmm = log_cpm(dge)

        if self.batch is not None:
            r_batches = numpy2ri.py2rpy(np.array(self.batch))
            remove_batch = ro.r("""
                function(logtmm, batch) {
                    tmm = removeBatchEffect(logtmm,batch=batch)
                }
                """)
            logtmm = remove_batch(logtmm=logtmm, batch=r_batches)

        result = mbf_r.convert_dataframe_from_r(ro.r("data.frame")(logtmm))
        result = result.reset_index(drop=True)
        result.columns = original_columns
        return result
 def test_simple_from(self):
     """Convert a one-column R data.frame and compare it to the expected frame."""
     r_frame = ro.r("""data.frame(a=c(0.1, 0.2, 0.3))""")
     converted = convert_dataframe_from_r(r_frame)
     # expected: same values, row labels '1'..'3' as strings, unnamed index
     expected = (
         pd.DataFrame({"a": [0.1, 0.2, 0.3], "idx": ["1", "2", "3"]})
         .set_index("idx")
         .rename_axis(None)
     )
     assert_frame_equal(converted, expected)
 def test_index_gets_dropped(self):
     """Round-trip a frame with a named index; only the index name may differ."""
     frame = pd.DataFrame(
         {"a": [0.1, 0.2, 0.3], "idx": ["a", "b", "c"]}
     ).set_index("idx")
     back = convert_dataframe_from_r(convert_dataframe_to_r(frame))
     # the index name is cleared on the input before comparing
     frame.index.name = None
     assert_frame_equal(frame, back)
    def edgeR_comparison(
        self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
    ):
        """Call edgeR exactTest comparing two groups.

        @df holds the counts, @columns_a / @columns_b are lists of column
        names making up the two groups (they may differ in length).
        @library_sizes optionally overrides the per-sample library sizes,
        which otherwise are the column sums of the selected counts.
        @manual_dispersion_value is only used when both groups consist of a
        single sample (no replicates); its square is passed as the dispersion.

        Returns the topTags table converted to a pandas DataFrame.
        Resulting dataframe is in df order.
        """
        import mbf_r
        import math
        import rpy2.robjects as ro
        import rpy2.robjects.numpy2ri as numpy2ri

        ro.r("library(edgeR)")
        input_df = df[columns_a + columns_b]
        input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]
        if library_sizes is not None:  # pragma: no cover
            samples = pd.DataFrame({"lib.size": library_sizes})
        else:
            samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})
        # The count columns are ordered a-then-b, so the labels have to follow
        # the same order *and* the same group sizes: the first len(columns_a)
        # samples are "z", the remaining len(columns_b) samples are "x".
        # (Using the sizes the other way round mislabels samples whenever the
        # groups differ in length.)
        # This looks like it inverts the groups, but it doesn't: exactTest
        # compares the second group level against the first, so "z" (a) is
        # reported relative to "x" (b).
        samples.insert(0, "group", ["z"] * len(columns_a) + ["x"] * len(columns_b))
        r_counts = mbf_r.convert_dataframe_to_r(input_df)
        r_samples = mbf_r.convert_dataframe_to_r(samples)
        y = ro.r("DGEList")(
            counts=r_counts,
            samples=r_samples,
            **{
                "lib.size": ro.r("as.vector")(
                    numpy2ri.py2rpy(np.array(samples["lib.size"]))
                )
            },
        )
        # apply TMM normalization
        y = ro.r("calcNormFactors")(y)
        if len(columns_a) == 1 and len(columns_b) == 1:  # pragma: no cover
            # not currently used.
            #
            # You are attempting to estimate dispersions without any replicates.
            # Since this is not possible, there are several inferior workarounds
            # to come up with something still semi-useful.
            # 1. pick a reasonable dispersion value from "Experience": 0.4 for
            #    humans, 0.1 for genetically identical model organisms, 0.01 for
            #    technical replicates. We'll try this for now.
            # 2. estimate dispersions on a number of genes that you KNOW to be
            #    not differentially expressed.
            # 3. In case of multiple factor experiments, discard the least
            #    important factors and treat the samples as replicates.
            # 4. just use logFC and forget about significance.
            e = ro.r("exactTest")(y, dispersion=math.pow(manual_dispersion_value, 2))
        else:
            with_dispersion = ro.r("estimateDisp")(y, robust=True)
            e = ro.r("exactTest")(with_dispersion)
        # sort.by="none" keeps the rows in input order
        res = ro.r("topTags")(e, n=len(input_df), **{"sort.by": "none"})
        result = mbf_r.convert_dataframe_from_r(res[0])
        return result
    def edgeR_comparison(
        self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
    ):
        """Call edgeR (glmFit / glmLRT) for a paired comparison of two groups.

        @columns_a and @columns_b must have the same length; the i-th column
        of each group is treated as one pair (design "~pairs+group").
        @library_sizes optionally replaces the column sums as library sizes.
        @manual_dispersion_value is accepted but not used here.

        Raises ValueError if the groups differ in length.
        Resulting dataframe is in df order.
        """
        import mbf_r
        import rpy2.robjects as ro
        import rpy2.robjects.numpy2ri as numpy2ri

        n_pairs = len(columns_a)
        if n_pairs != len(columns_b):
            raise ValueError("paired requires equal length groups")

        ro.r("library(edgeR)")
        counts = df[columns_a + columns_b]
        counts.columns = ["X_%i" % i for i in range(len(counts.columns))]
        if library_sizes is None:
            samples = pd.DataFrame({"lib.size": counts.sum(axis=0)})
        else:  # pragma: no cover
            samples = pd.DataFrame({"lib.size": library_sizes})
        # remember, edgeR does b-a not a-b...
        group_labels = ["z"] * len(columns_b) + ["y"] * n_pairs
        # both groups share the pair ids 0..n_pairs-1, as strings
        pair_ids = [str(i) for i in range(n_pairs)] * 2
        samples.insert(0, "group", group_labels)
        samples.insert(1, "pairs", pair_ids)

        r_counts = mbf_r.convert_dataframe_to_r(counts)
        r_samples = mbf_r.convert_dataframe_to_r(samples)
        design = ro.r("model.matrix")(ro.r("~pairs+group"), data=r_samples)
        make_dge = ro.r("DGEList")
        lib_sizes = ro.r("as.vector")(numpy2ri.py2rpy(np.array(samples["lib.size"])))
        dge = make_dge(
            counts=r_counts, samples=r_samples, **{"lib.size": lib_sizes}
        )
        # apply TMM normalization
        dge = ro.r("calcNormFactors")(dge)
        with_dispersion = ro.r("estimateDisp")(dge, design, robust=True)
        fit = ro.r("glmFit")(with_dispersion, design)
        lrt = ro.r("glmLRT")(fit)
        top = ro.r("topTags")(lrt, n=len(counts), **{"sort.by": "none"})
        return mbf_r.convert_dataframe_from_r(top[0])
    def _get_tuch_data(self):
        import mbf_sampledata
        import mbf_r
        import rpy2.robjects as ro

        path = mbf_sampledata.get_sample_path("mbf_comparisons/TuchEtAlS1.csv")
        # directly from the manual.
        # plus minus """To make
        # this file, we downloaded Table S1 from Tuch et al. [39], deleted some unnecessary columns
        # and edited the column headings slightly:"""
        ro.r("""load_data = function(path) {
                rawdata <- read.delim(path, check.names=FALSE, stringsAsFactors=FALSE)
                library(edgeR)
                y <- DGEList(counts=rawdata[,3:8], genes=rawdata[,1:2])
                library(org.Hs.eg.db)
                idfound <- y$genes$idRefSeq %in% mappedRkeys(org.Hs.egREFSEQ)
                y <- y[idfound,]
                egREFSEQ <- toTable(org.Hs.egREFSEQ)
                m <- match(y$genes$idRefSeq, egREFSEQ$accession)
                y$genes$EntrezGene <- egREFSEQ$gene_id[m]
                egSYMBOL <- toTable(org.Hs.egSYMBOL)
                m <- match(y$genes$EntrezGene, egSYMBOL$gene_id)
                y$genes$Symbol <- egSYMBOL$symbol[m]

                o <- order(rowSums(y$counts), decreasing=TRUE)
                y <- y[o,]
                d <- duplicated(y$genes$Symbol)
                y <- y[!d,]

                cbind(y$genes, y$counts)
            }
""")
        df = mbf_r.convert_dataframe_from_r(ro.r("load_data")(str(path)))
        df.columns = [
            "idRefSeq",
            "nameOfGene",
            "EntrezGene",
            "Symbol",
            "8.N",
            "8.T",
            "33.N",
            "33.T",
            "51.N",
            "51.T",
        ]
        assert len(df) == 10519
        return df
 def test_simple(self):
     """Round-trip a single float column through R and back."""
     original = pd.DataFrame({"a": [0.1, 0.2, 0.3]})
     round_tripped = convert_dataframe_from_r(convert_dataframe_to_r(original))
     # the comparison expects string row labels on the frame coming back
     original.index = list(map(str, original.index))
     assert_frame_equal(original, round_tripped)