def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2): """" Invoke a named "correlation" R metric and convert the R dataframe result into a Pandas dataframe. :param r_stream1: an r_stream object :param r_stream2: an r_stream object :param metric_function_name: the name of the set of metrics :return: """ R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric') # NOTE: Conversion of dataframes only works if you activate but we don't want conversion # NOTE: to always be automatic so we deactivate() after we're done converting. pandas2ri.activate() r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1) r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2) pandas2ri.deactivate() # TODO: Can we just activate/deactivate before/after R_function() without converting # TODO: r_evalresp1/2 ahead of time? # Calculate the metric r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2) r_dataframe = _R_metricList2DF(r_metriclist) pandas2ri.activate() df = pandas2ri.ri2py_dataframe(r_dataframe) pandas2ri.deactivate() # Convert columns from R POSIXct to pyton UTCDateTime df.starttime = df.starttime.apply(UTCDateTime) df.endtime = df.endtime.apply(UTCDateTime) return df
def mbpls_ade4(data, num_comp=20):
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri
    import time

    x1, x2, y = data
    pandas2ri.activate()  # activate easy conversion from pandas to R dataframes
    # push the dataframes into the R global environment
    robjects.globalenv['x1'] = pandas2ri.py2ri_pandasdataframe(x1)
    robjects.globalenv['x2'] = pandas2ri.py2ri_pandasdataframe(x2)
    robjects.globalenv['ref'] = pandas2ri.py2ri_pandasdataframe(y)
    start_time = time.time()
    # execute R code using the global environment variables defined above
    robjects.r('''
        library(ade4)
        library(adegraphics)
        dudiY.act <- dudi.pca(ref, center = TRUE, scale = TRUE, scannf = FALSE, nf = 1)
        ktabX.act <- ktab.list.df(list(mainPerformance = x1, mainInput = x2))
        resmbpls.act <- mbpls(dudiY.act, ktabX.act, scale = TRUE,
                              option = "none", scannf = FALSE, nf = ''' + str(num_comp) + ''')
        bip <- resmbpls.act$bip
        ''')
    return start_time
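# Hedged usage sketch (not from the original project): mbpls_ade4 expects `data`
# to be a (x1, x2, y) tuple of pandas DataFrames with the same number of rows, and
# requires the R packages ade4 and adegraphics. The toy frames below are
# illustrative placeholders only.
def _example_mbpls_ade4():
    import numpy as np
    import pandas as pd
    rows = 50
    x1 = pd.DataFrame(np.random.rand(rows, 5),
                      columns=['perf_' + str(i) for i in range(5)])
    x2 = pd.DataFrame(np.random.rand(rows, 3),
                      columns=['input_' + str(i) for i in range(3)])
    y = pd.DataFrame({'ref': np.random.rand(rows)})
    started = mbpls_ade4((x1, x2, y), num_comp=2)
    return started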
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2): """" Invoke a named "correlation" R metric and convert the R dataframe result into a Pandas dataframe. :param r_stream1: an r_stream object :param r_stream2: an r_stream object :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1 :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2 :return: """ R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric') # NOTE: Conversion of dataframes only works if you activate but we don't want conversion # NOTE: to always be automatic so we deactivate() after we're done converting. pandas2ri.activate() r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1) r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2) pandas2ri.deactivate() # TODO: Can we just activate/deactivate before/after R_function() without converting # TODO: r_evalresp1/2 ahead of time? # Calculate the metric r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2) r_dataframe = _R_metricList2DF(r_metriclist) pandas2ri.activate() df = pandas2ri.ri2py_dataframe(r_dataframe) pandas2ri.deactivate() # Convert columns from R POSIXct to pyton UTCDateTime df.starttime = df.starttime.apply(UTCDateTime) df.endtime = df.endtime.apply(UTCDateTime) return df
def getCorrelations(self, dataframe): """ Perform hierarchical clustering on a dataframe of expression values Arguments --------- dataframe: pandas.Core.DataFrame a dataframe containing gene IDs, sample IDs and gene expression values Returns ------- corr_frame: pandas.Core.DataFrame a dataframe of a pair-wise correlation matrix across samples. Uses the Pearson correlation. """ # set sample_id to index pivot = dataframe.pivot(index="sample_name", columns="transcript_id", values="TPM") transpose = pivot.T # why do I have to resort to R???? r_df = py2ri.py2ri_pandasdataframe(transpose) R.assign("p.df", r_df) R("""p.mat <- apply(p.df, 2, as.numeric)""") R("""cor.df <- cor(p.mat)""") r_cor = R["cor.df"] py_cor = py2ri.ri2py_dataframe(r_cor) corr_frame = py_cor return corr_frame
def pandas_to_dataframe(pd_frame, check_names=False):
    r_frame = pandas2ri.py2ri_pandasdataframe(pd_frame)
    r_frame.colnames = pd_frame.columns
    if not check_names:
        r_frame.rownames = pd_frame.index
    return r_frame
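# Hedged usage sketch (illustrative only): pandas_to_dataframe assumes pandas2ri is
# already imported in the calling module; the toy frame below is a placeholder.
def _example_pandas_to_dataframe():
    import pandas as pd
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]},
                      index=['r1', 'r2', 'r3'])
    r_df = pandas_to_dataframe(df)
    return r_df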
def to_trajr(trj): """Convert trajectory to R `trajr` object. Default fps is 30. Args: trajectory (:class:`~traja.main.TrajaDataFrame`): trajectory Returns: traj (:class:`rpy2.robjects.vectors.DataFrame`): column names are ['x', 'y', 'time', 'displacementTime', 'polar', 'displacement'] .. doctest:: >>> import traja >>> df = traja.TrajaDataFrame({'x':range(5),'y':range(5)}) >>> trjr_df = traja.rutils.to_trajr(df) # doctest: +SKIP >>> [x for x in trjr_df.names] # doctest: +SKIP ... ['x', 'y', 'id', 'time', 'displacementTime', 'polar', 'displacement'] """ from traja.trajectory import _get_time_col trajr = import_trajr() if "id" not in trj.__dict__.keys(): trj["id"] = 0 time_col = _get_time_col(trj) if time_col == "index": trj["time"] = trj.index time_col = "time" fps = trj.fps spatial_units = trj.spatial_units or "m" time_units = trj.time_units or "s" trj_rdf = rpandas.py2ri_pandasdataframe(trj) trajr_trj = trajr.TrajFromCoords( trj_rdf, xCol="x", yCol="y", timeCol=time_col or rpy2.rinterface.NULL, fps=fps or 30, spatialUnits=spatial_units, timeUnits=time_units, ) return trajr_trj
def LD_usingR(H0):
    import numpy as np
    import pandas as pd
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri

    robjects.r('options(warn=-1)')
    robjects.r('library(genetics)')
    # recode alleles (0 -> 'G', non-zero -> 'A') and collapse consecutive pairs of
    # haplotype rows into "A/G"-style genotype strings, one row per individual
    genotype = H0.applymap(lambda x: ('A', 'G')[x == 0])
    genotype = pd.concat(
        [pd.concat([genotype.iloc[i], genotype.iloc[i + 1]], axis=1).apply(lambda x: '/'.join(x), axis=1)
         for i in np.arange(0, H0.shape[0], 2)], axis=1).T
    c = robjects.r['LD'](robjects.r['makeGenotypes'](pandas2ri.py2ri_pandasdataframe(genotype.astype('str'))))
    c = pd.Series(map(lambda x: pd.DataFrame(pandas2ri.ri2py(x)), c[1:]), index=list(c.names[1:])).apply(
        lambda x: x.fillna(0))
    for x in c:
        x += x.T
        x.index = H0.columns
        x.columns = H0.columns
    c.apply(lambda x: np.fill_diagonal(x.values, None))
    return c
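# Hedged usage sketch (illustrative only): LD_usingR assumes H0 is a 0/1 haplotype
# DataFrame with an even number of rows (two consecutive rows per individual) and
# SNP identifiers as columns, and requires the R `genetics` package. The toy data
# below is a made-up placeholder.
def _example_LD_usingR():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    # 20 individuals -> 40 haplotype rows, 4 SNP columns
    H0 = pd.DataFrame(rng.randint(0, 2, size=(40, 4)),
                      columns=['snp1', 'snp2', 'snp3', 'snp4'])
    # pandas Series of symmetric LD matrices keyed by LD statistic
    ld_tables = LD_usingR(H0)
    return ld_tables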
def pythonWrapper4Pet(dataframe, snps, covars, trait1, trait2,
                      model1, scriptsdir, model2, resamples=999):
    '''
    This is just a Python wrapper around the R code for the PET calculations
    '''

    py2ri.activate()
    E.info("Checking regression models")
    if model1 == "logistic":
        R('''trait1.mod <- binomial''')
        R('''trait1.link <- "logit" ''')
    elif model1 == "linear":
        R('''trait1.mod <- gaussian''')
        R('''trait1.link <- "identity" ''')

    if model2 == "logistic":
        R('''trait2.mod <- binomial''')
        R('''trait2.link <- "logit" ''')
    elif model2 == "linear":
        R('''trait2.mod <- gaussian''')
        R('''trait2.link <- "identity" ''')

    E.info("Running {} regression for trait 1: {}".format(model1, trait1))
    E.info("Running {} regression for trait 2: {}".format(model2, trait2))

    R('''source("%(scriptsdir)s/PET_functions.R")''' % locals())
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    r_df = py2ri.py2ri_pandasdataframe(dataframe)
    R.assign("data.df", r_df)

    r_snps = ro.StrVector([sp for sp in snps])
    R.assign("snp.list", r_snps)

    E.info("Parsing covariates")
    covars = covars.split(",")
    r_covar = ro.StrVector([cv for cv in covars])
    R.assign("covar.list", r_covar)
    E.info("{} covariates found to adjust "
           "in regression models".format(len(covars)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNPs, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % locals())

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    py_out = py2ri.ri2py_dataframe(R["out.res"])

    return py_out
def R_var_importance(nsamples=40000, data_store=None):
    base = importr('base')

    ###################################################
    # load dataframe
    store = pd.HDFStore(data_store)
    print(store)
    #pandas2ri.activate()
    Xtrain = store['Xtrain']
    ytrain = store['ytrain']
    #Xtest, Xtrain, ytrain, Xval, yval, test_idx, val_idx = prepareAllFeatures()

    # sample
    if nsamples != -1:
        if isinstance(nsamples, str) and 'shuffle' in nsamples:
            print("Shuffle train data...")
            rows = np.random.choice(len(Xtrain.index), size=len(Xtrain.index), replace=False)
        else:
            rows = np.random.choice(len(Xtrain.index), size=nsamples, replace=False)
        print("unique rows: %6.2f" % (float(np.unique(rows).shape[0]) / float(rows.shape[0])))
        Xtrain = Xtrain.iloc[rows, :]
        ytrain = ytrain.iloc[rows]

    store.close()

    pandas2ri.activate()
    print(Xtrain.info())
    print(Xtrain.describe(include='all'))
    Xtrain_R = pandas2ri.py2ri_pandasdataframe(Xtrain)
    ytrain_R = pandas2ri.py2ri_pandasseries(ytrain)
    #print Xtrain_R

    ###################################################
    # R-code
    # http://stackoverflow.com/questions/27801409/get-field-values-from-rpy2-random-forest-object
    r = robjects.r
    r['options'](warn=-1)
    r.library('randomForest')
    rf = r.randomForest(Xtrain_R, ytrain_R, ntree=250, importance=True, do_trace=1)
    df_imp_R = rf.rx("importance")
    df_imp_R = base.as_data_frame(df_imp_R)
    df_imp = pandas2ri.ri2py(df_imp_R)
    df_imp = df_imp.sort(columns=['importance.IncNodePurity'], ascending=False)
    print(df_imp)
    with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
        print(list(df_imp.index))
    #print r.dimnames(rf[8])
    r.varImpPlot(rf, sort=True, n_var=30)
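# Hedged usage sketch (illustrative only): R_var_importance expects an HDF5 store
# containing 'Xtrain' (features) and 'ytrain' (target) keys, PyTables for the store,
# and the R randomForest package. The store path and toy data below are placeholders.
def _example_R_var_importance(store_path='features.h5'):
    import numpy as np
    import pandas as pd
    X = pd.DataFrame(np.random.rand(500, 4), columns=['f1', 'f2', 'f3', 'f4'])
    y = pd.Series(np.random.rand(500), name='target')
    with pd.HDFStore(store_path) as store:
        store['Xtrain'] = X
        store['ytrain'] = y
    R_var_importance(nsamples=200, data_store=store_path)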
def testColoc(trait1, trait2, trait1_type, trait2_type, maf_table,
              gene_list=None, trait1_prev=None, trait2_prev=None,
              chromosome=None, start=None, end=None):
    '''
    Perform colocalization testing between two traits.

    Arguments
    ------
    trait1: pandas.core.dataframe
      A data frame containing the summary statistics for trait 1

    trait2: pandas.core.dataframe
      A data frame containing the summary statistics for trait 2

    trait1_type: string
      Either `cc` or `quant`, denoting the type of trait 1

    trait2_type: string
      Either `cc` or `quant`, denoting the type of trait 2

    maf_table: pandas.core.dataframe
      Data frame containing SNP IDs and MAF

    gene_list: list
      A list of genes to restrict analysis to.  Either trait 1 or
      trait 2 must be a quantitative trait

    trait1_prev: float
      Prevalence of trait1 if binary

    trait2_prev: float
      Prevalence of trait2 if binary

    chromosome: int
      Chromosome to restrict the colocalisation analysis to

    start: int
      start co-ordinate to restrict analysis to.  Must also provide
      `chromosome`.  1-based index, closed [start, end]

    end: int
      end co-ordinate to restrict analysis to.  Must also provide
      `chromosome` and `start`.  1-based index, closed [start, end]

    Returns
    -------
    coloc_results: pandas.core.dataframe
      A data frame containing each region (e.g. genes) and the posterior
      probability in favour of each hypothesis:
      H0 - no association with trait1 or trait2, and no colocalisation
      H1 - association with trait 1, but no colocalisation
      H2 - association with trait2, but no colocalisation
      H3 - association with trait1 and 2, but no colocalisation
      H4 - association with trait1 and 2, and colocalised
    '''

    # push all elements into the R environment
    R('''sink(file="sink.text")''')
    R('''suppressPackageStartupMessages(library(coloc))''')
    R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/coloQtl.R")''')

    E.info("Pushing results tables into R environment")
    py2ri.activate()
    r_trait1 = py2ri.py2ri_pandasdataframe(trait1)
    R.assign("r.trait1", r_trait1)

    r_trait2 = py2ri.py2ri_pandasdataframe(trait2)
    R.assign("r.trait2", r_trait2)

    r_maf = py2ri.py2ri_pandasdataframe(maf_table)
    R.assign("r.mafs", r_maf)

    if trait1_prev:
        R.assign("trait1.prev", trait1_prev)
    else:
        R('''trait1.prev <- NULL''')
    if trait2_prev:
        R.assign("trait2.prev", trait2_prev)
    else:
        R('''trait2.prev <- NULL''')

    E.info("Checking for gene list")
    if gene_list:
        E.info("Gene list contains {} genes".format(len(set(gene_list))))
        r_genes = ro.StrVector([rx for rx in set(gene_list)])
        R.assign("gene.list", r_genes)
        E.info("Iterating over gene list")
        R('''res.df <- geneListSnpColocQtl(gene_list=gene.list,'''
          '''results_table=r.trait1, MAF_table=r.mafs, '''
          '''eqtl_table=r.trait2, trait_type="%(trait1_type)s", '''
          '''prev=trait1.prev)''' % locals())
        R('''genes <- rownames(res.df)''')
        genes = [gx for gx in R["genes"]]
    else:
        R('''res.df <- TwoTraitSnpColocQtl(trait1_table=r.trait1,'''
          '''trait2_table=r.trait2, MAF_table=r.mafs, '''
          '''trait1_type="%(trait1_type)s", trait2_type="%(trait2_type)s",'''
          '''prev1=trait1.prev, prev2=trait2.prev)''' % locals())
        R('''genes <- dim(res.df)[1]''')
        genes = R["genes"]

    coloc_results = py2ri.ri2py_dataframe(R["res.df"])
    coloc_results.index = genes
    coloc_results.columns = ["nSNPs", "H0.PP", "H1.PP",
                             "H2.PP", "H3.PP", "H4.PP"]

    R('''sink(file=NULL)''')

    return coloc_results
def pandas_df_to_exprset(
        phenoData, featureData,
        dfs: pd.DataFrame or [pd.DataFrame]) -> pd.DataFrame or list:
    # TODO: !!!IMPORTANT!!! methods.new("ExpressionSet", exprs=base.as_matrix(df)) to make an exprset from df
    # object.r_repr() to print representation
    # phenoData = annotatedDF, featureData = AnnotatedDF, assayData = expression matrix (type matrix or AssayData)
    """A function to convert a pandas DataFrame to a Biobase ExpressionSet

    Takes a list of pandas DataFrames or a single pandas DataFrame and converts
    it to a Biobase ExpressionSet
    (source: https://bioconductor.org/packages/release/bioc/html/Biobase.html)

    References:
        https://rpy2.readthedocs.io/en/version_2.7.x/generated_rst/s4class.html
        https://rpy2.readthedocs.io/en/version_2.7.x/_static/notebooks/s4class.html
        https://rpy2.readthedocs.io/en/version_2.8.x/robjects_oop.html

    Args:
        phenoData: pd.DataFrame of sample annotations (becomes the phenoData AnnotatedDataFrame)
        featureData: pd.DataFrame of feature annotations (becomes the featureData AnnotatedDataFrame)
        dfs: pd.DataFrame(s) to convert to Biobase.ExpressionSet (single dataframe or a list)

    Returns:
        a Biobase.ExpressionSet object or a list of Biobase.ExpressionSet objects
    """
    # attributes can be accessed using .slots
    rpd.activate()
    data = importr('data.table')
    base = importr('base')
    ballgown = importr('ballgown')
    # dfs.dropna(inplace=True)
    if isinstance(dfs, list):
        exprset = list()
        for i in dfs:
            data = [i[col].values.tolist() for col in i]
            cols = [col for col in i]
            exprset.append(
                methods.new(
                    'ExpressionSet',
                    exprs=biobase.assayData(rpd.py2ri_pandasdataframe(i)),
                    phenoData=biobase.AnnotatedDataFrame(phenoData),
                    featureData=biobase.AnnotatedDataFrame(featureData)))
    else:
        # data = [dfs[col].values.tolist() for col in dfs]
        rphenoData = biobase.AnnotatedDataFrame(
            rpd.py2ri_pandasdataframe(phenoData))
        rfeatureData = biobase.AnnotatedDataFrame(
            rpd.py2ri_pandasdataframe(featureData))
        rdata = data.na_omit_data_table(rpd.py2ri_pandasdataframe(dfs))
        # used to be data.as_matrix
        exprs = base.as_matrix(rdata)
        # rdata = base.unname(rdata)
        # print(rdata)
        exprset = biobase.ExpressionSet()
        exprset.slots['assayData<-'] = exprs
        exprset.slots['phenoData'] = rphenoData
        exprset.slots['featureData'] = rfeatureData
        # print(ballgown.expr(exprset))
    # rpd.deactivate()
    return exprset
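# Hedged usage sketch (illustrative only): pandas_df_to_exprset assumes module-level
# imports such as biobase = importr('Biobase'), a `methods` handle exposing
# methods.new, and rpd = rpy2.robjects.pandas2ri. The toy expression matrix,
# phenotype table and feature table below are placeholders.
def _example_pandas_df_to_exprset():
    import numpy as np
    import pandas as pd
    exprs = pd.DataFrame(np.random.rand(10, 3),
                         index=['gene' + str(i) for i in range(10)],
                         columns=['s1', 's2', 's3'])
    pheno = pd.DataFrame({'condition': ['ctrl', 'ctrl', 'treated']},
                         index=['s1', 's2', 's3'])
    features = pd.DataFrame({'symbol': ['g' + str(i) for i in range(10)]},
                            index=exprs.index)
    return pandas_df_to_exprset(pheno, features, exprs)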
def deseqAnalysis(counts_table, design, reference, outfile):
    '''
    Perform differential expression analysis using DESeq2
    '''

    design_df = pd.read_table(design, sep="\t", header=0, index_col=0)
    counts_df = pd.read_table(counts_table, sep="\t", header=0,
                              index_col=0, compression="gzip")

    E.info("setting up counts table")
    py2ri.activate()
    r_design = py2ri.py2ri_pandasdataframe(design_df)
    r_counts = py2ri.py2ri_pandasdataframe(counts_df)
    R.assign("design", r_design)
    R.assign("counts", r_counts)

    R('''sink(file="/dev/null")''')
    E.info("loading required R packages")
    R('''suppressPackageStartupMessages(library(DESeq2))''')
    R('''suppressPackageStartupMessages(library(gplots))''')
    R('''suppressPackageStartupMessages(library(RColorBrewer))''')
    R('''suppressPackageStartupMessages(library(ggplot2))''')

    R('''notZero <- counts[rowMeans(counts) > 1,]''')
    R('''dds <- DESeqDataSetFromMatrix(countData=notZero,'''
      '''colData=design, design=~group)''')

    E.info("performing differential expression testing")
    R('''de <- DESeq(dds, fitType="parametric")''')
    R('''res <- results(de)''')

    E.info("generating MA plot")
    # generate MAplots
    R('''png("images.dir/%s-MAplot.png", height=480, width=480)''' % reference)
    R('''plotMA(res, alpha=0.05)''')
    R('''dev.off()''')

    E.info("performing variance stabilising transformation")
    R('''vst <- data.frame(getVarianceStabilizedData(de))''')

    E.info("clustering samples and plotting heatmap")
    R('''cors <- cor(vst)''')
    R('''hmcol <- colorRampPalette(brewer.pal(9, "PuOr"))(100)''')
    R('''png("images.dir/%s-sample_clustering-heatmap.png", height=480, '''
      '''width=480)''' % reference)
    R('''heatmap.2(as.matrix(cors), col=hmcol, trace="none", '''
      '''breaks=seq(0, 1, 0.01), margins=c(10,10), cexRow=0.8,'''
      '''cexCol=0.8)''')
    R('''dev.off()''')

    E.info("performing principal components analysis")
    R('''pca <- prcomp(data.frame(t(vst)), scale=T, center=T)''')
    R('''pcs <- data.frame(pca$x)''')
    R('''pcs$condition <- as.factor(design$group)''')
    R('''p_pca <- ggplot(pcs, aes(x=PC1, y=PC2, colour=condition)) + '''
      '''geom_point(size=6)''')
    R('''png("images.dir/%s-PCA_pc1-pc2.png", height=480, '''
      '''width=480)''' % reference)
    R('''print(p_pca)''')
    R('''dev.off()''')

    E.info("writing table of results")
    R('''res.df <- data.frame(res)''')
    R('''sink(file=NULL)''')

    out_df = com.load_data("res.df")
    out_df.to_csv(outfile, sep="\t", index_label="gene_id")
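# Hedged usage sketch (illustrative only): deseqAnalysis assumes module-level imports
# (pandas as pd, rpy2's R and py2ri, the E logger, and a `com` module providing
# load_data, e.g. the old pandas.rpy.common), a gzipped tab-separated counts table,
# a tab-separated design file with a `group` column, and an existing images.dir/
# output directory. The file paths below are placeholders.
# deseqAnalysis(counts_table="counts.tsv.gz",
#               design="design.tsv",
#               reference="ctrl_vs_treated",
#               outfile="ctrl_vs_treated_deseq2_results.tsv")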