Exemple #1
0
    def _plot_with_rpy2(self, regions, filename):
        """Draw per-region line plots with ggplot2 (via rpy2) into ``<filename>.pdf``.

        For each of the first ``self.num_regs`` entries of ``regions`` three
        panels are stacked on one PDF page: the values of ``region.weighted``
        (log2-transformed y axis), ``-log10`` of ``region.pvalues`` and
        ``-log10`` of ``region.corrected_pvalues`` with a dashed line at
        ``-log10(self.threshold)``. Only ``self.num_bins`` bins on either side
        of the middle bin are plotted. Regions for which
        ``np.any(region.weighted)`` is false are skipped with a warning.

        Args:
            regions: Sequence of region objects exposing ``weighted``,
                ``pvalues``, ``corrected_pvalues``, ``bin`` and ``positions``.
            filename: Output path without extension; ``.pdf`` is appended.

        Raises:
            SystemExit: If a region has fewer than ``self.num_bins`` bins on
                either side of its middle bin.
        """
        from rpy2 import robjects
        import rpy2.robjects.lib.ggplot2 as ggplot2
        from rpy2.robjects.lib import grid
        from rpy2.robjects.packages import importr
        grdevices = importr('grDevices')
        base = importr('base')

        def flatten_window(rows, window):
            # Concatenate the selected window of every row into one flat list.
            return [value for row in rows for value in row[window]]

        grdevices.pdf(file=filename + '.pdf')
        # Always close the PDF device, even when a region fails or we exit early.
        try:
            t = list(range(-self.num_bins, self.num_bins + 1))
            for region in regions[:self.num_regs]:
                if not np.any(region.weighted):
                    logger.warning(
                        "Warning: No data for region located on bin " + str(region.bin) + ". Not plotting this one.")
                    continue
                # Floor division keeps the index an int (true division would
                # produce a float and break the slicing below).
                middle = (len(region.weighted[0]) - 1) // 2
                if middle < self.num_bins:
                    logger.error("Warning: There are less bins calculated for regions than you want to plot.")
                    sys.exit(1)
                window = slice(middle - self.num_bins, middle + self.num_bins + 1)
                num_maps = len(region.weighted)
                d = {
                    # One label per row of region.weighted, repeated for every plotted bin.
                    'map': robjects.StrVector([str(idx) for idx in range(num_maps) for _ in t]),
                    't': robjects.FloatVector(t * num_maps),
                    'e': robjects.FloatVector(flatten_window(region.weighted, window)),
                    'p': robjects.FloatVector(
                        [-np.log10(v) for v in flatten_window(region.pvalues, window)]),
                    'c': robjects.FloatVector(
                        [-np.log10(v) for v in flatten_window(region.corrected_pvalues, window)]),
                }
                dataf = robjects.DataFrame(d)
                gp = ggplot2.ggplot(dataf)

                title = "\n".join(wrap("Bin " + str(region.bin) + " : " + str(region.positions)))
                p1 = (gp
                      + ggplot2.geom_line(
                          mapping=ggplot2.aes_string(x='t', y='e', group='map', colour='map'),
                          alpha=0.8)
                      + ggplot2.scale_y_continuous(trans='log2')
                      + ggplot2.ggtitle(title)
                      + ggplot2.labs(y="log Intensity")
                      + ggplot2.theme_classic()
                      + ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                                         'axis.text.y': ggplot2.element_text(angle=45),
                                         'axis.text.x': ggplot2.element_blank(),
                                         'legend.position': 'none'})
                      + ggplot2.scale_colour_brewer(palette="Set1"))
                p2 = (gp
                      + ggplot2.geom_line(
                          mapping=ggplot2.aes_string(x='t', y='p', group='map', colour='map'),
                          alpha=0.8)
                      + ggplot2.labs(y="-log10(p-value)")
                      + ggplot2.theme_classic()
                      + ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                                         'axis.text.x': ggplot2.element_blank(),
                                         'legend.position': 'none'})
                      + ggplot2.scale_colour_brewer(palette="Set1"))
                p3 = (gp
                      + ggplot2.geom_line(
                          mapping=ggplot2.aes_string(x='t', y='c', group='map', colour='map'),
                          alpha=0.8)
                      + ggplot2.labs(y="-log10(q-value)",
                                     x='bins (' + str(self.bin_res) + ' bp each)')
                      # Dashed horizontal line marking the significance threshold.
                      + ggplot2.geom_hline(
                          mapping=ggplot2.aes_string(yintercept=str(-np.log10(self.threshold))),
                          colour='black', alpha=0.8, linetype='dashed')
                      + ggplot2.theme_classic()
                      + ggplot2.theme(**{'legend.position': 'none'})
                      + ggplot2.scale_colour_brewer(palette="Set1"))

                # Convert to grobs and stack them vertically so the panels share one page.
                g1 = ggplot2.ggplot2.ggplotGrob(p1)
                g2 = ggplot2.ggplot2.ggplotGrob(p2)
                g3 = ggplot2.ggplot2.ggplotGrob(p3)
                robjects.globalenv["g"] = base.rbind(g1, g2, g3, size='first')
                robjects.r("grid::grid.draw(g)")
                grid.newpage()
                logger.debug('Plotted region ' + str(region.bin))
        finally:
            grdevices.dev_off()
Exemple #2
0
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Render a dodged bar chart of counts per subgroup as a PNG file.

    Args:
        subgroups_to_sses_to_n_count: Mapping ``subgroup -> {type -> count}``.
        tname: Column name used for the inner-mapping keys; it also selects
            the bar fill colour.
        file_name: Name of the PNG file created inside ``OUTPUT_PATH``.
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
    # Never below zero, matching the running maximum that started at 0.
    max_count = max([0] + columns_to_data['count'])

    subgroup_levels = ro.StrVector(
        _sort_subgroup(set(columns_to_data['subgroup'])))
    df = ro.DataFrame({
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=subgroup_levels),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count']),
    })

    # Upper y limit: the next multiple of 1000 above the largest count.
    # Floor division is required; true division would not round at all.
    max_count = int(max_count // 1000 * 1000 + 1000)
    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug("The Data Frame for file %s: \n%s", histogram_file_path, df)

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    # Close the graphics device even if building or drawing the plot fails.
    try:
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='subgroup', y='count', fill=tname)
              + ggplot2.geom_bar(position="dodge", width=0.8, stat="identity")
              + ggplot2.theme_bw()
              + ggplot2.theme_classic()
              + ggplot2.theme(**{
                  'legend.title': ggplot2.element_blank(),
                  'legend.text': ggplot2.element_text(size=40),
                  'axis.text.x': ggplot2.element_text(size=40, angle=45),
                  'axis.text.y': ggplot2.element_text(size=40),
              })
              + ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                           limits=ro.IntVector([0, max_count]))
              # Print each bar's count just above it.
              + ggplot2.geom_text(ggplot2.aes_string(label='count'),
                                  size=6, angle=35, hjust=-0.1,
                                  position=ggplot2.position_dodge(width=0.8),
                                  vjust=-0.2))
        pp.plot()
        logging.info("Output step3 file %s", histogram_file_path)
    finally:
        grdevices.dev_off()
Exemple #3
0
def generate_step3_5_lrr_acc20_line_chart(subgroups_to_lrrs_acc20mean,
                                          prefix=''):
    """Write per-subgroup LRR counts to a text file and plot acc20 means as a line chart.

    Args:
        subgroups_to_lrrs_acc20mean: Mapping
            ``subgroup -> (acc20means, acc20_count)`` where ``acc20means`` is
            a sequence of values plotted at positions 1, 2, ... and
            ``acc20_count`` is written to the count file.
        prefix: String prepended to both output file names in ``OUTPUT_PATH``.
    """
    pandas2ri.activate()
    subgroups_to_lrr_count = {}
    columns_to_data = {'subgroup': [], 'pos': [], 'acc20': []}
    for subgroup, (acc20means,
                   acc20_count) in subgroups_to_lrrs_acc20mean.items():
        subgroups_to_lrr_count[subgroup] = acc20_count
        # Positions are 1-based on the x axis.
        for pos, acc20mean in enumerate(acc20means, start=1):
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data['pos'].append(pos)
            columns_to_data['acc20'].append(acc20mean)

    # Write the count of LRRs for each subgroup to file
    count_file_path = os.path.join(OUTPUT_PATH, prefix + "step3_5_lrr_count.txt")
    with open(count_file_path, 'w') as f:
        for subgroup, lrr_count in subgroups_to_lrr_count.items():
            f.write("{}: {}\n".format(subgroup, lrr_count))

    # Generate the line chart file
    df = ro.DataFrame({
        'subgroup': ro.StrVector(columns_to_data['subgroup']),
        'pos': ro.IntVector(columns_to_data['pos']),
        'acc20': ro.FloatVector(columns_to_data['acc20']),
    })

    line_chart_file_path = os.path.join(OUTPUT_PATH,
                                        prefix + "step3_5_lrr_acc20_line.png")
    logging.debug("The Data Frame for file %s: \n%s", line_chart_file_path, df)

    grdevices.png(file=line_chart_file_path, width=1024, height=512)
    # Close the graphics device even if building or drawing the plot fails.
    try:
        pp = (ggplot2.ggplot(df)
              + ggplot2.theme_bw()
              + ggplot2.theme_classic()
              + ggplot2.theme(**{
                  'axis.text.x': ggplot2.element_text(size=35),
                  'axis.text.y': ggplot2.element_text(size=35),
                  'legend.title': ggplot2.element_blank(),
                  'legend.text': ggplot2.element_text(size=20),
              })
              + ggplot2.aes_string(x='pos', y='acc20', group='subgroup', colour='subgroup')
              + ggplot2.geom_point(size=4, shape=20)
              + ggplot2.geom_line(size=3)
              # One tick per position 1..24, labelled with the characters of this 24-letter string.
              + ggplot2.scale_x_continuous(
                  breaks=ro.IntVector(range(1, 25)),
                  labels=ro.StrVector(list('LxxLxLxxNxLsGxIPxxLxxLxx'))))
        pp.plot()
        logging.info("Output step3 file %s", line_chart_file_path)
    finally:
        grdevices.dev_off()
Exemple #4
0
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count,
                                       file_name):
    """Render a dodged bar chart of counts grouped by position and place type.

    Args:
        place_type_pos_type_to_count: Mapping ``"<place>_<pos>" -> count``;
            each key must contain exactly one underscore.
        file_name: Name of the PNG file created inside ``OUTPUT_PATH``.

    Raises:
        ValueError: If a key does not split into exactly two parts on ``_``.
    """
    columns_to_data = {'place': [], 'pos': [], 'count': []}
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        columns_to_data['place'].append(place_type)
        columns_to_data['pos'].append(pos_type)
        columns_to_data['count'].append(n_count)
    # Never below zero, matching the running maximum that started at 0.
    max_count = max([0] + columns_to_data['count'])

    df = ro.DataFrame({
        'place': ro.StrVector(columns_to_data['place']),
        'pos': ro.StrVector(columns_to_data['pos']),
        'count': ro.IntVector(columns_to_data['count']),
    })

    # Upper y limit: next multiple of 1000 (large data) or 100 (small data).
    # Floor division is required; true division would not round at all.
    step = 1000 if max_count > 1000 else 100
    max_count = int(max_count // step * step + step)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug("The Data Frame for file %s: \n%s", histogram_file_path, df)

    grdevices.png(file=histogram_file_path, width=1024, height=512)
    # Close the graphics device even if building or drawing the plot fails.
    try:
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='pos', y='count', fill='place')
              + ggplot2.geom_bar(position="dodge", stat="identity")
              + ggplot2.theme_bw()
              + ggplot2.theme_classic()
              + ggplot2.theme(**{
                  'axis.text.x': ggplot2.element_text(size=35),
                  'axis.text.y': ggplot2.element_text(size=35),
              })
              + ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                           limits=ro.IntVector([0, max_count]))
              # Print each bar's count just above it.
              + ggplot2.geom_text(ggplot2.aes_string(label='count'),
                                  position=ggplot2.position_dodge(width=0.8),
                                  size=10, angle=35, hjust=-0.2,
                                  vjust=-0.5))
        pp.plot()
        logging.info("Output step3 file %s", histogram_file_path)
    finally:
        grdevices.dev_off()
Exemple #5
0
def _generate_step3_5_ss_acc20_line_chart(ts_to_acc20s, tname,
                                          line_chart_file_path):
    """Plot mean acc20 per site as one line per type and save it as a PNG.

    The input is first reduced with ``calc_acc20mean_by_types``; the i-th
    value of each resulting sequence is plotted at site ``i - 5``. The x axis
    is labelled -5..5 with site 0 shown as ``N``.

    Args:
        ts_to_acc20s: Data passed unchanged to ``calc_acc20mean_by_types``.
        tname: Column name for the type; also used for grouping and colour.
        line_chart_file_path: Path of the PNG file to create.
    """
    logging.debug("Begin to generate %s, data %s", line_chart_file_path,
                  ts_to_acc20s)
    ts_to_acc20mean = calc_acc20mean_by_types(ts_to_acc20s)
    columns_to_data = {tname: [], 'site': [], 'acc20': []}
    for ss, acc20means in ts_to_acc20mean.items():
        # Shift indices so the sixth value lands on site 0.
        for site, acc20mean in enumerate(acc20means, start=-5):
            columns_to_data[tname].append(ss)
            columns_to_data['site'].append(site)
            columns_to_data['acc20'].append(acc20mean)

    # Generate the line chart file
    df = ro.DataFrame({
        tname: ro.StrVector(columns_to_data[tname]),
        'site': ro.IntVector(columns_to_data['site']),
        'acc20': ro.FloatVector(columns_to_data['acc20']),
    })
    logging.debug("The Data Frame for file %s: \n%s", line_chart_file_path, df)

    grdevices.png(file=line_chart_file_path, width=1024, height=512)
    # Close the graphics device even if building or drawing the plot fails.
    try:
        site_labels = ['-5', '-4', '-3', '-2', '-1', 'N', '1', '2', '3', '4', '5']
        pp = (ggplot2.ggplot(df)
              + ggplot2.theme_bw()
              + ggplot2.theme_classic()
              + ggplot2.theme(**{
                  'axis.text.x': ggplot2.element_text(size=35),
                  'axis.text.y': ggplot2.element_text(size=35),
                  'legend.title': ggplot2.element_blank(),
                  'legend.text': ggplot2.element_text(size=20),
              })
              + ggplot2.aes_string(x='site', y='acc20', group=tname, colour=tname)
              + ggplot2.geom_point(size=4, shape=20)
              + ggplot2.geom_line(size=3)
              + ggplot2.scale_x_continuous(breaks=ro.IntVector(list(range(-5, 6))),
                                           labels=ro.StrVector(site_labels)))
        pp.plot()
        logging.info("Output step3 file %s", line_chart_file_path)
    finally:
        grdevices.dev_off()
Exemple #6
0
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    Reads the count matrix and the sample table (``formula``) given on the
    command line, fits DESeq2 with the design ``~ batch + condition`` when the
    sample table has a ``batch`` column and ``~ condition`` otherwise, then
    writes into ``<cwd>/<outDir>``:

    * ``<prefix>_QCplots_<group1>_v_<group2>.pdf`` (PCA, MA plots, QQ plot,
      p-value histogram, dispersion estimates)
    * ``<prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv``
    * ``<prefix>_<group1>_v_<group2>_deseq2_results.tsv``
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    # The presence of a "batch" column decides both the design and the PCA aesthetics.
    has_batch = "batch" in formulaDF.columns
    design = Formula("~ batch + condition" if has_batch else "~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # These are imported for their side effect of loading the R packages
    # that the R code strings and robjects.r[...] lookups below rely on.
    importr('methods')
    importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design', design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    # Get Results and shrinkage values
    res = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd = R('vst(dds,blind=FALSE)')
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    # get pca data
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        # Kept from the original: dumps the transformed data set to stdout.
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf" % (prefix, group1, group2))

    # "%%" must sit inside the formatted string to render as a single "%".
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    # Point shape encodes the batch only when a batch column exists.
    aes_kwargs = {'x': "PC1", 'y': "PC2", 'color': "condition"}
    if has_batch:
        aes_kwargs['shape'] = "batch"

    grdevices.pdf(file=qcOut)
    # Close the PDF device even if one of the plots fails.
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(**aes_kwargs) + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()
        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrhinkage")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic() + \
            ggplot2.ggtitle("pvalue distribution")
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        grdevices.dev_off()

    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (prefix, group1, group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv" % (prefix, group1, group2))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
Exemple #7
0
def main():
    '''
    Run DESeq2 (with apeglm shrinkage) on a count matrix and write QC plots and result tables.

    Reads the count matrix and the sample table (``formula``) given on the
    command line, fits DESeq2 with the design ``~ batch + condition`` when the
    sample table has a ``batch`` column and ``~ condition`` otherwise, then
    writes into ``./<outDir>``:

    * ``<prefix>_QCplots_<group1>_v_<group2>.pdf`` (PCA, MA plots, QQ plots,
      p-value histogram, dispersion estimates)
    * ``<prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv``
    * ``<prefix>_<group1>_v_<group2>_deseq2_results.tsv``
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    # The presence of a "batch" column decides both the design and the PCA aesthetics.
    has_batch = "batch" in formulaDF.columns
    design = Formula("~ batch + condition" if has_batch else "~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are imported for their side effect of loading the
    # R packages used through robjects.r[...] below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)
    # Names of the fitted coefficients that involve the condition factor.
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds), value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    # Also expose the raw results object as `res` in the R global environment.
    R.assign('res', res)

    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        # Kept from the original: dumps the transformed data set to stdout.
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # "%%" must sit inside the formatted string to render as a single "%".
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    # Point shape encodes the batch only when a batch column exists.
    aes_kwargs = {'x': "PC1", 'y': "PC2", 'color': "condition"}
    if has_batch:
        aes_kwargs['shape'] = "batch"

    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir, prefix, group1, group2))
    # Close the PDF device even if one of the plots fails.
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(**aes_kwargs) + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()
        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        grdevices.dev_off()

    lfcOut = "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir, prefix, group1, group2)
    resOut = "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir, prefix, group1, group2)

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
Exemple #8
0
def main():
    '''
    Run DESeq2 on a count matrix whose column names encode condition and batch.

    The matrix is read with its ``ids`` column as the index. For every other
    column name, the second ``_``-separated token is used as the condition
    and the last token as the batch. DESeq2 is fitted with the design
    ``~ batch + condition``; QC plots and both result tables (plain and
    apeglm-shrunken) are written into ``./diffExpOut``. A copy of the raw
    results is also written to ``testres.tsv`` in the working directory.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = "diffExpOut"
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = "flair_diffexp"

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t')
    quantDF = quantDF.set_index('ids')
    df = pandas2ri.py2ri(quantDF)

    # now make the formula
    with open(matrix) as handle:
        # Sample names are every header field after the first (id) column.
        header = next(handle).rstrip().split()[1:]

    sample_rows = [[name, name.split("_")[1], name.split("_")[-1]] for name in header]
    formulaDF = pd.DataFrame(sample_rows)
    formulaDF.columns = ['sampleName', 'condition', 'batch']
    formulaDF = formulaDF.set_index('sampleName')
    sampleTable = pandas2ri.py2ri(formulaDF)

    design = Formula("~ batch + condition")
    print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are imported for their side effect of loading the
    # R packages used through robjects.r[...] below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)

    dds = deseq.DESeq(dds)
    # Pass a real boolean rather than the string "TRUE".
    cont = robjects.r["grep"]("condition",
                              robjects.r['resultsNames'](dds),
                              value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    R.assign('res', res)
    R('write.table(res, file="testres.tsv", quote=FALSE, col.names=NA)')
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd,
                      intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # "%%" must sit inside the formatted string to render as a single "%".
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch)))
    # Close the PDF device even if one of the plots fails.
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC,
               ylim=robjects.IntVector((-3, 3)),
               main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        grdevices.dev_off()

    lfcOut = "./%s/%s_%s_deseq2_results_LFC.tsv" % (outdir, prefix, str(batch))
    resOut = "./%s/%s_%s_deseq2_results.tsv" % (outdir, prefix, str(batch))

    # Both tables are written by R. The former sys.exit(1) here reported
    # failure on success and made the pandas re-export after it unreachable;
    # that re-export targeted the same two paths, so it was dropped as well.
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
Exemple #9
0
def main():
    '''
    Run DESeq2 on a set of sample files and write QC plots and result tables.

    Each file is assigned ``group1`` when its name contains ``group1`` and
    ``group2`` otherwise, and batch ``'1'`` when its name contains ``batch``
    and ``'2'`` otherwise. DESeq2 is fitted with the design
    ``~ batch + condition``; results are oriented as ``group2`` vs ``group1``.
    A PDF of QC plots and two TSV files (apeglm-shrunken and plain results)
    are written into ``./<outdir>``.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outdir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    files = myCommandLine.args['files']
    prefix = myCommandLine.args['out_prefix']
    sFilter = myCommandLine.args['filter']

    makeDir(outdir)

    files = checkSamples(files)

    # NOTE(review): presumably a count matrix with one column per file - confirm in filesToDF.
    df = filesToDF(files, sFilter)

    # DO DESEQ2
    from rpy2 import robjects
    from rpy2.robjects import r, pandas2ri, Formula
    from rpy2.robjects.lib import grid
    pandas2ri.activate()

    # Compile data for data frame: (file, condition, batch) per sample.
    data = [
        (f, group1 if group1 in f else group2, '1' if batch in f else '2')
        for f in files
    ]

    # Make the Data Frame
    pydf = pd.DataFrame(data)
    pydf.columns = ['sampleName', 'condition', 'batch']
    pydf = pydf.set_index('sampleName')
    # Convert pandas to R data frame.
    sampleTable = pandas2ri.py2ri(pydf)

    # DESEQ2 part.
    # Formula
    design = Formula("~ batch + condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are imported for their side effect of loading the
    # R packages used through robjects.r[...] below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds,
                        contrast=robjects.StrVector(
                            ("condition", group2, group1)))
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds,
                             coef="condition_%s_vs_%s" % (group2, group1),
                             type="apeglm")
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd,
                      intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # "%%" must sit inside the formatted string to render as a single "%".
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch), sFilter))
    # Close the PDF device even if one of the plots fails.
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC,
               ylim=robjects.IntVector((-3, 3)),
               main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        grdevices.dev_off()

    lfc_pydf = pandas2ri.ri2py(reslfc)
    res_pydf = pandas2ri.ri2py(resdf)

    lfc_pydf.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results_LFC.tsv" %
                    (outdir, prefix, group1, group2, str(batch)),
                    sep='\t')
    # Write the plain results here; the shrinkage frame used to be written
    # into this file by mistake.
    res_pydf.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results.tsv" %
                    (outdir, prefix, group1, group2, str(batch)),
                    sep='\t')