コード例 #1
0
def _plt_distr(dat,
               col,
               title='',
               splitBy_pfill=True,
               pfill='label',
               independentpdf=False,
               fname='xdistr.pdf'):
    """Draw a count histogram and a density plot of ``dat[col]``.

    Rows whose ``pfill`` column equals the string ``'NA'`` are dropped first.
    Two ggplot2 figures are then plotted: raw counts, and a density curve
    overlaid on a histogram.

    Args:
        dat: table indexable by column name (``dat[col]``, ``dat[pfill]``)
            and by boolean mask.
        col: name of the numeric column to plot. ``'distance'`` and
            ``'correlation'`` get fixed x-axis limits.
        title: text placed in front of the ``[Total = n]`` plot title.
        splitBy_pfill: when true, fill the bars by the ``pfill`` column.
        pfill: name of the label column used for filtering and fill.
        independentpdf: when true, open a PDF device on ``fname`` for the two
            plots and close it afterwards; otherwise plot on whatever device
            is currently active.
        fname: output path used when ``independentpdf`` is true.
    """
    valid = dat[dat[pfill] != 'NA']  # remove invalid pairs
    n = len(valid)
    rdf = robjects.DataFrame({
        col: robjects.FloatVector(list(valid[col])),
        pfill: robjects.StrVector(list(valid[pfill]))
    })

    base = ggplot2.ggplot(rdf) + \
        ggplot2.ggtitle('%s [Total = %s]' % (title, n))

    # Plot1 shows counts, Plot2 shows densities; both share the fill choice.
    if splitBy_pfill:
        p1 = base + ggplot2.aes_string(x=col, fill=pfill)
        p2 = base + ggplot2.aes_string(x=col, fill=pfill, y='..density..')
    else:
        p1 = base + ggplot2.aes_string(x=col)
        p2 = base + ggplot2.aes_string(x=col, y='..density..')
    p2 = p2 + ggplot2.geom_density(alpha=.5, origin=-500)

    if col == 'distance':
        # Distances use fixed 1000-wide bins and a fixed x range.
        p1 = p1 + \
            ggplot2.geom_histogram(binwidth=1000, alpha=.5, position='identity', origin=-500) + \
            ggplot2.xlim(-1000, 51000)

        p2 = p2 + \
            ggplot2.geom_histogram(binwidth=1000, alpha=.33, position='identity', origin=-500) + \
            ggplot2.xlim(-1000, 51000)
    else:
        p1 = p1 + \
            ggplot2.geom_histogram(alpha=.5, position='identity')

        p2 = p2 + \
            ggplot2.geom_histogram(alpha=.33, position='identity')

        if col == 'correlation':
            p1 = p1 + ggplot2.xlim(-1.1, 1.1)
            p2 = p2 + ggplot2.xlim(-1.1, 1.1)

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(file=fname)
        try:
            p1.plot()
            p2.plot()
        finally:
            # Always close the device we opened, even if plotting fails,
            # so later plots do not end up in this file.
            grdevices.dev_off()
    else:
        p1.plot()
        p2.plot()
    return
コード例 #2
0
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    Only sizes strictly between ``args.min_length`` and ``args.max_length``
    are plotted. The bin width is the size range divided by
    ``args.num_bins``. ``args.theme_bw`` switches on the black-and-white
    ggplot2 theme. When ``args.saveas`` is set, the figure is written to that
    ``.pdf`` or ``.png`` path; otherwise it is shown and the function waits
    for the user to press enter.

    Raises:
        ValueError: if no size falls inside the requested length window.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    kept = [s for s in sizes
            if s < args.max_length and s > args.min_length]
    if not kept:
        raise ValueError("No read sizes between %s and %s to plot."
                         % (args.min_length, args.max_length))

    sizes_min = min(kept)
    sizes_max = max(kept)

    # Force true division (Python 2 would truncate to an int) and never hand
    # ggplot2 a zero bin width when all sizes are equal.
    binwidth = float(sizes_max - sizes_min) / args.num_bins
    if binwidth <= 0:
        binwidth = 1

    df = robjects.DataFrame({'sizes': robjects.IntVector(kept)})

    # plot
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file,
                          width=8.5,
                          height=8.5,
                          units="in",
                          res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            # Exit with a failure status; a bare sys.exit() reports success.
            sys.exit(1)

        try:
            pp.plot()
        finally:
            # Close the file device even if plotting fails.
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
コード例 #3
0
ファイル: pwqmn_data.py プロジェクト: jenniferlhood/pwqmn
	def histogram(self, dataframe, filename, parm, group, units):
		"""Write a 512x512 PNG histogram of ``parm`` filled by ``group``.

		``dataframe`` is handed to ``ggplot2.ggplot``; ``parm`` and ``group``
		are passed to ``aes_string`` as the x and fill aesthetics. The x-axis
		label is ``parm`` followed by a space and ``units``. Everything runs
		inside ``suppress_stdout()``.
		"""
		with suppress_stdout():
			grdevices.png(file=filename, width=512, height=512)
			try:
				data = ggplot2.ggplot(dataframe)
				aes = ggplot2.aes_string(x=parm, fill=group)
				geom = ggplot2.geom_histogram(colour="black")
				labs = ggplot2.labs(x=parm + " " + units)
				gg = data + aes + geom + labs
				gg.plot()
			finally:
				# Close the PNG device even when building or drawing fails,
				# so it does not stay open and capture later plots.
				grdevices.dev_off()
コード例 #4
0
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    Only sizes strictly between ``args.min_length`` and ``args.max_length``
    are plotted. The bin width is the size range divided by
    ``args.num_bins``. ``args.theme_bw`` switches on the black-and-white
    ggplot2 theme. When ``args.saveas`` is set, the figure is written to that
    ``.pdf`` or ``.png`` path; otherwise it is shown and the function waits
    for the user to press enter.

    Raises:
        ValueError: if no size falls inside the requested length window.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    kept = [s for s in sizes if s < args.max_length and s > args.min_length]
    if not kept:
        raise ValueError("No read sizes between %s and %s to plot." % (args.min_length, args.max_length))

    sizes_min = min(kept)
    sizes_max = max(kept)

    # Force true division (Python 2 would truncate to an int) and never hand
    # ggplot2 a zero bin width when all sizes are equal.
    binwidth = float(sizes_max - sizes_min) / args.num_bins
    if binwidth <= 0:
        binwidth = 1

    df = robjects.DataFrame({"sizes": robjects.IntVector(kept)})

    # plot
    pp = ggplot2.ggplot(df) + ggplot2.aes_string(x="sizes") + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5, units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            # Exit with a failure status; a bare sys.exit() reports success.
            sys.exit(1)

        try:
            pp.plot()
        finally:
            # Close the file device even if plotting fails.
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
コード例 #5
0
 def histogram(self, dataframe, filename, parm, group, units):
     """Write a 512x512 PNG histogram of ``parm`` filled by ``group``.

     ``dataframe`` is handed to ``ggplot2.ggplot``; ``parm`` and ``group``
     are passed to ``aes_string`` as the x and fill aesthetics. The x-axis
     label is ``parm`` followed by a space and ``units``. Everything runs
     inside ``suppress_stdout()``.
     """
     with suppress_stdout():
         grdevices.png(file=filename, width=512, height=512)
         try:
             data = ggplot2.ggplot(dataframe)
             aes = ggplot2.aes_string(x=parm, fill=group)
             geom = ggplot2.geom_histogram(colour="black")
             labs = ggplot2.labs(x=parm + " " + units)
             gg = data + aes + geom + labs
             gg.plot()
         finally:
             # Close the PNG device even when building or drawing fails,
             # so it does not stay open and capture later plots.
             grdevices.dev_off()
コード例 #6
0
def rpy2_plotter(anno, clusters, name):
    """Plot genes distribution in clusters using ggplot2 from R.

    Keeps the ten ``anno`` rows with the largest ``n_ft``, restricts
    ``clusters`` to the categories of those rows, and writes a histogram of
    ``n_features`` faceted by category (five panels per row) to the PDF file
    ``name``.
    """
    pandas2ri.activate()
    grdevices = importr('grDevices')
    rprint = robjects.globalenv.get("print")

    # Categories of the ten annotations with the highest feature count.
    top_anno = anno.sort_values(by="n_ft", ascending=False).head(n=10)
    top_categories = top_anno["category"].tolist()

    in_top = clusters["category"].isin(top_categories)
    r_clusters = pandas2ri.py2ri(clusters[in_top])

    # Build the figure one layer at a time.
    figure = ggplot2.ggplot(r_clusters)
    figure = figure + ggplot2.aes_string(x="n_features")
    figure = figure + ggplot2.geom_histogram(binwidth=1)
    figure = figure + ggplot2.facet_wrap(robjects.Formula("~category"), ncol=5)
    figure = figure + ggplot2.labs(x="Number of Features",
                                   y="Number of Clusters",
                                   title="Clusters distribution")

    grdevices.pdf(file=name, width=11.692, height=8.267)
    rprint(figure)
    grdevices.dev_off()
コード例 #7
0
def plot_histogram(fastq_file, plot_filename_png):
    """Plot the sequence-length distribution of ``fastq_file`` as a PNG.

    The first line of the file is skipped and every fourth line after it is
    taken as a sequence line; its stripped length is one observation. The
    histogram uses 20 bins across the observed range and is saved to
    ``plot_filename_png``, which is also used as the plot title.

    Raises:
        ValueError: if the file yields no sequence lines.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    with open(fastq_file, 'rb') as f:
        # Skip the first line so sequence lines sit at offsets 0, 4, 8, ...
        next(f, None)
        lengths = [len(line.strip())
                   for line in itertools.islice(f, 0, None, 4)]

    if not lengths:
        raise ValueError("No sequences found in %s" % fastq_file)

    sizes = robjects.IntVector(lengths)

    # True division (Python 2 would truncate), and never a zero bin width
    # when every sequence has the same length.
    binwidth = float(max(lengths) - min(lengths)) / 20
    if binwidth <= 0:
        binwidth = 1

    df = robjects.DataFrame({'sizes': sizes})

    # plot
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth) \
        + ggplot2.theme_grey() \
        + ggplot2.labs(title=plot_filename_png,
                       x="Size (in nucleotides)", y="Count")

    grdevices.png(plot_filename_png, width=8.5, height=8.5,
                  units="in", res=300)
    try:
        pp.plot()
    finally:
        # Close the PNG device even if plotting fails.
        grdevices.dev_off()
コード例 #8
0
def main():
    """Plot a histogram of log2 max-FPKM for the genes in a GTF file.

    Takes two positional arguments: a GTF file and an FPKM tracking file.
    For every ``gene_id`` found in the GTF, the maximum of
    ``cuff.gene_expr(gene_id)`` is taken; strictly positive values are
    log2-transformed and plotted with a bin width of 0.2. The figure is
    written to ``<gtf path without extension>_fpkmhist.pdf``.
    """
    usage = 'usage: %prog [options] <gtf file> <fpkm tracking>'
    parser = OptionParser(usage)
    #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        gtf_file = args[0]
        fpkm_tracking_file = args[1]

    # get genes
    genes = set()
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # Comment and blank lines have no attribute column to parse.
            if line.startswith('#') or not line.strip():
                continue
            a = line.split('\t')
            genes.add(gff.gtf_kv(a[8])['gene_id'])

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    log_fpkms = []
    for gene_id in genes:
        max_fpkm = max(cuff.gene_expr(gene_id))
        # log is undefined at zero, so unexpressed genes are left out
        if max_fpkm > 0:
            log_fpkms.append(math.log(max_fpkm,2))

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({'fpkm':fpkms_r})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='fpkm') + \
        ggplot2.geom_histogram(binwidth=0.2)

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre)
    try:
        gp.plot()
    finally:
        # Close the PDF device even if plotting fails.
        grdevices.dev_off()
コード例 #9
0
ファイル: fpkm_hist.py プロジェクト: davek44/utility
def main():
    """Plot a histogram of log2 max-FPKM for the genes in a GTF file.

    Takes two positional arguments: a GTF file and an FPKM tracking file.
    For every ``gene_id`` found in the GTF, the maximum of
    ``cuff.gene_expr(gene_id)`` is taken; strictly positive values are
    log2-transformed and plotted with a bin width of 0.2. The figure is
    written to ``<gtf path without extension>_fpkmhist.pdf``.
    """
    usage = "usage: %prog [options] <gtf file> <fpkm tracking>"
    parser = OptionParser(usage)
    # parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        gtf_file = args[0]
        fpkm_tracking_file = args[1]

    # get genes
    genes = set()
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # Comment and blank lines have no attribute column to parse.
            if line.startswith("#") or not line.strip():
                continue
            a = line.split("\t")
            genes.add(gff.gtf_kv(a[8])["gene_id"])

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    log_fpkms = []
    for gene_id in genes:
        max_fpkm = max(cuff.gene_expr(gene_id))
        # log is undefined at zero, so unexpressed genes are left out
        if max_fpkm > 0:
            log_fpkms.append(math.log(max_fpkm, 2))

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({"fpkm": fpkms_r})

    # construct plot
    gp = ggplot2.ggplot(df) + ggplot2.aes_string(x="fpkm") + ggplot2.geom_histogram(binwidth=0.2)

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file="%s_fpkmhist.pdf" % gtf_pre)
    try:
        gp.plot()
    finally:
        # Close the PDF device even if plotting fails.
        grdevices.dev_off()
コード例 #10
0
ファイル: graphics.py プロジェクト: realmichaelzyy/cs249-2
gp = ggplot2.ggplot(mtcars)

pp = gp + \
     ggplot2.aes_string(x='factor(cyl)', y='mpg') + \
     ggplot2.geom_boxplot()

pp.plot()
#-- ggplot2geomboxplot-end
# Close the current graphics device.
grdevices.dev_off()

#-- ggplot2geomhistogram-begin
gp = ggplot2.ggplot(mtcars)

pp = gp + \
     ggplot2.aes_string(x='wt') + \
     ggplot2.geom_histogram()

#pp.plot()
#-- ggplot2geomhistogram-end

# Open a PNG device (width 900, height 412, cairo type) for the figure
# assembled below.
grdevices.png('../../_static/graphics_ggplot2geomhistogram.png',
              width=900,
              height=412,
              antialias="subpixel",
              type="cairo")
# Start a new grid page and push a viewport whose layout is 1 row x 3 columns.
grid.newpage()
grid.viewport(layout=grid.layout(1, 3)).push()

# Three pairs of colour names; presumably one pair per column, consumed by
# the `for col_i` loop that follows -- TODO confirm against the loop body.
params = (('black', 'black'), ('black', 'white'), ('white', 'black'))

for col_i in range(3):
コード例 #11
0
              antialias=ANTIALIAS,
              type="cairo")
#-- ggplot2geomboxplot-begin
gp = ggplot2.ggplot(mtcars)

pp = (gp + ggplot2.aes_string(x='factor(cyl)', y='mpg') +
      ggplot2.geom_boxplot())

pp.plot()
#-- ggplot2geomboxplot-end
# Close the current graphics device.
grdevices.dev_off()

#-- ggplot2geomhistogram-begin
gp = ggplot2.ggplot(mtcars)

pp = (gp + ggplot2.aes_string(x='wt') + ggplot2.geom_histogram(bins=30))

#pp.plot()
#-- ggplot2geomhistogram-end

# Open a PNG device (width 900, height 412, cairo type) for the figure
# assembled below; the antialias setting comes from the ANTIALIAS name
# defined outside this fragment.
grdevices.png('../../_static/graphics_ggplot2geomhistogram.png',
              width=900,
              height=412,
              antialias=ANTIALIAS,
              type="cairo")
# Start a new grid page and push a viewport whose layout is 1 row x 3 columns.
grid.newpage()
grid.viewport(layout=grid.layout(1, 3)).push()

# Three pairs of colour names; presumably one pair per column, consumed by
# the `for col_i` loop that follows -- TODO confirm against the loop body.
params = (('black', 'black'), ('black', 'white'), ('white', 'black'))

for col_i in range(3):
コード例 #12
0
ファイル: runDE.py プロジェクト: wqhf/flair
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    Settings are read from ``CommandLine().args`` (``outDir``, ``group1``,
    ``group2``, ``batch``, ``matrix``, ``prefix``, ``formula``). The count
    matrix and the sample sheet are read as tab-separated tables, the model
    is fitted in R through rpy2, the QC figures are drawn into a single PDF,
    and the plain and shrunken results are written as TSV files. All outputs
    go to ``outDir`` resolved against the current working directory.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF     = pd.read_csv(formula,header=0, sep="\t",index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    # A batch term is only modelled when the sample sheet has a batch column.
    has_batch = "batch" in list(formulaDF)
    if has_batch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    deseq     = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman     = importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design',design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    # Get Results and shrinkage values
    res    = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd    = R('vst(dds,blind=FALSE)')
    resdf  = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds    = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA    = robjects.r['plotMA']
    plotDisp  = robjects.r['plotDispEsts']
    plotPCA   = robjects.r['plotPCA']
    plotQQ    = robjects.r['qq']

    # get pca data
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        # NOTE(review): looks like leftover debug output -- confirm before removing.
        print(vsd)
        intgroup = "condition"
    pcaData    = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf"  % (prefix,group1,group2))

    grdevices.pdf(file=qcOut)
    try:
        # "%%" must sit inside the format string to become a single "%";
        # concatenating it after formatting printed a literal "%%".
        x = "PC1: %d%% variance" % int(percentVar[0]*100)
        y = "PC2: %d%% variance" % int(percentVar[1]*100)

        # Batch, when present, is shown as the point shape.
        if has_batch:
            pca_aes = ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch")
        else:
            pca_aes = ggplot2.aes_string(x="PC1", y="PC2", color="condition")

        pp = ggplot2.ggplot(pcaData) + \
                pca_aes + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()
        plotMA(res, ylim=robjects.IntVector((-3,3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3,3)), main="MA-plot LFCSrhinkage")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
                ggplot2.aes_string(x="pvalue") + \
                ggplot2.geom_histogram() + \
                ggplot2.theme_classic() + \
                ggplot2.ggtitle("pvalue distribution")
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv"  % (prefix,group1,group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv"  % (prefix,group1,group2))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
コード例 #13
0
ファイル: graphics.py プロジェクト: taiyun/rpy2-doc-zh_CN
pp = gp + \
     ggplot2.aes_string(x='factor(cyl)', y='mpg') + \
     ggplot2.geom_boxplot()

pp.plot()
#-- ggplot2geomboxplot-end
# Close the current graphics device.
grdevices.dev_off()


#-- ggplot2geomhistogram-begin
gp = ggplot2.ggplot(mtcars)

pp = gp + \
     ggplot2.aes_string(x='wt') + \
     ggplot2.geom_histogram()

#pp.plot()
#-- ggplot2geomhistogram-end

# Open a PNG device (width 900, height 412, cairo type) for the figure
# assembled below.
grdevices.png('../../_static/graphics_ggplot2geomhistogram.png',
              width = 900, height = 412, antialias="subpixel", type="cairo")
# Start a new grid page and push a viewport whose layout is 1 row x 3 columns.
grid.newpage()
grid.viewport(layout=grid.layout(1, 3)).push()

# Three pairs of colour names; presumably one pair per column, consumed by
# the `for col_i` loop that follows -- TODO confirm against the loop body.
params = (('black', 'black'),
          ('black', 'white'),
          ('white', 'black'))
          
for col_i in range(3):
   vp = grid.viewport(**{'layout.pos.col':col_i+1, 'layout.pos.row': 1})
コード例 #14
0
ファイル: runDE.py プロジェクト: laffayb/flair
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    Settings are read from ``CommandLine().args`` (``outDir``, ``group1``,
    ``group2``, ``batch``, ``matrix``, ``prefix``, ``formula``). The count
    matrix and the sample sheet are read as tab-separated tables, the model
    is fitted through the DESeq2 R package, the QC figures are drawn into a
    single PDF, and the plain and shrunken results are written as TSV files.
    All outputs go to ``./<outDir>/``.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']

    # make the quant DF
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF     = pd.read_csv(formula,header=0, sep="\t",index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    # A batch term is only modelled when the sample sheet has a batch column.
    has_batch = "batch" in list(formulaDF)
    if has_batch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    deseq     = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman     = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData = df,
                                        colData = sampleTable,
                                        design = design)

    dds  = deseq.DESeq(dds)
    # Name(s) of the fitted coefficients that mention "condition".
    cont = robjects.r["grep"]("condition",robjects.r['resultsNames'](dds),value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf  = robjects.r['as.data.frame'](res)

    R.assign('res', res)

    reslfc  = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA    = robjects.r['plotMA']
    plotDisp  = robjects.r['plotDispEsts']
    plotPCA   = robjects.r['plotPCA']
    plotQQ    = robjects.r['qq']

    vsd       = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        # NOTE(review): looks like leftover debug output -- confirm before removing.
        print(vsd)
        intgroup = "condition"
    pcaData    = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir,prefix,group1,group2))
    try:
        # "%%" must sit inside the format string to become a single "%";
        # concatenating it after formatting printed a literal "%%".
        x = "PC1: %d%% variance" % int(percentVar[0]*100)
        y = "PC2: %d%% variance" % int(percentVar[1]*100)

        # Batch, when present, is shown as the point shape.
        if has_batch:
            pca_aes = ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch")
        else:
            pca_aes = ggplot2.aes_string(x="PC1", y="PC2", color="condition")

        pp = ggplot2.ggplot(pcaData) + \
                pca_aes + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()
        plotMA(res, ylim=robjects.IntVector((-3,3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3,3)), main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
                ggplot2.aes_string(x="pvalue") + \
                ggplot2.geom_histogram() + \
                ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut =  "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir,prefix,group1,group2)
    resOut =  "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir,prefix,group1,group2)

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
コード例 #15
0
##text_log+="average: "+str(rmean(test23)[0])+end
##text_log+="sum: "+str(rsum(test23)[0])+end
#
#roughbin= round(ma[0]/100)
#bins=round(roughbin/100)*100


#ma2=rmax(ed)

#dataf_subset = dataf.rx(dataf.rx2("contig").ro >= 18, true)

scales = importr('scales')

gp = ggplot2.ggplot(dataf)

# NOTE(review): `bins` is only interpolated into the x-axis title; it is not
# passed to geom_histogram, so the drawn binning is ggplot2's default.
bins = 10
# Theme element that rotates the x tick labels by 90 degrees.
teest3 = robjects.r('theme(axis.text.x=element_text(angle=90))')

# Histogram of the `Length` column, one layer per line.
pp = (
    gp
    + ggplot2.aes_string(x='Length')
    + ggplot2.geom_histogram()
    + ggplot2.ggtitle("Found IS fragment lengths")
    + ggplot2.scale_x_continuous(
        name="fragment lengths, bin=" + str(bins),
        breaks=scales.pretty_breaks(20))
    + ggplot2.scale_y_continuous(
        labels=scales.comma,
        name="Count",
        breaks=scales.pretty_breaks(10))
    + teest3
)
pp.plot()
robjects.r.ggsave("/Users/security/science/dna_subj_hist.pdf")
コード例 #16
0
ファイル: runDE.py プロジェクト: smaegol/flair
def main():
    '''
    Run DESeq2 on a set of count files and write QC plots and result tables.

    Settings are read from ``CommandLine().args`` (``workingdir``,
    ``outdir``, ``group1``, ``group2``, ``batch``, ``files``,
    ``out_prefix``, ``filter``). A sample table is derived from the file
    names, the model ``~ batch + condition`` is fitted through the DESeq2 R
    package, the QC figures are drawn into one PDF, and the shrunken and
    plain results are written as TSV files under ``./<outdir>/``.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    workingdir = myCommandLine.args['workingdir']
    outdir = myCommandLine.args['outdir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    files = myCommandLine.args['files']
    prefix = myCommandLine.args['out_prefix']
    sFilter = myCommandLine.args['filter']

    makeDir(outdir)

    files = checkSamples(files)

    df = filesToDF(files, sFilter)

    # DO DESEQ2
    from rpy2 import robjects
    from rpy2.robjects import r, pandas2ri, Formula
    from rpy2.robjects.lib import grid
    pandas2ri.activate()

    # Compile data for data frame: a file belongs to group1 when its name
    # contains group1 (otherwise group2), and to batch '1' when its name
    # contains the batch string (otherwise '2').
    data = list()
    for f in files:
        condition = group1 if group1 in f else group2
        batch_label = '1' if batch in f else '2'
        data.append((f, condition, batch_label))

    # Make the Data Frame
    pydf = pd.DataFrame(data)
    pydf.columns = ['sampleName', 'condition', 'batch']
    pydf = pydf.set_index('sampleName')
    # Convert pandas to R data frame.
    sampleTable = pandas2ri.py2ri(pydf)

    # DESEQ2 part.
    # Formula
    design = Formula("~ batch + condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds,
                        contrast=robjects.StrVector(
                            ("condition", group2, group1)))
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds,
                             coef="condition_%s_vs_%s" % (group2, group1),
                             type="apeglm")
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd,
                      intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch), sFilter))
    try:
        # "%%" must sit inside the format string to become a single "%";
        # concatenating it after formatting printed a literal "%%".
        x = "PC1: %d%% variance" % int(percentVar[0] * 100)
        y = "PC2: %d%% variance" % int(percentVar[1] * 100)

        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC,
               ylim=robjects.IntVector((-3, 3)),
               main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
                ggplot2.aes_string(x="pvalue") + \
                ggplot2.geom_histogram() + \
                ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    reslsf = pandas2ri.ri2py(reslfc)
    res = pandas2ri.ri2py(resdf)

    reslsf.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results_LFC.tsv" %
                  (outdir, prefix, group1, group2, str(batch)),
                  sep='\t')
    # The plain results file must come from `res`; it was previously written
    # from the shrunken table, duplicating the LFC file.
    res.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results.tsv" %
               (outdir, prefix, group1, group2, str(batch)),
               sep='\t')
コード例 #17
0
def _plt_percountr(dat, independentpdf=False, fname='xpercount.pdf'):
    """Plot how many miRNA map to each TSS and how many TSS to each miRNA.

    Rows whose ``label`` equals the string ``'NA'`` are dropped first. Six
    figures are plotted in order: TSS counts, TSS density, miRNA counts,
    miRNA density, and the last two again with bars filled by ``label``.

    Args:
        dat: pandas DataFrame with ``tss``, ``mirna`` and ``label`` columns.
        independentpdf: when true, open a PDF device on ``fname`` for the
            plots and close it afterwards; otherwise plot on whatever device
            is currently active.
        fname: output path used when ``independentpdf`` is true.
    """
    def _filt_dat(dat, item, getlabel=True):
        """Count occurrences of each ``dat[item]`` value.

        Returns the count table, its number of rows and its largest count.
        With ``getlabel`` the table also carries the label of the first row
        matching each value.
        """
        df = pd.DataFrame(dat[item].value_counts())
        df.columns = ['count']
        if getlabel:
            df['label'] = [
                list(dat[dat[item] == i]['label'])[0] for i in df.index
            ]
        n = len(df)
        mx = max(df['count'])
        return df, n, mx

    dat = dat[dat['label'] != 'NA']

    ## NUMBER OF MIRNA PER TSS
    df, n, mx = _filt_dat(dat, 'tss', False)
    df = robjects.DataFrame({'count': robjects.IntVector(df['count'])})

    pt = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.aes_string(x='count') + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    pt_den = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='count', y='..density..') + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    ## NUMBER OF TSS PER MIRNA
    df, n, mx = _filt_dat(dat, 'mirna')
    df = robjects.DataFrame({
        'count': robjects.IntVector(df['count']),
        'label': robjects.StrVector(df['label'])
    })

    _pm = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    _pm_den = ggplot2.ggplot(df) + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    ## add xlabelling (need to be added after aes_string)
    _xlab = ggplot2.labs(x='Number of TSS per miRNA (max = %s)' % mx)

    ## not split by label
    pm = _pm + ggplot2.aes_string(x='count') + _xlab
    pm_den = _pm_den + ggplot2.aes_string(x='count', y='..density..') + _xlab

    ## split by label
    pms = _pm + ggplot2.aes_string(x='count', fill='label') + _xlab
    pm_dens = _pm_den + ggplot2.aes_string(
        x='count', fill='label', y='..density..') + _xlab

    plots = (pt, pt_den, pm, pm_den, pms, pm_dens)

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(fname)
        try:
            for plot in plots:
                plot.plot()
        finally:
            # Always close the device we opened, even if plotting fails.
            grdevices.dev_off()
    else:
        for plot in plots:
            plot.plot()
    return
コード例 #18
0
ファイル: runDE.py プロジェクト: csoulette/cmtools
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    ``group1``, ``group2``, ``batch`` and ``matrix`` are read from
    ``CommandLine().args``; the output directory (``diffExpOut``) and file
    prefix (``flair_diffexp``) are fixed. The sample table is derived from
    the matrix header: for each sample column name, the second
    underscore-separated field is the condition and the last field is the
    batch. The model ``~ batch + condition`` is fitted through the DESeq2 R
    package, the QC figures are drawn into one PDF, and the shrunken and
    plain results are written as TSV files.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = "diffExpOut"
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = "flair_diffexp"

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t')
    quantDF = quantDF.set_index('ids')
    df = pandas2ri.py2ri(quantDF)

    # now make the formula: sample names are the header fields after the
    # first (id) column
    with open(matrix) as l:
        header = next(l).rstrip().split()[1:]

    formula = [[x, x.split("_")[1], x.split("_")[-1]] for x in header]
    formulaDF = pd.DataFrame(formula)
    formulaDF.columns = ['sampleName', 'condition', 'batch']
    formulaDF = formulaDF.set_index('sampleName')
    sampleTable = pandas2ri.py2ri(formulaDF)

    design = Formula("~ batch + condition")
    print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)

    dds = deseq.DESeq(dds)
    # `value` is a logical flag in R's grep; pass a real boolean rather than
    # the string "TRUE".
    cont = robjects.r["grep"]("condition",
                              robjects.r['resultsNames'](dds),
                              value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    R.assign('res', res)
    # NOTE(review): writes testres.tsv into the working directory; looks
    # like a debugging aid -- confirm before removing.
    R('write.table(res, file="testres.tsv", quote=FALSE, col.names=NA)')
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd,
                      intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch)))
    try:
        # "%%" must sit inside the format string to become a single "%";
        # concatenating it after formatting printed a literal "%%".
        x = "PC1: %d%% variance" % int(percentVar[0] * 100)
        y = "PC2: %d%% variance" % int(percentVar[1] * 100)

        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC,
               ylim=robjects.IntVector((-3, 3)),
               main="MA-plot LFCSrrhinkage")
        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
        hh = ggplot2.ggplot(resdf) + \
                ggplot2.aes_string(x="pvalue") + \
                ggplot2.geom_histogram() + \
                ggplot2.theme_classic()
        hh.plot()
        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut = "./%s/%s_%s_deseq2_results_LFC.tsv" % (outdir, prefix, str(batch))
    resOut = "./%s/%s_%s_deseq2_results.tsv" % (outdir, prefix, str(batch))

    # The result tables are written by R. A stray sys.exit(1) used to follow
    # these calls, making a successful run exit with a failure status and
    # leaving an unreachable pandas re-export of the same files behind it;
    # both were removed.
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")