def _plt_distr(dat, col, title='', splitBy_pfill=True, pfill='label',
               independentpdf=False, fname='xdistr.pdf'):
    """Draw a count histogram and a density histogram of ``dat[col]``.

    Rows whose ``pfill`` value equals the string 'NA' are dropped first.
    Two ggplot objects are built from the remaining rows and plotted in
    order: a plain histogram, then a density curve overlaid with a
    histogram scaled to ``..density..``.

    Args:
        dat: table indexable by column name and boolean mask.
        col: column to plot on the x axis.
        title: text placed before ``[Total = n]`` in the plot title.
        splitBy_pfill: when true, bars/curves are filled by ``pfill``.
        pfill: column used both for the 'NA' filter and for the fill.
        independentpdf: when true, a PDF device is opened on ``fname``
            before plotting and closed afterwards; otherwise the plots go
            to whatever device is currently active.
        fname: PDF path used when ``independentpdf`` is true.
    """
    valid = dat[dat[pfill] != 'NA']  ## remove invalid pairs
    total = len(valid)

    rframe = robjects.DataFrame({
        col: robjects.FloatVector(list(valid[col])),
        pfill: robjects.StrVector(list(valid[pfill])),
    })
    base = ggplot2.ggplot(rframe) + \
        ggplot2.ggtitle('%s [Total = %s]' % (title, total))

    ## Aesthetics shared by both plots; fill only when splitting by pfill
    aes_args = {'x': col}
    if splitBy_pfill:
        aes_args['fill'] = pfill

    ## Plot1: counts
    counts = base + ggplot2.aes_string(**aes_args)

    ## Plot2: density
    density = base + ggplot2.aes_string(**dict(aes_args, y='..density..'))
    density = density + ggplot2.geom_density(alpha=.5, origin=-500)

    ## Histogram settings and x range depend on which column is plotted
    if col == 'distance':
        hist_args = dict(binwidth=1000, position='identity', origin=-500)
        x_range = (-1000, 51000)
    else:
        hist_args = dict(position='identity')
        x_range = (-1.1, 1.1) if col == 'correlation' else None

    counts = counts + ggplot2.geom_histogram(alpha=.5, **hist_args)
    density = density + ggplot2.geom_histogram(alpha=.33, **hist_args)
    if x_range is not None:
        counts = counts + ggplot2.xlim(*x_range)
        density = density + ggplot2.xlim(*x_range)

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(file=fname)

    counts.plot()
    density.plot()

    if independentpdf:
        grdevices.dev_off()
    return
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    Only sizes strictly between ``args.min_length`` and ``args.max_length``
    are plotted.  The bin width is the observed size range divided by
    ``args.num_bins``.

    Args:
        sizes: iterable of read sizes.
        args: object providing ``min_length``, ``max_length``, ``num_bins``,
            ``theme_bw`` and ``saveas``.  ``saveas`` is either None (draw on
            the active device and wait for Enter) or a path ending in
            ``.pdf`` or ``.png``.

    Exits the process with status 1 (after logging an error) when no size
    survives the length filter or when ``saveas`` has an unsupported
    extension.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    kept = [s for s in sizes
            if s < args.max_length and s > args.min_length]
    if not kept:
        # min()/max() below would otherwise fail with an opaque ValueError.
        logger.error("No read sizes between %s and %s; nothing to plot." %
                     (args.min_length, args.max_length))
        sys.exit(1)

    sizes = robjects.IntVector(kept)
    sizes_min = min(sizes)
    sizes_max = max(sizes)
    binwidth = (sizes_max - sizes_min) / args.num_bins
    if binwidth <= 0:
        # All sizes identical (or integer division truncated to 0 on
        # Python 2): a zero bin width cannot be drawn, so use one unit.
        binwidth = 1

    d = {'sizes': sizes}
    df = robjects.DataFrame(d)

    # plot
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            # Non-zero status so callers and shells can see the failure.
            sys.exit(1)

        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
def histogram(self, dataframe, filename, parm, group, units):
    """Write a 512x512 PNG histogram of ``parm`` filled by ``group``.

    Bars are outlined in black and the x axis is labelled
    ``"<parm> <units>"``.  The PNG device is opened on ``filename``, the
    plot is drawn and the device is closed, all with stdout suppressed.
    """
    with suppress_stdout():
        grdevices.png(file=filename, width=512, height=512)
        chart = (
            ggplot2.ggplot(dataframe)
            + ggplot2.aes_string(x=parm, fill=group)
            + ggplot2.geom_histogram(colour="black")
            + ggplot2.labs(x=parm + " " + units)
        )
        chart.plot()
        grdevices.dev_off()
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    Sizes are kept only when strictly greater than ``args.min_length`` and
    strictly smaller than ``args.max_length``.  The histogram bin width is
    ``(max - min) / args.num_bins`` over the kept sizes, with a fallback of
    1 when that quotient is not positive.

    ``args.theme_bw`` adds ggplot2's black-and-white theme.  ``args.saveas``
    selects the output: None draws on the active device and waits for
    Enter; a ``.pdf`` or ``.png`` path writes an 8.5 x 8.5 inch file.

    The process exits with status 1, after logging an error, if nothing is
    left to plot or the ``saveas`` extension is not recognised.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    in_range = [s for s in sizes if s < args.max_length and s > args.min_length]
    if not in_range:
        # Without this guard min()/max() raise an unhelpful ValueError.
        logger.error(
            "No read sizes between %s and %s; nothing to plot."
            % (args.min_length, args.max_length)
        )
        sys.exit(1)

    sizes = robjects.IntVector(in_range)
    sizes_min = min(sizes)
    sizes_max = max(sizes)
    binwidth = (sizes_max - sizes_min) / args.num_bins
    if binwidth <= 0:
        # Happens when every size is equal, or when Python 2 integer
        # division truncates to 0; a histogram needs a positive width.
        binwidth = 1

    df = robjects.DataFrame({"sizes": sizes})

    # plot
    pp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="sizes")
        + ggplot2.geom_histogram(binwidth=binwidth)
    )
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is None:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        read_line()
        return

    plot_file = args.saveas
    if plot_file.endswith(".pdf"):
        grdevices.pdf(plot_file, width=8.5, height=8.5)
    elif plot_file.endswith(".png"):
        grdevices.png(plot_file, width=8.5, height=8.5, units="in", res=300)
    else:
        logger.error("Unrecognized extension for %s!" % (plot_file))
        # Report the failure through the exit status as well as the log.
        sys.exit(1)

    pp.plot()
    grdevices.dev_off()
def histogram(self, dataframe, filename, parm, group, units):
    """Save a histogram of column ``parm`` (fill colour by ``group``) as PNG.

    The image is 512 by 512, bars have a black outline and the x-axis
    label is ``parm`` followed by a space and ``units``.  Everything runs
    inside ``suppress_stdout()``.
    """
    with suppress_stdout():
        grdevices.png(file=filename, width=512, height=512)

        figure = ggplot2.ggplot(dataframe)
        layers = (
            ggplot2.aes_string(x=parm, fill=group),
            ggplot2.geom_histogram(colour="black"),
            ggplot2.labs(x=parm + " " + units),
        )
        for layer in layers:
            figure = figure + layer

        figure.plot()
        grdevices.dev_off()
def rpy2_plotter(anno, clusters, name):
    """Plot genes distribution in clusters using ggplot2 from R.

    The ten rows of ``anno`` with the largest ``n_ft`` select the
    categories to show.  ``clusters`` is restricted to those categories,
    converted to an R data frame, and drawn as a histogram of
    ``n_features`` (bin width 1) with one facet per category, five facets
    per row.  The figure is printed to a PDF device opened on ``name``.
    """
    pandas2ri.activate()
    grdevices = importr('grDevices')
    rprint = robjects.globalenv.get("print")

    top_categories = (
        anno.sort_values(by="n_ft", ascending=False)
        .head(n=10)["category"]
        .tolist()
    )
    selected = clusters[clusters["category"].isin(top_categories)]
    r_clusters = pandas2ri.py2ri(selected)

    figure = ggplot2.ggplot(r_clusters)
    figure = figure + ggplot2.aes_string(x="n_features")
    figure = figure + ggplot2.geom_histogram(binwidth=1)
    figure = figure + ggplot2.facet_wrap(robjects.Formula("~category"), ncol=5)
    figure = figure + ggplot2.labs(x="Number of Features",
                                   y="Number of Clusters",
                                   title="Clusters distribution")

    grdevices.pdf(file=name, width=11.692, height=8.267)
    rprint(figure)
    grdevices.dev_off()
def plot_histogram(fastq_file, plot_filename_png):
    """Plots histogram of length distribution of sequence in fastq_file
    and saves to plot_filename_png.

    The file is read in binary mode; the second line and every fourth line
    after it are taken as sequence lines and their stripped lengths are
    histogrammed.  The bin width is one twentieth of the observed length
    range, falling back to 1 when that range is zero.

    Args:
        fastq_file: path of the FASTQ file to read.
        plot_filename_png: path of the PNG to write; also used as the plot
            title.

    Raises:
        ValueError: if no sequence line is found in ``fastq_file``.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    with open(fastq_file, 'rb') as f:
        # Skip the first line, then take every 4th line (the raw sequence
        # letters of each record).
        lengths = [len(line.strip())
                   for line in itertools.islice(f, 1, None, 4)]

    if not lengths:
        # Same exception type min() would raise, with a usable message.
        raise ValueError("no sequence lines found in %s" % fastq_file)

    sizes = robjects.IntVector(lengths)

    sizes_min = min(sizes)
    sizes_max = max(sizes)
    binwidth = (sizes_max - sizes_min) / 20
    if binwidth <= 0:
        # All reads have the same length (or integer division truncated to
        # 0 on Python 2); a zero bin width cannot be drawn.
        binwidth = 1

    d = {'sizes': sizes}
    df = robjects.DataFrame(d)

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='sizes') \
            + ggplot2.geom_histogram(binwidth=binwidth) \
            + ggplot2.theme_grey() \
            + ggplot2.labs(title=plot_filename_png,
                           x="Size (in nucleotides)", y="Count")

    grdevices.png(plot_filename_png, width=8.5, height=8.5,
                  units="in", res=300)
    pp.plot()
    grdevices.dev_off()
def main():
    """Plot a histogram of per-gene log2 FPKM for the genes in a GTF file.

    Command line: ``<gtf file> <fpkm tracking>``.  Gene ids are collected
    from the ninth tab-separated column of the GTF.  For each gene the
    largest value returned by ``cuff.gene_expr`` is taken; genes whose
    maximum is not positive are left out, the rest are log2-transformed.
    The histogram (bin width 0.2) is written to
    ``<gtf path without extension>_fpkmhist.pdf``.
    """
    usage = 'usage: %prog [options] <gtf file> <fpkm tracking>'
    parser = OptionParser(usage)
    #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() already prints the expanded usage line; passing
        # the usage string as the message would show a literal "%prog".
        parser.error('Must provide a gtf file and an fpkm tracking file')
    gtf_file, fpkm_tracking_file = args

    # get genes
    genes = set()
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # Header/comment and blank lines have no attribute column.
            if line.startswith('#') or not line.strip():
                continue
            a = line.split('\t')
            genes.add(gff.gtf_kv(a[8])['gene_id'])

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    log_fpkms = []
    for gene_id in genes:
        max_fpkm = max(cuff.gene_expr(gene_id))
        if max_fpkm > 0:
            log_fpkms.append(math.log(max_fpkm, 2))

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({'fpkm': fpkms_r})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='fpkm') + \
        ggplot2.geom_histogram(binwidth=0.2)

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre)
    gp.plot()
    grdevices.dev_off()
def main():
    """Write a PDF histogram of log2 FPKM values for the genes of a GTF.

    Expects exactly two positional arguments: the GTF file and the FPKM
    tracking file.  Every ``gene_id`` found in column nine of the GTF is
    looked up with ``cuff.gene_expr``; the maximum of the returned values
    is log2-transformed when it is positive and skipped otherwise.  Output
    goes to ``<gtf path without extension>_fpkmhist.pdf``.
    """
    usage = "usage: %prog [options] <gtf file> <fpkm tracking>"
    parser = OptionParser(usage)
    # parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # optparse prints the usage itself; "%prog" is not expanded inside
        # the error message, so give a plain explanation instead.
        parser.error("Must provide a gtf file and an fpkm tracking file")
    gtf_file, fpkm_tracking_file = args

    # get genes
    genes = set()
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            if line.startswith("#") or not line.strip():
                # comment/blank lines carry no attribute column
                continue
            a = line.split("\t")
            genes.add(gff.gtf_kv(a[8])["gene_id"])

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    gene_maxima = (max(cuff.gene_expr(gene_id)) for gene_id in genes)
    log_fpkms = [math.log(max_fpkm, 2) for max_fpkm in gene_maxima if max_fpkm > 0]

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({"fpkm": fpkms_r})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="fpkm")
        + ggplot2.geom_histogram(binwidth=0.2)
    )

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file="%s_fpkmhist.pdf" % gtf_pre)
    gp.plot()
    grdevices.dev_off()
gp = ggplot2.ggplot(mtcars) pp = gp + \ ggplot2.aes_string(x='factor(cyl)', y='mpg') + \ ggplot2.geom_boxplot() pp.plot() #-- ggplot2geomboxplot-end grdevices.dev_off() #-- ggplot2geomhistogram-begin gp = ggplot2.ggplot(mtcars) pp = gp + \ ggplot2.aes_string(x='wt') + \ ggplot2.geom_histogram() #pp.plot() #-- ggplot2geomhistogram-end grdevices.png('../../_static/graphics_ggplot2geomhistogram.png', width=900, height=412, antialias="subpixel", type="cairo") grid.newpage() grid.viewport(layout=grid.layout(1, 3)).push() params = (('black', 'black'), ('black', 'white'), ('white', 'black')) for col_i in range(3):
antialias=ANTIALIAS, type="cairo") #-- ggplot2geomboxplot-begin gp = ggplot2.ggplot(mtcars) pp = (gp + ggplot2.aes_string(x='factor(cyl)', y='mpg') + ggplot2.geom_boxplot()) pp.plot() #-- ggplot2geomboxplot-end grdevices.dev_off() #-- ggplot2geomhistogram-begin gp = ggplot2.ggplot(mtcars) pp = (gp + ggplot2.aes_string(x='wt') + ggplot2.geom_histogram(bins=30)) #pp.plot() #-- ggplot2geomhistogram-end grdevices.png('../../_static/graphics_ggplot2geomhistogram.png', width=900, height=412, antialias=ANTIALIAS, type="cairo") grid.newpage() grid.viewport(layout=grid.layout(1, 3)).push() params = (('black', 'black'), ('black', 'white'), ('white', 'black')) for col_i in range(3):
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    Inputs are read from ``CommandLine().args``:
      * ``matrix``  - tab-separated counts, first column used as index.
      * ``formula`` - tab-separated sample table, first column as index.
        If it has a ``batch`` column the design is ``~ batch + condition``,
        otherwise ``~ condition``.
      * ``outDir``, ``prefix``, ``group1``, ``group2`` - used to build the
        output file names.  ``batch`` is read but not otherwise used here.

    Outputs, all under ``<cwd>/<outDir>``:
      * ``<prefix>_QCplots_<group1>_v_<group2>.pdf`` - PCA, two MA plots,
        a p-value QQ plot, a p-value histogram and dispersion estimates.
      * ``<prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv``
      * ``<prefix>_<group1>_v_<group2>_deseq2_results.tsv``
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    has_batch = "batch" in list(formulaDF)
    if has_batch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design', design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    # Get Results and shrinkage values
    res = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd = R('vst(dds,blind=FALSE)')
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    # get pca data; only the grouping variables differ between designs
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf" % (prefix, group1, group2))

    grdevices.pdf(file=qcOut)

    # "%%" must sit inside the formatted string to yield a single "%";
    # appended after formatting it would be shown literally as "%%".
    x = "PC1: %s%% variance" % int(percentVar[0]*100)
    y = "PC2: %s%% variance" % int(percentVar[1]*100)

    aes_args = dict(x="PC1", y="PC2", color="condition")
    if has_batch:
        aes_args["shape"] = "batch"

    pp = ggplot2.ggplot(pcaData) + \
        ggplot2.aes_string(**aes_args) + \
        ggplot2.geom_point(size=3) + \
        robjects.r['xlab'](x) + \
        robjects.r['ylab'](y) + \
        ggplot2.theme_classic() + \
        ggplot2.coord_fixed()
    pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrhinkage")
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic() + \
        ggplot2.ggtitle("pvalue distribution")
    hh.plot()

    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (prefix, group1, group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv" % (prefix, group1, group2))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
# Box plot of mpg grouped by cylinder count, drawn on the device that is
# currently open and then closed by dev_off().
# NOTE(review): the "#--" lines look like snippet begin/end markers used by
# an external extraction tool - keep them verbatim; confirm before editing.
pp = gp + \
     ggplot2.aes_string(x='factor(cyl)', y='mpg') + \
     ggplot2.geom_boxplot()

pp.plot()
#-- ggplot2geomboxplot-end
grdevices.dev_off()

#-- ggplot2geomhistogram-begin
# Histogram of the 'wt' column; the object is built here but not plotted
# directly (the plot call below is commented out).
gp = ggplot2.ggplot(mtcars)

pp = gp + \
     ggplot2.aes_string(x='wt') + \
     ggplot2.geom_histogram()

#pp.plot()
#-- ggplot2geomhistogram-end

# Open a 900x412 PNG and push a grid layout of one row by three columns.
grdevices.png('../../_static/graphics_ggplot2geomhistogram.png',
              width = 900, height = 412, antialias="subpixel", type="cairo")
grid.newpage()
grid.viewport(layout=grid.layout(1, 3)).push()

# Three pairs of colour names, one per layout column.
# NOTE(review): how each pair is used is not visible in this excerpt.
params = (('black', 'black'),
          ('black', 'white'),
          ('white', 'black'))

for col_i in range(3):
    # Viewport for row 1, column col_i+1 of the layout (R indices start at 1).
    vp = grid.viewport(**{'layout.pos.col':col_i+1, 'layout.pos.row': 1})
def main():
    '''
    Run DESeq2 (with apeglm shrinkage) and write QC plots and results.

    Inputs are read from ``CommandLine().args``:
      * ``matrix``  - tab-separated counts, first column used as index.
      * ``formula`` - tab-separated sample table, first column as index.
        A ``batch`` column switches the design from ``~ condition`` to
        ``~ batch + condition``.
      * ``outDir``, ``prefix``, ``group1``, ``group2`` - used to build the
        output file names.  ``batch`` is read but not otherwise used here.

    Outputs, all under ``./<outDir>``:
      * ``<prefix>_QCplots_<group1>_v_<group2>.pdf`` - PCA, MA plots, QQ
        plots, p-value histogram and dispersion estimates.
      * ``<prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv``
      * ``<prefix>_<group1>_v_<group2>_deseq2_results.tsv``
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    has_batch = "batch" in list(formulaDF)
    if has_batch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds), value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    R.assign('res', res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])

    # get pca data; only the grouping variables differ between designs
    if has_batch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir, prefix, group1, group2))

    # Keep "%%" inside the formatted string so it collapses to one "%";
    # concatenated afterwards it would be printed as a literal "%%".
    x = "PC1: %s%% variance" % int(percentVar[0]*100)
    y = "PC2: %s%% variance" % int(percentVar[1]*100)

    aes_args = dict(x="PC1", y="PC2", color="condition")
    if has_batch:
        aes_args["shape"] = "batch"

    pp = ggplot2.ggplot(pcaData) + \
        ggplot2.aes_string(**aes_args) + \
        ggplot2.geom_point(size=3) + \
        robjects.r['xlab'](x) + \
        robjects.r['ylab'](y) + \
        ggplot2.theme_classic() + \
        ggplot2.coord_fixed()
    pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")
    plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic()
    hh.plot()

    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    lfcOut = "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir, prefix, group1, group2)
    resOut = "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir, prefix, group1, group2)

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
##text_log+="average: "+str(rmean(test23)[0])+end ##text_log+="sum: "+str(rsum(test23)[0])+end # #roughbin= round(ma[0]/100) #bins=round(roughbin/100)*100 #ma2=rmax(ed) #dataf_subset = dataf.rx(dataf.rx2("contig").ro >= 18, true) scales = importr('scales') gp = ggplot2.ggplot(dataf) #geom_histogram(aes(y = ..density..)) # ggplot2.geom_density()+\ # pp = gp + ggplot2.aes_string(x='%s(contrrr)') + ggplot2.geom_histogram()+ggplot2.scale_y_sqrt() bins=10 teest3=robjects.r('theme(axis.text.x=element_text(angle=90))') pp = gp + \ ggplot2.aes_string(x='Length') + \ ggplot2.geom_histogram()+\ ggplot2.ggtitle("Found IS fragment lengths")+ \ ggplot2.scale_x_continuous(name="fragment lengths, bin="+str(bins),breaks=scales.pretty_breaks(20)) +\ ggplot2.scale_y_continuous(labels=scales.comma,name="Count",breaks=scales.pretty_breaks(10))+ \ teest3 pp.plot() robjects.r.ggsave("/Users/security/science/dna_subj_hist.pdf")
def main():
    '''
    Run DESeq2 on per-sample count files and write plots and result tables.

    Inputs are read from ``CommandLine().args``: ``workingdir``,
    ``outdir``, ``group1``, ``group2``, ``batch``, ``files``,
    ``out_prefix`` and ``filter``.  ``workingdir`` is read but not
    otherwise used here.

    Each file is labelled with condition ``group1`` when that string
    occurs in its name (else ``group2``), and with batch ``'1'`` when the
    ``batch`` string occurs in its name (else ``'2'``).  The design is
    ``~ batch + condition``; results are oriented ``group2`` vs ``group1``.

    Outputs, all under ``./<outdir>``:
      * ``<prefix>_<group1>_vs_<group2>_<batch>_<filter>_cutoff_plots.pdf``
      * ``<prefix>_<group1>_vs_<group2>_<batch>_deseq2_results_LFC.tsv``
        (apeglm-shrunken results)
      * ``<prefix>_<group1>_vs_<group2>_<batch>_deseq2_results.tsv``
        (unshrunken results)
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    workingdir = myCommandLine.args['workingdir']
    outdir = myCommandLine.args['outdir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    files = myCommandLine.args['files']
    prefix = myCommandLine.args['out_prefix']
    sFilter = myCommandLine.args['filter']

    makeDir(outdir)

    files = checkSamples(files)

    df = filesToDF(files, sFilter)

    # DO DESEQ2
    from rpy2 import robjects
    from rpy2.robjects import r, pandas2ri, Formula
    from rpy2.robjects.lib import grid
    pandas2ri.activate()

    # Compile data for data frame: (file, condition, batch id) per sample
    data = [
        (f, group1 if group1 in f else group2, '1' if batch in f else '2')
        for f in files
    ]

    # Make the Data Frame
    pydf = pd.DataFrame(data)
    pydf.columns = ['sampleName', 'condition', 'batch']
    pydf = pydf.set_index('sampleName')
    # Convert pandas to R data frame.
    sampleTable = pandas2ri.py2ri(pydf)

    # DESEQ2 part.
    # Formula
    design = Formula("~ batch + condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, contrast=robjects.StrVector(
        ("condition", group2, group1)))
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef="condition_%s_vs_%s" %
                             (group2, group1), type="apeglm")
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch), sFilter))

    # "%%" has to be part of the formatted string to become a single "%";
    # appended after formatting it would be printed as a literal "%%".
    x = "PC1: %s%% variance" % int(percentVar[0] * 100)
    y = "PC2: %s%% variance" % int(percentVar[1] * 100)

    pp = ggplot2.ggplot(pcaData) + \
        ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
        ggplot2.geom_point(size=3) + \
        robjects.r['xlab'](x) + \
        robjects.r['ylab'](y) + \
        ggplot2.theme_classic() + \
        ggplot2.coord_fixed()
    pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")
    plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic()
    hh.plot()

    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    reslsf = pandas2ri.ri2py(reslfc)
    res = pandas2ri.ri2py(resdf)

    reslsf.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results_LFC.tsv" %
                  (outdir, prefix, group1, group2, str(batch)), sep='\t')
    # Write the unshrunken table here; previously the shrunken table was
    # written to both files and these results were never saved.
    res.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results.tsv" %
               (outdir, prefix, group1, group2, str(batch)), sep='\t')
def _plt_percountr(dat, independentpdf=False, fname='xpercount.pdf'):
    """Plot how many miRNA share a TSS and how many TSS each miRNA has.

    Rows whose ``label`` is the string 'NA' are dropped.  Six plots are
    drawn in this order: TSS counts, TSS density, miRNA counts, miRNA
    density, miRNA counts filled by label, miRNA density filled by label.

    Args:
        dat: table with ``tss``, ``mirna`` and ``label`` columns.
        independentpdf: when true, open a PDF device on ``fname`` for the
            plots and close it afterwards; otherwise draw on the active
            device.
        fname: PDF path used when ``independentpdf`` is true.
    """
    def _filt_dat(dat, item, getlabel=True):
        # Occurrences of each distinct value of `item`; optionally attach
        # the label of the first row carrying that value.
        counts = pd.DataFrame(dat[item].value_counts())
        counts.columns = ['count']
        if getlabel:
            first_labels = []
            for key in counts.index:
                first_labels.append(list(dat[dat[item] == key]['label'])[0])
            counts['label'] = first_labels
        return counts, len(counts), max(counts['count'])

    dat = dat[dat['label'] != 'NA']

    ## NUMBER OF MIRNA PER TSS
    tss_counts, n_tss, max_tss = _filt_dat(dat, 'tss', False)
    tss_frame = robjects.DataFrame(
        {'count': robjects.IntVector(tss_counts['count'])})
    tss_title = ggplot2.ggtitle('TSS [Total = %s]' % n_tss)
    tss_xlab = ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % max_tss)

    pt = ggplot2.ggplot(tss_frame) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, max_tss+1) + \
        ggplot2.aes_string(x='count') + \
        tss_title + \
        tss_xlab

    pt_den = ggplot2.ggplot(tss_frame) + \
        ggplot2.aes_string(x='count', y='..density..') + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        tss_title + \
        tss_xlab

    ## NUMBER OF TSS PER MIRNA
    mirna_counts, n_mirna, max_mirna = _filt_dat(dat, 'mirna')
    mirna_frame = robjects.DataFrame({
        'count': robjects.IntVector(mirna_counts['count']),
        'label': robjects.StrVector(mirna_counts['label']),
    })
    mirna_title = ggplot2.ggtitle('miRNA [Total = %s]' % n_mirna)

    _pm = ggplot2.ggplot(mirna_frame) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, max_mirna+1) + \
        mirna_title

    _pm_den = ggplot2.ggplot(mirna_frame) + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        mirna_title

    ## x label goes on last (need to be added after aes_string)
    _xlab = ggplot2.labs(x='Number of TSS per miRNA (max = %s)' % max_mirna)

    ## not split by label
    pm = _pm + ggplot2.aes_string(x='count') + _xlab
    pm_den = _pm_den + ggplot2.aes_string(x='count', y='..density..') + _xlab

    ## split by label
    pms = _pm + ggplot2.aes_string(x='count', fill='label') + _xlab
    pm_dens = _pm_den + ggplot2.aes_string(
        x='count', fill='label', y='..density..') + _xlab

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(fname)

    for figure in (pt, pt_den, pm, pm_den, pms, pm_dens):
        figure.plot()

    if independentpdf:
        grdevices.dev_off()
    return
def main():
    '''
    Run DESeq2 on a count matrix whose column names encode the design.

    Inputs are read from ``CommandLine().args``: ``group1``, ``group2``,
    ``batch`` (file names only) and ``matrix`` (tab-separated counts with
    an ``ids`` column).  Every header field after the first is split on
    "_": the second piece is the condition and the last piece is the
    batch.  The design is ``~ batch + condition``.

    Outputs:
      * ``testres.tsv`` in the current directory (raw results table).
      * ``./diffExpOut/flair_diffexp_<group1>_vs_<group2>_<batch>_cutoff_plots.pdf``
      * ``./diffExpOut/flair_diffexp_<batch>_deseq2_results_LFC.tsv``
      * ``./diffExpOut/flair_diffexp_<batch>_deseq2_results.tsv``

    The function ends by calling ``sys.exit(1)``.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = "diffExpOut"
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = "flair_diffexp"

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t')
    quantDF = quantDF.set_index('ids')
    df = pandas2ri.py2ri(quantDF)

    # now make the formula: sample name, condition, batch per column
    with open(matrix) as matrix_in:
        header = next(matrix_in).rstrip().split()[1:]

    formula = []
    for sample in header:
        pieces = sample.split("_")
        formula.append([sample, pieces[1], pieces[-1]])

    formulaDF = pd.DataFrame(formula)
    formulaDF.columns = ['sampleName', 'condition', 'batch']
    formulaDF = formulaDF.set_index('sampleName')
    sampleTable = pandas2ri.py2ri(formulaDF)

    design = Formula("~ batch + condition")
    print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)
    # grep's `value` is a logical; pass a real boolean instead of "TRUE".
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds), value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    # NOTE(review): writes testres.tsv into the current directory on every
    # run; looks like a debugging aid - confirm whether it is still wanted.
    R.assign('res', res)
    R('write.table(res, file="testres.tsv", quote=FALSE, col.names=NA)')
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")),
                      returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_cutoff_plots.pdf" %
                  (outdir, prefix, group1, group2, str(batch)))

    # "%%" only collapses to "%" inside the string being formatted;
    # concatenated afterwards it would be printed as a literal "%%".
    x = "PC1: %s%% variance" % int(percentVar[0] * 100)
    y = "PC2: %s%% variance" % int(percentVar[1] * 100)

    pp = ggplot2.ggplot(pcaData) + \
        ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
        ggplot2.geom_point(size=3) + \
        robjects.r['xlab'](x) + \
        robjects.r['ylab'](y) + \
        ggplot2.theme_classic() + \
        ggplot2.coord_fixed()
    pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")
    plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic()
    hh.plot()

    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    lfcOut = "./%s/%s_%s_deseq2_results_LFC.tsv" % (outdir, prefix, str(batch))
    resOut = "./%s/%s_%s_deseq2_results.tsv" % (outdir, prefix, str(batch))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")

    # NOTE(review): status 1 normally signals failure, yet this is the
    # success path. Kept unchanged because callers may rely on it. The
    # pandas re-export that used to follow this call could never run and
    # has been removed.
    sys.exit(1)