def plot_total_bp(parser, args, tot_bp_per_pore):
    """
    Plot the pore performance

    Draws one point per pore, in the order returned by
    ``minion_flowcell_layout()``, on a 16-row by 32-column grid and colours
    each point by log10 of the total base pairs recorded for that pore.
    Pores without an entry (or with a total of 0) are plotted with value 0.

    Args:
        parser: not used by this function.
        args: options object; ``args.saveas`` is either None (draw the plot
            and wait for the user to hit enter) or an output path ending in
            ``.pdf`` or ``.png``.
        tot_bp_per_pore: mapping of pore id -> total base pairs.

    Exits the process with status 1 when ``args.saveas`` has an
    unrecognized extension.
    """
    import math

    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        total_bp = tot_bp_per_pore[pore] if pore in tot_bp_per_pore else 0
        # log10 is undefined for 0, so an empty pore is plotted as 0.
        pore_values.append(math.log10(total_bp) if total_bp > 0 else 0.0)

    # make a data frame of the lists
    # list(range(...)) keeps the repetition working on Python 2 and 3.
    # The row/column vectors hold 16 * 32 = 512 entries, so the layout is
    # expected to have the same length.
    d = {
        "rownum": robjects.IntVector(list(range(1, 17)) * 32),
        "colnum": robjects.IntVector(sorted(list(range(1, 33)) * 16)),
        # log10 values are fractional; an integer vector cannot hold them.
        "log10_tot_bp": robjects.FloatVector(pore_values),
        "labels": robjects.IntVector(flowcell_layout),
    }

    df = robjects.DataFrame(d)
    gp = gg.ggplot(df)
    pp = (
        gp
        + gg.aes_string(y="factor(rownum, rev(rownum))", x="factor(colnum)")
        + gg.geom_point(gg.aes_string(color="log10_tot_bp"), size=7)
        + gg.geom_text(gg.aes_string(label="labels"), colour="white", size=2)
        + gg.scale_colour_gradient2(low="black", mid="black", high="red")
        + gg.coord_fixed(ratio=1.4)
        + gg.labs(x=gg.NULL, y=gg.NULL)
    )

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=11, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=11, height=8.5, units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            # Non-zero status so callers can tell that nothing was written.
            sys.exit(1)

        try:
            pp.plot()
        finally:
            # Always close the device, even if plotting fails.
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
def plot_total_bp(parser, args, tot_bp_per_pore):
    """
    Plot the pore performance

    Every pore listed by ``minion_flowcell_layout()`` becomes one labelled
    point on a 16 x 32 grid; the point colour encodes log10 of the total
    base pairs stored for that pore in ``tot_bp_per_pore``.  Pores that are
    missing from the mapping, or whose total is 0, get the value 0.

    Args:
        parser: not used by this function.
        args: options object.  When ``args.saveas`` is None the plot is
            drawn and the function waits for enter; otherwise the plot is
            written to that path, which must end in ``.pdf`` or ``.png``.
        tot_bp_per_pore: mapping of pore id -> total base pairs.

    Exits the process with status 1 for any other file extension.
    """
    import math

    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        if pore in tot_bp_per_pore and tot_bp_per_pore[pore] > 0:
            pore_values.append(math.log10(tot_bp_per_pore[pore]))
        else:
            # Missing pore, or a total of 0 for which log10 is undefined.
            pore_values.append(0.0)

    # make a data frame of the lists
    # (list(range(...)) so the repetition also works on Python 3)
    row_numbers = list(range(1, 17)) * 32
    col_numbers = sorted(list(range(1, 33)) * 16)
    d = {'rownum': robjects.IntVector(row_numbers),
         'colnum': robjects.IntVector(col_numbers),
         # FloatVector: log10 values are fractional and would not fit an
         # integer vector.
         'log10_tot_bp': robjects.FloatVector(pore_values),
         'labels': robjects.IntVector(flowcell_layout)
         }

    df = robjects.DataFrame(d)
    gp = gg.ggplot(df)
    pp = (gp
          + gg.aes_string(y='factor(rownum, rev(rownum))',
                          x='factor(colnum)')
          + gg.geom_point(gg.aes_string(color='log10_tot_bp'), size=7)
          + gg.geom_text(gg.aes_string(label='labels'),
                         colour="white", size=2)
          + gg.scale_colour_gradient2(low="black", mid="black", high="red")
          + gg.coord_fixed(ratio=1.4)
          + gg.labs(x=gg.NULL, y=gg.NULL))

    if args.saveas is None:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        read_line()
        return

    plot_file = args.saveas
    if plot_file.endswith(".pdf"):
        grdevices.pdf(plot_file, width=11, height=8.5)
    elif plot_file.endswith(".png"):
        grdevices.png(plot_file, width=11, height=8.5,
                      units="in", res=300)
    else:
        logger.error("Unrecognized extension for %s!" % (plot_file))
        # Report the failure through the exit status as well.
        sys.exit(1)

    try:
        pp.plot()
    finally:
        # Close the device even when plotting raises.
        grdevices.dev_off()
def plot_similarity_matrix(self, item_type, image_file, title):
    '''Draw the pairwise crawl similarities (overlap of unique items)
    of one item type as a heat map, save it below PLOTDIR and return
    the plot object'''
    similarities = self.similarity[item_type]

    # One table row per (crawl1, crawl2) pair, row ids counted from 1.
    columns = defaultdict(dict)
    row = 0
    for first in similarities:
        for second, value in similarities[first].items():
            row += 1
            columns['crawl1'][row] = MonthlyCrawl.short_name(first)
            columns['crawl2'][row] = MonthlyCrawl.short_name(second)
            columns['similarity'][row] = value
            columns['sim_rounded'][row] = value  # replaced by a label below
    table = pandas.DataFrame(columns)
    print(table)

    # The median similarity is the midpoint of the colour scale.
    sim_values = table['similarity']
    midpoint = sim_values.median()

    # A wide value range gets fewer decimals and a larger label font.
    if (sim_values.max() - sim_values.min()) > .2:
        decimals, textsize, minshown = 2, 2.8, .005
    else:
        decimals, textsize, minshown = 3, 2, .0005
    number_format = '{0:.' + str(decimals) + 'f}'

    def as_label(value):
        # Values below the smallest displayable number are shown as '0';
        # otherwise leading zeros are stripped from the formatted number.
        if value >= minshown:
            return number_format.format(value).lstrip('0')
        return '0'

    table['sim_rounded'] = table['sim_rounded'].apply(as_label)
    print('Median of similarities for', item_type, '=', midpoint)

    # For large matrices drop the rows/columns of the crawls that come
    # late in the reverse-sorted crawl list.
    if len(similarities) > self.MAX_MATRIX_SIZE:
        ranked = sorted(similarities, reverse=True)
        for position, crawl in enumerate(ranked):
            short_name = MonthlyCrawl.short_name(crawl)
            if position > self.MAX_MATRIX_SIZE:
                keep = ((table['crawl1'] != short_name)
                        & (table['crawl2'] != short_name))
                table = table[keep]

    x_text = ggplot2.element_text(angle=45, vjust=1, hjust=1)
    p = (ggplot2.ggplot(table)
         + ggplot2.aes_string(x='crawl2', y='crawl1',
                              fill='similarity', label='sim_rounded')
         + ggplot2.geom_tile(color="white")
         + ggplot2.scale_fill_gradient2(low="red", high="blue", mid="white",
                                        midpoint=midpoint, space="Lab")
         + GGPLOT2_THEME
         + ggplot2.coord_fixed()
         + ggplot2.theme(**{'axis.text.x': x_text})
         + ggplot2.labs(title=title, x='', y='')
         + ggplot2.geom_text(color='black', size=textsize))

    p.save(os.path.join(PLOTDIR, image_file))
    return p
def main():
    '''
    Run DESeq2 on a count matrix and write QC plots and result tables.

    Options are read from CommandLine(): the count matrix ('matrix'), the
    sample sheet ('formula'), the output directory ('outDir') and the
    naming fields 'prefix', 'group1' and 'group2'.  The design formula is
    "~ batch + condition" when the sample sheet has a "batch" column and
    "~ condition" otherwise.

    Files written to <cwd>/<outDir>:
      <prefix>_QCplots_<group1>_v_<group2>.pdf
      <prefix>_<group1>_v_<group2>_deseq2_results.tsv
      <prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    hasBatch = "batch" in list(formulaDF)
    if hasBatch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # Only grDevices is used through its Python handle; the other packages
    # are presumably imported so that their R functions can be looked up
    # by name below.
    importr('methods')
    importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design', design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')
    ###     ###

    # Get Results and shrinkage values
    res = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd = R('vst(dds,blind=FALSE)')
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    # get pca data
    if hasBatch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # Axis labels.  The whole label is one format string: with the former
    # '"PC1: %s" % value + "%% variance"' the "%" operator was applied
    # before the "+", so the label kept a literal "%%".
    pc1Label = "PC1: %s%% variance" % int(percentVar[0] * 100)
    pc2Label = "PC2: %s%% variance" % int(percentVar[1] * 100)

    # Batch, when present, is shown as the point shape.
    aesArgs = {"x": "PC1", "y": "PC2", "color": "condition"}
    if hasBatch:
        aesArgs["shape"] = "batch"

    # arrange
    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf" % (prefix, group1, group2))

    grdevices.pdf(file=qcOut)
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(**aesArgs) + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](pc1Label) + \
            robjects.r['ylab'](pc2Label) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrhinkage")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic() + \
            ggplot2.ggtitle("pvalue distribution")
        hh.plot()

        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (prefix, group1, group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv" % (prefix, group1, group2))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def main():
    '''
    Run DESeq2 (with apeglm shrinkage) on a count matrix and write QC
    plots and result tables.

    Options are read from CommandLine(): the count matrix ('matrix'), the
    sample sheet ('formula'), the output directory ('outDir') and the
    naming fields 'prefix', 'group1' and 'group2'.  The design formula is
    "~ batch + condition" when the sample sheet has a "batch" column and
    "~ condition" otherwise.

    Files written to ./<outDir>/:
      <prefix>_QCplots_<group1>_v_<group2>.pdf
      <prefix>_<group1>_v_<group2>_deseq2_results.tsv
      <prefix>_<group1>_v_<group2>_deseq2_results_shrinkage.tsv
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    hasBatch = "batch" in list(formulaDF)
    if hasBatch:
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are not used through a Python handle; they are
    # presumably imported so that their R functions (e.g. 'qq') can be
    # looked up by name below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # Name(s) of the fitted coefficient(s) that contain "condition".
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds), value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    # Also expose the unshrunken results as 'res' inside the R session.
    R.assign('res', res)

    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])

    # get pca data
    if hasBatch:
        intgroup = robjects.StrVector(("condition", "batch"))
    else:
        print(vsd)
        intgroup = "condition"
    pcaData = plotPCA(vsd, intgroup=intgroup, returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # Axis labels.  The whole label is one format string: with the former
    # '"PC1: %s" % value + "%% variance"' the "%" operator was applied
    # before the "+", so the label kept a literal "%%".
    pc1Label = "PC1: %s%% variance" % int(percentVar[0] * 100)
    pc2Label = "PC2: %s%% variance" % int(percentVar[1] * 100)

    # Batch, when present, is shown as the point shape.
    aesArgs = {"x": "PC1", "y": "PC2", "color": "condition"}
    if hasBatch:
        aesArgs["shape"] = "batch"

    # arrange
    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir, prefix, group1, group2))
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(**aesArgs) + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](pc1Label) + \
            robjects.r['ylab'](pc2Label) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")

        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()

        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut = "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir, prefix, group1, group2)
    resOut = "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir, prefix, group1, group2)

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def main():
    '''
    Run DESeq2 (design "~ batch + condition", apeglm shrinkage) on a count
    matrix whose sample names encode condition and batch.

    Options read from CommandLine(): 'matrix', 'group1', 'group2' and
    'batch'.  The matrix is tab separated with an 'ids' column; for every
    other column name, the second "_"-separated field is used as the
    condition and the last field as the batch.

    Files written:
      testres.tsv (in the current directory)
      ./diffExpOut/flair_diffexp_<group1>_vs_<group2>_<batch>_cutoff_plots.pdf
      ./diffExpOut/flair_diffexp_<batch>_deseq2_results_LFC.tsv
      ./diffExpOut/flair_diffexp_<batch>_deseq2_results.tsv
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = "diffExpOut"
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = "flair_diffexp"

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t')
    quantDF = quantDF.set_index('ids')
    df = pandas2ri.py2ri(quantDF)

    # now make the formula
    with open(matrix) as l:
        # Sample names are the header fields after the first one.
        header = next(l).rstrip().split()[1:]

    # One row per sample: name, condition (2nd field), batch (last field).
    formula = [[x, x.split("_")[1], x.split("_")[-1]] for x in header]
    formulaDF = pd.DataFrame(formula)
    formulaDF.columns = ['sampleName', 'condition', 'batch']
    formulaDF = formulaDF.set_index('sampleName')
    sampleTable = pandas2ri.py2ri(formulaDF)

    design = Formula("~ batch + condition")
    print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are not used through a Python handle; they are
    # presumably imported so that their R functions (e.g. 'qq') can be
    # looked up by name below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # Name(s) of the fitted coefficient(s) that contain "condition".
    # value is passed as a real boolean rather than the string "TRUE".
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds), value=True)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)

    R.assign('res', res)
    R('write.table(res, file="testres.tsv", quote=FALSE, col.names=NA)')

    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])

    # get pca data
    pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")), returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # Axis labels.  The whole label is one format string: with the former
    # '"PC1: %s" % value + "%% variance"' the "%" operator was applied
    # before the "+", so the label kept a literal "%%".
    pc1Label = "PC1: %s%% variance" % int(percentVar[0] * 100)
    pc2Label = "PC2: %s%% variance" % int(percentVar[1] * 100)

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_cutoff_plots.pdf" % (outdir, prefix, group1, group2, str(batch)))
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](pc1Label) + \
            robjects.r['ylab'](pc2Label) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")

        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()

        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    lfcOut = "./%s/%s_%s_deseq2_results_LFC.tsv" % (outdir, prefix, str(batch))
    resOut = "./%s/%s_%s_deseq2_results.tsv" % (outdir, prefix, str(batch))

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
    # An unconditional sys.exit(1) used to follow here, reporting failure
    # after a successful run and leaving a second, pandas-based export of
    # the same two files unreachable.  Both were removed; the function now
    # returns normally.
def main():
    '''
    Run DESeq2 (design "~ batch + condition", apeglm shrinkage) on a set
    of count files and write QC plots and result tables.

    Options read from CommandLine(): 'outdir', 'group1', 'group2',
    'batch', 'files', 'out_prefix' and 'filter'.  The sample table is
    derived from the file names: a file whose name contains group1 gets
    condition group1, any other file gets group2; a file whose name
    contains the batch string gets batch '1', any other file batch '2'.

    Files written to ./<outdir>/:
      <prefix>_<group1>_vs_<group2>_<batch>_<filter>_cutoff_plots.pdf
      <prefix>_<group1>_vs_<group2>_<batch>_deseq2_results_LFC.tsv
      <prefix>_<group1>_vs_<group2>_<batch>_deseq2_results.tsv
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outdir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    files = myCommandLine.args['files']
    prefix = myCommandLine.args['out_prefix']
    sFilter = myCommandLine.args['filter']

    makeDir(outdir)

    files = checkSamples(files)

    # presumably the count matrix built from the files - confirm in filesToDF
    df = filesToDF(files, sFilter)

    # DO DESEQ2
    from rpy2 import robjects
    from rpy2.robjects import r, pandas2ri, Formula
    from rpy2.robjects.lib import grid
    pandas2ri.activate()

    # Compile data for data frame
    data = list()
    for f in files:
        condition = group1 if group1 in f else group2
        batchLabel = '1' if batch in f else '2'
        data.append((f, condition, batchLabel))

    # Make the Data Frame
    pydf = pd.DataFrame(data)
    pydf.columns = ['sampleName', 'condition', 'batch']
    pydf = pydf.set_index('sampleName')
    # Convert pandas to R data frame.
    sampleTable = pandas2ri.py2ri(pydf)

    # DESEQ2 part.
    # Formula
    design = Formula("~ batch + condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    # 'methods' and 'qqman' are not used through a Python handle; they are
    # presumably imported so that their R functions (e.g. 'qq') can be
    # looked up by name below.
    importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df,
                                       colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, contrast=robjects.StrVector(("condition", group2, group1)))
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef="condition_%s_vs_%s" % (group2, group1), type="apeglm")
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])

    # get pca data
    pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")), returnData=robjects.r['T'])
    percentVar = robjects.r['attr'](pcaData, "percentVar")

    # Axis labels.  The whole label is one format string: with the former
    # '"PC1: %s" % value + "%% variance"' the "%" operator was applied
    # before the "+", so the label kept a literal "%%".
    pc1Label = "PC1: %s%% variance" % int(percentVar[0] * 100)
    pc2Label = "PC2: %s%% variance" % int(percentVar[1] * 100)

    # arrange
    grdevices.pdf(file="./%s/%s_%s_vs_%s_%s_%s_cutoff_plots.pdf" % (outdir, prefix, group1, group2, str(batch), sFilter))
    try:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](pc1Label) + \
            robjects.r['ylab'](pc2Label) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

        plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
        plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrrhinkage")

        plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
        plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")

        hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic()
        hh.plot()

        plotDisp(dds, main="Dispersion Estimates")
    finally:
        # Close the PDF device even if one of the plots fails.
        grdevices.dev_off()

    reslsf = pandas2ri.ri2py(reslfc)
    res = pandas2ri.ri2py(resdf)

    reslsf.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results_LFC.tsv" % (outdir, prefix, group1, group2, str(batch)), sep='\t')
    # The unshrunken table goes to the plain results file; previously the
    # shrunken table (reslsf) was written to both files.
    res.to_csv("./%s/%s_%s_vs_%s_%s_deseq2_results.tsv" % (outdir, prefix, group1, group2, str(batch)), sep='\t')