def launch_coveragebed(bamfilenames, bedfilename, legend, outdir, executiongranted):
    global TMP

    coveragefiles = []
    Pcoveragebeds = []
    pid = str(os.getpid())

    for i, bamfilename in enumerate(bamfilenames):
        coveragefile = TMP + '/' + os.path.basename(bamfilename).replace('.bam', '.' + pid + '.coverage')
        coveragebedgraph = outdir + '/data/' + legend[i].replace('.bam', '.bed')

        print 'Coveragefile = ' + coveragefile
        bam = bam_file.bam_file(bamfilename, 'rb')

        # Coverage calculation runs in a separate process; the semaphore
        # (executiongranted) limits how many workers run concurrently.
        print 'Launching coverageBed...'
        Pcoveragebed = multiprocessing.Process(target=bam.myCoverageBed,
                                               args=(bedfilename, None, coveragefile, executiongranted,
                                                     TMP, coveragebedgraph,))
        Pcoveragebed.start()
        print '    Done.'

        coveragefiles.append(coveragefile)
        Pcoveragebeds.append(Pcoveragebed)

    return [Pcoveragebeds, coveragefiles]

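# Usage sketch (hypothetical, not part of the module): gate two BAMs behind a
# two-slot semaphore and wait for both coverage workers. The file names, the
# output directory and the TMP value are placeholders.
#
#     executiongranted = multiprocessing.Semaphore(2)
#     Pcoveragebeds, coveragefiles = launch_coveragebed(
#         ['/data/sample1.bam', '/data/sample2.bam'], 'targets.bed',
#         ['sample1.bam', 'sample2.bam'], '/data/out', executiongranted)
#     for p in Pcoveragebeds:
#         p.join()
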
def intersectbam(self, bam):
    """************************************************************************************************************************************************************
    Task: intersects a bam file with the regions contained in this bed file.

    Inputs:
        bam: bam_file object pointing to the bam to intersect.

    Returns: a new, indexed bam_file object containing only the reads that overlap the regions in self.filename.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())
    newbam = TMP + '/' + pid + '.intersect.bam'
    self.run(BEDTOOLS + "intersectBed -abam " + bam.filename + " -b " + self.filename + " > " + newbam)
    pysam.index(newbam)

    return bam_file.bam_file(newbam, "rb")

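# Usage sketch (hypothetical): keep only the on-target reads of a bam.
# This assumes intersectbam is a method of the bed_file class (as the use of
# self.filename suggests) and that BEDTOOLS points at the bedtools bin dir.
#
#     targets = bed_file.bed_file('targets.bed')
#     ontarget = targets.intersectbam(bam_file.bam_file('sample.bam', 'rb'))
#     print ontarget.filename, ontarget.nreads()
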
def launch_onoff_reads(bamfilenames, bedfilename, legend, outdir, executiongranted):
    global TMP

    # Shared-memory flags and arrays so the worker process can report results back to the parent
    onoff_status = multiprocessing.Value('b', False)
    duplicates_status = multiprocessing.Value('b', False)
    enrichment = multiprocessing.Array('f', len(bamfilenames))
    percontarget = multiprocessing.Array('f', len(bamfilenames))
    onduplicates = multiprocessing.Array('f', len(bamfilenames))
    offduplicates = multiprocessing.Array('f', len(bamfilenames))

    bam = bam_file.bam_file(bamfilenames[0], 'rb')
    print 'Launching on/off target enrichment calculation...'
    Ponoff_reads = multiprocessing.Process(target=bam.reads_on_target,
                                           args=(bedfilename, outdir,
                                                 [bam_file.bam_file(bamfilenames[i]) for i in range(1, len(bamfilenames))],
                                                 legend, executiongranted, onoff_status, duplicates_status,
                                                 onduplicates, offduplicates, enrichment, percontarget,
                                                 TMP, config.warnontarget,))
    Ponoff_reads.start()
    bam.close()

    return Ponoff_reads, onoff_status, onduplicates, offduplicates, duplicates_status, enrichment, percontarget

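# Sketch (hypothetical) of collecting the shared-memory results once the
# worker finishes: multiprocessing.Value/Array give the child process a place
# to write flags and per-sample figures that the parent reads after join().
#
#     P, onoff, ondup, offdup, dupstatus, enrich, perctarget = launch_onoff_reads(
#         bams, 'targets.bed', legend, outdir + '/data/', executiongranted)
#     P.join()
#     print 'On-target %:', [perctarget[i] for i in range(len(bams))]
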
def exon_coverage_std(groups, fileoutprefix, bedfilename, legend=None, normalize=True):
    """************************************************************************************************************************************************************
    Task: generates the distribution of coverage standard deviation across exons.

    Inputs:
        groups: list of sublists. Each sublist contains bam filenames of samples related somehow, e.g. samples sequenced in the same run.
        fileoutprefix: string containing the fileout prefix.
        bedfilename: string containing the name of the bed with the regions to analyze.
        legend: list of strings describing each of the groups that will be processed. These descriptions will form the legend of the bar plot.
        normalize: {True, False} to indicate whether bam files should be normalized.

    Output: two .png figures are generated, one containing the distributions of coverage standard deviation across exons and one containing a
        box plot of such distributions.
    ************************************************************************************************************************************************************"""

    minsize = 1000000000000000
    minbamfilename = None
    bamgroups = []

    # Open (and index, if needed) every bam, locating the one with the fewest reads
    for colouridx, filelist in enumerate(groups):
        bamlist = []

        for filename in filelist:
            # Check indexing of the bam file, needed for pysam use
            if(not os.path.isfile(filename + '.bai') and not os.path.isfile(filename.replace('.bam', '.bai'))):
                print 'WARNING: index not found for ' + filename + '. Indexing...'
                pysam.index(filename)
                print '    Done.'

            bam = bam_file.bam_file(filename, 'rb')
            bamlist.append(bam)

            # Find the bam with the minimum number of reads
            if(bam.nreads() < minsize):
                minsize = bam.nreads()
                minbamfilename = bam.filename

        bamgroups.append(bamlist)

    print 'The smallest bam is ' + minbamfilename + ' and contains ' + str(minsize) + ' reads.'
    fig = pyplot.figure(figsize=(13, 6))
    ax = fig.add_subplot(111)
    boxplot = pyplot.figure()
    axb = boxplot.add_subplot(111)
    rects = []
    colours = ['#ff0000', '#00ff00', '#0000ff', '#cc0011', '#007722', '#110066']
    global_stdsampling = []

    # Process each group and draw the corresponding histogram in the graph
    for colouridx, filelist in enumerate(bamgroups):
        # Sample the coverage std of each exon in the current file
        for bam in filelist:
            print '    ' + bam.filename

            # Check whether normalization should be applied
            if(normalize):
                normalizedbam = bam.normalize(minsize)
            else:
                normalizedbam = bam

            std_sampling = normalizedbam.region_coverage_std(bedfilename)
            bins = numpy.arange(0, 1, 0.007)
            rects.append(ax.hist(std_sampling, bins, alpha=0.5, facecolor=colours[colouridx])[2])

            # Box plots are drawn on a log10 scale; null stds would break log10 and are filtered out
            std_sampling = numpy.array(std_sampling)
            global_stdsampling.append(list(numpy.log10(std_sampling[(std_sampling > 0)])))

    fig.suptitle('Distribution of coverage standard deviations (normalized) across exons', fontsize=14, fontweight='bold')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Normalized standard deviation')
    ax.set_xlim(0, 1)

    boxplot.suptitle('Distribution of coverage standard deviations (normalized) across exons', fontsize=14, fontweight='bold')
    axb.boxplot(global_stdsampling)

    # Check whether a graph legend should be included
    if(legend is not None):
        axb.set_xticklabels(legend)

        # Shrink current axis by 20% so the legend fits outside the plot
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(tuple([rect[0] for rect in rects]), tuple(legend), loc="upper left", bbox_to_anchor=(1, 1))

    fig.savefig(fileoutprefix + '/std_distribution.png')
    matplotlib.pyplot.close(fig)
    boxplot.savefig(fileoutprefix + '/std_boxplot.png')
    matplotlib.pyplot.close(boxplot)

    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'

def coverage_saturation_local(bamlist, targets, depthlist, coverage, legend, fileout, executiongranted=None, status=None, slopes=None,
                              tmpdir=None, warnthreshold=1e-5):
    """************************************************************************************************************************************************************
    Task: calculates and draws coverage saturation plots for a list of samples. Just the same as the sequential version but in
        multiprocessing mode.

    Inputs:
        bamlist: list of strings with the names of the bams to process.
        targets: list of strings with the names of the beds containing the targets for each run.
        depthlist: list of integers containing the run depths to test (millions of reads).
        coverage: integer with the coverage threshold a position must reach to count as covered.
        legend: list of descriptions describing each of the files that will be processed. These descriptions will form the legend of the bar plot.
        fileout: string containing the name of the file where the plot will be saved.

    Outputs: saves the saturation curve at fileout and, when provided, fills the shared "status" flag and "slopes" array.
    ************************************************************************************************************************************************************"""

    global TMP

    # Check whether a temporary directory is provided as an argument
    if(tmpdir is not None):
        TMP = tmpdir

    pid = str(os.getpid())
    simulated_depth_processes = []

    # Launch one process per sample and depth for calculating the % of covered positions
    result_files = []
    for i, bam in enumerate(bamlist):
        # Check whether there is an index for the current bam
        if(not os.path.isfile(bam + '.bai') and not os.path.isfile(bam.replace('.bam', '.bai'))):
            print 'WARNING: index not found for ' + bam + '. Indexing...'
            pysam.index(bam)

        # Processes are launched for each bam and depth point. If provided depth values are greater than the number of reads in the bam file,
        # the maximum depth value to be used will be the number of reads in the bam and no more processes will be launched.
        nreads_bam = bam_file.bam_file(bam).nreads()
        sorteddepths = sorted(depthlist)  # sorted copy, so the caller's list is not reordered

        if(nreads_bam >= (sorteddepths[1] * 1000000)):
            endreached = False
            j = 0
            while(j < len(depthlist) and not endreached):
                depth = depthlist[j]

                # If a legend is provided, use it to differentiate job ids
                if(legend is not None):
                    jobid = 'coverage_' + pid + '_' + str(depth) + '_' + legend[i].lower()
                else:
                    jobid = 'coverage_' + pid + '_' + str(depth) + '_' + os.path.basename(bamlist[i])

                print "Submitting depth " + str(depth) + ", file " + bam

                # Activate the flag to indicate that following depth values are greater than the number of reads in the bam
                if((depth * 1000000) >= nreads_bam):
                    endreached = True

                newprocess = multiprocessing.Process(target=simulated_depth.simulated_depth,
                                                     args=(bam, targets[i], depth, coverage, TMP + '/' + jobid,
                                                           executiongranted, TMP,))
                simulated_depth_processes.append(newprocess)
                newprocess.start()

                result_files.append(TMP + '/' + jobid)
                j += 1
        else:
            print 'WARNING: the number of reads in ' + str(bam) + ' is ' + str(nreads_bam)
            print '    The set of depths provided for coverage saturation calculus is 10e6*' + str(depthlist)
            print '    At least two depths equal or lower than the number of mapped reads are required.'

    if(len(simulated_depth_processes) > 0):
        # Wait for all the processes to finish
        for process in simulated_depth_processes:
            process.join()
            process.terminate()

        print 'Submitting draw saturation curve...'
        slope_status, tmpslopes = draw_saturation_curve.draw_saturation_curve(result_files, '% covered positions', fileout, legend,
                                                                              warnthreshold=warnthreshold)

        if(slopes is not None):
            for i, slope in enumerate(tmpslopes):
                slopes[i] = slope

        # The status flag is set only if the slope check passed for every bam file
        if(status is not None):
            status.value = (sum(slope_status) == len(bamlist))

        # Remove temporary files
        for afile in result_files:
            os.remove(afile)

    elif(status is not None):
        status.value = False

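# Usage sketch (hypothetical): computing a saturation curve for one sample at
# four simulated depths (millions of reads), collecting the result through
# shared memory. Names and paths are placeholders.
#
#     status = multiprocessing.Value('b', False)
#     slopes = multiprocessing.Array('f', 1)
#     coverage_saturation_local(['/data/sample1.bam'], ['targets.bed'],
#                               [5, 10, 15, 20], 10, ['sample1'],
#                               '/data/out/saturation.png', status=status,
#                               slopes=slopes, tmpdir='/tmp')
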
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto',
           coveragethresholds=[1, 5, 10, 20, 30], onefeature=None, tmpdir=None):

    global TMP

    pid = str(os.getpid())  # defined up front so it is available even when no bam needs sorting

    if(tmpdir is not None):
        if(os.path.isdir(tmpdir) or os.path.islink(tmpdir)):
            TMP = tmpdir
        else:
            print 'ERROR: temporary directory ' + tmpdir + ' does not exist.'
            print '    Exiting'
            sys.exit(1)

    if(not (os.path.isdir(outdir) or os.path.islink(outdir))):
        print 'WARNING: ' + outdir + ' does not exist. Creating directory.'
        os.mkdir(outdir)

    if(not (os.path.isdir(outdir + '/data') or os.path.islink(outdir + '/data'))):
        print 'Creating ' + outdir + '/data'
        os.mkdir(outdir + '/data')

    if(not (os.path.isdir(outdir + '/img') or os.path.islink(outdir + '/img'))):
        print 'Creating ' + outdir + '/img'
        os.mkdir(outdir + '/img')

    sortedbams = []
    for bamfilename in bamfilenames:
        filelink = TMP + '/' + os.path.basename(bamfilename)
        try:
            os.symlink(bamfilename, filelink)
        except OSError:
            print 'WARNING: when trying to create a symbolic link at the temporary directory pointing to ' + bamfilename + \
                ', a file named ' + filelink + ' was already found.'
            print '    Probably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.'
            print '    Continue (y/n)?'
            goahead = raw_input()
            if(goahead == 'n' or goahead == 'N'):
                print 'Exiting...'
                sys.exit(1)
            elif(goahead != 'y' and goahead != 'Y'):
                print 'Unknown choice ' + goahead
                print 'Exiting...'
                sys.exit(1)

            if(os.path.dirname(bamfilename) != os.path.dirname(TMP + '/')):
                os.remove(filelink)
                os.symlink(bamfilename, filelink)

        print 'Indexing...'
        pysam.index(filelink)
        print '    Done.'

        if(not bam_file.bam_file(filelink).issorted()):
            print 'WARNING: ' + bamfilename + ' is not sorted'
            print 'Sorting...'
            newsortedbam = TMP + '/' + str(time.time()) + '.sorted'
            sortedbams.append(newsortedbam + '.bam')
            pysam.sort(filelink, newsortedbam)
            print 'Indexing...'
            pysam.index(sortedbams[-1])
            print '    Done.'
        else:
            sortedbams.append(filelink)

    if(saturation and depthlist == 'auto'):
        maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams])
        depthlist = numpy.arange(maxdepth / 5.0, maxdepth + (maxdepth / 5.0) - 1, maxdepth / 5.0)
        depthlist = depthlist / 1000000.0

    legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames]
    executiongranted = multiprocessing.Semaphore(nthreads)

    if(extend is not None):
        bedfilename = TMP + '/' + os.path.basename(originalbedfilename).replace('.bed', '.' + pid + '.extended.bed')
        bed_file.bed_file(originalbedfilename).extendnoref(extend, bedfilename)
    else:
        bedfilename = originalbedfilename

    # Coverage files are needed by every feature except saturation and specificity
    if(onefeature is None or (onefeature != 'saturation' and onefeature != 'specificity')):
        Pcoveragebeds, coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted)
    else:
        Pcoveragebeds = []
        coveragefiles = []

    if((saturation and onefeature is None) or onefeature == 'saturation'):
        Psaturation, coverage_saturation_status, saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist,
                                                                                               legend, outdir + '/data/', executiongranted)
    else:
        coverage_saturation_status = None
        saturationslopes = None

    if(onefeature is None or onefeature == 'specificity'):
        Ponoff_reads, onoff_status, onduplicates, offduplicates, duplicates_status, enrichment, percontarget = launch_onoff_reads(
            sortedbams, bedfilename, legend, outdir + '/data/', executiongranted)

    for i in range(len(Pcoveragebeds)):
        Pcoveragebeds[i].join()
        Pcoveragebeds[i].terminate()

    if(onefeature is None or onefeature == 'specificity'):
        Poffclusters = launch_offclusters(glob.glob(outdir + '/data/*.bed'), bedfilename, executiongranted)

    if(onefeature is None or onefeature == 'coveragefreq'):
        Pcoveragedistribution, coveragedistribution_status, meancoverage = launch_coverage_distribution(coveragefiles,
                                                                                                        outdir + '/data/', legend,
                                                                                                        executiongranted)

    if(onefeature is None or onefeature == 'percbases'):
        Pcoveredpositions, coveredpositions_status, coveredbases = launch_covered_positions(coveragefiles, coveragethresholds,
                                                                                            outdir + '/data/', legend, executiongranted)

    if(onefeature is None or onefeature == 'coveragedistr'):
        Pcoveragethroughtarget, throughtarget_status, lowcovbases = launch_coverage_through_target(coveragefiles, outdir + '/data/',
                                                                                                   legend, executiongranted)

    if(len(coveragefiles) > 1 and (onefeature is None or onefeature == 'coveragecorr')):
        Pcoveragecorr, coveragecorr_status, corr = launch_coveragecorr(coveragefiles, outdir + '/data/coveragecorr.png', legend,
                                                                       executiongranted)
    else:
        coveragecorr_status = None
        corr = None

    if(onefeature is None or onefeature == 'coveragestd'):
        Pcoveragestd, coveragestd_status, coveragestd = launch_coverage_std(coveragefiles, outdir + '/data/', legend, executiongranted)

    if((reference is not None and onefeature is None) or onefeature == 'gcbias'):
        Pgcbias = []
        for i, coveragefile in enumerate(coveragefiles):
            onePgcbias, gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir + '/data/gcbias' + str(i) + '.png',
                                                      legend[i], executiongranted)
            Pgcbias.append(onePgcbias)
        for onePgcbias in Pgcbias:
            onePgcbias.join()
            onePgcbias.terminate()
    else:
        gcbias_status = None

    # Wait for the remaining workers to finish
    if((saturation and onefeature is None) or onefeature == 'saturation'):
        Psaturation.join()
        Psaturation.terminate()

    if(onefeature is None or onefeature == 'coveragefreq'):
        Pcoveragedistribution.join()
        Pcoveragedistribution.terminate()

    if(onefeature is None or onefeature == 'percbases'):
        Pcoveredpositions.join()
        Pcoveredpositions.terminate()

    if(onefeature is None or onefeature == 'coveragedistr'):
        Pcoveragethroughtarget.join()
        Pcoveragethroughtarget.terminate()

    if(len(coveragefiles) > 1 and (onefeature is None or onefeature == 'coveragecorr')):
        Pcoveragecorr.join()
        Pcoveragecorr.terminate()

    if(onefeature is None or onefeature == 'coveragestd'):
        Pcoveragestd.join()
        Pcoveragestd.terminate()

    if(onefeature is None or onefeature == 'specificity'):
        Ponoff_reads.join()
        Ponoff_reads.terminate()
        Poffclusters.join()
        Poffclusters.terminate()

    if(onefeature is None):
        generate_report(bamfilenames, sortedbams, originalbedfilename, outdir, coveredpositions_status, coveredbases,
                        coverage_saturation_status, saturationslopes, onoff_status, duplicates_status, onduplicates, offduplicates,
                        coveragedistribution_status, meancoverage, coveragecorr_status, corr, throughtarget_status, lowcovbases,
                        coveragestd_status, coveragestd, gcbias_status, enrichment, percontarget, reference, nthreads, depthlist,
                        coveragethresholds)

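# Usage sketch (hypothetical): a full QC run over two capture samples with
# saturation curves enabled. All paths are placeholders.
#
#     ngscat(['/data/s1.bam', '/data/s2.bam'], '/data/targets.bed',
#            '/data/qc_out', reference='/data/hg19.fasta', saturation=True,
#            nthreads=4, tmpdir='/tmp')
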
def generate_report(bamfilenames, sortedbams, bedfilename, outdir, coveredpositions_status, coveredbases, coverage_saturation_status,
                    saturationslopes, onoff_status, duplicates_status, onduplicates, offduplicates, coveragedistribution_status,
                    meancoverage, coveragecorr_status, corr, throughtarget_status, lowcovbases, coveragestd_status, coveragestd,
                    gcbias_status, enrichment, percontarget, reference, nthreads, depthlist, coveragethresholds):

    global TMP

    shutil.copy(IMGSRC + '/xls_icon.png', outdir + '/img')
    shutil.copy(IMGSRC + '/txt_icon.png', outdir + '/img')
    shutil.copy(IMGSRC + '/ok.jpg', outdir + '/img')
    shutil.copy(IMGSRC + '/warning.jpg', outdir + '/img')
    shutil.copy(IMGSRC + '/coverage_histogram_example.png', outdir + '/img')
    shutil.copy(DATASRC + '/styles.css', outdir)

    # ********************************************************* Input parameters ******************************************************************
    if(coverage_saturation_status is not None):
        saturationcurve = 'Yes'
    else:
        saturationcurve = 'No'

    fd = open(DATASRC + '/captureQC.html')
    reportcontent = fd.read().replace('bamfilename', ', '.join(bamfilenames)).replace('bedfilename', bedfilename) \
        .replace('reportdate', time.ctime()).replace('reference', str(reference)).replace('saturationcurve', saturationcurve) \
        .replace('nthreads', str(nthreads)).replace('tmpdir', TMP)
    fd.close()

    # ********************************************************* Result summary ******************************************************************
    # One JSON object per bam; entries are comma-joined so the file parses as valid JSON
    jsonentries = []
    for i, bam in enumerate(bamfilenames):
        jsonstr = '{"bamfile":"' + bam + '"'
        jsonstr += ',"nreads":' + str(bam_file.bam_file(sortedbams[i]).nreads())
        jsonstr += ',"coveredbases":' + str(coveredbases[i])
        if(coverage_saturation_status is not None):
            jsonstr += ',"saturationslope":' + str(saturationslopes[i])
        jsonstr += ',"percontarget":' + str(percontarget[i])
        jsonstr += ',"onduplicates":' + str(onduplicates[i])
        jsonstr += ',"offduplicates":' + str(offduplicates[i])
        jsonstr += ',"meancoverage":' + str(meancoverage[i])
        jsonstr += ',"lowcovbases":' + str(lowcovbases[i])
        # NaN is not valid JSON; the field is skipped when the std could not be computed
        if(not math.isnan(coveragestd[i])):
            jsonstr += ',"coveragestd":' + str(coveragestd[i])
        jsonstr += '}'
        jsonentries.append(jsonstr)

    fd = open(outdir + '/data/summary.json', 'w')
    fd.write(','.join(jsonentries))
    fd.close()

    summaryrows = ''
    for i, bam in enumerate(bamfilenames):
        summaryrows += '<tr>\n'
        summaryrows += '<td class="table-cell"> ' + bam + '</td>'
        summaryrows += '<td class="table-cell"> ' + str(bam_file.bam_file(sortedbams[i]).nreads()) + ' </td>'
        summaryrows += '<td class="table-cell">%.1f' % (coveredbases[i]) + '% </td>'
        if(coverage_saturation_status is not None):
            summaryrows += '<td class="table-cell">%.1e</td>\n' % saturationslopes[i]
        summaryrows += '<td class="table-cell">%.1f' % (percontarget[i]) + '% </td>\n'
        summaryrows += ('<td class="table-cell">ON: %.1f%%' % onduplicates[i]) + '; OFF: %.1f' % (offduplicates[i]) + '% </td>'
        summaryrows += '<td class="table-cell">%.1fx' % meancoverage[i] + '</td>\n'
        summaryrows += '<td class="table-cell">%d consecutive bases<br>with coverage <= <WARNCOVERAGETHRESHOLD></td>\n' % (lowcovbases[i])
        if(coveragecorr_status is not None):
            summaryrows += '<td class="table-cell">%.2f</td>\n' % corr.value
        summaryrows += '<td class="table-cell">%.2f</td>\n' % coveragestd[i]
        summaryrows += '</tr>\n'

    summarystatus = '<td class="table-header">Overall status</td>\n'
    summarystatus += '<td class="table-header"></td>\n'
    summarystatus += '<td class="table-header"><a href="#targetbases"><img src="img/<TARGETBASESSTATUS>.jpg" height=23px /></a></td>\n'
    if(coverage_saturation_status is not None):
        summarystatus += '<td class="table-header"><a href="#coveragesaturation"><img src="img/<COVERAGESATURATIONSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#onoff"><img src="img/<ONOFFSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#dup"><img src="img/<DUPSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#distribution"><img src="img/<DISTRIBUTIONSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#coveragethroughtarget"><img src="img/<COVERAGETHROUGHTARGETSTATUS>.jpg" height=23px /></a></td>\n'
    if(coveragecorr_status is not None):
        summarystatus += '<td class="table-header"><a href="#coveragecorr"><img src="img/<COVERAGECORRSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#coveragestd"><img src="img/<COVERAGESTDSTATUS>.jpg" height=23px /></a></td>\n'

    reportcontent = reportcontent.replace('<SUMMARYROWS>', summaryrows)
    reportcontent = reportcontent.replace('<SUMMARYSTATUS>', summarystatus)

    if(coverage_saturation_status is not None):
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>', '<td class="table-header"><a href="#coveragesaturation">Coverage saturation<br>(slope at the end of the curve)</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>', '')

    if(coveragecorr_status is not None):
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>', '<td class="table-header"><a href="#coveragecorr">Coverage correlation<br>per ROI</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>', '')

    reportcontent = reportcontent.replace('<SUMMARYCOVERAGETHRS>', str(coveragethresholds[0]))
    reportcontent = reportcontent.replace('<SUMMARYTARGETSIZE>', str(bed_file.bed_file(bedfilename).size()))

    # ********************************************************* Detailed results ******************************************************************
    chromosomeimages = ''
    ontarget_coverage_files = glob.glob(outdir + '/data/*_Ontarget_Coverage.png')
    ontarget_coverage_files.sort()
    for afile in ontarget_coverage_files:
        chromosomeimages += '<a href="data/' + os.path.basename(afile) + '"><img style="width: 33%; float: left;" src="data/' + \
            os.path.basename(afile) + '" /></a>'
    reportcontent = reportcontent.replace('<CHROMOSOMEIMAGES>', chromosomeimages)

    if(coveredpositions_status.value):
        reportcontent = reportcontent.replace('<TARGETBASESSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<TARGETBASESSTATUS>', 'warning')

    reportcontent = reportcontent.replace('<WARNBASESCOVERED>', str(config.warnbasescovered))

    percentagestr = '\n<ul>'
    enrichmentstr = '\n<ul>'
    for i, bamfilename in enumerate(bamfilenames):
        percentagestr += '<li>' + bamfilename + ': %.1f' % (percontarget[i]) + '%</li>\n'
        enrichmentstr += '<li>' + bamfilename + ': %.1f' % (enrichment[i]) + '</li>\n'
    percentagestr += '</ul>'
    enrichmentstr += '</ul>'

    reportcontent = reportcontent.replace('<PERCENTAGEONTARGET>', percentagestr)
    reportcontent = reportcontent.replace('<ENRICHMENT>', enrichmentstr)
    reportcontent = reportcontent.replace('<WARNONTARGET>', str(config.warnontarget))

    if(onoff_status.value):
        reportcontent = reportcontent.replace('<ONOFFSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<ONOFFSTATUS>', 'warning')

    duplicates_files = glob.glob(outdir + '/data/duplicates*.png')
    duplicates_files.sort()
    dupimages = ''
    for afile in duplicates_files:
        dupimages += '<img style="width: 50%; float: left;" src="data/' + os.path.basename(afile) + '" />'
    reportcontent = reportcontent.replace('<DUPIMAGES>', dupimages)

    if(duplicates_status.value):
        reportcontent = reportcontent.replace('<DUPSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<DUPSTATUS>', 'warning')

    reportcontent = reportcontent.replace('<WARNMEANCOVERAGE>', str(config.warnmeancoverage))

    if(coveragedistribution_status.value):
        reportcontent = reportcontent.replace('<DISTRIBUTIONSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<DISTRIBUTIONSTATUS>', 'warning')

    if(coveragecorr_status is not None):
        fd = open(DATASRC + '/coveragecorr_content.html')
        coveragecorr_content = fd.read()
        fd.close()
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>', coveragecorr_content)
        reportcontent = reportcontent.replace('<WARNCOVERAGECORRELATION>', str(config.warncoveragecorrelation))
        if(coveragecorr_status.value):
            reportcontent = reportcontent.replace('<COVERAGECORRSTATUS>', 'ok')
        else:
            reportcontent = reportcontent.replace('<COVERAGECORRSTATUS>', 'warning')
    else:
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>', '\n')

    reportcontent = reportcontent.replace('<WARNCOVERAGEREGION>', str(config.warncoverageregion))
    reportcontent = reportcontent.replace('<WARNCOVERAGETHRESHOLD>', str(config.warncoveragethreshold))

    if(throughtarget_status.value):
        reportcontent = reportcontent.replace('<COVERAGETHROUGHTARGETSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<COVERAGETHROUGHTARGETSTATUS>', 'warning')

    reportcontent = reportcontent.replace('<WARNSTD>', str(config.warnstd))

    if(coveragestd_status.value):
        reportcontent = reportcontent.replace('<COVERAGESTDSTATUS>', 'ok')
    else:
        reportcontent = reportcontent.replace('<COVERAGESTDSTATUS>', 'warning')

    if(coverage_saturation_status is not None):
        fd = open(DATASRC + '/saturation_content.html')
        saturation_content = fd.read()
        fd.close()
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>', saturation_content) \
            .replace('<DEPTHLIST>', 'x10<sup>6</sup>, '.join(map(str, depthlist[:-1])) + 'x10<sup>6</sup> and ' +
                     str(depthlist[-1]) + 'x10<sup>6</sup>') \
            .replace('depthlist', str(depthlist)[1:-1])
        reportcontent = reportcontent.replace('<WARNSATURATION>', str(config.warnsaturation))
        if(coverage_saturation_status.value):
            reportcontent = reportcontent.replace('<COVERAGESATURATIONSTATUS>', 'ok')
        else:
            reportcontent = reportcontent.replace('<COVERAGESATURATIONSTATUS>', 'warning')
    else:
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>', '\n').replace('depthlist', 'None')

    reportcontent = reportcontent.replace('coveragethrs', ', '.join(map(str, coveragethresholds)))

    if(gcbias_status is not None):
        fd = open(DATASRC + '/gcbias_content.html')
        gcbias_content = fd.read()
        fd.close()
        reportcontent = reportcontent.replace('<GCBIASCONTENT>', gcbias_content)
        gcbiasimages = ''
        for afile in glob.glob(outdir + '/data/gcbias*.png'):
            gcbiasimages += '<img style="width:40%" src="data/' + os.path.basename(afile) + '" />'
        reportcontent = reportcontent.replace('<GCBIASIMAGES>', gcbiasimages)
    else:
        reportcontent = reportcontent.replace('<GCBIASCONTENT>', '\n')

    fd = open(outdir + '/captureQC.html', 'w')
    fd.write(reportcontent)
    fd.close()

    print 'Results written at ' + outdir

def coverage_distribution(bams, beds, dirout, labels, normalize):
    """************************************************************************************************************************************************************
    Task: calculates coverage distribution for a set of bam and bed (capture) files.

    Inputs:
        bams: list of strings with the paths to the bam files.
        beds: list of strings with the paths to the bed files.
        dirout: string containing the full path to the directory where results will be stored.
        labels: list of strings with the labels to name each sample (bam) in the graph.
        normalize: {True,False} to indicate whether normalization should be applied.

    Output: <dirout>/Coverage_histo.png with the coverage histogram, <dirout>/Coverage_boxp.png with the boxplots and
        <dirout>/Coverage_stats.xls with quartiles, mean, maximum and minimum values.
    ************************************************************************************************************************************************************"""

    # Check that the output directory exists; if not, create it
    if(not os.path.isdir(dirout)):
        print 'WARNING: directory ' + dirout + ' does not exist. Creating...'
        os.mkdir(dirout)

    distributions = []
    bamlist = []

    # Index each bam file (when needed) and create the corresponding bam_file objects
    for i, bamfilename in enumerate(bams):
        # Check indexing of the bam file, needed for pysam use
        if(not os.path.isfile(bamfilename + '.bai') and not os.path.isfile(bamfilename.replace('.bam', '.bai'))):
            print 'WARNING: index not found for ' + bamfilename + '. Indexing...'
            pysam.index(bamfilename)
            print '    Done.'

        bam = bam_file.bam_file(bamfilename, 'rb')
        bamlist.append(bam)

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()
    print 'The smallest bam is ' + bamlist[sizes.argmin()].filename + ' and contains ' + str(minsize) + ' reads.'

    # Process each file and store counting results
    print 'Counting covered bases...'
    for i, bam in enumerate(bamlist):
        print '    ' + bam.filename

        # Check whether normalization should be applied
        if(normalize):
            normalizedbam = bam.normalize(minsize)
        else:
            normalizedbam = bam

        distributions.append(normalizedbam.get_coverage_distribution(beds[i]))

    draw_histogram(distributions, labels, dirout)
    draw_boxplot(distributions, labels, dirout)

    # Initialize the workbook and sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Coverage distribution')

    # Write the header row in bold
    header_style = xlwt.easyxf('font: bold on')
    ws.write(0, 0, 'Sample', header_style)
    ws.write(0, 1, 'Q1', header_style)
    ws.write(0, 2, 'Q2', header_style)
    ws.write(0, 3, 'Q3', header_style)
    ws.write(0, 4, 'Max. coverage', header_style)
    ws.write(0, 5, 'Min. coverage', header_style)
    ws.write(0, 6, 'Mean coverage', header_style)

    # Calculate distribution stats for each of the bams
    for i, dist in enumerate(distributions):
        ndist = numpy.array(dist)
        p25 = numpy.percentile(ndist, 25)
        p50 = numpy.percentile(ndist, 50)
        p75 = numpy.percentile(ndist, 75)
        maximum = numpy.max(ndist)
        minimum = numpy.min(ndist)
        mean = numpy.average(ndist)

        ws.write(i + 1, 0, labels[i])
        ws.write(i + 1, 1, p25)
        ws.write(i + 1, 2, p50)
        ws.write(i + 1, 3, p75)
        ws.write(i + 1, 4, maximum)
        ws.write(i + 1, 5, minimum)
        ws.write(i + 1, 6, mean)

    wb.save(dirout + '/Coverage_stats.xls')

def simulated_depth(bam, target, depth, coveragethreshold, fileout, executiongranted=None, tmpdir=None):
    """************************************************************************************************************************************************************
    Task: randomly selects a number of reads from a given bam and calculates target coverage.

    Inputs:
        bam: string containing the full path to the bam file.
        target: string containing the full path to the bed file.
        depth: integer containing the run depth in number of reads (millions).
        coveragethreshold: integer with the minimum coverage a position must reach to count as covered.
        fileout: string containing the name of the file where results will be stored.

    Output: generates a text file (fileout) with a tab separated line: <depth>\t<ncovered positions>\t<%covered positions>
    ************************************************************************************************************************************************************"""

    global TMP

    if(tmpdir is not None):
        TMP = tmpdir

    if(executiongranted is not None):
        executiongranted.acquire()

    bam = bam_file.bam_file(bam, 'rb')
    [positions, coverage, chromosomes, processedbed] = bam.myCoverageBed(target, depth * 1000000, tmpdir=TMP)

    print 'Loading coverage...'

    # Walk each region of the bed and count the positions whose coverage reaches the threshold.
    # "positions" holds the start of each run of constant coverage; "chromosomes" maps each
    # chromosome to its [first, last] index within "positions".
    nregions = 0
    npositions = 0
    ncovered_positions = 0
    for chr in processedbed.chrs:
        positionsidx = chromosomes[chr][0]
        for i, region in enumerate(processedbed.chrs[chr]):
            npositions += (region[1] - region[0] + 1)

            while((positionsidx + 1) <= chromosomes[chr][1] and positions[positionsidx + 1] <= region[1]):
                if(coverage[positionsidx] >= coveragethreshold):
                    ncovered_positions += (positions[positionsidx + 1] - positions[positionsidx])
                positionsidx += 1

            # The last run of the region ends at the region boundary rather than at the next breakpoint
            if(coverage[positionsidx] >= coveragethreshold):
                ncovered_positions += (region[1] - positions[positionsidx] + 1)
            positionsidx += 1
            nregions += 1

    print 'Writing results at ' + fileout + ' ...'
    fd = open(fileout, 'w')
    fd.write(os.path.basename(bam.filename) + '\n')
    fd.write(str(min(bam.nreads(), depth * 1000000)) + '\t' + str(ncovered_positions) + '\t' +
             str(ncovered_positions * 100.0 / npositions))
    fd.close()
    print '    Done.'

    if(executiongranted is not None):
        executiongranted.release()

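# Worked toy example (hypothetical data) for the run-length walk above:
# suppose coverage runs start at positions = [10, 15, 20] with
# coverage = [3, 0, 7], the only region is (10, 24) and the threshold is 2.
# The walk counts positions 10-14 (5 bases, coverage 3) and 20-24 (5 bases,
# coverage 7), skips 15-19 (coverage 0), so ncovered_positions ends up as 10
# out of npositions = 15.
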
def gcbias(filelist, fileoutlist, bedfilelist, normalize=False):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content.

    Input:
        filelist: list of strings, each containing the full path of the bam file to analyze.
        fileoutlist: list of strings, each containing the full path of the png file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the full path of the bed with the target regions for the corresponding bam.
        normalize: {True, False} to indicate whether bam files should be normalized to the size of the smallest one.

    Output: for each bam, a png file is created at the corresponding entry of fileoutlist containing a graph that compares gc content
        and mean coverage.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())
    numpy.random.seed(1)
    bamlist = []

    # Index (if needed) and open each bam
    for filename in filelist:
        # Check whether an index already exists for the bam file, needed for pysam use
        if(not os.path.isfile(filename + '.bai')):
            print 'Creating index for ' + filename
            pysam.index(filename)
            print '    Done.'

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()
    print 'The smallest bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.'

    # Process each file and store counting results
    for i, bamfile in enumerate(bamlist):
        print 'Processing ' + bamfile.filename
        print 'Results will be written at ' + fileoutlist[i]

        # Check whether normalization should be run
        if(normalize):
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print 'Calculating coverage per position...'
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)
        coverage = region_coverage(coveragefile)

        print 'Calculating nt content...'
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each exon,
        # rescaled from [0,1] to [0,100]
        gccontent = {}
        for entry in ntcontent:
            gccontent[(entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))] = float(entry.fields[-8]) * 100
        print '    Done.'

        region_ids = coverage.keys()
        coveragearray = numpy.array([coverage[id] for id in region_ids])
        gccontentarray = numpy.array([gccontent[id] for id in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimation on the (gc content, coverage) pairs
        X, Y = numpy.mgrid[xmin:xmax:100j, ymin:ymax:100j]
        gridpositions = numpy.c_[X.ravel(), Y.ravel()]
        values = numpy.c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = numpy.reshape(kernel(gridpositions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        sc = ax.imshow(numpy.rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'

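# A minimal self-contained sketch of the density-plot technique used above,
# with explicit imports (the module appears to rely on star imports for
# mgrid/c_/rot90). The data here is synthetic, purely for illustration, and
# the output path is a placeholder.
def _gc_density_sketch(fileout='/tmp/gc_density.png'):
    import numpy
    from scipy import stats
    import matplotlib
    matplotlib.use('Agg')  # headless backend; an assumption for server-side use
    from matplotlib import pyplot, cm

    # Synthetic GC percentages and a coverage signal that peaks at mid GC
    gc = numpy.random.uniform(30, 70, 500)
    cov = 40 + 0.5 * (50 - abs(gc - 50)) + numpy.random.randn(500) * 5

    # Kernel density estimate over the (gc, coverage) pairs, evaluated on a 100x100 grid
    kernel = stats.gaussian_kde(numpy.vstack([gc, cov]))
    X, Y = numpy.mgrid[gc.min():gc.max():100j, cov.min():cov.max():100j]
    Z = numpy.reshape(kernel(numpy.vstack([X.ravel(), Y.ravel()])), X.shape)

    fig = pyplot.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    ax.imshow(numpy.rot90(Z), cmap=cm.gist_earth_r,
              extent=[gc.min(), gc.max(), cov.min(), cov.max()], aspect='auto')
    ax.set_xlabel('GC content (%)')
    ax.set_ylabel('Mean coverage')
    fig.savefig(fileout)
    pyplot.close(fig)
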
def target_coverage(filelist, targetfiles, coveragelist, graph_legend, outprefix, xticklabels=None, normalize=False):
    """************************************************************************************************************************************************************
    Task: draws statistics about the percentage of covered exons and transcripts at different coverage levels. A transcript is considered to be
        covered when at least 90% of its positions present a coverage greater than the threshold.

    Inputs:
        filelist: list of strings indicating those files to be processed. For a file format example see
            /home/javi/MGP/capture_methods/data/coverage/GU_20120719_FC1_6L1S_AL_01_3376_BC1_AA_F3.filtered.singleHits.realigned.recalibrated.bam.coverage
        targetfiles: list of strings with the bed file containing the target regions for each bam.
        coveragelist: list of values with coverage thresholds to use.
        graph_legend: list of descriptions describing each of the files that will be processed. These descriptions will form the legend of the
            bar plot and are also used to identify sample replicates. Replicates will be merged into one bar in the bar plot.
        outprefix: string containing the full path prefix where data will be saved.
        xticklabels: list of strings with labels for the ticks in the x axis.
        normalize: boolean to indicate whether bam files should be normalized or not.

    Output: a summary .xls file and a bar plot depicting coverage vs. %covered-positions, saved as <outprefix>coverage_summary.xls and
        <outprefix>covered_positions.png.
    ************************************************************************************************************************************************************"""

    numpy.random.seed(1)
    covered_positions = []
    ntotal_positions = []
    bamlist = []

    # Index (if needed) and open each bam
    for filename in filelist:
        # Check whether an index already exists for the bam file, needed for pysam use
        if not os.path.isfile(filename + ".bai"):
            print "Creating index for " + filename
            pysam.index(filename)
            print "    Done."

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()
    print "The smallest bam is " + filelist[sizes.argmin()] + " and contains " + str(minsize) + " reads."

    # Process each file and store counting results
    print "Counting covered bases..."
    for i, bam in enumerate(bamlist):
        print "    " + bam.filename

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bam.normalize(minsize)
        else:
            normalizedbam = bam

        ntotal_positions_tmp, covered_positions_per_depth = normalizedbam.target_coverage(coveragelist, targetfiles[i])
        covered_positions.append(covered_positions_per_depth)
        ntotal_positions.append(ntotal_positions_tmp)

    # Initialize the workbook and sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet("Bases")

    # Write the header row in bold
    header_style = xlwt.easyxf("font: bold on")
    for i, cov in enumerate(coveragelist):
        ws.write(0, i * 2 + 1, "Coverage >=" + str(cov) + "x", style=header_style)
        ws.write(0, i * 2 + 2, "%", style=header_style)

    # Write the count of covered positions in each file for each coverage threshold
    for i, value_list in enumerate(covered_positions):
        # Use graph legend elements for row identifiers
        if graph_legend is not None:
            ws.write(i + 1, 0, graph_legend[i], style=header_style)
        else:
            ws.write(i + 1, 0, os.path.basename(filelist[i]), style=header_style)

        # Write counts for the current file
        for j, value in enumerate(value_list):
            ws.write(i + 1, j * 2 + 1, value)
            ws.write(i + 1, j * 2 + 2, value * 100.0 / ntotal_positions[i])

    # Calculate the percentage of covered positions for each file
    for i in range(len(covered_positions)):
        # Divide each count by the total number of positions
        for j, value in enumerate(covered_positions[i]):
            covered_positions[i][j] = value * 100.0 / ntotal_positions[i]

    # Check whether the output directory is already created
    if not os.path.isdir(os.path.dirname(outprefix)):
        print "WARNING: directory " + os.path.dirname(outprefix) + " not found. Creating new directory."
        os.mkdir(os.path.dirname(outprefix))

    # If x labels are not provided, generate ad hoc labels
    if xticklabels is None:
        xticklabels = [">=" + str(cov) + "x" for cov in coveragelist]

    # Save the .xls file and generate the bar plot
    wb.save(outprefix + "coverage_summary.xls")
    draw_graph_wreplicates(outprefix + "covered_positions.png", covered_positions, xticklabels, "Coverage threshold",
                           "% covered positions", graph_legend)

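# Usage sketch (hypothetical): per-threshold covered-bases summary for two
# samples sharing one capture design. Paths and labels are placeholders.
#
#     target_coverage(['/data/s1.bam', '/data/s2.bam'],
#                     ['targets.bed', 'targets.bed'], [1, 5, 10, 20, 30],
#                     ['s1', 's2'], '/data/out/', normalize=True)
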