def normaliseBAMs(infiles, outfile):
    '''Sample reads from the larger of two BAM files so that the CAP and
    control samples have approximately the same number of aligned reads.

    infiles is a pair (infile, controlfile) of BAM file names and outfile is
    the name of the normalised CAP BAM file. The name of the normalised
    control file is derived from controlfile by replacing ".dedup.mapped"
    with ".norm".

    Read counts are obtained by summing the third column of
    ``samtools idxstats`` and are written to ".count" files next to the
    BAM files. If the CAP file has more reads than the control, the CAP
    file is passed to PIntervals.buildSimpleNormalizedBAM with the control
    read count and the control BAM (plus its .bai index) is copied
    unchanged. Otherwise (including equal counts) the control file is
    passed to buildSimpleNormalizedBAM with the CAP read count and the CAP
    BAM (plus index) is copied unchanged.

    Raises ValueError if a count file does not contain an integer.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    # NOTE(review): P.run() is called without arguments; it presumably picks
    # up `statement`, `to_cluster` and the %(name)s placeholders from this
    # function's local variables, so these local names must not be renamed.
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        infile, countfile1)
    P.run()
    # The context manager closes the handle even if int() raises.
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    # If chip > input then sample chip reads
    if chip_reads > input_reads:
        PIntervals.buildSimpleNormalizedBAM(
            (infile, countfile1), outfile, input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        PIntervals.buildSimpleNormalizedBAM(
            (controlfile, countfile2), controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()
def normaliseBAMs(infiles, outfile):
    '''Sample reads from the larger of two BAM files so that the CAP and
    control samples have approximately the same number of aligned reads.

    infiles is a pair (infile, controlfile) of BAM file names and outfile is
    the name of the normalised CAP BAM file. The name of the normalised
    control file is derived from controlfile by replacing ".dedup.mapped"
    with ".norm".

    Read counts are obtained by summing the third column of
    ``samtools idxstats`` and are written to ".count" files next to the
    BAM files. Whichever file has more reads is handed to
    PIntervals.buildSimpleNormalizedBAM together with the read count of the
    other file; the other BAM (plus its .bai index) is copied unchanged.
    When both counts are equal the control file takes the sampling path.

    Raises ValueError if a count file does not contain an integer.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    # NOTE(review): P.run() takes no arguments here; it presumably reads
    # `statement`, `to_cluster` and the %(name)s placeholder values from
    # this function's locals, so keep these local names as they are.
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        infile, countfile1)
    P.run()
    # `with` guarantees the handle is released even when parsing fails.
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    # If chip > input then sample chip reads
    if chip_reads > input_reads:
        PIntervals.buildSimpleNormalizedBAM(
            (infile, countfile1), outfile, input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        PIntervals.buildSimpleNormalizedBAM(
            (controlfile, countfile2), controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()
def getMergedBigWigPeakShift(infiles, outfile):
    '''Merge multiple BAM files per replicate to produce a single
    peak-shifted bigwig file.

    infiles is a sequence of BAM file names whose basenames end in
    ".norm.bam"; outfile is the bigwig file to write (basename ending in
    ".merge.bw"). For every input BAM that has a matching
    "macs/with_input/<track>.macs" file, the peak shift reported by
    PIntervals.getPeakShiftFromMacs is passed on to bam2wiggle.py.
    '''
    # `expt` is not referenced again in this function; the P.snip call on
    # the output name is kept as in the original.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): join() puts the separator only *between* items, so the
    # first BAM file (and the first shift below) reaches the command line
    # without its "--bamfile=" / "--shift=" prefix - confirm this is what
    # bam2wiggle.py expects.
    in_list = " --bamfile=".join(infiles)

    offsets = []
    for bam in infiles:
        macs_file = "macs/with_input/%s.macs" % P.snip(
            os.path.basename(bam), ".norm.bam")
        if not os.path.exists(macs_file):
            # No MACS output for this track: contribute no shift.
            continue
        offsets.append(str(PIntervals.getPeakShiftFromMacs(macs_file)))
    shifts = " --shift=".join(offsets)

    # P.run() is called without arguments; the names `statement`, `in_list`,
    # `shifts` and `outfile` are presumably read from the local scope, so
    # they are left untouched.
    statement = '''python %(scriptsdir)s/bam2wiggle.py --output-format=bigwig %(in_list)s %(shifts)s > %(outfile)s'''
    P.run()
def getMergedBigWigPeakShift(infiles, outfile):
    '''Build one peak-shifted bigwig file from several replicate BAM files.

    infiles: BAM file names, each with a basename ending in ".norm.bam".
    outfile: bigwig file to create; its basename ends in ".merge.bw".

    A shift is collected (via PIntervals.getPeakShiftFromMacs) for each
    input whose "macs/with_input/<track>.macs" file exists; inputs without
    such a file contribute no shift.
    '''
    # The result is unused below, but the P.snip call on the output file
    # name is preserved from the original.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): only the second and later items get the option prefix
    # from join(); the first file and first shift are passed bare - verify
    # against bam2wiggle.py's argument handling.
    in_list = " --bamfile=".join(infiles)

    offsets = []
    for bam_path in infiles:
        track_name = P.snip(os.path.basename(bam_path), ".norm.bam")
        macs_path = "macs/with_input/%s.macs" % track_name
        if os.path.exists(macs_path):
            shift = PIntervals.getPeakShiftFromMacs(macs_path)
            offsets.append(str(shift))
    shifts = " --shift=".join(offsets)

    # `statement`, `in_list`, `shifts` and `outfile` keep their names:
    # P.run() takes no arguments and presumably resolves them from locals.
    statement = '''python %(scriptsdir)s/bam2wiggle.py --output-format=bigwig %(in_list)s %(shifts)s > %(outfile)s'''
    P.run()
def summarizeMACSsolo(infiles, outfile):
    '''Pipeline task wrapper around PIntervals.summarizeMACSsolo.

    infiles and outfile are forwarded unchanged; all work happens in
    PIntervals.summarizeMACSsolo.

    NOTE(review): the previous docstring read "run MACS for peak
    detection", which does not match the name of the function being
    called - confirm what the helper actually does.
    '''
    PIntervals.summarizeMACSsolo(infiles, outfile)
def loadMACSsolo(infiles, outfile):
    '''Load MACS output by delegating to PIntervals.loadMACS.

    infiles must unpack into exactly two items, (macs_file, bam_file);
    they are passed to PIntervals.loadMACS together with outfile.
    '''
    macs_file, bam_file = infiles
    PIntervals.loadMACS(macs_file, outfile, bam_file)
def loadMergedIntervals(infile, outfile):
    '''Load merged intervals into the database, re-evaluating each
    interval by counting reads within it.

    In contrast to the initial pipeline, the genome is not binned.
    In particular, the meaning of the columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    infile is a BED file whose basename ends in ".merged.cleaned.bed"; the
    first five whitespace-separated fields of each line are read as
    contig, start, end, interval id and fold change. Reads are counted in
    "bam/<track>.norm.bam" using PIntervals.countPeaks. The resulting rows
    are written to a temporary tab-separated file which is loaded with
    csv2db.py into the table "<track>_macs_merged_intervals"; outfile
    receives the output of csv2db.py.

    Raises AssertionError if the BAM file for the track does not exist and
    ValueError if a line has fewer than five fields or non-integer
    coordinates.
    '''
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    # Validate inputs before creating the temporary file so that a missing
    # BAM file does not leave a stray temp file behind.
    bamfile = "bam/%s.norm.bam" % track
    assert os.path.exists(bamfile), \
        "could not find bamfile %s for track %s" % (bamfile, track)

    # Tracks with "solo" in their name use the MACS run without input.
    if track.find("solo") > -1:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track

    headers = ("contig", "start", "end", "interval_id", "nPeaks",
               "PeakCenter", "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    c = E.Counter()
    samfiles, offsets = [], []
    # Text mode so that str rows can be written on Python 3 as well.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        with tmpfile:
            tmpfile.write("\t".join(headers) + "\n")

            # Get BAM file and MACS offset
            samfiles.append(pysam.Samfile(bamfile, "rb"))
            if os.path.exists(macsfile):
                offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

            # Loop over input BED file and calculate stats for merged
            # intervals
            with open(infile, "r") as bedfile:
                for line in bedfile:
                    c.input += 1
                    # split() already discards the trailing newline;
                    # slicing off the last character first would corrupt
                    # the final field of a last line without a newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)
                    (npeaks, peakcenter, length,
                     avgval, peakval, nprobes) = PIntervals.countPeaks(
                        contig, start, end, samfiles, offsets)

                    # nreads can be 0 if the intervals overlap only
                    # slightly and due to the binning, no reads are
                    # actually in the overlap region. However, most of
                    # these intervals should be small and have already
                    # been deleted via the merge_min_interval_length
                    # cutoff.
                    # NOTE(review): the original comment said "do not
                    # output intervals without reads", but such intervals
                    # are only counted here and are still written below -
                    # confirm the intended behaviour.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(map(str, (
                        contig, start, end, int_id, npeaks, peakcenter,
                        length, avgval, peakval, nprobes, fc))) + "\n")

        # P.run() takes no arguments; `statement`, `tablename`,
        # `tmpfilename` and `outfile` are presumably read from the local
        # scope, so these names must stay as they are.
        tablename = "%s_macs_merged_intervals" % track
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s --index=interval_id --index=contig,start --table=%(tablename)s < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        # Release the BAM handle(s) and the temporary file on every path.
        for samfile in samfiles:
            samfile.close()
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))
def mergeIntervals(infile, outfile):
    '''Merge intervals less than n bases apart in each dataset and update
    the fold-change scores.

    The merge distance comes from PARAMS["intervals_merge_dist"] and the
    fold-change merge method from
    PARAMS["intervals_foldchange_merge_method"]; both are handed to
    PIntervals.mergeIntervalsWithScores together with infile and outfile.
    '''
    merge_dist = PARAMS["intervals_merge_dist"]
    fc_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, merge_dist, fc_method)
def exportIntervalsAsBed(infile, outfile):
    '''Export MACS intervals from the database as a BED file, filtering on
    fold change.

    The fold-change cutoff is read from PARAMS["intervals_min_fc"] and
    passed to PIntervals.exportMacsIntervalsAsBed with infile and outfile.
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)
def loadMergedIntervals(infile, outfile):
    '''Load merged intervals into the database, re-evaluating each
    interval by counting reads within it.

    In contrast to the initial pipeline, the genome is not binned.
    In particular, the meaning of the columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    infile is a BED file whose basename ends in ".merged.cleaned.bed"; the
    first five whitespace-separated fields of each line are read as
    contig, start, end, interval id and fold change. Reads are counted in
    "bam/<track>.norm.bam" using PIntervals.countPeaks. The resulting rows
    are written to a temporary tab-separated file which is loaded with
    csv2db.py into the table "<track>_macs_merged_intervals"; outfile
    receives the output of csv2db.py.

    Raises AssertionError if the BAM file for the track does not exist and
    ValueError if a line has fewer than five fields or non-integer
    coordinates.
    '''
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    # Check for the BAM file first: failing before the temporary file is
    # created means there is nothing to clean up.
    bamfile = "bam/%s.norm.bam" % track
    assert os.path.exists(bamfile), \
        "could not find bamfile %s for track %s" % (bamfile, track)

    # Tracks with "solo" in their name use the MACS run without input.
    if track.find("solo") > -1:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track

    headers = ("contig", "start", "end", "interval_id", "nPeaks",
               "PeakCenter", "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    c = E.Counter()
    samfiles, offsets = [], []
    # mode="w": rows are str, which the default binary mode rejects on
    # Python 3.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        with tmpfile:
            # Write header to output file
            tmpfile.write("\t".join(headers) + "\n")

            # Get BAM file and MACS offset
            samfiles.append(pysam.Samfile(bamfile, "rb"))
            if os.path.exists(macsfile):
                offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

            # Loop over input BED file and calculate stats for merged
            # intervals
            with open(infile, "r") as bedfile:
                for line in bedfile:
                    c.input += 1
                    # Whitespace splitting drops the newline by itself;
                    # the former line[:-1] cut a real character off a
                    # final line that had no trailing newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)
                    (npeaks, peakcenter, length,
                     avgval, peakval, nprobes) = PIntervals.countPeaks(
                        contig, start, end, samfiles, offsets)

                    # nreads can be 0 if the intervals overlap only
                    # slightly and due to the binning, no reads are
                    # actually in the overlap region. However, most of
                    # these intervals should be small and have already
                    # been deleted via the merge_min_interval_length
                    # cutoff.
                    # NOTE(review): the original comment said "do not
                    # output intervals without reads", yet the row is
                    # still written below; only the counter changes.
                    # Confirm which behaviour is wanted.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(map(str, (
                        contig, start, end, int_id, npeaks, peakcenter,
                        length, avgval, peakval, nprobes, fc))) + "\n")

        # `statement`, `tablename`, `tmpfilename` and `outfile` keep their
        # names: P.run() takes no arguments and presumably resolves the
        # %(name)s placeholders from this function's locals.
        tablename = "%s_macs_merged_intervals" % track
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s --index=interval_id --index=contig,start --table=%(tablename)s < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        # Always close the BAM handle(s) and remove the temporary file.
        for samfile in samfiles:
            samfile.close()
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))
def mergeIntervals(infile, outfile):
    '''Merge nearby intervals in a dataset and update their fold-change
    scores.

    Reads the merge distance ("intervals_merge_dist") and the fold-change
    merge method ("intervals_foldchange_merge_method") from PARAMS and
    delegates the work to PIntervals.mergeIntervalsWithScores.
    '''
    max_gap = PARAMS["intervals_merge_dist"]
    score_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, max_gap, score_method)
def exportIntervalsAsBed(infile, outfile):
    '''Write MACS intervals from the database to a BED file, filtered on
    fold change.

    Delegates to PIntervals.exportMacsIntervalsAsBed, supplying the
    threshold stored under PARAMS["intervals_min_fc"].
    '''
    fold_change_cutoff = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, fold_change_cutoff)
def summarizeMACS(infiles, outfile):
    '''Parse MACS summary statistics from log files.

    Thin pipeline wrapper: infiles and outfile are forwarded unchanged to
    PIntervals.summarizeMACS.
    '''
    PIntervals.summarizeMACS(infiles, outfile)
def loadMACS(infiles, outfile):
    '''Load MACS intervals into the database, filtering on fold change and
    p-value.

    infiles must unpack into exactly two items, (macs_file, bam_file);
    the loading itself is done by PIntervals.loadMACS.
    '''
    macs_file, bam_file = infiles
    PIntervals.loadMACS(macs_file, outfile, bam_file)
def exportIntervalsAsBedsolo(infile, outfile):
    '''Export the intervals that pass the fold-change threshold to a file.

    The threshold is PARAMS["intervals_min_fc"]; the export is carried out
    by PIntervals.exportMacsIntervalsAsBed.
    '''
    min_fc = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fc)
def loadMACS(infiles, outfile):
    '''Load MACS intervals into the database and filter them on fold
    change and p-value.

    infiles is a two-item sequence (MACS output, BAM file). Both items are
    handed to PIntervals.loadMACS along with outfile.
    '''
    macs_output, bam_path = infiles
    PIntervals.loadMACS(macs_output, outfile, bam_path)
def summarizeMACS(infiles, outfile):
    '''Collect MACS summary statistics parsed from log files.

    This task only forwards its arguments: the parsing is implemented in
    PIntervals.summarizeMACS, which receives infiles and outfile as given.
    '''
    PIntervals.summarizeMACS(infiles, outfile)