Example #1
0
def normaliseBAMs(infiles, outfile):
    '''Bring a sample BAM and its control BAM to a similar read count.

    infiles is unpacked as (infile, controlfile).  For each file the read
    total is obtained by summing column 3 of ``samtools idxstats`` into a
    ".count" file next to the BAM.  The file with more reads is passed to
    PIntervals.buildSimpleNormalizedBAM together with the smaller total;
    the other file and its .bai index are copied unchanged.  The
    normalised sample is written to outfile, the normalised control to
    controlfile with ".dedup.mapped" replaced by ".norm".

    Raises ValueError if a count file does not contain an integer.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    # NOTE(review): `statement` and `to_cluster` are never passed on
    # explicitly; P.run() presumably reads them (and the %(name)s values)
    # from this function's local variables, so keep these names as they are.
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        infile, countfile1)
    P.run()
    # The context manager closes the handle even when int() fails.
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    if chip_reads > input_reads:
        # Chip is larger: sample the chip reads, copy the control as is.
        PIntervals.buildSimpleNormalizedBAM((infile, countfile1), outfile,
                                            input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        # Control is larger (or equal): sample the control, copy the chip.
        PIntervals.buildSimpleNormalizedBAM((controlfile, countfile2),
                                            controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()
def normaliseBAMs(infiles, outfile):
    '''Bring a sample BAM and its control BAM to a similar read count.

    infiles is unpacked as (infile, controlfile).  For each file the read
    total is obtained by summing column 3 of ``samtools idxstats`` into a
    ".count" file next to the BAM.  The file with more reads is passed to
    PIntervals.buildSimpleNormalizedBAM together with the smaller total;
    the other file and its .bai index are copied unchanged.  The
    normalised sample is written to outfile, the normalised control to
    controlfile with ".dedup.mapped" replaced by ".norm".

    Raises ValueError if a count file does not contain an integer.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    # NOTE(review): `statement` and `to_cluster` are never passed on
    # explicitly; P.run() presumably reads them (and the %(name)s values)
    # from this function's local variables, so keep these names as they are.
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        infile, countfile1)
    P.run()
    # The context manager closes the handle even when int() fails.
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    if chip_reads > input_reads:
        # Chip is larger: sample the chip reads, copy the control as is.
        PIntervals.buildSimpleNormalizedBAM((infile, countfile1), outfile,
                                            input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        # Control is larger (or equal): sample the control, copy the chip.
        PIntervals.buildSimpleNormalizedBAM((controlfile, countfile2),
                                            controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()
def getMergedBigWigPeakShift(infiles, outfile):
    '''Build a single peak-shifted bigwig file from several BAM files.

    Every path in infiles is handed to bam2wiggle.py.  For each input whose
    MACS result exists at macs/with_input/<track>.macs (track being the BAM
    basename without ".norm.bam") the value returned by
    PIntervals.getPeakShiftFromMacs is added as a shift.  The script's
    standard output is redirected to outfile.
    '''
    # The value is not used afterwards; the call is retained as it was.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): join() places the option prefix only *between* items, so
    # the first BAM file and the first shift reach bam2wiggle.py without
    # "--bamfile=" / "--shift=" -- confirm this is what the script expects.
    in_list = " --bamfile=".join(infiles)

    # Lazily map each BAM file to the MACS output expected for its track.
    macs_files = (
        "macs/with_input/%s.macs" % P.snip(os.path.basename(bam), ".norm.bam")
        for bam in infiles)
    shifts = " --shift=".join(
        str(PIntervals.getPeakShiftFromMacs(macs))
        for macs in macs_files if os.path.exists(macs))

    # `statement`, `in_list`, `shifts` and `outfile` are presumably looked up
    # by P.run() from the local scope; their names must stay unchanged.
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()
Example #4
0
def getMergedBigWigPeakShift(infiles, outfile):
    '''Build a single peak-shifted bigwig file from several BAM files.

    Every path in infiles is handed to bam2wiggle.py.  For each input whose
    MACS result exists at macs/with_input/<track>.macs (track being the BAM
    basename without ".norm.bam") the value returned by
    PIntervals.getPeakShiftFromMacs is added as a shift.  The script's
    standard output is redirected to outfile.
    '''
    # The value is not used afterwards; the call is retained as it was.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): join() places the option prefix only *between* items, so
    # the first BAM file and the first shift reach bam2wiggle.py without
    # "--bamfile=" / "--shift=" -- confirm this is what the script expects.
    in_list = " --bamfile=".join(infiles)

    # Lazily map each BAM file to the MACS output expected for its track.
    macs_files = (
        "macs/with_input/%s.macs" % P.snip(os.path.basename(bam), ".norm.bam")
        for bam in infiles)
    shifts = " --shift=".join(
        str(PIntervals.getPeakShiftFromMacs(macs))
        for macs in macs_files if os.path.exists(macs))

    # `statement`, `in_list`, `shifts` and `outfile` are presumably looked up
    # by P.run() from the local scope; their names must stay unchanged.
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()
Example #5
0
def summarizeMACSsolo(infiles, outfile):
    '''Pipeline wrapper around PIntervals.summarizeMACSsolo.

    infiles and outfile are forwarded unchanged; all work happens in the
    PIntervals helper.
    '''
    PIntervals.summarizeMACSsolo(infiles, outfile)
Example #6
0
def loadMACSsolo(infiles, outfile):
    '''Pipeline wrapper around PIntervals.loadMACS.

    infiles must hold exactly two items: the file to load and its BAM
    file.  They are passed to PIntervals.loadMACS as first and third
    argument, with outfile in between.
    '''
    source, bam = infiles
    PIntervals.loadMACS(source, outfile, bam)
Example #7
0
def loadMergedIntervals(infile, outfile):
    '''Load merged intervals into the database with recomputed read statistics.

    The track name is the basename of infile without ".merged.cleaned.bed".
    For every line of infile the first five whitespace-separated fields
    (contig, start, end, interval id, fold change) are read and
    PIntervals.countPeaks is called with the track's bam/<track>.norm.bam
    file and, if a MACS result file exists for the track, its peak shift.

    In contrast to the initial pipeline, the genome is not binned. In
    particular, the meaning of the columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    The rows are written to a temporary tab-separated file that csv2db.py
    loads into the table <track>_macs_merged_intervals; the output of
    csv2db.py is redirected to outfile.  The temporary file is removed
    afterwards, also when loading fails.

    Raises AssertionError if the BAM file of the track does not exist.
    '''
    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    # Locate the BAM file and the MACS result for this track
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    bamfile = "bam/%s.norm.bam" % track
    assert os.path.exists(
        bamfile), "could not find bamfile %s for track %s" % (bamfile, track)

    if "solo" in track:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track
    offsets = []
    if os.path.exists(macsfile):
        offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

    c = E.Counter()
    # Text mode so that str rows can be written on Python 3 as well.  The
    # file is created only after the checks above so that a missing BAM
    # file does not leave a stray temporary file behind.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfile = pysam.Samfile(bamfile, "rb")
        try:
            tmpfile.write("\t".join(headers) + "\n")

            # Loop over input Bed file and calculate stats for merged intervals
            with open(infile, "r") as bed:
                for line in bed:
                    c.input += 1
                    # split() already discards the line terminator; cutting
                    # it off by hand would truncate the last field of a
                    # final line that has no trailing newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)

                    (npeaks, peakcenter, length, avgval, peakval,
                     nprobes) = PIntervals.countPeaks(
                        contig, start, end, [samfile], offsets)

                    # nprobes can be 0 if the intervals overlap only slightly
                    # and no reads are actually in the overlap region.  Such
                    # intervals are counted in skipped_reads but the row is
                    # still written below.
                    # NOTE(review): an earlier comment said these intervals
                    # should not be output -- confirm the intended behaviour.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            samfile.close()

        # Flush everything to disk before csv2db.py reads the file.
        tmpfile.close()

        tablename = "%s_macs_merged_intervals" % track

        # NOTE(review): P.run() is called without arguments; it presumably
        # reads `statement`, `tablename`, `tmpfilename` and `outfile` from
        # the local scope, so these names must stay unchanged.
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        tmpfile.close()  # no-op when already closed
        os.unlink(tmpfilename)
    L.info("%s\n" % str(c))
Example #8
0
def mergeIntervals(infile, outfile):
    '''Merge nearby intervals of a dataset and update their fold change scores.

    The merge distance and the fold change merge method are taken from
    PARAMS ("intervals_merge_dist" and "intervals_foldchange_merge_method")
    and passed to PIntervals.mergeIntervalsWithScores along with infile
    and outfile.
    '''
    merge_dist = PARAMS["intervals_merge_dist"]
    merge_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, merge_dist, merge_method)
Example #9
0
def exportIntervalsAsBed(infile, outfile):
    '''Export MACS intervals as a BED file, filtered on fold change.

    The threshold PARAMS["intervals_min_fc"] is passed to
    PIntervals.exportMacsIntervalsAsBed together with infile and outfile.
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)
def summarizeMACSsolo(infiles, outfile):
    '''Pipeline wrapper around PIntervals.summarizeMACSsolo.

    infiles and outfile are forwarded unchanged; all work happens in the
    PIntervals helper.
    '''
    PIntervals.summarizeMACSsolo(infiles, outfile)
def loadMergedIntervals(infile, outfile):
    '''Load merged intervals into the database with recomputed read statistics.

    The track name is the basename of infile without ".merged.cleaned.bed".
    For every line of infile the first five whitespace-separated fields
    (contig, start, end, interval id, fold change) are read and
    PIntervals.countPeaks is called with the track's bam/<track>.norm.bam
    file and, if a MACS result file exists for the track, its peak shift.

    In contrast to the initial pipeline, the genome is not binned. In
    particular, the meaning of the columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    The rows are written to a temporary tab-separated file that csv2db.py
    loads into the table <track>_macs_merged_intervals; the output of
    csv2db.py is redirected to outfile.  The temporary file is removed
    afterwards, also when loading fails.

    Raises AssertionError if the BAM file of the track does not exist.
    '''
    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    # Locate the BAM file and the MACS result for this track
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    bamfile = "bam/%s.norm.bam" % track
    assert os.path.exists(
        bamfile), "could not find bamfile %s for track %s" % (bamfile, track)

    if "solo" in track:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track
    offsets = []
    if os.path.exists(macsfile):
        offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

    c = E.Counter()
    # Text mode so that str rows can be written on Python 3 as well.  The
    # file is created only after the checks above so that a missing BAM
    # file does not leave a stray temporary file behind.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfile = pysam.Samfile(bamfile, "rb")
        try:
            tmpfile.write("\t".join(headers) + "\n")

            # Loop over input Bed file and calculate stats for merged intervals
            with open(infile, "r") as bed:
                for line in bed:
                    c.input += 1
                    # split() already discards the line terminator; cutting
                    # it off by hand would truncate the last field of a
                    # final line that has no trailing newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)

                    (npeaks, peakcenter, length, avgval, peakval,
                     nprobes) = PIntervals.countPeaks(
                        contig, start, end, [samfile], offsets)

                    # nprobes can be 0 if the intervals overlap only slightly
                    # and no reads are actually in the overlap region.  Such
                    # intervals are counted in skipped_reads but the row is
                    # still written below.
                    # NOTE(review): an earlier comment said these intervals
                    # should not be output -- confirm the intended behaviour.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            samfile.close()

        # Flush everything to disk before csv2db.py reads the file.
        tmpfile.close()

        tablename = "%s_macs_merged_intervals" % track

        # NOTE(review): P.run() is called without arguments; it presumably
        # reads `statement`, `tablename`, `tmpfilename` and `outfile` from
        # the local scope, so these names must stay unchanged.
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        tmpfile.close()  # no-op when already closed
        os.unlink(tmpfilename)
    L.info("%s\n" % str(c))
def mergeIntervals(infile, outfile):
    '''Merge nearby intervals of a dataset and update their fold change scores.

    The merge distance and the fold change merge method are taken from
    PARAMS ("intervals_merge_dist" and "intervals_foldchange_merge_method")
    and passed to PIntervals.mergeIntervalsWithScores along with infile
    and outfile.
    '''
    merge_dist = PARAMS["intervals_merge_dist"]
    merge_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, merge_dist, merge_method)
def exportIntervalsAsBed(infile, outfile):
    '''Export MACS intervals as a BED file, filtered on fold change.

    The threshold PARAMS["intervals_min_fc"] is passed to
    PIntervals.exportMacsIntervalsAsBed together with infile and outfile.
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)
def summarizeMACS(infiles, outfile):
    '''Collect MACS summary statistics from log files.

    Thin pipeline wrapper: infiles and outfile are forwarded unchanged to
    PIntervals.summarizeMACS.
    '''
    PIntervals.summarizeMACS(infiles, outfile)
def loadMACS(infiles, outfile):
    '''Load MACS intervals into the database via PIntervals.loadMACS.

    infiles must hold exactly two items: the file to load and its BAM
    file.  They are passed to PIntervals.loadMACS as first and third
    argument, with outfile in between.  Filtering on fold change and
    p-value is presumably done inside that helper -- confirm there.
    '''
    source, bam = infiles
    PIntervals.loadMACS(source, outfile, bam)
Example #16
0
def exportIntervalsAsBedsolo(infile, outfile):
    '''Write the intervals that pass the fold change threshold to a file.

    The threshold PARAMS["intervals_min_fc"] is passed to
    PIntervals.exportMacsIntervalsAsBed together with infile and outfile.
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)
def loadMACSsolo(infiles, outfile):
    '''Pipeline wrapper around PIntervals.loadMACS.

    infiles must hold exactly two items: the file to load and its BAM
    file.  They are passed to PIntervals.loadMACS as first and third
    argument, with outfile in between.
    '''
    source, bam = infiles
    PIntervals.loadMACS(source, outfile, bam)
Example #18
0
def loadMACS(infiles, outfile):
    '''Load MACS intervals into the database via PIntervals.loadMACS.

    infiles must hold exactly two items: the file to load and its BAM
    file.  They are passed to PIntervals.loadMACS as first and third
    argument, with outfile in between.  Filtering on fold change and
    p-value is presumably done inside that helper -- confirm there.
    '''
    source, bam = infiles
    PIntervals.loadMACS(source, outfile, bam)
def exportIntervalsAsBedsolo(infile, outfile):
    '''Write the intervals that pass the fold change threshold to a file.

    The threshold PARAMS["intervals_min_fc"] is passed to
    PIntervals.exportMacsIntervalsAsBed together with infile and outfile.
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)
Example #20
0
def summarizeMACS(infiles, outfile):
    '''Collect MACS summary statistics from log files.

    Thin pipeline wrapper: infiles and outfile are forwarded unchanged to
    PIntervals.summarizeMACS.
    '''
    PIntervals.summarizeMACS(infiles, outfile)