예제 #1
0
def getMergedBigWigPeakShift(infiles, outfile):
    '''Run bam2wiggle.py over several BAM files to write one bigwig file.

    infiles: paths of the input BAM files; the suffix ".norm.bam" is
        snipped from each basename to obtain the track name.
    outfile: path that the command's standard output is redirected to.

    For every input whose "macs/with_input/<track>.macs" file exists, the
    value returned by PIntervals.getPeakShiftFromMacs is added to the
    command line; inputs without such a file contribute no shift.
    '''
    # Computed but not referenced again inside this function.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): str.join only puts " --bamfile=" BETWEEN entries, so the
    # first BAM path (and likewise the first shift below) reaches the command
    # line without an option prefix - confirm that bam2wiggle.py expects this.
    in_list = " --bamfile=".join(infiles)

    peak_shifts = []
    for bam_path in infiles:
        stem = P.snip(os.path.basename(bam_path), ".norm.bam")
        macs_path = "macs/with_input/%s.macs" % stem
        if not os.path.exists(macs_path):
            continue
        peak_shifts.append(str(PIntervals.getPeakShiftFromMacs(macs_path)))
    shifts = " --shift=".join(peak_shifts)

    # P.run() is called without arguments; presumably it picks up `statement`
    # and the %(name)s values from the local variables of this function, so
    # those names must stay unchanged - TODO confirm.
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()
def getMergedBigWigPeakShift( infiles, outfile ):
    '''Run bam2wiggle.py over several BAM files to write one bigwig file.

    infiles: paths of the input BAM files; the suffix ".norm.bam" is
        snipped from each basename to obtain the track name.
    outfile: path that the command's standard output is redirected to.

    For every input whose "macs/with_input/<track>.macs" file exists, the
    value returned by PIntervals.getPeakShiftFromMacs is added to the
    command line; inputs without such a file contribute no shift.
    '''
    # Computed but not referenced again inside this function.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")

    # NOTE(review): str.join only puts " --bamfile=" BETWEEN entries, so the
    # first BAM path (and likewise the first shift below) reaches the command
    # line without an option prefix - confirm that bam2wiggle.py expects this.
    in_list = " --bamfile=".join(infiles)

    peak_shifts = []
    for bam_path in infiles:
        stem = P.snip(os.path.basename(bam_path), ".norm.bam")
        macs_path = "macs/with_input/%s.macs" % stem
        if not os.path.exists(macs_path):
            continue
        peak_shifts.append(str(PIntervals.getPeakShiftFromMacs(macs_path)))
    shifts = " --shift=".join(peak_shifts)

    # P.run() is called without arguments; presumably it picks up `statement`
    # and the %(name)s values from the local variables of this function, so
    # those names must stay unchanged - TODO confirm.
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()
예제 #3
0
def loadMergedIntervals(infile, outfile):
    '''Re-count reads in merged intervals and load them into a database table.

    Each interval of the BED file *infile* is re-evaluated with
    PIntervals.countPeaks against the BAM file of the track
    ("bam/<track>.norm.bam"). In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the columns in
    the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    infile: BED file whose basename is "<track>.merged.cleaned.bed". The
        first five whitespace-separated columns of each line are read as
        contig, start, end, interval id and the value stored in the
        "Fold" column. Blank lines are ignored.
    outfile: file receiving the standard output of csv2db.py.

    The rows are written to a temporary tab-separated file, which is
    loaded into the table "<track>_macs_merged_intervals" and removed
    afterwards, also when counting or loading fails.

    Raises AssertionError if the BAM file of the track does not exist.
    '''
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    # Validate the inputs before any temporary file is created.
    fn = "bam/%s.norm.bam" % track
    assert os.path.exists(
        fn), "could not find bamfile %s for track %s" % (fn, track)
    bamfile = fn

    # Tracks with "solo" in their name read the MACS output from
    # macs/no_input, all others from macs/with_input.
    if track.find("solo") > -1:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track

    # The MACS file is optional; without it countPeaks receives no offsets.
    offsets = []
    if os.path.exists(macsfile):
        offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    c = E.Counter()

    # mode="w": the rows are text, and the default binary mode rejects str
    # on Python 3. delete=False because csv2db.py reads the file by name
    # after it has been closed here.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfiles = []
        try:
            samfiles.append(pysam.Samfile(bamfile, "rb"))
            tmpfile.write("\t".join(headers) + "\n")

            with open(infile, "r") as bed:
                for line in bed:
                    # split() already discards the line terminator; slicing
                    # off the last character first would truncate the final
                    # field of a last line that has no trailing newline.
                    fields = line.split()
                    if not fields:
                        continue

                    c.input += 1
                    contig, start, end, int_id, fc = fields[:5]
                    start, end = int(start), int(end)

                    npeaks, peakcenter, length, avgval, peakval, nprobes = \
                        PIntervals.countPeaks(
                            contig, start, end, samfiles, offsets)

                    # nprobes can be 0 if the intervals overlap only slightly
                    # and, due to the binning, no reads are actually in the
                    # overlap region. Such intervals are tallied in
                    # skipped_reads but are still written to the table.
                    # NOTE(review): the previous comment here said "do not
                    # output intervals without reads" - confirm whether the
                    # row should really be dropped.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            # Flush the rows to disk and release the BAM handle(s) before
            # the loader runs, and also when counting fails.
            tmpfile.close()
            for samfile in samfiles:
                samfile.close()

        tablename = "%s_macs_merged_intervals" % track

        # P.run() is called without arguments; presumably it reads
        # `statement` and the %(name)s values from the local variables of
        # this function, so it has to be called from here - TODO confirm.
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        # Remove the temporary file on success and on failure.
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))
def loadMergedIntervals( infile, outfile ):
    '''Re-count reads in merged intervals and load them into a database table.

    Each interval of the BED file *infile* is re-evaluated with
    PIntervals.countPeaks against the BAM file of the track
    ("bam/<track>.norm.bam"). In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the columns in
    the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    infile: BED file whose basename is "<track>.merged.cleaned.bed". The
        first five whitespace-separated columns of each line are read as
        contig, start, end, interval id and the value stored in the
        "Fold" column. Blank lines are ignored.
    outfile: file receiving the standard output of csv2db.py.

    The rows are written to a temporary tab-separated file, which is
    loaded into the table "<track>_macs_merged_intervals" and removed
    afterwards, also when counting or loading fails.

    Raises AssertionError if the BAM file of the track does not exist.
    '''
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")

    # Validate the inputs before any temporary file is created.
    fn = "bam/%s.norm.bam" % track
    assert os.path.exists(
        fn), "could not find bamfile %s for track %s" % (fn, track)
    bamfile = fn

    # Tracks with "solo" in their name read the MACS output from
    # macs/no_input, all others from macs/with_input.
    if track.find("solo") > -1:
        macsfile = "macs/no_input/%s.macs" % track
    else:
        macsfile = "macs/with_input/%s.macs" % track

    # The MACS file is optional; without it countPeaks receives no offsets.
    offsets = []
    if os.path.exists(macsfile):
        offsets.append(PIntervals.getPeakShiftFromMacs(macsfile))

    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    c = E.Counter()

    # mode="w": the rows are text, and the default binary mode rejects str
    # on Python 3. delete=False because csv2db.py reads the file by name
    # after it has been closed here.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfiles = []
        try:
            samfiles.append(pysam.Samfile(bamfile, "rb"))
            tmpfile.write("\t".join(headers) + "\n")

            with open(infile, "r") as bed:
                for line in bed:
                    # split() already discards the line terminator; slicing
                    # off the last character first would truncate the final
                    # field of a last line that has no trailing newline.
                    fields = line.split()
                    if not fields:
                        continue

                    c.input += 1
                    contig, start, end, int_id, fc = fields[:5]
                    start, end = int(start), int(end)

                    npeaks, peakcenter, length, avgval, peakval, nprobes = \
                        PIntervals.countPeaks(
                            contig, start, end, samfiles, offsets)

                    # nprobes can be 0 if the intervals overlap only slightly
                    # and, due to the binning, no reads are actually in the
                    # overlap region. Such intervals are tallied in
                    # skipped_reads but are still written to the table.
                    # NOTE(review): the previous comment here said "do not
                    # output intervals without reads" - confirm whether the
                    # row should really be dropped.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            # Flush the rows to disk and release the BAM handle(s) before
            # the loader runs, and also when counting fails.
            tmpfile.close()
            for samfile in samfiles:
                samfile.close()

        tablename = "%s_macs_merged_intervals" % track

        # P.run() is called without arguments; presumably it reads
        # `statement` and the %(name)s values from the local variables of
        # this function, so it has to be called from here - TODO confirm.
        statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''
        P.run()
    finally:
        # Remove the temporary file on success and on failure.
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))