Python PipelineChipseqの例、PipelineChipseq Pythonの例

コード例 #1

0

ファイルを表示

def normaliseBAMs(infiles, outfile):
    '''Down-sample the larger of a ChIP/control BAM pair to the smaller one's size.

    Reads are counted for both files by summing the third column of
    ``samtools idxstats`` with awk; each count is written to a ``.count``
    file next to its BAM.  The file with more reads is handed to
    ``PIntervals.buildSimpleNormalizedBAM`` with the smaller count as the
    target, and the other file (plus its ``.bai`` index) is copied
    unchanged.  When both counts are equal the control file takes the
    sampling path.

    infiles: pair ``(infile, controlfile)`` of BAM paths.
    outfile: path of the normalised ChIP BAM.  The normalised control is
        written to ``controlfile`` with ``.dedup.mapped`` replaced by
        ``.norm``.

    Raises ValueError if a count file does not contain a single integer.

    NOTE(review): ``P.run()`` is called without arguments right after
    ``statement`` is assigned, so it presumably reads ``statement``,
    ``to_cluster`` and the names used in the ``%(...)s`` templates from
    this frame.  Those locals are therefore left unrenamed -- confirm
    against the Pipeline module before changing them.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        infile, countfile1)
    P.run()
    # The context manager closes the handle even when int() rejects the
    # file content (for example an empty count file).
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (
        controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    if chip_reads > input_reads:
        # ChIP is larger: sample it down, copy the control untouched.
        PIntervals.buildSimpleNormalizedBAM((infile, countfile1), outfile,
                                            input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        # Control is larger (or equal): sample it down, copy the ChIP file.
        PIntervals.buildSimpleNormalizedBAM((controlfile, countfile2),
                                            controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()

コード例 #2

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def normaliseBAMs(infiles, outfile):
    '''Down-sample the larger of a ChIP/control BAM pair to the smaller one's size.

    Reads are counted for both files by summing the third column of
    ``samtools idxstats`` with awk; each count is written to a ``.count``
    file next to its BAM.  The file with more reads is handed to
    ``PIntervals.buildSimpleNormalizedBAM`` with the smaller count as the
    target, and the other file (plus its ``.bai`` index) is copied
    unchanged.  When both counts are equal the control file takes the
    sampling path.

    infiles: pair ``(infile, controlfile)`` of BAM paths.
    outfile: path of the normalised ChIP BAM.  The normalised control is
        written to ``controlfile`` with ``.dedup.mapped`` replaced by
        ``.norm``.

    Raises ValueError if a count file does not contain a single integer.

    NOTE(review): ``P.run()`` is called without arguments right after
    ``statement`` is assigned, so it presumably reads ``statement``,
    ``to_cluster`` and the names used in the ``%(...)s`` templates from
    this frame.  Those locals are therefore left unrenamed -- confirm
    against the Pipeline module before changing them.
    '''
    infile, controlfile = infiles
    controlfilenorm = controlfile.replace(".dedup.mapped", ".norm")
    to_cluster = True

    # Count reads in chip file
    countfile1 = infile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (infile, countfile1)
    P.run()
    # The context manager closes the handle even when int() rejects the
    # file content (for example an empty count file).
    with open(countfile1, "r") as fh:
        chip_reads = int(fh.read())

    # Count reads in input file
    countfile2 = controlfile.replace(".bam", ".count")
    statement = ''' samtools idxstats %s | awk '{s+=$3} END {print s}' > %s ''' % (controlfile, countfile2)
    P.run()
    with open(countfile2, "r") as fh:
        input_reads = int(fh.read())

    if chip_reads > input_reads:
        # ChIP is larger: sample it down, copy the control untouched.
        PIntervals.buildSimpleNormalizedBAM((infile, countfile1), outfile, input_reads)
        statement = '''cp %(controlfile)s %(controlfilenorm)s; cp %(controlfile)s.bai %(controlfilenorm)s.bai; '''
        P.run()
    else:
        # Control is larger (or equal): sample it down, copy the ChIP file.
        PIntervals.buildSimpleNormalizedBAM((controlfile, countfile2), controlfilenorm, chip_reads)
        statement = '''cp %(infile)s %(outfile)s; cp %(infile)s.bai %(outfile)s.bai; '''
        P.run()

コード例 #3

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def getMergedBigWigPeakShift(infiles, outfile):
    '''Build one bigwig from several BAM files via ``bam2wiggle.py``.

    For every BAM in *infiles* the matching ``macs/with_input/<track>.macs``
    file is looked up (track = basename without ``.norm.bam``); when it
    exists, its peak shift from ``PIntervals.getPeakShiftFromMacs`` is
    added to the command line.  The command's stdout goes to *outfile*.

    NOTE(review): ``str.join`` only places the flag *between* elements, so
    the first BAM path and the first shift value reach the command line
    without ``--bamfile=`` / ``--shift=``.  Whether bam2wiggle.py expects
    that is not visible here -- confirm before changing it.
    '''
    # ``in_list``, ``shifts`` and ``outfile`` are referenced by name in the
    # statement template below, so they must keep these names.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")
    in_list = " --bamfile=".join(infiles)

    offsets = []
    for bam_path in infiles:
        track_name = P.snip(os.path.basename(bam_path), ".norm.bam")
        macs_path = "macs/with_input/%s.macs" % track_name
        if not os.path.exists(macs_path):
            # No MACS output for this track: it contributes no shift.
            continue
        peak_shift = PIntervals.getPeakShiftFromMacs(macs_path)
        offsets.append(str(peak_shift))

    shifts = " --shift=".join(offsets)
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()

コード例 #4

0

ファイルを表示

def getMergedBigWigPeakShift(infiles, outfile):
    '''Build one bigwig from several BAM files via ``bam2wiggle.py``.

    For every BAM in *infiles* the matching ``macs/with_input/<track>.macs``
    file is looked up (track = basename without ``.norm.bam``); when it
    exists, its peak shift from ``PIntervals.getPeakShiftFromMacs`` is
    added to the command line.  The command's stdout goes to *outfile*.

    NOTE(review): ``str.join`` only places the flag *between* elements, so
    the first BAM path and the first shift value reach the command line
    without ``--bamfile=`` / ``--shift=``.  Whether bam2wiggle.py expects
    that is not visible here -- confirm before changing it.
    '''
    # ``in_list``, ``shifts`` and ``outfile`` are referenced by name in the
    # statement template below, so they must keep these names.
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")
    in_list = " --bamfile=".join(infiles)

    offsets = []
    for bam_path in infiles:
        track_name = P.snip(os.path.basename(bam_path), ".norm.bam")
        macs_path = "macs/with_input/%s.macs" % track_name
        if not os.path.exists(macs_path):
            # No MACS output for this track: it contributes no shift.
            continue
        peak_shift = PIntervals.getPeakShiftFromMacs(macs_path)
        offsets.append(str(peak_shift))

    shifts = " --shift=".join(offsets)
    statement = '''python %(scriptsdir)s/bam2wiggle.py 
                      --output-format=bigwig
                      %(in_list)s
                      %(shifts)s > %(outfile)s'''
    P.run()

コード例 #5

0

ファイルを表示

def summarizeMACSsolo(infiles, outfile):
    '''Delegate to ``PIntervals.summarizeMACSsolo`` with both arguments unchanged.

    NOTE(review): the previous docstring read "run MACS for peak
    detection", while the name suggests this summarises MACS output --
    confirm against PIntervals.
    '''
    PIntervals.summarizeMACSsolo(infiles, outfile)

コード例 #6

0

ファイルを表示

def loadMACSsolo(infiles, outfile):
    '''Unpack the two-element *infiles* and forward to ``PIntervals.loadMACS``.

    The first element is passed as the first argument, *outfile* as the
    second and the second element (a BAM file, going by the original
    variable name) as the third.
    '''
    macs_input, bam_input = infiles
    PIntervals.loadMACS(macs_input, outfile, bam_input)

コード例 #7

0

ファイルを表示

def loadMergedIntervals(infile, outfile):
    '''Re-count reads over merged intervals and load the result as a table.

    Each line of the BED file *infile* supplies contig, start, end,
    interval id and fold change (its first five whitespace-separated
    fields).  For every interval ``PIntervals.countPeaks`` is called with
    the track's ``bam/<track>.norm.bam`` file and, when the matching MACS
    output file exists, its peak shift.  The rows are written to a
    temporary tab-separated file that is fed on stdin to ``csv2db.py``,
    creating the table ``<track>_macs_merged_intervals``; the command's
    stdout goes to *outfile*.

    Column meanings, as given by the original documentation:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    Raises AssertionError if the BAM file for the track does not exist.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement``, ``tablename``, ``tmpfilename`` and
    ``outfile`` from this frame; these locals must keep their names.
    '''
    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    # Resolve the BAM file and MACS offset before any temporary file is
    # created, so a missing BAM does not leave a stray temp file behind.
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    tablename = "%s_macs_merged_intervals" % track

    bam_fn = "bam/%s.norm.bam" % track
    assert os.path.exists(
        bam_fn), "could not find bamfile %s for track %s" % (bam_fn, track)

    if track.find("solo") > -1:
        macs_fn = "macs/no_input/%s.macs" % track
    else:
        macs_fn = "macs/with_input/%s.macs" % track
    offsets = []
    if os.path.exists(macs_fn):
        offsets.append(PIntervals.getPeakShiftFromMacs(macs_fn))

    statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''

    c = E.Counter()
    # Text mode so that writing str works on Python 3 as well as Python 2.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfiles = []
        try:
            samfiles.append(pysam.Samfile(bam_fn, "rb"))
            tmpfile.write("\t".join(headers) + "\n")

            # Loop over input Bed file and calculate stats for merged
            # intervals
            with open(infile, "r") as bedfile:
                for line in bedfile:
                    c.input += 1
                    # split() already discards the trailing newline; the
                    # former line[:-1] cut a real character off the last
                    # field when the final line had no newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)

                    npeaks, peakcenter, length, avgval, peakval, nprobes = PIntervals.countPeaks(
                        contig, start, end, samfiles, offsets)

                    # NOTE(review): the original comment said intervals
                    # without reads should not be output, yet the code
                    # only counts them and still writes the row.  That
                    # behaviour is kept -- confirm which one is intended.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            # Flush the table to disk and release the BAM handle(s)
            # whether or not the loop completed.
            tmpfile.close()
            for samfile in samfiles:
                samfile.close()

        P.run()
    finally:
        # delete=False above: the file has to be removed explicitly, also
        # when counting or the database load fails.
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))

コード例 #8

0

ファイルを表示

def mergeIntervals(infile, outfile):
    '''Delegate interval merging to ``PIntervals.mergeIntervalsWithScores``.

    The PARAMS entries ``intervals_merge_dist`` and
    ``intervals_foldchange_merge_method`` are passed as the third and
    fourth arguments (merge distance and fold-change merge method, going
    by the key names and the original description).
    '''
    merge_distance = PARAMS["intervals_merge_dist"]
    foldchange_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, merge_distance, foldchange_method)

コード例 #9

0

ファイルを表示

def exportIntervalsAsBed(infile, outfile):
    '''Write intervals to BED via ``PIntervals.exportMacsIntervalsAsBed``.

    The PARAMS entry ``intervals_min_fc`` is passed as the third argument
    (a fold-change cutoff, per the original description).
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)

コード例 #10

0

ファイルを表示

ファイル: pipeline_fastqToBigWig.py プロジェクト: Charlie-George/cgat

def summarizeMACSsolo(infiles, outfile):
    '''Delegate to ``PIntervals.summarizeMACSsolo`` with both arguments unchanged.

    NOTE(review): the previous docstring read "run MACS for peak
    detection", while the name suggests this summarises MACS output --
    confirm against PIntervals.
    '''
    PIntervals.summarizeMACSsolo(infiles, outfile)

コード例 #11

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def loadMergedIntervals(infile, outfile):
    '''Re-count reads over merged intervals and load the result as a table.

    Each line of the BED file *infile* supplies contig, start, end,
    interval id and fold change (its first five whitespace-separated
    fields).  For every interval ``PIntervals.countPeaks`` is called with
    the track's ``bam/<track>.norm.bam`` file and, when the matching MACS
    output file exists, its peak shift.  The rows are written to a
    temporary tab-separated file that is fed on stdin to ``csv2db.py``,
    creating the table ``<track>_macs_merged_intervals``; the command's
    stdout goes to *outfile*.

    Column meanings, as given by the original documentation:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    Raises AssertionError if the BAM file for the track does not exist.

    NOTE(review): ``P.run()`` is called without arguments, so it
    presumably reads ``statement``, ``tablename``, ``tmpfilename`` and
    ``outfile`` from this frame; these locals must keep their names.
    '''
    headers = ("contig", "start", "end", "interval_id", "nPeaks", "PeakCenter",
               "Length", "AvgVal", "PeakVal", "nProbes", "Fold")

    # Resolve the BAM file and MACS offset before any temporary file is
    # created, so a missing BAM does not leave a stray temp file behind.
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    tablename = "%s_macs_merged_intervals" % track

    bam_fn = "bam/%s.norm.bam" % track
    assert os.path.exists(bam_fn), "could not find bamfile %s for track %s" % (bam_fn, track)

    if track.find("solo") > -1:
        macs_fn = "macs/no_input/%s.macs" % track
    else:
        macs_fn = "macs/with_input/%s.macs" % track
    offsets = []
    if os.path.exists(macs_fn):
        offsets.append(PIntervals.getPeakShiftFromMacs(macs_fn))

    statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start 
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''

    c = E.Counter()
    # Text mode so that writing str works on Python 3 as well as Python 2.
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name
    try:
        samfiles = []
        try:
            samfiles.append(pysam.Samfile(bam_fn, "rb"))
            tmpfile.write("\t".join(headers) + "\n")

            # Loop over input Bed file and calculate stats for merged
            # intervals
            with open(infile, "r") as bedfile:
                for line in bedfile:
                    c.input += 1
                    # split() already discards the trailing newline; the
                    # former line[:-1] cut a real character off the last
                    # field when the final line had no newline.
                    contig, start, end, int_id, fc = line.split()[:5]
                    start, end = int(start), int(end)

                    npeaks, peakcenter, length, avgval, peakval, nprobes = PIntervals.countPeaks(
                        contig, start, end, samfiles, offsets)

                    # NOTE(review): the original comment said intervals
                    # without reads should not be output, yet the code
                    # only counts them and still writes the row.  That
                    # behaviour is kept -- confirm which one is intended.
                    if nprobes == 0:
                        c.skipped_reads += 1

                    c.output += 1
                    tmpfile.write("\t".join(
                        map(str, (contig, start, end, int_id, npeaks,
                                  peakcenter, length, avgval, peakval,
                                  nprobes, fc))) + "\n")
        finally:
            # Flush the table to disk and release the BAM handle(s)
            # whether or not the loop completed.
            tmpfile.close()
            for samfile in samfiles:
                samfile.close()

        P.run()
    finally:
        # delete=False above: the file has to be removed explicitly, also
        # when counting or the database load fails.
        os.unlink(tmpfilename)

    L.info("%s\n" % str(c))

コード例 #12

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def mergeIntervals(infile, outfile):
    '''Delegate interval merging to ``PIntervals.mergeIntervalsWithScores``.

    The PARAMS entries ``intervals_merge_dist`` and
    ``intervals_foldchange_merge_method`` are passed as the third and
    fourth arguments (merge distance and fold-change merge method, going
    by the key names and the original description).
    '''
    merge_distance = PARAMS["intervals_merge_dist"]
    foldchange_method = PARAMS["intervals_foldchange_merge_method"]
    PIntervals.mergeIntervalsWithScores(
        infile, outfile, merge_distance, foldchange_method)

コード例 #13

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def exportIntervalsAsBed(infile, outfile):
    '''Write intervals to BED via ``PIntervals.exportMacsIntervalsAsBed``.

    The PARAMS entry ``intervals_min_fc`` is passed as the third argument
    (a fold-change cutoff, per the original description).
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)

コード例 #14

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def summarizeMACS( infiles, outfile ):
    '''Delegate to ``PIntervals.summarizeMACS`` with both arguments unchanged.

    Per the original description this parses MACS summary statistics from
    a log file; the parsing itself lives in PIntervals and is not visible
    here.
    '''
    PIntervals.summarizeMACS( infiles, outfile )

コード例 #15

0

ファイルを表示

ファイル: pipeline_proj012_chipseq.py プロジェクト: BioinformaticsArchive/cgat

def loadMACS(infiles, outfile):
    '''Unpack the two-element *infiles* and forward to ``PIntervals.loadMACS``.

    The first element is passed as the first argument, *outfile* as the
    second and the second element (a BAM file, going by the original
    variable name) as the third.  Per the original description the
    callee loads MACS intervals into the database and filters on fold
    change and p-value; that logic is not visible here.
    '''
    macs_input, bam_input = infiles
    PIntervals.loadMACS(macs_input, outfile, bam_input)

コード例 #16

0

ファイルを表示

def exportIntervalsAsBedsolo(infile, outfile):
    '''Write intervals to file via ``PIntervals.exportMacsIntervalsAsBed``.

    The PARAMS entry ``intervals_min_fc`` is passed as the third argument
    (a fold-change threshold, per the original description).
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)

コード例 #17

0

ファイルを表示

ファイル: pipeline_fastqToBigWig.py プロジェクト: Charlie-George/cgat

def loadMACSsolo(infiles, outfile):
    '''Unpack the two-element *infiles* and forward to ``PIntervals.loadMACS``.

    The first element is passed as the first argument, *outfile* as the
    second and the second element (a BAM file, going by the original
    variable name) as the third.
    '''
    macs_input, bam_input = infiles
    PIntervals.loadMACS(macs_input, outfile, bam_input)

コード例 #18

0

ファイルを表示

def loadMACS(infiles, outfile):
    '''Unpack the two-element *infiles* and forward to ``PIntervals.loadMACS``.

    The first element is passed as the first argument, *outfile* as the
    second and the second element (a BAM file, going by the original
    variable name) as the third.  Per the original description the
    callee loads MACS intervals into the database and filters on fold
    change and p-value; that logic is not visible here.
    '''
    macs_input, bam_input = infiles
    PIntervals.loadMACS(macs_input, outfile, bam_input)

コード例 #19

0

ファイルを表示

ファイル: pipeline_fastqToBigWig.py プロジェクト: Charlie-George/cgat

def exportIntervalsAsBedsolo(infile, outfile):
    '''Write intervals to file via ``PIntervals.exportMacsIntervalsAsBed``.

    The PARAMS entry ``intervals_min_fc`` is passed as the third argument
    (a fold-change threshold, per the original description).
    '''
    min_fold_change = PARAMS["intervals_min_fc"]
    PIntervals.exportMacsIntervalsAsBed(infile, outfile, min_fold_change)

コード例 #20

0

ファイルを表示

def summarizeMACS(infiles, outfile):
    '''Delegate to ``PIntervals.summarizeMACS`` with both arguments unchanged.

    Per the original description this parses MACS summary statistics from
    a log file; the parsing itself lives in PIntervals and is not visible
    here.
    '''
    PIntervals.summarizeMACS(infiles, outfile)