Esempio n. 1
0
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(infiles[0]):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
Esempio n. 2
0
def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
        intersectBed -v -a %(infile)s -b %(subtractfile)s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %(outfile)s ; tabix -p bed %(outfile)s
        '''

    P.run()
Esempio n. 3
0
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step 
    removing all intervals in which there is a peak of
    more than readlength / 2 height in the control.

    .. note:

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak higher than %i reads" %
                   control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                     peak.refined_start,
                                                                                     peak.refined_end,
                                                                                     controlfiles,
                                                                                     offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                 peak.refined_start,
                                                                                 peak.refined_end,
                                                                                 samfiles,
                                                                                 offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig, peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''
    bed1, bed2 = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s; 
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s; 
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s; 
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;                   
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.is_empty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            IOTools.touch_file(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.is_empty(infiles[0]):
            IOTools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if IOTools.is_empty(infiles[0]):
                IOTools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)