Beispiel #1
0
    def add(self,read,A,B,dist,ov,strandmatch,signal,n_hits):
        """Register one supporting read and its two anchor alignments.

        Updates the per-read lists (edits, overlaps, hit counts, anchor
        mapping qualities, read names, read sequences), the strand and
        sample counters, and the set of (sequence, sample) pairs.

        read        -- read sequence; stored reverse-complemented when the
                       upstream anchor is on the reverse strand
        A, B        -- anchor alignments; must provide .pos, .tags, .qname
                       and .is_reverse
        dist, ov    -- appended to self.edits / self.overlaps
        strandmatch -- stored on self.strandmatch (overwrites previous value)
        signal      -- stored on self.signal (overwrites previous value)
        n_hits      -- appended to self.n_hits
        """
        self.signal, self.strandmatch = signal, strandmatch
        self.edits.append(dist)
        self.overlaps.append(ov)
        self.n_hits.append(n_hits)

        # Convention: 'upstream' is the anchor with the smaller genomic position.
        if A.pos > B.pos:
            upstream, downstream = B, A
        else:
            upstream, downstream = A, B

        # Mapping quality proxy: alignment score minus second-best hit score
        # (the module-level minmapscore is used when there is no 'XS' tag).
        up_tags = dict(upstream.tags)
        down_tags = dict(downstream.tags)
        qual_up = up_tags.get('AS') - up_tags.get('XS',minmapscore)
        qual_down = down_tags.get('AS') - down_tags.get('XS',minmapscore)

        # Count the read as a unique bridge only if both differences are non-zero.
        if qual_up and qual_down:
            self.uniq_bridges += 1

        self.mapquals_A.append(qual_up)
        self.mapquals_B.append(qual_down)

        # Recover the original read name: the part before '__', minus its
        # last two characters ('__' is forbidden in input read names).
        # Fall back to the other anchor if the first one lacks the separator.
        name_source = upstream if '__' in upstream.qname else downstream
        qname = name_source.qname.split('__')[0][:-2]
        self.readnames.append(qname)

        # Store the read sequence as it was before mapping.
        if not upstream.is_reverse:
            self.strand_plus += 1
            self.reads.append(read)
        else:
            self.strand_minus += 1
            self.reads.append(rev_comp(read))

        # Sample/tissue lookup: first matching read-name prefix wins,
        # otherwise fall back to options.name.
        fallback_name = options.name
        sample_name = next(
            (tiss for (prefix,tiss) in samples if qname.startswith(prefix)),
            fallback_name,
        )
        self.tissues[sample_name] += 1

        # Remember the sequence in both orientations for this sample.
        self.uniq.add((read,sample_name))
        self.uniq.add((rev_comp(read),sample_name))
Beispiel #2
0
    def add(self, read, A, B, dist, ov, strandmatch, signal, n_hits):
        """Register one supporting read and its two anchor alignments.

        Updates the per-read lists (edits, overlaps, hit counts, anchor
        mapping qualities, read names, read sequences), the strand and
        sample counters, and the set of (sequence, sample) pairs.

        read        -- read sequence; stored reverse-complemented when the
                       upstream anchor is on the reverse strand
        A, B        -- anchor alignments; must provide .pos, .tags, .qname
                       and .is_reverse
        dist, ov    -- appended to self.edits / self.overlaps
        strandmatch -- stored on self.strandmatch (overwrites previous value)
        signal      -- stored on self.signal (overwrites previous value)
        n_hits      -- appended to self.n_hits
        """
        self.signal, self.strandmatch = signal, strandmatch
        self.edits.append(dist)
        self.overlaps.append(ov)
        self.n_hits.append(n_hits)

        # Convention: 'upstream' is the anchor with the smaller genomic position.
        if A.pos > B.pos:
            upstream, downstream = B, A
        else:
            upstream, downstream = A, B

        # Mapping quality proxy: alignment score minus second-best hit score
        # (the module-level minmapscore is used when there is no 'XS' tag).
        up_tags = dict(upstream.tags)
        down_tags = dict(downstream.tags)
        qual_up = up_tags.get('AS') - up_tags.get('XS', minmapscore)
        qual_down = down_tags.get('AS') - down_tags.get('XS', minmapscore)

        # Count the read as a unique bridge only if both differences are non-zero.
        if qual_up and qual_down:
            self.uniq_bridges += 1

        self.mapquals_A.append(qual_up)
        self.mapquals_B.append(qual_down)

        # Recover the original read name: the part before '__', minus its
        # last two characters ('__' is forbidden in input read names).
        # Fall back to the other anchor if the first one lacks the separator.
        name_source = upstream if '__' in upstream.qname else downstream
        qname = name_source.qname.split('__')[0][:-2]
        self.readnames.append(qname)

        # Store the read sequence as it was before mapping.
        if not upstream.is_reverse:
            self.strand_plus += 1
            self.reads.append(read)
        else:
            self.strand_minus += 1
            self.reads.append(rev_comp(read))

        # Sample/tissue lookup: first matching read-name prefix wins,
        # otherwise fall back to options.name.
        fallback_name = options.name
        sample_name = next(
            (tiss for (prefix, tiss) in samples if qname.startswith(prefix)),
            fallback_name,
        )
        self.tissues[sample_name] += 1

        # Remember the sequence in both orientations for this sample.
        self.uniq.add((read, sample_name))
        self.uniq.add((rev_comp(read), sample_name))
Beispiel #3
0
    def get_data(self, chrom, start, end, sense):
        """Return the sequence of `chrom` between `start` and `end`.

        Coordinates outside [0, size) are allowed: the missing part is
        filled with "N" so the result always has length end - start.

        chrom -- key into self.chrom_stats
        start -- start coordinate (may be negative)
        end   -- end coordinate, exclusive (may exceed the sequence size)
        sense -- "-" returns the reverse complement (via rev_comp);
                 any other value returns the sequence as stored

        self.index() is called first if self.chrom_stats is still empty.
        Raises KeyError if `chrom` is not in self.chrom_stats.
        """
        if not self.chrom_stats:
            self.index()

        # Per-sequence layout record. As used below: ofs is the offset of
        # the first sequence character in self.mmap, ldata the number of
        # sequence characters per line, skip the number of extra characters
        # per line, skip_char the separator stripped from the slice, and
        # size the sequence length.
        ofs, ldata, skip, skip_char, size = self.chrom_stats[chrom]
        pad_start = 0
        pad_end = 0
        if start < 0:
            pad_start = -start
            start = 0

        if end > size:
            pad_end = end - size
            # Clamp to the sequence end; otherwise the slice would run into
            # whatever follows this sequence in the mapped file.
            end = size

        # Floor division keeps the line index an integer on Python 2 and 3.
        l_start = start // ldata
        l_end = end // ldata
        ofs_start = l_start * skip + start + ofs
        ofs_end = l_end * skip + end + ofs

        s = self.mmap[ofs_start:ofs_end].replace(skip_char, "")
        if pad_start or pad_end:
            s = "N" * pad_start + s + "N" * pad_end

        if sense == "-":
            s = rev_comp(s)
        return s
Beispiel #4
0
    def get_data(self,chrom,start,end,sense):
        """Return the sequence of `chrom` between `start` and `end`.

        Coordinates outside [0, size) are allowed: the missing part is
        filled with "N" so the result always has length end - start.

        chrom -- key into self.chrom_stats
        start -- start coordinate (may be negative)
        end   -- end coordinate, exclusive (may exceed the sequence size)
        sense -- "-" returns the reverse complement (via rev_comp);
                 any other value returns the sequence as stored

        self.index() is called first if self.chrom_stats is still empty.
        Raises KeyError if `chrom` is not in self.chrom_stats.
        """
        if not self.chrom_stats:
            self.index()

        # Per-sequence layout record. As used below: ofs is the offset of
        # the first sequence character in self.mmap, ldata the number of
        # sequence characters per line, skip the number of extra characters
        # per line, skip_char the separator stripped from the slice, and
        # size the sequence length.
        ofs,ldata,skip,skip_char,size = self.chrom_stats[chrom]
        pad_start = 0
        pad_end = 0
        if start < 0:
            pad_start = -start
            start = 0

        if end > size:
            pad_end = end - size
            # Clamp to the sequence end; otherwise the slice would run into
            # whatever follows this sequence in the mapped file.
            end = size

        # Floor division keeps the line index an integer on Python 2 and 3.
        l_start = start // ldata
        l_end = end // ldata
        ofs_start = l_start * skip + start + ofs
        ofs_end = l_end * skip + end + ofs

        s = self.mmap[ofs_start:ofs_end].replace(skip_char,"")
        if pad_start or pad_end:
            s = "N"*pad_start + s + "N"*pad_end

        if sense == "-":
            s = rev_comp(s)
        return s
Beispiel #5
0
def find_breakpoints(A,
                     B,
                     read,
                     chrom,
                     margin=options.margin,
                     maxdist=options.maxdist):
    """Enumerate candidate breakpoints between the anchors A and B.

    The part of `read` between the two (margin-shortened) anchors is compared
    against every way of joining the genomic flank after A with the genomic
    flank before B. Each split with at most `maxdist` mismatches yields hit
    tuples of the form

        (dist, ov, strandmatch, rnd, chrom, start, end, signal, sense)

    where `ov` is how far the split reaches into an anchor margin, `rnd` is a
    random number (0 unless options.randomize is set) and `signal` is the
    four flanking nucleotides. Unless options.noncanonical is set, only
    GTAG (sense '+') and CTAC (reported as GTAG, sense '-') are kept.

    Returns all hits if there are fewer than two; otherwise the sorted hits
    tied with the best one on (dist, ov), plus strandmatch when
    options.strandpref is set.
    """
    def count_mismatches(seq_a, seq_b):
        # position-wise comparison of two equally long sequences
        bytes_a = fromstring(seq_a, dtype=byte)
        bytes_b = fromstring(seq_b, dtype=byte)
        return (bytes_a != bytes_b).sum()

    # Random component of the sort key; constant 0 when not randomizing.
    if options.randomize:
        def tiebreaker():
            return numpy.random.random()
    else:
        def tiebreaker():
            return 0

    def classify_strand(annotated, sense):
        if annotated == sense:
            return "MATCH"
        unknown = annotated == "*" or len(annotated) > 1
        return "NA" if unknown else "MISMATCH"

    read_len = len(read)
    candidates = []

    # anchor length that is not re-examined, and the read part in between
    anchor_core = options.asize - margin
    internal = read[anchor_core:-anchor_core].upper()

    # flanks are two nt longer than the internal part to include the signal
    flank_len = read_len - 2 * anchor_core + 2

    a_from = A.aend - margin
    A_flank = genome.get(chrom, a_from, a_from + flank_len, '+').upper()
    b_to = B.pos + margin
    B_flank = genome.get(chrom, b_to - flank_len, b_to, '+').upper()

    span = read_len - 2 * anchor_core
    for cut in range(span + 1):
        spliced = A_flank[:cut] + B_flank[cut + 2:]
        n_mismatch = count_mismatches(spliced, internal)

        # overlap with an anchor margin; the B-side check takes precedence
        remaining = span - cut
        if remaining < margin:
            overlap = margin - remaining
        elif cut < margin:
            overlap = margin - cut
        else:
            overlap = 0

        if not (n_mismatch <= maxdist):
            continue

        motif = A_flank[cut:cut + 2] + B_flank[cut:cut + 2]
        rc_motif = rev_comp(motif)

        start, end = sorted(
            (B.pos + margin - span + cut, A.aend - margin + cut + 1))

        # strand cue from the read, only with strand-specific sequencing
        if not options.stranded:
            strand = "*"
        elif A.is_reverse:
            strand = '-'
        else:
            strand = '+'

        if options.noncanonical:
            # report both orientations, '+' first
            for sense, signal in (('+', motif), ('-', rc_motif)):
                candidates.append(
                    (n_mismatch, overlap, classify_strand(strand, sense),
                     tiebreaker(), chrom, start, end, signal, sense))
        elif motif == 'GTAG':
            candidates.append(
                (n_mismatch, overlap, classify_strand(strand, '+'),
                 tiebreaker(), chrom, start, end, 'GTAG', '+'))
        elif motif == 'CTAC':
            candidates.append(
                (n_mismatch, overlap, classify_strand(strand, '-'),
                 tiebreaker(), chrom, start, end, 'GTAG', '-'))

    if len(candidates) < 2:
        # zero or one hit: nothing to rank
        return candidates

    # Tuple ordering: fewer mismatches first, then smaller anchor overlap.
    ranked = sorted(candidates)
    best = ranked[0]

    # Keep every hit tied with the best one so that callers can count them;
    # the strand match only takes part in the tie when explicitly requested.
    n_keys = 3 if options.strandpref else 2
    return [hit for hit in ranked if hit[:n_keys] == best[:n_keys]]
Beispiel #6
0
        ### anchor pairs that make it up to here are interesting
        if bam_out:
            bam_out.write(A)
            bam_out.write(B)

        #debug("A='%s' B='%s' dist=%d A.is_reverse=%s" % (A,B,dist,A.is_reverse))
        if (A.is_reverse and dist > 0) or (not A.is_reverse and dist < 0):
            # the anchors align in reversed orientation -> circRNA?

            read = A.qname.split('__')[1]
            chrom = sam.getrname(A.tid)

            if A.is_reverse:
                #print "ISREVERSE"
                A, B = B, A
                read = rev_comp(read)

            bp = find_breakpoints(A, B, read, chrom)
            if not bp:
                N['circ_no_bp'] += 1
            else:
                N['circ_reads'] += 1

            n_hits = len(bp)
            if bp and not options.allhits:
                bp = [
                    bp[0],
                ]

            for h in bp:
                # for some weird reason for circ we need a correction here
Beispiel #7
0
def find_breakpoints(A,B,read,chrom,margin=options.margin,maxdist=options.maxdist):
    """Enumerate candidate breakpoints between the anchors A and B.

    The part of `read` between the two (margin-shortened) anchors is compared
    against every way of joining the genomic flank after A with the genomic
    flank before B. Each split with at most `maxdist` mismatches yields hit
    tuples of the form

        (dist, ov, strandmatch, rnd, chrom, start, end, signal, sense)

    where `ov` is how far the split reaches into an anchor margin, `rnd` is a
    random number (0 unless options.randomize is set) and `signal` is the
    four flanking nucleotides. Unless options.noncanonical is set, only
    GTAG (sense '+') and CTAC (reported as GTAG, sense '-') are kept.

    Returns all hits if there are fewer than two; otherwise the sorted hits
    tied with the best one on (dist, ov), plus strandmatch when
    options.strandpref is set.
    """
    def count_mismatches(seq_a,seq_b):
        # position-wise comparison of two equally long sequences
        bytes_a = fromstring(seq_a,dtype=byte)
        bytes_b = fromstring(seq_b,dtype=byte)
        return (bytes_a != bytes_b).sum()

    # Random component of the sort key; constant 0 when not randomizing.
    if options.randomize:
        def tiebreaker():
            return numpy.random.random()
    else:
        def tiebreaker():
            return 0

    def classify_strand(annotated,sense):
        if annotated == sense:
            return "MATCH"
        unknown = annotated == "*" or len(annotated) > 1
        return "NA" if unknown else "MISMATCH"

    read_len = len(read)
    candidates = []

    # anchor length that is not re-examined, and the read part in between
    anchor_core = options.asize-margin
    internal = read[anchor_core:-anchor_core].upper()

    # flanks are two nt longer than the internal part to include the signal
    flank_len = read_len - 2*anchor_core + 2

    a_from = A.aend-margin
    A_flank = genome.get(chrom,a_from,a_from + flank_len,'+').upper()
    b_to = B.pos+margin
    B_flank = genome.get(chrom,b_to - flank_len,b_to,'+').upper()

    span = read_len - 2*anchor_core
    for cut in range(span+1):
        spliced = A_flank[:cut] + B_flank[cut+2:]
        n_mismatch = count_mismatches(spliced,internal)

        # overlap with an anchor margin; the B-side check takes precedence
        remaining = span-cut
        if remaining < margin:
            overlap = margin-remaining
        elif cut < margin:
            overlap = margin-cut
        else:
            overlap = 0

        if not (n_mismatch <= maxdist):
            continue

        motif = A_flank[cut:cut+2] + B_flank[cut:cut+2]
        rc_motif = rev_comp(motif)

        start,end = sorted((B.pos+margin-span+cut,A.aend-margin+cut+1))

        # strand cue from the read, only with strand-specific sequencing
        if not options.stranded:
            strand = "*"
        elif A.is_reverse:
            strand = '-'
        else:
            strand = '+'

        if options.noncanonical:
            # report both orientations, '+' first
            for sense,signal in (('+',motif),('-',rc_motif)):
                candidates.append((n_mismatch,overlap,classify_strand(strand,sense),tiebreaker(),chrom,start,end,signal,sense))
        elif motif == 'GTAG':
            candidates.append((n_mismatch,overlap,classify_strand(strand,'+'),tiebreaker(),chrom,start,end,'GTAG','+'))
        elif motif == 'CTAC':
            candidates.append((n_mismatch,overlap,classify_strand(strand,'-'),tiebreaker(),chrom,start,end,'GTAG','-'))

    if len(candidates) < 2:
        # zero or one hit: nothing to rank
        return candidates

    # Tuple ordering: fewer mismatches first, then smaller anchor overlap.
    ranked = sorted(candidates)
    best = ranked[0]

    # Keep every hit tied with the best one so that callers can count them;
    # the strand match only takes part in the tie when explicitly requested.
    n_keys = 3 if options.strandpref else 2
    return [hit for hit in ranked if hit[:n_keys] == best[:n_keys]]
Beispiel #8
0
        ### anchor pairs that make it up to here are interesting
        if bam_out:
            bam_out.write(A)
            bam_out.write(B)

        #debug("A='%s' B='%s' dist=%d A.is_reverse=%s" % (A,B,dist,A.is_reverse))
        if (A.is_reverse and dist > 0) or (not A.is_reverse and dist < 0):
            # the anchors align in reversed orientation -> circRNA?
            
            read = A.qname.split('__')[1]
            chrom = sam.getrname(A.tid)
            
            if A.is_reverse:
                #print "ISREVERSE"
                A,B = B,A
                read = rev_comp(read)
                            
            bp = find_breakpoints(A,B,read,chrom)
            if not bp:
                N['circ_no_bp'] += 1
            else:
                N['circ_reads'] += 1

            n_hits = len(bp)
            if bp and not options.allhits:
                bp = [bp[0],]

            for h in bp:
                # for some weird reason for circ we need a correction here
                dist,ov,strandmatch,rnd,chrom,start,end,signal,sense = h
                h = (chrom,start+1,end-1,sense)