def add(self,read,A,B,dist,ov,strandmatch,signal,n_hits):
    """Record one supporting read (an anchor pair) on this junction.

    Updates the per-read lists (edits, overlaps, n_hits, mapquals_A/B,
    readnames, reads), the strand and unique-bridge counters, the
    per-sample tally in ``self.tissues`` and the ``self.uniq`` set.

    Args:
        read: read sequence as a string.
        A, B: the two anchor alignments; ``.pos``, ``.tags``, ``.qname``
            and ``.is_reverse`` are read (presumably pysam records --
            confirm against caller).
        dist: edit distance of the breakpoint hit.
        ov: anchor overlap of the breakpoint hit.
        strandmatch: strand-agreement label, stored on the instance.
        signal: splice signal, stored on the instance.
        n_hits: number of candidate breakpoints found for this read.
    """
    # NOTE(review): the original indentation was lost; the mapquals
    # appends are assumed to be unconditional -- confirm.
    self.signal = signal
    self.strandmatch = strandmatch
    self.edits.append(dist)
    self.overlaps.append(ov)
    self.n_hits.append(n_hits)

    # by convention the first anchor precedes the second in the genome
    first, second = (B, A) if A.pos > B.pos else (A, B)

    def score_margin(anchor):
        # alignment score minus second-best hit score
        tags = dict(anchor.tags)
        return tags.get('AS') - tags.get('XS',minmapscore)

    qual_first = score_margin(first)
    qual_second = score_margin(second)
    if qual_first and qual_second:
        # both anchors from the *same read* align uniquely
        self.uniq_bridges += 1
    self.mapquals_A.append(qual_first)
    self.mapquals_B.append(qual_second)

    # recover the original read name ('__' is forbidden in input read
    # names); if the first anchor carries no '__' the reads have been
    # swapped at some point and the second anchor's name is used
    tagged_name = first.qname if '__' in first.qname else second.qname
    qname = tagged_name.split('__')[0][:-2]
    self.readnames.append(qname)

    # record the spliced read sequence as it was before mapping
    if first.is_reverse:
        self.strand_minus += 1
        self.reads.append(rev_comp(read))
    else:
        self.strand_plus += 1
        self.reads.append(read)

    # identify the tissue/sample: first matching prefix wins,
    # otherwise fall back to options.name
    sample_name = next(
        (tiss for (prefix,tiss) in samples if qname.startswith(prefix)),
        options.name,
    )
    self.tissues[sample_name] += 1
    self.uniq.add((read,sample_name))
    self.uniq.add((rev_comp(read),sample_name))
def add(self, read, A, B, dist, ov, strandmatch, signal, n_hits):
    """Register a read that supports this junction.

    Appends the hit statistics and read details to the instance's
    per-read lists, bumps the strand / unique-bridge counters and the
    per-sample tally, and stores the read (both orientations) in
    ``self.uniq``.

    Args:
        read: read sequence (string).
        A, B: anchor alignments exposing ``.pos``, ``.tags``, ``.qname``
            and ``.is_reverse`` (presumably pysam records -- confirm).
        dist: edit distance of the chosen breakpoint.
        ov: anchor overlap of the chosen breakpoint.
        strandmatch: strand-agreement label; overwrites the stored one.
        signal: splice signal; overwrites the stored one.
        n_hits: number of breakpoint candidates for this read.
    """
    # NOTE(review): statement nesting was reconstructed from a
    # whitespace-mangled original; mapquals are assumed to be recorded
    # for every read -- confirm.
    self.signal, self.strandmatch = signal, strandmatch
    self.edits.append(dist)
    self.overlaps.append(ov)
    self.n_hits.append(n_hits)

    # Keep A as the anchor with the smaller genomic position.
    if A.pos > B.pos:
        A, B = B, A

    # Alignment score minus second-best hit score, per anchor.
    tags_a = dict(A.tags)
    tags_b = dict(B.tags)
    margin_a = tags_a.get('AS') - tags_a.get('XS', minmapscore)
    margin_b = tags_b.get('AS') - tags_b.get('XS', minmapscore)
    if margin_a and margin_b:
        # Both anchors of the same read align uniquely.
        self.uniq_bridges += 1
    self.mapquals_A.append(margin_a)
    self.mapquals_B.append(margin_b)

    # Recover the original read name ('__' is forbidden in input read
    # names). Without '__' in A's name the reads were swapped earlier,
    # so B's name is used instead.
    if '__' in A.qname:
        name_source = A.qname
    else:
        name_source = B.qname
    qname = name_source.split('__')[0][:-2]
    self.readnames.append(qname)

    # Store the spliced read as it was before mapping.
    if A.is_reverse:
        self.strand_minus += 1
        original_read = rev_comp(read)
    else:
        self.strand_plus += 1
        original_read = read
    self.reads.append(original_read)

    # Assign the sample: the first prefix matching the read name wins,
    # otherwise the default options.name is kept.
    sample_name = options.name
    for prefix, tiss in samples:
        if qname.startswith(prefix):
            sample_name = tiss
            break
    self.tissues[sample_name] += 1

    for sequence in (read, rev_comp(read)):
        self.uniq.add((sequence, sample_name))
def get_data(self, chrom, start, end, sense):
    """Return the sequence ``chrom[start:end]`` from the memory-mapped file.

    Positions before 0 or past the chromosome size are filled with "N",
    so a request that overhangs either end still yields ``end - start``
    characters.

    Args:
        chrom: key into ``self.chrom_stats``.
        start: start coordinate; may be negative.
        end: end coordinate; may exceed the chromosome size.
        sense: "-" returns ``rev_comp`` of the sequence; any other value
            returns it unchanged.

    Returns:
        The requested sequence with ``skip_char`` removed.

    Raises:
        KeyError: if ``chrom`` is not in ``self.chrom_stats``.
    """
    if not self.chrom_stats:
        # Nothing indexed yet: let index() fill chrom_stats first.
        self.index()

    # ofs: offset of the first sequence character in the mmap,
    # ldata: sequence characters per line, skip: extra characters per
    # line, skip_char: that separator (presumably the line terminator),
    # size: chromosome length.
    ofs, ldata, skip, skip_char, size = self.chrom_stats[chrom]

    pad_start = 0
    pad_end = 0
    if start < 0:
        pad_start = -start
        start = 0
    if end > size:
        pad_end = end - size
        # Clamp to the chromosome end. The original assigned `end = end`,
        # so the slice ran on into whatever follows in the file.
        end = size

    # Completed lines before each coordinate. Floor division keeps the
    # offsets integral on Python 3 as well as Python 2.
    l_start = start // ldata
    l_end = end // ldata
    ofs_start = l_start * skip + start + ofs
    ofs_end = l_end * skip + end + ofs

    s = self.mmap[ofs_start:ofs_end].replace(skip_char, "")
    if pad_start or pad_end:
        s = "N" * pad_start + s + "N" * pad_end
    if sense == "-":
        s = rev_comp(s)
    return s
def get_data(self,chrom,start,end,sense):
    """Fetch ``chrom[start:end]`` from the memory-mapped sequence file.

    Coordinates outside the chromosome are answered with "N" padding: a
    negative ``start`` pads on the left, an ``end`` beyond the
    chromosome size pads on the right.

    Args:
        chrom: key into ``self.chrom_stats``.
        start: start coordinate; may be negative.
        end: end coordinate; may exceed the chromosome size.
        sense: when "-", the result is passed through ``rev_comp``.

    Returns:
        The sequence with ``skip_char`` stripped out.

    Raises:
        KeyError: if ``chrom`` is not in ``self.chrom_stats``.
    """
    if not self.chrom_stats:
        # chrom_stats is empty: call index() before the lookup
        self.index()

    # ofs: offset of the chromosome's first sequence character,
    # ldata: sequence characters per line, skip: separator characters
    # per line, skip_char: the separator itself (presumably a newline),
    # size: chromosome length
    ofs,ldata,skip,skip_char,size = self.chrom_stats[chrom]

    pad_start = 0
    pad_end = 0
    if start < 0:
        pad_start = -start
        start = 0
    if end > size:
        pad_end = end - size
        # clamp to the chromosome end; the former `end = end` was a
        # no-op and let the slice read past this chromosome
        end = size

    # each completed line adds `skip` separator characters to the file
    # offset; `//` keeps the arithmetic integral under Python 3 too
    ofs_start = (start // ldata) * skip + start + ofs
    ofs_end = (end // ldata) * skip + end + ofs

    s = self.mmap[ofs_start:ofs_end].replace(skip_char,"")
    if pad_start or pad_end:
        s = "N"*pad_start + s + "N"*pad_end
    if sense == "-":
        s = rev_comp(s)
    return s
def find_breakpoints(A, B, read, chrom, margin=options.margin, maxdist=options.maxdist):
    """Enumerate candidate splice breakpoints between two anchors.

    Every split position of the read's internal part (the read without
    its ``options.asize - margin`` outer bases on each side) is compared
    with the genomic flanks of A and B. Positions whose mismatch count
    is at most ``maxdist`` become hits.

    Args:
        A, B: anchor alignments; ``A.aend``, ``B.pos`` and
            ``A.is_reverse`` are read (presumably pysam records --
            confirm against caller).
        read: full read sequence.
        chrom: chromosome name handed to ``genome.get``.
        margin: number of bases the anchors may be shortened by.
        maxdist: largest accepted mismatch count.

    Returns:
        A list of tuples ``(dist, ov, strandmatch, rnd, chrom, start,
        end, signal, sense)``. With fewer than two hits the list is
        returned as collected; otherwise only the hits tied with the
        best one after sorting are returned.
    """
    def count_mismatches(seq_a, seq_b):
        # Element-wise comparison of the two strings as byte arrays.
        arr_a = fromstring(seq_a, dtype=byte)
        arr_b = fromstring(seq_b, dtype=byte)
        return (arr_a != arr_b).sum()

    # Tie-break value stored in each hit: random when requested, else 0.
    if options.randomize:
        def tie_breaker():
            return numpy.random.random()
    else:
        def tie_breaker():
            return 0

    def strand_agreement(ann, sense):
        if ann == sense:
            return "MATCH"
        if ann == "*" or len(ann) > 1:
            return "NA"
        return "MISMATCH"

    eff_a = options.asize - margin
    internal = read[eff_a:-eff_a].upper()
    span = len(read) - 2 * eff_a
    flank = span + 2

    a_left = A.aend - margin
    b_right = B.pos + margin
    A_flank = genome.get(chrom, a_left, a_left + flank, '+').upper()
    B_flank = genome.get(chrom, b_right - flank, b_right, '+').upper()

    # Strand cue from the read, only if strand-specific sequencing was
    # used; it does not depend on the split position.
    if not options.stranded:
        strand = "*"
    elif A.is_reverse:
        strand = '-'
    else:
        strand = '+'

    hits = []
    for x in range(span + 1):
        spliced = A_flank[:x] + B_flank[x + 2:]
        dist = count_mismatches(spliced, internal)
        if not (dist <= maxdist):
            continue

        # Overlap of the breakpoint with the anchor margins; the
        # right-hand check takes precedence, as in the original.
        remaining = span - x
        if remaining < margin:
            ov = margin - remaining
        elif x < margin:
            ov = margin - x
        else:
            ov = 0

        gtag = A_flank[x:x + 2] + B_flank[x:x + 2]
        rc_gtag = rev_comp(gtag)

        left = B.pos + margin - span + x
        right = A.aend - margin + x + 1
        start, end = min(left, right), max(left, right)

        if options.noncanonical:
            # Any signal is accepted; report it once per strand.
            hits.append((dist, ov, strand_agreement(strand, '+'), tie_breaker(),
                         chrom, start, end, gtag, '+'))
            hits.append((dist, ov, strand_agreement(strand, '-'), tie_breaker(),
                         chrom, start, end, rc_gtag, '-'))
        elif gtag == 'GTAG':
            hits.append((dist, ov, strand_agreement(strand, '+'), tie_breaker(),
                         chrom, start, end, 'GTAG', '+'))
        elif gtag == 'CTAC':
            hits.append((dist, ov, strand_agreement(strand, '-'), tie_breaker(),
                         chrom, start, end, 'GTAG', '-'))

    if len(hits) < 2:
        # unambiguous, return right away
        return hits

    # Hits are sorted, with low edit distance beating low anchor overlap.
    hits = sorted(hits)

    # Keep only hits tied with the best candidate on edit distance and
    # anchor overlap, plus strand agreement when options.strandpref is
    # set. Ties are kept here so that they are counted properly.
    n_keys = 3 if options.strandpref else 2
    best_key = hits[0][:n_keys]
    return [h for h in hits if h[:n_keys] == best_key]
### anchor pairs that make it up to here are interesting if bam_out: bam_out.write(A) bam_out.write(B) #debug("A='%s' B='%s' dist=%d A.is_reverse=%s" % (A,B,dist,A.is_reverse)) if (A.is_reverse and dist > 0) or (not A.is_reverse and dist < 0): # the anchors align in reversed orientation -> circRNA? read = A.qname.split('__')[1] chrom = sam.getrname(A.tid) if A.is_reverse: #print "ISREVERSE" A, B = B, A read = rev_comp(read) bp = find_breakpoints(A, B, read, chrom) if not bp: N['circ_no_bp'] += 1 else: N['circ_reads'] += 1 n_hits = len(bp) if bp and not options.allhits: bp = [ bp[0], ] for h in bp: # for some weird reason for circ we need a correction here
def find_breakpoints(A,B,read,chrom,margin=options.margin,maxdist=options.maxdist):
    """Search for splice breakpoints consistent with anchors A and B.

    The part of the read between the shortened anchors is rebuilt from
    the genomic sequence next to A and next to B for every possible
    split position. Splits with at most ``maxdist`` mismatches are
    collected as hits.

    Args:
        A, B: anchor alignments; ``A.aend``, ``B.pos`` and
            ``A.is_reverse`` are used (presumably pysam records --
            confirm).
        read: the complete read sequence.
        chrom: chromosome name passed to ``genome.get``.
        margin: bases by which each anchor is shortened.
        maxdist: maximum number of mismatches for a hit.

    Returns:
        List of ``(dist, ov, strandmatch, rnd, chrom, start, end,
        signal, sense)`` tuples: all hits if there are fewer than two,
        otherwise the sorted hits that tie with the best one.
    """
    def n_mismatches(left_seq,right_seq):
        # compare both strings position by position as byte arrays
        left_arr = fromstring(left_seq,dtype=byte)
        right_arr = fromstring(right_seq,dtype=byte)
        return (left_arr != right_arr).sum()

    def draw():
        # tie-break value stored with each hit; constant unless
        # options.randomize is set
        if options.randomize:
            return numpy.random.random()
        return 0

    def compare_strand(ann,sense):
        if ann == sense:
            return "MATCH"
        return "NA" if (ann == "*" or len(ann) > 1) else "MISMATCH"

    L = len(read)
    eff_a = options.asize-margin
    internal = read[eff_a:-eff_a].upper()
    n_splits = L - 2*eff_a
    flank = n_splits + 2

    A_flank = genome.get(chrom,A.aend-margin,A.aend-margin + flank,'+').upper()
    B_flank = genome.get(chrom,B.pos - flank+margin,B.pos+margin,'+').upper()

    hits = []
    for x in range(n_splits+1):
        candidate = A_flank[:x] + B_flank[x+2:]
        dist = n_mismatches(candidate,internal)

        # overlap with the anchor margins; the right-hand check
        # overrides the left-hand one
        ov = margin-x if x < margin else 0
        if n_splits-x < margin:
            ov = margin-(n_splits-x)

        if dist <= maxdist:
            gtag = A_flank[x:x+2] + B_flank[x:x+2]
            rc_gtag = rev_comp(gtag)

            coords = (B.pos+margin-n_splits+x, A.aend-margin+x+1)
            start,end = min(coords),max(coords)

            # get strand cues from the read if strand-specific
            # sequencing was used
            strand = "*"
            if options.stranded:
                strand = '-' if A.is_reverse else '+'

            if options.noncanonical:
                # accept any signal and report it for both strands
                hits.append((dist,ov,compare_strand(strand,'+'),draw(),chrom,start,end,gtag,'+'))
                hits.append((dist,ov,compare_strand(strand,'-'),draw(),chrom,start,end,rc_gtag,'-'))
            elif gtag == 'GTAG':
                hits.append((dist,ov,compare_strand(strand,'+'),draw(),chrom,start,end,'GTAG','+'))
            elif gtag == 'CTAC':
                hits.append((dist,ov,compare_strand(strand,'-'),draw(),chrom,start,end,'GTAG','-'))

    if len(hits) < 2:
        # unambiguous, return right away
        return hits

    # hits are sorted, with low edit distance beating low anchor overlap
    hits = sorted(hits)
    best = hits[0]

    def ties_with_best(h):
        # tied on edit distance and anchor overlap; strand agreement
        # only counts when explicitly requested
        if h[0] != best[0] or h[1] != best[1]:
            return False
        return (h[2] == best[2]) if options.strandpref else True

    # ties are either broken by random draw or by selecting the first
    # hit later, but they are kept here so they are counted properly
    return [h for h in hits if ties_with_best(h)]
### anchor pairs that make it up to here are interesting if bam_out: bam_out.write(A) bam_out.write(B) #debug("A='%s' B='%s' dist=%d A.is_reverse=%s" % (A,B,dist,A.is_reverse)) if (A.is_reverse and dist > 0) or (not A.is_reverse and dist < 0): # the anchors align in reversed orientation -> circRNA? read = A.qname.split('__')[1] chrom = sam.getrname(A.tid) if A.is_reverse: #print "ISREVERSE" A,B = B,A read = rev_comp(read) bp = find_breakpoints(A,B,read,chrom) if not bp: N['circ_no_bp'] += 1 else: N['circ_reads'] += 1 n_hits = len(bp) if bp and not options.allhits: bp = [bp[0],] for h in bp: # for some weird reason for circ we need a correction here dist,ov,strandmatch,rnd,chrom,start,end,signal,sense = h h = (chrom,start+1,end-1,sense)