def test1(self):
    # Feed CigarBuilder a mix of M_1 (op 0) and I_1 (op 1) regions with
    # gaps in between, then check both the tuple and the string form of
    # the cigar it builds.
    regions = [
        makeReadRegion(1, '2I', -1, -2),  # an inserted I_1 before any M_1
        makeReadRegion(0, '4M', 0, 3),
        makeReadRegion(1, '2I', -1, -1),  # an inserted I_1 after an M_1
        makeReadRegion(1, '3I', -1, -1),  # an inserted I_1 after an inserted I_1
        # -- a gap --
        makeReadRegion(1, '1M', 5, 5),    # a match I_1, followed by an inserted I_1
        makeReadRegion(1, '4I', 6, 5),    # an inserted I_1 after a match I_1
        # -- a gap here --
        makeReadRegion(0, '2M', 7, 8),
        makeReadRegion(0, '2I', 9, 8),    # an M_1 that is an insertion
        makeReadRegion(1, '3I', 9, 8),
        makeReadRegion(0, '4I', 9, 8),    # an M_1 that is an insertion
        makeReadRegion(1, '5I', 9, 8),
        # -- a gap here --
        makeReadRegion(0, '5M', 10, 14),
    ]

    builder = cigarbuilder.CigarBuilder()
    cigar = builder.build(regions)
    print(cigar)

    expectedOps = [
        (1, 2), (0, 4), (1, 2), (1, 3), (2, 1),
        (0, 1), (1, 4), (2, 1),
        (0, 2), (1, 2), (1, 3), (1, 4), (1, 5), (2, 1),
        (0, 5),
    ]
    expectedText = '2I,4M,2I,3I,1D,1M,4I,1D,2M,2I,3I,4I,5I,1D,5M'
    self.assertEqual(cigar, expectedOps)
    self.assertEqual(cigarutils.toString(cigar), expectedText)
def logRegion(reg):
    '''
    Write one region to the log.

    The region is read positionally: reg[0] is the op, reg[1] the cigar,
    reg[2]/reg[3] the start/end, and reg[4] the mpos. Regions longer than
    five items also carry annotation counts in reg[5], reg[6] and reg[7]
    (printed as s, i and d).
    '''
    log(" op: %d.\n" % reg[0])

    coords = (reg[2], reg[3], reg[4])
    log(" region start: %d, end: %d, mpos: %d.\n" % coords)

    cigarText = cu.toString(reg[1])
    log(" region cigar: '%s'.\n" % cigarText)

    # Short regions have no annotation counts to report.
    if len(reg) <= 5:
        return
    counts = (reg[5], reg[6], reg[7])
    log(" annotation: s: %d, i: %d, d: %d.\n" % counts)
def getReadOffset(rseq, pos):
    '''
    Given the position in the alignment coordinate, return its offset in read.

    The cigar of rseq is walked from rseq.pos. Ops 0, 7 and 8 (matches)
    advance both the read offset and the alignment position, ops 1 and 4
    (insertion, soft clipping) advance only the read offset, and ops 2 and
    3 (deletion, splicing junction) advance only the alignment position.

    Raises ValueError when pos lies before the read, after the read, inside
    a deletion/splicing junction, or when the cigar consumes more bases
    than rseq.rlen. Raises NotImplementedError for any other cigar op.
    '''
    alnStart = rseq.pos
    # rlen is used instead of qlen: qlen does not include soft-clipping.
    nBases = rseq.rlen
    ops = rseq.cigar

    if pos < alnStart:
        raise ValueError("position underflows")

    def conflict():
        # Built lazily: the cigar is only rendered when an error is reported.
        template = "cigar '%s' and length '%s' conflict in read '%s'"
        rendered = cu.toString(ops)
        return ValueError(template % (rendered, nBases, rseq.qname))

    consumed = 0        # read bases passed so far
    refPos = alnStart   # alignment position of the next op
    for op, length in ops:
        if op in (0, 7, 8):
            if pos < refPos + length:
                # pos falls inside this match
                hit = consumed + pos - refPos
                if hit >= nBases:
                    raise conflict()
                return hit
            # pos is beyond this match
            consumed += length
            refPos += length
        elif op in (1, 4):
            consumed += length
        elif op in (2, 3):
            refPos += length
        else:
            raise NotImplementedError("unknown op '%s' in read '%s'"
                                      % (op, rseq.qname))

        # consumed == nBases is still fine here; only going past it is not.
        if consumed > nBases:
            raise conflict()
        if refPos > pos:
            raise ValueError('position in deletion or splicing junction')

    raise ValueError('position overflows')
def checkRegion(region):
    '''
    Sanity check for a region.

    The lengths of cigar ops 0, 2 and 3 in region[1] are added up and must
    span exactly region[2]..region[3] (inclusive). Raises ValueError when
    they do not.
    '''
    assert len(region) >= 4
    cigar, first, last = region[1], region[2], region[3]

    spanned = 0
    for op, length in cigar:
        if op in (0, 2, 3):
            spanned += length

    if first + spanned - 1 == last:
        return
    raise ValueError("Error: cigar '%s' conflicts with read region %d-%d."
                     % (cigarutils.toString(cigar), first, last))
def checkRegion(region):
    '''
    Sanity check for a region.

    Verifies that the bases covered by cigar ops 0, 2 and 3 of region[1]
    fill the inclusive range region[2]..region[3]; a ValueError is raised
    on any mismatch.
    '''
    assert len(region) >= 4
    ops = region[1]

    coveredLengths = [length for op, length in ops
                      if op == 0 or op == 2 or op == 3]
    expectedEnd = region[2] + sum(coveredLengths) - 1

    if expectedEnd != region[3]:
        message = "Error: cigar '%s' conflicts with read region %d-%d."
        raise ValueError(
            message % (cigarutils.toString(ops), region[2], region[3]))
def test1(self):
    # Build a cigar from a mix of M_1 (op 0) and I_1 (op 1) regions with
    # gaps in between, and check both its tuple form and its string form.
    regions = []
    add = regions.append

    ## an inserted I_1 before any M_1
    add(makeReadRegion(1, '2I', -1, -2))
    add(makeReadRegion(0, '4M', 0, 3))
    ## an inserted I_1 after an M_1
    add(makeReadRegion(1, '2I', -1, -1))
    ## an inserted I_1 after an inserted I_1
    add(makeReadRegion(1, '3I', -1, -1))
    ## a gap
    ## a match I_1
    add(makeReadRegion(1, '1M', 5, 5))
    ## an inserted I_1 after a match I_1
    add(makeReadRegion(1, '4I', 6, 5))
    ## a gap here
    add(makeReadRegion(0, '2M', 7, 8))
    ## an M_1 that is an insertion
    add(makeReadRegion(0, '2I', 9, 8))
    add(makeReadRegion(1, '3I', 9, 8))
    ## an M_1 that is an insertion
    add(makeReadRegion(0, '4I', 9, 8))
    add(makeReadRegion(1, '5I', 9, 8))
    ## a gap here
    add(makeReadRegion(0, '5M', 10, 14))

    cb = cigarbuilder.CigarBuilder()
    cigar = cb.build(regions)
    print(cigar)

    # The tuple form and the string form describe the same cigar, so they
    # must list the same ops: 1M followed by 4I is (0, 1), (1, 4). A single
    # (1, 5) entry here could never render as '1M,4I' below.
    self.assertEqual(cigar, [(1, 2), (0, 4), (1, 2), (1, 3), (2, 1),
                             (0, 1), (1, 4), (2, 1),
                             (0, 2), (1, 2), (1, 3), (1, 4), (1, 5), (2, 1),
                             (0, 5)])
    self.assertEqual(cigarutils.toString(cigar),
                     '2I,4M,2I,3I,1D,1M,4I,1D,2M,2I,3I,4I,5I,1D,5M')
@author: Shunping Huang ''' import unittest import StringIO import tempfile import os import pysam from lapels import annotator as annot from lapels import cigarutils from modtools import mod polish = lambda x: cigarutils.toString(cigarutils.simplify(x)) class Read: '''Class for simulating reads from a bam file''' def __init__(self, start, end, cigar=None, qlen=None): self.qname = 'unknown' self.pos = start self.aend = end #one base after the actual end self.tags = dict() if cigar is None: self.cigar = [(0, self.aend - self.pos)] else: self.cigar = cigar if qlen is None:
Created on Oct 3, 2012 @author: Shunping Huang ''' import unittest import StringIO import tempfile import os import pysam from lapels import annotator as annot from lapels import cigarutils from modtools import mod polish = lambda x: cigarutils.toString(cigarutils.simplify(x)) class Read: '''Class for simulating reads from a bam file''' def __init__(self, start, end, cigar=None, qlen=None): self.qname = 'unknown' self.pos = start self.aend = end #one base after the actual end self.tags = dict() if cigar is None: self.cigar = [(0, self.aend - self.pos)] else: self.cigar = cigar if qlen is None: self.qlen = 0
def testMakeCigar(self):
    # Round trip: parsing the string form of a cigar must give back the
    # very same cigar.
    for cigar in (self.cigar1, self.cigar2):
        text = toString(cigar)
        self.assertEqual(make(text), cigar)
def testToString(self):
    # Each fixture cigar must render as comma-separated <length><op> items.
    firstText = toString(self.cigar1)
    self.assertEqual(firstText, '15M,5I,10D,5M')

    secondText = toString(self.cigar2)
    self.assertEqual(secondText, '10I,15M,5I,10D,5M,10I')
def execute(self):
    '''
    The driver method for the module.

    Iterates over every read in self.inBam. Unmapped reads are counted and
    written to self.outBam unchanged. For each mapped read the cigar is
    simplified and split into target regions, each region is converted
    (matches and insertions first, then deletions/splicing junctions), a
    new cigar is built from the regions, the "MIDM" special case is
    patched, and the read's tags, pos and cigar are rewritten before the
    read is written to self.outBam (when it is not None).

    Side effects: sets self.data, self.modKeys and self.posmap.

    Returns 0 immediately when self.nReads is 0. Otherwise, when TESTING
    is true, returns a list with one
    (cigar, pos, nSNPs, nInsertions, nDeletions) tuple per mapped read;
    when TESTING is false, returns the number of reads processed
    (unmapped reads included).

    NOTE(review): tags['NM'] is read unconditionally, so a mapped read
    without an 'NM' tag raises KeyError - confirm that the input always
    carries it.
    '''
    self.logger.info("[%s]: %d read(s) found in BAM",
                     self.chrom, self.nReads)
    if self.nReads == 0:
        return 0

    self.data = self.mod.data
    # Third field of every data tuple; used below as the bisect key.
    # Presumably the variant position, sorted ascending - TODO confirm.
    self.modKeys = [tup[2] for tup in self.data]
    self.posmap = self.mod.get_posmap(self.chrom)

    count = 0   # reads processed, including unmapped ones
    count2 = 0  # reads with at least one SNP/insertion/deletion annotated
    if TESTING:
        results = []

    if not TESTING:
        # Initial progress line (count is still 0 here).
        self.logger.info("[%s]: %3d%%", self.chrom,
                         count * 100 / self.nReads)

    for rseq in self.inBam:  # Annotate each read
        if rseq.is_unmapped:  # write unmapped reads without changing
            count += 1
            if VERBOSITY > 1:
                log("unmapped read name: %s\n" % rseq.qname)
            if self.outBam is not None:
                self.outBam.write(rseq)
            continue

        if VERBOSITY > 1:
            log("read name: %s\n" % rseq.qname)
            log("t. alignment pos: %d\n" % rseq.pos)
            log("t. alignment cigar: '%s'\n\n" % cu.toString(rseq.cigar))

        rseq.cigar = cu.simplify(rseq.cigar)  # Simplify the cigar first.

        # First pass: one region per cigar op. Matches are parsed now;
        # deletions/splicing junctions only get a placeholder (op,) that
        # the second pass replaces; everything else is stored as an
        # insertion region.
        regions = []
        tregs = getTargetRegions(rseq)
        for idx, treg in enumerate(tregs):
            if treg[0] == 0 or treg[0] == 7 or treg[0] == 8:  # Match
                if VERBOSITY > 1:
                    log("process match\n")
                    log("t. region cigar(%d): %s\n"
                        % (idx, cu.toString([rseq.cigar[idx]])))
                rreg = self.parseTargetRegion(treg, rseq)
                if VERBOSITY > 1:
                    logRegion(rreg)
            elif treg[0] == 2 or treg[0] == 3:  # Deletion/Splice junction
                rreg = (treg[0], )
            else:
                rreg = (1, [rseq.cigar[idx]], 0, -1, -1)  # Insertion
            regions.append(rreg)

        if VERBOSITY > 1:
            log("\nafter parsing match regions:\n")
            log('\n'.join(map(str, regions)))
            log("\n\n")

        nRegions = len(regions)
        # The region list is indexed in lockstep with the cigar below.
        assert nRegions == len(rseq.cigar)

        # Second pass: resolve the deletion/splicing placeholders.
        for idx in range(nRegions):
            op = regions[idx][0]
            if op == 2 or op == 3:  # Handle deletions and splicing
                if VERBOSITY > 1:
                    log("process deletion/splicing junction\n")
                    log("t. region cigar(%d): %s\n"
                        % (idx, cu.toString([rseq.cigar[idx]])))
                if ((idx > 0) and idx < (nRegions - 1)
                        and (regions[idx - 1][0] == 0)
                        and (regions[idx + 1][0] == 0)):
                    # Flanked by two op-0 regions: the new length is just
                    # the distance between the left region's end and the
                    # right region's start.
                    delta = regions[idx + 1][2] - regions[idx - 1][3] - 1
                    assert delta >= 0
                    if delta == 0:
                        # Nothing left between the neighbours: empty cigar.
                        rreg = (op, [], regions[idx - 1][3] + 1,
                                regions[idx + 1][2] - 1, -1)
                    else:
                        rreg = (op, [(op, delta)], regions[idx - 1][3] + 1,
                                regions[idx + 1][2] - 1, -1)
                    if VERBOSITY > 1:
                        logRegion(rreg)
                else:
                    # No op-0 region on both sides: parse the target region
                    # directly and post-process it.
                    rreg = self.parseTargetRegion(tregs[idx], rseq)
                    if VERBOSITY > 1:
                        logRegion(rreg)
                    rreg = regionutils.modifyRegion(rreg)
                    if VERBOSITY > 1:
                        log("after modification\n")
                        logRegion(rreg)
                regions[idx] = rreg

        if VERBOSITY > 1:
            log("\nafter parsing deletions/splicing junction regions:\n")
            log('\n'.join(map(str, regions)))
            log('\n\n')

        cb = cigarbuilder.CigarBuilder()
        # cigar=cb.build(regions)
        for reg in regions:
            cb.append(reg)
        cigar = cb.cigar

        # Fix the MIDM pattern: the built cigar contains an insertion
        # immediately followed by a deletion.
        if re.match('.*\d*I,\d*D', cu.toString(cigar)) is not None:
            if VERBOSITY > 1:
                log("fix MIDM pattern: %s\n" % cu.toString(cigar))
            # print("%d,%d,%d" %(nSNPs, nInsertions, nDeletions))
            # print(regions)
            for i in range(nRegions):
                if tregs[i][0] == 1:
                    length = rseq.cigar[i][1]
                    assert length > 0
                    # Read offset of the first inserted base: one past the
                    # offset of tregs[i][3].
                    offset = getReadOffset(rseq, tregs[i][3]) + 1
                    ins = (rseq.seq[offset:offset + length])
                    if VERBOSITY > 1:
                        log('seq in insertion: %s\n' % ins)

                    # Key range [loKey, hiKey] searched in self.modKeys,
                    # taken from the neighbouring regions when they exist.
                    if i > 0:
                        assert regions[i - 1][3] >= 0
                        loKey = regions[i - 1][3]
                    elif i < nRegions - 1 and regions[i + 1][2] >= length:
                        loKey = regions[i + 1][2] - length
                    else:
                        loKey = -1
                    if i < nRegions - 1:
                        assert regions[i + 1][2] >= 0
                        hiKey = regions[i + 1][2]
                    elif i > 0 and regions[i - 1][3] >= 0:
                        hiKey = regions[i - 1][3] + length
                    else:
                        hiKey = -1
                    # loKey = regions[i-1][3]
                    # hiKey = regions[i+1][2]
                    lo = bisect.bisect_left(self.modKeys, loKey)
                    hi = bisect.bisect_right(self.modKeys, hiKey)
                    if VERBOSITY > 1:
                        log('variants from %d-%d\n' % (loKey, hiKey))
                        for j in range(lo, hi):
                            log('%s\n' % str(self.data[j]))

                    # Try to match the inserted bases, left to right,
                    # against the 'd' entries of self.data[lo:hi].
                    isMatched = False
                    matchStart = -1
                    pivot = 0  # number of inserted bases matched so far
                    isFailed = False
                    for j in range(lo, hi):
                        if self.data[j][0] != 'd':
                            # To activate MIDM pattern, the first instruction
                            # after insertion in read should be D.
                            isFailed = True
                            break
                            #continue
                        if matchStart == -1:
                            matchStart = int(self.data[j][2])
                        if ins[pivot] == self.data[j][3]:
                            pivot += 1
                        else:
                            break
                        if j == hi - 1 or pivot >= length:
                            isMatched = True
                            break

                    if isFailed:
                        # A non-'d' entry was hit: leave this insertion
                        # region untouched.
                        continue

                    if isMatched:
                        if VERBOSITY > 1:
                            log('insertion matches gap from left\n')
                            log("before:\n")
                            logRegion(regions[i])
                        if pivot < length:
                            # Only the first `pivot` bases matched; the
                            # rest stays an insertion.
                            regions[i] = (0, [(0, pivot), (1, length - pivot)],
                                          matchStart, matchStart + pivot - 1,
                                          matchStart)
                        else:
                            regions[i] = (0, [(0, pivot)], matchStart,
                                          matchStart + pivot - 1, matchStart)
                        if VERBOSITY > 1:
                            log("after:\n")
                            logRegion(regions[i])
                            log("\n")
                    else:
                        # Left-to-right failed: retry right to left,
                        # skipping entries that are not 'd'.
                        pivot = length - 1
                        for j in range(hi - 1, lo - 1, -1):
                            if self.data[j][0] != 'd':
                                continue
                            if ins[pivot] == self.data[j][3]:
                                matchStart = int(self.data[j][2])
                                pivot -= 1
                            else:
                                break
                            if j == lo or pivot <= 0:
                                isMatched = True
                                break

                        if isMatched:
                            if VERBOSITY > 1:
                                log('insertion matches gap from right\n')
                                log("before:\n")
                                logRegion(regions[i])
                            if pivot >= 0:
                                # Bases 0..pivot did not match and stay an
                                # insertion in front of the matched part.
                                regions[i] = (0, [(1, pivot + 1),
                                                  (0, length - 1 - pivot)],
                                              matchStart,
                                              matchStart + length - pivot - 2,
                                              matchStart)
                            else:
                                regions[i] = (0, [(0, length - 1 - pivot)],
                                              matchStart,
                                              matchStart + length - pivot - 2,
                                              matchStart)
                            if VERBOSITY > 1:
                                log("after:\n")
                                logRegion(regions[i])
                                log("\n")
                        else:
                            if VERBOSITY > 1:
                                log('insertion not matches\n')

            # Rebuild the cigar from the (possibly patched) regions.
            cb = cigarbuilder.CigarBuilder()
            # cigar=cb.build(regions)
            for reg in regions:
                cb.append(reg)
            cigar = cb.cigar
            # print(cu.toString(cigar))

        if VERBOSITY > 1:
            log("\nafter fixing special pattern:\n")
            log('\n'.join(map(str, regions)))
            log('\n\n')

        # Sum the annotation counts; only regions longer than five items
        # carry them (in items 5, 6 and 7).
        nSNPs = 0
        nInsertions = 0
        nDeletions = 0
        for reg in regions:
            if len(reg) > 5:
                nSNPs += reg[5]
                nInsertions += reg[6]
                nDeletions += reg[7]

        ## Set tags
        tags = dict(rseq.tags)
        self.setTag(tags, 's0', nSNPs)
        self.setTag(tags, 'i0', nInsertions)
        self.setTag(tags, 'd0', nDeletions)
        # 'OC' keeps the (simplified) input cigar with the commas removed.
        self.setTag(tags, 'OC', cu.toString(rseq.cigar).replace(',', ''))
        # 'OM' keeps the old 'NM' value.
        self.setTag(tags, 'OM', tags['NM'])
        del tags['NM']  # Delete the old 'NM' tag.

        if nSNPs != 0 or nInsertions != 0 or nDeletions != 0:
            count2 += 1

        rseq.tags = [(key, tags[key]) for key in sorted(tags.keys())]

        ## Set pos to be the first M
        # i.e. item 4 of the first region where it is non-negative;
        # -1 when no region has one.
        pos = -1
        for reg in regions:
            if reg[4] >= 0:
                pos = reg[4]
                break
        rseq.pos = pos

        ## Set cigar
        rseq.cigar = cu.simplify(cigar)

        if self.outBam is not None:
            self.outBam.write(rseq)

        if TESTING:
            results.append((rseq.cigar, rseq.pos, nSNPs, nInsertions,
                            nDeletions))

        if VERBOSITY > 1:
            log("output read pos: %d\n" % rseq.pos)
            log("output read cigar: '%s'\n" % cu.toString(rseq.cigar))
            log("output annotation: s: %d, i: %d, d: %d.\n"
                "========================\n\n"
                % (nSNPs, nInsertions, nDeletions))

        count += 1
        # Progress line every 100000 reads.
        if not TESTING and count % 100000 == 0:
            self.logger.info("[%s]: %3d%%", self.chrom,
                             count * 100 / self.nReads)

    if not TESTING:
        self.logger.info("[%s]: %3d%%", self.chrom,
                         count * 100 / self.nReads)
        self.logger.info("[%s]: %d read(s) written to file",
                         self.chrom, count)
        self.logger.info("[%s]: %d read(s) have variants",
                         self.chrom, count2)
    else:
        return results

    return count