Ejemplo n.º 1
0
 def test1(self):
     '''Build a cigar from a mixed run of regions and check both its
     tuple form and its string form.'''
     # One (op, cigar string, start, end) row per region, in read order.
     specs = [
         (1, '2I', -1, -2),   # an inserted I_1 before any M_1
         (0, '4M', 0, 3),
         (1, '2I', -1, -1),   # an inserted I_1 after an M_1
         (1, '3I', -1, -1),   # an inserted I_1 after an inserted I_1
         # -- a gap --
         (1, '1M', 5, 5),     # a match I_1 after an inserted I_1
         (1, '4I', 6, 5),     # an inserted I_1 after a match I_1
         # -- a gap --
         (0, '2M', 7, 8),
         (0, '2I', 9, 8),     # an M_1 that is an insertion
         (1, '3I', 9, 8),
         (0, '4I', 9, 8),     # an M_1 that is an insertion
         (1, '5I', 9, 8),
         # -- a gap --
         (0, '5M', 10, 14),
     ]
     regions = [makeReadRegion(*spec) for spec in specs]

     builder = cigarbuilder.CigarBuilder()
     cigar = builder.build(regions)
     print(cigar)

     expectedOps = [
         (1, 2), (0, 4), (1, 2), (1, 3), (2, 1),
         (0, 1), (1, 4), (2, 1), (0, 2), (1, 2),
         (1, 3), (1, 4), (1, 5), (2, 1), (0, 5),
     ]
     expectedText = '2I,4M,2I,3I,1D,1M,4I,1D,2M,2I,3I,4I,5I,1D,5M'
     self.assertEqual(cigar, expectedOps)
     self.assertEqual(cigarutils.toString(cigar), expectedText)
Ejemplo n.º 2
0
def logRegion(reg):
    '''Write a description of one region tuple to the log.

    reg[0] is logged as the op, reg[2], reg[3] and reg[4] as start, end and
    mpos, and reg[1] as a cigar string.  When the tuple has more than five
    items, reg[5], reg[6] and reg[7] are logged as the s/i/d annotation.
    '''
    log("  op: %d.\n" % reg[0])

    span = (reg[2], reg[3], reg[4])
    log("  region start: %d, end: %d, mpos: %d.\n" % span)

    cigarText = cu.toString(reg[1])
    log("  region cigar: '%s'.\n" % cigarText)

    # Short tuples carry no annotation counts.
    if len(reg) <= 5:
        return
    counts = (reg[5], reg[6], reg[7])
    log("  annotation: s: %d, i: %d, d: %d.\n" % counts)
Ejemplo n.º 3
0
def getReadOffset(rseq, pos):
    '''
    Translate a position in alignment coordinates into an offset in the read.

    rseq must provide ``pos`` (alignment start), ``rlen`` (read length,
    used instead of qlen because qlen does not include soft-clipping),
    ``cigar`` (a list of (op, length) pairs) and ``qname``.

    Returns the 0-based offset in the read of the base aligned at ``pos``.

    Raises ValueError when ``pos`` lies before the alignment start, past
    its end, inside a deletion or splicing junction, or when the cigar
    needs more bases than ``rlen``.  Raises NotImplementedError for any op
    other than 0, 1, 2, 3, 4, 7 and 8.
    '''
    alnStart = rseq.pos
    nBases = rseq.rlen
    ops = rseq.cigar

    if pos < alnStart:
        raise ValueError("position underflows")

    def lengthConflict():
        # Built lazily: only the error paths need the cigar as a string.
        msg = "cigar '%s' and length '%s' conflict in read '%s'"
        cigarStr = cu.toString(ops)
        return ValueError(msg % (cigarStr, nBases, rseq.qname))

    consumed = 0          # read bases passed so far
    refPos = alnStart     # alignment coordinate of the next op
    for op, length in ops:
        if op in (0, 7, 8):                   # Match
            if pos < refPos + length:
                # pos falls inside this match block
                hit = consumed + pos - refPos
                if hit >= nBases:
                    raise lengthConflict()
                return hit
            # pos is beyond this block: step over it on both axes
            consumed += length
            refPos += length
        elif op in (1, 4):                    # Insertion and Soft Clipping
            consumed += length
        elif op in (2, 3):                    # Deletion or Splicing junction
            refPos += length
        else:
            raise NotImplementedError("unknown op '%s' in read '%s'" %
                                      (op, rseq.qname))

        # consumed == nBases is still legal here; only more is an overflow
        if consumed > nBases:
            raise lengthConflict()

        if refPos > pos:
            raise ValueError('position in deletion or splicing junction')

    raise ValueError('position overflows')
Ejemplo n.º 4
0
def checkRegion(region):
    '''Sanity check for a region.

    A region is a sequence whose item 1 is a cigar (a list of (op, length)
    pairs) and whose items 2 and 3 are the inclusive start and end of the
    region.  The number of reference bases consumed by the cigar must
    equal ``end - start + 1``.

    Raises:
        AssertionError: if the region has fewer than four items.
        ValueError: if the cigar and the start/end coordinates disagree.
    '''
    assert len(region) >= 4
    cigar = region[1]
    # M (0), D (2), N (3), = (7) and X (8) are the operations that advance
    # along the reference; I, S, H and P do not.
    nRefBases = sum(length for op, length in cigar
                    if op in (0, 2, 3, 7, 8))

    if region[2] + nRefBases - 1 != region[3]:
        raise ValueError("Error: cigar '%s' conflicts with read region %d-%d."
                         % (cigarutils.toString(cigar), region[2], region[3]))
Ejemplo n.º 5
0
def checkRegion(region):
    '''Sanity check for a region.

    A region is a sequence whose item 1 is a cigar (a list of (op, length)
    pairs) and whose items 2 and 3 are the inclusive start and end of the
    region.  The number of reference bases consumed by the cigar must
    equal ``end - start + 1``.

    Raises:
        AssertionError: if the region has fewer than four items.
        ValueError: if the cigar and the start/end coordinates disagree.
    '''
    assert len(region) >= 4
    cigar = region[1]
    # M (0), D (2), N (3), = (7) and X (8) are the operations that advance
    # along the reference; I, S, H and P do not.
    refConsuming = (0, 2, 3, 7, 8)
    nRefBases = sum(length for op, length in cigar if op in refConsuming)

    if region[2] + nRefBases - 1 != region[3]:
        raise ValueError(
            "Error: cigar '%s' conflicts with read region %d-%d." %
            (cigarutils.toString(cigar), region[2], region[3]))
Ejemplo n.º 6
0
    def test1(self):
        '''Build a cigar from a mixed run of regions and check both its
        tuple form and its string form.'''
        regions = []
        add = regions.append
        ## an inserted I_1 before any M_1
        add(makeReadRegion(1, '2I', -1, -2))

        add(makeReadRegion(0, '4M', 0, 3))

        ## an inserted I_1 after an M_1
        add(makeReadRegion(1, '2I', -1, -1))

        ## an inserted I_1 after an inserted I_1
        add(makeReadRegion(1, '3I', -1, -1))

        ## a gap

        ## a match I_1 after an inserted I_1
        add(makeReadRegion(1, '1M', 5, 5))

        ## an inserted I_1 after a match I_1
        add(makeReadRegion(1, '4I', 6, 5))

        ## a gap here

        add(makeReadRegion(0, '2M', 7, 8))
        ## an M_1 that is an insertion
        add(makeReadRegion(0, '2I', 9, 8))

        add(makeReadRegion(1, '3I', 9, 8))

        ## an M_1 that is an insertion
        add(makeReadRegion(0, '4I', 9, 8))
        add(makeReadRegion(1, '5I', 9, 8))

        ## a gap here

        add(makeReadRegion(0, '5M', 10, 14))

        cb = cigarbuilder.CigarBuilder()
        cigar = cb.build(regions)
        print(cigar)
        # The tuple form must spell out the same 15 operations as the string
        # form below: the '1M' and '4I' regions are (0, 1), (1, 4), not a
        # single (1, 5).
        self.assertEqual(cigar, [(1, 2), (0, 4), (1, 2), (1, 3), (2, 1),
                                 (0, 1), (1, 4), (2, 1), (0, 2), (1, 2),
                                 (1, 3), (1, 4), (1, 5), (2, 1), (0, 5)])
        self.assertEqual(cigarutils.toString(cigar),
                         '2I,4M,2I,3I,1D,1M,4I,1D,2M,2I,3I,4I,5I,1D,5M')
Ejemplo n.º 7
0
@author: Shunping Huang
'''

import unittest
import StringIO
import tempfile
import os
import pysam

from lapels import annotator as annot
from lapels import cigarutils
from modtools import mod



def polish(x):
    '''Return cigar ``x`` as a string after passing it through simplify().'''
    simplified = cigarutils.simplify(x)
    return cigarutils.toString(simplified)



class Read:
    '''Class for simulating reads from a bam file'''
    def __init__(self, start, end, cigar=None, qlen=None):
        self.qname = 'unknown'
        self.pos = start
        self.aend = end     #one base after the actual end
        self.tags = dict()        
        if cigar is None:
            self.cigar = [(0, self.aend - self.pos)]
        else:
            self.cigar = cigar
        if qlen is None:
Ejemplo n.º 8
0
Created on Oct 3, 2012

@author: Shunping Huang
'''

import unittest
import StringIO
import tempfile
import os
import pysam

from lapels import annotator as annot
from lapels import cigarutils
from modtools import mod

def polish(x):
    '''Return cigar ``x`` as a string after passing it through simplify().'''
    simplified = cigarutils.simplify(x)
    return cigarutils.toString(simplified)


class Read:
    '''Class for simulating reads from a bam file'''
    def __init__(self, start, end, cigar=None, qlen=None):
        self.qname = 'unknown'
        self.pos = start
        self.aend = end  #one base after the actual end
        self.tags = dict()
        if cigar is None:
            self.cigar = [(0, self.aend - self.pos)]
        else:
            self.cigar = cigar
        if qlen is None:
            self.qlen = 0
Ejemplo n.º 9
0
 def testMakeCigar(self):
     '''make() must turn the toString() form of each fixture cigar back
     into the original cigar.'''
     rebuilt1 = make(toString(self.cigar1))
     self.assertEqual(rebuilt1, self.cigar1)

     rebuilt2 = make(toString(self.cigar2))
     self.assertEqual(rebuilt2, self.cigar2)
Ejemplo n.º 10
0
 def testToString(self):
     '''toString() must render each fixture cigar as a comma-separated
     list of <length><op> items.'''
     text1 = toString(self.cigar1)
     self.assertEqual(text1, '15M,5I,10D,5M')

     text2 = toString(self.cigar2)
     self.assertEqual(text2, '10I,15M,5I,10D,5M,10I')
Ejemplo n.º 11
0
    def execute(self):
        '''The driver method for the module.

        Iterates over every read in ``self.inBam``.  Unmapped reads are
        written to ``self.outBam`` unchanged.  For each mapped read the
        cigar is split into per-operation regions, the regions are turned
        into a new cigar, and the read's tags, position and cigar are
        replaced before it is written to ``self.outBam`` (when that is not
        None).

        Regions are tuples laid out as
        ``(op, cigar, start, end, mpos[, nSNPs, nInsertions, nDeletions])``;
        deletion/splicing regions are first stored as the 1-tuple ``(op,)``
        and filled in by a second pass.

        Returns:
            0 when ``self.nReads`` is 0.  Otherwise, when the module-level
            TESTING flag is set, a list with one
            ``(cigar, pos, nSNPs, nInsertions, nDeletions)`` tuple per
            mapped read; when it is not set, the number of reads processed
            (mapped and unmapped).

        Raises:
            AssertionError: if one of the internal sanity checks fails.
        '''
        self.logger.info("[%s]: %d read(s) found in BAM", self.chrom,
                         self.nReads)
        if self.nReads == 0:
            return 0

        self.data = self.mod.data
        # Third field of every record in self.data, used as the bisect key
        # below (assumes self.data is sorted by that field -- TODO confirm).
        self.modKeys = [tup[2] for tup in self.data]
        self.posmap = self.mod.get_posmap(self.chrom)
        count = 0    # reads seen so far, mapped or not
        count2 = 0   # reads with at least one annotated variant
        if TESTING:
            results = []
        if not TESTING:
            self.logger.info("[%s]: %3d%%", self.chrom,
                             count * 100 / self.nReads)

        for rseq in self.inBam:      # Annotate each reads
            if rseq.is_unmapped:     # write unmapped reads without changing
                count += 1
                if VERBOSITY > 1:
                    log("unmapped read name: %s\n" % rseq.qname)
                if self.outBam is not None:
                    self.outBam.write(rseq)
                continue

            if VERBOSITY > 1:
                log("read name: %s\n" % rseq.qname)
                log("t. alignment pos: %d\n" % rseq.pos)
                log("t. alignment cigar: '%s'\n\n" % cu.toString(rseq.cigar))
            rseq.cigar = cu.simplify(rseq.cigar)   # Simplify the cigar first.

            # Pass 1: one region per cigar operation.  Matches are parsed
            # right away; deletions/junctions get a placeholder that pass 2
            # replaces; everything else is kept as an insertion region.
            regions = []
            tregs = getTargetRegions(rseq)
            for idx, treg in enumerate(tregs):
                if treg[0] == 0 or treg[0] == 7 or treg[0] == 8:  # Match
                    if VERBOSITY > 1:
                        log("process match\n")
                        log("t. region cigar(%d): %s\n"
                            % (idx, cu.toString([rseq.cigar[idx]])))
                    rreg = self.parseTargetRegion(treg, rseq)
                    if VERBOSITY > 1:
                        logRegion(rreg)
                elif treg[0] == 2 or treg[0] == 3:  # Deletion/Splice junction
                    rreg = (treg[0], )
                else:
                    rreg = (1, [rseq.cigar[idx]], 0, -1, -1)  # Insertion
                regions.append(rreg)

            if VERBOSITY > 1:
                log("\nafter parsing match regions:\n")
                log('\n'.join(map(str, regions)))
                log("\n\n")

            nRegions = len(regions)
            assert nRegions == len(rseq.cigar)

            # Pass 2: fill in the deletion/junction placeholders.
            for idx in range(nRegions):
                op = regions[idx][0]
                if op == 2 or op == 3:   # Handle deletions and splicing
                    if VERBOSITY > 1:
                        log("process deletion/splicing junction\n")
                        log("t. region cigar(%d): %s\n"
                            % (idx, cu.toString([rseq.cigar[idx]])))
                    if ((idx > 0) and idx < (nRegions - 1) and
                        (regions[idx - 1][0] == 0) and
                        (regions[idx + 1][0] == 0)):
                        # Flanked by two match regions: the gap is whatever
                        # lies between the end of the left neighbour and the
                        # start of the right one.
                        delta = regions[idx + 1][2] - regions[idx - 1][3] - 1
                        assert delta >= 0
                        if delta == 0:
                            rreg = (op, [],
                                    regions[idx - 1][3] + 1,
                                    regions[idx + 1][2] - 1,
                                    -1)
                        else:
                            rreg = (op, [(op, delta)],
                                    regions[idx - 1][3] + 1,
                                    regions[idx + 1][2] - 1,
                                    -1)
                        if VERBOSITY > 1:
                            logRegion(rreg)

                    else:
                        rreg = self.parseTargetRegion(tregs[idx], rseq)
                        if VERBOSITY > 1:
                            logRegion(rreg)
                        rreg = regionutils.modifyRegion(rreg)
                        if VERBOSITY > 1:
                            log("after modification\n")
                            logRegion(rreg)
                    regions[idx] = rreg

            if VERBOSITY > 1:
                log("\nafter parsing deletions/splicing junction regions:\n")
                log('\n'.join(map(str, regions)))
                log('\n\n')

            cb = cigarbuilder.CigarBuilder()
            for reg in regions:
                cb.append(reg)
            cigar = cb.cigar

            # Fix the MIDM pattern: an insertion immediately followed by a
            # deletion in the rebuilt cigar.  (Raw string: '\d' is not a
            # valid escape in a normal string literal.)
            if re.match(r'.*\d*I,\d*D', cu.toString(cigar)) is not None:
                if VERBOSITY > 1:
                    log("fix MIDM pattern: %s\n" % cu.toString(cigar))
                for i in range(nRegions):
                    if tregs[i][0] == 1:
                        length = rseq.cigar[i][1]
                        assert length > 0

                        # Bases of the read that make up this insertion.
                        offset = getReadOffset(rseq, tregs[i][3]) + 1
                        ins = (rseq.seq[offset:offset + length])
                        if VERBOSITY > 1:
                            log('seq in insertion: %s\n' % ins)

                        # Key range in which to look for records, taken from
                        # the neighbouring regions where they exist.
                        if i > 0:
                            assert regions[i - 1][3] >= 0
                            loKey = regions[i - 1][3]
                        elif i < nRegions - 1 and regions[i + 1][2] >= length:
                            loKey = regions[i + 1][2] - length
                        else:
                            loKey = -1

                        if i < nRegions - 1:
                            assert regions[i + 1][2] >= 0
                            hiKey = regions[i + 1][2]
                        elif i > 0 and regions[i - 1][3] >= 0:
                            hiKey = regions[i - 1][3] + length
                        else:
                            hiKey = -1

                        lo = bisect.bisect_left(self.modKeys, loKey)
                        hi = bisect.bisect_right(self.modKeys, hiKey)
                        if VERBOSITY > 1:
                            log('variants from %d-%d\n' % (loKey, hiKey))
                            for j in range(lo, hi):
                                log('%s\n' % str(self.data[j]))

                        # Scan left to right: compare successive inserted
                        # bases with field 3 of successive 'd' records.
                        isMatched = False
                        matchStart = -1
                        pivot = 0
                        isFailed = False
                        for j in range(lo, hi):
                            if self.data[j][0] != 'd':
                                # To activate MIDM pattern, the first instruction
                                # after insertion in read should be D.
                                isFailed = True
                                break
                            if matchStart == -1:
                                matchStart = int(self.data[j][2])
                            if ins[pivot] == self.data[j][3]:
                                pivot += 1
                            else:
                                break
                            if j == hi - 1 or pivot >= length:
                                isMatched = True
                                break

                        if isFailed:
                            continue

                        if isMatched:
                            if VERBOSITY > 1:
                                log('insertion matches gap from left\n')
                                log("before:\n")
                                logRegion(regions[i])
                            if pivot < length:
                                # Only the first `pivot` bases matched; the
                                # rest stays an insertion.
                                regions[i] = (0, [(0, pivot),
                                                  (1, length - pivot)],
                                              matchStart,
                                              matchStart + pivot - 1,
                                              matchStart)
                            else:
                                regions[i] = (0, [(0, pivot)], matchStart,
                                              matchStart + pivot - 1,
                                              matchStart)
                            if VERBOSITY > 1:
                                log("after:\n")
                                logRegion(regions[i])
                                log("\n")
                        else:
                            # Scan right to left, skipping non-'d' records.
                            # NOTE(review): the scan stops when pivot
                            # reaches 0, i.e. before ins[0] is compared,
                            # whereas the left-to-right scan runs until
                            # pivot >= length.  Confirm whether `pivot < 0`
                            # was intended.
                            pivot = length - 1
                            for j in range(hi - 1, lo - 1, -1):
                                if self.data[j][0] != 'd':
                                    continue
                                if ins[pivot] == self.data[j][3]:
                                    matchStart = int(self.data[j][2])
                                    pivot -= 1
                                else:
                                    break
                                if j == lo or pivot <= 0:
                                    isMatched = True
                                    break

                            if isMatched:
                                if VERBOSITY > 1:
                                    log('insertion matches gap from right\n')
                                    log("before:\n")
                                    logRegion(regions[i])
                                if pivot >= 0:
                                    # Bases 0..pivot stay an insertion; the
                                    # remaining ones become a match.
                                    regions[i] = (0, [(1, pivot + 1),
                                                      (0, length - 1 - pivot)],
                                                  matchStart,
                                                  matchStart + length - pivot - 2,
                                                  matchStart)
                                else:
                                    regions[i] = (0, [(0, length - 1 - pivot)],
                                                  matchStart,
                                                  matchStart + length - pivot - 2,
                                                  matchStart)
                                if VERBOSITY > 1:
                                    log("after:\n")
                                    logRegion(regions[i])
                                    log("\n")
                            else:
                                if VERBOSITY > 1:
                                    log('insertion not matches\n')

                # Rebuild the cigar from the (possibly) rewritten regions.
                cb = cigarbuilder.CigarBuilder()
                for reg in regions:
                    cb.append(reg)
                cigar = cb.cigar

            if VERBOSITY > 1:
                log("\nafter fixing special pattern:\n")
                log('\n'.join(map(str, regions)))
                log('\n\n')

            # Only regions with more than five items carry variant counts.
            nSNPs = 0
            nInsertions = 0
            nDeletions = 0
            for reg in regions:
                if len(reg) > 5:
                    nSNPs += reg[5]
                    nInsertions += reg[6]
                    nDeletions += reg[7]

            ## Set tags
            tags = dict(rseq.tags)

            self.setTag(tags, 's0', nSNPs)
            self.setTag(tags, 'i0', nInsertions)
            self.setTag(tags, 'd0', nDeletions)
            self.setTag(tags, 'OC',
                        cu.toString(rseq.cigar).replace(',', ''))
            # 'NM' is an optional tag: only move it to 'OM' when the read
            # actually has one, instead of failing with a KeyError.
            if 'NM' in tags:
                self.setTag(tags, 'OM', tags['NM'])
                del tags['NM']  # Delete the old 'NM' tag.

            if nSNPs != 0 or nInsertions != 0 or nDeletions != 0:
                count2 += 1

            rseq.tags = [(key, tags[key]) for key in sorted(tags.keys())]

            ## Set pos to be the first M
            pos = -1
            for reg in regions:
                if reg[4] >= 0:
                    pos = reg[4]
                    break
            rseq.pos = pos

            ## Set cigar
            rseq.cigar = cu.simplify(cigar)

            if self.outBam is not None:
                self.outBam.write(rseq)

            if TESTING:
                results.append((rseq.cigar, rseq.pos, nSNPs, nInsertions,
                                nDeletions))

            if VERBOSITY > 1:
                log("output read pos: %d\n" % rseq.pos)
                log("output read cigar: '%s'\n"
                    % cu.toString(rseq.cigar))
                log("output annotation: s: %d, i: %d, d: %d.\n"
                    "========================\n\n" %
                    (nSNPs, nInsertions, nDeletions))
            count += 1

            if not TESTING and count % 100000 == 0:
                self.logger.info("[%s]: %3d%%", self.chrom,
                                 count * 100 / self.nReads)

        if not TESTING:
            self.logger.info("[%s]: %3d%%", self.chrom,
                             count * 100 / self.nReads)
            self.logger.info("[%s]: %d read(s) written to file", self.chrom,
                             count)
            self.logger.info("[%s]: %d read(s) have variants", self.chrom,
                             count2)
        else:
            return results

        return count