def test5(self):
     regions = []
     add = regions.append
     add(makeReadRegion(0, '25M', 10, 34))            
     add(makeReadRegion(3, '1N', 35, 35))
     add(makeReadRegion(1, '1M,1I', 35, 35))
     add(makeReadRegion(0, '74M', 36, 109))
     cb = cigarbuilder.CigarBuilder()
     cigar = cb.build(regions)
     self.assertEqual(cigarutils.simplify(cigar),[(0,26),(1,1),(0,74)])   
Exemple #2
0
def modifyRegion(region):
    ''' Modify a region according to the region op (rop). 
    A region is a tuple of (rop, cigar, start, end, pos, #SNPs, #Ins, #Del)
    '''    
    rop = region[0]
    cigar = region[1]    
    if rop == 0:                     ## M_1
        ret = region    
    elif rop == 2:                   ## D_1
        for i, (op, length) in enumerate(cigar):
            if op == 0:         ## M_0
                cigar[i] = (2, length)
            elif op == 1:       ## I_0
                cigar[i] = (-1, length)
            elif op == 2:       ## D_0
                cigar[i] = (2, length)
        
        cigar = cigarutils.simplify(cigar)
        ret = (rop, cigar, region[2], region[3], -1, 0, 0, 0)                
        
    elif rop == 3:                   ## N_1
        for i, (op, length) in enumerate(cigar):
            if op == 0:         ## M_0
                cigar[i] = (3, length)
            elif op == 1:       ## I_0
                cigar[i] = (-1, length)
            elif op == 2:       ## D_0
                cigar[i] = (3, length)  #D_0 in N_1 should be N
                #cigar[i] = (2, length)                        
        cigar = cigarutils.simplify(cigar)            
        ret = (rop, cigar, region[2], region[3], -1, 0, 0, 0)
    
    else:    
        raise ValueError("Error: unknown op type %d" % rop)
                    
    return ret
Exemple #3
0
def modifyRegion(region):
    ''' Modify a region according to the region op (rop). 
    A region is a tuple of (rop, cigar, start, end, pos, #SNPs, #Ins, #Del)
    '''
    rop = region[0]
    cigar = region[1]
    if rop == 0:  ## M_1
        ret = region
    elif rop == 2:  ## D_1
        for i, (op, length) in enumerate(cigar):
            if op == 0:  ## M_0
                cigar[i] = (2, length)
            elif op == 1:  ## I_0
                cigar[i] = (-1, length)
            elif op == 2:  ## D_0
                cigar[i] = (2, length)

        cigar = cigarutils.simplify(cigar)
        ret = (rop, cigar, region[2], region[3], -1, 0, 0, 0)

    elif rop == 3:  ## N_1
        for i, (op, length) in enumerate(cigar):
            if op == 0:  ## M_0
                cigar[i] = (3, length)
            elif op == 1:  ## I_0
                cigar[i] = (-1, length)
            elif op == 2:  ## D_0
                cigar[i] = (3, length)  #D_0 in N_1 should be N
                #cigar[i] = (2, length)
        cigar = cigarutils.simplify(cigar)
        ret = (rop, cigar, region[2], region[3], -1, 0, 0, 0)

    else:
        raise ValueError("Error: unknown op type %d" % rop)

    return ret
@author: Shunping Huang
'''

import unittest
import StringIO
import tempfile
import os
import pysam

from lapels import annotator as annot
from lapels import cigarutils
from modtools import mod



polish = lambda x: cigarutils.toString(cigarutils.simplify(x))



class Read:
    '''Class for simulating reads from a bam file'''
    def __init__(self, start, end, cigar=None, qlen=None):
        self.qname = 'unknown'
        self.pos = start
        self.aend = end     #one base after the actual end
        self.tags = dict()        
        if cigar is None:
            self.cigar = [(0, self.aend - self.pos)]
        else:
            self.cigar = cigar
        if qlen is None:
Exemple #5
0
Created on Oct 3, 2012

@author: Shunping Huang
'''

import unittest
import StringIO
import tempfile
import os
import pysam

from lapels import annotator as annot
from lapels import cigarutils
from modtools import mod

polish = lambda x: cigarutils.toString(cigarutils.simplify(x))


class Read:
    '''Class for simulating reads from a bam file'''
    def __init__(self, start, end, cigar=None, qlen=None):
        self.qname = 'unknown'
        self.pos = start
        self.aend = end  #one base after the actual end
        self.tags = dict()
        if cigar is None:
            self.cigar = [(0, self.aend - self.pos)]
        else:
            self.cigar = cigar
        if qlen is None:
            self.qlen = 0
Exemple #6
0
    def execute(self):
        '''The driver method for the module'''
        self.logger.info("[%s]: %d read(s) found in BAM", self.chrom,
                         self.nReads)
        if self.nReads == 0:
            return 0

        self.data = self.mod.data
        self.modKeys = [tup[2] for tup in self.data]
        self.posmap = self.mod.get_posmap(self.chrom)
        count = 0
        count2 = 0
        if TESTING:
            results = []
        if not TESTING:
            self.logger.info("[%s]: %3d%%", self.chrom,
                             count * 100 / self.nReads)

        for rseq in self.inBam:      # Annotate each reads
            if rseq.is_unmapped:     # write unmapped reads without changing
                count += 1
                if VERBOSITY > 1:
                    log("unmapped read name: %s\n" % rseq.qname)
                if self.outBam is not None:
                    self.outBam.write(rseq)
                continue

            if VERBOSITY > 1:
                log("read name: %s\n" % rseq.qname)
                log("t. alignment pos: %d\n" % rseq.pos)
                log("t. alignment cigar: '%s'\n\n" % cu.toString(rseq.cigar))
            rseq.cigar = cu.simplify(rseq.cigar)   # Simplify the cigar first.
            regions = []
            tregs = getTargetRegions(rseq)
            for idx, treg in enumerate(tregs):
                if treg[0] == 0 or treg[0] == 7 or treg[0] == 8:  # Match
                    if VERBOSITY > 1:
                        log("process match\n")
                        log("t. region cigar(%d): %s\n"
                            % (idx, cu.toString([rseq.cigar[idx]])))
                    rreg = self.parseTargetRegion(treg, rseq)
                    if VERBOSITY > 1:
                        logRegion(rreg)
                elif treg[0] == 2 or treg[0] == 3:  # Deletion/Splice junction
                    rreg = (treg[0], )
                else:
                    rreg = (1, [rseq.cigar[idx]], 0, -1, -1)  # Insertion
                regions.append(rreg)

            if VERBOSITY > 1:
                log("\nafter parsing match regions:\n")
                log('\n'.join(map(str, regions)))
                log("\n\n")

            nRegions = len(regions)
            assert nRegions == len(rseq.cigar)

            for idx in range(nRegions):
                op = regions[idx][0]
                if op == 2 or op == 3:   # Handle deletions and splicing
                    if VERBOSITY > 1:
                        log("process deletion/splicing junction\n")
                        log("t. region cigar(%d): %s\n"
                            % (idx, cu.toString([rseq.cigar[idx]])))
                    if ((idx > 0) and idx < (nRegions - 1) and
                        (regions[idx - 1][0] == 0) and
                        (regions[idx + 1][0] == 0)):
                        delta = regions[idx + 1][2] - regions[idx - 1][3] - 1
                        assert delta >= 0
                        if delta == 0:
                            rreg = (op, [],
                                    regions[idx - 1][3] + 1,
                                    regions[idx + 1][2] - 1,
                                    -1)
                        else:
                            rreg = (op, [(op, delta)],
                                    regions[idx - 1][3] + 1,
                                    regions[idx + 1][2] - 1,
                                    -1)
                        if VERBOSITY > 1:
                            logRegion(rreg)

                    else:
                        rreg = self.parseTargetRegion(tregs[idx], rseq)
                        if VERBOSITY > 1:
                            logRegion(rreg)
                        rreg = regionutils.modifyRegion(rreg)
                        if VERBOSITY > 1:
                            log("after modification\n")
                            logRegion(rreg)
                    regions[idx] = rreg

            if VERBOSITY > 1:
                log("\nafter parsing deletions/splicing junction regions:\n")
                log('\n'.join(map(str, regions)))
                log('\n\n')

            cb = cigarbuilder.CigarBuilder()
#            cigar=cb.build(regions)
            for reg in regions:
                cb.append(reg)
            cigar = cb.cigar

            # Fix the MIDM pattern
            if re.match('.*\d*I,\d*D', cu.toString(cigar)) is not None:
                if VERBOSITY > 1:
                    log("fix MIDM pattern: %s\n" % cu.toString(cigar))
#                print("%d,%d,%d" %(nSNPs, nInsertions, nDeletions))
#                print(regions)
                for i in range(nRegions):
                    if tregs[i][0] == 1:
                        length = rseq.cigar[i][1]
                        assert length > 0

                        offset = getReadOffset(rseq, tregs[i][3]) + 1
                        ins = (rseq.seq[offset:offset + length])
                        if VERBOSITY > 1:
                            log('seq in insertion: %s\n' % ins)

                        if i > 0:
                            assert regions[i - 1][3] >= 0
                            loKey = regions[i - 1][3]
                        elif i < nRegions - 1 and regions[i + 1][2] >= length:
                            loKey = regions[i + 1][2] - length
                        else:
                            loKey = -1

                        if i < nRegions - 1:
                            assert regions[i + 1][2] >= 0
                            hiKey = regions[i + 1][2]
                        elif i > 0 and regions[i - 1][3] >= 0:
                            hiKey = regions[i - 1][3] + length
                        else:
                            hiKey = -1

#                        loKey = regions[i-1][3]
#                        hiKey = regions[i+1][2]

                        lo = bisect.bisect_left(self.modKeys, loKey)
                        hi = bisect.bisect_right(self.modKeys, hiKey)
                        if VERBOSITY > 1:
                            log('variants from %d-%d\n' % (loKey, hiKey))
                            for j in range(lo, hi):
                                log('%s\n' % str(self.data[j]))

                        isMatched = False
                        matchStart = -1
                        pivot = 0
                        isFailed = False
                        for j in range(lo, hi):
                            if self.data[j][0] != 'd':
                                # To activate MIDM pattern, the first instruction
                                # after insertion in read should be D.
                                isFailed = True
                                break
                                #continue
                            if matchStart == -1:
                                matchStart = int(self.data[j][2])
                            if ins[pivot] == self.data[j][3]:
                                pivot += 1
                            else:
                                break
                            if j == hi - 1 or pivot >= length:
                                isMatched = True
                                break

                        if isFailed:
                            continue

                        if isMatched:
                            if VERBOSITY > 1:
                                log('insertion matches gap from left\n')
                                log("before:\n")
                                logRegion(regions[i])
                            if pivot < length:
                                regions[i] = (0, [(0, pivot),
                                                  (1, length - pivot)],
                                              matchStart,
                                              matchStart + pivot - 1,
                                              matchStart)
                            else:
                                regions[i] = (0, [(0, pivot)], matchStart,
                                              matchStart + pivot - 1,
                                              matchStart)
                            if VERBOSITY > 1:
                                log("after:\n")
                                logRegion(regions[i])
                                log("\n")
                        else:
                            pivot = length - 1
                            for j in range(hi - 1, lo - 1, -1):
                                if self.data[j][0] != 'd':
                                    continue
                                if ins[pivot] == self.data[j][3]:
                                    matchStart = int(self.data[j][2])
                                    pivot -= 1
                                else:
                                    break
                                if j == lo or pivot <= 0:
                                    isMatched = True
                                    break

                            if isMatched:
                                if VERBOSITY > 1:
                                    log('insertion matches gap from right\n')
                                    log("before:\n")
                                    logRegion(regions[i])
                                if pivot >= 0:
                                    regions[i] = (0, [(1, pivot + 1),
                                                      (0, length - 1 - pivot)],
                                                  matchStart,
                                                  matchStart + length - pivot - 2,
                                                  matchStart)
                                else:
                                    regions[i] = (0, [(0, length - 1 - pivot)],
                                                  matchStart,
                                                  matchStart + length - pivot - 2,
                                                  matchStart)
                                if VERBOSITY > 1:
                                    log("after:\n")
                                    logRegion(regions[i])
                                    log("\n")
                            else:
                                if VERBOSITY > 1:
                                    log('insertion not matches\n')

                cb = cigarbuilder.CigarBuilder()
#                cigar=cb.build(regions)
                for reg in regions:
                    cb.append(reg)
                cigar = cb.cigar
#                print(cu.toString(cigar))

            if VERBOSITY > 1:
                log("\nafter fixing special pattern:\n")
                log('\n'.join(map(str, regions)))
                log('\n\n')

            nSNPs = 0
            nInsertions = 0
            nDeletions = 0
            for reg in regions:
                if len(reg) > 5:
                    nSNPs += reg[5]
                    nInsertions += reg[6]
                    nDeletions += reg[7]

            ## Set tags
            tags = dict(rseq.tags)

            self.setTag(tags, 's0', nSNPs)
            self.setTag(tags, 'i0', nInsertions)
            self.setTag(tags, 'd0', nDeletions)
            self.setTag(tags, 'OC',
                        cu.toString(rseq.cigar).replace(',', ''))
            self.setTag(tags, 'OM', tags['NM'])
            del tags['NM']  # Delete the old 'NM' tag.

            if nSNPs != 0 or nInsertions != 0 or nDeletions != 0:
                count2 += 1

            rseq.tags = [(key, tags[key]) for key in sorted(tags.keys())]

            ## Set pos to be the first M
            pos = -1
            for reg in regions:
                if reg[4] >= 0:
                    pos = reg[4]
                    break
            rseq.pos = pos

            ## Set cigar
            rseq.cigar = cu.simplify(cigar)

            if self.outBam is not None:
                self.outBam.write(rseq)

            if TESTING:
                results.append((rseq.cigar, rseq.pos, nSNPs, nInsertions,
                                nDeletions))

            if VERBOSITY > 1:
                log("output read pos: %d\n" % rseq.pos)
                log("output read cigar: '%s'\n"
                    % cu.toString(rseq.cigar))
                log("output annotation: s: %d, i: %d, d: %d.\n"
                    "========================\n\n" %
                    (nSNPs, nInsertions, nDeletions))
            count += 1

            if not TESTING and count % 100000 == 0:
                self.logger.info("[%s]: %3d%%", self.chrom,
                                 count * 100 / self.nReads)

        if not TESTING:
            self.logger.info("[%s]: %3d%%", self.chrom,
                             count * 100 / self.nReads)
            self.logger.info("[%s]: %d read(s) written to file", self.chrom,
                             count)
            self.logger.info("[%s]: %d read(s) have variants", self.chrom,
                             count2)
        else:
            return results

        return count
Exemple #7
0
    def parseTargetRegion(self, region, rseq):
        '''
        Parse a read region in the target coordinate and get data in ref.

        input region: a tuple of (op, None, start, end)
        output region: a tuple of (op, newCigar, newStart, newEnd, newPos,
                                   nSNPs, nInsertions, nDeletions)
        '''

        data = self.data
        posmap = self.posmap
        modKeys = self.modKeys  # All variant positions in reference coordinate

        nSNPs = 0
        nInsertions = 0
        nDeletions = 0

        # The region's exact start and end in the target coordinate.
        op = region[0]
        tstart = region[2]
        tend = region[3]
        if tstart > tend:
            assert (op == 1 or op == 4) # It must be an insertion region(I_1)
            return (op,)

        # Find the region's fuzzy boundaries in the reference coordinate.
        # The position will be imprecise if it falls in an Insertion(I_0).
        rstart = posmap.bmap((self.chrom, tstart))
        rend = posmap.bmap((self.chrom, tend))

        # Detect the case of translocation between two chromosomes.
        if rstart[0] != self.chrom or rend[0] != self.chrom:
            raise NotImplementedError("cannot parse this region.")
        rstart = abs(rstart[1])
        rend = abs(rend[1])
        # Detect the case of translocation/duplication/inversion.
        if rstart > rend:
            raise NotImplementedError("cannot parse this region.")

        if VERBOSITY > 1:
            log("T: %d-%d; R: %d-%d\n" % (tstart, tend, rstart, rend))

        # Initialize the new attributes of the region
        nstart = rend + 1   # The ref position of the first M or D.
        nend = -1           # The ref position of the last M or D.
        npos = rend + 1     # The ref position of the first M (and M only).
        ncigar = []         # The new cigar in the reference coordinate

        # The next processing position in ref and target coordinate
        rpos = rstart
        tpos = posmap.fmap((self.chrom, rstart))[1]  # tpos <= tstart
        if tpos < 0:        # it falls in a deletion(D_0)
            tpos = -tpos + 1

        # The searching boundaries in mod data.
        # Variants in the region will be in data[lo:hi]
        lo = bisect.bisect_left(modKeys, rstart)
        hi = bisect.bisect_right(modKeys, rend)

        if lo < hi:  # There are some variants in the region
            # The boundaries of the processing block in mod data.
            # Rows in data[startIdx:endIdx] will have the same ref position.
            startIdx = lo
            endIdx = lo
            # The next processing variant position in reference coordinate.
            vpos = data[lo][2]
            for i in range(lo, hi + 1):  # The plus 1 trick.
                if tpos > tend:        # Already reach the region's end.
                    break

                # Find the rows that share the same ref position.
                if i < hi and data[i][2] == vpos:  # The plus 1 trick
                    endIdx += 1
                    continue

                #assert rpos <= vpos
                if (rpos > vpos):
                    raise ValueError("position not sorted in MOD at line %d." %
                                     (i + 1))

                if rpos < vpos:
                    # Fill M's to the head or the gap between sub-regions.
                    if tpos >= tstart and tpos <= tend:
                        ncigar.append((0, vpos - rpos))
                        nstart = min(nstart, rpos)
                        nend = max(nend, vpos - 1)
                        npos = min(npos, rpos)
                    tpos += vpos - rpos
                    rpos = vpos

                if VERBOSITY > 1:
                    for j in range(startIdx, endIdx):
                        log(','.join(map(str, data[j])))
                        log('\n')

                # Handle the sub-regions for each variant position.
                subRegs = [('m', 1)]
                for j in range(startIdx, endIdx):
                    tup = data[j]
                    if tup[0] == 's':
                        if subRegs[0][0] != 'd':  # 'd' overrides 's'
                            subRegs[0] = ('s', tup[3])
                    elif tup[0] == 'i':
                        subRegs.append(('i', tup[3]))
                    elif tup[0] == 'd':
                        subRegs[0] = ('d',)
                    else:
                        raise NotImplementedError("unknown op '%s'" % tup[0])

                for reg in subRegs:
                    segType = reg[0]
                    if segType == 'm':      # Match
                        if tpos >= tstart and tpos <= tend:
                            ncigar.append((0, 1))
                            nstart = min(nstart, rpos)
                            nend = max(nend, rpos)
                            npos = min(npos, rpos)
                        rpos += 1
                        tpos += 1
                    elif segType == 's':    # Substitution
                        if tpos >= tstart and tpos <= tend:
                            ncigar.append((0, 1))
                            nstart = min(nstart, rpos)
                            nend = max(nend, rpos)
                            npos = min(npos, rpos)

                            if op != 2 and op != 3:  # D_1 or N_1
                                rbase = rseq.seq[getReadOffset(rseq, tpos)]
                                if VERBOSITY > 1:
                                    log("SNP found at %d: %s\n"
                                        % (tpos, reg[1]))
                                    log("Read base: %s\n" % rbase)
                                if rbase == reg[1][-1]:
                                    nSNPs += 1
                        rpos += 1
                        tpos += 1
                    elif segType == 'i':    # Insertion
                        # TODO: Support multiple insertions in a line.
                        segLen = len(reg[1])
                        tmax = min(tpos + segLen, tend + 1)
                        if tpos > tstart:
                            ncigar.append((1, tmax - tpos))
                        else:
                            if tmax > tstart:
                                ncigar.append((1, tmax - tstart))
                        tpos = tmax
                    elif segType == 'd':    # Deletion
                        if tpos > tstart and tpos <= tend:
                            ncigar.append((2, 1))
                            # NO assignment of 'npos' here: npos is for M only.
                            nstart = min(nstart, rpos)
                            nend = max(nend, rpos)
                        rpos += 1
                    else:
                        raise NotImplementedError("unknown op '%s'" % segType)
                    if tpos > tend:
                        break

                startIdx = endIdx
                endIdx += 1
                if i < hi:  # The plus 1 trick
                    vpos = data[i][2]

        #assert rpos <= refLens[chrom]
        if rpos > rend + 1:
            raise ValueError("variant position out of boundary")

        # Fill M's to the region end if bases are not enough
        if rpos < rend + 1:
            ncigar.append((0, rend - rpos + 1))
            npos = min(npos, rpos)
            nstart = min(nstart, rpos)
            nend = max(nend, rend)

        # Count the number of bases for insertions and deletions in a region.
        ncigar = cu.simplify(ncigar)
        for cig in ncigar:
            if cig[0] == 1:     # I_0
                nInsertions += cig[1]
            elif cig[0] == 2:   # D_0
                nDeletions += cig[1]

        if nstart > rend:
            # If no previous assignment of nstart/nend, then this region
            # contains merely I_0 regions.
            # Assign them according to the anchor position (the last preceding
            # position in reference), so that the gap of regions can be filled
            # properly later.

            # start is one-base after the anchor position, so that when it
            # subtracts the previous end will yield 0.
            nstart = rpos

            # end is the position of the anchor point, so that when it is
            # subtracted by the next start will yield 0.
            nend = rpos - 1

        # Assign -1 if the region contains no M_0.
        # TODO: assign unmapped properties for this reads
        if npos > rend:
            npos = -1

        return (op, ncigar, nstart, nend, npos, nSNPs, nInsertions, nDeletions)