Beispiel #1
0
 def accuracy(self, xcoverage):  ### Returns accuracy at xcoverage, calculating it first if required
     '''
     Returns accuracy at xcoverage, calculating it first if required.

     self.list['Accuracy'] acts as a cache indexed by X coverage and is extended in place until it has an entry
     for xcoverage. Each new entry is:
     - 1.0 if (1 - previous entry) * GenomeSize truncates to zero, i.e. less than one inaccurate base; otherwise
     - the rje.logBinomial() value for a majority (more than half) of correct reads at that depth, using
       (1 - ErrPerBase) as the per-read chance of being correct. rje.logPoisson() is used if that call fails.

     Returns the cached value for xcoverage. On any error the problem is passed to self.errorLog() and the method
     falls through, returning None.
     '''
     try:  ### ~ [1] Extend the cached list until it reaches xcoverage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         while len(self.list['Accuracy']) <= xcoverage:
             errbases = int((1.0 - self.list['Accuracy'][-1]) * self.getNum('GenomeSize'))
             if errbases:
                 depth = len(self.list['Accuracy'])  # X coverage of the entry about to be added
                 needed = int(depth / 2.0) + 1  # Number of correct reads needed for majority
                 try:
                     newacc = rje.logBinomial(needed, depth, 1.0 - self.getNum('ErrPerBase'),
                                              exact=False, callobj=self)
                 except:
                     # Any failure of the binomial call falls back on the Poisson call
                     newacc = rje.logPoisson(needed, depth * (1.0 - self.getNum('ErrPerBase')),
                                             exact=False, callobj=self)
                 self.list['Accuracy'].append(newacc)
                 self.debug(self.list['Accuracy'])
             else:
                 # No whole inaccurate base expected at the previous depth: treat as fully accurate
                 self.list['Accuracy'].append(1.0)
         return self.list['Accuracy'][xcoverage]
     except:
         self.errorLog('%s.accuracy error' % self.prog())
Beispiel #2
0
 def accuracy(self,xcoverage):   ### Looks up accuracy at xcoverage, topping up the stored list if needed
     '''
     Looks up accuracy at xcoverage, topping up the stored list if needed.
     >> xcoverage: index into self.list['Accuracy'] to return
     << value stored at self.list['Accuracy'][xcoverage]; None if an error was caught and sent to self.errorLog()

     Missing values are appended one X at a time. 1.0 is used when (1 - last value) * GenomeSize truncates to zero.
     Otherwise rje.logBinomial() is called for a majority of correct reads, with rje.logPoisson() as the fallback.
     '''
     try:### ~ [1] Top up stored accuracy values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         acclist = self.list['Accuracy']     # Same list object: appends below update self.list['Accuracy']
         while len(acclist) <= xcoverage:
             if int((1.0 - acclist[-1]) * self.getNum('GenomeSize')) == 0:
                 acclist.append(1.0)
                 continue
             xnum = len(acclist)
             minright = 1 + int(xnum/2.0)    # Number of correct reads needed for majority
             try: nextacc = rje.logBinomial(minright,xnum,1.0 - self.getNum('ErrPerBase'),exact=False,callobj=self)
             except: nextacc = rje.logPoisson(minright,xnum*(1.0 - self.getNum('ErrPerBase')),exact=False,callobj=self)
             acclist.append(nextacc)
             self.debug(acclist)
         return acclist[xcoverage]
     except: self.errorLog('%s.accuracy error' % self.prog())
Beispiel #3
0
 def _digest(self): ### Main digestion of sequences and population of results database
     '''
     Main digestion of sequences and population of results database.

     For each protease combination from self.protCombo() ("+"-joined names used as keys of the module-level
     `proteases` dictionary), the sequences of self.obj['SeqList'] are digested in two passes:
       1. Every fragment is collected to find peptides produced more than once and the maximum Cys count.
       2. Each sequence is digested again, its fragments are filtered (NTerm, MinPepLen and NRPep settings) and
          tallied by length (and by 100-unit mass bin if PepMWt is set) into a new 'ProDigIS' table entry.
     If a 'PepProb' table is available, summed peptide probabilities (LenExp, MWtExp, Len7Exp) and the
     rje.logPoisson() values derived from them (Len3, Len5, Len37, and MWt3, MWt5 if PepMWt is set) are added too.
     Without a 'PepProb' table these fields are not set on the entry.

     Returns None. Any exception is reported through self.errorLog() rather than propagated.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('ProDigIS')
         prot_combo = self.protCombo()
         def markCuts(sequence,prot):    # Returns sequence with ":" marking each cut site for protease combination
             for protease in prot.split('+'):
                 for cut in proteases[protease]:
                     # Each occurrence of cut-without-":" is replaced by cut itself, which inserts the ":"
                     sequence = cut.join(sequence.split(cut.replace(':','')))
             return sequence
         ## ~ [1a] ~ Peptide Probability Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         pdb = self.db('PepProb'); pdict = {}
         if pdb:
             if self.getBool('CysWeight'):   # pdict = {PepSize:{CysCount:entry}}
                 for plen in pdb.index('PepSize').keys(): pdict[plen] = {}
                 for entry in pdb.entries(): pdict[entry['PepSize']][entry['CysCount']] = entry
             else:                           # pdict = {PepSize:entry}
                 for entry in pdb.entries(): pdict[entry['PepSize']] = entry
         ### ~ [2] Process each protease combination in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.deBug(self.int)
         for prot in prot_combo:
             ## ~ [2a] ~ First pass: find redundant peptides and maximum Cys count ~~~~~~~~~ ##
             # Sets rather than lists: both are only ever used for membership tests
             allpep = set(); redundant = set(); maxcys = 0
             sx = 0.0; stot = self.obj['SeqList'].seqNum()
             for seq in self.obj['SeqList'].seqs():
                 self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                 for frag in markCuts(seq.getSequence(),prot).split(':'):
                     if frag in allpep: redundant.add(frag)
                     else: allpep.add(frag); maxcys = max(maxcys,frag.count('C'))
             self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
             if self.getBool('CysCount'):
                 for c in range(maxcys+1): db.addField('Cys%d' % c)
             ## ~ [2b] ~ Second pass: build one database entry per sequence ~~~~~~~~~~~~~~~~ ##
             sx = 0.0; stot = self.obj['SeqList'].seqNum()
             for seq in self.obj['SeqList'].seqs():
                 self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                 acc = seq.getStr('AccNum')
                 # Create new database entry to fill with data: integer keys count peptides by length and
                 # (if PepMWt) float keys i*100.0 count peptides by mass bin
                 entry = {'AccNum':acc,'Protease':prot}
                 for i in range(1,self.getInt('MaxPepLen')+1):
                     entry[i] = 0
                     if self.getBool('PepMWt'): entry[i*100.0] = 0
                 # Mark cuts with ":" and cut into (non-empty) fragments
                 frag = [pep for pep in markCuts(seq.getSequence(),prot).split(':') if pep]
                 self.deBug(frag)
                 entry['PepCount'] = len(frag)
                 if not self.getBool('NTerm'): frag = frag[1:]   # Drop the first (N-terminal) fragment
                 minpeplen = self.getInt('MinPepLen')
                 if minpeplen > 0: frag = [pep for pep in frag if len(pep) >= minpeplen]
                 entry['MinPepLen'] = len(frag)      # NB. This is the peptide count after length filtering
                 if self.getBool('NRPep'):
                     frag = [pep for pep in frag if pep not in redundant]
                     entry['NRPep'] = len(frag)
                 if self.getBool('CysCount'):
                     for c in range(maxcys+1): entry['Cys%d' % c] = 0
                     for pep in frag: entry['Cys%d' % pep.count('C')] += 1
                 if pdict: entry['LenExp'] = 0.0; entry['MWtExp'] = 0.0; entry['Len7Exp'] = 0.0
                 ## ~ [2c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 for pep in frag:
                     plen = min(len(pep),self.getInt('MaxPepLen'))   # Longer peptides share the MaxPepLen bin
                     self.deBug('"%s" -> %d' % (pep,plen))
                     entry[plen] += 1
                     if pdict:
                         if self.getBool('CysWeight'):
                             try: pprob = pdict[plen][pep.count('C')]['Prob']
                             except KeyError: pprob = 0.0    # No probability for this length/Cys combination
                         else: pprob = pdict[plen]['Prob']
                         entry['LenExp'] += pprob
                         if 7 <= plen: entry['Len7Exp'] += pprob
                     if self.getBool('PepMWt'):
                         pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),self.getInt('MaxPepLen'))
                         entry[pwt] += 1
                         if pdict: entry['MWtExp'] += pprob
                 ## ~ [2d] ~ Poisson-derived values from the summed probabilities ~~~~~~~~~~~~~~ ##
                 # The *Exp sums only exist when peptide probabilities were loaded. Reading them unconditionally
                 # raised KeyError and aborted the whole digest when there was no PepProb table.
                 # NOTE(review): assumes db.addEntry() copes with entries lacking these fields - confirm.
                 if pdict:
                     entry['Len3'] = rje.logPoisson(3,entry['LenExp'],callobj=self)
                     if self.getBool('PepMWt'): entry['MWt3'] = rje.logPoisson(3,entry['MWtExp'],callobj=self)
                     entry['Len5'] = rje.logPoisson(5,entry['LenExp'],callobj=self)
                     if self.getBool('PepMWt'): entry['MWt5'] = rje.logPoisson(5,entry['MWtExp'],callobj=self)
                     entry['Len37'] = rje.logPoisson(3,entry['Len7Exp'],callobj=self)
                 db.addEntry(entry)
             self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
     except: self.errorLog('%s._digest error' % self)
Beispiel #4
0
    def coverage(self):  ### Calculates estimated % coverage and accuracy of genome sequencing.
        '''
        Calculates estimated % coverage and accuracy of genome sequencing.

        Creates a 'coverage' table through self.db().addEmptyTable(), keyed on 'SMRT' if the BySMRT setting is true
        and on 'XCoverage' otherwise. One entry is added per sequencing round until the cumulative X coverage
        reaches MaxCov, then the table is saved with saveToFile().

        Table fields: XCoverage, SMRT, %Coverage, Accuracy, plus one '%Xn' field for each n in self.list['XnList']
        holding the summed percentage of bases at n-fold coverage or more.

        Returns None. Any exception is reported through self.errorLog() rather than propagated.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # XCoverage, SMRT, %Cov, Accuracy
            bysmrt = self.getBool('BySMRT')
            ckey = 'SMRT' if bysmrt else 'XCoverage'
            cfields = ['XCoverage', 'SMRT', '%Coverage', 'Accuracy']
            cfields += ['%%X%d' % xn for xn in self.list['XnList']]
            cdb = self.db().addEmptyTable('coverage', cfields, [ckey])

            ### ~ [2] Calculate stats for one round ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.progLog('\r#XCOV', 'Calculating coverage stats...')
            genomesize = self.getNum('GenomeSize')
            avread = self.getNum('AvRead')
            cov_per_base_per_read = avread / genomesize
            if bysmrt:
                reads = self.getInt('SMRTReads')  # If going per SMRT cell
            else:
                reads = int(0.5 + genomesize / avread)  # if going per X coverage
            # Calculate X coverage counts using binomial: xcov[x] is used below as the proportion of bases at
            # x-fold coverage after a single round. Values are added until under one base remains unaccounted for.
            bases = int(genomesize)
            xcov = []
            while bases > 1:
                try:
                    xcov.append(rje.logBinomial(len(xcov), reads, cov_per_base_per_read,
                                                exact=True, callobj=self))
                except Exception:
                    # Fall back on the Poisson call if the binomial call fails
                    xcov.append(rje.logPoisson(len(xcov), reads * cov_per_base_per_read,
                                               exact=True, callobj=self))
                bases -= genomesize * xcov[-1]
                if len(xcov) > reads:
                    raise ValueError('XCoverage cannot exceed read count!')
            cyccov = xcov[0:]  # Cumulative distribution over all rounds so far; starts as one round
            self.debug(xcov)

            ### ~ [3] Cycle through rounds, multiplying by X coverage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            cx = 0.0
            ctot = self.getInt('MaxCov')
            xcoverage = 0.0
            minfreq = 1.0 / genomesize  # Proportion equivalent to a single base
            while xcoverage < ctot:
                self.progLog('\r#XCOV', 'Calculating coverage stats: %.1f%% (%d|%d)' %
                             (cx / ctot, cdb.entryNum() + 1, len(cyccov)))
                cx += 100.0
                # Update cyccov: calculate %bases at different X coverage
                if cdb.entryNum():  # First round uses xcov as is: equivalent of starting with [1.0] (100% @ 0X)
                    prevcov = cyccov
                    # Combining the previous distribution with one more round needs len(prevcov)+len(xcov)-1
                    # slots. Sizing it as len(prevcov)*2-1 overflowed whenever trimming had left prevcov shorter
                    # than xcov. Any surplus trailing zeros were trimmed below anyway, so results are unchanged.
                    cyccov = [0.0] * (len(prevcov) + len(xcov) - 1)
                    for xi, prevfreq in enumerate(prevcov):
                        for xj, roundfreq in enumerate(xcov):
                            cyccov[xi + xj] += prevfreq * roundfreq
                while cyccov[-1] < minfreq:  # Trim top coverage values representing less than one base
                    cyccov.pop(-1)
                # Calculate accuracy: For each X coverage, calculate % bases with >50% correct
                # NOTE(review): the numerator uses indices 0..len-2 but the denominator sums indices 1..len-1.
                # This looks like an off-by-one; left unchanged pending confirmation, to keep output stable.
                accuracy = 0.0
                for x in range(len(cyccov[1:])):
                    accuracy += cyccov[x] * self.accuracy(x)
                accuracy = 100.0 * accuracy / sum(cyccov[1:])
                # SMRT cells versus coverage
                xcoverage += avread * reads / genomesize
                smrt = (genomesize * xcoverage) / (avread * self.getNum('SMRTReads'))
                # Update cdb
                centry = {
                    'XCoverage': rje.sf(xcoverage, 3),
                    'SMRT': rje.sf(smrt, 3),
                    '%Coverage': 100.0 * (1.0 - cyccov[0]),
                    'Accuracy': accuracy,
                }
                for xn in self.list['XnList']:
                    if xn <= len(cyccov):
                        centry['%%X%d' % xn] = rje.sf(100.0 * sum(cyccov[xn:]), 4)
                    else:
                        centry['%%X%d' % xn] = 0.000
                cdb.addEntry(centry)
            self.progLog('\r#XCOV', 'Calculated coverage stats upto %dX coverage.' % self.getInt('MaxCov'))

            ### ~ [4] Save results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Re-key the table data on float values before saving
            for xkey in cdb.dataKeys():
                cdb.dict['Data'][float(xkey)] = cdb.dict['Data'].pop(xkey)
            cdb.saveToFile()

            return
        except:
            self.errorLog('%s.coverage error' % self.prog())
Beispiel #5
0
    def coverage(self): ### Calculates estimated % coverage and accuracy of genome sequencing.
        '''
        Calculates estimated % coverage and accuracy of genome sequencing.

        A 'coverage' table is created with self.db().addEmptyTable(), keyed on 'SMRT' (BySMRT=True) or 'XCoverage'.
        One entry is added per sequencing round until the running X coverage reaches MaxCov, and the table is then
        saved with saveToFile(). Fields are XCoverage, SMRT, %Coverage, Accuracy and a '%Xn' field for each n in
        self.list['XnList'] (summed percentage of bases with n-fold coverage or more).

        Returns None. Any exception is reported through self.errorLog() rather than propagated.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # XCoverage, SMRT, %Cov, Accuracy
            if self.getBool('BySMRT'): ckey = 'SMRT'
            else: ckey = 'XCoverage'
            cfields = ['XCoverage','SMRT','%Coverage','Accuracy'] + ['%%X%d' % xn for xn in self.list['XnList']]
            cdb = self.db().addEmptyTable('coverage',cfields,[ckey])

            ### ~ [2] Calculate stats for one round ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.progLog('\r#XCOV','Calculating coverage stats...')
            gsize = self.getNum('GenomeSize'); avread = self.getNum('AvRead')
            cov_per_base_per_read = avread / gsize
            if self.getBool('BySMRT'): reads = self.getInt('SMRTReads')    # If going per SMRT cell
            else: reads = int(0.5 + gsize / avread)                         # if going per X coverage
            # Calculate X coverage counts using binomial
            bases = int(gsize)
            xcov = []   # List where index is X coverage and value is used as proportion of bases at that coverage
            while bases > 1:    # Continue until under one base is unaccounted for
                try: xcov.append(rje.logBinomial(len(xcov),reads,cov_per_base_per_read,exact=True,callobj=self))
                except Exception:   # Fall back on the Poisson call if the binomial call fails
                    xcov.append(rje.logPoisson(len(xcov),reads*cov_per_base_per_read,exact=True,callobj=self))
                bases -= gsize * xcov[-1]
                if len(xcov) > reads: raise ValueError('XCoverage cannot exceed read count!')
            cyccov = xcov[0:]   # Distribution after all rounds so far; starts as a single round
            self.debug(xcov)

            ### ~ [3] Cycle through rounds, multiplying by X coverage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            cx = 0.0; ctot = self.getInt('MaxCov'); xcoverage = 0.0
            onebase = 1.0 / gsize   # Proportion equivalent to a single base
            while xcoverage < ctot:
                self.progLog('\r#XCOV','Calculating coverage stats: %.1f%% (%d|%d)' % (cx/ctot,cdb.entryNum()+1,len(cyccov))); cx += 100.0
                # Update cyccov: calculate %bases at different X coverage
                if cdb.entryNum():  # First round uses xcov as is: equivalent of starting with [1.0] (100% @ 0X)
                    prevcov = cyccov
                    # Highest index reached below is (len(prevcov)-1)+(len(xcov)-1). The old len(prevcov)*2-1 size
                    # raised IndexError if trimming had made prevcov shorter than xcov. When it was bigger than
                    # needed the extra zeros were trimmed straight away, so working results are unchanged.
                    cyccov = [0.0] * (len(prevcov) + len(xcov) - 1)
                    for xi in range(len(prevcov)):
                        for xj in range(len(xcov)): cyccov[xi+xj] += prevcov[xi] * xcov[xj]
                while cyccov[-1] < onebase: cyccov.pop(-1)     # Trim top coverage values worth under one base
                # Calculate accuracy: For each X coverage, calculate % bases with >50% correct
                # NOTE(review): numerator covers indices 0..len-2 but denominator sums indices 1..len-1. Possible
                # off-by-one; deliberately left as is until confirmed, so that reported values do not change.
                accuracy = 0.0
                for x in range(len(cyccov[1:])): accuracy += cyccov[x] * self.accuracy(x)
                accuracy = 100.0 * accuracy / sum(cyccov[1:])
                # SMRT cells versus coverage
                xcoverage += avread * reads / gsize
                smrt = (gsize * xcoverage) / (avread * self.getNum('SMRTReads'))
                # Update cdb
                centry = {'XCoverage':rje.sf(xcoverage,3),'SMRT':rje.sf(smrt,3),'%Coverage':100.0 * (1.0-cyccov[0]),'Accuracy':accuracy}
                for xn in self.list['XnList']:
                    if xn <= len(cyccov): centry['%%X%d' % xn] = rje.sf(100.0*sum(cyccov[xn:]),4)
                    else: centry['%%X%d' % xn] = 0.000
                cdb.addEntry(centry)
            self.progLog('\r#XCOV','Calculated coverage stats upto %dX coverage.' % self.getInt('MaxCov'))

            ### ~ [4] Save results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for xkey in cdb.dataKeys():     # Re-key the table data on float values before saving
                cdb.dict['Data'][float(xkey)] = cdb.dict['Data'].pop(xkey)
            cdb.saveToFile()

            return
        except: self.errorLog('%s.coverage error' % self.prog())