def accuracy(self, xcoverage):  ### Look up the accuracy at xcoverage, extending the stored list if required
    '''
    Look up the accuracy at xcoverage, extending the stored list if required.

    self.list['Accuracy'] is used as a cache indexed by X coverage. Missing entries up to
    and including xcoverage are appended one at a time before the lookup.
    >> xcoverage: index into self.list['Accuracy'] to return.
    << the stored value at that index. On any error the problem is reported through
       self.errorLog() and None is returned implicitly.
    '''
    try:
        ### ~ [1] Extend the cache as far as needed ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        acc_cache = self.list['Accuracy']
        while len(acc_cache) <= xcoverage:
            # (1 - previous accuracy) * GenomeSize, truncated to an integer
            residual = int((1.0 - acc_cache[-1]) * self.getNum('GenomeSize'))
            if residual:
                depth = len(acc_cache)
                needed = depth // 2 + 1  # Number of correct reads needed for majority
                try:
                    acc_cache.append(
                        rje.logBinomial(needed, depth, 1.0 - self.getNum('ErrPerBase'),
                                        exact=False, callobj=self))
                except:
                    # Fall back to the Poisson form whenever the binomial call fails
                    acc_cache.append(
                        rje.logPoisson(needed, depth * (1.0 - self.getNum('ErrPerBase')),
                                       exact=False, callobj=self))
                self.debug(acc_cache)
            else:
                # Residual truncates to zero: store 1.0 without calculating
                acc_cache.append(1.0)
        ### ~ [2] Return the requested entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        return acc_cache[xcoverage]
    except:
        self.errorLog('%s.accuracy error' % self.prog())
def accuracy(self,xcoverage):   ### Returns stored accuracy at xcoverage, calculating missing values first
    '''
    Returns stored accuracy at xcoverage, calculating missing values first.
    >> xcoverage: position in self.list['Accuracy'] whose value is wanted.
    << value at that position; None (after self.errorLog) if anything goes wrong.
    '''
    try:### ~ [1] Fill in any missing values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        while not len(self.list['Accuracy']) > xcoverage:
            lastacc = self.list['Accuracy'][-1]
            errbases = (1.0 - lastacc) * self.getNum('GenomeSize')
            if int(errbases) == 0:  # Nothing left once truncated: later values are stored as 1.0
                self.list['Accuracy'].append(1.0)
                continue
            xcov = len(self.list['Accuracy'])
            majority = xcov // 2 + 1    # Number of correct reads needed for majority
            try: newacc = rje.logBinomial(majority,xcov,1.0 - self.getNum('ErrPerBase'),exact=False,callobj=self)
            except: newacc = rje.logPoisson(majority,xcov*(1.0 - self.getNum('ErrPerBase')),exact=False,callobj=self)
            self.list['Accuracy'].append(newacc)
            self.debug(self.list['Accuracy'])
        ### ~ [2] Return requested value ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        return self.list['Accuracy'][xcoverage]
    except: self.errorLog('%s.accuracy error' % self.prog())
def _digest(self):      ### Main digestion of sequences and population of results database
    '''
    Main digestion of sequences and population of results database.

    For each protease combination from self.protCombo(), every sequence in
    self.obj['SeqList'] is digested twice:
      1. A first pass records which fragments occur more than once (redundant) and the
         largest number of 'C' residues seen in any fragment.
      2. A second pass builds one entry per sequence (peptide counts per length and, if
         PepMWt is set, per 100-unit weight bin, plus optional Cys counts and expectation
         stats) and adds it to the 'ProDigIS' table.
    Nothing is returned. Any error is reported through self.errorLog() rather than raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('ProDigIS')
        prot_combo = self.protCombo()
        seqlist = self.obj['SeqList']
        maxpeplen = self.getInt('MaxPepLen')
        minpeplen = self.getInt('MinPepLen')
        pepmwt = self.getBool('PepMWt')
        cyscount = self.getBool('CysCount')
        cysweight = self.getBool('CysWeight')
        ## ~ [1a] ~ Peptide Probability Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # pdict maps PepSize -> entry, or PepSize -> {CysCount: entry} when CysWeight is set
        pdb = self.db('PepProb'); pdict = {}
        if pdb:
            if cysweight:
                for plen in pdb.index('PepSize').keys(): pdict[plen] = {}
                for entry in pdb.entries(): pdict[entry['PepSize']][entry['CysCount']] = entry
            else:
                for entry in pdb.entries(): pdict[entry['PepSize']] = entry

        ## ~ [1b] ~ Helper: mark every recognition site of every protease with ":" ~~~~~~~~~~~~~ ##
        def markCuts(sequence,prot):
            '''Returns sequence with each recognition site replaced by its ":"-marked form.'''
            for protease in prot.split('+'):
                for cut in proteases[protease]:
                    # The site is the cut pattern minus its ":" marker; split/join swaps it in
                    sequence = cut.join(sequence.split(cut.replace(':','')))
            return sequence

        ### ~ [2] Process each protease combination in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.deBug(self.int)
        for prot in prot_combo:
            ## ~ [2a] ~ First pass: find redundant fragments and maximum Cys count ~~~~~~~~~~~~~ ##
            # Sets give constant-time membership tests; the original lists were scanned per fragment
            seenpep = set(); redundant = set(); maxcys = 0
            sx = 0.0; stot = seqlist.seqNum()
            for seq in seqlist.seqs():
                self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                for frag in markCuts(seq.getSequence(),prot).split(':'):
                    if frag in seenpep: redundant.add(frag)
                    else:
                        seenpep.add(frag)
                        maxcys = max(maxcys,frag.count('C'))
            self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
            if cyscount:
                for c in range(maxcys+1): db.addField('Cys%d' % c)
            ## ~ [2b] ~ Second pass: build one database entry per sequence ~~~~~~~~~~~~~~~~~~~~~ ##
            sx = 0.0; stot = seqlist.seqNum()
            for seq in seqlist.seqs():
                self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                acc = seq.getStr('AccNum')
                entry = {'AccNum':acc,'Protease':prot}
                for i in range(1,maxpeplen+1):
                    entry[i] = 0                        # Count of peptides of length i
                    if pepmwt: entry[i*100.0] = 0       # Count of peptides in weight bin i*100
                # Cut into fragments, dropping empty strings left by adjacent cut sites
                frag = [pep for pep in markCuts(seq.getSequence(),prot).split(':') if pep != '']
                self.deBug(frag)
                entry['PepCount'] = len(frag)
                if not self.getBool('NTerm'): frag = frag[1:]
                if minpeplen > 0:
                    frag = [pep for pep in frag if len(pep) >= minpeplen]
                    entry['MinPepLen'] = len(frag)
                if self.getBool('NRPep'):
                    frag = [pep for pep in frag if pep not in redundant]
                    entry['NRPep'] = len(frag)
                if cyscount:
                    for c in range(maxcys+1): entry['Cys%d' % c] = 0
                    for pep in frag: entry['Cys%d' % pep.count('C')] += 1
                if pdict:
                    entry['LenExp'] = 0.0; entry['MWtExp'] = 0.0; entry['Len7Exp'] = 0.0
                ## ~ [2c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for pep in frag:
                    plen = min(len(pep),maxpeplen)      # Lengths above MaxPepLen share the top bin
                    self.deBug('"%s" -> %d' % (pep,plen))
                    entry[plen] += 1
                    if pdict:
                        if cysweight:
                            # No probability stored for this length/Cys combination counts as zero
                            try: pprob = pdict[plen][pep.count('C')]['Prob']
                            except KeyError: pprob = 0.0
                        else: pprob = pdict[plen]['Prob']
                        entry['LenExp'] += pprob
                        if 7 <= plen: entry['Len7Exp'] += pprob
                    if pepmwt:
                        pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),maxpeplen)
                        entry[pwt] += 1
                        if pdict: entry['MWtExp'] += pprob
                ## ~ [2d] ~ Poisson stats from the summed expectations ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # The *Exp totals only exist when pdict is populated; without this guard an
                # empty pdict raised KeyError here and aborted the whole digest.
                if pdict:
                    entry['Len3'] = rje.logPoisson(3,entry['LenExp'],callobj=self)
                    if pepmwt: entry['MWt3'] = rje.logPoisson(3,entry['MWtExp'],callobj=self)
                    entry['Len5'] = rje.logPoisson(5,entry['LenExp'],callobj=self)
                    if pepmwt: entry['MWt5'] = rje.logPoisson(5,entry['MWtExp'],callobj=self)
                    entry['Len37'] = rje.logPoisson(3,entry['Len7Exp'],callobj=self)
                db.addEntry(entry)
            self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
    except: self.errorLog('%s._digest error' % self)
def coverage(self):  ### Calculates estimated % coverage and accuracy of genome sequencing.
    '''
    Calculates estimated % coverage and accuracy of genome sequencing.

    Creates an empty 'coverage' table (keyed on 'SMRT' when BySMRT is set, otherwise on
    'XCoverage'), adds one row per round until the cumulative X coverage reaches MaxCov,
    re-keys the rows by float value and saves the table to file.

    Each round's depth distribution is the previous round's distribution convolved with
    the single-round distribution xcov (list index = X coverage).

    Nothing is returned. Any error is reported through self.errorLog() rather than raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # XCoverage, SMRT, %Cov, Accuracy
        by_smrt = self.getBool('BySMRT')
        ckey = 'SMRT' if by_smrt else 'XCoverage'
        xn_list = self.list['XnList']
        cfields = ['XCoverage', 'SMRT', '%Coverage', 'Accuracy']
        cfields.extend('%%X%d' % xn for xn in xn_list)
        cdb = self.db().addEmptyTable('coverage', cfields, [ckey])

        ### ~ [2] Calculate stats for one round ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.progLog('\r#XCOV', 'Calculating coverage stats...')
        genome_size = self.getNum('GenomeSize')
        av_read = self.getNum('AvRead')
        cov_per_base_per_read = av_read / genome_size
        if by_smrt:
            reads = self.getInt('SMRTReads')  # If going per SMRT cell
        else:
            reads = int(0.5 + genome_size / av_read)  # if going per X coverage
        # Calculate X coverage counts using binomial
        bases = int(genome_size)
        xcov = []  # List where index is X coverage and number is proportion of reads
        while bases > 1:
            try:
                xcov.append(
                    rje.logBinomial(len(xcov), reads, cov_per_base_per_read,
                                    exact=True, callobj=self))
            except Exception:
                # Binomial call failed: use the Poisson form instead
                xcov.append(
                    rje.logPoisson(len(xcov), reads * cov_per_base_per_read,
                                   exact=True, callobj=self))
            bases -= genome_size * xcov[-1]
            if len(xcov) > reads:
                raise ValueError('XCoverage cannot exceed read count!')
        cyccov = xcov[0:]
        self.debug(xcov)

        ### ~ [3] Cycle through rounds, multiplying by X coverage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        cx = 0.0
        ctot = self.getInt('MaxCov')
        xcoverage = 0.0
        min_prop = 1.0 / genome_size  # Entries below this are trimmed from the tail
        while xcoverage < ctot:
            self.progLog(
                '\r#XCOV', 'Calculating coverage stats: %.1f%% (%d|%d)' %
                (cx / ctot, cdb.entryNum() + 1, len(cyccov)))
            cx += 100.0
            # Update xcov: calculate %bases at different X coverage
            if cdb.entryNum():  # First round uses xcov as is (equivalent of starting with [1.0])
                prevcov = cyccov[0:]
                # Highest index written is (len(prevcov)-1) + (len(xcov)-1), so the buffer
                # must be sized from both lists. The previous 2*len(prevcov)-1 overflowed
                # whenever the trimmed prevcov was shorter than xcov.
                cyccov = [0.0] * (len(prevcov) + len(xcov) - 1)
                for xi, prev_prop in enumerate(prevcov):
                    for xj, round_prop in enumerate(xcov):
                        cyccov[xi + xj] += prev_prop * round_prop
            while cyccov[-1] < min_prop:
                cyccov.pop(-1)
            # Calculate accuracy: For each X coverage, calculate % bases with >50% correct
            # NOTE(review): the sum runs over indices 0..len-2 but is divided by
            # sum(cyccov[1:]); the intended range may be 1..len-1. Left unchanged because
            # correcting it would alter reported values - confirm with the author.
            accuracy = 0.0
            for x in range(len(cyccov[1:])):
                accuracy += cyccov[x] * self.accuracy(x)
            accuracy = 100.0 * accuracy / sum(cyccov[1:])
            # SMRT cells versus coverage
            xcoverage += av_read * reads / genome_size
            smrt = (genome_size * xcoverage) / (av_read * self.getNum('SMRTReads'))
            # Update cdb
            centry = {
                'XCoverage': rje.sf(xcoverage, 3),
                'SMRT': rje.sf(smrt, 3),
                '%Coverage': 100.0 * (1.0 - cyccov[0]),
                'Accuracy': accuracy
            }
            for xn in xn_list:
                if xn <= len(cyccov):
                    centry['%%X%d' % xn] = rje.sf(100.0 * sum(cyccov[xn:]), 4)
                else:
                    centry['%%X%d' % xn] = 0.000
            cdb.addEntry(centry)
        self.progLog('\r#XCOV', 'Calculated coverage stats upto %dX coverage.' % ctot)

        ### ~ [4] Save results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Re-key rows by float value before saving
        for xkey in cdb.dataKeys():
            cdb.dict['Data'][float(xkey)] = cdb.dict['Data'].pop(xkey)
        cdb.saveToFile()
        return
    except:
        # Broad on purpose: every failure is routed to errorLog instead of propagating
        self.errorLog('%s.coverage error' % self.prog())
def coverage(self):     ### Calculates estimated % coverage and accuracy of genome sequencing.
    '''
    Calculates estimated % coverage and accuracy of genome sequencing.

    Populates and saves a 'coverage' table with fields XCoverage, SMRT, %Coverage, Accuracy
    and one %X<n> field per value in self.list['XnList']. The table key is 'SMRT' if BySMRT
    is set, else 'XCoverage'. Rows are added round by round until the cumulative X coverage
    reaches MaxCov.
    << None. Errors are reported via self.errorLog() and not raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # XCoverage, SMRT, %Cov, Accuracy
        if self.getBool('BySMRT'): ckey = 'SMRT'
        else: ckey = 'XCoverage'
        cfields = ['XCoverage','SMRT','%Coverage','Accuracy']
        for xn in self.list['XnList']: cfields.append('%%X%d' % xn)
        cdb = self.db().addEmptyTable('coverage',cfields,[ckey])
        ### ~ [2] Calculate stats for one round ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.progLog('\r#XCOV','Calculating coverage stats...')
        gsize = self.getNum('GenomeSize'); avread = self.getNum('AvRead')
        cov_per_base_per_read = avread / gsize
        if self.getBool('BySMRT'): reads = self.getInt('SMRTReads')     # If going per SMRT cell
        else: reads = int(0.5 + gsize / avread)                         # if going per X coverage
        # Calculate X coverage counts using binomial
        bases = int(gsize)
        xcov = []   # List where index is X coverage and number is proportion of reads
        while bases > 1:
            # Use the Poisson form if the binomial call fails
            try: xcov.append(rje.logBinomial(len(xcov),reads,cov_per_base_per_read,exact=True,callobj=self))
            except Exception: xcov.append(rje.logPoisson(len(xcov),reads*cov_per_base_per_read,exact=True,callobj=self))
            bases -= gsize * xcov[-1]
            if len(xcov) > reads: raise ValueError('XCoverage cannot exceed read count!')
        cyccov = xcov[0:]
        self.debug(xcov)
        ### ~ [3] Cycle through rounds, multiplying by X coverage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        cx = 0.0; ctot = self.getInt('MaxCov'); xcoverage = 0.0
        while xcoverage < ctot:
            self.progLog('\r#XCOV','Calculating coverage stats: %.1f%% (%d|%d)' % ((cx/ctot,cdb.entryNum()+1,len(cyccov)))); cx += 100.0
            # Update xcov: calculate %bases at different X coverage
            if cdb.entryNum():  # Skipped on first round, which uses xcov directly
                prevcov = cyccov[0:]
                # Convolving prevcov with xcov fills indices up to len(prevcov)+len(xcov)-2.
                # Sizing from prevcov alone (2*len-1) raised IndexError if prevcov had been
                # trimmed shorter than xcov.
                cyccov = [0.0] * (len(prevcov) + len(xcov) - 1)
                for xi in range(len(prevcov)):
                    for xj in range(len(xcov)):
                        cyccov[xi+xj] += (prevcov[xi] * xcov[xj])
            # Trim tail entries smaller than one base in the genome
            while cyccov[-1] < 1.0 / gsize: cyccov.pop(-1)
            # Calculate accuracy: For each X coverage, calculate % bases with >50% correct
            # NOTE(review): indices summed (0..len-2) do not match the divisor sum(cyccov[1:]);
            # possibly meant 1..len-1. Not changed here as it affects output - please confirm.
            accuracy = 0.0
            for x in range(len(cyccov[1:])): accuracy += cyccov[x] * self.accuracy(x)
            accuracy = 100.0 * accuracy / sum(cyccov[1:])
            # SMRT cells versus coverage
            xcoverage += avread * reads / gsize
            smrt = (gsize * xcoverage) / (avread * self.getNum('SMRTReads'))
            # Update cdb
            centry = {'XCoverage':rje.sf(xcoverage,3),'SMRT':rje.sf(smrt,3),'%Coverage':100.0 * (1.0-cyccov[0]),'Accuracy':accuracy}
            for xn in self.list['XnList']:
                if xn <= len(cyccov): centry['%%X%d' % xn] = rje.sf(100.0*sum(cyccov[xn:]),4)
                else: centry['%%X%d' % xn] = 0.000
            cdb.addEntry(centry)
        self.progLog('\r#XCOV','Calculated coverage stats upto %dX coverage.' % ctot)
        ### ~ [4] Save results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Replace each data key with its float value before saving
        for xkey in cdb.dataKeys(): cdb.dict['Data'][float(xkey)] = cdb.dict['Data'].pop(xkey)
        cdb.saveToFile()
        return
    except: self.errorLog('%s.coverage error' % self.prog())    # Broad on purpose: all failures go to errorLog