def checkMatchedByPrimer(self, activePrimers):
    """Return the name of the first active primer whose 3' end matches this handle.

    The last ``self.kmerLength`` bases of each primer are compared with every
    k-mer window of ``self.sequence`` and with the reverse complement of
    each window (``seqdata.revcomp``).

    activePrimers -- iterable of ``(sequence, name)`` pairs.

    A primer is reported as matching a window when any of these holds:
      * the primer 3' end is identical to the window,
      * their distance (``misc.hamming_distance``) is below ``self.minHD``,
      * the distance is below ``self.minHD + 1`` and the distance over the
        last five bases is below 1.

    Returns the ``name`` of the first matching primer, or ``False`` when no
    primer matches.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    k = self.kmerLength
    # "+ 1" so that the final full-length window is examined as well; without
    # it the last k-mer of the handle was silently skipped (and a handle of
    # exactly k bases was never checked at all).
    windowStarts = range(len(self.sequence) - k + 1)

    for seq, name in activePrimers:
        # the primer end does not depend on the window, so slice it once
        primer3prime = seq[-k:]
        for i in windowStarts:
            prekmer = self.sequence[i:i + k]
            # test both the window itself and its reverse complement
            for kmer in (prekmer, revcomp(prekmer)):
                if primer3prime == kmer:
                    return name
                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name
                if distance < self.minHD + 1:
                    # borderline overall distance: still a match when the
                    # last five bases show no mismatch
                    if hamming_distance(primer3prime[-5:], kmer[-5:]) < 1:
                        return name
    return False
def checkMatchedByPrimer(self, activePrimers):
    """Return the name of the first active primer whose 3' end matches this handle.

    The last ``self.kmerLength`` bases of each primer are compared with every
    k-mer window of ``self.sequence`` and with the reverse complement of
    each window (``seqdata.revcomp``).

    activePrimers -- iterable of ``(sequence, name)`` pairs.

    A primer is reported as matching a window when any of these holds:
      * the primer 3' end is identical to the window,
      * their distance (``misc.hamming_distance``) is below ``self.minHD``,
      * the distance is below ``self.minHD + 1`` and the distance over the
        last five bases is below 1.

    Returns the ``name`` of the first matching primer, or ``False`` when no
    primer matches.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    k = self.kmerLength
    # "+ 1" so that the final full-length window is examined as well; without
    # it the last k-mer of the handle was silently skipped (and a handle of
    # exactly k bases was never checked at all).
    windowStarts = range(len(self.sequence) - k + 1)

    for seq, name in activePrimers:
        # the primer end does not depend on the window, so slice it once
        primer3prime = seq[-k:]
        for i in windowStarts:
            prekmer = self.sequence[i:i + k]
            # test both the window itself and its reverse complement
            for kmer in (prekmer, revcomp(prekmer)):
                if primer3prime == kmer:
                    return name
                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name
                if distance < self.minHD + 1:
                    # borderline overall distance: still a match when the
                    # last five bases show no mismatch
                    if hamming_distance(primer3prime[-5:], kmer[-5:]) < 1:
                        return name
    return False
def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
    """Check one end of the handle against k-mer collections and other handles.

    kmers, revcompkmers -- mappings from k-mer string to a list of hit
                           descriptions (the values are joined/printed in
                           the diagnostics).
    handles             -- iterable of objects exposing ``sequence`` and ``id``.
    fiveprime           -- when true the reverse complement of the first
                           ``self.kmerLength`` bases is checked, otherwise
                           the last ``self.kmerLength`` bases.

    The checks run in this order and stop at the first conflict:
      1. perfect match of the end sequence in the handle's own k-mers,
         ``kmers`` or ``revcompkmers``;
      2. near match (``misc.hamming_distance``) against the same collections;
      3. near match against the last ``self.kmerLength`` bases of every
         handle in ``handles``.

    Side effects: ``self.output`` is always (re)set to a diagnostic string;
    ``self.resonFordeath`` is set only when a conflict is found.
    Always returns ``None``.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    k = self.kmerLength

    # k-mers of the handle itself: reverse complemented when the 3' end is
    # checked, taken as-is when the 5' end is checked
    revcompself = {}
    for i in range(len(self.sequence) - k + 1):
        window = self.sequence[i:i + k]
        kmer = window if fiveprime else revcomp(window)
        revcompself.setdefault(kmer, []).append('revcomp self')

    # select the end sequence that is being checked
    if fiveprime:
        ENDSEQ = revcomp(self.sequence[:k])
        endName = 'first '
    else:
        ENDSEQ = self.sequence[-k:]
        endName = 'last '
    assert len(ENDSEQ) == k, 'Error: the script is trying to check wrong number of end bases\n'
    self.output = ('\ngenereated handle#' + str(self.id) + ' ' + 'check ' +
                   endName + str(k) + '=' + ENDSEQ + ':=> ')

    #
    # 1. perfect matches of the end sequence in the k-mer dictionaries
    #
    for dictionary, name in [(revcompself, 'self-rc '), (kmers, ''), (revcompkmers, 'rc ')]:
        if ENDSEQ in dictionary:
            self.output += (endName + str(k) + ' (' + ENDSEQ + ') perfect ' +
                            name + 'match to ' + ' ' + str(dictionary[ENDSEQ]))
            self.resonFordeath = name + 'kmer match'
            return

    #
    # 2. matches with mismatches
    #
    # Each dictionary is scanned once; the original listed the handle's own
    # k-mers twice, which only repeated an identical scan.
    # NOTE(review): 'self-rc' has no trailing space here (unlike above), so
    # the reason reads 'self-rckmer match'; kept as-is in case it is parsed.
    for dictionary, name in [(revcompself, 'self-rc'), (kmers, ''), (revcompkmers, 'rc ')]:
        # .items() instead of .iteritems() so this runs on Python 2 and 3
        for kmer, hits in dictionary.items():
            assert len(kmer) == k, 'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
            if 'N' in kmer:
                continue

            # distance over the full k-mer
            distFull = hamming_distance(ENDSEQ, kmer)
            if distFull < self.minHD:
                self.output += (str(distFull) + ' mm to ' + str(hits) +
                                ' (' + kmer + ') too close,')
                self.resonFordeath = name + 'kmer match'
                return

            if distFull < self.minHD + 1:
                # borderline overall distance: fewer than 3 mismatches in
                # the last five bases is still treated as too close
                distLastFive = hamming_distance(ENDSEQ[-5:], kmer[-5:])
                if distLastFive < 3:
                    self.output += (str(distLastFive) + ' mm in last5 to ' +
                                    str(hits) + ' (' + kmer + ') too close,')
                    self.resonFordeath = name + 'kmer match in last5 '
                    return

    #
    # 3. compare with the last k bases of the other handles
    #
    for handle2 in handles:
        otherEnd = handle2.sequence[-k:]
        dist = hamming_distance(ENDSEQ, otherEnd)
        if dist < self.minHD:
            self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                            '(' + otherEnd + ')' + ' too close,')
            self.resonFordeath = '3 prime ends match'
            return
        self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                        ' (' + otherEnd + ') ' + 'ok |' + ' ')
def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
    """Check one end of the handle against k-mer collections and other handles.

    kmers, revcompkmers -- mappings from k-mer string to a list of hit
                           descriptions (the values are joined/printed in
                           the diagnostics).
    handles             -- iterable of objects exposing ``sequence`` and ``id``.
    fiveprime           -- when true the reverse complement of the first
                           ``self.kmerLength`` bases is checked, otherwise
                           the last ``self.kmerLength`` bases.

    The checks run in this order and stop at the first conflict:
      1. perfect match of the end sequence in the handle's own k-mers,
         ``kmers`` or ``revcompkmers``;
      2. near match (``misc.hamming_distance``) against the same collections;
      3. near match against the last ``self.kmerLength`` bases of every
         handle in ``handles``.

    Side effects: ``self.output`` is always (re)set to a diagnostic string;
    ``self.resonFordeath`` is set only when a conflict is found.
    Always returns ``None``.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    k = self.kmerLength

    # k-mers of the handle itself: reverse complemented when the 3' end is
    # checked, taken as-is when the 5' end is checked
    revcompself = {}
    for i in range(len(self.sequence) - k + 1):
        window = self.sequence[i:i + k]
        kmer = window if fiveprime else revcomp(window)
        revcompself.setdefault(kmer, []).append('revcomp self')

    # select the end sequence that is being checked
    if fiveprime:
        ENDSEQ = revcomp(self.sequence[:k])
        endName = 'first '
    else:
        ENDSEQ = self.sequence[-k:]
        endName = 'last '
    assert len(ENDSEQ) == k, 'Error: the script is trying to check wrong number of end bases\n'
    self.output = ('\ngenereated handle#' + str(self.id) + ' ' + 'check ' +
                   endName + str(k) + '=' + ENDSEQ + ':=> ')

    #
    # 1. perfect matches of the end sequence in the k-mer dictionaries
    #
    for dictionary, name in [(revcompself, 'self-rc '), (kmers, ''), (revcompkmers, 'rc ')]:
        if ENDSEQ in dictionary:
            self.output += (endName + str(k) + ' (' + ENDSEQ + ') perfect ' +
                            name + 'match to ' + ' ' + str(dictionary[ENDSEQ]))
            self.resonFordeath = name + 'kmer match'
            return

    #
    # 2. matches with mismatches
    #
    # Each dictionary is scanned once; the original listed the handle's own
    # k-mers twice, which only repeated an identical scan.
    # NOTE(review): 'self-rc' has no trailing space here (unlike above), so
    # the reason reads 'self-rckmer match'; kept as-is in case it is parsed.
    for dictionary, name in [(revcompself, 'self-rc'), (kmers, ''), (revcompkmers, 'rc ')]:
        # .items() instead of .iteritems() so this runs on Python 2 and 3
        for kmer, hits in dictionary.items():
            assert len(kmer) == k, 'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
            if 'N' in kmer:
                continue

            # distance over the full k-mer
            distFull = hamming_distance(ENDSEQ, kmer)
            if distFull < self.minHD:
                self.output += (str(distFull) + ' mm to ' + str(hits) +
                                ' (' + kmer + ') too close,')
                self.resonFordeath = name + 'kmer match'
                return

            if distFull < self.minHD + 1:
                # borderline overall distance: fewer than 3 mismatches in
                # the last five bases is still treated as too close
                distLastFive = hamming_distance(ENDSEQ[-5:], kmer[-5:])
                if distLastFive < 3:
                    self.output += (str(distLastFive) + ' mm in last5 to ' +
                                    str(hits) + ' (' + kmer + ') too close,')
                    self.resonFordeath = name + 'kmer match in last5 '
                    return

    #
    # 3. compare with the last k bases of the other handles
    #
    for handle2 in handles:
        otherEnd = handle2.sequence[-k:]
        dist = hamming_distance(ENDSEQ, otherEnd)
        if dist < self.minHD:
            self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                            '(' + otherEnd + ')' + ' too close,')
            self.resonFordeath = '3 prime ends match'
            return
        self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                        ' (' + otherEnd + ') ' + 'ok |' + ' ')