Beispiel #1
0
 def checkMatchedByPrimer(self, activePrimers):
     """Return the name of the first active primer that matches this handle.

     For every (seq, name) pair in activePrimers the last self.kmerLength
     bases of seq are compared with every kmer of self.sequence and with
     revcomp() of that kmer.  A primer is reported when

     * its end is identical to the kmer, or
     * the hamming distance to the kmer is below self.minHD, or
     * the distance is below self.minHD + 1 and the last five bases of
       the two strings are identical.

     activePrimers -- iterable of (sequence, name) pairs.

     Returns the name of the first primer fulfilling one of the rules,
     otherwise False.
     """

     from seqdata import revcomp
     from misc import hamming_distance

     kmerLength = self.kmerLength

     # Collect every kmer of the handle followed by its revcomp once, instead
     # of recomputing them for each primer.  The "+ 1" makes the window reach
     # the final kmer of the sequence; without it the last kmer was never
     # compared (and a sequence of exactly kmerLength bases was not checked
     # at all).
     targets = []
     for start in range(len(self.sequence) - kmerLength + 1):
         window = self.sequence[start:start + kmerLength]
         targets.append(window)
         targets.append(revcomp(window))

     # go through all active primers in collection
     for seq, name in activePrimers:
         primer3prime = seq[-kmerLength:]
         for kmer in targets:
             if primer3prime == kmer:
                 return name
             distance = hamming_distance(primer3prime, kmer)
             if distance < self.minHD:
                 return name
             if distance < self.minHD + 1:
                 # borderline distance: only a match when the last five
                 # bases agree completely
                 if hamming_distance(primer3prime[-5:], kmer[-5:]) < 1:
                     return name
     return False
Beispiel #2
0
    def checkMatchedByPrimer(self, activePrimers):
        """Return the name of the first active primer that matches this handle.

        For every (seq, name) pair in activePrimers the last self.kmerLength
        bases of seq are compared with every kmer of self.sequence and with
        revcomp() of that kmer.  A primer is reported when

        * its end is identical to the kmer, or
        * the hamming distance to the kmer is below self.minHD, or
        * the distance is below self.minHD + 1 and the last five bases of
          the two strings are identical.

        activePrimers -- iterable of (sequence, name) pairs.

        Returns the name of the first primer fulfilling one of the rules,
        otherwise False.
        """

        from seqdata import revcomp
        from misc import hamming_distance

        kmerLength = self.kmerLength

        # Collect every kmer of the handle followed by its revcomp once,
        # instead of recomputing them for each primer.  The "+ 1" makes the
        # window reach the final kmer of the sequence; without it the last
        # kmer was never compared (and a sequence of exactly kmerLength bases
        # was not checked at all).
        targets = []
        for start in range(len(self.sequence) - kmerLength + 1):
            window = self.sequence[start:start + kmerLength]
            targets.append(window)
            targets.append(revcomp(window))

        # go through all active primers in collection
        for seq, name in activePrimers:
            primer3prime = seq[-kmerLength:]
            for kmer in targets:
                if primer3prime == kmer:
                    return name
                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name
                if distance < self.minHD + 1:
                    # borderline distance: only a match when the last five
                    # bases agree completely
                    if hamming_distance(primer3prime[-5:], kmer[-5:]) < 1:
                        return name
        return False
Beispiel #3
0
    def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
        """Check one end of the handle against kmer collections and other handles.

        The end sequence (ENDSEQ) is the last self.kmerLength bases of
        self.sequence, or revcomp() of the first self.kmerLength bases when
        fiveprime is true.  It is compared, in this order, with

        1. the kmers of the handle itself (passed through revcomp() unless
           fiveprime is true), kmers and revcompkmers for a perfect match,
        2. the same three dictionaries allowing mismatches,
        3. the last self.kmerLength bases of every handle in handles.

        kmers, revcompkmers -- dictionaries mapping kmer -> list of hit names.
        handles             -- iterable of objects with .sequence and .id.
        fiveprime           -- check the start of the sequence instead of the end.

        The method always returns None.  self.output is overwritten with a
        textual report, and self.resonFordeath is set as soon as a too close
        match is found (the method returns right after that).

        Raises AssertionError when ENDSEQ or a dictionary key does not have
        exactly self.kmerLength characters.
        """

        from seqdata import revcomp
        from misc import hamming_distance

        kmerLength = self.kmerLength

        # create dictionary of the kmers in the handle sequence
        revcompself = {}
        for start in range(len(self.sequence) - kmerLength + 1):
            kmer = self.sequence[start:start + kmerLength]
            if not fiveprime:
                kmer = revcomp(kmer)
            revcompself.setdefault(kmer, []).append('revcomp self')

        # set the sequence we are checking and whether we are looking at the
        # three prime or five prime end of the sequence
        if fiveprime:
            ENDSEQ = revcomp(self.sequence[:kmerLength])
            endName = 'first '
        else:
            ENDSEQ = self.sequence[-kmerLength:]
            endName = 'last '
        assert len(ENDSEQ) == kmerLength, 'Error: the script is trying to check wrong number of end bases\n'

        # give some info for the output
        self.output = ('\ngenereated handle#' + str(self.id) + ' ' + 'check ' +
                       endName + str(kmerLength) + '=' + ENDSEQ + ':=> ')

        #
        # check if 3'/5' of the handle match any sequence in kmers or revcomp-kmers
        #

        # Look for perfect matches of the end sequence to kmer dictionaries
        for dictionary, name in [(revcompself, 'self-rc '), (kmers, ''),
                                 (revcompkmers, 'rc ')]:
            if ENDSEQ in dictionary:
                self.output += (endName + str(kmerLength) + ' (' + ENDSEQ +
                                ') perfect ' + name + 'match to ' + ' ' +
                                str(dictionary[ENDSEQ]))
                self.resonFordeath = name + 'kmer match'
                return

        # Look for matches with mismatches.  Each dictionary is scanned once;
        # scanning revcompself a second time could never find anything the
        # first scan did not.
        # NOTE(review): 'self-rc' has no trailing space here (unlike the
        # perfect-match pass), so the reason reads 'self-rckmer match'.  Kept
        # unchanged in case other code depends on the exact string.
        for dictionary, name in [(revcompself, 'self-rc'), (kmers, ''),
                                 (revcompkmers, 'rc ')]:
            # iterate over the keys so this works without dict.iteritems()
            for kmer in dictionary:
                hits = dictionary[kmer]
                assert len(kmer) == kmerLength, 'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
                if 'N' in kmer:
                    continue

                # check for distance of full kmer to kmer dictionaries
                distFull = hamming_distance(ENDSEQ, kmer)
                if distFull < self.minHD:
                    self.output += (str(distFull) + ' mm to ' + str(hits) +
                                    ' (' + kmer + ') too close,')
                    self.resonFordeath = name + 'kmer match'
                    return

                # borderline distance: fewer than three mismatches within the
                # last five bases still counts as too close
                if distFull < self.minHD + 1:
                    distLastFive = hamming_distance(ENDSEQ[-5:], kmer[-5:])
                    if distLastFive < 3:
                        self.output += (str(distLastFive) +
                                        ' mm in last5 to ' + str(hits) +
                                        ' (' + kmer + ') too close,')
                        self.resonFordeath = name + 'kmer match  in last5 '
                        return

        #
        # check if 3' bases in handle matches any other 3' in other handles
        #
        # NOTE(review): with fiveprime=True ENDSEQ is still compared with the
        # last kmerLength bases of the other handles -- confirm this is intended.
        for handle2 in handles:
            otherEnd = handle2.sequence[-kmerLength:]
            dist = hamming_distance(ENDSEQ, otherEnd)
            if dist < self.minHD:
                self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                                '(' + otherEnd + ')' + ' too close,')
                self.resonFordeath = '3 prime ends match'
                return
            self.output += (str(dist) + ' mm to ' + str(handle2.id) + ' (' +
                            otherEnd + ') ' + 'ok |' + ' ')
Beispiel #4
0
    def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
        """Check one end of the handle against kmer collections and other handles.

        The end sequence (ENDSEQ) is the last self.kmerLength bases of
        self.sequence, or revcomp() of the first self.kmerLength bases when
        fiveprime is true.  It is compared, in this order, with

        1. the kmers of the handle itself (passed through revcomp() unless
           fiveprime is true), kmers and revcompkmers for a perfect match,
        2. the same three dictionaries allowing mismatches,
        3. the last self.kmerLength bases of every handle in handles.

        kmers, revcompkmers -- dictionaries mapping kmer -> list of hit names.
        handles             -- iterable of objects with .sequence and .id.
        fiveprime           -- check the start of the sequence instead of the end.

        The method always returns None.  self.output is overwritten with a
        textual report, and self.resonFordeath is set as soon as a too close
        match is found (the method returns right after that).

        Raises AssertionError when ENDSEQ or a dictionary key does not have
        exactly self.kmerLength characters.
        """

        from seqdata import revcomp
        from misc import hamming_distance

        kmerLength = self.kmerLength

        # create dictionary of the kmers in the handle sequence
        revcompself = {}
        for start in range(len(self.sequence) - kmerLength + 1):
            kmer = self.sequence[start:start + kmerLength]
            if not fiveprime:
                kmer = revcomp(kmer)
            revcompself.setdefault(kmer, []).append('revcomp self')

        # set the sequence we are checking and whether we are looking at the
        # three prime or five prime end of the sequence
        if fiveprime:
            ENDSEQ = revcomp(self.sequence[:kmerLength])
            endName = 'first '
        else:
            ENDSEQ = self.sequence[-kmerLength:]
            endName = 'last '
        assert len(ENDSEQ) == kmerLength, 'Error: the script is trying to check wrong number of end bases\n'

        # give some info for the output
        self.output = ('\ngenereated handle#' + str(self.id) + ' ' + 'check ' +
                       endName + str(kmerLength) + '=' + ENDSEQ + ':=> ')

        #
        # check if 3'/5' of the handle match any sequence in kmers or revcomp-kmers
        #

        # Look for perfect matches of the end sequence to kmer dictionaries
        for dictionary, name in [(revcompself, 'self-rc '), (kmers, ''),
                                 (revcompkmers, 'rc ')]:
            if ENDSEQ in dictionary:
                self.output += (endName + str(kmerLength) + ' (' + ENDSEQ +
                                ') perfect ' + name + 'match to ' + ' ' +
                                str(dictionary[ENDSEQ]))
                self.resonFordeath = name + 'kmer match'
                return

        # Look for matches with mismatches.  Each dictionary is scanned once;
        # scanning revcompself a second time could never find anything the
        # first scan did not.
        # NOTE(review): 'self-rc' has no trailing space here (unlike the
        # perfect-match pass), so the reason reads 'self-rckmer match'.  Kept
        # unchanged in case other code depends on the exact string.
        for dictionary, name in [(revcompself, 'self-rc'), (kmers, ''),
                                 (revcompkmers, 'rc ')]:
            # iterate over the keys so this works without dict.iteritems()
            for kmer in dictionary:
                hits = dictionary[kmer]
                assert len(kmer) == kmerLength, 'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
                if 'N' in kmer:
                    continue

                # check for distance of full kmer to kmer dictionaries
                distFull = hamming_distance(ENDSEQ, kmer)
                if distFull < self.minHD:
                    self.output += (str(distFull) + ' mm to ' + str(hits) +
                                    ' (' + kmer + ') too close,')
                    self.resonFordeath = name + 'kmer match'
                    return

                # borderline distance: fewer than three mismatches within the
                # last five bases still counts as too close
                if distFull < self.minHD + 1:
                    distLastFive = hamming_distance(ENDSEQ[-5:], kmer[-5:])
                    if distLastFive < 3:
                        self.output += (str(distLastFive) +
                                        ' mm in last5 to ' + str(hits) +
                                        ' (' + kmer + ') too close,')
                        self.resonFordeath = name + 'kmer match  in last5 '
                        return

        #
        # check if 3' bases in handle matches any other 3' in other handles
        #
        # NOTE(review): with fiveprime=True ENDSEQ is still compared with the
        # last kmerLength bases of the other handles -- confirm this is intended.
        for handle2 in handles:
            otherEnd = handle2.sequence[-kmerLength:]
            dist = hamming_distance(ENDSEQ, otherEnd)
            if dist < self.minHD:
                self.output += (str(dist) + ' mm to ' + str(handle2.id) +
                                '(' + otherEnd + ')' + ' too close,')
                self.resonFordeath = '3 prime ends match'
                return
            self.output += (str(dist) + ' mm to ' + str(handle2.id) + ' (' +
                            otherEnd + ') ' + 'ok |' + ' ')