Beispiel #1
0
    def profile_most_probable_kmer(self, profile, dna=None):
        """Return the profile-most-probable k-mer of a DNA sequence.

        Every k-mer produced by ``ngram`` for the upper-cased sequence (k is
        the number of columns of ``profile``) is scored as the product of the
        profile entries selected by its letters; the first k-mer with the
        highest score is returned.

        If any row of ``profile`` contains a 0, a pseudo-count of 0.2 is
        added to every entry before scoring. The adjustment is applied to a
        private copy, so the caller's matrix is never modified.

        :param profile: matrix whose rows 0, 1, 2 and 3 hold the per-position
            values for 'A', 'C', 'G' and 'T' respectively.
        :param dna: optional DNA sequence; defaults to ``self.DNA``.
        :returns: the highest-scoring k-mer.
        :raises ValueError: if ``ngram`` yields no k-mers (``max`` of an
            empty list).
        """
        if dna is None:
            dna = self.DNA

        if any(0 in row for row in profile):
            print('profile contains 0. changing it...')
            # Smooth a copy instead of mutating the argument in place.
            profile = [[value + 0.2 for value in row] for row in profile]

        row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        kmers = ngram(dna.upper(), len(profile[0]))

        scores = []
        for kmer in kmers:
            score = 1
            for position, letter in enumerate(kmer):
                row = row_of.get(letter)
                # Letters other than A/C/G/T leave the score unchanged.
                if row is not None:
                    score *= profile[row][position]
            scores.append(score)

        # index(max(...)) keeps the first k-mer when several share the top score.
        return kmers[scores.index(max(scores))]
Beispiel #2
0
    def approximate_matched_pattern(self, pattern, max_mismatch, dna=None):
        """Locate approximate occurrences of ``pattern`` in a DNA sequence.

        :param pattern: pattern to search for.
        :param max_mismatch: largest Hamming distance still accepted as a match.
        :param dna: optional DNA sequence; defaults to ``self.DNA``.
        :returns: list of positions within ``ngram(dna, len(pattern))`` whose
            k-mer is within ``max_mismatch`` of ``pattern``.
        """
        sequence = self.DNA if dna is None else dna

        matches = []
        for start, window in enumerate(ngram(sequence, len(pattern))):
            if hamming_distance(window, pattern) <= max_mismatch:
                matches.append(start)
        return matches
Beispiel #3
0
 def most_frequent_k_mer(self, k):
     """Return the most frequent k-mers of ``self.DNA``.

     :param k: length of the k-mers to count.
     :returns: list of every k-mer that occurs the maximum number of times,
         ordered by first occurrence in the output of ``ngram`` (Python
         3.7+); an empty list when ``ngram`` yields no k-mers.
     """
     from collections import Counter

     # Single counting pass instead of one list.count() scan per distinct k-mer.
     counts = Counter(ngram(self.DNA, k))
     if not counts:
         # Nothing to rank (e.g. the sequence is shorter than k).
         return []

     highest = max(counts.values())
     return [kmer for kmer, count in counts.items() if count == highest]
Beispiel #4
0
    def most_frequent_k_mer_with_mismatch(self, k, max_mismatch):
        """Return the most frequent k-mers when mismatches are tolerated.

        Only k-mers that actually occur in ``self.DNA`` are considered as
        candidates.

        :param k: length of the k-mers.
        :param max_mismatch: maximum number of mismatches per occurrence.
        :returns: list of the candidate k-mers with the highest number of
            approximate occurrences.
        """
        candidates = list(set(ngram(self.DNA, k)))

        hits = []
        for candidate in candidates:
            positions = self.approximate_matched_pattern(candidate, max_mismatch)
            hits.append(len(positions))

        top = max(hits)
        return [
            candidate for candidate, n_hits in zip(candidates, hits)
            if n_hits == top
        ]
Beispiel #5
0
    def most_frequent_k_mer_with_mismatch_and_complements(
            self, k, max_mismatch):
        """Return the most frequent k-mers, counting reverse complements too.

        For every distinct k-mer of ``self.DNA`` the score is the number of
        approximate occurrences of the k-mer plus the number of approximate
        occurrences of its reverse complement.

        :param k: length of the k-mers.
        :param max_mismatch: maximum number of mismatches per occurrence.
        :returns: list of the candidate k-mers with the highest combined score.
        """

        def occurrences(pattern):
            # Number of approximate matches of ``pattern`` in self.DNA.
            return len(self.approximate_matched_pattern(pattern, max_mismatch))

        candidates = list(set(ngram(self.DNA, k)))

        totals = []
        for candidate in candidates:
            forward = occurrences(candidate)
            backward = occurrences(self.reverse_complement(dna=candidate))
            totals.append(forward + backward)

        top = max(totals)
        return [
            candidate for candidate, total in zip(candidates, totals)
            if total == top
        ]
Beispiel #6
0
 def lt_clump(self, k, l, t, dna=None):
     """
     Find the k-mers that form an (L, t)-clump.

     :param k: length of the k-mers
     :param l: window length L
     :param t: minimum count of a k-mer inside one window
     :param dna: optional DNA sequence; defaults to ``self.DNA``
     :return: list of the distinct k-mers for which ``count_k_mer`` reports
         at least ``t`` occurrences in some window of length ``l``.
     """
     sequence = self.DNA if dna is None else dna

     def forms_clump(kmer):
         # Only windows starting with the k-mer's first letter are examined;
         # any() stops at the first window that qualifies.
         return any(
             sequence[start] == kmer[0]
             and self.count_k_mer(kmer, dna=sequence[start:start + l]) >= t
             for start in range(len(sequence) - l + 1)
         )

     distinct_kmers = list(set(ngram(sequence, k)))
     return [kmer for kmer in distinct_kmers if forms_clump(kmer)]