def profile_most_probable_kmer(self, profile, dna=None):
    """Return the k-mer of ``dna`` that is most probable under ``profile``.

    Row 0 of ``profile`` is read for 'A', row 1 for 'C', row 2 for 'G' and
    row 3 for 'T'; column ``j`` is the probability of that letter at
    position ``j`` of a k-mer. ``k`` is taken from ``len(profile[0])``.

    If any row contains a 0, a pseudo-count of 0.2 is added to every entry
    before scoring. The adjustment is applied to a private copy, so the
    caller's matrix is never modified.

    :param profile: sequence of rows of per-position probabilities.
    :param dna: optional DNA sequence; defaults to ``self.DNA``. It is
        upper-cased before being split into k-mers.
    :returns: the first k-mer with the highest probability.
    :raises ValueError: if ``ngram`` produces no k-mers (``max`` of an
        empty list).
    """
    if dna is None:
        dna = self.DNA

    if any(0 in row for row in profile):
        print('profile contains 0. changing it...')
        # Smooth a copy rather than the argument: the caller may reuse
        # the matrix and should not see it silently altered.
        profile = [[value + 0.2 for value in row] for row in profile]

    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    def probability(kmer):
        """Multiply the profile entries selected by each letter of kmer."""
        result = 1
        for column, letter in enumerate(kmer):
            row = row_of.get(letter)
            if row is None:
                # A symbol the profile has no row for (e.g. 'N') cannot be
                # generated by it; leaving the product untouched would
                # make such k-mers outrank every real candidate.
                return 0
            result *= profile[row][column]
        return result

    # NOTE(review): ngram is defined elsewhere; presumably it returns the
    # list of consecutive windows of the given length - confirm.
    kmers = ngram(dna.upper(), len(profile[0]))
    scores = [probability(kmer) for kmer in kmers]
    # list.index returns the first maximum, so ties go to the earliest k-mer.
    return kmers[scores.index(max(scores))]
def approximate_matched_pattern(self, pattern, max_mismatch, dna=None):
    """Find where ``pattern`` occurs in ``dna`` with limited mismatches.

    The sequence is split with ``ngram`` into pieces of ``len(pattern)``
    and each piece is compared with ``pattern`` via ``hamming_distance``.

    :param pattern: the pattern to search for.
    :param max_mismatch: largest distance still counted as a match.
    :param dna: optional DNA sequence; defaults to ``self.DNA``.
    :returns: list of indexes, into the ``ngram`` output, of every piece
        whose distance to ``pattern`` is at most ``max_mismatch``.
        (Presumably these equal start positions in ``dna`` - verify
        against ``ngram``.)
    """
    sequence = self.DNA if dna is None else dna

    matches = []
    for position, window in enumerate(ngram(sequence, len(pattern))):
        if hamming_distance(window, pattern) <= max_mismatch:
            matches.append(position)
    return matches
def most_frequent_k_mer(self, k):
    """Return the most frequent k-mers of ``self.DNA``.

    :param k: length of the k-mers, passed through to ``ngram``.
    :returns: list of every distinct k-mer whose number of occurrences in
        the ``ngram`` output equals the maximum; an empty list when
        ``ngram`` produces no k-mers. The order of the list is not part
        of the contract.
    """
    # Imported locally so the method does not depend on the file's import block.
    from collections import Counter

    # One pass over the k-mers instead of a list.count() scan per distinct
    # k-mer, which was quadratic in the number of k-mers.
    counts = Counter(ngram(self.DNA, k))
    if not counts:
        # Nothing to rank; max() would raise on an empty sequence.
        return []

    highest = max(counts.values())
    return [kmer for kmer, count in counts.items() if count == highest]
def most_frequent_k_mer_with_mismatch(self, k, max_mismatch):
    """Return the most frequent k-mers of ``self.DNA`` allowing mismatches.

    Only k-mers produced by ``ngram(self.DNA, k)`` are considered as
    candidates. Each candidate is scored by the number of approximate
    matches reported by ``approximate_matched_pattern``.

    :param k: length of the k-mers.
    :param max_mismatch: maximum number of mismatches per occurrence.
    :returns: list of the candidate k-mers with the highest score; an
        empty list when there are no candidates. The order of the list is
        not part of the contract.
    """
    k_mers = list(set(ngram(self.DNA, k)))
    if not k_mers:
        # No candidates; max() would raise on an empty sequence.
        return []

    frequency_counts = [
        len(self.approximate_matched_pattern(kmer, max_mismatch))
        for kmer in k_mers
    ]
    highest = max(frequency_counts)
    return [
        kmer
        for kmer, count in zip(k_mers, frequency_counts)
        if count == highest
    ]
def most_frequent_k_mer_with_mismatch_and_complements(
        self, k, max_mismatch):
    """Return the most frequent k-mers, counting reverse complements too.

    Only k-mers produced by ``ngram(self.DNA, k)`` are considered as
    candidates. A candidate's score is the number of approximate matches
    of the k-mer itself plus the number of approximate matches of
    ``self.reverse_complement(dna=kmer)``, both as reported by
    ``approximate_matched_pattern``.

    :param k: length of the k-mers.
    :param max_mismatch: maximum number of mismatches per occurrence.
    :returns: list of the candidate k-mers with the highest score; an
        empty list when there are no candidates. The order of the list is
        not part of the contract.
    """
    k_mers = list(set(ngram(self.DNA, k)))
    if not k_mers:
        # No candidates; max() would raise on an empty sequence.
        return []

    frequency_counts = []
    for kmer in k_mers:
        forward = self.approximate_matched_pattern(kmer, max_mismatch)
        reverse = self.approximate_matched_pattern(
            self.reverse_complement(dna=kmer), max_mismatch)
        frequency_counts.append(len(forward) + len(reverse))

    highest = max(frequency_counts)
    return [
        kmer
        for kmer, count in zip(k_mers, frequency_counts)
        if count == highest
    ]
def lt_clump(self, k, l, t, dna=None):
    """Find k-mers that form an (L, t)-clump in the DNA sequence.

    A k-mer is reported when some window of length ``l`` of ``dna`` makes
    ``self.count_k_mer`` return at least ``t`` for it.

    :param k: length of kmers
    :param l: length L of the window
    :param t: minimum number of kmers in L
    :param dna: optional DNA sequence; defaults to ``self.DNA``
    :return: list of kmers for which an L-t clump is present in the dna
        sequence, each listed once. No windows exist when ``l`` exceeds
        ``len(dna)``, so the result is then empty.
    """
    if dna is None:
        dna = self.DNA

    last_start = len(dna) - l
    clumps = []
    for kmer in set(ngram(dna, k)):
        for start in range(last_start + 1):
            # Windows that do not begin with the k-mer's first letter are
            # skipped as a shortcut: a clump found there would also be
            # found in the window starting at its first occurrence.
            # That later window may not exist near the end of the
            # sequence, so the final window is always examined.
            # NOTE(review): this assumes count_k_mer counts occurrences
            # lying fully inside the string it is given - confirm.
            if start == last_start or dna[start] == kmer[0]:
                if self.count_k_mer(kmer, dna=dna[start:start + l]) >= t:
                    clumps.append(kmer)
                    break  # one qualifying window is enough for this kmer
    return clumps