def frequent_words_with_mismatches(dna, k, d):
    """Return the most frequent k-mers of ``dna``, allowing up to ``d`` mismatches.

    Candidate patterns are whatever ``neighbours(window, d)`` yields for each
    length-``k`` window of ``dna``. Each candidate is scored by the number of
    hits returned by ``approximate_pattern_matching(candidate, dna, d)``.

    Args:
        dna: Sequence to scan.
        k: Pattern length.
        d: Mismatch budget forwarded to ``neighbours`` and
            ``approximate_pattern_matching``.

    Returns:
        The top-scoring patterns in ascending ``PatternToNumber`` order, or an
        empty list when ``dna`` contains no window of length ``k``.
    """
    window_count = len(dna) - k + 1
    if window_count <= 0:
        # Without a single window nothing can be frequent; scoring anyway
        # would rank all 4**k patterns equally at 0 and report every one.
        return []

    # Keep only the indices that were actually produced instead of two dense
    # arrays of 4**k slots: patterns outside this set are never scored.
    candidate_indices = set()
    for start in range(window_count):
        for pattern in neighbours(dna[start:start + k], d):
            candidate_indices.add(PatternToNumber(pattern))

    # Insert in sorted order so the result comes out in ascending index order.
    frequencies = {}
    for idx in sorted(candidate_indices):
        hits = approximate_pattern_matching(NumberToPattern(idx, k), dna, d)
        frequencies[idx] = len(hits)
    if not frequencies:
        return []

    max_value = max(frequencies.values())
    return [
        NumberToPattern(idx, k)
        for idx, frequency in frequencies.items()
        if frequency == max_value
    ]
def clump_finding(dna, k, L, t):
    """Return the k-mers that reach ``t`` occurrences inside some window of ``dna``.

    The counts for the first window ``dna[0:L]`` come from
    ``computing_frequencies``. The window is then slid one position at a time:
    the k-mer leaving on the left is decremented, the k-mer entering on the
    right is incremented, and only the entering k-mer is re-checked against
    ``t`` because it is the only count that grew.

    Args:
        dna: Sequence to scan.
        k: Pattern length.
        L: Window length.
        t: Minimum number of occurrences within one window.

    Returns:
        Each qualifying pattern exactly once, in ascending ``PatternToNumber``
        order.
    """
    # All 4**k patterns need a slot; sizing this 4**k - 1 would drop the last
    # pattern and overflow when that pattern is flagged during sliding.
    pattern_count = 4 ** k
    clump_array = [0] * pattern_count

    # NOTE(review): assumes computing_frequencies returns a mutable sequence
    # of 4**k counts indexed like PatternToNumber -- confirm against its
    # definition.
    frequency_array = computing_frequencies(dna[0:L], k)

    # Flag first-window hits instead of appending them right away, so a
    # pattern that also clumps in a later window is reported only once.
    for i in range(pattern_count):
        if frequency_array[i] >= t:
            clump_array[i] = 1

    for i in range(1, len(dna) - L + 1):
        leaving = PatternToNumber(dna[i - 1:i - 1 + k])
        frequency_array[leaving] -= 1
        entering = PatternToNumber(dna[i + L - k:i + L])
        frequency_array[entering] += 1
        if frequency_array[entering] >= t:
            clump_array[entering] = 1

    return [
        NumberToPattern(i, k)
        for i in range(pattern_count)
        if clump_array[i] == 1
    ]
def finding_frequent_patterns_sorting(dna, k):
    """Return the most frequent k-mers of ``dna`` using a sort-based count.

    Every length-``k`` window is converted with ``PatternToNumber`` and the
    numbers are sorted, so equal k-mers become adjacent. A running count then
    grows along each run of equal numbers and peaks at the run's last
    element, which is where a most-frequent pattern is picked up.

    Args:
        dna: Sequence to scan.
        k: Pattern length.

    Returns:
        The most frequent patterns, each once, in ascending
        ``PatternToNumber`` order; an empty list when ``dna`` has no window
        of length ``k``.
    """
    window_count = len(dna) - k + 1
    if window_count <= 0:
        # No k-mers at all: return nothing rather than let max() fail on an
        # empty list.
        return []

    index = sorted(
        PatternToNumber(dna[start:start + k]) for start in range(window_count)
    )

    # count[i] is the length of the run of equal values ending at position i.
    count = [1] * window_count
    for i in range(1, window_count):
        if index[i] == index[i - 1]:
            count[i] = count[i - 1] + 1

    max_count = max(count)
    # Only the last element of a maximal run carries max_count, so each
    # winning pattern is emitted exactly once.
    return [
        NumberToPattern(number, k)
        for number, run_length in zip(index, count)
        if run_length == max_count
    ]
def profile_kmer(dna, k, profile):
    """Return the length-``k`` substring of ``dna`` ranked highest by ``profile``.

    Each window of ``dna`` is scored with ``pattern_probability(window,
    profile)``. Windows are ranked by the pair (probability, k-mer), so among
    equally probable k-mers the one that compares greatest as a string wins.

    Only k-mers that actually occur in ``dna`` are scored, which keeps the
    work proportional to the number of windows rather than to 4**k. Scoring
    every window also means the k-mer ranked last overall is still returned
    when it is the only one present.

    Args:
        dna: Sequence to scan. Every window is passed to
            ``pattern_probability``, so ``dna`` should contain only symbols
            that helper accepts.
        k: Pattern length.
        profile: Passed through unchanged to ``pattern_probability``.

    Returns:
        The best-ranked substring, or ``None`` when ``dna`` has no window of
        length ``k``.
    """
    window_count = len(dna) - k + 1
    if window_count <= 0:
        return None

    windows = [dna[start:start + k] for start in range(window_count)]
    # The k-mer itself is the tie-breaker, so the winner for equal
    # probabilities does not depend on where it sits in dna.
    return max(
        windows,
        key=lambda kmer: (pattern_probability(kmer, profile), kmer),
    )
def median_string(dna_list, k):
    """Return the k-mer with the smallest summed distance to ``dna_list``.

    Every pattern produced by ``NumberToPattern(i, k)`` for ``i`` in
    ``range(4**k)`` is tried in that order. A pattern's score is the sum of
    ``min_hamming_distance(pattern, seq)`` over all sequences in
    ``dna_list``, and the first pattern to reach the lowest score is kept.

    Args:
        dna_list: Iterable of sequences to score against.
        k: Pattern length.

    Returns:
        The best-scoring pattern, or ``''`` if no pattern scores below
        infinity.
    """
    candidates = [NumberToPattern(number, k) for number in range(4 ** k)]

    best_pattern = ''
    best_total = math.inf
    for candidate in candidates:
        total = sum(
            min_hamming_distance(candidate, strand) for strand in dna_list
        )
        # Strict comparison keeps the earliest candidate on ties.
        if total < best_total:
            best_total, best_pattern = total, candidate
    return best_pattern