def better_clump_finding(genome, k, t, L):
    """Return the k-mers that occur at least ``t`` times in some length-``L`` window.

    The counts for the first window come from ``computing_frequencies``; each
    later window is derived from the previous one by decrementing the k-mer
    that slides out on the left and incrementing the k-mer that slides in on
    the right.

    Args:
        genome: Text to scan.
        k: Length of the k-mers.
        t: Minimum number of occurrences inside one window.
        L: Window length.

    Returns:
        A list of patterns (as produced by ``number_to_pattern``), in
        increasing order of their numeric encoding, without duplicates.
    """
    table_size = 4**k

    # Counts for the first window, and the k-mers that already reach t there.
    window_counts = computing_frequencies(genome[0:L], k)
    in_clump = [1 if window_counts[code] >= t else 0 for code in range(table_size)]

    for start in range(1, len(genome) - L + 1):
        # k-mer that began the previous window and has now left it.
        leaving = pattern_to_number(genome[start - 1:start - 1 + k])
        window_counts[leaving] -= 1

        # k-mer that ends the current window and has just entered it.
        entering = pattern_to_number(genome[start + L - k:start + L])
        window_counts[entering] += 1

        # Only the entering k-mer's count went up, so only it can newly reach t.
        if window_counts[entering] >= t:
            in_clump[entering] = 1

    frequent_patterns = []
    for code in range(table_size):
        if in_clump[code] != 1:
            continue
        pattern = number_to_pattern(code, k)
        if pattern not in frequent_patterns:
            frequent_patterns.append(pattern)
    return frequent_patterns
def count_frequency_array(sequence, klen):
    """Count every k-mer of ``sequence`` and report the most frequent ones.

    Args:
        sequence: Text to scan; each length-``klen`` window is encoded with
            ``pattern_to_number``.
        klen: Length of the k-mers to count.

    Returns:
        A tuple ``(frequency_array, frequent_positions, frequent_patterns)``:

        * ``frequency_array`` -- list of ``4**klen`` counts, indexed by the
          value ``pattern_to_number`` returns for each k-mer.
        * ``frequent_positions`` -- indices whose count equals the maximum.
        * ``frequent_patterns`` -- those indices converted back with
          ``number_to_pattern(index, klen)``.

    Note:
        When ``sequence`` contains no complete k-mer every count is zero, so
        every index ties for the maximum and all of them are reported.
    """
    frequency_array = [0] * (4**klen)

    # A text of length n has n - klen + 1 windows; the "+ 1" keeps the last one.
    for start in range(len(sequence) - klen + 1):
        frequency_array[pattern_to_number(sequence[start:start + klen])] += 1

    max_count = max(frequency_array)
    frequent_positions = [
        index for index, count in enumerate(frequency_array) if count == max_count
    ]

    # Convert the winning indices back to patterns; the k-mer length is passed
    # explicitly, matching the other number_to_pattern call sites in the file.
    frequent_patterns = [
        number_to_pattern(index, klen) for index in frequent_positions
    ]
    return frequency_array, frequent_positions, frequent_patterns
def prob_3010_2():
    """Encode the pattern in the dataset file and write the number to ``out.txt``.

    Reads the first line of ``data/dataset_3010_2.txt``, converts it with
    ``pattern_to_number`` and writes the result as text to ``out.txt``.

    Raises:
        IndexError: If the dataset file contains no lines.
    """
    # Context managers guarantee both handles are closed, even on error.
    with open("data/dataset_3010_2.txt") as fin:
        lines = fin.read().splitlines()
    pattern = lines[0]

    # Encode before opening the output so a failure does not leave an empty
    # (truncated) out.txt behind.
    number = pattern_to_number(pattern)
    with open("out.txt", "w") as fout:
        fout.write(str(number))
def frequent_words_with_mismatches_and_revcomps(text, k, d):
    """Find the k-mers maximising approximate count plus reverse-complement count.

    For each candidate pattern the score is
    ``approximate_pattern_count(text, pattern, d)`` plus
    ``approximate_pattern_count(text, reverse_complement(pattern), d)``.

    Args:
        text: Text to scan.
        k: Length of the k-mers.
        d: Mismatch budget passed to ``neighbors`` and
            ``approximate_pattern_count``.

    Returns:
        A list of the patterns whose score equals the maximum, in increasing
        order of their numeric encoding, without duplicates.

    Note:
        When no candidate is marked (for example ``text`` is shorter than
        ``k``) every score is zero, so every pattern ties for the maximum.
    """
    table_size = 4**k
    close = [0] * table_size
    frequency_array = [0] * table_size

    # Mark every pattern that can have a non-zero score.
    for i in range(len(text) - k + 1):
        for pattern in neighbors(text[i:i + k], d):
            close[pattern_to_number(pattern)] = 1
            # A pattern's score also includes matches of its reverse
            # complement, so the reverse complement of any neighbour shares
            # that neighbour's score and must be a candidate too. Otherwise it
            # would keep a score of 0 and be missing from the result.
            close[pattern_to_number(reverse_complement(pattern))] = 1

    for i in range(table_size):
        if close[i] == 1:
            pattern = number_to_pattern(i, k)
            frequency_array[i] = approximate_pattern_count(
                text, pattern, d
            ) + approximate_pattern_count(text, reverse_complement(pattern), d)

    max_count = max(frequency_array)
    frequent_patterns = []
    for i in range(table_size):
        if frequency_array[i] == max_count:
            pattern = number_to_pattern(i, k)
            if pattern not in frequent_patterns:
                frequent_patterns.append(pattern)
    return frequent_patterns