def scan_for_repeating_kmers_in_clusters(
        sequence: str,
        k: int,
        min_occurrence_in_cluster: int,
        cluster_window_size: int,
        options: Options = Options()) -> Set[KmerCluster]:
    """Scan a sequence for k-mers that repeat within a sliding window.

    A window of ``cluster_window_size`` characters is slid across
    ``sequence``. For every window position, each k-mer (together with its
    neighbourhood, see below) that is present at
    ``min_occurrence_in_cluster`` or more distinct locations inside the
    window is recorded as a ``KmerCluster(kmer, start_index,
    occurrence_count)``, where ``start_index`` is the smallest location of
    that k-mer currently inside the window.

    The neighbourhood of a k-mer is every DNA k-mer within
    ``options.hamming_distance`` of it. When ``options.reverse_complement``
    is set, the neighbourhood of the k-mer's reverse complement is included
    as well (in addition to, not instead of, the forward neighbourhood),
    matching what ``count_kmers`` does.

    :param sequence: text to scan.
    :param k: length of the k-mers to look for.
    :param min_occurrence_in_cluster: minimum number of locations inside a
        single window for a k-mer to be reported.
    :param cluster_window_size: width of the sliding window.
    :param options: mismatch / reverse-complement settings.
    :return: set of every cluster seen at any window position.
    """
    # No k-mer can fit inside a window that is narrower than k.
    if k > cluster_window_size:
        return set()

    def neighborhood(kmer: str) -> Set[str]:
        # Forward-strand neighbourhood.
        neighbourhood = set(
            find_all_dna_kmers_within_hamming_distance(
                kmer, options.hamming_distance))
        if options.reverse_complement:
            # Add (rather than replace with) the reverse-strand neighbourhood.
            kmer_rc = reverse_complement(kmer)
            neighbourhood |= set(
                find_all_dna_kmers_within_hamming_distance(
                    kmer_rc, options.hamming_distance))
        return neighbourhood

    # Maps k-mer -> set of locations (indices into `sequence`) that are
    # currently inside the window.
    kmer_counter = {}

    def add_kmer(kmer: str, loc: int) -> None:
        kmer_counter.setdefault(kmer, set()).add(loc)

    def remove_kmer(kmer: str, loc: int) -> None:
        locs = kmer_counter[kmer]
        locs.remove(loc)
        if not locs:
            del kmer_counter[kmer]

    clustered_kmers = set()

    old_first_kmer = None
    for window, window_idx in slide_window(sequence, cluster_window_size):
        first_kmer = window[0:k]
        last_kmer = window[-k:]
        if window_idx == 0:
            # If first iteration, add all kmers
            for kmer, kmer_idx in slide_window(window, k):
                for alt_kmer in neighborhood(kmer):
                    add_kmer(alt_kmer, window_idx + kmer_idx)
        else:
            # Add kmer that was walked in to
            new_loc = window_idx + cluster_window_size - k
            for new_last_kmer in neighborhood(last_kmer):
                add_kmer(new_last_kmer, new_loc)
        # Remove kmer that was walked out of
        if old_first_kmer is not None:
            for alt_kmer in neighborhood(old_first_kmer):
                remove_kmer(alt_kmer, window_idx - 1)
        old_first_kmer = first_kmer

        # Find clusters within window -- tuple is k-mer, start_idx,
        # occurrence_count
        for found_kmer, locs in kmer_counter.items():
            if len(locs) >= min_occurrence_in_cluster:
                clustered_kmers.add(
                    KmerCluster(found_kmer, min(locs), len(locs)))

    return clustered_kmers
def top_repeating_kmers(data: str, k: int, options: Options = Options()) -> Set[str]:
    """Return the k-mers with the highest count in ``data``.

    Counting is delegated to ``count_kmers``, so the mismatch and
    reverse-complement settings in ``options`` apply.

    NOTE: despite the ``Set[str]`` annotation, each member of the returned
    set is a ``(kmer, count)`` tuple; callers unpack it as such.

    :param data: text to search.
    :param k: length of the k-mers to count.
    :param options: settings forwarded to ``count_kmers``.
    :return: set of ``(kmer, count)`` pairs that share the maximum count;
        empty when no k-mer was counted at all.
    """
    counts = count_kmers(data, k, options)
    if not counts:
        # Nothing was counted (e.g. data shorter than k) -- there is no
        # "top" entry to look up.
        return set()
    top_count = max(counts.values())
    return {
        (kmer, count)
        for kmer, count in counts.items()
        if count == top_count
    }
def main():
    """Read an xz-compressed FASTA path from stdin and report on its GC skew.

    Prints the skew, saves a plot of it under ``/output``, takes the position
    of the minimum skew as the vicinity of the ori, and then scans the 500
    characters on either side of that position for clustered repeating
    9-mers (1 mismatch allowed, reverse complements included).
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        filepath = input()
        with lzma.open(filepath, mode='rt', encoding='utf-8') as f:
            lines = f.read().splitlines()
        # Drop FASTA header lines and join the remainder into one sequence.
        seq = ''.join(line for line in lines if not line.startswith('>'))
        print(
            f'Calculating skew for: {textwrap.shorten(seq, width=15, placeholder="...")}\n'
        )
        skew = gc_skew(seq)
        print(
            f'Result: {textwrap.shorten(str(skew), width=15, placeholder="...")}\n'
        )
        # Name the plot after the sequence content.
        plot_filename = 'skew_' + hashlib.md5(
            seq.encode()).hexdigest() + '.png'
        plt.plot(skew)
        plt.ylabel(f'{filepath} skew')
        plt.savefig(f'/output/{plot_filename}')
        ori_vicinity = skew.index(min(skew))
        print(f'![GC Skew Plot]({plot_filename})\n')
        print(f'Ori vicinity (min pos): {ori_vicinity}\n')
        k = 9
        min_occurrence_in_cluster = 3
        cluster_window_size = 500
        hamming_distance = 1
        reverse_comp = True
        # Clamp the start at 0: a negative start would be interpreted by the
        # slice as an offset from the END of the sequence and yield the wrong
        # (typically empty) region when the minimum is near the beginning.
        vicinity_start = max(0, ori_vicinity - 500)
        ori_vicinity_seq = seq[vicinity_start:ori_vicinity + 500]
        scan_res = scan_for_repeating_kmers_in_clusters(
            ori_vicinity_seq, k, min_occurrence_in_cluster,
            cluster_window_size, Options(hamming_distance, reverse_comp))
        scan_res = sorted(
            scan_res,
            key=lambda found: (found.start_index, found.occurrence_count))
        print(
            f'In the ori vicinity, found clusters of k={k} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {textwrap.shorten(ori_vicinity_seq, width=15, placeholder="...")} at...'
        )
        for found in scan_res:
            print(f' * {found}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def main():
    """Read cluster-search parameters from stdin and print the clusters found.

    Expects six lines on stdin, in order: the sequence, the k-mer, the
    minimum occurrences per cluster, the cluster window size, the hamming
    distance, and the reverse-complement flag.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        kmer = input()
        min_occurrence_in_cluster = int(input())
        cluster_window_size = int(input())
        hamming_distance = int(input())
        # bool() on a string is True for ANY non-empty text, including
        # "False". Treat the usual false spellings (and blank input) as
        # False; everything else stays True as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', '0', 'no', 'n', 'f')
        idxes = find_kmer_clusters(seq, kmer, min_occurrence_in_cluster,
                                   cluster_window_size,
                                   Options(hamming_distance, reverse_comp))
        print(f'Found clusters of {kmer} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {seq} at index {idxes}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def main():
    """Read k-mer counting parameters from stdin and print the top k-mers.

    Expects four lines on stdin, in order: the sequence, k, the hamming
    distance, and the reverse-complement flag.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        k = int(input())
        hamming_distance = int(input())
        # bool() on a string is True for ANY non-empty text, including
        # "False". Treat the usual false spellings (and blank input) as
        # False; everything else stays True as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', '0', 'no', 'n', 'f')
        top_kmers = top_repeating_kmers(
            seq, k, Options(hamming_distance, reverse_comp))
        print(f'Top {k}-mer frequencies for {seq}:')
        for key, value in top_kmers:
            print(f' * {key} = {value} occurrences')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def count_kmers(data: str, k: int, options: Options = Options()) -> Counter[str]:
    """Tally approximate occurrences of every k-mer in ``data``.

    Each k-mer window of ``data`` contributes one count to every DNA k-mer
    within ``options.hamming_distance`` of it. If
    ``options.reverse_complement`` is set, the window's reverse complement
    contributes in the same way.

    :param data: text to slide the k-wide window over.
    :param k: window (k-mer) length.
    :param options: mismatch / reverse-complement settings.
    :return: counter keyed by k-mer.
    """
    tally = Counter()
    for window, _ in slide_window(data, k):
        # Forward strand first, then (optionally) the reverse strand.
        strands = [window]
        if options.reverse_complement:
            strands.append(reverse_complement(window))
        for strand in strands:
            nearby = find_all_dna_kmers_within_hamming_distance(
                strand, options.hamming_distance)
            for candidate in nearby:
                tally[candidate] += 1
    return tally
def main():
    """Read cluster-scan parameters from stdin and print the clusters found.

    Expects six lines on stdin, in order: the sequence, k, the minimum
    occurrences per cluster, the cluster window size, the hamming distance,
    and the reverse-complement flag.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        k = int(input())
        min_occurrence_in_cluster = int(input())
        cluster_window_size = int(input())
        hamming_distance = int(input())
        # bool() on a string is True for ANY non-empty text, including
        # "False". Treat the usual false spellings (and blank input) as
        # False; everything else stays True as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', '0', 'no', 'n', 'f')
        scan_res = scan_for_repeating_kmers_in_clusters(
            seq, k, min_occurrence_in_cluster, cluster_window_size,
            Options(hamming_distance, reverse_comp))
        scan_res = sorted(
            scan_res,
            key=lambda found: (found.start_index, found.occurrence_count))
        print(
            f'Found clusters of k={k} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {seq} at...'
        )
        for found in scan_res:
            print(f' * {found}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def find_kmer_clusters(sequence: str,
                       kmer: str,
                       min_occurrence_in_cluster: int,
                       cluster_window_size: int,
                       options: Options = Options()) -> List[int]:
    """Find where occurrences of ``kmer`` clump together in ``sequence``.

    The locations reported by ``find_kmer_locations`` are scanned left to
    right. Starting from a location, every following location less than
    ``cluster_window_size`` away belongs to the same candidate cluster. A
    candidate with at least ``min_occurrence_in_cluster`` members is
    reported by its first location and the scan resumes after its last
    member, so reported clusters never share an occurrence. Otherwise the
    scan retries from the very next location.

    Every candidate is evaluated, including the one that runs up to the last
    occurrence, and a failed candidate does not cause later start positions
    to be skipped.

    :param sequence: text to search.
    :param kmer: k-mer to look for.
    :param min_occurrence_in_cluster: minimum occurrences for a cluster.
    :param cluster_window_size: maximum spread (exclusive) between the first
        and last occurrence start within one cluster.
    :param options: settings forwarded to ``find_kmer_locations``.
    :return: start location of each cluster found, in scan order.
    """
    # assumes find_kmer_locations returns locations in ascending order --
    # TODO confirm
    locs = find_kmer_locations(sequence, kmer, options)

    cluster_locs = []
    start_i = 0
    while start_i < len(locs):
        # Extend end_i past every location that is within the window that
        # begins at locs[start_i].
        end_i = start_i + 1
        while (end_i < len(locs)
               and locs[end_i] - locs[start_i] < cluster_window_size):
            end_i += 1
        occurrence_count = end_i - start_i
        if occurrence_count >= min_occurrence_in_cluster:
            cluster_locs.append(locs[start_i])
            start_i = end_i  # continue after this cluster
        else:
            start_i += 1  # a cluster may still begin at the next location
    return cluster_locs