Ejemplo n.º 1
0
def scan_for_repeating_kmers_in_clusters(
    sequence: str,
    k: int,
    min_occurrence_in_cluster: int,
    cluster_window_size: int,
    options: Options = Options()) -> Set[KmerCluster]:
    """Find k-mers that occur repeatedly inside a sliding window of ``sequence``.

    A window of ``cluster_window_size`` characters is slid over ``sequence``.
    For each window position the function maintains, per k-mer, the set of
    sequence indices at which that k-mer (or an approximate match of it)
    starts inside the window. Whenever a k-mer has at least
    ``min_occurrence_in_cluster`` such indices, a ``KmerCluster`` is recorded.

    Args:
        sequence: Text to scan.
        k: Length of the k-mers.
        min_occurrence_in_cluster: Minimum number of occurrences within one
            window for a k-mer to be reported.
        cluster_window_size: Length of the sliding window.
        options: Supplies ``hamming_distance`` (how many mismatches still
            count as an occurrence) and ``reverse_complement`` (whether the
            reverse complement's matches are counted as well).

    Returns:
        A set of ``KmerCluster(kmer, first_index_in_window, occurrence_count)``
        collected over every window position.

    NOTE(review): the incremental add/remove below assumes ``slide_window``
    yields ``(window, index)`` pairs whose index advances by one per step,
    starting at 0 -- confirm against its definition.
    """
    def neighborhood(kmer: str) -> Set[str]:
        # All k-mers for which ``kmer`` counts as an occurrence.
        neighbourhood = set(find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance))
        if options.reverse_complement:
            # Combine with (rather than replace by) the reverse complement's
            # neighbourhood, so forward-strand matches are still counted --
            # the same thing count_kmers does.
            neighbourhood |= set(find_all_dna_kmers_within_hamming_distance(
                reverse_complement(kmer), options.hamming_distance))
        return neighbourhood

    # k-mer -> set of sequence indices (within the current window) where it occurs
    kmer_locations = {}

    def add_kmer(kmer: str, loc: int) -> None:
        kmer_locations.setdefault(kmer, set()).add(loc)

    def remove_kmer(kmer: str, loc: int) -> None:
        locs = kmer_locations[kmer]
        locs.remove(loc)
        if not locs:
            del kmer_locations[kmer]

    clustered_kmers = set()

    old_first_kmer = None
    for window, window_idx in slide_window(sequence, cluster_window_size):
        if window_idx == 0:
            # First window: register every k-mer it contains.
            for kmer, kmer_idx in slide_window(window, k):
                for alt_kmer in neighborhood(kmer):
                    add_kmer(alt_kmer, window_idx + kmer_idx)
        else:
            # Register the k-mer the window just walked in to ...
            for alt_kmer in neighborhood(window[-k:]):
                add_kmer(alt_kmer, window_idx + cluster_window_size - k)
            # ... and forget the k-mer it just walked out of.
            if old_first_kmer is not None:
                for alt_kmer in neighborhood(old_first_kmer):
                    remove_kmer(alt_kmer, window_idx - 1)

        old_first_kmer = window[0:k]

        # Record clusters visible in this window:
        # (k-mer, first index in window, occurrence count).
        for found_kmer, locs in kmer_locations.items():
            if len(locs) >= min_occurrence_in_cluster:
                clustered_kmers.add(
                    KmerCluster(found_kmer, min(locs), len(locs)))

    return clustered_kmers
Ejemplo n.º 2
0
def top_repeating_kmers(data: str, k: int,
                        options: Options = Options()) -> Set[tuple]:
    """Return the most frequent k-mers of ``data`` together with their count.

    Args:
        data: Text to scan.
        k: Length of the k-mers.
        options: Passed through unchanged to ``count_kmers``.

    Returns:
        A set of ``(kmer, count)`` tuples, one for every k-mer whose count
        equals the highest count found. The set is empty when
        ``count_kmers`` reports no k-mers at all.
    """
    counts = count_kmers(data, k, options)
    if not counts:
        # most_common(1) would be an empty list here and indexing it would
        # raise IndexError; no k-mers means no top k-mers.
        return set()

    _, top_count = counts.most_common(1)[0]

    return {
        (kmer, count)
        for kmer, count in counts.items()
        if count == top_count
    }
Ejemplo n.º 3
0
def main():
    """Read an lzma-compressed sequence file, plot its GC skew, and scan the
    region around the skew minimum for clustered repeating k-mers.

    The file path is read from stdin. Lines starting with ``>`` are dropped
    and the remaining lines are joined into a single sequence. The skew plot
    is saved under ``/output/`` with a name derived from the sequence's MD5
    digest. All output is printed between opening and closing markers, which
    are emitted even if an error occurs.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        filepath = input()
        with lzma.open(filepath, mode='rt', encoding='utf-8') as f:
            lines = f.read().splitlines()
        lines = [line for line in lines if not line.startswith('>')]
        seq = ''.join(lines)

        print(
            f'Calculating skew for: {textwrap.shorten(seq, width=15, placeholder="...")}\n'
        )

        skew = gc_skew(seq)

        print(
            f'Result: {textwrap.shorten(str(skew), width=15, placeholder="...")}\n'
        )

        # MD5 is used only to derive a stable file name from the sequence.
        plot_filename = 'skew_' + hashlib.md5(
            seq.encode()).hexdigest() + '.png'
        plt.plot(skew)
        plt.ylabel(f'{filepath} skew')
        plt.savefig(f'/output/{plot_filename}')

        ori_vicinity = skew.index(min(skew))
        print(f'![GC Skew Plot]({plot_filename})\n')
        print(f'Ori vicinity (min pos): {ori_vicinity}\n')

        k = 9
        min_occurrence_in_cluster = 3
        cluster_window_size = 500
        hamming_distance = 1
        reverse_comp = True

        # Clamp the start at 0: a negative start would be interpreted as an
        # offset from the END of the sequence and yield the wrong (usually
        # empty) slice when the minimum lies within 500 of the beginning.
        vicinity_radius = 500
        vicinity_start = max(0, ori_vicinity - vicinity_radius)
        ori_vicinity_seq = seq[vicinity_start:ori_vicinity + vicinity_radius]
        scan_res = scan_for_repeating_kmers_in_clusters(
            ori_vicinity_seq, k, min_occurrence_in_cluster,
            cluster_window_size, Options(hamming_distance, reverse_comp))
        scan_res = sorted(scan_res,
                          key=lambda found:
                          (found.start_index, found.occurrence_count))
        print(
            f'In the ori vicinity, found clusters of k={k} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {textwrap.shorten(ori_vicinity_seq, width=15, placeholder="...")} at...'
        )
        for found in scan_res:
            print(f' * {found}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 4
0
def main():
    """Read cluster-search parameters from stdin and print where clusters of
    a given k-mer start.

    Input lines, in order: sequence, k-mer, minimum occurrences per cluster,
    cluster window size, hamming distance, reverse-complement flag. All
    output is printed between opening and closing markers, which are emitted
    even if an error occurs.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        kmer = input()
        min_occurrence_in_cluster = int(input())
        cluster_window_size = int(input())
        hamming_distance = int(input())
        # bool() of any non-empty string is True, so "False" used to switch
        # the option ON. Only a blank line or an explicit negative spelling
        # disables it; every other input still enables it as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', 'f', '0', 'no', 'n', 'off')
        idxes = find_kmer_clusters(seq, kmer, min_occurrence_in_cluster, cluster_window_size,
                Options(hamming_distance, reverse_comp))
        print(f'Found clusters of {kmer} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {seq} at index {idxes}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 5
0
def main():
    """Read a sequence and k-mer settings from stdin and print the most
    frequent k-mers.

    Input lines, in order: sequence, k, hamming distance, reverse-complement
    flag. All output is printed between opening and closing markers, which
    are emitted even if an error occurs.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        k = int(input())
        hamming_distance = int(input())
        # bool() of any non-empty string is True, so "False" used to switch
        # the option ON. Only a blank line or an explicit negative spelling
        # disables it; every other input still enables it as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', 'f', '0', 'no', 'n', 'off')

        top_kmers = top_repeating_kmers(
            seq, k, Options(hamming_distance, reverse_comp))

        print(f'Top {k}-mer frequencies for {seq}:')
        # Sorted so the listing is the same on every run; iterating the set
        # directly gives an arbitrary order.
        for key, value in sorted(top_kmers):
            print(f' * {key} = {value} occurrences')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 6
0
def count_kmers(data: str, k: int,
                options: Options = Options()) -> Counter[str]:
    """Tally k-mer occurrences in ``data``, allowing approximate matches.

    For every k-mer yielded by ``slide_window(data, k)``, each member of its
    hamming-distance neighbourhood is incremented by one. When
    ``options.reverse_complement`` is set, each member of the reverse
    complement's neighbourhood is incremented as well.

    Args:
        data: Text to scan.
        k: Length of the k-mers.
        options: Supplies ``hamming_distance`` and ``reverse_complement``.

    Returns:
        A ``Counter`` mapping k-mer to its tally.
    """
    tally = Counter()

    def bump_neighbours(seed: str) -> None:
        # Credit one occurrence to every k-mer close enough to ``seed``.
        for neighbour in find_all_dna_kmers_within_hamming_distance(
                seed, options.hamming_distance):
            tally[neighbour] += 1

    for window_kmer, _ in slide_window(data, k):
        bump_neighbours(window_kmer)
        if options.reverse_complement:
            bump_neighbours(reverse_complement(window_kmer))

    return tally
Ejemplo n.º 7
0
def main():
    """Read scan parameters from stdin and print every clustered repeating
    k-mer found in the sequence.

    Input lines, in order: sequence, k, minimum occurrences per cluster,
    cluster window size, hamming distance, reverse-complement flag. Results
    are listed ordered by start index, then occurrence count. All output is
    printed between opening and closing markers, which are emitted even if
    an error occurs.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        seq = input()
        k = int(input())
        min_occurrence_in_cluster = int(input())
        cluster_window_size = int(input())
        hamming_distance = int(input())
        # bool() of any non-empty string is True, so "False" used to switch
        # the option ON. Only a blank line or an explicit negative spelling
        # disables it; every other input still enables it as before.
        reverse_comp = input().strip().lower() not in (
            '', 'false', 'f', '0', 'no', 'n', 'off')
        scan_res = scan_for_repeating_kmers_in_clusters(
            seq, k, min_occurrence_in_cluster, cluster_window_size,
            Options(hamming_distance, reverse_comp))
        scan_res = sorted(scan_res,
                          key=lambda found:
                          (found.start_index, found.occurrence_count))
        print(
            f'Found clusters of k={k} (at least {min_occurrence_in_cluster} occurrences in window of {cluster_window_size}) in {seq} at...'
        )
        for found in scan_res:
            print(f' * {found}')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 8
0
def find_kmer_clusters(sequence: str, kmer: str, min_occurrence_in_cluster: int, cluster_window_size: int, options: Options = Options()) -> List[int]:
    """Return the start locations of clusters of ``kmer`` within ``sequence``.

    The locations reported by ``find_kmer_locations`` are partitioned
    greedily into consecutive runs: a run starts at some location and absorbs
    every following location that lies less than ``cluster_window_size``
    after that start. A run with at least ``min_occurrence_in_cluster``
    members is a cluster, and its first location is reported.

    Args:
        sequence: Text to search.
        kmer: The k-mer to look for.
        min_occurrence_in_cluster: Minimum number of occurrences a run needs
            to count as a cluster.
        cluster_window_size: Maximum span (exclusive) between a run's first
            location and any other location in the run.
        options: Passed through unchanged to ``find_kmer_locations``.

    Returns:
        First location of each qualifying run, in the order encountered.

    NOTE(review): assumes ``find_kmer_locations`` returns locations in
    ascending order -- confirm. Because runs never overlap, a cluster that
    straddles the boundary between two runs is not reported.
    """
    cluster_locs = []

    locs = find_kmer_locations(sequence, kmer, options)
    if len(locs) == 0:
        return cluster_locs

    start_i = 0
    occurrence_count = 1
    for end_i in range(1, len(locs)):
        if locs[end_i] - locs[start_i] < cluster_window_size:  # still within the run's window?
            occurrence_count += 1
            continue
        if occurrence_count >= min_occurrence_in_cluster:  # did the finished run meet the min occurrence requirement?
            cluster_locs.append(locs[start_i])
        start_i = end_i
        occurrence_count = 1

    # The loop only evaluates a run when a later location falls outside it,
    # so the final run has to be evaluated here or it would be silently lost.
    if occurrence_count >= min_occurrence_in_cluster:
        cluster_locs.append(locs[start_i])

    return cluster_locs