Example #1
0
def find_kmer_locations(sequence: str, kmer: str,
                        options: Options = Options()) -> List[int]:
    """Return the indexes of the windows of ``sequence`` that match ``kmer``.

    A window matches when it equals ``kmer`` or is one of the k-mers returned
    by ``find_all_dna_kmers_within_hamming_distance`` for ``kmer`` and
    ``options.hamming_distance``. When ``options.reverse_complement`` is
    truthy, the k-mers returned for the reverse complement of ``kmer`` (same
    distance) are accepted as well.

    Args:
        sequence: Text to scan with ``slide_window``.
        kmer: Pattern to look for; its length is the window size.
        options: Supplies ``hamming_distance`` and ``reverse_complement``.
            The default instance is created once, when the function is
            defined; this function only reads from it.

    Returns:
        The index that ``slide_window`` yields alongside each matching
        window, in the order the windows are yielded.
    """
    # Gather every k-mer that counts as a hit. set.update consumes the
    # iterable directly, instead of building a throwaway list of None values
    # from a comprehension that is run only for its side effect.
    test_kmers = {kmer}
    test_kmers.update(
        find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance))
    if options.reverse_complement:
        rc_kmer = reverse_complement(kmer)
        test_kmers.update(
            find_all_dna_kmers_within_hamming_distance(
                rc_kmer, options.hamming_distance))

    # Slide over the sequence's k-mers and keep the index of every hit.
    k = len(kmer)
    return [
        i for seq_kmer, i in slide_window(sequence, k)
        if seq_kmer in test_kmers
    ]
Example #2
0
 def neighborhood(kmer: str) -> Set[str]:
     """Return the set of k-mers considered equivalent to ``kmer``.

     The result always contains the k-mers returned by
     ``find_all_dna_kmers_within_hamming_distance`` for ``kmer`` and
     ``options.hamming_distance``. When ``options.reverse_complement`` is
     truthy, the k-mers returned for the reverse complement of ``kmer`` are
     added to that set.

     ``options`` is not a parameter; it is read from the surrounding scope,
     which is not visible here.
     """
     # Copy into a fresh set so the helper's return value is never mutated.
     neighbourhood = set(find_all_dna_kmers_within_hamming_distance(
         kmer, options.hamming_distance))
     if options.reverse_complement:
         kmer_rc = reverse_complement(kmer)
         # Merge instead of overwrite: previously the forward neighbourhood
         # was discarded whenever reverse complements were enabled.
         neighbourhood.update(find_all_dna_kmers_within_hamming_distance(
             kmer_rc, options.hamming_distance))
     return neighbourhood
Example #3
0
def kmer_frequency_with_mismatches_and_reverse_complements(
        data: str, k: int, min_hamming_dist: int) -> Counter[str]:
    """Tally approximate k-mer occurrences on both strands of ``data``.

    For every length-``k`` slice of ``data``, each k-mer returned by
    ``find_all_dna_kmers_within_hamming_distance`` for the slice is
    incremented by one, and then the same is done for the slice's reverse
    complement.

    Args:
        data: Text to scan.
        k: Length of the slices taken from ``data``.
        min_hamming_dist: Passed unchanged as the distance argument of
            ``find_all_dna_kmers_within_hamming_distance``.

    Returns:
        A counter mapping each k-mer to its tally. It is empty when ``data``
        is shorter than ``k``.
    """
    tally = Counter()
    last_start = len(data) - k
    for start in range(last_start + 1):
        window = data[start:start + k]

        # Forward strand.
        forward_hits = find_all_dna_kmers_within_hamming_distance(
            window, min_hamming_dist)
        tally.update(forward_hits)

        # Reverse-complement strand.
        window_rc = reverse_complement(window)
        reverse_hits = find_all_dna_kmers_within_hamming_distance(
            window_rc, min_hamming_dist)
        tally.update(reverse_hits)
    return tally
Example #4
0
def motif_enumeration(
    dnas: List[str],  # dna strings to search in for motif
    k: int,  # k-mer length
    max_mismatches: int  # max num of mismatches for motif (hamming dist)
) -> Set[str]:
    """Return the candidate k-mers that occur, approximately, in every string.

    Candidates are the union of the k-mers returned by
    ``find_all_dna_kmers_within_hamming_distance`` for every window that
    ``slide_window`` yields over each string in ``dnas``. A candidate is kept
    when each string has at least one window whose ``hamming_distance`` to
    the candidate is at most ``max_mismatches``.
    """
    # Step 1: gather every candidate from the neighbourhood of each window.
    candidates = set()
    for dna in dnas:
        for window, _ in slide_window(dna, k):
            candidates.update(
                find_all_dna_kmers_within_hamming_distance(
                    window, max_mismatches))

    # Step 2: keep the candidates that are matched in all of the strings.
    motifs = set()
    total = len(dnas)
    for candidate in candidates:
        # any() stops at the first matching window of a string.
        matched = sum(
            any(
                hamming_distance(candidate, window) <= max_mismatches
                for window, _ in slide_window(dna, k)
            )
            for dna in dnas
        )
        if matched == total:
            motifs.add(candidate)
    return motifs
Example #5
0
def count_kmers(data: str, k: int,
                options: Options = Options()) -> Counter[str]:
    """Tally approximate k-mer occurrences over the windows of ``data``.

    For each window yielded by ``slide_window(data, k)``, every k-mer
    returned by ``find_all_dna_kmers_within_hamming_distance`` for that
    window and ``options.hamming_distance`` is incremented by one. When
    ``options.reverse_complement`` is truthy, the same is done for the
    window's reverse complement.

    Returns:
        A counter mapping each k-mer to its tally.
    """
    tally = Counter()
    for window, _ in slide_window(data, k):
        # Forward strand is always counted.
        tally.update(
            find_all_dna_kmers_within_hamming_distance(
                window, options.hamming_distance))

        if not options.reverse_complement:
            continue

        # The reverse-complement strand is counted only on request.
        tally.update(
            find_all_dna_kmers_within_hamming_distance(
                reverse_complement(window), options.hamming_distance))

    return tally
import hashlib
import textwrap

import matplotlib.pyplot as plt

from CountASequencesKmersWithMismatchesAndReverseComplement import \
    kmer_frequency_with_mismatches_and_reverse_complements
from FindAllDnaKmersWithinHammingDistance import find_all_dna_kmers_within_hamming_distance
from GCSkew import gc_skew

# Load the input file: its first line is the k-mer and its second line is the
# Hamming distance.
with open('/home/user/Downloads/dataset_240229_4.txt',
          mode='r',
          encoding='utf-8') as f:
    data = f.read()

lines = data.split('\n')
kmer, hamming_dist = lines[0], int(lines[1])

# Print the k-mers returned for the input, separated by single spaces.
kmer_variations = find_all_dna_kmers_within_hamming_distance(
    kmer,
    hamming_dist,
)
print(" ".join(kmer_variations))