def motif_enumeration(
        dnas: List[str],  # dna strings to search in for motif
        k: int,  # k-mer length
        max_mismatches: int  # max num of mismatches for motif (hamming dist)
) -> Set[str]:
    """Return every k-mer that occurs, with mismatches allowed, in all of ``dnas``.

    Candidates are gathered by expanding each k-mer window of each DNA string
    into its neighbourhood (all k-mers within ``max_mismatches`` of it). A
    candidate is kept when every DNA string has at least one window whose
    hamming distance to the candidate is at most ``max_mismatches``.
    """
    # Every k-mer that is close enough to something actually present in the input.
    candidates = set()
    for dna in dnas:
        for window, _ in slide_window(dna, k):
            candidates.update(
                find_all_dna_kmers_within_hamming_distance(window, max_mismatches))

    def appears_in(candidate: str, dna: str) -> bool:
        # True as soon as one window of dna is within the mismatch budget.
        return any(
            hamming_distance(candidate, window) <= max_mismatches
            for window, _ in slide_window(dna, k)
        )

    return {
        candidate
        for candidate in candidates
        if all(appears_in(candidate, dna) for dna in dnas)
    }
def scan_for_repeating_kmers_in_clusters(
        sequence: str,
        k: int,
        min_occurrence_in_cluster: int,
        cluster_window_size: int,
        options: Options = Options()) -> Set[KmerCluster]:
    """Find k-mers that occur often enough inside some window of ``sequence``.

    A window of ``cluster_window_size`` characters is slid across ``sequence``.
    For the current window, a dictionary maps each k-mer (and every k-mer in
    its neighbourhood) to the set of sequence indexes at which it was seen.
    The first window is indexed in full; afterwards only the k-mer that entered
    the window is added and the k-mer that left it is removed.

    Any k-mer with at least ``min_occurrence_in_cluster`` recorded locations in
    a window is reported as ``KmerCluster(kmer, first_location, count)``.

    The neighbourhood of a k-mer is everything within
    ``options.hamming_distance`` of it and, when ``options.reverse_complement``
    is set, also everything within that distance of its reverse complement.
    """
    def neighborhood(kmer: str) -> Set[str]:
        # Copy into a fresh set so the helper's return value is never mutated.
        neighbourhood = set(find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance))
        if options.reverse_complement:
            kmer_rc = reverse_complement(kmer)
            # Merge rather than replace: the forward neighbourhood must still
            # be counted when reverse complements are enabled.
            neighbourhood.update(find_all_dna_kmers_within_hamming_distance(
                kmer_rc, options.hamming_distance))
        return neighbourhood

    kmer_counter = {}  # kmer -> set of sequence indexes inside the current window

    def add_kmer(kmer: str, loc: int) -> None:
        kmer_counter.setdefault(kmer, set()).add(loc)

    def remove_kmer(kmer: str, loc: int) -> None:
        locations = kmer_counter[kmer]
        locations.remove(loc)
        if not locations:
            del kmer_counter[kmer]

    clustered_kmers = set()

    old_first_kmer = None
    for window, window_idx in slide_window(sequence, cluster_window_size):
        first_kmer = window[0:k]
        last_kmer = window[-k:]

        if window_idx == 0:
            # First window: index every kmer it contains.
            for kmer, kmer_idx in slide_window(window, k):
                for alt_kmer in neighborhood(kmer):
                    add_kmer(alt_kmer, window_idx + kmer_idx)
        else:
            # Add the kmer that was walked in to.
            for new_last_kmer in neighborhood(last_kmer):
                add_kmer(new_last_kmer, window_idx + cluster_window_size - k)
            # Remove the kmer that was walked out of; it started one position
            # before the current window.
            if old_first_kmer is not None:
                for alt_kmer in neighborhood(old_first_kmer):
                    remove_kmer(alt_kmer, window_idx - 1)

        old_first_kmer = first_kmer

        # Find clusters within window -- fields are k-mer, start_idx, occurrence_count
        for found_kmer, locations in kmer_counter.items():
            if len(locations) >= min_occurrence_in_cluster:
                clustered_kmers.add(
                    KmerCluster(found_kmer, min(locations), len(locations)))

    return clustered_kmers
def shatter(self: ReadPair, k: int) -> List[ReadPair]:
    """Break this read-pair into smaller read-pairs whose halves are ``k`` long.

    The head and tail are windowed in lockstep. The gap of each produced pair
    grows by the amount the halves shrank (``self.k - k``) on top of the
    current gap ``self.d``. Every produced pair records this pair as its
    source.
    """
    gap = (self.k - k) + self.d
    head_windows = slide_window(self.data.head, k)
    tail_windows = slide_window(self.data.tail, k)
    return [
        ReadPair(Kdmer(head_kmer, tail_kmer, gap), source=('shatter', [self]))
        for (head_kmer, _), (tail_kmer, _) in zip(head_windows, tail_windows)
    ]
def find_kmer_locations(sequence: str, kmer: str, options: Options = Options()) -> List[int]:
    """Return the indexes in ``sequence`` at which ``kmer`` (or a variant) starts.

    A window matches when it equals ``kmer``, is within
    ``options.hamming_distance`` of ``kmer``, or -- when
    ``options.reverse_complement`` is set -- is within that distance of the
    reverse complement of ``kmer``. Indexes are returned in the order the
    windows are produced.
    """
    # Construct the set of kmers that count as a match.
    test_kmers = {kmer}
    test_kmers.update(
        find_all_dna_kmers_within_hamming_distance(kmer, options.hamming_distance))
    if options.reverse_complement:
        rc_kmer = reverse_complement(kmer)
        test_kmers.update(
            find_all_dna_kmers_within_hamming_distance(rc_kmer, options.hamming_distance))

    # Slide over the sequence's kmers and keep the positions of those that match.
    k = len(kmer)
    return [i for seq_kmer, i in slide_window(sequence, k) if seq_kmer in test_kmers]
def from_string(text: str, k: int, instantize: bool = False):
    """Build a list of ``Read`` objects, one per ``k``-long window of ``text``.

    When ``instantize`` is true, each read's ``instance`` is the number of
    times the same k-mer was already seen earlier in ``text``; otherwise every
    read gets instance 0.
    """
    seen = Counter()
    reads = []
    for kmer, _ in slide_window(text, k):
        if instantize:
            instance = seen[kmer]
        else:
            instance = 0
        reads.append(Read(kmer, instance=instance))
        seen[kmer] += 1
    return reads
def determine_probabilities_of_all_kmers_in_dna(
        profile_matrix: Dict[str, List[float]],
        dna: str,
        k: int) -> List[float]:
    """Score every ``k``-long window of ``dna`` against ``profile_matrix``.

    Returns one probability per window, in window order, as computed by
    ``determine_probability_of_match_using_profile_matrix``.
    """
    return [
        determine_probability_of_match_using_profile_matrix(profile_matrix, kmer)
        for kmer, _ in slide_window(dna, k)
    ]
def find_most_probable_kmer_using_profile_matrix(profile: Dict[str, List[float]], dna: str):
    """Return the ``(kmer, probability)`` of the best-scoring window of ``dna``.

    The window length is the length of the first row of ``profile``. When
    several windows tie for the highest probability the earliest one wins.
    Returns ``None`` when ``dna`` yields no windows.
    """
    k = len(list(profile.values())[0])
    scored_kmers = (
        (kmer, determine_probability_of_match_using_profile_matrix(profile, kmer))
        for kmer, _ in slide_window(dna, k)
    )
    # max() only replaces its current best on a strictly greater key, so the
    # first of several equally probable kmers is the one returned.
    return max(scored_kmers, key=lambda scored: scored[1], default=None)
def distance_between_pattern_and_strings(pattern: str, dnas: List[str]) -> int:
    """Sum, over all ``dnas``, the smallest hamming distance to ``pattern``.

    For each DNA string the distance is the minimum hamming distance between
    ``pattern`` and any window of that string with the same length as
    ``pattern``. A DNA string that yields no windows contributes ``None``,
    which makes the final sum raise ``TypeError``.
    """
    k = len(pattern)
    closest_per_dna = [
        min(
            (hamming_distance(pattern, window) for window, _ in slide_window(dna, k)),
            default=None,
        )
        for dna in dnas
    ]
    return sum(closest_per_dna)
def find_peptide_encodings_in_dna(dna: str, amino_acid_seq: str) -> List[str]:
    """Return the windows of ``dna`` that encode ``amino_acid_seq``.

    Each window is three times as long as ``amino_acid_seq``. A window is kept
    when either it or its reverse complement, transcribed to RNA and read
    codon by codon, spells out ``amino_acid_seq``. The window itself (not its
    reverse complement) is what gets returned.
    """
    def encodes_peptide(rna: str) -> bool:
        amino_acids = [codon_to_amino_acid(codon) for codon in split_to_size(rna, 3)]
        # A codon that maps to None cannot be part of a matching peptide.
        return None not in amino_acids and ''.join(amino_acids) == amino_acid_seq

    window_len = len(amino_acid_seq) * 3
    encodings = []
    for kmer, _ in slide_window(dna, window_len):
        transcripts = [dna_to_rna(kmer), dna_to_rna(dna_reverse_complement(kmer))]
        if any(encodes_peptide(rna) for rna in transcripts):
            encodings.append(kmer)
    return encodings
def count_kmers(data: str, k: int, options: Options = Options()) -> Counter[str]:
    """Count k-mers in ``data``, crediting each window's whole neighbourhood.

    Every ``k``-long window adds one to each k-mer within
    ``options.hamming_distance`` of it. When ``options.reverse_complement`` is
    set, each k-mer within that distance of the window's reverse complement is
    incremented as well.
    """
    tally = Counter()
    for kmer, _ in slide_window(data, k):
        tally.update(
            find_all_dna_kmers_within_hamming_distance(kmer, options.hamming_distance))
        if options.reverse_complement:
            tally.update(
                find_all_dna_kmers_within_hamming_distance(
                    reverse_complement(kmer), options.hamming_distance))
    return tally
def greedy_motif_search(k: int, dnas: List[str]):
    """Greedily build a motif matrix with one ``k``-mer per string in ``dnas``.

    Each window of the first DNA string seeds a candidate matrix. For every
    remaining string, the k-mer most probable under the profile of the matrix
    so far is appended and the profile is rebuilt. The candidate with the
    lowest ``score_motif`` is returned; the starting point is the matrix made
    of the first k-mer of every string.
    """
    def profile_of(matrix):
        return motif_matrix_profile(motif_matrix_count(matrix))

    best = [dna[:k] for dna in dnas]
    first_dna, remaining_dnas = dnas[0], dnas[1:]
    for seed, _ in slide_window(first_dna, k):
        candidate = [seed]
        profile = profile_of(candidate)
        for dna in remaining_dnas:
            chosen, _ = find_most_probable_kmer_using_profile_matrix(profile, dna)
            # Push in the closest kmer as a motif member and recompute the
            # profile for the next string.
            candidate.append(chosen)
            profile = profile_of(candidate)
        if score_motif(candidate) < score_motif(best):
            best = candidate
    return best
# GGGGACTTCTGTCCCTAGCC
# TGGGACTTTCGGCCCTGTCC
# GGGGACCAACGCCCCTGGGA
# GGGGACCGAAGTCCCCGGGC
# 11
#
# consensus_kmer = 'CGGGACCTACGTCCCTAGCC'  # this is the consensus string for the matrix it finds

# Turn the best motif matrix into a profile, adding a pseudocount of 1 to every count.
best_motif_matrix_counts = motif_matrix_count(best_motif_matrix)
for elem in list(best_motif_matrix_counts):
    best_motif_matrix_counts[elem] = [c + 1 for c in best_motif_matrix_counts[elem]]
best_motif_matrix_profile = motif_matrix_profile(best_motif_matrix_counts)

# Load the genome file and flatten it into a single DNA string.
with open('/home/user/Downloads/GCF_000195955.2_ASM19595v2_genomic.fna', mode='r', encoding='utf-8') as f:
    data = f.read()
stripped_lines = (raw.strip() for raw in data.split('\n'))  # get rid of whitespace
lines = ['' if s.startswith('>') else s for s in stripped_lines]  # blank out '>' comment lines
dna = ''.join(lines)  # concat into single dna str

# Report every window of the genome that matches the profile with probability 1% or greater.
for kmer, _ in slide_window(dna, k):
    prob = determine_probability_of_match_using_profile_matrix(best_motif_matrix_profile, kmer)
    if prob >= 0.01:
        print(f'{kmer} {prob}')

# Nothing is found...
#
# The strings in DosR.txt aren't matching up to the genome at the link I posted (even though the name of the organism
# matches). I'm guessing it's a different variant of the organism that was studied in the original 2003 paper. Maybe
# this variant uses a different motif or doesn't have it (doesn't have the ability to lie dormant like the organism
# that the original paper studied).
from Read import Read
from ToDeBruijnGraph import to_debruijn_graph
from Utils import slide_window

# Input file layout: the first line holds k, the second line holds the DNA string.
with open('/home/user/Downloads/dataset_240257_6(1).txt', mode='r', encoding='utf-8') as f:
    data = f.read()

lines = data.split('\n')
k = int(lines[0].strip())
dna = lines[1].strip()

# Break the DNA into k-long reads and assemble them into a de Bruijn graph.
reads = [Read(kmer) for kmer, _ in slide_window(dna, k)]
graph = to_debruijn_graph(reads)

# Print one "node -> out1,out2,..." line for every node that has outgoing edges.
for node, outputs in graph.get_all_outputs():
    targets = list(outputs)
    if not targets:
        continue
    joined_targets = ",".join(str(target) for target in targets)
    print(f'{node} -> {joined_targets}')
def shatter(self: Read, k: int) -> List[Read]:
    """Break this read into one smaller read per ``k``-long window of its data.

    Every produced read records this read as its source.
    """
    return [
        Read(kmer, source=('shatter', [self]))
        for kmer, _ in slide_window(self.data, k)
    ]