def test_kmers(self):
    """
    Check that kmers() yields every 2-mer of a string, in left-to-right order
    """
    sequence = "ATTGCTCA"
    expected_2mers = ['AT', 'TT', 'TG', 'GC', 'CT', 'TC', 'CA']
    # kmers() is consumed through list() so the comparison is on a concrete list.
    found_2mers = list(kmers(sequence, 2))
    self.assertEqual(found_2mers, expected_2mers)
def profile_random_kmer(profile, sequence):
    """
    Pick a k-mer of a sequence at random, biased by a profile.

    The k-mer size is deduced from the profile: it is len(profile).

    :param profile: the profile used to weight each k-mer
    :param sequence: the sequence to search in
    :return: the slice of sequence of size len(profile) starting at the index returned by biased_random
    """
    kmer_size = len(profile)
    # One weight per k-mer, in the order kmers() yields them.
    weights = tuple(
        probability_from_profile(candidate, profile)
        for candidate in kmers(sequence, kmer_size))
    # NOTE(review): biased_random presumably draws an index with probability proportional to the
    # weights -- confirm against its definition.
    start = biased_random(weights)
    end = start + kmer_size
    return sequence[start:end]
def most_probable_kmer_from_profile(sequence, k, profile_matrix):
    """
    Return the k-mer of a sequence that scores highest against a profile matrix.

    On ties the first k-mer encountered is kept, because only a strictly greater probability replaces
    the current best.

    :param sequence: the sequence to scan
    :param k: the size of the k-mers
    :param profile_matrix: the profile matrix used to score each k-mer
    :return: the most probable k-mer, or None when no k-mer scores above the initial -1
        (e.g. when kmers() yields nothing)
    """
    best_kmer = None
    best_probability = -1  # sentinel below any score expected from probability_from_profile
    for candidate in kmers(sequence, k):
        candidate_probability = probability_from_profile(candidate, profile_matrix)
        if candidate_probability > best_probability:
            best_probability, best_kmer = candidate_probability, candidate
    return best_kmer
def motifs_enumeration(sequences, k, d):
    """
    Find every motif of length k that appears in each sequence with at most d mismatches.

    The candidates are the d-neighbors of every k-mer found in any of the sequences. A candidate is
    kept when, for each sequence, at least one of the candidate's own d-neighbors occurs verbatim in
    that sequence.

    :param sequences: the collection of sequences
    :param k: the length of the motif
    :param d: the maximum number of mismatches
    :return: the (k, d)-motifs as a set
    """
    motifs = set()
    tested = set()  # candidates already evaluated; repeated k-mers produce the same candidates
    for sequence in sequences:
        # The k-mers have to be extracted from each individual sequence: windowing the collection
        # itself would yield groups of whole sequences instead of k-mers.
        for kmer in kmers(sequence, k):
            for candidate in neighbors(kmer, d):
                if candidate in tested:
                    continue
                tested.add(candidate)
                # Materialized because it is scanned once per sequence below; a one-shot iterator
                # would be exhausted after the first sequence.
                variants = tuple(neighbors(candidate, d))
                if all(
                        any(variant in seq for variant in variants)
                        for seq in sequences):
                    motifs.add(candidate)
    return motifs
def greedy_motifs_search(sequences, k, cromwell=True):
    """
    Greedily search for a collection of motifs, one per sequence, in a collection of DNA sequences.

    Each k-mer of the first sequence seeds a candidate collection. For every following sequence, a
    profile matrix is built from the motifs collected so far and the k-mer of that sequence which is
    most probable under it is appended. The candidate collection with the lowest motifs_entropy wins;
    on ties the earliest one is kept.

    :param sequences: the collection of sequences
    :param k: the size of the motifs to search for
    :param cromwell: forwarded to profile(); should Cromwell's rule be used when generating the
        profile matrix?
    :return: the best collection of motifs found (one motif for each sequence), or None when the
        first sequence yields no k-mer
    """
    best_motifs = None
    for seed in kmers(sequences[0], k):
        candidate_motifs = [seed]
        for sequence in sequences[1:]:
            # The profile is rebuilt at every step so it accounts for the motif just appended.
            current_profile = profile(candidate_motifs, cromwell)
            chosen = most_probable_kmer_from_profile(sequence, k, current_profile)
            candidate_motifs.append(chosen)
        # candidate_motifs always holds at least the seed, so "is None" matches a falsiness test.
        if best_motifs is None or motifs_entropy(candidate_motifs) < motifs_entropy(best_motifs):
            best_motifs = candidate_motifs
    return best_motifs
def hamming_distance_two_strings(s1, s2):
    """
    Compute the hamming distance between two strings.

    If the second string is bigger than the first one, this returns the minimum hamming distance
    found between the first string and all the k-mers of the second string, with k = len(s1).

    Efficiency: O(nk) with k being the size of the first string and n the size of the second
    (assuming hamming_distance_same_size is linear in k).

    :param s1: the first string
    :param s2: the second string
    :return: the minimum distance found, or float("inf") when kmers() yields no k-mer of s2
    """
    window_size = len(s1)
    # Distance between s1 and every substring of s2 that has the same size as s1.
    distances = (
        hamming_distance_same_size(s1, window)
        for window in kmers(s2, window_size))
    # The default covers the case where s2 has no k-mer to compare against.
    return min(distances, default=float("inf"))