Esempio n. 1
0
 def test_kmers(self):
     """
     Check that kmers produces every 2-mer of a string, in left-to-right order.
     """
     expected = ['AT', 'TT', 'TG', 'GC', 'CT', 'TC', 'CA']
     # Materialise the result so it can be compared against a plain list.
     observed = list(kmers("ATTGCTCA", 2))
     self.assertEqual(observed, expected)
Esempio n. 2
0
def profile_random_kmer(profile, sequence):
    """
    Pick a k-mer from a sequence at random, biased by a profile.
    The k-mer size is deduced from the profile: it is ``len(profile)``.
    :param profile: the profile used to score each candidate k-mer
    :param sequence: the sequence to search in
    :return: the slice of ``sequence`` of length ``len(profile)`` that starts at
             the index returned by ``biased_random``
    """
    size = len(profile)

    # One weight per k-mer, in the order kmers() produces them.
    weights = []
    for candidate in kmers(sequence, size):
        weights.append(probability_from_profile(candidate, profile))

    # biased_random is handed a tuple of weights and its result is used as the
    # start offset of the chosen k-mer.
    start = biased_random(tuple(weights))
    end = start + size
    return sequence[start:end]
Esempio n. 3
0
def most_probable_kmer_from_profile(sequence, k, profile_matrix):
    """
    Return the k-mer of a sequence that scores highest against a profile matrix.
    Ties are resolved in favour of the k-mer produced first.
    :param sequence: the sequence
    :param k: the size of the k-mer
    :param profile_matrix: the profile matrix
    :return: the most probable k-mer in sequence according to the given profile
             matrix, or None when no k-mer scores above -1 (for instance when
             there are no k-mers at all)
    """
    best_kmer = None
    best_probability = -1  # sentinel: any score above -1 replaces it
    for candidate in kmers(sequence, k):
        score = probability_from_profile(candidate, profile_matrix)
        # Strict comparison keeps the earliest k-mer when scores are equal.
        if score > best_probability:
            best_probability = score
            best_kmer = candidate
    return best_kmer
Esempio n. 4
0
def motifs_enumeration(sequences, k, d):
    """
    Find every (k, d)-motif of a collection of sequences.

    A candidate k-mer is reported when, for every sequence, at least one member
    of the candidate's d-neighborhood (as produced by ``neighbors``) occurs in
    that sequence as a substring. Candidates are the d-neighbors of the k-mers
    of each sequence.

    :param sequences: the array of sequences
    :param k: the length of the motif
    :param d: the maximum number of mismatches
    :return: the (k, d)-motifs of the sequences as a set
    """
    motifs = set()
    examined = set()  # candidates already tested, so each one is checked once
    for sequence in sequences:
        # NOTE(review): kmers is given one sequence at a time instead of the
        # whole collection, so it always receives a single sequence -- confirm
        # against the kmers helper.
        for kmer in kmers(sequence, k):
            for candidate in neighbors(kmer, d):
                if candidate in examined:
                    continue
                examined.add(candidate)
                # Materialise the neighborhood: it is scanned once per sequence
                # below, which would not work with a one-shot iterator.
                variants = tuple(neighbors(candidate, d))
                if all(
                        any(variant in seq for variant in variants)
                        for seq in sequences):
                    motifs.add(candidate)
    return motifs
Esempio n. 5
0
def greedy_motifs_search(sequences, k, cromwell=True):
    """
    Tries to find a collection of motifs in a collection of sequences of DNA.

    Each k-mer of the first sequence seeds a candidate motif set; for every
    following sequence, the k-mer that is most probable under the profile of
    the motifs picked so far is appended. The candidate set with the lowest
    ``motifs_entropy`` is kept; on equal entropy the earliest set wins.

    :param sequences: the collection of sequences (must be non-empty, since the
                      first sequence is indexed directly)
    :param k: the size of the motifs to search for
    :param cromwell: should we use Cromwell's rule when generating the profile matrix?
    :return: a collection of the most probable motifs (one motif for each
             sequence), or None when the first sequence yields no k-mer
    """
    best_motifs = None
    # Entropy of best_motifs, computed on first use and updated together with
    # best_motifs, so it is not recomputed on every iteration.
    best_entropy = None
    for motif1 in kmers(sequences[0], k):
        motifs = [motif1]
        for sequence in sequences[1:]:
            # The profile is rebuilt each time because motifs grows by one.
            profile_matrix = profile(motifs, cromwell)
            motifs.append(
                most_probable_kmer_from_profile(sequence, k, profile_matrix))

        if best_motifs is None:
            # First candidate set: nothing to compare against yet.
            best_motifs = motifs
            continue

        if best_entropy is None:
            best_entropy = motifs_entropy(best_motifs)
        entropy = motifs_entropy(motifs)
        if entropy < best_entropy:
            best_motifs, best_entropy = motifs, entropy
    return best_motifs
    def hamming_distance_two_strings(s1, s2):
        """
        Compute the hamming distance between two strings.
        If the second string is bigger than the first one, this returns the minimum hamming distance found
        between the first string and every k-mer of the second string, k being the size of the first string.

        Efficiency: O(nk) with k being the size of the first string, and n the size of the second
        (assuming the same-size comparison is linear in k)

        :param s1: the first string
        :param s2: the second string
        :return: the smallest distance found, or float("inf") when the second string yields no k-mer
        """
        window = len(s1)
        closest = float("inf")

        # Slide over every window of s2 that has the same size as s1 and keep the smallest distance seen.
        for chunk in kmers(s2, window):
            closest = min(closest, hamming_distance_same_size(s1, chunk))

        return closest