def greedy_motif_search(dna, k, t, pseudo=0):
    """
    Greedily pick one k-mer per DNA string and keep the lowest-scoring set.

    Every k-mer of ``dna[0]`` is tried as a seed.  For each seed, strings
    ``dna[1]`` .. ``dna[t - 1]`` each contribute the k-mer returned by
    ``profile_most_probable_pattern`` for a profile built from the motifs
    chosen so far.  The candidate set with the smallest ``score`` against
    its own ``consensus`` wins; on ties the earliest seed is kept.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of DNA strings to use (motifs come from dna[0..t-1])
    :param pseudo: pseudocount; when truthy it is added to each profile via
        ``add_scalar_to_matrix`` before the next motif is chosen
    :return: best motif set, or an empty list when ``dna`` is empty or
        ``dna[0]`` is shorter than ``k``
    """
    # An empty collection has no seed string; report "no motifs" instead of
    # failing with IndexError on dna[0].
    if not dna:
        return []

    seed_strand = dna[0]
    best_motifs = []
    best_score = 0
    for i in range(len(seed_strand) - k + 1):
        motifs = [seed_strand[i:i + k]]
        for j in range(1, t):
            # `motifs` holds exactly j entries here, so the whole list is
            # "the motifs chosen so far" - no slice needed.
            profile = get_profile(get_count(motifs))
            if pseudo:
                # NOTE(review): the pseudocount is applied to the profile,
                # not to the raw counts - confirm this is intended.
                profile = add_scalar_to_matrix(pseudo, profile)
            motifs.append(profile_most_probable_pattern(dna[j], k, profile))

        current_score = score(motifs, consensus(get_count(motifs)))
        # `not best_motifs` accepts the first candidate unconditionally, so
        # the placeholder best_score of 0 is never compared against.
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score

    return best_motifs
def repeated_gibbs_sampler(dna, k, t, n, restarts=100):
    """
    Run ``gibbs_sampler`` several times and keep the lowest-scoring result.

    :param dna: DNA strings
    :param k: kmer size
    :param t: forwarded unchanged to ``gibbs_sampler``
    :param n: forwarded unchanged to ``gibbs_sampler`` as its last argument
    :param restarts: how many independent ``gibbs_sampler`` runs to perform;
        defaults to 100, the value that used to be hard-coded
    :return: motifs of the run with the smallest score, or an empty list
        when no run is performed (``restarts`` < 1)
    """
    best_score = float('inf')
    best_motifs = []
    for _ in range(restarts):
        motifs = gibbs_sampler(dna, k, t, n)
        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_score = current_score
            best_motifs = motifs
    return best_motifs
def profile_to_motifs(profile, dna):
    """
    Pick the lowest-scoring motif set obtainable from a fixed profile.

    Each k-mer of ``dna[0]`` is combined with the k-mers that
    ``profile_most_probable_pattern`` returns for the remaining strings.
    The combination with the smallest ``score`` against its own
    ``consensus`` is returned; on ties the earliest window is kept.

    :param profile: profile matrix (a mapping; k is the length of its
        first value)
    :param dna: DNA strings
    :return: best motifs, or an empty list when ``dna`` is empty or
        ``dna[0]`` is shorter than k
    """
    # Nothing to search: avoid IndexError on dna[0].
    if not dna:
        return []

    k = len(list(profile.values())[0])
    window_count = len(dna[0]) - k + 1
    if window_count <= 0:
        return []

    # The profile is fixed, so the motif chosen for dna[1:] does not depend
    # on which window of dna[0] is being tried; compute those once instead
    # of once per window.  Assumes profile_most_probable_pattern is
    # deterministic - TODO confirm.
    tail = [profile_most_probable_pattern(strand, k, profile)
            for strand in dna[1:]]

    best_motifs = []
    best_score = 0
    for i in range(window_count):
        # Build a fresh list each time so stored results never share state.
        motifs = [dna[0][i:i + k]] + tail
        # Score once and reuse it for both the comparison and the record.
        current_score = score(motifs, consensus(get_count(motifs)))
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score

    return best_motifs
    def test_score(self):
        # Five 6-mers are scored against consensus(get_count(...)) of the
        # same five strings; the expected total is 14.
        kmers = ['AACGTA', 'CCCGTT', 'CACCTT', 'GGATTA', 'TTCCGG']
        consensus_string = consensus(get_count(kmers))
        self.assertEqual(14, score(kmers, consensus_string))
def gibbs_sampler(dna, k, t, n):
    """
    Run one randomised Gibbs-sampling pass and return the best motifs seen.

    Starting from ``random_motifs(dna, k, t)``, each of the ``n`` iterations
    picks one index at random, builds a pseudocount profile from the other
    motifs, and replaces the picked motif with
    ``profile_generated_string(dna[index], profile, k)``.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of motifs; indices 0..t-1 are resampled
    :param n: number of resampling iterations
    :return: a snapshot of the lowest-scoring motif list encountered
        (the initial random motifs included)
    """
    motifs = random_motifs(list(dna), k, t)

    # `motifs` is mutated in place below, so the best result must be kept
    # as a copy; a plain alias would always mirror the latest state.
    best_motifs = list(motifs)
    best_score = score(best_motifs, consensus(get_count(best_motifs)))

    for _ in range(n):
        skip_index = randint(0, t - 1)

        # Build the profile from every motif except the one being resampled
        # (previously the whole DNA strings were counted here).
        other_motifs = motifs[:skip_index] + motifs[skip_index + 1:]
        profile = profile_with_pseudocounts(get_count(other_motifs))

        # NOTE(review): assumes profile_generated_string(text, profile, k)
        # draws a k-mer from `text`; the source string dna[skip_index] is
        # passed so a different k-mer can actually be chosen - confirm
        # against the helper's definition.
        motifs[skip_index] = profile_generated_string(dna[skip_index],
                                                      profile, k)

        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_motifs = list(motifs)
            best_score = current_score

    return best_motifs
Example #6
0
    def test_greedy_motif_search_with_pseudocounts_long(self):
        # Ten DNA strings searched with k = 15; the motifs found must score
        # 35 against consensus(get_count(...)) of themselves.
        strands = [
            'GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC',
            'CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG',
            'ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC',
            'GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC',
            'GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG',
            'CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA',
            'GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA',
            'GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG',
            'GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG',
            'TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC',
        ]

        found = greedy_motif_search_with_pseudocounts(strands, 15,
                                                      len(strands))
        # Kept from the original: the motifs are echoed to stdout.
        print(found)
        found_score = score(found, consensus(get_count(found)))
        self.assertEqual(35, found_score)
Example #7
0
    def test_greedy_motif_search_0(self):
        # Five 12-character strands searched with k = 3 and the default
        # pseudocount.
        strands = [
            'GGCGTTCAGGCA',
            'AAGAATCAGTCA',
            'CAAGGAGTTCGC',
            'CACGTCAATCAC',
            'CAATAATATTCG',
        ]
        wanted = ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

        found = greedy_motif_search(strands, 3, len(strands))
        self.assertEqual(wanted, found)

        # The consensus is derived from the expected motifs and then used
        # to score the motifs that were actually returned.
        wanted_consensus = consensus(get_count(wanted))
        self.assertEqual(2, score(found, wanted_consensus))
Example #8
0
    def test_greedy_motif_search_with_pseudocounts_0(self):
        # Same five strands as the plain greedy test, searched with k = 3
        # through the pseudocount variant.
        strands = [
            'GGCGTTCAGGCA',
            'AAGAATCAGTCA',
            'CAAGGAGTTCGC',
            'CACGTCAATCAC',
            'CAATAATATTCG',
        ]
        wanted = ["TTC", "ATC", "TTC", "ATC", "TTC"]

        found = greedy_motif_search_with_pseudocounts(strands, 3,
                                                      len(strands))
        self.assertEqual(wanted, found)

        # The returned motifs must score 2 against their own consensus.
        found_score = score(found, consensus(get_count(found)))
        self.assertEqual(2, found_score)