def greedy_motif_search(dna, k, t, pseudo=0):
    """
    Greedily pick one k-mer per DNA string, trying every k-mer of the first
    string as the seed and keeping the lowest-scoring collection.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of kmers in the DNA (strings dna[1] .. dna[t - 1] are scanned)
    :param pseudo: pseudocount; when truthy it is added to every profile entry
    :return: best motif, or [] when ``dna`` is empty or its first string is
        shorter than ``k``
    """
    if not dna:
        # Guard: indexing dna[0] below would raise IndexError on empty input.
        return []

    best_motifs = []
    best_score = 0
    for start in range(len(dna[0]) - k + 1):
        motifs = [dna[0][start:start + k]]
        for j in range(1, t):
            # At this point ``motifs`` holds exactly j entries, so the profile
            # is built from every motif chosen so far.
            profile = get_profile(get_count(motifs))
            if pseudo:
                # NOTE(review): the pseudocount is applied to the output of
                # get_profile rather than to the counts from get_count --
                # confirm this is the intended smoothing.
                profile = add_scalar_to_matrix(pseudo, profile)
            motifs.append(profile_most_probable_pattern(dna[j], k, profile))
        current_score = score(motifs, consensus(get_count(motifs)))
        # Strict '<' keeps the earliest seed when scores tie.
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score
    return best_motifs
def repeated_gibbs_sampler(dna, k, t, n, restarts=100):
    """
    Run ``gibbs_sampler`` several times and keep the lowest-scoring result.

    :param dna: DNA strings (passed through to ``gibbs_sampler``)
    :param k: kmer size (passed through to ``gibbs_sampler``)
    :param t: number of kmers in the DNA (passed through to ``gibbs_sampler``)
    :param n: passed through to ``gibbs_sampler`` as its last argument
    :param restarts: how many independent ``gibbs_sampler`` runs to perform;
        defaults to 100
    :return: the motifs with the lowest score over all runs, or [] when
        ``restarts`` is less than 1
    """
    best_score = float('inf')
    best_motifs = []
    for _ in range(restarts):
        motifs = gibbs_sampler(dna, k, t, n)
        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_score = current_score
            best_motifs = motifs
    return best_motifs
def profile_to_motifs(profile, dna):
    """
    Combine each k-mer of the first DNA string with the profile-most-probable
    k-mer of every other string and return the lowest-scoring collection.

    :param profile: profile matrix; k is taken from the length of its first row
    :param dna: DNA strings
    :return: best motifs, or [] when ``dna`` is empty or its first string is
        shorter than k
    """
    if not dna:
        # Guard: indexing dna[0] below would raise IndexError on empty input.
        return []

    k = len(list(profile.values())[0])
    window_count = len(dna[0]) - k + 1
    if window_count <= 0:
        return []

    # The picks for dna[1:] depend only on the profile, not on the window
    # chosen in dna[0], so compute them once instead of once per window.
    tail = [profile_most_probable_pattern(text, k, profile) for text in dna[1:]]

    best_motifs = []
    best_score = 0
    for i in range(window_count):
        motifs = [dna[0][i:i + k]] + tail
        # Score each candidate exactly once and reuse the value.
        current_score = score(motifs, consensus(get_count(motifs)))
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score
    return best_motifs
def test_score(self):
    """Check that the sample motifs score 14 against their own consensus."""
    kmers = ['AACGTA', 'CCCGTT', 'CACCTT', 'GGATTA', 'TTCCGG']
    # The consensus is derived from the count matrix of the same motifs.
    reference = consensus(get_count(kmers))
    self.assertEqual(14, score(kmers, reference))
def gibbs_sampler(dna, k, t, n):
    """
    Start from random motifs and resample one motif per iteration, returning
    the lowest-scoring collection seen.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of kmers in the DNA (motif indices 0 .. t - 1 are resampled)
    :param n: number of resampling iterations
    :return: the best-scoring motifs encountered, including the random start
    """
    from random import randint

    motifs = random_motifs(list(dna), k, t)
    # Keep a snapshot: ``motifs`` is mutated in place below, so storing the
    # same list object would make "best" silently track the current state.
    best_motifs = list(motifs)
    best_score = score(best_motifs, consensus(get_count(best_motifs)))
    for _ in range(n):
        skip_index = randint(0, t - 1)
        # Build the profile from every motif except the one being replaced.
        remaining = motifs[:skip_index] + motifs[skip_index + 1:]
        profile = profile_with_pseudocounts(get_count(remaining))
        # Sample the replacement from the whole skipped string, not from the
        # current k-mer (which would offer only a single candidate).
        # NOTE(review): assumes profile_generated_string takes the text to
        # sample from as its first argument -- confirm against its definition.
        motifs[skip_index] = profile_generated_string(dna[skip_index], profile, k)
        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_motifs = list(motifs)
            best_score = current_score
    return best_motifs
def test_greedy_motif_search_with_pseudocounts_long(self):
    """Check that the pseudocount greedy search reaches score 35 on ten long strings with k = 15."""
    sequences = [
        'GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC',
        'CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG',
        'ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC',
        'GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC',
        'GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG',
        'CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA',
        'GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA',
        'GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG',
        'GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG',
        'TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC',
    ]
    found = greedy_motif_search_with_pseudocounts(sequences, 15, len(sequences))
    print(found)
    # Only the score is pinned here, not the exact motifs.
    found_score = score(found, consensus(get_count(found)))
    self.assertEqual(35, found_score)
def test_greedy_motif_search_0(self):
    """Check the motifs and score returned by greedy_motif_search for k = 3 on five short strings."""
    sequences = ['GGCGTTCAGGCA', 'AAGAATCAGTCA', 'CAAGGAGTTCGC', 'CACGTCAATCAC', 'CAATAATATTCG']
    wanted = ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

    found = greedy_motif_search(sequences, 3, len(sequences))
    self.assertEqual(wanted, found)

    # The consensus comes from the expected motifs; the score is taken over
    # the motifs actually returned.
    reference = consensus(get_count(wanted))
    self.assertEqual(2, score(found, reference))
def test_greedy_motif_search_with_pseudocounts_0(self):
    """Check the motifs and score returned by the pseudocount greedy search for k = 3."""
    sequences = ['GGCGTTCAGGCA', 'AAGAATCAGTCA', 'CAAGGAGTTCGC', 'CACGTCAATCAC', 'CAATAATATTCG']
    wanted = ["TTC", "ATC", "TTC", "ATC", "TTC"]

    found = greedy_motif_search_with_pseudocounts(sequences, 3, len(sequences))
    self.assertEqual(wanted, found)

    # Here both the consensus and the score are computed from the returned motifs.
    found_score = score(found, consensus(get_count(found)))
    self.assertEqual(2, found_score)