def randomized_motif_search(dna, k, t):
    """Run one randomized motif search from a random starting point.

    One k-mer is picked at a random offset in each of the first ``t``
    strings of ``dna``.  The motifs are then repeatedly rebuilt from the
    profile of the current motifs until the score stops decreasing.

    Args:
        dna: Indexable collection of strings.  Start offsets are drawn
            using ``len(dna[0])``, so all strings are assumed to be at
            least that long.
        k: Length of each motif.
        t: Number of strings to draw an initial motif from.

    Returns:
        A two-item list ``[best_score, best_motifs]`` holding the lowest
        score seen and the motifs that produced it.
    """
    # Draw all random offsets first, then slice.  The original sliced an
    # undefined name `dna_list`; the `dna` parameter is what is meant.
    last_start = len(dna[0]) - k
    rand_ints = [randint(0, last_start) for _ in range(t)]
    motifs = [dna[i][r:r + k] for i, r in enumerate(rand_ints)]
    best_score = [score(motifs), motifs]

    while True:
        current_profile = profile_with_pseudocounts(motifs)
        motifs = motifs_from_profile(current_profile, dna, k)
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]
        else:
            # First non-improving round ends the search.
            return best_score
def randomizedMotifSearch(dna, k, t):
    """Run one randomized motif search from a random starting point.

    A random k-mer is taken from each of the first ``t`` strings of
    ``dna``.  The motifs are then rebuilt from their profile
    (``formProfile`` followed by ``selectMotifs``) for as long as each
    round lowers the score.

    Args:
        dna: Indexable collection of strings.  Start offsets are drawn
            using ``len(dna[0])``, so all strings are assumed to be at
            least that long.
        k: Length of each motif.
        t: Number of strings to draw an initial motif from.

    Returns:
        A tuple ``(best_score, best_motifs)``.
    """
    n = len(dna[0])
    motifs = []
    for i in range(t):
        index = random.randint(0, n - k)
        motifs.append(dna[i][index:index + k])

    # Cache the best score instead of re-scoring the best motifs on every
    # iteration and again at return.
    best_motifs = motifs
    best_score = score(best_motifs)

    while True:
        profile_matrix = formProfile(motifs)
        motifs = selectMotifs(profile_matrix, dna, k)
        current_score = score(motifs)
        if current_score < best_score:
            best_score, best_motifs = current_score, motifs
        else:
            # First non-improving round ends the search.
            return best_score, best_motifs
def gibbsSampler(dna, k, t, N):
    """Run ``N`` Gibbs-sampling steps from a random set of motifs.

    A random k-mer is taken from each of the first ``t`` strings of
    ``dna``.  Each step picks one index ``r`` at random, builds a profile
    with ``formProfileGibbs(motifs, r)``, and replaces motif ``r`` with
    the k-mer that ``detectMostProbableKMer`` selects from ``dna[r]``.
    The lowest-scoring motif set seen is remembered.

    Args:
        dna: Indexable collection of strings.  Start offsets are drawn
            using ``len(dna[0])``, so all strings are assumed to be at
            least that long.
        k: Length of each motif.
        t: Number of strings / motifs.
        N: Number of sampling steps.

    Returns:
        The list of motifs with the lowest score encountered.
    """
    n = len(dna[0])
    motifs = []
    for i in range(t):
        index = random.randint(0, n - k)
        motifs.append(dna[i][index:index + k])

    if N <= 0:
        # No sampling steps: return the random start without scoring it,
        # exactly as the loop-based original did.
        return motifs

    # Score the best set once and update it only on improvement, rather
    # than re-scoring it on every one of the N iterations.
    best_motifs = motifs
    best_score = score(best_motifs)

    for _ in range(N):
        r = random.randint(0, t - 1)
        profile_matrix = formProfileGibbs(motifs, r)
        motif = detectMostProbableKMer(dna[r], k, profile_matrix)
        # Build a new list so `best_motifs` is never mutated in place.
        motifs = motifs[:r] + [motif] + motifs[r + 1:]
        current_score = score(motifs)
        if current_score < best_score:
            best_score, best_motifs = current_score, motifs

    return best_motifs
def gibbs_sampler(dna, k, t, N):
    """Run ``N`` Gibbs-sampling steps from a random set of motifs.

    A random k-mer is taken from each of the first ``t`` strings of
    ``dna``.  Each step picks one index ``r`` at random, builds a profile
    from every motif except ``r`` via ``profile_with_pseudocounts``, and
    replaces motif ``r`` with the k-mer that
    ``profile_most_probable_kmer`` selects from ``dna[r]``.

    Args:
        dna: Indexable collection of strings.  Start offsets are drawn
            using ``len(dna[0])``, so all strings are assumed to be at
            least that long.
        k: Length of each motif.
        t: Number of strings / motifs.
        N: Number of sampling steps.

    Returns:
        A two-item list ``[best_score, best_motifs]`` holding the lowest
        score seen and the motifs that produced it.
    """
    # Draw all random offsets first, then slice.  The original sliced an
    # undefined name `dna_list`; the `dna` parameter is what is meant.
    last_start = len(dna[0]) - k
    rand_ints = [randint(0, last_start) for _ in range(t)]
    motifs = [dna[i][r:r + k] for i, r in enumerate(rand_ints)]
    best_score = [score(motifs), motifs]

    for _ in range(N):
        r = randint(0, t - 1)
        # Profile is built from all motifs except the one being resampled.
        others = motifs[:r] + motifs[r + 1:]
        current_profile = profile_with_pseudocounts(others)
        replacement = profile_most_probable_kmer(dna[r], k, current_profile)
        # Build a new list so the motifs stored in `best_score` are never
        # mutated in place.
        motifs = motifs[:r] + [replacement] + motifs[r + 1:]
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]

    return best_score
# Disabled earlier run, kept for reference: it used 20 restarts and printed
# 'ATATA' for every resulting motif that was not in the `example` list.
#
# k = 15
# t = 20
# N = 2000
#
# best_motifs = [k * t, None]
# for repeat in xrange(20):
#     current_motifs = gibbsSampler(dna, k, t, N)
#     if score(current_motifs) < best_motifs[0]:
#         best_motifs = [score(current_motifs), current_motifs]
#
# output = best_motifs[1]
# example = ['ACGTCCACCGGCGTC', 'AAGCGCACCGGGGTG', 'ACCCTTACCGGGGTG',
#            'AAGTTCCTCGGGGTG', 'AAGTTTTATGGGGTG', 'AAGTTTACCGGGTGC',
#            'AAGTTTCGAGGGGTG', 'CTGTTTACCGGGGTA', 'AAGTTGCTCGGGGTG',
#            'AAACATACCGGGGTG', 'AAGTTTAGGAGGGTG', 'AAGGAAACCGGGGTG',
#            'AAGTTTACACAGGTG', 'TAGTTTACCGGGGAT', 'CCTTTTACCGGGGTG',
#            'AAGTGAGCCGGGGTG', 'AAGTCGTCCGGGGTG', 'AAGTTTACCGGACAG',
#            'AAGTTTACCAATGTG', 'AAGTTTACCGTCATG']
#
# for item in output:
#     if item not in example:
#         print 'ATATA'

# Input strings for the motif search, one per line.
dna = [
    'AGATAAGCGAAAGTCGCCCGTTTGGGAATAGAATTCGGGATAAAAAAGGCCGTAGCTCACTAGAGCCGTTATGCGGACAGGTGATGTGAGAGTCCTGGCACATTCCGCGACTATCGATTTCTGGGGACATCAAGTCGCTCAACGAGACTCGGGAAATCGTTCGCGGTCGCGCGTCGGTTACCGCATATGGTTGGCCTATGTCATGTCGGTTAAGTTCTGTACTTACGAACTCCGTTGAAACCGCTGTGCGTGTAAAACCAGCACTTGGAGCGCCACTTTCGAAGGGAGGTACGAGCAAGTCAAAACTAGCTGTCGGCAGATAAGCGAAAGTC',
    'GATGGTGGAGTGCTCGCCCGTTTGGGAATAGAATTCGGGATAAAAAAGGCCGTAGCTCACTAGAGCCGTTATGCGGACAGGTGATGTGAGAGTCCTGGCACATTCCGCGACTATCGATTTCTGGGGACATCAAGTCGCTCAACGAGACTCGGGAAATCGTTCGCGGTCGCGCGTCGGTTACCGCATATGGTTGGCCTATGTCATGTCGGTTAAGTTCTGTACTTACGAACTCCGTTGAAACCGCTGTGCGTGTAAAACCAGCACTTGGAGCGCCACTTTCGAAGGGAGGTACGAGCAAGTCAAAACTAGCTGTCGGCAGATAAGCGAAAGTC',
    'CGCGGGGGCTCTGAGTGTCTAAATGCCGAATACCGGTACAAGACACCAAGATTGCGGAAACTGACCTACTTGAACCCTGCCAGGTGCTAGCCAAGATAAGGCTAGGGTCTTAAGTCCAGAAATAGCCTTGCGCCAGCCAAACACGGAGCTGCTTCAGGTTGTGTGGTCGCTGTCTCGCAGTACGAGTGGCACGACAAGCCCCAGCCTCCCAGATTCGCAAGAGCTGATCCATGATGTAGGAAGACCTAGCTCTCAGATAGGGAGGCTTTCTCGTGGCGTGGGATCATCCGTTGTGTTCCCGTAACCAAGAGATACGGGAGCAATCGATTGGT',
    'TTGAGTTGTGTGACAGGCGAACGGATATTAGAGTGCTCATGAATTCAATCGACCGGACGGCCGTTAAAATGTGGATAGAAATTTGTCTTTCCATGGATCCGCCAATGTACATCGGCACTAAGGTTAAATCTTCCCGTCACGATGGAGAATACAATCGGACTGCTTTACGCCCAGAACTCGGTAATCCCCGAGGATAGTCCCATCTAGATCGCTACTATGATACGGTCGCACTGACTCGTCAGCACCATGGGATTTAATAAACACAACTGTAGCAGAAGTGAAACCCTGCTAAAGTGCAGTCCTCTACGTTGAATAATTTTTTAAAGCAGCCC',
    'CGTGACCTGGGTGACAGTCGACCCAGTGAGTAGGACTCAAAAGTGCCGCTCTTACATATATGCCCCTCAGGTCAGGATCTCAGCCCTAGGCTAGATGATTTCTCTGCTCGCGGTATGGGTCTAAAACTACACCATCTAGGGAACCGGCTGATAGGCGTAAAGTGGGTTTAACTGTTTCCGTGAGCTATCTTAAATAGTCAGTAGACTAGGTTTGTGTATGCCCCGTAACTGATCGTCAGAAGGGTGCCTAAAGTAACGTTTACAGGTCGTACGGGAGTGCTACCACGCTCTTCCGTGTTCGTTAGCCTGGTTCGGACTACCCAGAACCCACA',
    'ATCGAGGATACGGGGCAGCTCTGCCGCAAAACAAAGTGATAGGGACGAAATGGCAGTCCCAACTAATAACATAAACACTTTCGTCATACTACGACTCTGCAAGACAGATTATTAGAATGTCCACGCGACTAATCGGCCCCTTCACGAAAATACCAGAAGCCGTAAAGGTCCACGCGACACCCAAACAGGGGAAGGAGGGTCTGTTTGAACGGAAGAACGAGCTTTTATAAAGGAGCCCCAAACATGAATTGTCATGTAGGCAGGTGTGATTTTTAGCATCCCTTACAAGAGTATAGCCTTAATCAGCTGTAAGCATAACGCCTGGGACTGCA',
    'CAACAGTTAGATCCGGATAACTCAGTTCGACCACTGGAATACGCATTGCATCAAATGCCCAAGGAGAGCACGTCTGTTCTGCACGGAGTTATTGGTCATCTATTGCTGCCAAAGTTGCGATGGCGGCTAAGACTGGGTGCATACTGGTGACCGGAGAGCGTCGGACTGGTTATCTATTGCGGACGGGGGTGCGCAATCCTACACGTTTAATAGAGTTCAGGTCACGAGATACGGGAGTAAACCAACTTGGCTATTTTATGAATGTTTCAGACTGAGTATTTCGTCTGTCTTTGGTCTAAAGACTGGGGAGCTGAACGCCACAGAACGCCTAG',
    'GTGTGCTATTGAACGAGCGTTTCTGGGAGCGCACTATAGGGCACAGTCCCGTCATATTACCGTGGATTCTTTTCCGTTTTCGTTACCTTATGATATGGTGAGGGCACCCAGAATTGTGCGTGAGAATATTTGCCCGGGGCACATACGCCGGCCGTTCATCCCTCAGATGGCTGGCCGTGTGATACGGGACGCCTCATACCACTAGGGCACTAGCGTTGTCCGGCCACTGCCCACTTTGAGAAGGAAACTTCAGCTGTCAAAAAATGAGTTGCCTCGCTTCACCAGACGCTATATCTACTTTGTATGATACGGTCATCGGTCATGCGCACTTG',
    'TTACTCACTTCCGCCTCCATTCTAGTGAGTTCTAAGGACCTGGAGGGGATTCGGTCGTCGTACAGAAAACTAACTACTTTCACTACATTTGTGCAAAGCCACGCTCCCCAATCATTTTGTCCATTTCCCGATACGGGAGTGGCTTACACGCTCTGTGAGAGATTATGGCTAGGGTGAAGGTGATCGTAGCCTGTGCAGACACTCCCACTATCGGAATGTAACCCCGCAAGTCTCTTCTTCCGCGCCACGAGATGTCTAGCCGCCTCATGCGCGTGGATCCTGTACTGCGTCTCCCCCTACTCTGGGTCCGGCCAGGTTCCCTTTTAAGTCCC',
    'TTCCCCTGGGGGGTAGGCGGTTGTGGTACGATATGGGGAATGGCTTGCCCAATCCACTTACGCGGAACGTTACCAGAAACCCACTAACCAAATCGTACTGTTCTATTAGGCGATAGTAGTTCTTCCGTCAAGGTTACATAGTCTTAGCCTCTAAATGTATACGCGCGGAGGTTGCTGTTCACCTAGCCCATCAATTGTTACGTGGTCCCATATGTATTATGCGGTTGCCGCGGCCTGCGTAGGCTTTTAGGCGTCATTGAAGCACCGAGAGATAGACGAGTGCTCATACCACTAACCGTTATACTCGGGCTCACCAAGGGGCGGATATACTT',
    'AGGGTGAGCTAACTTACCGCCGGGGGACAGTGACCTCATTTAGGTCCACAAGTGTTATACTCGCCCACTGAGCACGCAACTCTATTCCTGTCCTGGCGCTCTCTCTCGCACCCGATGATTAGCCCTAAGACTGCACCGCGCCGCTAACTCTAAATTACACCCCTCCTCGTGGAATTAGAGGTGCGGATTTAATGGATCACAGGCTCTCGACGTGACCCCCTGGAACTGTTACGCGAGTTAGATACACTAGTGCTCTGTGCAGATAAGTTACCCCAAAGGCTTGCTCGGAGTTAGAATGACGCACTGGTGTCCCGATATAGGCGTTCGGGAAA',
    'CTATCCGTTTAACTGGCGCACAGACTATCAATCAGCCACCAAATGTGACGGTCTGACTTTTCTCAGGCCGGTAAACCCGCGCTTTGTTCTGAGGCATAAGCTATGCTACGGTGCTTGATACGTTGGTGGGACGCACATCCGCGTGACGTGGGAGTGCTCCGTACGGATGACGCCCCTCCCGATTAAGCGCTGTAATTGAAGGGTTTGTGGTTCGCGCCCAACAACGGCGCGATATTAGGGGACCCGAAGCTTGATCTCCCACGCACGAAGGTGCAAAGAGCAGAAAAGACGCCCCAATAACGCATTAACCAGTAGTGTTGGATGCCCTCACG',
    'CTCTGCACGGCGGATTTTGTGATTTTCCAGGCGCGTGGCTGTTTCTAGCCCCGGGAGTGCTCAGAAAATACTATATGATTGCGCAGCAACCGTTATATAAAGAGTCTAACAAGGGCCATTAACGGTGCAGTGATCTATAGGGGGTGGCGCCACCAAAAGAATCAAGAAGCTTTGTAGGTGTTCAGCCGGTGGTCTAGAATGCCCAGACCTTCTCACCATCGGTTCGATCTTCCACAAGCATTACTGGCGAGGGAGGATGTAGGAGTGCACGACGACTACCTAACTTCGTAGGTCTAGAAAAGATTCCCCTATTAGCGGTAGCAAGTGCAACG',
    'GGGCATAAATATAATTTAGAGTGACGAGCCGGAGGGTTCTCTTTCACGTGCTTCCGAAAGTCGATGGGTTTGAGTCAATCACCCGCCTGTTAGTATTTAATGAAGTACCTCGGAGAGCTTGTCTGAGCTAAAACGTTGTTGAGCTATAACCTAGGGTGGAAAGGCACTAATACACGTCCCAGTTTGAAGGTTCTTGACTCCTCGTTGGTCGAGCCCATAAGGCCAACATCGCATTCGCGCGACTGAAGTAAAGTCGCTCAGTGATGAGGCTCGCCAAACTCCACGGGAGTGCTCTGACACAGTCACGTCACTAATCCCTTCCTGAGTGCCAT',
    'TTACTCTTGTTAGACCCGAGGGAGTAGGTGCATAACCCACGTTACGAAGGGCTCGGACGACATGATGAAATACTGTACTGTGCGTTCATTTCCCCCATTCTATGTAGATACGCACGTGCTCAACCATCCAGTTGGTGCCTCCAGAGATCCGTGTGAACGCACCAGTGTATAGCGCCGCAGAGCGTTCGTTTGCGCTGCTCTTCCTTCGTCTATTCGAAGCTCCAAGAGGAAGGGTCCGTTCGCCCCCTTCTATGTGTCTCGCGGCGTGTTGAGTATTGAAATTCACGGCTCTCTACAGGCGCGTTTCGGACACAGCTATCTTGCCTAGCCTT',
    'AAAACCGCGAAGTACTCCGGAAAAGCGATTCGCAAAGGCCCATTTCAGGACCCAGCGTCCAAAGGCGTTGACCCCAACGGACAGGACAGCTTTAGAGTGGTTCCCCGCCGTTAAGGTGCGTTCAGCGGCGTTAGTATTTAAAGAACGAGTTTCCATTTAATAAGTCATAGAGTGTCACTTGTGGTTCACTTTGGTTAGGTCAACCCATGCATCATTCAGTGGGATCTTTCTTCTAATTCATGCCTATGATATACTGAACTGGTCACTGAACTTCGCGACTCGGCCGACTTCCCAAGTATTTTTTTCATACGGGAGTGCGGTACCGAGCGGGT',
    'AGACGTATTCGCGCAAGTGGCTTCCCGCGAAAATCGGCTCGTGGTACCCTCAGAGGACCTACCTCCGACTGTTACGGACATCCTGTCACCAGGGCAATTCTACAACGCTGTTGAAAGGTCTGTTATATGATTGTCCCGTTACGCCTAACCTCGAACCGGCCGGAAAGCGGATCTACGACCCGCTGTCTAGTATGTGGTGTGTAGACATACGCCGGGAAGGCATAAATGGAGAGTTACCCAATTTGGCACCTGTAGTAAGGTACGGGGATAATGATACTTGCCCTCACCGAATGGAATGTAAGCCGGGATCACGGAGTGCTCCGAGTGGCCCA',
    'CCAAAACGATACTCGTATGACAACCCCCGTGTACCACTCGCCTATGGAAGGACTAGTTCTGCAGACAAGATATCGGATACGAATGTGCTCGTTACTACTAACGTTAAATTACTAAGCGACGATAGTGCGTTGCAAAGTGAGCGTTTGAATGTCACCATACTAAATAGACCTCAGTACCTGCATCGATGCCTACGCACTATTATGTCCAGCATTCGGTTTTAGGACAAATCATAATAACCTAGTCGGCAGAACAAGCCATGAACGAATGCTTGTGTTGACCGCTGTCGTGGCGATTCGCGTAGAATGTGTGCCCGACTTTCGGCGTTTCAGTG',
    'GTATATCCAATATACGGGCGACATTCATCTACTAGCGTACATAACGTTTGTGCGAGCACAATGAAGCGCCATAGTCTAGATCTCGAATTTAATGAGCCCAACTCGCGTCTTACATGTTCTCTGGATCGAGCCACAACCCGATACGGAGATGCTCATTCGTCTCTAAGCTACTTCCGTCGACCAGGCCGGAGAGTTAGGTAAAAGCATACAATTGAACAACTAGGCAGAGTGGTCCGCGTCCACATGATATACAGCTGCGTGTACTGCTCCTAGTATATATCCGTGTTCCTGCCTGGTTTCACCAGATGCCTGGAAGAGCTCCGTCAGCGCGT',
    'TCGAGTTGTTAAGTCTTCAAAGAGCTCGCGCAATTAATTTCCATTATGGCGTAGAGTCGACAGCTACCGCTCGAAAAATCGTGGGGGTTCGCTTGAAGGAAAGGTCAGGTTCTCACGACCTGTAAGCGAGTCTGCACGTTGTGGCGAGCGGTACACGGTATTTTTAACAAAGAATACAGGAGGTGTCCCACCAGTGTAGGGACCACTGGTGCAGATCAAAATAAGGATTAAGTTGCAGCGTACCGATACAACAGTGCTCAGATTCTCAATATTAGGTCGGCTAGCCGGACAGTCAATAGCACCGAGCACTCATCATATAGTTCGCACATAAG'
]

k = 15    # motif length
t = 20    # number of strings / motifs passed to gibbsSampler
N = 2000  # sampling steps per gibbsSampler run

# Keep the lowest-scoring result over 100 independent restarts.
# k * t is the initial score to beat -- presumably an upper bound on
# score(); TODO confirm against score()'s definition.
best_motifs = [k * t, None]
for repeat in range(100):
    current_motifs = gibbsSampler(dna, k, t, N)
    # Score each candidate once instead of twice per restart.
    current_score = score(current_motifs)
    if current_score < best_motifs[0]:
        best_motifs = [current_score, current_motifs]

# Parenthesised single-argument form prints the same under Python 2 and 3.
print(best_motifs[1])