def randomized_motif_search(dna, k, t):
    """Run a single randomized motif search from random starting k-mers.

    One k-mer is cut from every string in ``dna`` at a uniformly random
    offset.  The motif set is then rebuilt over and over -- a profile is
    formed from the current motifs with ``profile_matrix`` and a new k-mer
    is picked from each string with ``profile_most_probable`` -- for as
    long as ``score`` keeps strictly decreasing.

    Args:
        dna: Iterable of sequences to search; each must be at least ``k``
            long, otherwise ``random.randint`` raises ``ValueError``.
        k: Length of the motifs.
        t: Not used by this function; kept so the signature lines up with
            the other search routines.

    Returns:
        A ``(motifs, score)`` tuple holding the lowest-scoring motif list
        seen and its score.
    """
    offsets = [random.randint(0, len(strand) - k) for strand in dna]
    current = [strand[start:start + k] for strand, start in zip(dna, offsets)]

    best, score_best = current, score(current)
    while True:
        profile = profile_matrix(current, k)
        current = [
            profile_most_probable(strand, k, profile) for strand in dna
        ]
        candidate_score = score(current)
        if candidate_score < score_best:
            # Strict improvement: remember it and iterate again.
            best, score_best = current, candidate_score
            continue
        # First non-improving round ends the search.
        return best, score_best
def greedy_motif_search_with_pseudocounts(dna, k, t):
    """Greedily build a motif set seeded by each k-mer of the first string.

    The baseline is the leading k-mer of every string.  Then, for every
    k-mer of ``dna[0]``, a motif list is grown one string at a time: a
    profile of the motifs chosen so far is built with ``profile_matrix``
    and ``profile_most_probable`` selects the k-mer from the next string.
    The lowest-scoring list (per ``score``) wins; ties keep the earlier one.

    NOTE(review): the pseudocounts named in the title are presumably
    applied inside ``profile_matrix`` -- nothing in this function adds them.

    Args:
        dna: Indexable collection of sequences.
        k: Length of the motifs.
        t: Number of strings to use; ``dna[1]`` .. ``dna[t - 1]`` are read.

    Returns:
        The best-scoring list of motifs.
    """
    best = [strand[:k] for strand in dna]
    score_best = score(best)

    seed_strand = dna[0]
    last_start = len(seed_strand) - k
    for start in range(last_start + 1):
        candidate = [seed_strand[start:start + k]]
        for index in range(1, t):
            # The profile only reflects the motifs picked so far.
            partial_profile = profile_matrix(candidate, k)
            candidate.append(
                profile_most_probable(dna[index], k, partial_profile)
            )

        candidate_score = score(candidate)
        if candidate_score < score_best:
            best, score_best = candidate, candidate_score

    return best
def gibbs_sampler(dna, k, t, N):
    """Refine a random motif set one string at a time for ``N`` rounds.

    A random k-mer is cut from every string in ``dna``.  Each round picks
    one index ``i`` at random, builds a profile from all motifs except the
    ``i``-th with ``profile_matrix``, and replaces ``motifs[i]`` with the
    k-mer that ``profile_most_probable`` selects from ``dna[i]``.  The
    lowest-scoring motif set (per ``score``) seen along the way is kept.

    NOTE(review): the replacement k-mer comes from
    ``profile_most_probable``; the textbook Gibbs sampler instead draws it
    at random, weighted by the profile -- confirm this is intended.

    Args:
        dna: Indexable collection of sequences, each at least ``k`` long.
        k: Length of the motifs.
        t: Number of strings; indices ``0 .. t - 1`` are sampled.
        N: Number of sampling rounds.

    Returns:
        A ``(motifs, score)`` tuple: the best motif list seen and its score.
    """
    motifs = []
    for seq in dna:
        rand_pos = random.randint(0, len(seq) - k)
        motifs.append(seq[rand_pos:rand_pos + k])

    # Snapshot with a copy: ``motifs`` is mutated in place below, so a plain
    # ``best = motifs`` would alias it and the returned motifs would always
    # be the final state rather than the ones that produced ``score_best``.
    best = motifs[:]
    score_best = score(best)

    for _ in range(N):
        i = random.randint(0, t - 1)
        # Profile is built from every motif except the one being replaced.
        others = motifs[:i] + motifs[i + 1:]
        profile = profile_matrix(others, k)
        motifs[i] = profile_most_probable(dna[i], k, profile)

        score_motifs = score(motifs)
        if score_motifs < score_best:
            best = motifs[:]
            score_best = score_motifs

    return best, score_best
def repeated_randomized_motif_search(dna, k, t, *, times=1000):
    """Run ``randomized_motif_search`` repeatedly and keep the best motifs.

    Args:
        dna: Collection of sequences, passed through unchanged.
        k: Length of the motifs.
        t: Passed through to ``randomized_motif_search``.
        times: Total number of searches to run (keyword-only).  One search
            is always performed, even when ``times`` is less than 1.

    Returns:
        The motif list with the lowest score across all runs; the score
        itself is not returned.
    """
    best, score_best = randomized_motif_search(dna, k, t)
    for _ in range(times - 1):
        new_best, new_score = randomized_motif_search(dna, k, t)
        if new_score < score_best:
            best = new_best
            score_best = new_score
    return best


# Import-time self-check on a small sample.  The search is randomized, so
# the result is only required to score within 1 of the reference motifs.
result = repeated_randomized_motif_search(
    "CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG TAGTACCGAGACCGAAAGAAGTATACAGGCGT TAGATCAAGTTTCAGGTGCACGTCGGTGAACC AATCCACCAGCTCCACGTGCAATGTTGGCCTA"
    .split(),
    8,
    5,
)
assert (
    score(result)
    <= score("TCTCGGGG CCAAGGTG TACAGGCG TTCAGGTG TCCACGTG".split()) + 1
), result


if __name__ == "__main__":
    with open("data/rosalind_ba2f.txt", "r") as dataset:
        # First line holds "k t"; every following line is one DNA string.
        k, t = [int(d) for d in dataset.readline().rstrip().split()]
        # Skip blank lines (e.g. a trailing newline): an empty string would
        # make random.randint(0, len(seq) - k) raise ValueError.
        text = [line.rstrip() for line in dataset if line.strip()]
    print(*repeated_randomized_motif_search(text, k, t), sep="\n")