Ejemplo n.º 1
0
def main():
    """Read a motif matrix from stdin, save its logo as an SVG and print markdown.

    Each non-blank stdin line, stripped and upper-cased, becomes one row of
    the motif matrix. The SVG file name embeds the MD5 hex digest of the
    rows' string representation; it is written under ``/output`` when that
    directory exists and under ``/tmp`` otherwise. The closing markers are
    printed from a ``finally`` clause, so they are emitted even on failure.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        # Gather rows until stdin is exhausted.
        dnas = []
        while True:
            try:
                line = input()
            except EOFError:
                break
            row = line.strip().upper()
            if row:
                dnas.append(row)

        # The file name is derived from the input so identical input maps to
        # the same file.
        digest = md5()
        digest.update(str(dnas).encode('utf-8'))
        logo_filename = f'motif_logo_{digest.hexdigest()}.svg'
        out_dir = '/output/' if path.isdir('/output') else '/tmp/'
        logo_path = out_dir + logo_filename

        print('Generating logo for the following motif matrix...\n\n')
        print('<br>'.join(dnas) + '\n\n')
        profile = motif_matrix_profile(motif_matrix_count(dnas))
        # The return value is not needed here.
        # NOTE(review): presumably create_logo draws onto the figure that
        # plt.savefig then writes out -- confirm against create_logo.
        create_logo(profile)
        plt.savefig(logo_path)
        print('Result...\n\n')
        print(f'![Motif Logo]({logo_filename})\n\n')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def gibbs_sampler_motif_search_with_psuedocounts(k: int, dnas: List[str],
                                                 cycles: int) -> List[str]:
    """Search for a low-scoring motif matrix by resampling one row per cycle.

    Args:
        k: Length of each motif member.
        dnas: Sequences to pick one k-mer from each.
        cycles: Number of resampling iterations to run.

    Returns:
        The motif matrix (one k-mer per entry of ``dnas``) with the lowest
        ``score_motif`` value seen, starting from a random initial pick.
    """
    # Random starting point: one k-mer per sequence.
    starts = [randrange(len(dna) - k + 1) for dna in dnas]
    current = [dna[s:s + k] for dna, s in zip(dnas, starts)]

    # Keep the best in its own list; `current` is mutated in place below.
    best = list(current)

    for _ in range(cycles):
        # Choose which sequence's k-mer gets replaced and take it out, so the
        # profile is built from the remaining rows only.
        row = randrange(len(dnas))
        current.pop(row)

        counts = motif_matrix_count(current)
        apply_psuedocounts_to_count_matrix(counts)
        profile = motif_matrix_profile(counts)

        # Draw the replacement k-mer's start index from the per-k-mer
        # probabilities computed for the chosen sequence.
        target_dna = dnas[row]
        kmer_probs = determine_probabilities_of_all_kmers_in_dna(
            profile, target_dna, k)
        pick = gibbs_rand(kmer_probs)
        current.insert(row, target_dna[pick:pick + k])

        if score_motif(current) < score_motif(best):
            best = list(current)  # snapshot, since `current` keeps changing

    return best
def main():
    """Read a motif matrix plus a k-mer from stdin and print the match probability.

    Non-blank stdin lines are stripped and upper-cased. The last line is the
    k-mer to test; the motif matrix is every line except the final two. The
    closing markers are printed from a ``finally`` clause, so they are
    emitted even on failure.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        # Gather lines until stdin is exhausted.
        dnas = []
        while True:
            try:
                line = input()
            except EOFError:
                break
            entry = line.strip().upper()
            if entry:
                dnas.append(entry)

        # NOTE(review): blank lines are already dropped above, so slicing off
        # the last TWO entries discards the line just before the k-mer as
        # well. Confirm whether the input has a non-blank separator there or
        # whether this should be dnas[:-1].
        motif_matrix = dnas[:-2]
        kmer = dnas[-1]

        profile = motif_matrix_profile(motif_matrix_count(motif_matrix))
        prob = determine_probability_of_match_using_profile_matrix(profile, kmer)
        print('Motif matrix...\n\n')
        print('<br>'.join(motif_matrix) + '\n\n')
        print(f'Probability that {kmer} matches the motif {prob}...\n\n')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 4
0
def greedy_motif_search(k: int, dnas: List[str]):
    """Greedily build a motif matrix seeded from each k-mer of the first sequence.

    For every window of length ``k`` in ``dnas[0]``, a candidate matrix is
    grown one sequence at a time: the k-mer returned by
    ``find_most_probable_kmer_using_profile_matrix`` for the profile of the
    rows chosen so far is appended, and the profile is rebuilt.

    Args:
        k: Length of each motif member.
        dnas: Sequences to pick one k-mer from each.

    Returns:
        The candidate with the lowest ``score_motif`` value, or the leading
        k-mers of every sequence if no candidate scores lower than that.
    """
    # Baseline to beat: the first k characters of every sequence.
    best = [dna[:k] for dna in dnas]

    for seed, _ in slide_window(dnas[0], k):
        candidate = [seed]
        profile = motif_matrix_profile(motif_matrix_count(candidate))

        for dna in dnas[1:]:
            closest, _ = find_most_probable_kmer_using_profile_matrix(
                profile, dna)
            # Add the pick, then refresh the profile so the next sequence is
            # matched against all rows chosen so far.
            candidate.append(closest)
            profile = motif_matrix_profile(motif_matrix_count(candidate))

        if score_motif(candidate) < score_motif(best):
            best = candidate

    return best
Ejemplo n.º 5
0
def score_motify_entropy(motif_matrix: List[str]) -> float:
    """Return the sum of per-column entropies of a motif matrix.

    Args:
        motif_matrix: Motif members; the column count is taken from the
            length of the first one.

    Returns:
        The sum over columns of ``calculate_entropy`` applied to that
        column's A, C, G and T profile values.
    """
    cols = len(motif_matrix[0])

    # Column-wise counts -> column-wise profile values.
    profile = motif_matrix_profile(motif_matrix_count(motif_matrix))

    # One entropy per column, fed the A/C/G/T values in that order.
    return sum(
        calculate_entropy([profile[nuc][c] for nuc in 'ACGT'])
        for c in range(cols)
    )
Ejemplo n.º 6
0
def score_motif_relative_entropy(motif_matrix: List[str],
                                 source_strs: List[str]) -> float:
    """Score a motif matrix using entropy relative to background frequencies.

    Args:
        motif_matrix: Motif members; the column count is taken from the
            length of the first one.
        source_strs: Strings used to compute the background frequency of each
            character (its count divided by the total number of characters).

    Returns:
        The negated sum, over columns, of ``calculate_entropy(column)`` minus
        ``calculate_cross_entropy(column, background)``.

    Raises:
        KeyError: If the matrix has at least one column and any of A, C, G, T
            never occurs in ``source_strs``.
    """
    # Background distribution across all source strings.
    nuc_counter = Counter()
    nuc_total = 0
    for source_str in source_strs:
        nuc_counter.update(source_str)
        nuc_total += len(source_str)
    nuc_freqs = {nuc: count / nuc_total for nuc, count in nuc_counter.items()}

    cols = len(motif_matrix[0])

    # Column-wise counts -> column-wise profile values.
    profile = motif_matrix_profile(motif_matrix_count(motif_matrix))

    total = 0
    for c in range(cols):
        column_probs = [profile[nuc][c] for nuc in 'ACGT']
        # Entropy of the column on its own.
        entropy = calculate_entropy(column_probs)
        # Cross entropy of the column (mixes in the background frequencies).
        cross_entropy = calculate_cross_entropy(
            [profile[nuc][c] for nuc in 'ACGT'],
            [nuc_freqs[nuc] for nuc in 'ACGT'])
        # The original author notes that, per the Pevzner book, this
        # difference can be computed in a single pass instead:
        # def calculate_relative_entropy(probs: List[float], total_freqs: List[float]) -> float:
        #     ret = 0.0
        #     for prob, total_freq in zip(probs, total_freqs):
        #         ret += prob * (log(prob / total_freq, 2.0) if prob > 0.0 else 0.0)
        #     return ret
        total += entropy - cross_entropy

    # All of the other score_motif algorithms try to MINIMIZE the score. For
    # relative entropy a greater score is a better match, so the sum is
    # negated to let the existing algorithms keep minimizing.
    return -total
def randomized_motif_search_with_psuedocounts(k: int,
                                              dnas: List[str]) -> List[str]:
    """Refine a random motif matrix until its score stops improving.

    Args:
        k: Length of each motif member.
        dnas: Sequences to pick one k-mer from each.

    Returns:
        The last motif matrix that scored strictly lower (per
        ``score_motif``) than the one before it, or the random starting
        matrix if the first refinement did not score lower.
    """
    # Random starting point: one k-mer per sequence.
    starts = [randrange(len(dna) - k + 1) for dna in dnas]
    current = [dna[s:s + k] for dna, s in zip(dnas, starts)]
    best = current

    while True:
        counts = motif_matrix_count(current)
        apply_psuedocounts_to_count_matrix(counts)
        profile = motif_matrix_profile(counts)

        # Re-pick every row against the profile of the previous matrix.
        current = [
            find_most_probable_kmer_using_profile_matrix(profile, dna)[0]
            for dna in dnas
        ]
        # Stop at the first round that fails to strictly improve the score.
        if not score_motif(current) < score_motif(best):
            return best
        best = current
Ejemplo n.º 8
0
def main():
    """Read a motif matrix from stdin and print its per-nucleotide profile.

    Each non-blank stdin line, stripped and upper-cased, becomes one row of
    the motif matrix. The profile's items are printed one per ``<br>``
    separated entry. The closing markers are printed from a ``finally``
    clause, so they are emitted even on failure.
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        # Gather rows until stdin is exhausted.
        dnas = []
        while True:
            try:
                line = input()
            except EOFError:
                break
            row = line.strip().upper()
            if row:
                dnas.append(row)

        profile = motif_matrix_profile(motif_matrix_count(dnas))
        print('Profiling nucleotides at each column of the motif matrix...\n\n')
        print('<br>'.join(dnas) + '\n\n')
        print('Result...\n\n')
        rendered_items = '<br>'.join(str(item) for item in profile.items())
        print(rendered_items + '\n\n')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Ejemplo n.º 9
0
# Demo: randomized motif search with pseudocounts over a small fixed data set.
dnas = [
    'GGCGTTCAGGCA', 'AAGAATCAGTCA', 'CAAGGAGTTCGC', 'CACGTCAATCAC',
    'CAATAATATTCG'
]
k = 3

# Seed with one random k-mer per sequence. A sequence has len(dna) - k + 1
# valid start offsets and randrange excludes its upper bound, so the "+ 1" is
# required for the final k-mer of each sequence to be selectable.
motifs = []
for dna in dnas:
    start = randrange(len(dna) - k + 1)
    motif = dna[start:start + k]
    motifs.append(motif)

best_motifs = motifs

# Repeatedly re-pick every row against the profile of the previous matrix and
# stop at the first round that does not strictly improve the score.
while True:
    counts_matrix = motif_matrix_count(motifs)
    for elem, counts in counts_matrix.items():  # add in pseudocounts
        counts_matrix[elem] = [c + 1 for c in counts]
    profile_matrix = motif_matrix_profile(counts_matrix)

    motifs = [
        find_most_probable_kmer_using_profile_matrix(profile_matrix, dna)[0]
        for dna in dnas
    ]
    if score_motif(motifs) < score_motif(best_motifs):
        best_motifs = motifs
    else:
        break

# Plain loop: the output is a side effect, so no throwaway list is built.
for m in best_motifs:
    print(f'{m}')
Ejemplo n.º 10
0
# Part 2 - Find instances of the motif in the genome

# CGGGACTTCAGGCCCTATCG
# CGGGTCAAACGACCCTAGTG
# CGGGACGTAAGTCCCTAACG
# CCGGGCTTCCAACCGTGGCC
# CGTGACCGACGTCCCCAGCC
# GAGGACCTTCGGCCCCACCC
# GGGGACTTCTGTCCCTAGCC
# TGGGACTTTCGGCCCTGTCC
# GGGGACCAACGCCCCTGGGA
# GGGGACCGAAGTCCCCGGGC
# 11
# consensus_kmer = 'CGGGACCTACGTCCCTAGCC'  # this is the consensus string for the matrix it finds

# Build a profile from the best motif matrix, adding 1 to every count first
# (pseudocounts).
best_motif_matrix_counts = motif_matrix_count(best_motif_matrix)
for elem in list(best_motif_matrix_counts):
    best_motif_matrix_counts[elem] = [
        c + 1 for c in best_motif_matrix_counts[elem]
    ]
best_motif_matrix_profile = motif_matrix_profile(best_motif_matrix_counts)

# Load the genome file and flatten it into a single string: every line is
# stripped of surrounding whitespace, and lines starting with '>' are blanked
# out so they contribute nothing.
with open('/home/user/Downloads/GCF_000195955.2_ASM19595v2_genomic.fna', mode='r', encoding='utf-8') as f:
    data = f.read()
lines = [
    '' if stripped.startswith('>') else stripped
    for stripped in (raw.strip() for raw in data.split('\n'))
]
dna = ''.join(lines)

# Report every k-mer whose match probability against the profile is at least 1%.
for kmer, _ in slide_window(dna, k):
    prob = determine_probability_of_match_using_profile_matrix(best_motif_matrix_profile, kmer)
    if prob >= 0.01:
        print(f'{kmer} {prob}')