Beispiel #1
0
            start_index = acid_list.index('M', index, len(acid_list))
            stop_index = acid_list.index('Stop', start_index + 1,
                                         len(acid_list))
            frames.append(''.join(acid_list[start_index:stop_index])
                          )  # no need +1 since last is Stop codon
            index = start_index + 1
        except ValueError:
            break
    return frames


def reading_frames(dna):
    """Return six RNA reading frames derived from a DNA string.

    Both ``dna`` and ``reverse_complement(dna)`` are transcribed by
    replacing every ``'T'`` with ``'U'``.  The result interleaves the two
    strands for offsets 0, 1 and 2:
    ``[fwd, rev, fwd[1:], rev[1:], fwd[2:], rev[2:]]``.
    """
    forward = dna.replace('T', 'U')
    backward = reverse_complement(dna).replace('T', 'U')
    frames = []
    for offset in range(3):
        frames.extend((forward[offset:], backward[offset:]))
    return frames


def all_proteins(dna):
    """Collect the ``find_orf`` results of every reading frame of ``dna``.

    Each frame produced by ``reading_frames`` is run through ``translate``
    and then ``find_orf``; the per-frame results are concatenated, in frame
    order, into one flat list.  Duplicates are kept.
    """
    translated = [translate(frame) for frame in reading_frames(dna)]
    proteins = []
    for translation in translated:
        proteins.extend(find_orf(translation))
    return proteins


# Read only the first FASTA record; the context manager guarantees the
# handle is closed even if parsing fails.
with open('dna.fas') as file:
    _key, dna_seq = next(fasta_sequences(file))
result_proteins = all_proteins(dna_seq)

# set() drops proteins that were found in more than one place.
for pr in set(result_proteins):
    print(pr)
import time

from utils.fasta import fasta_sequences


def shared_motif(sequences):
    """Return a longest substring that occurs in every given sequence.

    Candidates are taken from the first sequence.  When several substrings
    share the maximal length, the one starting earliest in the first
    sequence is returned.

    Args:
        sequences: indexable collection of strings.

    Returns:
        The longest common substring, or ``''`` when ``sequences`` is empty
        or the strings have no character in common.
    """
    if not sequences:
        return ''
    seq_a = sequences[0]
    tail = sequences[1:]
    best = ''
    total = len(seq_a)
    for start in range(total):
        # Only candidates strictly longer than the current best matter.
        # The upper bound is inclusive of the final character of seq_a.
        for length in range(len(best) + 1, total - start + 1):
            candidate = seq_a[start:start + length]
            if not all(candidate in seq for seq in tail):
                # A substring that is not shared cannot become shared by
                # extending it, so stop growing from this start position.
                break
            best = candidate
    return best


# Load every sequence up front; the context manager closes the file handle
# as soon as the records have been collected.
with open('dna.fas') as fasta_file:
    fas_sequences = [seq for key, seq in fasta_sequences(fasta_file)]
start_time = time.time()
motif = shared_motif(sorted(fas_sequences))
print(f'Execution time: {time.time() - start_time} seconds')
print(motif)
Beispiel #3
0
from utils.fasta import fasta_sequences
from math import perm as p


def max_matching_count(seq):
    """Return the product of two partial-permutation counts for ``seq``.

    For the A/U pair and for the C/G pair, the larger symbol count ``n`` and
    the smaller count ``k`` are combined as ``p(n, k)`` (``p`` is
    ``math.perm``); the two values are then multiplied.
    """
    tally = {base: seq.count(base) for base in 'ACGU'}
    au_ways = p(max(tally['A'], tally['U']), min(tally['A'], tally['U']))
    cg_ways = p(max(tally['C'], tally['G']), min(tally['C'], tally['G']))
    return au_ways * cg_ways


# The file must contain exactly one record; unpacking fails otherwise.
with open('rna.fas') as file:
    (_, rna), = fasta_sequences(file)
count = max_matching_count(rna)
print(count)
Beispiel #4
0
from utils.fasta import fasta_sequences


def find_spliced_motif(s, t):
    """Locate the symbols of ``t`` in ``s`` as a left-to-right subsequence.

    Each symbol is searched for strictly after the position of the previous
    hit.  Returns the list of 0-based positions, one per symbol of ``t``.

    If a symbol cannot be found, ``-1`` is recorded for it and the search
    for the following symbol restarts at the beginning of ``s``.
    """
    positions = []
    search_from = 0
    for symbol in t:
        hit = s.find(symbol, search_from)
        positions.append(hit)
        search_from = hit + 1
    return positions


# Exactly two records are expected: the sequence and the motif.  The context
# manager closes the file handle once both have been read.
with open('dna.fas') as fasta_file:
    (_, dna), (_, motif) = fasta_sequences(fasta_file)
res = find_spliced_motif(dna, motif)
# Positions are reported 1-based.
print(*(position + 1 for position in res))
Beispiel #5
0
 def test_fasta_sequences(self):
     """Parsing ``self.fasta_file`` must yield exactly ``self.sequences``."""
     actual_sequences = list(fasta_sequences(self.fasta_file))
     self.assertListEqual(self.sequences, actual_sequences)
Beispiel #6
0
from utils.fasta import fasta_sequences


def is_transition(p):
    """Return True when ``p`` contains both A and G, or both C and T."""
    return any(
        first in p and second in p
        for first, second in (('A', 'G'), ('C', 'T'))
    )


def category(p):
    """Classify a symbol pair: 3 if equal, 1 if a transition, otherwise 2."""
    if p[0] == p[1]:
        return 3
    if is_transition(p):
        return 1
    return 2


def ratio(s1, s2):
    """Return the ratio of category-1 to category-2 positions of two strings.

    The strings are compared position by position (``zip`` stops at the
    shorter one).  Category 1 is a transition, category 2 any other
    mismatch; equal symbols are ignored.

    Raises:
        ZeroDivisionError: if no position falls into category 2.
    """
    transitions = 0
    transversions = 0
    for pair in zip(s1, s2):
        kind = category(pair)
        if kind == 1:
            transitions += 1
        elif kind == 2:
            transversions += 1
    return transitions / transversions


# Exactly two records are expected.  The context manager closes the file
# handle once both have been read.
with open('dna.fas') as fasta_file:
    (_, seq_a), (_, seq_b) = fasta_sequences(fasta_file)
res = ratio(seq_a, seq_b)
print(res)
Beispiel #7
0
# Row order of the profile matrix: row i counts the nucleotide NT_ARRAY[i].
NT_ARRAY = 'ACGT'
# Reverse lookup: nucleotide -> profile row index.
INDICES = {nt: row for row, nt in enumerate(NT_ARRAY)}


def create_profile(dna_lines):
    """Build a 4-row count matrix from a list of DNA strings.

    Row ``INDICES[nt]``, column ``i`` holds how many strings have
    nucleotide ``nt`` at position ``i``.  The width is taken from the
    first string.
    """
    width = len(dna_lines[0])
    profile = [[0] * width for _ in range(4)]
    for seq in dna_lines:
        for column, nt in enumerate(seq):
            row = profile[INDICES[nt]]
            row[column] += 1
    return profile


def consensus(profile):
    """Return the consensus string of a profile matrix.

    For every column the row with the highest count is chosen (the first
    such row on ties) and mapped to its symbol through ``NT_ARRAY``.
    """
    letters = []
    for column in range(len(profile[0])):
        counts = [row[column] for row in profile]
        # list.index returns the first position of the maximum, which
        # keeps the "first row wins" tie-breaking.
        winner = counts.index(max(counts))
        letters.append(NT_ARRAY[winner])
    return ''.join(letters)


# Collect all sequences; the context manager closes the file handle even if
# parsing fails.
with open('dna.fas') as file:
    sequences = [seq for key, seq in fasta_sequences(file)]
dna_profile = create_profile(sequences)
consensus_value = consensus(dna_profile)

# Consensus symbols are space-separated and indented by three characters so
# they line up with the counts printed after the "X: " row labels.
print('   ' + ' '.join(consensus_value))
for i, line in enumerate(dna_profile):
    print('{}: {}'.format(NT_ARRAY[i], ' '.join(map(str, line))))
from itertools import product

from utils.fasta import fasta_sequences


def k_mer_list(a, k=4):
    """Return every length-``k`` string over alphabet ``a``.

    The strings are produced in the order of ``itertools.product``, i.e.
    lexicographic with respect to the order of symbols in ``a``.

    Args:
        a: iterable of single-character symbols (e.g. ``'ACGT'``).
        k: length of the generated strings; defaults to 4.

    Returns:
        A list of ``len(a) ** k`` strings.
    """
    return [''.join(mer) for mer in product(a, repeat=k)]


def count_k_mer(seq, k_mer):
    """Count occurrences of ``k_mer`` in ``seq``, overlapping ones included."""
    width = len(k_mer)
    last_start = len(seq) - width
    return sum(
        1
        for start in range(last_start + 1)
        if seq[start:start + width] == k_mer
    )


def k_mer_composition(seq, a):
    """Return the occurrence count of every k-mer over ``a`` within ``seq``.

    The counts follow the order of ``k_mer_list(a)``; overlapping
    occurrences are counted (see ``count_k_mer``).
    """
    # The former stray ``count_k_mer(seq, 'ACGT')`` call discarded its
    # result and only cost an extra pass over ``seq``; it has been removed.
    return [count_k_mer(seq, k_mer) for k_mer in k_mer_list(a)]


# Only the first FASTA record is used.
with open('dna.fas') as file:
    _, dna = next(fasta_sequences(file))

composition_string = k_mer_composition(dna, 'ACGT')
res = ' '.join(map(str, composition_string))
# Close the output file explicitly so the data is flushed to disk
# deterministically instead of whenever the handle is garbage-collected.
with open('res.txt', 'w') as out_file:
    out_file.write(res)
Beispiel #9
0
from utils.common import gc_content
from utils.fasta import fasta_sequences


def sequences_gc(sequences):
    """Pair every record key with ``gc_content`` of its sequence.

    ``sequences`` is an iterable of ``(key, dna)`` pairs; the result is a
    list of ``(key, gc_content(dna))`` tuples in the same order.
    """
    scored = []
    for seq_key, dna in sequences:
        scored.append((seq_key, gc_content(dna)))
    return scored


# sequences_gc returns a list, so all records are consumed inside the
# ``with`` block; the handle is closed even if parsing raises.
with open('dna.fas') as file:
    cg_map = sequences_gc(fasta_sequences(file))
max_gc = max(cg_map, key=lambda pair: pair[1])

print(max_gc[0])
# NOTE(review): presumably gc_content returns a fraction in [0, 1], hence
# the scaling to percent -- confirm against utils.common.
print(max_gc[1] * 100)
Beispiel #10
0
    return -1


def ends(s1, s2):
    """Return the longest prefix length of ``s1`` that ``s2`` ends with.

    Only prefixes longer than half of ``s1`` (``len(s1) // 2``) are
    considered; ``-1`` is returned when none of them is a suffix of ``s2``.
    """
    shortest_excluded = len(s1) // 2
    overlap = len(s1)
    while overlap > shortest_excluded:
        if s2.endswith(s1[:overlap]):
            return overlap
        overlap -= 1
    return -1


def super_string(s_list):
    """Greedily merge overlapping strings into a single superstring.

    The first element seeds the result.  Each remaining string is glued onto
    the front (when ``starts`` reports an overlap) or onto the back (when
    ``ends`` reports one); strings that fit neither end yet are retried in
    the next pass.

    NOTE(review): ``starts`` is defined outside the visible part of this
    file; it is assumed to return how many leading characters of ``s`` must
    be prepended, or -1 when there is no usable overlap -- confirm.

    Args:
        s_list: non-empty iterable of strings.

    Returns:
        The merged superstring.

    Raises:
        ValueError: if ``s_list`` is empty (unpacking fails), or if a whole
            pass merges none of the remaining strings.  The previous
            implementation looped forever in the latter case.
    """
    sup, *rest = s_list
    while rest:
        # Build the leftover list instead of removing from ``rest`` while
        # iterating over it, which silently skipped elements.
        pending = []
        for s in rest:
            i = starts(s, sup)
            if i != -1:
                sup = s[0:i] + sup
                continue
            j = ends(s, sup)
            if j != -1:
                sup += s[j:]
                continue
            pending.append(s)
        if len(pending) == len(rest):
            raise ValueError('no remaining string overlaps the superstring')
        rest = pending
    return sup


# Read all sequences first; the assembly itself does not need the file.
with open('dna.fas') as dna_file:
    sequences = [seq for _, seq in fasta_sequences(dna_file)]
res = super_string(sequences)
print(res)
Beispiel #11
0
from utils.fasta import fasta_sequences


def overlap_graph(sequences, k):
    """Return the directed edges of the overlap graph of ``sequences``.

    ``sequences`` is a re-iterable collection of ``(key, sequence)`` pairs.
    An edge ``(key_v, key_w)`` is emitted when the two sequences differ and
    the last ``k`` symbols of ``v`` equal the first ``k`` symbols of ``w``.
    Edges are listed in iteration order of ``v``, then ``w``.
    """
    return [
        (key_v, key_w)
        for key_v, seq_v in sequences
        for key_w, seq_w in sequences
        if seq_v != seq_w and seq_v[len(seq_v) - k:] == seq_w[:k]
    ]


# Suffix/prefix length that two sequences must share to be connected.
k_val = 3
# Materialise the records inside the ``with`` block so the file handle is
# closed as soon as they have been read.
with open('dna.fas') as fasta_file:
    fas_sequences = list(fasta_sequences(fasta_file))
adjacency_list = overlap_graph(fas_sequences, k_val)
# One edge per line, formatted as "key_v key_w".
print(*map(' '.join, adjacency_list), sep='\n')