start_index = acid_list.index('M', index, len(acid_list)) stop_index = acid_list.index('Stop', start_index + 1, len(acid_list)) frames.append(''.join(acid_list[start_index:stop_index]) ) # no need +1 since last is Stop codon index = start_index + 1 except ValueError: break return frames def reading_frames(dna): rna = dna.replace('T', 'U') r_rna = reverse_complement(dna).replace('T', 'U') return [rna, r_rna, rna[1:], r_rna[1:], rna[2:], r_rna[2:]] def all_proteins(dna): frames = reading_frames(dna) translated_frames = [translate(f) for f in frames] orf_proteins = [find_orf(tf) for tf in translated_frames] return [p for proteins in orf_proteins for p in proteins] file = open('dna.fas') _key, dna_seq = next(fasta_sequences(file)) result_proteins = all_proteins(dna_seq) for pr in set(result_proteins): print(pr)
import time

from utils.fasta import fasta_sequences


def shared_motif(sequences, min_length=2):
    """Return the longest substring shared by every sequence.

    Candidates are taken from the first sequence and tested for membership
    in all the others. When several shared substrings have the same
    maximal length, the one starting earliest in the first sequence wins.

    Args:
        sequences: Sequence of strings to compare.
        min_length: Shortest motif worth reporting (defaults to 2).

    Returns:
        The longest shared substring, or '' when ``sequences`` is empty or
        no shared substring of at least ``min_length`` characters exists.
    """
    if not sequences:
        return ''
    seq_a, *tail = sequences
    best = ''
    for start in range(len(seq_a)):
        # Only candidates strictly longer than the current best can improve
        # the answer, so begin there instead of at the minimum length.
        length = max(min_length, len(best) + 1)
        # The bound is inclusive of len(seq_a) so that a motif ending on the
        # last character of the first sequence is considered as well.
        while start + length <= len(seq_a):
            candidate = seq_a[start:start + length]
            if not all(candidate in seq for seq in tail):
                # Every longer candidate from this start contains this one,
                # so none of them can be shared either.
                break
            best = candidate
            length += 1
    return best


# Collect the sequences inside a context manager so the file is closed.
with open('dna.fas') as fasta_file:
    fas_sequences = [seq for key, seq in fasta_sequences(fasta_file)]

start_time = time.time()
motif = shared_motif(sorted(fas_sequences))
print(f'Execution time: {time.time() - start_time} seconds')
print(motif)
from math import perm as p

from utils.fasta import fasta_sequences


def max_matching_count(seq):
    """Count the maximum matchings for the symbols of ``seq``.

    For each of the pairs A/U and C/G the number of ordered selections of
    the rarer symbol's count out of the more frequent symbol's count is
    computed; the result is the product of the two figures.
    """
    tally = {base: seq.count(base) for base in 'ACGU'}
    au_matchings = p(max(tally['A'], tally['U']), min(tally['A'], tally['U']))
    cg_matchings = p(max(tally['C'], tally['G']), min(tally['C'], tally['G']))
    return au_matchings * cg_matchings


with open('rna.fas') as file:
    # The destructuring requires the input to hold exactly one record.
    [(_, rna)] = fasta_sequences(file)
count = max_matching_count(rna)
print(count)
from utils.fasta import fasta_sequences


def find_spliced_motif(s, t):
    """Locate ``t`` inside ``s`` as a (not necessarily contiguous) subsequence.

    Each symbol of ``t`` is matched at its first occurrence after the
    position matched for the previous symbol.

    Args:
        s: String to search in.
        t: Motif whose symbols must appear in ``s`` in order.

    Returns:
        List of 0-based, strictly increasing indices into ``s``, one per
        symbol of ``t``.

    Raises:
        ValueError: If ``t`` is not a subsequence of ``s``.
    """
    indices, i = [], -1
    for nt in t:
        i = s.find(nt, i + 1)
        if i == -1:
            # Without this check the -1 would be recorded as an index and
            # the next search would silently restart from the beginning.
            raise ValueError(
                f'{nt!r} not found after position {indices[-1] if indices else -1}'
            )
        indices.append(i)
    return indices


# Exactly two records are expected: the sequence and then the motif.
with open('dna.fas') as fasta_file:
    (_, dna), (_, motif) = fasta_sequences(fasta_file)
res = find_spliced_motif(dna, motif)
# Positions are reported 1-based.
print(*(i + 1 for i in res))
def test_fasta_sequences(self):
    """Everything yielded for the fixture file must equal the expected list, in order."""
    parsed = list(fasta_sequences(self.fasta_file))
    self.assertListEqual(self.sequences, parsed)
from utils.fasta import fasta_sequences


def is_transition(p):
    """Return True if the pair ``p`` holds A with G, or C with T."""
    return ('A' in p and 'G' in p) or ('C' in p and 'T' in p)


def category(p):
    """Classify a pair of symbols.

    Returns:
        3 when both symbols are equal, 1 when the pair is a transition
        (see ``is_transition``), 2 for any other mismatch.
    """
    if p[0] == p[1]:
        return 3
    return 1 if is_transition(p) else 2


def ratio(s1, s2):
    """Return the ratio of category-1 to category-2 positions of two strings.

    The strings are compared position by position; if their lengths differ
    the surplus of the longer one is ignored (``zip`` semantics).

    Raises:
        ZeroDivisionError: If no position falls into category 2.
    """
    categories = [category(p) for p in zip(s1, s2)]
    return categories.count(1) / categories.count(2)


# Exactly two records are expected in the input file.
with open('dna.fas') as fasta_file:
    (_, seq_a), (_, seq_b) = fasta_sequences(fasta_file)
res = ratio(seq_a, seq_b)
print(res)
# Row index of each nucleotide in a profile matrix, and the inverse lookup.
INDICES = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
NT_ARRAY = 'ACGT'


def create_profile(dna_lines):
    """Build a 4-row count matrix from a list of sequences.

    Row ``INDICES[nt]`` holds, for every column, how many sequences have
    ``nt`` at that position. The number of columns is taken from the first
    sequence.

    Args:
        dna_lines: Sequence of strings over the symbols A, C, G and T.

    Returns:
        List of four lists of ints; four empty lists when ``dna_lines`` is
        empty.

    Raises:
        KeyError: If a symbol outside ``INDICES`` is encountered.
        IndexError: If a later sequence is longer than the first one.
    """
    if not dna_lines:
        return [[] for _ in NT_ARRAY]
    profile = [[0] * len(dna_lines[0]) for _ in NT_ARRAY]
    for seq in dna_lines:
        for i, nt in enumerate(seq):
            profile[INDICES[nt]][i] += 1
    return profile


def consensus(profile):
    """Return the string of the most frequent symbol of every profile column.

    Ties are resolved in favour of the symbol that comes first in
    ``NT_ARRAY``, because ``max`` returns the first maximal row index.
    """
    return ''.join(
        NT_ARRAY[max(range(len(column)), key=column.__getitem__)]
        for column in zip(*profile)
    )


# Read all sequences inside a context manager so the file is closed.
with open('dna.fas') as file:
    sequences = [seq for key, seq in fasta_sequences(file)]
dna_profile = create_profile(sequences)
consensus_value = consensus(dna_profile)
print(' ' + ' '.join(consensus_value))
for i, line in enumerate(dna_profile):
    print('{}: {}'.format(NT_ARRAY[i], ' '.join(map(str, line))))
from collections import Counter
from itertools import product

from utils.fasta import fasta_sequences


def k_mer_list(a, k=4):
    """Return every length-``k`` string over alphabet ``a``.

    The strings come in the order produced by ``itertools.product``, which
    is lexicographic with respect to the order of symbols in ``a``.
    """
    return [''.join(mer) for mer in product(a, repeat=k)]


def count_k_mer(seq, k_mer):
    """Count the occurrences of ``k_mer`` in ``seq``, overlapping ones included."""
    k_len = len(k_mer)
    return sum(
        1 for i in range(len(seq) - k_len + 1) if seq[i:i + k_len] == k_mer
    )


def k_mer_composition(seq, a, k=4):
    """Return the occurrence count of every k-mer over ``a`` within ``seq``.

    Args:
        seq: String to analyse.
        a: Alphabet the k-mers are built from.
        k: Length of the k-mers (defaults to 4).

    Returns:
        List of ints aligned with ``k_mer_list(a, k)``.
    """
    # Tally all windows in a single pass instead of rescanning the sequence
    # once per k-mer; windows that are not k-mers over ``a`` are never
    # looked up afterwards.
    windows = Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
    return [windows[k_mer] for k_mer in k_mer_list(a, k)]


# Only the first record of the input file is analysed.
with open('dna.fas') as file:
    _, dna = next(fasta_sequences(file))
composition_string = k_mer_composition(dna, 'ACGT')
res = ' '.join(map(str, composition_string))
# A context manager guarantees the result is flushed and the file closed.
with open('res.txt', 'w') as out_file:
    out_file.write(res)
from utils.common import gc_content
from utils.fasta import fasta_sequences


def sequences_gc(sequences):
    """Pair every sequence key with the ``gc_content`` of its sequence.

    Args:
        sequences: Iterable of ``(key, dna)`` pairs.

    Returns:
        List of ``(key, gc_content(dna))`` tuples in input order.
    """
    return [(seq_key, gc_content(dna)) for seq_key, dna in sequences]


# The context manager closes the file even if parsing or gc_content raises;
# sequences_gc returns a fully built list, so nothing is read afterwards.
with open('dna.fas') as file:
    cg_map = sequences_gc(fasta_sequences(file))
max_gc = max(cg_map, key=lambda pair: pair[1])
print(max_gc[0])
# NOTE(review): presumably gc_content yields a fraction, hence the scaling
# to a percentage -- confirm against utils.common.
print(max_gc[1] * 100)
return -1 def ends(s1, s2): for j in range(len(s1), len(s1) // 2, -1): if s2.endswith(s1[0:j]): return j return -1 def super_string(s_list): [sup, *rest] = s_list while len(rest) > 0: for s in rest: i = starts(s, sup) if i != -1: sup = s[0:i] + sup rest.remove(s) else: j = ends(s, sup) if j != -1: sup += s[j:] rest.remove(s) return sup with open('dna.fas') as dna_file: sequences = [seq for _, seq in fasta_sequences(dna_file)] res = super_string(sequences) print(res)
from utils.fasta import fasta_sequences


def overlap_graph(sequences, k):
    """Return the directed edges of the overlap graph of order ``k``.

    An edge ``(key_v, key_w)`` is reported when the last ``k`` symbols of
    ``seq_v`` equal the first ``k`` symbols of ``seq_w`` and the two
    sequences differ.

    Args:
        sequences: Re-iterable collection (e.g. a list) of ``(key, seq)``
            pairs; it is traversed once per element.
        k: Overlap length, must not be negative.

    Returns:
        List of ``(key_v, key_w)`` tuples, ordered by ``v`` and then ``w``
        as they appear in ``sequences``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError('k must not be negative')
    pairs = []
    for key_v, seq_v in sequences:
        # A sequence shorter than k has no suffix of length k; slicing it
        # with a negative start would yield a shorter, misleading suffix.
        if len(seq_v) < k:
            continue
        suffix = seq_v[len(seq_v) - k:]
        for key_w, seq_w in sequences:
            if seq_v != seq_w and seq_w.startswith(suffix):
                pairs.append((key_v, key_w))
    return pairs


k_val = 3
# Materialise the records inside a context manager so the file is closed.
with open('dna.fas') as fasta_file:
    fas_sequences = list(fasta_sequences(fasta_file))
adjacency_list = overlap_graph(fas_sequences, k_val)
print(*map(' '.join, adjacency_list), sep='\n')