def calculate_tt_ratio(fasta_data: List[str]) -> float:
    """
    Compute the transition/transversion ratio between two DNA strands.

    :param fasta_data: Two equal length DNA sequences in FASTA format
    :return: Transition/transversion ratio, rounded to 4th digit
    """
    # Two-target unpacking fails unless exactly two records were parsed.
    reference, other = parse_fasta(fasta_data)

    counts = reference.transitions_transversions(other)
    n_transitions, n_transversions = counts

    # NOTE(review): a transversion count of zero makes this division fail
    # (ZeroDivisionError for plain numbers) -- confirm callers never pass
    # strands that differ only by transitions.
    return round(n_transitions / n_transversions, 4)
def test_parse_fasta():
    """
    Verify that multi-line FASTA records are merged into one sequence per tag.

    :return:
    """
    fasta_lines = [
        ">Tag1", "ATGC", "CGTA", "GGCC",
        ">Tag2", "ATGC", "AATT",
    ]
    expected = [
        ("ATGCCGTAGGCC", "Tag1"),
        ("ATGCAATT", "Tag2"),
    ]

    # Compare only the two attributes of interest, in (sequence, tag) order.
    output = []
    for record in parse_fasta(fasta_lines):
        output.append((record.sequence, record.tag))

    assert output == expected
def calculate_max_gc_content(fasta_data: List[str]) -> Tuple[str, float]:
    """
    Find the strand with the highest GC content among the parsed records.

    :param fasta_data: - a list of DNA sequences in FASTA format
    :return: the tag and gc content of DNA strand with highest GC content
    """
    # Pair every tag with its GC content, keeping the parsed order.
    scored = []
    for record in parse_fasta(fasta_data):
        scored.append((record.tag, record.gc_content()))

    # max() keeps the first of several equal maxima and raises ValueError
    # when there are no records at all.
    best_tag, best_gc = max(scored, key=lambda pair: pair[1])
    return best_tag, best_gc
def get_possible_proteins(fasta_data: List[str]) -> str:
    """
    Collect the distinct protein sequences obtained from the ORFs of a strand.

    :param fasta_data: A DNA sequence in FASTA format
    :return: a formatted set of possible proteins encoded in DNA
    """
    records = parse_fasta(fasta_data)
    # Single-target unpacking fails unless exactly one record was parsed.
    [dna] = records

    # A set drops duplicate proteins; the order of the joined lines is
    # therefore whatever order the set iterates in.
    distinct = set()
    for reading_frame in dna.search_for_orf():
        protein = reading_frame.transcribe().translate_to_protein()
        distinct.add(protein.sequence)

    return '\n'.join(distinct)
def splice_and_translate(fasta_data: List[str]) -> str:
    """
    Transcribe a gene and its introns, splice the introns out, and translate.

    :param fasta_data: A DNA sequence of gene, followed by several intron
        sequences, in FASTA format
    :return: A protein sequence obtained after translating spliced RNA
    """
    # The input records are DNA; transcribe each one before splicing.
    transcripts = []
    for dna in parse_fasta(fasta_data):
        transcripts.append(dna.transcribe())

    # The first record is the gene itself; every following one is an intron.
    spliced, *introns = transcripts
    for intron in introns:
        spliced = spliced.splice(intron)

    # Translate the spliced RNA and hand back only its sequence.
    return spliced.translate_to_protein().sequence
from rps.sequence_problems.parsing import parse_fasta
import rps.sequence_problems.consensus_and_profile as problem

# Seven equal-length sample strands shared by the tests below.
dataset = [
    ">Rosalind_1", "ATCCAGCT",
    ">Rosalind_2", "GGGCAACT",
    ">Rosalind_3", "ATGGATCT",
    ">Rosalind_4", "AAGCAACC",
    ">Rosalind_5", "TTGGAACT",
    ">Rosalind_6", "ATGCCATT",
    ">Rosalind_7", "ATGGCACT",
]
# Parsed once at import time and reused by every test in this module.
sequences = parse_fasta(dataset)


def test_get_profile_matrix():
    """Check each nucleotide row of the profile matrix built from the dataset."""
    expected_rows = {
        "A": [5, 1, 0, 0, 5, 5, 0, 0],
        "C": [0, 0, 1, 4, 2, 0, 6, 1],
        "G": [1, 1, 6, 3, 0, 1, 0, 0],
        "T": [1, 5, 0, 0, 0, 1, 1, 6],
    }

    profile = problem.get_profile_matrix(sequences)

    # Rows are checked in A, C, G, T order; the first mismatch fails the test.
    for nucleotide, expected_counts in expected_rows.items():
        assert getattr(profile, nucleotide) == expected_counts


def test_get_consensus_string():
    """Check the consensus string derived from the dataset's profile matrix."""
    profile = problem.get_profile_matrix(sequences)
    consensus = problem.get_consensus_string(profile)
    assert consensus == 'ATGCAACT'