def parse_alphabet(alphabet_str):
    """
    Create a letter alphabet from a user-supplied string.

    Parameters
    ----------
    alphabet_str : str or None
        The symbols of the alphabet, given as one string.
        If ``None``, the first 20 symbols of the protein sequence
        alphabet are used instead.

    Returns
    -------
    alphabet : LetterAlphabet
        The alphabet built from the given symbols.

    Raises
    ------
    InputError
        If the string contains a space character or if the
        ``LetterAlphabet`` could not be constructed from it.
    """
    if alphabet_str is None:
        # Default: the first 20 symbols of the protein alphabet
        return seq.LetterAlphabet(
            seq.ProteinSequence.alphabet.get_symbols()[:20]
        )

    if " " in alphabet_str:
        raise InputError("Alphabet may not contain whitespaces")

    try:
        return seq.LetterAlphabet(alphabet_str)
    except Exception as err:
        # Keep the underlying failure attached as explicit cause,
        # so the reason for the rejection is not lost
        raise InputError("Invalid alphabet") from err
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
    """
    Check that symbols are encoded into the expected code.

    A single symbol is encoded via ``encode()``, any other number of
    symbols is encoded via ``encode_multiple()``.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)

    if len(symbols) != 1:
        encoded = alphabet.encode_multiple(symbols)
        assert list(encoded) == list(exp_code)
        return

    assert alphabet.encode(symbols[0]) == exp_code[0]
def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
    """
    Check that a code is decoded into the expected symbols.

    A single code value is decoded via ``decode()``, any other number
    of values is decoded via ``decode_multiple()``.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)

    # The code is handed to the alphabet as 'uint8' array
    code = np.array(code, dtype=np.uint8)

    if len(code) != 1:
        decoded = alphabet.decode_multiple(code)
        assert list(decoded) == list(exp_symbols)
        return

    assert alphabet.decode(code[0]) == exp_symbols[0]
def plot_pb_scheme_alignment():
    """
    Plot an example alignment colored with a generated color scheme.

    A substitution matrix for the alphabet ``"abcdefghijklmnop"`` is
    written to a temporary file and passed to ``gecli.main()``
    together with a temporary output file.
    The colors loaded from that output file are then used to plot
    the alignment read from ``PB_EXAMPLE_FILE_NAME``.

    Returns
    -------
    fig : Figure
        The *Matplotlib* figure containing the alignment plot.
    """
    # Fixed seed for the global RNG
    # NOTE(review): presumably makes the scheme generation in 'gecli'
    # reproducible - confirm
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        file.write("""
        a b c d e f g h i j k l m n o p
        a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
        b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
        c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
        d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
        e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
        f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
        g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
        h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
        i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
        j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
        k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
        l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
        m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
        n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
        o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
        p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
        """)
    # Run the command line interface with the matrix file as input
    # NOTE(review): '-f' presumably names the file the generated
    # scheme is written to, as it is loaded right afterwards - confirm
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop",
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file
    ])
    # Only the colors of the loaded scheme are required for plotting
    colors = graphics.load_color_scheme(scheme_file)["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()
    # Same symbols as given to '--alphabet' above
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    seq_strings = list(fasta_file.values())
    # The sequences are created from the strings without '-'
    # characters (presumably the alignment gaps),
    # while the trace is created from the unaltered strings
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)
    graphics.plot_alignment_type_based(ax, alignment, symbols_per_line=60, spacing=2, color_scheme=colors)
    fig.tight_layout()
    return fig
def test_input_types(alphabet_symbols, symbols):
    """
    A 'LetterAlphabet' treats the different types of input iterables
    in different ways.
    Assert that encoding followed by decoding gives back the input
    symbols for each of them.
    """
    alphabet = seq.LetterAlphabet(alphabet_symbols)
    round_trip = alphabet.decode_multiple(alphabet.encode_multiple(symbols))

    # The decoded symbols are compared to 'str' symbols,
    # so 'bytes' input is converted as a whole or symbol by symbol
    if isinstance(symbols, bytes):
        symbols = symbols.decode("ASCII")
    expected = []
    for symbol in symbols:
        if isinstance(symbol, bytes):
            symbol = symbol.decode("ASCII")
        expected.append(symbol)

    assert list(round_trip) == expected
def test_match_table(use_similarity_rule):
    """
    Compare the output of :meth:`match_table()` to the known matches
    of an example.

    The result is expected to be the same with and without the
    similarity rule, since the rule is chosen so that a k-mer is only
    similar to itself.
    """
    alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_")
    long_phrase = (
        "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_"
        "chuck_wood"
    )
    short_phrase = "woodchuck"
    long_sequence = seq.GeneralSequence(alphabet, long_phrase)
    short_sequence = seq.GeneralSequence(alphabet, short_phrase)

    if use_similarity_rule:
        rule = _identity_rule(alphabet)
    else:
        rule = None

    long_table = align.KmerTable.from_sequences(4, [long_sequence])
    short_table = align.KmerTable.from_sequences(4, [short_sequence])

    expected_matches = {
        (0,  9),
        (0, 22),
        (1, 23),
        (2, 24),
        (3, 25),
        (4, 26),
        (5, 27),
        (4, 32),
        (5, 33),
        (0, 43),
        (1, 44),
        (2, 45),
        (3, 46),
        (4, 47),
        (5, 48),
        (4, 59),
        (5, 60),
        (0, 65),
    }

    match_array = long_table.match_table(short_table, similarity_rule=rule)
    # Only the columns 1 and 3 are compared,
    # as the reference indices are irrelevant for this test
    position_pairs = match_array[:, [1, 3]]
    found_matches = {tuple(pair) for pair in position_pairs}

    assert found_matches == expected_matches
def test_load_color_scheme(scheme_path):
    """
    Load a color scheme and check that it belongs to a supported
    alphabet, that it has one entry per symbol and that each color
    that is not ``None`` can be converted by *Matplotlib*.
    """
    from matplotlib.colors import to_rgb
    import biotite.sequence.graphics as graphics

    nucleotide_alphabet = seq.NucleotideSequence.alphabet_amb
    protein_alphabet = seq.ProteinSequence.alphabet
    # Protein block alphabet
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    supported_alphabets = [nucleotide_alphabet, protein_alphabet, pb_alphabet]

    scheme = graphics.load_color_scheme(scheme_path)

    assert scheme["alphabet"] in supported_alphabets
    assert len(scheme["colors"]) == len(scheme["alphabet"])
    for color in scheme["colors"]:
        if color is None:
            continue
        # The conversion should not raise an error
        to_rgb(color)
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
    """
    Check that an ``AlphabetError`` is raised when encoding the
    values ``"G"`` and ``42`` and when decoding the codes
    ``len(alphabet_symbols)`` and ``-1``, either via the single value
    or via the multiple value methods.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)

    if not is_single_val:
        with pytest.raises(seq.AlphabetError):
            alphabet.encode_multiple("G")
        with pytest.raises(seq.AlphabetError):
            alphabet.encode_multiple([42])
        with pytest.raises(seq.AlphabetError):
            alphabet.decode_multiple(np.array([len(alphabet_symbols)]))
        with pytest.raises(seq.AlphabetError):
            alphabet.decode_multiple(np.array([-1]))
        return

    with pytest.raises(seq.AlphabetError):
        alphabet.encode("G")
    with pytest.raises(seq.AlphabetError):
        alphabet.encode(42)
    with pytest.raises(seq.AlphabetError):
        alphabet.decode(len(alphabet_symbols))
    with pytest.raises(seq.AlphabetError):
        alphabet.decode(-1)
# Code source: Patrick Kunzmann # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt import biotite import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.structure as struc import biotite.structure.io.mmtf as mmtf import biotite.database.rcsb as rcsb # PB alphabet pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") # PB substitution matrix, adapted from PBxplore matrix_str = """ a b c d e f g h i j k l m n o p a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
def test_contains(alphabet_symbols, use_letter_alphabet):
    """
    Check that the ``in`` operator finds the symbol ``"D"`` in the
    alphabet.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)
    assert "D" in alphabet
def test_length(alphabet_symbols, use_letter_alphabet):
    """
    Check that the length of the alphabet is equal to the number of
    symbols it was created from.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)
    assert len(alphabet) == len(alphabet_symbols)