def test_matrix_str():
    """
    Check the string form of a small substitution matrix built from two
    three-symbol alphabets and the scores 0 to 8.
    """
    first_alphabet = seq.Alphabet("abc")
    second_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(first_alphabet, second_alphabet, scores)

    expected_lines = [" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"]
    assert str(matrix) == "\n".join(expected_lines)
def test_alphabet_extension():
    """
    Check `Alphabet.extends()` for an identical, a reordered and a
    longer alphabet relative to the base alphabet ``"abc"``.
    """
    base = seq.Alphabet("abc")
    identical = seq.Alphabet("abc")
    reordered = seq.Alphabet("acb")
    longer = seq.Alphabet("abcde")

    # An alphabet extends itself and an alphabet with the same symbols
    assert base.extends(base)
    assert identical.extends(base)
    # Same symbols in a different order do not count as an extension
    assert not reordered.extends(base)
    # Extension is directional: only the longer one extends the shorter
    assert longer.extends(base)
    assert not base.extends(longer)
def test_matrix_str():
    """
    Test conversion of a substitution matrix to a string, using a
    small hand-built 3x3 matrix.
    """
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(
        seq.Alphabet("abc"), seq.Alphabet("def"), scores
    )

    expected = "\n".join([" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"])
    assert str(matrix) == expected
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a non-character alphabet with a custom
    substitution matrix and compare the result with a fixed trace.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    raw_sequences = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [seq.GeneralSequence(alph, raw) for raw in raw_sequences]
    exp_trace = [
        [0, 0],
        [1, -1],
        [2, 1],
        [3, 2],
        [-1, 3],
        [4, 4],
        [5, 5],
        [6, 6],
    ]

    # Strong identity matrix: 1000 on the diagonal, -1000 elsewhere
    is_diagonal = np.identity(len(alph)) == 1
    score_matrix = np.where(is_diagonal, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
    """
    Map a random code from the source alphabet to the target alphabet
    via `AlphabetMapper` and check that the symbols of both sequences
    are equal.
    """
    n_codes = 10000

    src_alphabet = seq.Alphabet(source_alph_symbols)
    dst_alphabet = seq.Alphabet(target_alph_symbols)
    mapper = seq.AlphabetMapper(src_alphabet, dst_alphabet)

    ref_sequence = seq.GeneralSequence(src_alphabet)
    # Fixed seed keeps the random reference code reproducible
    np.random.seed(0)
    ref_sequence.code = np.random.randint(
        len(src_alphabet), size=n_codes, dtype=int
    )

    test_sequence = seq.GeneralSequence(dst_alphabet)
    test_sequence.code = mapper[ref_sequence.code]

    assert test_sequence.symbols == ref_sequence.symbols
def test_to_consensus_nuc_ambiguous():
    """
    Check the consensus sequence of a nucleotide profile whose first
    position has equal counts for two symbols.
    """
    alphabet = seq.Alphabet(["A", "C", "G", "T"])
    # One row per position, one column per symbol of the alphabet
    symbol_counts = np.array(
        [
            [1, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 2],
            [0, 2, 0, 0],
            [2, 0, 0, 0],
            [0, 0, 0, 2],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
        ]
    )
    gap_counts = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    profile = seq.SequenceProfile(symbol_counts, gap_counts, alphabet)

    expected = seq.NucleotideSequence("MGTCATGC")
    assert expected == profile.to_consensus()
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
    """
    Check symbol encoding against the expected code, using `encode()`
    for a single symbol and `encode_multiple()` otherwise.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    if len(symbols) != 1:
        assert list(alph.encode_multiple(symbols)) == list(exp_code)
    else:
        assert alph.encode(symbols[0]) == exp_code[0]
def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
    """
    Check code decoding against the expected symbols, using `decode()`
    for a single code value and `decode_multiple()` otherwise.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    code_array = np.array(code, dtype=np.uint8)
    if len(code_array) != 1:
        assert list(alph.decode_multiple(code_array)) == list(exp_symbols)
    else:
        assert alph.decode(code_array[0]) == exp_symbols[0]
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    # The constructor itself is expected to raise,
    # so its result is deliberately not bound to a name
    with pytest.raises(TypeError):
        app_cls(sequences)
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.
    """
    alph = seq.Alphabet(range(50))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            [1, 2, 3],
            [1, 2, 3],
        ]
    ]
    # The constructor itself is expected to raise,
    # so its result is deliberately not bound to a name
    with pytest.raises(TypeError):
        app_cls(sequences)
def test_from_alignment():
    """
    Build a `SequenceProfile` from a two-sequence alignment and compare
    its symbol counts, gap counts and alphabet with reference values.
    """
    sequences = [
        seq.NucleotideSequence("CGTCAT"),
        seq.NucleotideSequence("TCATGC"),
    ]
    trace = align.Alignment.trace_from_strings(["CGTCAT--", "--TCATGC"])
    alignment = align.Alignment(sequences, trace, None)
    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column, one column per alphabet symbol
    exp_symbols = np.array(
        [
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 2],
            [0, 2, 0, 0],
            [2, 0, 0, 0],
            [0, 0, 0, 2],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
        ]
    )
    exp_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    exp_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(exp_symbols, profile.symbols)
    assert np.array_equal(exp_gaps, profile.gaps)
    assert exp_alphabet == profile.alphabet
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(range(50))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            [1, 2, 3],
            [1, 2, 3],
        ]
    ]
    with pytest.raises(TypeError):
        try:
            app_cls(sequences)
        except VersionError:
            # A VersionError from the constructor skips the test
            # instead of counting as a missing TypeError
            pytest.skip("Invalid software version")
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    with pytest.raises(TypeError):
        try:
            app_cls(sequences)
        except VersionError:
            # A VersionError from the constructor skips the test
            # instead of counting as a missing TypeError
            pytest.skip("Invalid software version")
def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Re-create both sequences and the substitution matrix on top of a
    500-symbol alphabet, so that the sequence code uses 'uint16' as
    dtype.
    This is needed for testing, since 'uint8' uses a separate
    implementation.

    Returns the two new sequences and the new matrix as a tuple.
    """
    new_alph = seq.Alphabet(np.arange(500))

    def with_new_alphabet(sequence):
        # Keep the code, only swap the alphabet underneath it
        converted = seq.GeneralSequence(new_alph)
        converted.code = sequence.code
        return converted

    seq1 = with_new_alphabet(seq1)
    seq2 = with_new_alphabet(seq2)

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # the original scores fill the upper left corner, the rest is zero
    orig_scores = matrix.score_matrix()
    orig_len = len(orig_scores)
    padded_scores = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32)
    padded_scores[:orig_len, :orig_len] = orig_scores
    new_matrix = align.SubstitutionMatrix(new_alph, new_alph, padded_scores)

    return seq1, seq2, new_matrix
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a non-character alphabet with a custom
    substitution matrix and compare the result with a fixed trace.

    The test is skipped if the application binary is not installed or
    the application constructor raises a `VersionError`.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    exp_trace = [
        [0, 0],
        [1, -1],
        [2, 1],
        [3, 2],
        [-1, 3],
        [4, 4],
        [5, 5],
        [6, 6],
    ]

    # Strong identity matrix
    score_matrix = np.identity(len(alph))
    score_matrix[score_matrix == 0] = -1000
    score_matrix[score_matrix == 1] = 1000
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip("Invalid software version")

    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
    """
    Check that encoding an unknown symbol and decoding an out-of-range
    code raise `AlphabetError`, for the single-value methods or the
    ``*_multiple`` methods depending on `is_single_val`.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    # Each entry is a method and one argument it must reject
    if is_single_val:
        invalid_calls = [
            (alph.encode, "G"),
            (alph.encode, 42),
            (alph.decode, len(alphabet_symbols)),
            (alph.decode, -1),
        ]
    else:
        invalid_calls = [
            (alph.encode_multiple, "G"),
            (alph.encode_multiple, [42]),
            (alph.decode_multiple, np.array([len(alphabet_symbols)])),
            (alph.decode_multiple, np.array([-1])),
        ]

    for method, invalid_arg in invalid_calls:
        with pytest.raises(seq.AlphabetError):
            method(invalid_arg)
import biotite.sequence.phylo as phylo import biotite.sequence.graphics as graphics # Obtain BLOSUM62 matrix = align.SubstitutionMatrix.std_protein_matrix() print(matrix) ######################################################################## # The original *BLOSUM62* contains symbols for ambiguous amino acids and # the stop signal. # As these are not actual amino acids, a new substitution matrix is # created, where these symbols are removed. # Matrix should not contain ambiguous symbols or stop signal matrix = align.SubstitutionMatrix( seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]), seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]), matrix.score_matrix()[:-4, :-4]) similarities = matrix.score_matrix() print(matrix) ######################################################################## # Now a function must be defined that converts the similarity depicted # by a substitution matrix into a distance required by the UPGMA method. # In this case, the distance is defined as the difference between the # similarity of the two symbols and the average maximum similarity of # the symbols to themselves. # # Finally, the obtained (phylogenetic) tree is plotted as a dendrogram. def get_distance(similarities, i, j):
class NonsenseSequence(seq.Sequence):
    """
    Sequence type whose alphabet mixes an integer, a string and a bytes
    symbol.
    """

    # Class-level alphabet, returned by `get_alphabet()`
    alphabet = seq.Alphabet([42, "foo", b"bar"])

    def get_alphabet(self):
        """
        Return the alphabet stored on the `NonsenseSequence` class.
        """
        return NonsenseSequence.alphabet
# differs from the one performed with MUSCLE. # # If the MSA software supports protein sequence alignment AND # custom substitution matrices, e.g. MUSCLE and MAFFT, almost any type # of sequence can be aligned: # Internally the sequences and the matrix are converted into protein # sequences/matrix. # Then the masquerading sequences are aligned via the software and # finally the sequences are mapped back into the original sequence type. # Let's show this on the example of a nonsense alphabet. import numpy as np import biotite.application.mafft as mafft import biotite.sequence.align as align alphabet = seq.Alphabet(("foo", "bar", 42)) sequences = [ seq.GeneralSequence(alphabet, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], ] ] matrix = align.SubstitutionMatrix( alphabet, alphabet, np.array([[100, -100, -100], [-100, 100, -100], [-100, -100, 100]])) alignment = mafft.MafftApp.align(sequences, matrix=matrix) # As the alphabet does not have characters as symbols, # the alignment cannot be directly printed. # However, we can print the trace print(alignment.trace)
def test_length(alphabet_symbols, use_letter_alphabet):
    """
    Check that the length of an alphabet equals the number of symbols
    it was created from.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    assert len(alph) == len(alphabet_symbols)
def test_contains(alphabet_symbols, use_letter_alphabet):
    """
    Check that the ``in`` operator reports the symbol ``"D"`` as part
    of the alphabet.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    assert "D" in alph