def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
    """
    Check that symbol codes translated by an :class:`AlphabetMapper`
    yield the same symbols in the target alphabet as the untranslated
    codes yield in the source alphabet.
    """
    CODE_LENGTH = 10000

    alphabet_from = seq.Alphabet(source_alph_symbols)
    alphabet_to = seq.Alphabet(target_alph_symbols)

    # Reference: random symbol codes drawn from the source alphabet
    np.random.seed(0)
    random_code = np.random.randint(
        len(alphabet_from), size=CODE_LENGTH, dtype=int
    )
    ref_sequence = seq.GeneralSequence(alphabet_from)
    ref_sequence.code = random_code

    # Test: the same codes, translated into the target alphabet
    code_mapper = seq.AlphabetMapper(alphabet_from, alphabet_to)
    test_sequence = seq.GeneralSequence(alphabet_to)
    test_sequence.code = code_mapper[ref_sequence.code]

    assert test_sequence.symbols == ref_sequence.symbols
def test_custom_sequence_type(app_cls):
    """
    Run the given application class on two sequences of a custom
    alphabet and compare the resulting alignment with the expected
    trace.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    exp_trace = [
        [ 0,  0],
        [ 1, -1],
        [ 2,  1],
        [ 3,  2],
        [-1,  3],
        [ 4,  4],
        [ 5,  5],
        [ 6,  6],
    ]

    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else
    n_symbols = len(alph)
    score_matrix = np.full((n_symbols, n_symbols), -1000.0)
    np.fill_diagonal(score_matrix, 1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
def test_match_table(use_similarity_rule):
    """
    Test the :meth:`match_table()` method based on a known example.
    Using the similarity rule should give the same result, as the rule
    is chosen so that each k-mer is only similar to itself.
    """
    alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_")
    phrase1 = (
        "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_"
        "chuck_wood"
    )
    phrase2 = "woodchuck"
    sequence1 = seq.GeneralSequence(alphabet, phrase1)
    sequence2 = seq.GeneralSequence(alphabet, phrase2)

    rule = _identity_rule(alphabet) if use_similarity_rule else None

    table1 = align.KmerTable.from_sequences(4, [sequence1])
    table2 = align.KmerTable.from_sequences(4, [sequence2])

    # Expected position pairs of matching 4-mers
    ref_matches = {
        (0,  9),
        (0, 22),
        (1, 23),
        (2, 24),
        (3, 25),
        (4, 26),
        (5, 27),
        (4, 32),
        (5, 33),
        (0, 43),
        (1, 44),
        (2, 45),
        (3, 46),
        (4, 47),
        (5, 48),
        (4, 59),
        (5, 60),
        (0, 65),
    }

    test_matches = table1.match_table(table2, similarity_rule=rule)
    # the reference indices are irrelevant for this test
    test_matches = test_matches[:, [1, 3]]
    # Rows are converted to tuples to make them hashable
    test_matches = {tuple(match) for match in test_matches}
    assert test_matches == ref_matches
def plot_pb_scheme_alignment():
    """
    Plot the example alignment of sequences over the 16-letter
    alphabet ``a``-``p`` using a generated color scheme.

    A substitution matrix is written to a temporary file and handed to
    ``gecli.main()`` together with a target file for the color scheme.
    The colors loaded from that scheme file are then used to plot the
    alignment read from ``PB_EXAMPLE_FILE_NAME``.

    Returns
    -------
    fig
        The figure created via ``plt.figure()`` that contains the
        alignment plot.
    """
    # Fixed seed for the 'random' module
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        # NOTE(review): in this view the matrix text sits on a single
        # line; presumably the file contains one matrix row per line
        # - confirm against the repository before touching the literal
        file.write(""" a b c d e f g h i j k l m n o p a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 """)
    # Generate the color scheme for the alphabet and the matrix above;
    # the result is written to 'scheme_file'
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop",
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file
    ])
    colors = graphics.load_color_scheme(scheme_file)["colors"]

    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    # The strings in the file still contain '-' characters
    seq_strings = list(fasta_file.values())
    # The sequence objects are created from the strings without '-'...
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    # ...while the trace is derived from the unmodified strings
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)
    graphics.plot_alignment_type_based(ax, alignment, symbols_per_line=60, spacing=2, color_scheme=colors)
    fig.tight_layout()
    return fig
def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Re-wrap both sequences and the substitution matrix in an alphabet
    of 500 symbols, while keeping the original symbol codes.

    The intent is that the sequences use 'uint16' as dtype for the
    code.
    This is a necessary test, since 'uint8' uses a separate
    implementation.

    Returns the two new sequences and the new substitution matrix.
    """
    new_alph = seq.Alphabet(np.arange(500))

    def _rewrap(sequence):
        # Same symbol codes, but attached to the larger alphabet
        rewrapped = seq.GeneralSequence(new_alph)
        rewrapped.code = sequence.code
        return rewrapped

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # The original scores fill the upper left corner,
    # all remaining scores are zero
    orig_scores = matrix.score_matrix()
    orig_len = len(orig_scores)
    padded_scores = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32)
    padded_scores[:orig_len, :orig_len] = orig_scores
    new_matrix = align.SubstitutionMatrix(new_alph, new_alph, padded_scores)

    return _rewrap(seq1), _rewrap(seq2), new_matrix
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer
    than the amino acid alphabet.
    """
    alph = seq.Alphabet(range(50))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            [1, 2, 3],
            [1, 2, 3],
        ]
    ]
    with pytest.raises(TypeError):
        # Only the raised exception is of interest,
        # hence the created object is not bound to a name
        app_cls(sequences)
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    with pytest.raises(TypeError):
        # No 'matrix' is given on purpose;
        # only the raised exception is of interest,
        # hence the created object is not bound to a name
        app_cls(sequences)
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer
    than the amino acid alphabet.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(range(50))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            [1, 2, 3],
            [1, 2, 3],
        ]
    ]
    with pytest.raises(TypeError):
        try:
            app_cls(sequences)
        except VersionError:
            # A version mismatch is not the error under test
            # -> skip the test instead of letting it fail
            pytest.skip("Invalid software version")
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    with pytest.raises(TypeError):
        try:
            # No 'matrix' is given on purpose
            app_cls(sequences)
        except VersionError:
            # A version mismatch is not the error under test
            # -> skip the test instead of letting it fail
            pytest.skip("Invalid software version")
def test_custom_sequence_type(app_cls):
    """
    Run the given application class on two sequences of a custom
    alphabet and compare the resulting alignment with the expected
    trace.
    The test is skipped, if the binary is not installed or the
    application raises a :class:`VersionError` on creation.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    sequences = [
        seq.GeneralSequence(alph, sequence)
        for sequence in [
            ["foo", "bar", 42, "foo", "foo", 42, 42],
            ["foo", 42, "foo", "bar", "foo", 42, 42],
        ]
    ]
    exp_trace = [
        [ 0,  0],
        [ 1, -1],
        [ 2,  1],
        [ 3,  2],
        [-1,  3],
        [ 4,  4],
        [ 5,  5],
        [ 6,  6],
    ]
    # Strong identity matrix
    score_matrix = np.identity(len(alph))
    score_matrix[score_matrix == 0] = -1000
    score_matrix[score_matrix == 1] = 1000
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)
    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip("Invalid software version")
    app.start()
    app.join()
    alignment = app.get_alignment()
    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
pb_angles[:, 5] = phi[3:-1] pb_angles[:, 6] = psi[3:-1] pb_angles[:, 7] = phi[4:] pb_angles = np.rad2deg(pb_angles) # Angle RMSD of all reference angles with all actual angles rmsda = np.sum( ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180)**2, axis=-1) # Chose PB, where the RMSDA to the reference angle is lowest # Due to the definition of Biotite symbol codes # the index of the chosen PB is directly the symbol code pb_seq_code = np.argmin(rmsda, axis=0) # Put the array of symbol codes into actual sequence objects pb_sequence = seq.GeneralSequence(pb_alphabet) pb_sequence.code = pb_seq_code pb_seqs.append(pb_sequence) # Perfrom a multiple sequence alignment of the PB sequences matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) alignment, order, _, _ = align.align_multiple(pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False) # Visualize the alignment # Order alignment according to guide tree alignment = alignment[:, order.tolist()] labels = [organisms[i] for i in order]
# If the MSA software supports protein sequence alignment AND # custom substitution matrices, e.g. MUSCLE and MAFFT, almost any type # of sequence can be aligned: # Internally the sequences and the matrix are converted into protein # sequences/matrix. # Then the masquerading sequences are aligned via the software and # finally the sequences are mapped back into the original sequence type. # Let's show this on the example of a nonsense alphabet. import numpy as np import biotite.application.mafft as mafft import biotite.sequence.align as align alphabet = seq.Alphabet(("foo", "bar", 42)) sequences = [ seq.GeneralSequence(alphabet, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], ] ] matrix = align.SubstitutionMatrix( alphabet, alphabet, np.array([[100, -100, -100], [-100, 100, -100], [-100, -100, 100]])) alignment = mafft.MafftApp.align(sequences, matrix=matrix) # As the alphabet do not has characters as symbols # the alignment cannot be directly printed # However, we can print the trace print(alignment.trace) ######################################################################## # Secondary structure annotation