def parse_matrix(matrix_str, alphabet):
    """
    Create a substitution matrix from a file path or a database name.

    Parameters
    ----------
    matrix_str : str
        Either the path of an existing file, whose content is parsed
        with ``SubstitutionMatrix.dict_from_str()``, or the name of a
        matrix listed by ``SubstitutionMatrix.list_db()``.
        Names are matched case-insensitively.
    alphabet
        Passed as both the first and the second alphabet to the
        ``SubstitutionMatrix`` constructor.

    Returns
    -------
    SubstitutionMatrix
        The matrix built from the file content or the database entry.

    Raises
    ------
    InputError
        If `matrix_str` is neither an existing file nor the name of a
        database matrix.
    """
    if isfile(matrix_str):
        with open(matrix_str) as f:
            matrix_dict = align.SubstitutionMatrix.dict_from_str(f.read())
        return align.SubstitutionMatrix(alphabet, alphabet, matrix_dict)

    # String is a NCBI matrix name
    # The comparison ignores case, but the constructor receives the
    # spelling used by the database, so entries that are not entirely
    # upper case can be selected, too
    upper_matrix_str = matrix_str.upper()
    db_name = next(
        (
            name for name in align.SubstitutionMatrix.list_db()
            if name.upper() == upper_matrix_str
        ),
        None,
    )
    if db_name is None:
        raise InputError(
            f"'{matrix_str}' is neither a file "
            f"nor a valid NCBI substitution matrix"
        )
    return align.SubstitutionMatrix(alphabet, alphabet, db_name)
def test_matrix_str():
    """
    Compare the string form of a 3x3 substitution matrix over two small
    custom alphabets with its expected text.
    """
    row_alphabet = seq.Alphabet("abc")
    column_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3, 3))
    substitution_matrix = align.SubstitutionMatrix(
        row_alphabet, column_alphabet, scores
    )

    expected_lines = [" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"]
    assert str(substitution_matrix) == "\n".join(expected_lines)
def test_custom_sequence_type(app_cls):
    """
    Run `app_cls` on two sequences over a non-character alphabet and
    compare the returned sequences and alignment trace with the
    expected ones.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = (
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    )
    sequences = [seq.GeneralSequence(alph, symbols) for symbols in symbol_lists]
    exp_trace = [
        [0, 0],
        [1, -1],
        [2, 1],
        [3, 2],
        [-1, 3],
        [4, 4],
        [5, 5],
        [6, 6],
    ]

    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else (float, like the
    # identity matrix it is derived from)
    identity = np.identity(len(alph))
    score_matrix = np.where(identity == 1, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    result = app.get_alignment()

    assert result.sequences == sequences
    assert result.trace.tolist() == exp_trace
def test_matrices(db_entry):
    """
    Test reading of matrix files.

    The test passes if a protein substitution matrix can be constructed
    from `db_entry` without raising an exception.
    """
    protein_alphabet = seq.ProteinSequence.alphabet
    align.SubstitutionMatrix(protein_alphabet, protein_alphabet, db_entry)
def matrices(name):
    """
    Get a protein substitution matrix by its name.

    Parameters
    ----------
    name : str
        Name of the matrix.
        ``"BLOSUM62"`` is served by
        ``SubstitutionMatrix.std_protein_matrix()``.
        Every other value is forwarded unchanged to the
        ``SubstitutionMatrix`` constructor together with the protein
        alphabet for both axes.

    Returns
    -------
    SubstitutionMatrix
        The requested substitution matrix.

    Notes
    -----
    This function does not implement a fallback for unknown names:
    whatever the ``SubstitutionMatrix`` constructor does with such a
    name applies.
    """
    if name != "BLOSUM62":
        protein_alphabet = seq.ProteinSequence.alphabet
        return seq_align.SubstitutionMatrix(
            protein_alphabet, protein_alphabet, name
        )
    return seq_align.SubstitutionMatrix.std_protein_matrix()
def test_custom_substitution_matrix(sequences, app_cls):
    """
    Run `app_cls` on `sequences` with a custom substitution matrix and
    compare the string form of the resulting alignment with the
    expected one.
    """
    exp_ali = ("BI-QTITE\n"
               "TITANITE\n"
               "BI-SMITE\n"
               "-I-QLITE")

    protein_alphabet = seq.ProteinSequence.alphabet
    # Strong identity matrix
    identity_scores = np.identity(len(protein_alphabet)) * 1000
    custom_matrix = align.SubstitutionMatrix(
        protein_alphabet, protein_alphabet, identity_scores
    )

    app = app_cls(sequences, matrix=custom_matrix)
    app.start()
    app.join()
    assert str(app.get_alignment()) == exp_ali
def parse_matrix(matrix_str, alphabet):
    """
    Create a substitution matrix from a file path or a database name.

    Parameters
    ----------
    matrix_str : str
        Either the path of an existing file, whose content is parsed
        with ``SubstitutionMatrix.dict_from_str()``, or the name of a
        matrix listed by ``SubstitutionMatrix.list_db()``.
        Names are matched case-insensitively.
    alphabet
        Passed as both the first and the second alphabet to the
        ``SubstitutionMatrix`` constructor.

    Returns
    -------
    SubstitutionMatrix
        The matrix built from the file content or the database entry.

    Raises
    ------
    InputError
        If `matrix_str` is neither an existing file nor the name of a
        database matrix.
    """
    if isfile(matrix_str):
        with open(matrix_str) as f:
            matrix_dict = align.SubstitutionMatrix.dict_from_str(f.read())
        return align.SubstitutionMatrix(alphabet, alphabet, matrix_dict)

    # String is a NCBI matrix name
    # For user convenience there is no case sensitivity
    # -> Find fitting matrix
    matrix_list = align.SubstitutionMatrix.list_db()
    upper_matrix_list = [m.upper() for m in matrix_list]
    try:
        index = upper_matrix_list.index(matrix_str.upper())
    except ValueError:
        # Only a failed lookup is translated into an InputError;
        # the lookup traceback itself carries no useful information
        raise InputError(
            f"'{matrix_str}' is neither a file "
            f"nor a valid NCBI substitution matrix"
        ) from None
    return align.SubstitutionMatrix(alphabet, alphabet, matrix_list[index])
def test_invalid_scoring_scheme():
    """
    Check if `from_samples()` raises an exception when the expected
    similarity score between two random symbols is positive.
    """
    alph = seq.ProteinSequence.alphabet
    n_symbols = len(alph)

    # Every symbol pair scores 1
    all_ones = np.ones((n_symbols, n_symbols), dtype=int)
    matrix = align.SubstitutionMatrix(alph, alph, all_ones)
    # Uniform background frequencies
    uniform_freq = np.ones(n_symbols)

    with pytest.raises(ValueError):
        EValueEstimator.from_samples(alph, matrix, -10, uniform_freq)
def test_score_scaling(sequences):
    """
    Multiplying substitution scores and gap penalties with the same
    constant should leave the E-values unaffected.
    To test this, real sequences are aligned with the standard scoring
    scheme and with a scaled one, and the log E-values calculated for
    both sets of alignments are compared.
    """
    SCALING_FACTOR = 1000
    GAP_PENALTY = (-12, -1)
    SEQ_LENGTH = 300

    protein_alphabet = seq.ProteinSequence.alphabet

    def local_scores(sub_matrix, gap_penalty):
        # Score of the best local alignment
        # for each of the first nine neighboring sequence pairs
        return [
            align.align_optimal(
                sequences[i], sequences[i + 1], sub_matrix, gap_penalty,
                local=True, max_number=1
            )[0].score
            for i in range(9)
        ]

    # Standard scoring scheme
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    np.random.seed(0)
    std_estimator = align.EValueEstimator.from_samples(
        protein_alphabet, matrix, GAP_PENALTY, BACKGROUND
    )
    std_log_evalues = std_estimator.log_evalue(
        local_scores(matrix, GAP_PENALTY), SEQ_LENGTH, SEQ_LENGTH
    )

    # Scaled scoring scheme
    scaled_matrix = align.SubstitutionMatrix(
        protein_alphabet, protein_alphabet,
        matrix.score_matrix() * SCALING_FACTOR
    )
    scaled_gap_penalty = tuple(
        penalty * SCALING_FACTOR for penalty in GAP_PENALTY
    )
    scaled_estimator = align.EValueEstimator.from_samples(
        protein_alphabet, scaled_matrix, scaled_gap_penalty, BACKGROUND
    )
    scaled_log_evalues = scaled_estimator.log_evalue(
        local_scores(scaled_matrix, scaled_gap_penalty),
        SEQ_LENGTH, SEQ_LENGTH
    )

    # Due to relatively low sample size, expect rather large deviation
    assert std_log_evalues.tolist() == pytest.approx(
        scaled_log_evalues.tolist(), rel=0.2
    )
def test_matrix_str():
    """
    Check the conversion of a substitution matrix into a string
    using a small hand-made matrix.
    """
    expected = "\n".join(
        [" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"]
    )

    first_alphabet = seq.Alphabet("abc")
    second_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3,3))
    small_matrix = align.SubstitutionMatrix(
        first_alphabet, second_alphabet, scores
    )

    assert str(small_matrix) == expected
def test_custom_substitution_matrix(sequences, app_cls):
    """
    Run `app_cls` on `sequences` with a custom substitution matrix and
    compare the string form of the resulting alignment with the
    expected one.

    The test is skipped if the program registered for `app_cls` in
    ``BIN_PATH`` is not installed or if creating the application raises
    a ``VersionError``.
    """
    executable = BIN_PATH[app_cls]
    if is_not_installed(executable):
        pytest.skip(f"'{executable}' is not installed")

    exp_ali = ("BI-QTITE\n"
               "TITANITE\n"
               "BI-SMITE\n"
               "-I-QLITE")

    protein_alphabet = seq.ProteinSequence.alphabet
    # Strong identity matrix
    identity_scores = np.identity(len(protein_alphabet)) * 1000
    custom_matrix = align.SubstitutionMatrix(
        protein_alphabet, protein_alphabet, identity_scores
    )

    try:
        app = app_cls(sequences, matrix=custom_matrix)
    except VersionError:
        pytest.skip(f"Invalid software version")
    app.start()
    app.join()
    assert str(app.get_alignment()) == exp_ali
def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k):
    """
    Compare the extreme value distribution parameters estimated by
    `EValueEstimator` with the reference parameters described in the
    original publication by Altschul *et al*.
    """
    SAMPLE_LENGTH = 500
    SAMPLE_SIZE = 1000

    protein_alphabet = seq.ProteinSequence.alphabet
    substitution_matrix = align.SubstitutionMatrix(
        protein_alphabet, protein_alphabet, matrix_name
    )

    # Fixed seed directly before sampling
    np.random.seed(0)
    estimator = align.EValueEstimator.from_samples(
        protein_alphabet, substitution_matrix, gap_penalty, BACKGROUND,
        SAMPLE_LENGTH, SAMPLE_SIZE
    )

    # Due to relatively low sample size, expect rather large deviation
    expected_lam = pytest.approx(ref_lam, rel=0.1)
    assert estimator.lam == expected_lam
    expected_k = pytest.approx(ref_k, rel=0.6)
    assert estimator.k == expected_k
def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Move both sequences and the substitution matrix to a 500-symbol
    alphabet, so that the sequences use 'uint16' as dtype for the code.
    This conversion is required for testing, since 'uint8' uses a
    separate implementation.

    Parameters
    ----------
    seq1, seq2
        The sequences whose code is transferred to new
        ``GeneralSequence`` objects.
    matrix
        The substitution matrix whose scores are copied into the
        upper left corner of the enlarged score matrix.

    Returns
    -------
    seq1, seq2, matrix
        The converted sequences and the enlarged substitution matrix.
    """
    wide_alphabet = seq.Alphabet(np.arange(500))

    def _rewrap(sequence):
        # Same symbol codes, but attached to the larger alphabet
        rewrapped = seq.GeneralSequence(wide_alphabet)
        rewrapped.code = sequence.code
        return rewrapped

    seq1 = _rewrap(seq1)
    seq2 = _rewrap(seq2)

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # The original scores are kept, all additional entries are zero
    original_scores = matrix.score_matrix()
    original_size = len(original_scores)
    new_size = len(wide_alphabet)
    padded_scores = np.zeros((new_size, new_size), dtype=np.int32)
    padded_scores[:original_size, :original_size] = original_scores

    wide_matrix = align.SubstitutionMatrix(
        wide_alphabet, wide_alphabet, padded_scores
    )
    return seq1, seq2, wide_matrix
def test_custom_sequence_type(app_cls):
    """
    Run `app_cls` on two sequences over a non-character alphabet and
    compare the returned sequences and alignment trace with the
    expected ones.

    The test is skipped if the program registered for `app_cls` in
    ``BIN_PATH`` is not installed or if creating the application raises
    a ``VersionError``.
    """
    executable = BIN_PATH[app_cls]
    if is_not_installed(executable):
        pytest.skip(f"'{executable}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = (
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    )
    sequences = [seq.GeneralSequence(alph, symbols) for symbols in symbol_lists]
    exp_trace = [
        [0, 0],
        [1, -1],
        [2, 1],
        [3, 2],
        [-1, 3],
        [4, 4],
        [5, 5],
        [6, 6],
    ]

    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else (float, like the
    # identity matrix it is derived from)
    identity = np.identity(len(alph))
    score_matrix = np.where(identity == 1, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip(f"Invalid software version")
    app.start()
    app.join()
    result = app.get_alignment()

    assert result.sequences == sequences
    assert result.trace.tolist() == exp_trace
def test_matrices(db_entry):
    """
    Construct a protein substitution matrix from `db_entry`.

    The test passes if the constructor does not raise an exception.
    """
    protein_alphabet = seq.ProteinSequence.alphabet
    align.SubstitutionMatrix(protein_alphabet, protein_alphabet, db_entry)
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics

# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)

########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are removed.

# Matrix should not contain ambiguous symbols or stop signal:
# Slice off the last four symbols of both alphabets
# and the corresponding rows and columns of the score matrix
matrix = align.SubstitutionMatrix(
    seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
    seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
    matrix.score_matrix()[:-4, :-4])
similarities = matrix.score_matrix()
print(matrix)

########################################################################
# Now a function must be defined that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
# In this case, the distance is defined as the difference between the
# similarity of the two symbols and the average maximum similarity of
# the symbols to themselves.
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.

def get_distance(similarities, i, j):
    s_max = (similarities[i, i] + similarities[j, j]) / 2
# So much for theory. # Let's start by showing different ways to construct a # :class:`SubstitutionMatrix`, in our case for protein sequence # alignments: import biotite.sequence as seq import biotite.sequence.align as align import numpy as np alph = seq.ProteinSequence.alphabet # Load the standard protein substitution matrix, which is BLOSUM62 matrix = align.SubstitutionMatrix.std_protein_matrix() print("\nBLOSUM62\n") print(matrix) # Load another matrix from internal database matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50") # Load a matrix dictionary representation, # modify it, and create the SubstitutionMatrix # (Dictionary could be loaded from matrix string in NCBI format, too) matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62") matrix_dict[("P", "Y")] = 100 matrix = align.SubstitutionMatrix(alph, alph, matrix_dict) # And now create a matrix by directly provding the ndarray # containing the similarity scores # (identity matrix in our case) scores = np.identity(len(alph), dtype=int) matrix = align.SubstitutionMatrix(alph, alph, scores) print("\n\nIdentity matrix\n") print(matrix) ########################################################################
def simple_matrix(): alph = seq.NucleotideSequence.alphabet_unamb return align.SubstitutionMatrix( alph, alph, np.array([[1, -1, -1, -1], [-1, 1, -1, -1], [-1, -1, 1, -1], [-1, -1, -1, 1]]))
def _identity_rule(alphabet): score_matrix = np.full((len(alphabet),) * 2, -1, dtype=int) np.fill_diagonal(score_matrix, 0) matrix = align.SubstitutionMatrix(alphabet, alphabet, score_matrix) rule = align.ScoreThresholdRule(matrix, 0) return rule
# finally the sequences are mapped back into the original sequence type. # Let's show this on the example of a nonsense alphabet. import numpy as np import biotite.application.mafft as mafft import biotite.sequence.align as align alphabet = seq.Alphabet(("foo", "bar", 42)) sequences = [ seq.GeneralSequence(alphabet, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], ] ] matrix = align.SubstitutionMatrix( alphabet, alphabet, np.array([[100, -100, -100], [-100, 100, -100], [-100, -100, 100]])) alignment = mafft.MafftApp.align(sequences, matrix=matrix) # As the alphabet do not has characters as symbols # the alignment cannot be directly printed # However, we can print the trace print(alignment.trace) ######################################################################## # Secondary structure annotation # ------------------------------ # # .. currentmodule:: biotite.application.dssp # # Althogh :mod:`biotite.structure` offers the function # :func:`annotate_sse()` to assign secondary structure elements based on
rmsda = np.sum( ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180)**2, axis=-1) # Choose the PB where the RMSDA to the reference angle is lowest # Due to the definition of Biotite symbol codes # the index of the chosen PB is directly the symbol code pb_seq_code = np.argmin(rmsda, axis=0) # Put the array of symbol codes into actual sequence objects pb_sequence = seq.GeneralSequence(pb_alphabet) pb_sequence.code = pb_seq_code pb_seqs.append(pb_sequence) # Perform a multiple sequence alignment of the PB sequences matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) alignment, order, _, _ = align.align_multiple(pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False) # Visualize the alignment # Order alignment according to guide tree alignment = alignment[:, order.tolist()] labels = [organisms[i] for i in order] fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.add_subplot(111) # The color scheme was generated with the 'Gecos' software graphics.plot_alignment_type_based(ax, alignment, labels=labels,