def test_pairwise_identity(sequences, mode):
    """
    Verify `get_pairwise_sequence_identity()` against a reference matrix
    assembled from individual `get_sequence_identity()` calls on every
    pair of sequences in a multiple sequence alignment.
    """
    n_seq = len(sequences)
    msa, _, _, _ = align.align_multiple(
        sequences,
        matrix=align.SubstitutionMatrix.std_protein_matrix()
    )

    # Reference: one identity calculation per ordered pair (row, col)
    expected = np.zeros((n_seq, n_seq))
    for row in range(n_seq):
        for col in range(n_seq):
            pair = msa[:, [row, col]]
            expected[row, col] = align.get_sequence_identity(pair, mode=mode)

    observed = align.get_pairwise_sequence_identity(msa, mode=mode)

    if mode == "shortest":
        # A sequence compared with itself must have an identity of 1,
        # if only the length of the sequence is counted
        on_diagonal = np.diag(observed)
        assert (on_diagonal == 1).all()

    # All identities lie within [0, 1]
    within_bounds = (observed >= 0) & (observed <= 1)
    assert within_bounds.all()

    # The matrix equals its own transpose
    assert (observed.T == observed).all()

    # Both functions must agree on every pair
    assert (observed == expected).all()
def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
    """
    Generate an alignment between two protein sequences.

    Both strings are converted to `ProteinSequence` objects and aligned
    with `align_optimal()` using the standard protein substitution
    matrix. Only the first returned alignment is used.

    Parameters
    ----------
    seq1: str
        The first sequence to be aligned
    seq2: str
        The second sequence to be aligned
    local: bool
        If false, a global alignment is performed
        (based on the Needleman-Wunsch algorithm),
        otherwise a local alignment is performed
        (based on the Smith–Waterman algorithm).
        (Default: True)

    Returns
    -------
    Alignment
        An instance of `cls` created from the first alignment and a
        ``metadata`` dict with the keys ``"score"``,
        ``"sequence_identity"``, ``"symbols"`` and ``"codes"``.
    """
    # Imported lazily inside the function, as in the original code
    import biotite.sequence as seq
    import biotite.sequence.align as align

    # create the default matrix
    # TODO add more options for the choice of matrix
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    alignments = align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        matrix,
        local=local,
    )

    # Several equally scored alignments may be returned; keep the first
    alignment = alignments[0]

    return cls(
        alignment=alignment,
        metadata={
            "score": alignment.score,
            "sequence_identity": align.get_sequence_identity(alignment),
            "symbols": align.get_symbols(alignment),
            "codes": align.get_codes(alignment),
        },
    )
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Check `align_banded()` against the output of `align_optimal()`.

    Two long sequences, selected from `sequences` via `seq_indices`,
    are aligned with both functions.
    The band is meant to be wide enough for `align_banded()` to be able
    to return the optimal alignment(s).
    """
    MAX_NUMBER = 100

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1, seq2 = sequences[index1], sequences[index2]

    # Terminal gaps are stripped from the reference to obtain a true
    # semi-global alignment, as returned by align_banded()
    ref_alignments = [
        align.remove_terminal_gaps(optimal)
        for optimal in align.align_optimal(
            seq1, seq2, matrix,
            gap_penalty=gap_penalty, local=local, terminal_penalty=False,
            max_number=MAX_NUMBER
        )
    ]

    # Similar sequences get a relatively small band,
    # dissimilar ones the entire search space
    if align.get_sequence_identity(ref_alignments[0]) > 0.5:
        band_width = 100
    else:
        band_width = len(seq1) + len(seq2)

    test_alignments = align.align_banded(
        seq1, seq2, matrix, (-band_width, band_width),
        gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER
    )

    try:
        assert test_alignments[0].score == ref_alignments[0].score
        if len(ref_alignments) < MAX_NUMBER:
            # The exact set of alignments is only comparable, if the
            # number of traces was not limited by MAX_NUMBER
            assert len(test_alignments) == len(ref_alignments)
            for candidate in test_alignments:
                assert candidate in ref_alignments
    except AssertionError:
        # Show the first alignment of each kind to ease debugging,
        # then let the failure propagate
        print("First tested alignment:")
        print()
        print(test_alignments[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(ref_alignments[0])
        raise
def test_identity():
    """
    Check `get_sequence_identity()` in each mode on a small handcrafted
    alignment that contains terminal and internal gaps.
    """
    gapped_strings = [
        "--HAKLPRDD--WL--",
        "FRHA--QRTDADWLHH",
    ]
    # The ungapped sequences are obtained by stripping the gap symbols
    sequences = [
        seq.ProteinSequence(gapped.replace("-", ""))
        for gapped in gapped_strings
    ]
    trace = align.Alignment.trace_from_strings(gapped_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    # Expected identity for each mode: 6 identical positions divided by
    # the mode-dependent length
    expected_identity = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, value in expected_identity.items():
        assert align.get_sequence_identity(alignment, mode=mode) == value
def test_search_sequence():
    """
    Search the RCSB for sequences similar to the first sequence of the
    ``1l2y.cif`` test structure and check that every hit reaches the
    requested minimum identity when aligned to that reference sequence.
    """
    IDENTITY_CUTOFF = 0.9

    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(
        ref_sequence, "protein", min_identity=IDENTITY_CUTOFF
    )
    test_ids = rcsb.search(query)

    # The substitution matrix does not depend on the hit,
    # so it is created once instead of once per iteration
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # 'pdb_id' instead of 'id' to avoid shadowing the builtin
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(
            ref_sequence, test_sequence, matrix, terminal_penalty=False
        )[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
symbols_per_line=len(alignments[0]))
fig.tight_layout()

########################################################################
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to have the
# aligned symbol codes directly instead of the trace.

alignment = alignments[0]
print("Score: ", alignment.score)
print("Recalculated score:", align.score(alignment, matrix=matrix))
print("Sequence identity:", align.get_sequence_identity(alignment))
print("Symbols:")
print(align.get_symbols(alignment))
print("symbols codes:")
print(align.get_codes(alignment))

########################################################################
#
# .. currentmodule:: biotite.sequence.io.fasta
#
# You may ask why you should recalculate the score, when the score has
# already been calculated directly via :func:`align_optimal()`.
# The answer is that you might load an alignment from an external
# alignment program as a FASTA file using :func:`get_alignment()`.
#
# .. currentmodule:: biotite.sequence.align
# Mutations in the B.1.1.7 variant # -------------------------------- # # To get an rough overview about the overall sequence identity between # the genomes and the locations of mutations in the B.1.1.7 variant, # we need to align the original genome to our assembled one. # As both genomes are expected to be highly similar, we can use a banded # alignment again using a very conservative band width. BAND_WIDTH = 1000 genome_alignment = align.align_banded( var_genome, orig_genome, matrix, band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1 )[0] identity = align.get_sequence_identity(genome_alignment, 'all') print(f"Sequence identity: {identity * 100:.2f} %") ######################################################################## # Now we would like to have a closer look at the mutation locations. # To contextualize the locations we plot the mutation frequency along # with the gene locations. # The genomic coordinates for each gene can be extracted from the # already downloaded *GenBank* file of the reference genome. N_BINS = 50 # Get genomic coordinates for all SARS-Cov-2 genes gb_file = gb.GenBankFile.read(orig_genome_file) annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
# to ensure that values in the 'res_id' annotation point to the sequence structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) structure = structure[struc.filter_amino_acids(structure)] # Identity threshold for a sequence to be counted as homologous sequence IDENTITY_THESHOLD = 0.4 # Find homologous proteins in SwissProt via BLAST app = blast.BlastWebApp("blastp", sequence, database="swissprot") app.start() app.join() alignments = app.get_alignments() hit_seqs = [sequence] hit_ids = ["Query"] hit_starts = [1] for ali in alignments: identity = align.get_sequence_identity(ali) # Do not include the exact same sequence -> identity < 1.0 if identity > IDENTITY_THESHOLD and identity < 1.0: hit_seqs.append(ali.sequences[1]) hit_ids.append(ali.hit_id) hit_starts.append(ali.hit_interval[0]) # Perform MSA alignment = clustalo.ClustalOmegaApp.align(hit_seqs) # Plot MSA number_functions = [] for start in hit_starts: def some_func(x, start=start): return x + start