def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
    """
    Generate an alignment between two sequences

    Both inputs are wrapped in ``ProteinSequence`` objects and aligned
    with ``align_optimal()`` using the standard protein substitution
    matrix.  Only the first alignment returned is kept.

    Parameters
    ----------
    seq1: str
        The first sequence to be aligned
    seq2: str
        The second sequence to be aligned
    local: bool
        If false, a global alignment is performed
        (based on the Needleman-Wunsch algorithm),
        otherwise a local alignment is performed
        (based on the Smith–Waterman algorithm). (Default: True)

    Returns
    -------
    Alignment
        The object built by ``cls(alignment=..., metadata=...)``.
        ``metadata`` is a dict with the keys ``"score"``,
        ``"sequence_identity"``, ``"symbols"`` and ``"codes"``, each
        computed from the selected alignment.
    """
    # Imported lazily so Biotite is only required when this is called
    import biotite.sequence as seq
    import biotite.sequence.align as align

    # create the default matrix
    # TODO add more options for the choice of matrix
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    alignments = align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        matrix,
        local=local,
    )
    # align_optimal() yields an indexable collection of alignments;
    # only the first entry is used here
    alignment = alignments[0]

    return cls(
        alignment=alignment,
        metadata={
            "score": alignment.score,
            "sequence_identity": align.get_sequence_identity(alignment),
            "symbols": align.get_symbols(alignment),
            "codes": align.get_codes(alignment),
        },
    )
def mutual_information_zscore(alignment, n_shuffle=100):
    """
    Express the mutual information of an alignment as a Z-score
    relative to a background of shuffled code arrays.

    Parameters
    ----------
    alignment
        Object accepted by ``align.get_codes()`` that also exposes
        ``sequences``; the alphabet of its first sequence is used.
    n_shuffle : int
        Number of shuffled code arrays that make up the background.
        (Default: 100)

    Returns
    -------
    z_score
        ``(mi - mean) / std``, where ``mean`` and ``std`` are taken
        over the shuffled background along its first axis.
        There is no guard against a standard deviation of zero.

    Notes
    -----
    The global NumPy random state is reset with ``np.random.seed(0)``
    on every call.
    """
    # Transposed code array, so the alignment-column axis comes first
    column_codes = align.get_codes(alignment).T
    alphabet = alignment.sequences[0].alphabet

    observed_mi = _mutual_information(column_codes, alphabet)

    # Seed after the observed value has been computed and before any
    # shuffling -- presumably _shuffle() draws from NumPy's global RNG,
    # which would make the background reproducible (TODO confirm)
    np.random.seed(0)
    background_mi = np.stack([
        _mutual_information(_shuffle(column_codes), alphabet)
        for _ in range(n_shuffle)
    ])

    background_mean = np.mean(background_mi, axis=0)
    background_std = np.std(background_mi, axis=0)
    return (observed_mi - background_mean) / background_std
# If you are interested in more advanced visualization examples, have a # look at the :doc:`example gallery <../examples/gallery/index>`. # # You can also do some simple analysis on these objects, like # determining the sequence identity or calculating the score. # For further custom analysis, it can be convenient to have directly the # aligned symbos codes instead of the trace. alignment = alignments[0] print("Score: ", alignment.score) print("Recalculated score:", align.score(alignment, matrix=matrix)) print("Sequence identity:", align.get_sequence_identity(alignment)) print("Symbols:") print(align.get_symbols(alignment)) print("symbols codes:") print(align.get_codes(alignment)) ######################################################################## # # .. currentmodule:: biotite.sequence.io.fasta # # You may ask, why should you recalculate the score, when the score has # already been directly calculated via :func:`align_optimal()`. # The answer is, that you might load an alignment from an external # alignment program as FASTA file using :func:`get_alignment()`. # # .. currentmodule:: biotite.sequence.align # # If you want to perform a multiple sequence alignment, have a look at # the :func:`align_multiple()` function or the interfaces to external # MSA software in the :mod:`biotite.application` subpackage.
similarities[i] = 0 else: sim = matrix[code1, code2] # Normalize (range 0.0 - 1.0) min_sim = np.min(matrix[code1]) max_sim = np.max(matrix[code1]) sim = (sim - min_sim) / (max_sim - min_sim) similarities[i] = sim # Delete self-similarity similarities = np.delete(similarities, seq_i) return np.average(similarities) matrix = align.SubstitutionMatrix.std_protein_matrix() # Get the alignment columns as symbols codes (-1 for gaps) trace_code = align.get_codes(alignment) similarities = np.zeros(trace_code.shape) for i in range(similarities.shape[0]): for j in range(similarities.shape[1]): similarities[i, j] = get_average_normalized_similarity( trace_code, matrix.score_matrix(), i, j) figure = plt.figure(figsize=(8.0, 3.0)) ax = figure.add_subplot(111) heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0) cbar = figure.colorbar(heatmap) figure.tight_layout() ######################################################################## # As the plot creates a heatmap field for every alignment column, # the plot looks quite confusing.
# # Finally, we predict and plot the secondary structure of the *M1* RNA # with help from *ViennaRNA* and highlight mismatch position between # *E. coli* and *S. enterica* *M1*. app = viennarna.RNAfoldApp(m1_sequence) app.start() app.join() base_pairs = app.get_base_pairs() app = viennarna.RNAplotApp(base_pairs=base_pairs, length=len(m1_sequence)) app.start() app.join() plot_coord = app.get_coordinates() codes = align.get_codes(best_alignment) m1_no_gap_codes = codes[codes[:, 0] != -1] identities = m1_no_gap_codes[0] == m1_no_gap_codes[1] fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111) # Plot base connections ax.plot(*plot_coord.T, color="black", linewidth=1, zorder=1) # Plot base pairings ax.add_collection( LineCollection([(plot_coord[i], plot_coord[j]) for i, j in base_pairs], color="silver", linewidth=1, zorder=1)) # Plot base markers ax.scatter(