biotite.temp_file("sequences.fasta"), "protein", "fasta") file = fasta.FastaFile.read(file_name) for name, sequence in file.items(): if "CAC34569" in name: avidin_seq = seq.ProteinSequence(sequence) elif "ACL82594" in name: streptavidin_seq = seq.ProteinSequence(sequence) # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False) # Draw first and only alignment # The color intensity indicates the similiarity fig = plt.figure(figsize=(8.0, 2.5)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based(ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"], show_numbers=True, show_line_position=True) fig.tight_layout() plt.show()
# NOTE(review): the following statement presumably belongs to a loop body
# that starts outside this excerpt; its original indentation could not be
# recovered - restore it when merging.
listed_sources.add(source)

# Show only the first 20 binding sites
print("Binding sites:")
for site in binding_sites[:20]:
    print(site)

########################################################################
# Now we can perform a multiple sequence alignment of the binding site
# sequences. Here we use Clustal Omega to perform this task.
# Since we have up to 200 sequences we visualize only a small portion of
# the alignment.

alignment = clustalo.ClustalOmegaApp.align(binding_sites)
fig = plt.figure(figsize=(4.5, 4.0))
ax = fig.add_subplot(111)
# Plot only the first 20 sequences;
# 'symbols_per_line=len(alignment)' keeps all columns on a single line
graphics.plot_alignment_similarity_based(
    ax, alignment[:,:20], labels=sources[:20],
    symbols_per_line=len(alignment)
)
# Source names in italic
ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"})
fig.tight_layout()

########################################################################
# Finally we can generate our sequence logo.

fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_sequence_logo(ax, alignment)
ax.set_xticks([5,10,15,20])
ax.set_xlabel("Residue position")
ax.set_ylabel("Bits")
# Only show left and bottom spine
# Additionally the alignment score is stored in this object.
# Furthermore, this object can pretty-print the alignment into a human
# readable form.
#
# For publication purposes you can create an actual figure based
# on *Matplotlib*.
# You can either decide to color the symbols based on the symbol type
# or based on the similarity within the alignment columns.
# In this case we will go with the similarity visualization.

import matplotlib.pyplot as plt
import biotite.sequence.graphics as graphics

fig, ax = plt.subplots(figsize=(2.0, 0.8))
# 'symbols_per_line=len(alignments[0])' puts the entire alignment
# on a single line
graphics.plot_alignment_similarity_based(
    ax, alignments[0], matrix=matrix, symbols_per_line=len(alignments[0])
)
fig.tight_layout()

########################################################################
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to directly have the
# aligned symbol codes instead of the trace.

alignment = alignments[0]
print("Score: ", alignment.score)
print("Recalculated score:", align.score(alignment, matrix=matrix))
# for the genomic sequence,
# since the original indices refer to the reverse complement sequence,
# but we want the numbers to refer to the original sequence.

# Use first and only alignment
alignment = rev_alignments[0]
# Reverse sequence numbering for second sequence (genome) in alignment
# (None keeps the default numbering for the first sequence)
number_funcs = [None, lambda x: len(alignment.sequences[1]) - x]
# Visualize alignment, use custom color
fig = plt.figure(figsize=(8.0, 2.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix,
    labels=["E. coli (leuL)", "S. enterica"], show_numbers=True,
    number_functions=number_funcs, show_line_position=True,
    color=biotite.colors["lightorange"]
)
fig.tight_layout()

########################################################################
# We will now go even further and align the translated protein
# sequences.

leul_ec = leul_seq
# Obtain the S enterica leuL sequence
# using the first and last index in the alignment trace
first_i = alignment.trace[0, 1]
last_i = alignment.trace[-1, 1]
# Align both sequences and keep only the first reported alignment
alignment = align.align_optimal(
    var_spike_prot_seq, orig_spike_prot_seq, blosum_matrix, max_number=1
)[0]

fig = plt.figure(figsize=(8.0, 10.0))
ax = fig.add_subplot(111)

# Plot alignment
cmap = LinearSegmentedColormap.from_list(
    "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)]
    #                    ^ reddish        ^ white
)
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=blosum_matrix, symbols_per_line=SYMBOLS_PER_LINE,
    labels=["B.1.1.7", "Reference"], show_numbers=True, label_size=9,
    number_size=9, symbol_size=7, spacing=SPACING, cmap=cmap
)

## Add indicator for features to the alignment
# Loop-invariant, hence computed once before the loop
n_sequences = len(alignment.sequences)
# Ceiling division gives the number of plotted lines.
# The former '1 + len(alignment) // SYMBOLS_PER_LINE' produced an extra,
# empty line whenever the alignment length is an exact multiple of
# SYMBOLS_PER_LINE, so that 'alignment.trace[col_start, 1]' was indexed
# out of bounds.
for row in range(
    (len(alignment) + SYMBOLS_PER_LINE - 1) // SYMBOLS_PER_LINE
):
    col_start = SYMBOLS_PER_LINE * row
    # The last line may contain fewer than SYMBOLS_PER_LINE columns
    col_stop = min(SYMBOLS_PER_LINE * (row + 1), len(alignment))
    # Range of the second sequence that is covered by this line
    # NOTE(review): presumably the trace holds -1 at gap positions; a gap
    # in the second sequence at a line boundary would distort this
    # range - confirm
    seq_start = alignment.trace[col_start, 1]
    seq_stop = alignment.trace[col_stop-1, 1] + 1
    y_base = (n_sequences + SPACING) * row + n_sequences
if trace[i, 0] != -1: start_index = i break # ...and the end of the sequence for i in range(len(trace) - 1, -1, -1): # Check if all sequences have no gap at the given position if trace[i, 0] != -1: stop_index = i + 1 break # Truncate alignment to region where the 'PI3K' sequence exists alignment.trace = alignment.trace[start_index:stop_index] matrix = align.SubstitutionMatrix.std_protein_matrix() fig = plt.figure(figsize=(8.0, 15)) ax = fig.add_subplot(111) # The alignment is quite long # -> Reduce font size to reduce figure size graphics.plot_alignment_similarity_based(ax, alignment, matrix=matrix, symbols_per_line=80, labels=names, show_numbers=True, label_size=10, number_size=10, symbol_size=6, color=biotite.colors["orange"]) fig.tight_layout() plt.show()
[sequences[strain] for strain in (9, 5, 11, 45, 66, 68, 78)] ) # Create an alignment for visualization purposes # No insertion/deletions -> Align ungapped matrix = align.SubstitutionMatrix.std_protein_matrix() alignment = align.align_ungapped( drug_type_consensus, fiber_type_consensus, matrix=matrix ) # A colormap for hightlighting sequence dissimilarity: # At low similarity the symbols are colored red, # at high similarity the symbols are colored white cmap = LinearSegmentedColormap.from_list( "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)] # ^ reddish ^ white ) fig = plt.figure(figsize=(8.0, 6.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( ax, alignment, matrix=matrix, symbols_per_line=50, labels=["Drug-type", "Fiber-type"], show_numbers=True, cmap=cmap, symbol_size=8 ) fig.tight_layout() plt.show()
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform a local pairwise sequence alignment (local=True)
# and keep only the first reported alignment (max_number=1)
# NOTE(review): the previous comment spoke of an affine gap penalty and
# unpenalized terminal gaps; GAP_PENALTY is defined outside this excerpt
# and no 'terminal_penalty' is passed to this local alignment - confirm
alignment = align.align_optimal(
    query_seq, hit_seq, matrix,
    local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0]

print(f"Score: {alignment.score}")

fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"],
    show_numbers=True, show_line_position=True
)
fig.tight_layout()

########################################################################
# How can you make sure that you observe a true homology and not simply
# a product of coincidence?
# The value you have at hand is the similarity score of the
# alignment, but it is an absolute value that cannot be used without
# context to answer this question.
# But it can be used to ask another question:
# How many alignments with a score at least this high can you expect
# in this database by chance?
# We call this quantity *expect value* (E-value).
# If this value is close to 1 or even higher, we can assume that the
# reported alignment was found by chance.
matrix = align.SubstitutionMatrix.std_nucleotide_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignments = align.align_optimal(mini_covid_seq, mini_mers_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False) # Draw first and only alignment # The color intensity indicates the similiarity fig = plt.figure(figsize=(8.0, 2.5)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based(ax, alignments[0], matrix=matrix, labels=["SARS_Covid", "MERS"], show_numbers=True, show_line_position=True) fig.tight_layout() plt.show() # Draw first and only alignment # The color intensity indicates the similiarity fig = plt.figure(figsize=(8.0, 2.5)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based(ax, alignments[0], matrix=matrix, labels=["SARS_Covid", "MERS"], show_numbers=True,
# For visualization purposes we have to apply a renumbering function
# for the genomic sequence,
# since the indices in the alignment trace refer to the reverse
# complement sequence, but we want the numbers to refer to the original
# genomic sequence.

# Reverse sequence numbering for second sequence (genome) in alignment
# (None keeps the default numbering for the first sequence)
number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x]
# Visualize alignment, use custom color
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
seqgraphics.plot_alignment_similarity_based(
    ax, best_alignment, matrix=matrix,
    labels=["E. coli M1 coding gene", "S. enterica genome"],
    show_numbers=True, number_functions=number_funcs,
    show_line_position=True, color=biotite.colors["brightorange"]
)
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2

########################################################################
# The results show that *E. coli* and *S. enterica* *M1* are almost
# identical.
#
# Finally, we predict and plot the secondary structure of the *M1* RNA
# with help from *ViennaRNA* and highlight mismatch positions between
# *E. coli* and *S. enterica* *M1*.