Exemple #1
0
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
# Read the FASTA file and pick the two sequences of interest
# by the identifier contained in their header
file = fasta.FastaFile.read(file_name)
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)

# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()

# Pairwise alignment with affine gap penalty;
# gaps at the sequence ends are not penalized
alignments = align.align_optimal(
    avidin_seq,
    streptavidin_seq,
    matrix,
    gap_penalty=(-10, -1),
    terminal_penalty=False,
)

# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax,
    alignments[0],
    matrix=matrix,
    labels=["Avidin", "Streptavidin"],
    show_numbers=True,
    show_line_position=True,
)
fig.tight_layout()

plt.show()
Exemple #2
0
        listed_sources.add(source)
# Show the first 20 binding site entries
print("Binding sites:")
for site in binding_sites[:20]:
    print(site)

########################################################################
# Now we can perform a multiple sequence alignment of the binding site
# sequences. Here we use Clustal Omega to perform this task.
# Since we have up to 200 sequences, we visualize only a small portion
# of the alignment.

alignment = clustalo.ClustalOmegaApp.align(binding_sites)
fig = plt.figure(figsize=(4.5, 4.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax,
    alignment[:, :20],
    labels=sources[:20],
    # One line that is as long as the entire alignment
    symbols_per_line=len(alignment),
)
# Source names in italic
ax.set_yticklabels(
    ax.get_yticklabels(), fontdict={"fontstyle": "italic"}
)
fig.tight_layout()

########################################################################
# Finally we can generate our sequence logo.

# Draw the sequence logo of the alignment into a new figure
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_sequence_logo(ax, alignment)
# Put a tick at every fifth position: 5, 10, 15, 20
ax.set_xticks(list(range(5, 21, 5)))
ax.set_xlabel("Residue position")
ax.set_ylabel("Bits")
# Only show left and bottom spine
Exemple #3
0
# Additionally the alignment score is stored in this object.
# Furthermore, this object can prettyprint the alignment into a human
# readable form.
#
# For publication purposes you can create an actual figure based
# on *Matplotlib*.
# You can either decide to color the symbols based on the symbol type
# or based on the similarity within the alignment columns.
# In this case we will go with the similarity visualization.

import matplotlib.pyplot as plt
import biotite.sequence.graphics as graphics

# Small figure showing the first alignment, colored by similarity
fig, ax = plt.subplots(figsize=(2.0, 0.8))
graphics.plot_alignment_similarity_based(
    ax,
    alignments[0],
    matrix=matrix,
    # One line that is as long as the entire alignment
    symbols_per_line=len(alignments[0]),
)
fig.tight_layout()

########################################################################
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to directly have the
# aligned symbol codes instead of the trace.

# Use the first alignment for the following analysis
alignment = alignments[0]
# Score as stored in the alignment object
print("Score: ", alignment.score)
# Score computed again from the alignment and the substitution matrix
print("Recalculated score:", align.score(alignment, matrix=matrix))
# for the genomic sequence,
# since the original indices refer to the reverse complement sequence,
# but we want the numbers to refer to the original sequence.

# Use first and only alignment
alignment = rev_alignments[0]
# Reverse sequence numbering for second sequence (genome) in alignment:
# each number is subtracted from the length of that sequence,
# the first sequence keeps its default numbering (None)
number_funcs = [None, lambda x: len(alignment.sequences[1]) - x]
# Visualize alignment, use custom color
fig = plt.figure(figsize=(8.0, 2.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix,
    labels=["E. coli (leuL)", "S. enterica"],
    show_numbers=True, number_functions=number_funcs,
    show_line_position=True,
    color=biotite.colors["lightorange"],
)
fig.tight_layout()

########################################################################
# We will now go even further and align the translated protein
# sequences.

# Refer to the already available leuL sequence under the name 'leul_ec'
leul_ec = leul_seq
# Obtain the S enterica leuL sequence
# using the first and last index in the alignment trace
# (trace column 1, i.e. the second sequence in the alignment)
first_i = alignment.trace[0, 1]
last_i = alignment.trace[-1, 1]
Exemple #5
0
# Align both spike protein sequences and keep a single alignment
alignment = align.align_optimal(
    var_spike_prot_seq,
    orig_spike_prot_seq,
    blosum_matrix,
    max_number=1,
)[0]


fig = plt.figure(figsize=(8.0, 10.0))
ax = fig.add_subplot(111)

# Plot alignment
# The colormap is interpolated between the two given colors
cmap = LinearSegmentedColormap.from_list(
    "custom",
    colors=[
        (1.0, 0.3, 0.3),  # reddish
        (1.0, 1.0, 1.0),  # white
    ],
)
graphics.plot_alignment_similarity_based(
    ax,
    alignment,
    matrix=blosum_matrix,
    symbols_per_line=SYMBOLS_PER_LINE,
    labels=["B.1.1.7", "Reference"],
    show_numbers=True,
    label_size=9,
    number_size=9,
    symbol_size=7,
    spacing=SPACING,
    cmap=cmap,
)

## Add indicator for features to the alignment
# Number of lines in the plot: ceiling division, so that an alignment
# whose length is an exact multiple of SYMBOLS_PER_LINE (or an empty
# alignment) does not produce an extra empty line,
# whose 'col_start' would index the trace out of bounds
n_rows = (len(alignment) + SYMBOLS_PER_LINE - 1) // SYMBOLS_PER_LINE
for row in range(n_rows):
    # Alignment columns displayed in this line ('col_stop' is exclusive)
    col_start = SYMBOLS_PER_LINE * row
    # The last line may contain less than SYMBOLS_PER_LINE columns
    col_stop = min(SYMBOLS_PER_LINE * (row + 1), len(alignment))
    # Trace entries of the second sequence (column 1) at the first and
    # last alignment column of this line ('seq_stop' is exclusive)
    seq_start = alignment.trace[col_start, 1]
    seq_stop = alignment.trace[col_stop-1, 1] + 1
    n_sequences = len(alignment.sequences)
    # Each line takes 'n_sequences + SPACING' units;
    # 'y_base' is the position directly after the sequences of this line
    y_base = (n_sequences + SPACING) * row + n_sequences
    
    if trace[i, 0] != -1:
        start_index = i
        break
# ...and the end of the sequence
for i in reversed(range(len(trace))):
    # Skip positions where the first sequence has a gap
    if trace[i, 0] == -1:
        continue
    # Last position without a gap in the first sequence
    # -> use it as exclusive stop index
    stop_index = i + 1
    break

# Truncate alignment to region where the 'PI3K' sequence exists
alignment.trace = alignment.trace[start_index:stop_index]

matrix = align.SubstitutionMatrix.std_protein_matrix()
fig = plt.figure(figsize=(8.0, 15))
ax = fig.add_subplot(111)
# The alignment is quite long
# -> Reduce font size to reduce figure size
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix, symbols_per_line=80, labels=names,
    show_numbers=True, label_size=10, number_size=10, symbol_size=6,
    color=biotite.colors["orange"],
)
fig.tight_layout()

plt.show()
    [sequences[strain] for strain in (9, 5, 11, 45, 66, 68, 78)]
)


# Create an alignment for visualization purposes
# No insertion/deletions -> Align ungapped
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_ungapped(
    drug_type_consensus,
    fiber_type_consensus,
    matrix=matrix,
)

# A colormap for highlighting sequence dissimilarity:
# At low similarity the symbols are colored red,
# at high similarity the symbols are colored white
cmap = LinearSegmentedColormap.from_list(
    "custom",
    colors=[
        (1.0, 0.3, 0.3),  # reddish
        (1.0, 1.0, 1.0),  # white
    ],
)

fig = plt.figure(figsize=(8.0, 6.0))
ax = fig.add_subplot(111)

graphics.plot_alignment_similarity_based(
    ax,
    alignment,
    matrix=matrix,
    symbols_per_line=50,
    labels=["Drug-type", "Fiber-type"],
    show_numbers=True,
    cmap=cmap,
    symbol_size=8,
)

fig.tight_layout()

plt.show()
matrix = align.SubstitutionMatrix.std_protein_matrix()

# Perform a local pairwise alignment of query and hit sequence
# using the gap penalty given by GAP_PENALTY
# and keep only a single optimal alignment
alignment = align.align_optimal(
    query_seq,
    hit_seq,
    matrix,
    local=True,
    gap_penalty=GAP_PENALTY,
    max_number=1,
)[0]


print(f"Score: {alignment.score}")

# Visualize the alignment, colored by similarity
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax,
    alignment,
    matrix=matrix,
    labels=["Avidin (query)", "Database hit"],
    show_numbers=True,
    show_line_position=True,
)
fig.tight_layout()

########################################################################
# How can you make sure that you observe a true homology and not simply
# a product of coincidence?
# The value you have at hand is the similarity score of the
# alignment, but it is an absolute value that cannot be used without
# context to answer this question.
# But it can be used to ask another question:
# How many alignments with a score at least this high can you expect
# in this database by chance?
# We call this quantity *expect value* (E-value).
# If this value is close to 1 or even higher, we can assume that the
# reported alignment was found by chance.
Exemple #9
0
# Get the standard substitution matrix for nucleotide sequences
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Pairwise alignment with affine gap penalty;
# gaps at the sequence ends are not penalized
alignments = align.align_optimal(
    mini_covid_seq,
    mini_mers_seq,
    matrix,
    gap_penalty=(-10, -1),
    terminal_penalty=False,
)

# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax,
    alignments[0],
    matrix=matrix,
    labels=["SARS_Covid", "MERS"],
    show_numbers=True,
    show_line_position=True,
)
fig.tight_layout()

plt.show()

# Draw first and only alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(ax,
                                         alignments[0],
                                         matrix=matrix,
                                         labels=["SARS_Covid", "MERS"],
                                         show_numbers=True,
Exemple #10
0
# For visualization purposes we have to apply a renumbering function
# for the genomic sequence,
# since the indices in the alignment trace refer to the reverse
# complement sequence, but we want the numbers to refer to the original
# genomic sequence.

# Reverse sequence numbering for second sequence (genome) in alignment
number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x]
# Visualize alignment, use custom color
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
seqgraphics.plot_alignment_similarity_based(
    ax,
    best_alignment,
    matrix=matrix,
    labels=["E. coli M1 coding gene", "S. enterica genome"],
    show_numbers=True,
    number_functions=number_funcs,
    show_line_position=True,
    color=biotite.colors["brightorange"])
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2

########################################################################
# The results show that *E. coli* and *S. enterica* *M1* are almost
# identical.
#
# Finally, we predict and plot the secondary structure of the *M1* RNA
# with help from *ViennaRNA* and highlight mismatch positions between
# *E. coli* and *S. enterica* *M1*.