Beispiel #1
0
    def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
        """
        Generate a pairwise alignment between two protein sequences.

        Both strings are converted to ``biotite`` protein sequences and
        aligned with ``align_optimal()`` using the standard protein
        substitution matrix.

        Parameters
        ----------
        seq1: str
            The first sequence to be aligned
        seq2: str
            The second sequence to be aligned
        local: bool
            If false, a global alignment is performed
            (based on the Needleman-Wunsch algorithm),
            otherwise a local alignment is performed
            (based on the Smith–Waterman algorithm).
            (Default: True)

        Returns
        -------
        Alignment
            An instance created via ``cls(...)`` that wraps the first
            alignment returned by ``align_optimal()``.
            Its ``metadata`` dictionary holds the keys ``"score"``,
            ``"sequence_identity"``, ``"symbols"`` and ``"codes"``.
        """

        # Imported lazily, so biotite is only required when this
        # method is actually called
        import biotite.sequence as seq
        import biotite.sequence.align as align

        # create the default matrix
        # TODO add more options for the choice of matrix
        matrix = align.SubstitutionMatrix.std_protein_matrix()

        alignments = align.align_optimal(
            seq.ProteinSequence(seq1),
            seq.ProteinSequence(seq2),
            matrix,
            local=local,
        )

        # align_optimal() returns a list; only its first entry is kept
        alignment = alignments[0]

        return cls(
            alignment=alignment,
            metadata={
                "score": alignment.score,
                "sequence_identity": align.get_sequence_identity(alignment),
                "symbols": align.get_symbols(alignment),
                "codes": align.get_codes(alignment),
            },
        )
def mutual_information_zscore(alignment, n_shuffle=100):
    """
    Compute a Z-score for the mutual information of an alignment.

    The mutual information obtained from the transposed alignment codes
    is compared against the values obtained from `n_shuffle` shuffled
    versions of the same codes.

    Parameters
    ----------
    alignment
        The alignment to analyze. Its first sequence provides the
        alphabet.
    n_shuffle : int
        The number of shuffled code arrays used for the background
        distribution. (Default: 100)

    Returns
    -------
    The difference between the observed mutual information and the
    mean of the shuffled values, divided by their standard deviation.
    """
    column_codes = align.get_codes(alignment).T
    alphabet = alignment.sequences[0].alphabet

    observed = _mutual_information(column_codes, alphabet)

    # Fixed seed of NumPy's global random generator
    # NOTE(review): presumably _shuffle() draws from this generator,
    # which would make the result reproducible - confirm
    np.random.seed(0)
    background = np.stack([
        _mutual_information(_shuffle(column_codes), alphabet)
        for _ in range(n_shuffle)
    ])

    background_mean = np.mean(background, axis=0)
    background_std = np.std(background, axis=0)
    return (observed - background_mean) / background_std
Beispiel #3
0
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to directly obtain
# the aligned symbol codes instead of the trace.

# Only the first entry of the alignment list is inspected
alignment = alignments[0]
# Score stored on the alignment object itself
print("Score: ", alignment.score)
# Score computed again from the alignment and the substitution matrix
print("Recalculated score:", align.score(alignment, matrix=matrix))
print("Sequence identity:", align.get_sequence_identity(alignment))
# The aligned sequences as symbols
print("Symbols:")
print(align.get_symbols(alignment))
# The aligned sequences as symbol codes
print("symbols codes:")
print(align.get_codes(alignment))

########################################################################
#
# .. currentmodule:: biotite.sequence.io.fasta
#
# You may ask, why should you recalculate the score, when the score has
# already been directly calculated via :func:`align_optimal()`.
# The answer is, that you might load an alignment from an external
# alignment program as FASTA file using :func:`get_alignment()`.
#
# .. currentmodule:: biotite.sequence.align
#
# If you want to perform a multiple sequence alignment, have a look at
# the :func:`align_multiple()` function or the interfaces to external
# MSA software in the :mod:`biotite.application` subpackage.
            similarities[i] = 0
        else:
            sim = matrix[code1, code2]
            # Normalize (range 0.0 - 1.0)
            min_sim = np.min(matrix[code1])
            max_sim = np.max(matrix[code1])
            sim = (sim - min_sim) / (max_sim - min_sim)
            similarities[i] = sim
    # Delete self-similarity
    similarities = np.delete(similarities, seq_i)
    return np.average(similarities)


matrix = align.SubstitutionMatrix.std_protein_matrix()
# Fetch the score array once instead of on every loop iteration
score_matrix = matrix.score_matrix()
# Get the alignment columns as symbol codes (-1 for gaps)
trace_code = align.get_codes(alignment)
# One similarity value per entry of the code array
similarities = np.zeros(trace_code.shape)
for i in range(similarities.shape[0]):
    for j in range(similarities.shape[1]):
        similarities[i, j] = get_average_normalized_similarity(
            trace_code, score_matrix, i, j)

# Draw the similarities as heatmap with a color bar;
# the color scale is fixed to the range 0.0 - 1.0
figure = plt.figure(figsize=(8.0, 3.0))
ax = figure.add_subplot(111)
heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0)
cbar = figure.colorbar(heatmap)
figure.tight_layout()

########################################################################
# As the plot creates a heatmap field for every alignment column,
# the plot looks quite confusing.
Beispiel #5
0
#
# Finally, we predict and plot the secondary structure of the *M1* RNA
# with the help of *ViennaRNA* and highlight mismatch positions between
# *E. coli* and *S. enterica* *M1*.

# Predict the base pairs of the M1 RNA
app = viennarna.RNAfoldApp(m1_sequence)
app.start()
app.join()
base_pairs = app.get_base_pairs()

# Compute 2D plot coordinates for the predicted base pairs
app = viennarna.RNAplotApp(base_pairs=base_pairs, length=len(m1_sequence))
app.start()
app.join()
plot_coord = app.get_coordinates()

# get_codes() returns an array of shape (n_sequences, n_columns) with
# -1 for gaps, so the gap filter must be applied along the column axis:
# keep only the alignment columns where the first sequence has no gap
# NOTE(review): assumes the first sequence in 'best_alignment' is the
# M1 sequence - confirm against the code creating the alignment
codes = align.get_codes(best_alignment)
m1_no_gap_codes = codes[:, codes[0] != -1]
# True at each remaining column where both sequences have the same code
identities = m1_no_gap_codes[0] == m1_no_gap_codes[1]

fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.add_subplot(111)
# Plot base connections
ax.plot(*plot_coord.T, color="black", linewidth=1, zorder=1)
# Plot base pairings
ax.add_collection(
    LineCollection([(plot_coord[i], plot_coord[j]) for i, j in base_pairs],
                   color="silver",
                   linewidth=1,
                   zorder=1))
# Plot base markers
ax.scatter(