コード例 #1
0
def test_pairwise_identity(sequences, mode):
    """
    Test correct calculation of `get_pairwise_sequence_identity()` via
    pairwise calls of `get_sequence_identity()`.

    Parameters
    ----------
    sequences
        The sequences that are aligned with `align.align_multiple()`
        (presumably supplied by a fixture -- confirm against the test
        module).
    mode
        The identity mode, forwarded unchanged to both identity
        functions.
    """
    msa, _, _, _ = align.align_multiple(
        sequences,
        matrix=align.SubstitutionMatrix.std_protein_matrix()
    )

    # Reference values: evaluate every ordered pair (i, j), including
    # i == j, on the sub-alignment selected by `msa[:, [i, j]]`
    n_seq = len(sequences)
    ref_identity_matrix = np.zeros((n_seq, n_seq))
    for i in range(n_seq):
        for j in range(n_seq):
            ref_identity_matrix[i, j] = align.get_sequence_identity(
                msa[:, [i, j]], mode=mode
            )

    test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode)

    # Identity of two equal sequences should be 1, if only the length of
    # the sequence is counted
    if mode == "shortest":
        assert (np.diag(test_identity_matrix) == 1).all()
    # Identity must be between 0 and 1
    assert ((test_identity_matrix <= 1) & (test_identity_matrix >= 0)).all()
    # Identity matrix is symmetric
    assert (test_identity_matrix == test_identity_matrix.T).all()
    # Pairwise identity must be equal in the two functions
    assert (test_identity_matrix == ref_identity_matrix).all()
コード例 #2
0
ファイル: alignment.py プロジェクト: sukritsingh/kinoml
    def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
        """
        Generate an alignment between two sequences

        Parameters
        ----------
        seq1: str
            The first sequence to be aligned
        seq2: str
            The second sequence to be aligned
        local: bool
            If false, a global alignment is performed
            (based on the Needleman-Wunsch algorithm),
            otherwise a local alignment is performed
            (based on the Smith–Waterman algorithm).
            (Default: True)

        Returns
        -------
        Alignment
            An instance created via ``cls(...)`` from the first
            alignment returned by ``align_optimal``. Its ``metadata``
            dictionary holds the keys ``"score"``,
            ``"sequence_identity"``, ``"symbols"`` and ``"codes"``.
        """

        # biotite is imported inside the method, so it is only needed
        # when an alignment is actually requested
        import biotite.sequence as seq
        import biotite.sequence.align as align

        # create the default matrix
        # TODO add more options for the choice of matrix
        matrix = align.SubstitutionMatrix.std_protein_matrix()

        alignments = align.align_optimal(
            seq.ProteinSequence(seq1),
            seq.ProteinSequence(seq2),
            matrix,
            local=local,
        )

        # Only the first returned alignment is kept
        alignment = alignments[0]

        score = alignment.score
        seq_identity = align.get_sequence_identity(alignment)
        symbols = align.get_symbols(alignment)
        codes = align.get_codes(alignment)

        return cls(
            alignment=alignment,
            metadata={
                "score": score,
                "sequence_identity": seq_identity,
                "symbols": symbols,
                "codes": codes,
            },
        )
コード例 #3
0
ファイル: test_banded.py プロジェクト: ebetica/biotite
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Check `align_banded()` against `align_optimal()` for one pair of
    long sequences.

    The pair is picked from `sequences` via the two indices in
    `seq_indices`.
    The band is chosen wide enough that `align_banded()` should be able
    to find the optimal alignment(s).
    """
    MAX_NUMBER = 100

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first, second = seq_indices
    seq1, seq2 = sequences[first], sequences[second]

    optimal_alignments = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, local=local, terminal_penalty=False,
        max_number=MAX_NUMBER
    )
    # align_banded() returns a true semi-global alignment, so the
    # terminal gaps are stripped from the reference as well
    ref_alignments = [
        align.remove_terminal_gaps(optimal) for optimal in optimal_alignments
    ]

    # Similar sequences get a relatively small band,
    # dissimilar ones the entire search space
    identity = align.get_sequence_identity(ref_alignments[0])
    if identity > 0.5:
        band_width = 100
    else:
        band_width = len(seq1) + len(seq2)
    test_alignments = align.align_banded(
        seq1, seq2, matrix, (-band_width, band_width),
        gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER
    )

    try:
        assert test_alignments[0].score == ref_alignments[0].score
        # The exact set of alignments is only comparable, if the number
        # of traces was not cut off at MAX_NUMBER
        if len(ref_alignments) < MAX_NUMBER:
            assert len(test_alignments) == len(ref_alignments)
            for candidate in test_alignments:
                assert candidate in ref_alignments
    except AssertionError:
        # Show the first alignment of each side before re-raising
        print("First tested alignment:")
        print()
        print(test_alignments[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(ref_alignments[0])
        raise
コード例 #4
0
def test_identity():
    """
    Check `get_sequence_identity()` for each mode on a small,
    hand-written alignment of two protein sequences.
    """
    gapped_strings = [
        "--HAKLPRDD--WL--",
        "FRHA--QRTDADWLHH",
    ]
    # The sequences themselves are the alignment rows without gaps
    sequences = []
    for gapped in gapped_strings:
        sequences.append(seq.ProteinSequence(gapped.replace("-", "")))
    alignment = align.Alignment(
        sequences,
        align.Alignment.trace_from_strings(gapped_strings),
        score=None,
    )
    # Assert correct sequence identity calculation:
    # expected identity for each mode
    expected = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, value in expected.items():
        assert align.get_sequence_identity(alignment, mode=mode) == value
コード例 #5
0
def test_search_sequence():
    """
    Search the RCSB for sequences similar to the first sequence of
    ``1l2y.cif`` and check that every hit reaches the requested minimum
    identity.

    Each hit is fetched as FASTA and aligned to the reference sequence;
    the identity is measured in ``"shortest"`` mode.
    """
    IDENTITY_CUTOFF = 0.9
    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(ref_sequence,
                               "protein",
                               min_identity=IDENTITY_CUTOFF)
    test_ids = rcsb.search(query)

    # The substitution matrix does not depend on the hit,
    # so it is created once outside the loop
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(ref_sequence,
                                        test_sequence,
                                        matrix,
                                        terminal_penalty=False)[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
コード例 #6
0
                                         symbols_per_line=len(alignments[0]))
fig.tight_layout()

########################################################################
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to directly have the
# aligned symbol codes instead of the trace.

# Take the first alignment of the list for the analysis
alignment = alignments[0]
# Score stored in the alignment object
print("Score: ", alignment.score)
# Score computed again from the alignment and the substitution matrix
print("Recalculated score:", align.score(alignment, matrix=matrix))
print("Sequence identity:", align.get_sequence_identity(alignment))
# Aligned symbols, as returned by get_symbols()
print("Symbols:")
print(align.get_symbols(alignment))
# Aligned symbol codes, as returned by get_codes()
print("symbols codes:")
print(align.get_codes(alignment))

########################################################################
#
# .. currentmodule:: biotite.sequence.io.fasta
#
# You may ask, why should you recalculate the score, when the score has
# already been directly calculated via :func:`align_optimal()`.
# The answer is, that you might load an alignment from an external
# alignment program as FASTA file using :func:`get_alignment()`.
#
# .. currentmodule:: biotite.sequence.align
コード例 #7
0
ファイル: genome_assembly.py プロジェクト: ebetica/biotite
# Mutations in the B.1.1.7 variant
# --------------------------------
#
# To get a rough overview of the overall sequence identity between
# the genomes and the locations of mutations in the B.1.1.7 variant,
# we need to align the original genome to our assembled one.
# As both genomes are expected to be highly similar, we can use a banded
# alignment again using a very conservative band width.

# Total width of the band; the band passed below spans from
# -BAND_WIDTH//2 to +BAND_WIDTH//2
BAND_WIDTH = 1000

# max_number=1 is passed and the first element of the result is taken
genome_alignment = align.align_banded(
    var_genome, orig_genome, matrix,
    band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1
)[0]
# The identity is computed in 'all' mode
identity = align.get_sequence_identity(genome_alignment, 'all')
# Report the identity as percentage with two decimal places
print(f"Sequence identity: {identity * 100:.2f} %")

########################################################################
# Now we would like to have a closer look at the mutation locations.
# To contextualize the locations we plot the mutation frequency along
# with the gene locations.
# The genomic coordinates for each gene can be extracted from the
# already downloaded *GenBank* file of the reference genome.

# NOTE(review): presumably the number of bins for the mutation
# frequency plot described above -- its usage is outside this excerpt
N_BINS = 50

# Get genomic coordinates for all SARS-Cov-2 genes
# (the annotation is restricted to "gene" features via include_only)
gb_file = gb.GenBankFile.read(orig_genome_file)
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
コード例 #8
0
# to ensure that values in the 'res_id' annotation point to the sequence
structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
# Keep only the atoms selected by the amino acid filter
structure = structure[struc.filter_amino_acids(structure)]

# Identity threshold for a sequence to be counted as homologous sequence
# NOTE(review): the constant name is misspelled ('THESHOLD'); renaming
# it would need to cover any uses outside this excerpt
IDENTITY_THESHOLD = 0.4
# Find homologous proteins in SwissProt via BLAST
app = blast.BlastWebApp("blastp", sequence, database="swissprot")
# Start the BLAST run, then call join() before fetching its alignments
app.start()
app.join()
alignments = app.get_alignments()
# The three lists are filled in parallel;
# the first entry of each list describes the query itself
hit_seqs = [sequence]
hit_ids = ["Query"]
hit_starts = [1]
for ali in alignments:
    identity = align.get_sequence_identity(ali)
    # Do not include the exact same sequence -> identity < 1.0
    if identity > IDENTITY_THESHOLD and identity < 1.0:
        # The second sequence of the alignment is stored as the hit
        hit_seqs.append(ali.sequences[1])
        hit_ids.append(ali.hit_id)
        # First element of the hit interval is used as start position
        hit_starts.append(ali.hit_interval[0])

# Perform MSA
alignment = clustalo.ClustalOmegaApp.align(hit_seqs)

# Plot MSA
number_functions = []
for start in hit_starts:

    def some_func(x, start=start):
        return x + start