Ejemplo n.º 1
0
def test_pairwise_identity(sequences, mode):
    """
    Test correct calculation of `get_pairwise_sequence_identity()` via
    pairwise calls of `get_sequence_identity()`.

    Parameters
    ----------
    sequences
        The sequences that are aligned with `align_multiple()`
        (presumably injected by a pytest fixture or parametrization
        -- confirm in the enclosing module).
    mode
        The identity mode that is forwarded to both identity functions.
        For ``"shortest"`` the diagonal of the matrix is additionally
        checked to be 1.
    """
    msa, _, _, _ = align.align_multiple(
        sequences,
        matrix=align.SubstitutionMatrix.std_protein_matrix()
    )

    # Build the reference matrix from one two-sequence sub-alignment
    # per pair (i, j)
    seq_count = len(sequences)
    ref_identity_matrix = np.zeros((seq_count, seq_count))
    for i in range(seq_count):
        for j in range(seq_count):
            ref_identity_matrix[i, j] = align.get_sequence_identity(
                msa[:, [i, j]], mode=mode
            )

    test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode)

    # Identity of two equal sequences should be 1, if only the length of
    # the sequence is counted
    if mode == "shortest":
        assert (np.diag(test_identity_matrix) == 1).all()
    # Identity must be between 0 and 1
    assert ((test_identity_matrix <= 1) & (test_identity_matrix >= 0)).all()
    # Identity matrix is symmetric
    assert (test_identity_matrix == test_identity_matrix.T).all()
    # Pairwise identity must be equal in the two functions
    assert (test_identity_matrix == ref_identity_matrix).all()
Ejemplo n.º 2
0
def test_align_multiple(sequences, gap_penalty):
    """
    Test `align_multiple()` function using actual long sequences,
    compared to the output of MUSCLE.

    Both alignment methods are heuristic, so the exact same result is
    not expected.
    Just assert that the resulting score is at least 50 % of the score
    of the MUSCLE alignment.
    The test is skipped if `MuscleApp.align()` raises a `VersionError`.

    Parameters
    ----------
    sequences
        The sequences to be aligned by both methods.
    gap_penalty
        The gap penalty that is used for both alignments and for
        scoring them.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # Only the alignment is required here; the order, guide tree and
    # distances that are returned as well are discarded
    test_alignment, _, _, _ = align.align_multiple(
        sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True
    )
    test_score = align.score(
        test_alignment, matrix, gap_penalty, terminal_penalty=True
    )

    try:
        ref_alignment = muscle.MuscleApp.align(
            sequences, matrix=matrix, gap_penalty=gap_penalty
        )
    except VersionError:
        pytest.skip("Invalid Muscle software version")
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )

    assert test_score >= ref_score * 0.5
Ejemplo n.º 3
0
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez

# Build the example alignment
# (identical to the one of the bacterial luciferase example)
query = (
    entrez.SimpleQuery("luxA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile.read(file_name)
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Bring the sequences into guide tree order,
# then restrict the alignment to the slice 220:300
alignment = alignment[:, order][220:300]

# Names of the color schemes shown in this example
schemes = [
    "rainbow",
    "clustalx",
    "flower",
    "blossom",
    "spring",
    "wither",
    "autumn",
    "sunset",
    "ocean",
    "zappo",
    "taylor",
    "buried",
    "hydrophobicity",
    "prophelix",
    "propstrand",
    "propturn",
]
count = len(schemes)
# Make sure that no color scheme available for the amino acid alphabet
# is missing in this example
alphabet = seq.ProteinSequence.alphabet
all_schemes = graphics.list_color_scheme_names(alphabet)
assert set(schemes) == set(all_schemes)
Ejemplo n.º 4
0
# Search and download the Swiss-Prot luxA protein sequences
query = (
    entrez.SimpleQuery("luxA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")
)

# Collect one identifier and one protein sequence per FASTA entry
ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # The UniProt entry name is the first word after the last '|'
    identifier = header.split("|")[-1].split()[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False
)
# Sort both the alignment and the labels in guide tree order
alignment = alignment[:, order]
ids = [ids[i] for i in order]

fig = plt.figure(figsize=(8.0, 20.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_type_based(
    ax, alignment, labels=ids, show_numbers=True, spacing=2.0
)
fig.tight_layout()

plt.show()
Ejemplo n.º 5
0
         180)**2,
        axis=-1)
    # Choose the PB for which the RMSDA to the reference angle is lowest
    # Due to the definition of Biotite symbol codes
    # the index of the chosen PB is directly the symbol code
    pb_seq_code = np.argmin(rmsda, axis=0)
    # Put the array of symbol codes into actual sequence objects
    pb_sequence = seq.GeneralSequence(pb_alphabet)
    pb_sequence.code = pb_seq_code
    pb_seqs.append(pb_sequence)

# Perform a multiple sequence alignment of the PB sequences
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
    pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False
)

# Visualize the alignment
# Sort the sequences and their labels in guide tree order
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(ax,
                                   alignment,
                                   labels=labels,
                                   symbols_per_line=45,
                                   spacing=2,
                                   show_numbers=True,
Ejemplo n.º 6
0
# application.
#
#
# Multiple sequence alignments
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# If you want to perform a multiple sequence alignment (MSA), have a
# look at the :func:`align_multiple()` function:

# Four short example sequences for the multiple sequence alignment
seq1, seq2, seq3, seq4 = [
    seq.ProteinSequence(symbols)
    for symbols in ("BIQTITE", "TITANITE", "BISMITE", "IQLITE")
]
# Besides the alignment, the function also returns the sequence order,
# the guide tree and the distance matrix
alignment, order, guide_tree, distance_matrix = align.align_multiple(
    [seq1, seq2, seq3, seq4],
    align.SubstitutionMatrix.std_protein_matrix(),
    gap_penalty=-5,
    terminal_penalty=False,
)
print(alignment)

########################################################################
# This function is only recommended for strongly related sequences or
# exotic sequence types.
# When high accuracy or computation time matters, other MSA programs
# deliver better results.
# External MSA software can be accessed via the :mod:`biotite.application`
# subpackage.
#
# Sequence features
# -----------------
#