Python Alphabetの例、biotite.sequence.Alphabet Pythonの例

コード例 #1

0

ファイルを表示

def test_matrix_str():
    """
    Check the string representation of a small substitution matrix
    built from two different three-letter alphabets.
    """
    row_alphabet = seq.Alphabet("abc")
    column_alphabet = seq.Alphabet("def")
    # Scores 0..8 laid out row by row in a 3x3 grid
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(row_alphabet, column_alphabet, scores)

    expected_lines = [
        "    d   e   f",
        "a   0   1   2",
        "b   3   4   5",
        "c   6   7   8",
    ]
    assert str(matrix) == "\n".join(expected_lines)

コード例 #2

0

ファイルを表示

def test_alphabet_extension():
    """
    Check ``Alphabet.extends()`` against an identical, a reordered and
    a longer alphabet.
    """
    base = seq.Alphabet("abc")
    identical = seq.Alphabet("abc")
    reordered = seq.Alphabet("acb")
    longer = seq.Alphabet("abcde")

    # An alphabet extends itself and any alphabet with equal symbols
    assert base.extends(base)
    assert identical.extends(base)
    # Same symbols in a different order are not an extension
    assert not reordered.extends(base)
    # Appending symbols gives an extension, but not the other way round
    assert longer.extends(base)
    assert not base.extends(longer)

コード例 #3

0

ファイルを表示

def test_matrix_str():
    """
    Verify that a substitution matrix is rendered as the expected
    table, using a small hand-constructed example.
    """
    expected = "\n".join([
        "    d   e   f",
        "a   0   1   2",
        "b   3   4   5",
        "c   6   7   8",
    ])

    first_alphabet = seq.Alphabet("abc")
    second_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(first_alphabet, second_alphabet, scores)

    assert str(matrix) == expected

コード例 #4

0

ファイルを表示

def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a custom, non-character alphabet with
    `app_cls` and compare the resulting alignment with the expected
    sequences and trace.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    # One row per alignment column (presumably -1 denotes a gap)
    exp_trace = [
        [0, 0], [1, -1], [2, 1], [3, 2],
        [-1, 3], [4, 4], [5, 5], [6, 6],
    ]

    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else
    score_matrix = np.where(np.identity(len(alph)) == 1, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace

コード例 #5

0

ファイルを表示

ファイル: test_alphabet.py プロジェクト: thomasnevolianis/biotite

def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
    """
    Map a random sequence code from the source alphabet to the target
    alphabet and check that the decoded symbols stay the same.
    """
    CODE_LENGTH = 10000
    source_alph = seq.Alphabet(source_alph_symbols)
    target_alph = seq.Alphabet(target_alph_symbols)
    mapper = seq.AlphabetMapper(source_alph, target_alph)

    # Fixed seed keeps the random reference code reproducible
    np.random.seed(0)
    random_code = np.random.randint(
        len(source_alph), size=CODE_LENGTH, dtype=int
    )
    ref_sequence = seq.GeneralSequence(source_alph)
    ref_sequence.code = random_code

    # Translate the reference code into the target alphabet
    test_sequence = seq.GeneralSequence(target_alph)
    test_sequence.code = mapper[ref_sequence.code]

    assert test_sequence.symbols == ref_sequence.symbols

コード例 #6

0

ファイルを表示

ファイル: test_profile.py プロジェクト: Discngine/biotite

def test_to_consensus_nuc_ambiguous():
    """
    Check the consensus sequence of a nucleotide profile whose first
    position has a tie between two symbols.
    """
    # One row per position, one column per symbol (A, C, G, T)
    symbols = np.array([
        [1, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    alphabet = seq.Alphabet(["A", "C", "G", "T"])
    profile = seq.SequenceProfile(symbols, gaps, alphabet)

    expected = seq.NucleotideSequence("MGTCATGC")
    assert expected == profile.to_consensus()

コード例 #7

0

ファイルを表示

def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
    """
    Check symbol encoding for both alphabet classes.

    A single symbol is encoded via ``encode()``, multiple symbols via
    ``encode_multiple()``.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    if len(symbols) != 1:
        assert list(alph.encode_multiple(symbols)) == list(exp_code)
        return
    assert alph.encode(symbols[0]) == exp_code[0]

コード例 #8

0

ファイルを表示

def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
    """
    Check code decoding for both alphabet classes.

    A single code is decoded via ``decode()``, multiple codes via
    ``decode_multiple()``.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    encoded = np.array(code, dtype=np.uint8)
    if len(encoded) != 1:
        assert list(alph.decode_multiple(encoded)) == list(exp_symbols)
        return
    assert alph.decode(encoded[0]) == exp_symbols[0]

コード例 #9

0

ファイルを表示

def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.

    Creating the application for sequences over a custom alphabet
    without passing a matrix is expected to raise a ``TypeError``.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        # Only the raised exception matters,
        # so the created object is not bound to a name
        app_cls(sequences)

コード例 #10

0

ファイルを表示

def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.

    Creating the application for sequences over a 50-symbol alphabet
    is expected to raise a ``TypeError``.
    """
    alph = seq.Alphabet(range(50))
    symbol_lists = [
        [1, 2, 3],
        [1, 2, 3],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        # Only the raised exception matters,
        # so the created object is not bound to a name
        app_cls(sequences)

コード例 #11

0

ファイルを表示

ファイル: test_profile.py プロジェクト: Discngine/biotite

def test_from_alignment():
    """
    Build a sequence profile from a two-sequence alignment and compare
    its symbol counts, gap counts and alphabet with reference values.
    """
    first = seq.NucleotideSequence("CGTCAT")
    second = seq.NucleotideSequence("TCATGC")
    trace = align.Alignment.trace_from_strings(["CGTCAT--", "--TCATGC"])
    alignment = align.Alignment([first, second], trace, None)

    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column, one column per symbol (A, C, G, T)
    expected_symbols = np.array([
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    expected_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    expected_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(expected_symbols, profile.symbols)
    assert np.array_equal(expected_gaps, profile.gaps)
    assert expected_alphabet == profile.alphabet

コード例 #12

0

ファイルを表示

def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.

    The test is skipped if the program behind `app_cls` is not
    installed or if the application raises a ``VersionError``.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(range(50))
    symbol_lists = [
        [1, 2, 3],
        [1, 2, 3],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        try:
            app_cls(sequences)
        except VersionError:
            # Plain literal: the message has no placeholders,
            # so no f-string is needed
            pytest.skip("Invalid software version")

コード例 #13

0

ファイルを表示

def test_invalid_sequence_type_no_matrix(app_cls):
    """
    A custom substitution matrix is required for normally unsupported
    sequence types.

    The test is skipped if the program behind `app_cls` is not
    installed or if the application raises a ``VersionError``.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        try:
            app_cls(sequences)
        except VersionError:
            # Plain literal: the message has no placeholders,
            # so no f-string is needed
            pytest.skip("Invalid software version")

コード例 #14

0

ファイルを表示

ファイル: test_localungapped.py プロジェクト: Discngine/biotite

def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Re-create both sequences and the substitution matrix on top of a
    500-symbol alphabet, so that the sequence code uses 'uint16' as
    dtype.
    This is necessary for testing, since 'uint8' uses a separate
    implementation.

    Returns the converted first sequence, the converted second sequence
    and the adjusted substitution matrix.
    """
    new_alph = seq.Alphabet(np.arange(500))

    def _reencode(sequence):
        # Keep the code values, only exchange the underlying alphabet
        converted = seq.GeneralSequence(new_alph)
        converted.code = sequence.code
        return converted

    seq1 = _reencode(seq1)
    seq2 = _reencode(seq2)

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # the original scores fill the upper left corner,
    # all remaining entries stay zero
    original_scores = matrix.score_matrix()
    orig_len = len(original_scores)
    size = len(new_alph)
    score_matrix = np.zeros((size, size), dtype=np.int32)
    score_matrix[:orig_len, :orig_len] = original_scores
    matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix)
    return seq1, seq2, matrix

コード例 #15

0

ファイルを表示

def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a custom, non-character alphabet with
    `app_cls` and compare the resulting alignment with the expected
    sequences and trace.

    The test is skipped if the program behind `app_cls` is not
    installed or if the application raises a ``VersionError``.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    # One row per alignment column (presumably -1 denotes a gap)
    exp_trace = [
        [0, 0],
        [1, -1],
        [2, 1],
        [3, 2],
        [-1, 3],
        [4, 4],
        [5, 5],
        [6, 6],
    ]
    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else
    score_matrix = np.identity(len(alph))
    score_matrix[score_matrix == 0] = -1000
    score_matrix[score_matrix == 1] = 1000
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)
    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        # Plain literal: the message has no placeholders,
        # so no f-string is needed
        pytest.skip("Invalid software version")
    app.start()
    app.join()
    alignment = app.get_alignment()
    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace

コード例 #16

0

ファイルを表示

def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
    """
    Check that encoding unknown symbols and decoding out-of-range codes
    raises an ``AlphabetError``.

    Depending on `is_single_val`, either the single-value methods or
    the ``*_multiple()`` methods are exercised.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)

    if not is_single_val:
        with pytest.raises(seq.AlphabetError):
            alph.encode_multiple("G")
        with pytest.raises(seq.AlphabetError):
            alph.encode_multiple([42])
        # The first code beyond the valid range
        with pytest.raises(seq.AlphabetError):
            alph.decode_multiple(np.array([len(alphabet_symbols)]))
        with pytest.raises(seq.AlphabetError):
            alph.decode_multiple(np.array([-1]))
        return

    with pytest.raises(seq.AlphabetError):
        alph.encode("G")
    with pytest.raises(seq.AlphabetError):
        alph.encode(42)
    # The first code beyond the valid range
    with pytest.raises(seq.AlphabetError):
        alph.decode(len(alphabet_symbols))
    with pytest.raises(seq.AlphabetError):
        alph.decode(-1)

コード例 #17

0

ファイルを表示

ファイル: blosum_dendrogram.py プロジェクト: thomasnevolianis/biotite

import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics

# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)

########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are removed.

# Matrix should not contain ambiguous symbols or stop signal:
# The last four symbols of both alphabets are dropped together with
# the last four rows and columns of the score matrix
matrix = align.SubstitutionMatrix(
    seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
    seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
    matrix.score_matrix()[:-4, :-4])
# Keep the scores of the reduced matrix for later use
similarities = matrix.score_matrix()
print(matrix)


########################################################################
# Now a function must be defined, that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
# In this case, the distance is defined as the difference between the
# similarity of the two symbols and the average maximum similarity of
# the symbols to themselves.
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):

コード例 #18

0

ファイルを表示

class NonsenseSequence(seq.Sequence):
    """
    Sequence type over an alphabet of mixed-type symbols:
    an integer, a string and a bytes object.
    """

    # Defined once on the class and shared by all instances
    alphabet = seq.Alphabet(
        [42, "foo", b"bar"]
    )

    def get_alphabet(self):
        """Return the alphabet stored on the class."""
        return NonsenseSequence.alphabet

コード例 #19

0

ファイルを表示

# differs from the one performed with MUSCLE.
#
# If the MSA software supports protein sequence alignment AND
# custom substitution matrices, e.g. MUSCLE and MAFFT, almost any type
# of sequence can be aligned:
# Internally the sequences and the matrix are converted into protein
# sequences/matrix.
# Then the masquerading sequences are aligned via the software and
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.

import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align

alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [
    seq.GeneralSequence(alphabet, ["foo", "bar", 42, "foo", "foo", 42, 42]),
    seq.GeneralSequence(alphabet, ["foo", 42, "foo", "bar", "foo", 42, 42]),
]
# Identity-like scoring: 100 for equal symbols, -100 otherwise
matrix = align.SubstitutionMatrix(
    alphabet,
    alphabet,
    np.array([
        [100, -100, -100],
        [-100, 100, -100],
        [-100, -100, 100],
    ]),
)
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet does not have characters as symbols,
# the alignment cannot be printed directly.
# However, we can print the trace
print(alignment.trace)

コード例 #20

0

ファイルを表示

def test_length(alphabet_symbols, use_letter_alphabet):
    """
    Check that the length of an alphabet equals the number of symbols
    it was created from, for both alphabet classes.
    """
    alphabet_cls = seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    alph = alphabet_cls(alphabet_symbols)
    assert len(alph) == len(alphabet_symbols)

コード例 #21

0

ファイルを表示

def test_contains(alphabet_symbols, use_letter_alphabet):
    """
    Check the ``in`` operator of an alphabet.

    The alphabet is a ``LetterAlphabet`` if `use_letter_alphabet` is
    true, otherwise a plain ``Alphabet``.
    """
    if use_letter_alphabet:
        alph = seq.LetterAlphabet(alphabet_symbols)
    else:
        alph = seq.Alphabet(alphabet_symbols)
    # NOTE(review): assumes the parametrized symbols include "D" - confirm
    assert "D" in alph