def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
    """
    Check that mapping symbol codes from one alphabet to another
    keeps the decoded symbols unchanged.

    A random symbol code over the source alphabet is translated via an
    :class:`AlphabetMapper` into a code over the target alphabet.
    Both sequences are expected to expose the same symbols.
    """
    CODE_LENGTH = 10000
    src_alphabet = seq.Alphabet(source_alph_symbols)
    dst_alphabet = seq.Alphabet(target_alph_symbols)
    mapper = seq.AlphabetMapper(src_alphabet, dst_alphabet)

    # Fixed seed, so that the random reference code is reproducible
    np.random.seed(0)
    random_code = np.random.randint(
        len(src_alphabet), size=CODE_LENGTH, dtype=int
    )
    ref_sequence = seq.GeneralSequence(src_alphabet)
    ref_sequence.code = random_code

    # Translate the reference code into the target alphabet
    test_sequence = seq.GeneralSequence(dst_alphabet)
    test_sequence.code = mapper[ref_sequence.code]

    assert test_sequence.symbols == ref_sequence.symbols
Exemple #2
0
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a custom alphabet with the given
    application class and compare the result with the expected trace.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    exp_trace = [
        [ 0,  0],
        [ 1, -1],
        [ 2,  1],
        [ 3,  2],
        [-1,  3],
        [ 4,  4],
        [ 5,  5],
        [ 6,  6],
    ]
    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else
    score_matrix = np.where(np.identity(len(alph)) == 1, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
Exemple #3
0
def test_match_table(use_similarity_rule):
    """
    Compare the output of :meth:`match_table()` with the known matches
    of a hand-checked example.

    The similarity rule used here is chosen to yield only the same
    k-mer as similar k-mer, hence the expected matches are the same
    with and without it.
    """
    alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_")
    phrase1 = (
        "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_"
        "chuck_wood"
    )
    phrase2 = "woodchuck"
    sequence1 = seq.GeneralSequence(alphabet, phrase1)
    sequence2 = seq.GeneralSequence(alphabet, phrase2)

    if use_similarity_rule:
        rule = _identity_rule(alphabet)
    else:
        rule = None

    table1 = align.KmerTable.from_sequences(4, [sequence1])
    table2 = align.KmerTable.from_sequences(4, [sequence2])

    ref_matches = {
        (0, 9), (0, 22), (1, 23), (2, 24), (3, 25), (4, 26),
        (5, 27), (4, 32), (5, 33), (0, 43), (1, 44), (2, 45),
        (3, 46), (4, 47), (5, 48), (4, 59), (5, 60), (0, 65),
    }

    match_array = table1.match_table(table2, similarity_rule=rule)
    # The reference indices are irrelevant for this test,
    # hence only columns 1 and 3 are kept
    position_columns = match_array[:, [1, 3]]
    test_matches = {tuple(row) for row in position_columns}
    assert test_matches == ref_matches
Exemple #4
0
def plot_pb_scheme_alignment():
    """
    Plot an example alignment using a freshly generated color scheme.

    A substitution matrix is written into a temporary file and
    ``gecli`` is run on it to write a color scheme file.
    The alignment read from ``PB_EXAMPLE_FILE_NAME`` is then plotted
    with the colors loaded from that scheme file.

    Returns
    -------
    fig
        The figure created via ``plt.figure()`` that contains the
        alignment plot.
    """
    pb_symbols = "abcdefghijklmnop"

    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    # PB substitution matrix, adapted from PBxplore
    with open(mat_file, "w") as matrix_handle:
        matrix_handle.write("""
                a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p
            a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
            b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
            c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6
            d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
            e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
            f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
            g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
            h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
            i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
            j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
            k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
            l -259    29  -682  -670  -514   -88  -138  -108    15     5   -65   533  -131     8   -11  -316
            m -599  -745  -608 -1573 -1136  -547  -124  -712  -709  -398  -270  -131   241    -4  -190  -155
            n -372  -242  -455 -1048  -469  -629   172  -359  -269  -340  -231     8    -4   703    88   146
            o -124  -165  -147  -691  -617  -406   128    95  -169  -117  -471   -11  -190    88   716    58
            p  -83    22     6  -497  -632  -552   254  -399   226  -104  -382  -316  -155   146    58   609
            """)

    # Let the command line interface write the scheme into 'scheme_file'
    cli_args = [
        "--alphabet", pb_symbols,
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file,
    ]
    gecli.main(args=cli_args)

    scheme = graphics.load_color_scheme(scheme_file)
    colors = scheme["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()

    pb_alphabet = seq.LetterAlphabet(pb_symbols)
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    seq_strings = list(fasta_file.values())
    # The sequence objects must not contain the gap characters
    sequences = [
        seq.GeneralSequence(pb_alphabet, gapped.replace("-", ""))
        for gapped in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    graphics.plot_alignment_type_based(
        ax, alignment,
        symbols_per_line=60, spacing=2, color_scheme=colors
    )

    fig.tight_layout()
    return fig
def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Adjust sequences, so that they use 'uint16' as dtype for the
    code.
    This is a necessary test, since 'uint8' uses a separate
    implementation.

    Both sequences keep their symbol code, but are re-created over an
    alphabet with 500 symbols.
    The substitution matrix is embedded into a zero-filled matrix
    matching that alphabet.

    Returns
    -------
    seq1, seq2, matrix
        The re-created sequences and the enlarged substitution matrix.
    """
    new_alph = seq.Alphabet(np.arange(500))

    def _with_new_alphabet(sequence):
        # Keep the code as it is, only exchange the alphabet
        converted = seq.GeneralSequence(new_alph)
        converted.code = sequence.code
        return converted

    seq1 = _with_new_alphabet(seq1)
    seq2 = _with_new_alphabet(seq2)

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # the original scores occupy the upper left corner
    orig_scores = matrix.score_matrix()
    orig_len = len(orig_scores)
    alph_len = len(new_alph)
    score_matrix = np.zeros((alph_len, alph_len), dtype=np.int32)
    score_matrix[:orig_len, :orig_len] = orig_scores
    matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix)

    return seq1, seq2, matrix
Exemple #6
0
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    Creating the application with sequences over a 50-symbol alphabet
    must raise a :class:`TypeError`:
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.
    """
    alph = seq.Alphabet(range(50))
    sequences = [seq.GeneralSequence(alph, [1, 2, 3]) for _ in range(2)]
    with pytest.raises(TypeError):
        app_cls(sequences)
Exemple #7
0
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    Creating the application for a custom sequence type without
    giving a substitution matrix must raise a :class:`TypeError`:
    A custom matrix is required for normally unsupported sequence
    types.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        app_cls(sequences)
Exemple #8
0
def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
    """
    Creating the application with sequences over a 50-symbol alphabet
    must raise a :class:`TypeError`:
    The alphabet of the custom sequence type cannot be longer than the
    amino acid alphabet.

    The test is skipped, if the software is not installed or the
    application raises a :class:`VersionError`.
    """
    executable = BIN_PATH[app_cls]
    if is_not_installed(executable):
        pytest.skip(f"'{executable}' is not installed")

    alph = seq.Alphabet(range(50))
    sequences = [seq.GeneralSequence(alph, [1, 2, 3]) for _ in range(2)]
    with pytest.raises(TypeError):
        # The version check is handled inside the 'raises' context,
        # so a 'VersionError' leads to a skip instead of a failure
        try:
            app_cls(sequences)
        except VersionError:
            pytest.skip(f"Invalid software version")
Exemple #9
0
def test_invalid_sequence_type_no_matrix(app_cls):
    """
    Creating the application for a custom sequence type without
    giving a substitution matrix must raise a :class:`TypeError`:
    A custom matrix is required for normally unsupported sequence
    types.

    The test is skipped, if the software is not installed or the
    application raises a :class:`VersionError`.
    """
    executable = BIN_PATH[app_cls]
    if is_not_installed(executable):
        pytest.skip(f"'{executable}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    with pytest.raises(TypeError):
        # The version check is handled inside the 'raises' context,
        # so a 'VersionError' leads to a skip instead of a failure
        try:
            app_cls(sequences)
        except VersionError:
            pytest.skip(f"Invalid software version")
Exemple #10
0
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a custom alphabet with the given
    application class and compare the result with the expected trace.

    The test is skipped, if the software is not installed or the
    application raises a :class:`VersionError` on creation.
    """
    executable = BIN_PATH[app_cls]
    if is_not_installed(executable):
        pytest.skip(f"'{executable}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    exp_trace = [
        [ 0,  0],
        [ 1, -1],
        [ 2,  1],
        [ 3,  2],
        [-1,  3],
        [ 4,  4],
        [ 5,  5],
        [ 6,  6],
    ]
    # Strong identity matrix:
    # 1000 on the diagonal, -1000 everywhere else
    score_matrix = np.where(np.identity(len(alph)) == 1, 1000.0, -1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip(f"Invalid software version")
    app.start()
    app.join()
    alignment = app.get_alignment()

    assert alignment.sequences == sequences
    assert alignment.trace.tolist() == exp_trace
Exemple #11
0
    pb_angles[:, 5] = phi[3:-1]
    pb_angles[:, 6] = psi[3:-1]
    pb_angles[:, 7] = phi[4:]
    pb_angles = np.rad2deg(pb_angles)

    # Angle RMSD of all reference angles with all actual angles
    rmsda = np.sum(
        ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 -
         180)**2,
        axis=-1)
    # Choose the PB, where the RMSDA to the reference angle is lowest
    # Due to the definition of Biotite symbol codes
    # the index of the chosen PB is directly the symbol code
    pb_seq_code = np.argmin(rmsda, axis=0)
    # Put the array of symbol codes into actual sequence objects
    pb_sequence = seq.GeneralSequence(pb_alphabet)
    pb_sequence.code = pb_seq_code
    pb_seqs.append(pb_sequence)

# Perform a multiple sequence alignment of the PB sequences
# using the substitution matrix parsed from 'matrix_str'
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
    pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False
)

# Visualize the alignment
# Order the alignment and the labels according to the guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[index] for index in order]
Exemple #12
0
# If the MSA software supports protein sequence alignment AND
# custom substitution matrices, e.g. MUSCLE and MAFFT, almost any type
# of sequence can be aligned:
# Internally the sequences and the matrix are converted into protein
# sequences/matrix.
# Then the masquerading sequences are aligned via the software and
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.

import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align

alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [
    seq.GeneralSequence(alphabet, symbols)
    for symbols in (
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    )
]
# Identical symbols score 100, different symbols score -100
matrix = align.SubstitutionMatrix(
    alphabet,
    alphabet,
    np.array([
        [ 100, -100, -100],
        [-100,  100, -100],
        [-100, -100,  100],
    ]),
)
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet does not have characters as symbols,
# the alignment cannot be printed directly.
# However, we can print the trace
print(alignment.trace)

########################################################################
# Secondary structure annotation