Example #1
0
def test_alignment_str():
    """
    Check that an alignment created from gapped strings is rendered
    back to exactly those strings by ``str()``.
    """
    expected_lines = ["A-CCTGA----", "----T-ATGCT"]
    sequences = [
        seq.NucleotideSequence("ACCTGA"),
        seq.NucleotideSequence("TATGCT"),
    ]
    trace = align.Alignment.trace_from_strings(expected_lines)
    alignment = align.Alignment(sequences, trace, None)
    # Each line of the string representation is one gapped sequence
    rendered_lines = str(alignment).split("\n")
    assert rendered_lines == expected_lines
Example #2
0
def plot_pb_scheme_alignment():
    """
    Plot an alignment of PB sequences using a color scheme generated
    for the PB alphabet.

    The function writes a PB substitution matrix to a temporary file,
    invokes ``gecli.main()`` with that matrix and the 16-letter
    alphabet ``a``-``p``, loads the colors from the scheme file and
    uses them to draw the alignment read from
    ``PB_EXAMPLE_FILE_NAME``.

    Returns
    -------
    fig
        The figure created via ``plt.figure()`` that contains the
        alignment plot.
    """
    # Fix the seed of the `random` module
    # NOTE(review): presumably makes the scheme generation by `gecli`
    # reproducible - confirm that `gecli` draws from `random`
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        file.write("""
                a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p
            a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
            b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
            c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6
            d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
            e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
            f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
            g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
            h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
            i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
            j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
            k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
            l -259    29  -682  -670  -514   -88  -138  -108    15     5   -65   533  -131     8   -11  -316
            m -599  -745  -608 -1573 -1136  -547  -124  -712  -709  -398  -270  -131   241    -4  -190  -155
            n -372  -242  -455 -1048  -469  -629   172  -359  -269  -340  -231     8    -4   703    88   146
            o -124  -165  -147  -691  -617  -406   128    95  -169  -117  -471   -11  -190    88   716    58
            p  -83    22     6  -497  -632  -552   254  -399   226  -104  -382  -316  -155   146    58   609
            """)
    # Run the `gecli` command line entry point with the matrix file
    # written above; the path given to `-f` is the file the color
    # scheme is loaded from afterwards
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop", "--matrix", mat_file, "--contrast",
        "300", "--lmin", "65", "--lmax", "70", "-f", scheme_file
    ])

    # Only the "colors" entry of the loaded scheme is needed for plotting
    colors = graphics.load_color_scheme(scheme_file)["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()

    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    # The FASTA entries are gapped alignment rows:
    # the gap characters are stripped to obtain the sequences,
    # while the unmodified strings define the alignment trace
    seq_strings = list(fasta_file.values())
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    graphics.plot_alignment_type_based(ax,
                                       alignment,
                                       symbols_per_line=60,
                                       spacing=2,
                                       color_scheme=colors)

    fig.tight_layout()
    return fig
Example #3
0
def test_identity():
    """
    Verify the sequence identity of a small protein alignment for the
    modes ``"all"``, ``"not_terminal"`` and ``"shortest"``.
    """
    gapped_rows = ["--HAKLPRDD--WL--", "FRHA--QRTDADWLHH"]
    # The ungapped sequences are obtained by stripping the gap characters
    ungapped = [
        seq.ProteinSequence(row.replace("-", "")) for row in gapped_rows
    ]
    trace = align.Alignment.trace_from_strings(gapped_rows)
    alignment = align.Alignment(ungapped, trace, score=None)
    # Expected identity for each mode
    expected_identity = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, expected in expected_identity.items():
        assert align.get_sequence_identity(alignment, mode=mode) == expected
Example #4
0
def test_from_alignment():
    """
    Create a sequence profile from a two-sequence nucleotide alignment
    and compare its symbol counts, gap counts and alphabet with
    reference values.
    """
    gapped_rows = ["CGTCAT--", "--TCATGC"]
    sequences = [
        seq.NucleotideSequence("CGTCAT"),
        seq.NucleotideSequence("TCATGC"),
    ]
    trace = align.Alignment.trace_from_strings(gapped_rows)
    alignment = align.Alignment(sequences, trace, None)

    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column,
    # one entry per symbol of the reference alphabet below
    ref_symbols = np.array([
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    # One gap count per alignment column
    ref_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    ref_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(ref_symbols, profile.symbols)
    assert np.array_equal(ref_gaps, profile.gaps)
    assert ref_alphabet == profile.alphabet
Example #5
0
def test_conversion_to_symbols():
    """
    Test that the symbols obtained from an alignment reproduce the
    gapped strings the alignment was created from.
    """
    gapped_rows = [
        "HAKLPRDD--WKL--",
        "HA--PRDDADWKLHH",
        "HA----DDADWKLHH",
    ]
    sequences = []
    for row in gapped_rows:
        sequences.append(seq.ProteinSequence(row.replace("-", "")))
    trace = align.Alignment.trace_from_strings(gapped_rows)
    alignment = align.Alignment(sequences, trace, score=None)
    # Test the conversion back to strings of symbols:
    # a missing symbol (None) is rendered as gap character
    regenerated = []
    for symbol_row in align.get_symbols(alignment):
        characters = [
            "-" if symbol is None else symbol for symbol in symbol_row
        ]
        regenerated.append("".join(characters))
    assert regenerated == gapped_rows
Example #6
0
def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
                           ref_range1, ref_range2,
                           direction, score_only, uint8_code):
    """
    Check if `align_local_ungapped()` produces correct alignments based
    on simple known examples.

    The reference alignment is the gap-free alignment spanning
    `ref_range1` in the first and `ref_range2` in the second sequence.
    """
    # If the alignment extends only in one direction,
    # the reference range stops (upstream) or starts (downstream)
    # at the seed position
    if direction == "upstream":
        ref_range1, ref_range2 = (
            (ref_range1[0], seed[0] + 1),
            (ref_range2[0], seed[1] + 1),
        )
    elif direction == "downstream":
        ref_range1, ref_range2 = (
            (seed[0], ref_range1[1]),
            (seed[1], ref_range2[1]),
        )

    seq1, seq2 = seq_type(seq1), seq_type(seq2)

    matrix = (
        align.SubstitutionMatrix.std_nucleotide_matrix()
        if seq_type == seq.NucleotideSequence
        else align.SubstitutionMatrix.std_protein_matrix()
    )

    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    # Each trace row pairs one position of the first sequence
    # with one position of the second sequence
    ref_trace = np.stack(
        [np.arange(*ref_range1), np.arange(*ref_range2)], axis=-1
    )
    ref_alignment = align.Alignment([seq1, seq2], ref_trace)
    ref_score = align.score(ref_alignment, matrix)
    ref_alignment.score = ref_score

    test_result = align.align_local_ungapped(
        seq1, seq2, matrix, seed, threshold, direction, score_only
    )

    # With `score_only` the result is compared to the reference score,
    # otherwise to the reference alignment
    expected = ref_score if score_only else ref_alignment
    assert test_result == expected
Example #7
0
        seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"),
        seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"),
        seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"),
        seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"),
        seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"),
        seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"),
        seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"),
        seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
        seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"),
        seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"),
        seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"),
        seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"),
        seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"),
        seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")]
# The sequences are used as they are, no alignment step is performed
# -> The alignment gets a trivial, gap-free trace,
#    where row i contains the index i for every sequence:
# [[0 0 0 ...]
#  [1 1 1 ...]
#  [2 2 2 ...]
#     ...     ]
alignment = align.Alignment(
    sequences=seqs,
    trace=(
        np.tile(np.arange(len(seqs[0])), len(seqs))
        .reshape(len(seqs), len(seqs[0]))
        .T
    ),
    score=0,
)
# Generate the sequence logo from the alignment and display it
logo = graphics.SequenceLogo(alignment, 800, 100)
fig = logo.generate()
plt.show()
Example #8
0
def test_new_position_matrices():
    """
    Check the probability matrix, the log-odds matrix, the sequence
    probability and the sequence score of a profile created from six
    gap-free nucleotide sequences against reference values.
    """
    sequences = [
        seq.NucleotideSequence(symbols)
        for symbols in (
            "AAGAAT", "ATCATA", "AAGTAA", "AACAAA", "ATTAAA", "AAGAAT"
        )
    ]
    # Trivial gap-free trace: row i contains the index i for each sequence
    n_seqs = len(sequences)
    seq_len = len(sequences[0])
    trivial_trace = (
        np.tile(np.arange(seq_len), n_seqs).reshape(n_seqs, seq_len).T
    )
    alignment = align.Alignment(
        sequences=sequences, trace=trivial_trace, score=0
    )
    profile = seq.SequenceProfile.from_alignment(alignment)
    query = seq.NucleotideSequence("AAAAAA")

    # Probabilities without pseudocount
    ref_ppm = np.array([
        [1.0, 0.0, 0.0, 0.0],
        [0.66666667, 0.0, 0.0, 0.33333333],
        [0.0, 0.33333333, 0.5, 0.16666667],
        [0.83333333, 0.0, 0.0, 0.16666667],
        [0.83333333, 0.0, 0.0, 0.16666667],
        [0.66666667, 0.0, 0.0, 0.33333333],
    ])
    assert np.allclose(ref_ppm, profile.probability_matrix(), atol=1e-3)
    assert profile.sequence_probability(query) == 0.0

    # Probabilities with a pseudocount of 1
    ref_ppm_pseudo = np.array([
        [0.89285714, 0.03571429, 0.03571429, 0.03571429],
        [0.60714286, 0.03571429, 0.03571429, 0.32142857],
        [0.03571429, 0.32142857, 0.46428571, 0.17857143],
        [0.75, 0.03571429, 0.03571429, 0.17857143],
        [0.75, 0.03571429, 0.03571429, 0.17857143],
        [0.60714286, 0.03571429, 0.03571429, 0.32142857],
    ])
    assert np.allclose(
        ref_ppm_pseudo, profile.probability_matrix(pseudocount=1), atol=1e-3
    )
    assert profile.sequence_probability(
        query, pseudocount=1
    ) == pytest.approx(0.0066, abs=1e-3)

    # Log-odds values and sequence score with a pseudocount of 1
    ref_pwm = np.array([
        [1.83650127, -2.80735492, -2.80735492, -2.80735492],
        [1.28010792, -2.80735492, -2.80735492, 0.36257008],
        [-2.80735492, 0.36257008, 0.8930848, -0.48542683],
        [1.5849625, -2.80735492, -2.80735492, -0.48542683],
        [1.5849625, -2.80735492, -2.80735492, -0.48542683],
        [1.28010792, -2.80735492, -2.80735492, 0.36257008],
    ])
    assert np.allclose(
        ref_pwm, profile.log_odds_matrix(pseudocount=1), atol=1e-3
    )
    assert profile.sequence_score(
        query, pseudocount=1
    ) == pytest.approx(4.7593, abs=1e-3)