コード例 #1
0
def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
    """
    Test `align_optimal()` function using real world sequences,
    compared to the output of MUSCLE.

    `seq_indices` is a pair of indices selecting the two entries of
    `sequences` that are aligned with each other.
    `gap_penalty` is forwarded unchanged to both alignment methods and
    to the scoring function.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]
    # Only a single optimal alignment is required for the score check
    alignment = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
    )[0]
    ref_alignment = muscle.MuscleApp.align(
        [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty
    )
    # Check whether the score of the optimal alignment is the same
    # as or higher than the score of the MUSCLE alignment
    # Direct alignment comparison is not feasible,
    # since the treatment of terminal gaps is different in MUSCLE
    score = align.score(alignment, matrix, gap_penalty, terminal_penalty=True)
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )
    try:
        assert score >= ref_score
    except AssertionError:
        # Show both alignments to make the failure easier to diagnose
        print("Alignment:")
        print()
        print(alignment)
        print("\n")
        print("Reference alignment:")
        print()
        # Print the MUSCLE alignment here, not the test alignment again
        print(ref_alignment)
        raise
コード例 #2
0
ファイル: test_multiple.py プロジェクト: Discngine/biotite
def test_align_multiple(sequences, gap_penalty):
    r"""
    Test `align_multiple()` function using actual long sequences,
    compared to the output of MUSCLE.
    Both alignment methods are heuristic, so the exact same result is
    not expected.
    Just assert that the resulting score is at least 50 % of the
    score of the MUSCLE alignment.

    The test is skipped if running MUSCLE raises a `VersionError`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # Only the alignment itself is needed here; the remaining three
    # return values are deliberately discarded
    test_alignment, _, _, _ = align.align_multiple(
        sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True)
    test_score = align.score(test_alignment,
                             matrix,
                             gap_penalty,
                             terminal_penalty=True)

    try:
        ref_alignment = muscle.MuscleApp.align(sequences,
                                               matrix=matrix,
                                               gap_penalty=gap_penalty)
    except VersionError:
        pytest.skip("Invalid Muscle software version")
    ref_score = align.score(ref_alignment,
                            matrix,
                            gap_penalty,
                            terminal_penalty=True)

    assert test_score >= ref_score * 0.5
コード例 #3
0
ファイル: test_pairwise.py プロジェクト: ebetica/biotite
def test_align_optimal_simple(local, term, gap_penalty, input1, input2,
                              expect):
    """
    Check `align_optimal()` against hand-constructed test cases.

    Every returned alignment must have a string representation that is
    contained in `expect`, and the separate `score()` function must
    reproduce the score stored in each alignment.
    """
    first = seq.NucleotideSequence(input1)
    second = seq.NucleotideSequence(input2)
    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    # Penalty settings shared by the alignment and the scoring call
    penalties = dict(gap_penalty=gap_penalty, terminal_penalty=term)

    results = align.align_optimal(
        first, second, matrix, local=local, **penalties
    )

    # All alignments must be among the expected ones
    for result in results:
        assert str(result) in expect

    # The independently calculated score must match the stored score
    for result in results:
        recalculated = align.score(result, matrix, **penalties)
        assert recalculated == result.score
コード例 #4
0
def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
                           ref_range1, ref_range2,
                           direction, score_only, uint8_code):
    """
    Check if `align_local_ungapped()` produces correct alignments based
    on simple known examples.

    The reference alignment is built from the index ranges
    `ref_range1` and `ref_range2`, which are truncated at the seed if
    the alignment extends in one direction only.
    """
    # An alignment that extends in one direction only starts or stops
    # at the seed position
    if direction == "upstream":
        ref_range1 = (ref_range1[0], seed[0] + 1)
        ref_range2 = (ref_range2[0], seed[1] + 1)
    elif direction == "downstream":
        ref_range1 = (seed[0], ref_range1[1])
        ref_range2 = (seed[1], ref_range2[1])

    seq1 = seq_type(seq1)
    seq2 = seq_type(seq2)

    matrix = (
        align.SubstitutionMatrix.std_nucleotide_matrix()
        if seq_type == seq.NucleotideSequence
        else align.SubstitutionMatrix.std_protein_matrix()
    )

    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    # Ungapped reference trace: one column of indices per sequence
    indices1 = np.arange(*ref_range1)
    indices2 = np.arange(*ref_range2)
    ref_trace = np.stack([indices1, indices2], axis=-1)
    ref_alignment = align.Alignment([seq1, seq2], ref_trace)
    ref_score = align.score(ref_alignment, matrix)
    ref_alignment.score = ref_score

    test_result = align.align_local_ungapped(
        seq1, seq2, matrix, seed, threshold, direction, score_only)

    # Depending on `score_only` the result is compared to either the
    # reference score or the reference alignment
    expected = ref_score if score_only else ref_alignment
    assert test_result == expected
コード例 #5
0
def test_scoring(sequences, gap_penalty, term, seq_indices):
    """
    Check that `score()` reproduces the score stored in an alignment
    created by `align_optimal()`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    first = sequences[first_index]
    second = sequences[second_index]
    # A single optimal alignment is sufficient for this check
    optimal = align.align_optimal(
        first, second, matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, max_number=1
    )[0]
    try:
        recalculated = align.score(optimal, matrix, gap_penalty, term)
        assert recalculated == optimal.score
    except AssertionError:
        # Show the offending alignment before propagating the failure
        print(optimal)
        raise
コード例 #6
0
def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only):
    """
    Compare the result of `align_local_gapped()` with the result of
    `align_optimal()` for a pair of short, highly similar sequences.

    If the alignment extends in one direction only, the reference
    alignments are truncated at the seed before the comparison.
    """
    # Cyclotide C, Uniprot: P86843
    seq1 = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    seq2 = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    ref_alignments = align.align_optimal(
        seq1, seq2, matrix, gap_penalty=gap_penalty, local=True
    )
    for reference in ref_alignments:
        # First trace row where the first sequence is at the seed position
        seed_rows = np.where(reference.trace[:, 0] == seed[0])[0]
        seed_row = seed_rows[0]
        # Cut the reference at the seed for one-directional alignments
        if direction == "upstream":
            reference.trace = reference.trace[:seed_row + 1]
        elif direction == "downstream":
            reference.trace = reference.trace[seed_row:]
        # The score is recalculated for the (possibly truncated) trace
        reference.score = align.score(reference, matrix, gap_penalty)

    test_result = align.align_local_gapped(
        seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction,
        score_only
    )

    if score_only:
        # All optimal alignments have the same score
        assert test_result == ref_alignments[0].score
        return

    assert len(test_result) == len(ref_alignments)
    for candidate in test_result:
        assert candidate in ref_alignments
コード例 #7
0
def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices):
    """
    Test `align_local_gapped()` by comparing the output to
    `align_optimal()`.
    This test uses a set of long sequences, which are pairwise compared.
    The threshold should be chosen sufficiently large so
    `align_local_gapped()` can return the optimal alignment(s).

    `seq_indices` is a pair of indices selecting the two entries of
    `sequences` that are aligned with each other.
    If `score_only` is true, only the returned score is compared with
    the score of the reference alignment.
    """
    MAX_NUMBER = 100
    # The linear gap penalty for longer gaps easily exceeds
    # a small threshold -> increase threshold for linear penalty
    THRESHOLD = 200 if isinstance(gap_penalty, int) else 50

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]

    ref_alignments = align.align_optimal(seq1,
                                         seq2,
                                         matrix,
                                         gap_penalty=gap_penalty,
                                         local=True,
                                         max_number=MAX_NUMBER)
    # Select the center of the alignment as seed
    trace = ref_alignments[0].trace
    # Only gap-free trace rows are eligible as seed
    trace = trace[(trace != -1).all(axis=1)]
    seed = trace[len(trace) // 2]

    test_result = align.align_local_gapped(seq1, seq2, matrix, seed, THRESHOLD,
                                           gap_penalty, MAX_NUMBER, "both",
                                           score_only)

    if score_only:
        test_score = test_result
        # All optimal alignments have the same score
        assert test_score == ref_alignments[0].score
    else:
        test_alignments = test_result
        # Index of the test alignment shown in the failure report
        # It is initialized before any assertion runs, so the report
        # below also works if a score assertion fails before the loop
        i = 0
        try:
            assert test_alignments[0].score == ref_alignments[0].score
            # Test if the score is also correctly calculated
            assert align.score(test_alignments[0], matrix, gap_penalty) \
                == ref_alignments[0].score
            if len(ref_alignments) < MAX_NUMBER \
               and len(test_alignments) < MAX_NUMBER:
                # Only test if the exact same alignments were created,
                # if the number of traces was not limited by MAX_NUMBER
                max_ref_length = max(len(ali) for ali in ref_alignments)
                for i, alignment in enumerate(test_alignments):
                    # Edge case:
                    # In rare cases the local alignment may be
                    # slightly longer on the upstream side for
                    # 'align_local_gapped()', since the
                    # upstream side is handled in an inverted
                    # manner
                    # However this does not affect the score
                    # Consequently, a missing alignment is tolerated
                    # if it is longer than all reference alignments
                    assert alignment in ref_alignments \
                        or len(alignment) > max_ref_length
        except AssertionError:
            print(f"Missing test alignment at index {i}:")
            print()
            print(test_alignments[i])
            print("\n")
            print("First reference alignment:")
            print()
            print(ref_alignments[0])
            raise
コード例 #8
0
                                         matrix=matrix,
                                         symbols_per_line=len(alignments[0]))
fig.tight_layout()

########################################################################
# If you are interested in more advanced visualization examples, have a
# look at the :doc:`example gallery <../examples/gallery/index>`.
#
# You can also do some simple analysis on these objects, like
# determining the sequence identity or calculating the score.
# For further custom analysis, it can be convenient to directly obtain
# the aligned symbol codes instead of the trace.

# Inspect the first alignment of the list
alignment = alignments[0]
# Score stored in the alignment object
print("Score: ", alignment.score)
# Score recomputed from the alignment and the substitution matrix
print("Recalculated score:", align.score(alignment, matrix=matrix))
print("Sequence identity:", align.get_sequence_identity(alignment))
# Aligned symbols and their codes, as alternatives to the trace
print("Symbols:")
print(align.get_symbols(alignment))
print("symbols codes:")
print(align.get_codes(alignment))

########################################################################
#
# .. currentmodule:: biotite.sequence.io.fasta
#
# You may ask why you should recalculate the score, when it has
# already been calculated directly by :func:`align_optimal()`.
# The answer is that you might load an alignment from an external
# alignment program as a FASTA file using :func:`get_alignment()`.
#
コード例 #9
0
def test_random_alignment(seed, uint8_code):
    """
    Build two random sequences that each contain a conserved region,
    where both conserved regions are similar to each other.
    The conserved regions differ only by point mutations, they contain
    no indels.
    The alignment score found by `align_local_ungapped()` is expected
    to be equal to the alignment score found by `align_optimal()`.
    """
    MIN_SIZE = 200
    MAX_SIZE = 1000
    MIN_CONSERVED_SIZE = 20
    MAX_CONSERVED_SIZE = 100
    CONSERVED_ENDS = 5
    MUTATION_PROB = 0.1
    THRESHOLD = 100

    np.random.seed(seed)

    # First conserved region: random symbol codes
    region_length = np.random.randint(
        MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1
    )
    region_a = ProteinSequence()
    # Do not include stop symbol for aesthetic reasons -> -1
    region_a.code = np.random.randint(
        len(region_a.alphabet) - 1, size=region_length
    )

    # Second conserved region: copy of the first one
    # with a few point mutations
    region_b = ProteinSequence()
    region_b.code = region_a.code.copy()
    is_mutated = np.random.choice(
        [False, True],
        size=region_length,
        p=[1 - MUTATION_PROB, MUTATION_PROB]
    )
    mutation_count = np.count_nonzero(is_mutated)
    region_b.code[is_mutated] = np.random.randint(
        len(region_b.alphabet) - 1, size=mutation_count
    )
    # Both regions get identical termini to ensure that the alignment
    # extends from start to end of the region
    region_b.code[:CONSERVED_ENDS] = region_a.code[:CONSERVED_ENDS]
    region_b.code[-CONSERVED_ENDS:] = region_a.code[-CONSERVED_ENDS:]

    # Embed each conserved region into a random sequence
    seq1 = ProteinSequence()
    seq2 = ProteinSequence()
    offsets = []
    for target, region in ((seq1, region_a), (seq2, region_b)):
        total_length = np.random.randint(MIN_SIZE, MAX_SIZE + 1)
        target.code = np.random.randint(
            len(target.alphabet) - 1, size=total_length
        )
        # Random position of the conserved region within the sequence
        start = np.random.randint(0, len(target) - len(region))
        target.code[start : start + len(region)] = region.code
        offsets.append(start)
    # The alignment seed lies somewhere in the conserved region
    alignment_seed = np.array(offsets) + np.random.randint(len(region_b))

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    # High gap penalty to prevent introduction of gaps,
    # since 'align_local_ungapped()' is also not able to place gaps
    ref_alignment = align.align_optimal(
        seq1, seq2, matrix, local=True, max_number=1, gap_penalty=-1000
    )[0]
    ref_score = ref_alignment.score

    test_alignment = align.align_local_ungapped(
        seq1, seq2, matrix, alignment_seed, THRESHOLD
    )

    assert test_alignment.score == ref_score
    # Test if the score is also correctly calculated
    assert align.score(test_alignment, matrix) == ref_score
コード例 #10
0
# The scores are put into an array with the index being the
# corresponding position of the HCN1 sequence.

matrix = align.SubstitutionMatrix.std_protein_matrix()
scores = np.zeros(len(hcn1))
for i in range(len(alignment)):
    # A single column is itself an alignment with length 1
    column = alignment[i:i + 1]
    hcn1_index = column.trace[0, 0]
    # The similarity score is analyzed in dependence of the HCN1
    # sequence position, hence only columns without a gap (-1) in the
    # HCN1 row contribute a score
    if hcn1_index != -1:
        scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5)

# Smooth the per-position scores with a moving average
scores = moving_average(scores, 2 * ma_radius + 1)

########################################################################
# Now the hydropathy and the similarity score can be plotted.

figure = plt.figure(figsize=(8.0, 4.0))
ax = figure.add_subplot(111)

# Plot hydropathy
ax.plot(np.arange(1 + ma_radius,
                  len(hcn1) - ma_radius + 1),
        hydropathies,
        color=biotite.colors["dimorange"])
ax.axhline(0, color="gray", linewidth=0.5)