Esempio n. 1
0
def test_align_optimal_symmetry(sequences, local, term, gap_penalty,
                                seq_indices):
    """
    The optimal alignment score must not depend on the order in which
    the two sequences are passed to `align_optimal()`.

    `seq_indices` is unpacked into the two positions in `sequences`
    that select the pair to be aligned.
    The remaining parameters are forwarded to `align_optimal()`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    first_seq = sequences[first_index]
    second_seq = sequences[second_index]

    def first_alignment(seq_a, seq_b):
        # Only a single optimal alignment is requested
        return align.align_optimal(
            seq_a, seq_b, matrix,
            gap_penalty=gap_penalty, terminal_penalty=term,
            local=local, max_number=1
        )[0]

    forward = first_alignment(first_seq, second_seq)
    # Same call, but with swapped sequence order
    swapped = first_alignment(second_seq, first_seq)
    # Comparing every trace of both results with each other would be
    # unfeasible, hence only the scores are checked
    assert forward.score == swapped.score
Esempio n. 2
0
def test_score_scaling(sequences):
    """
    E-values should be invariant under a uniform scaling of the
    scoring scheme.

    Real sequences are aligned once with the standard substitution
    matrix and gap penalty and once with both multiplied by a constant
    factor.
    The log E-values estimated for both scoring schemes are expected
    to agree approximately.
    """
    SCALING_FACTOR = 1000
    GAP_PENALTY = (-12, -1)
    SEQ_LENGTH = 300

    def estimate_log_evalues(subst_matrix, penalty):
        # The estimator is sampled first, afterwards the local
        # alignments of neighboring sequences are scored
        estimator = align.EValueEstimator.from_samples(
            seq.ProteinSequence.alphabet, subst_matrix, penalty, BACKGROUND
        )
        pair_scores = []
        for k in range(9):
            best = align.align_optimal(
                sequences[k], sequences[k + 1], subst_matrix, penalty,
                local=True, max_number=1
            )[0]
            pair_scores.append(best.score)
        return estimator.log_evalue(pair_scores, SEQ_LENGTH, SEQ_LENGTH)

    matrix = align.SubstitutionMatrix.std_protein_matrix()

    np.random.seed(0)
    std_log_evalues = estimate_log_evalues(matrix, GAP_PENALTY)

    # Same scoring scheme, but every score is multiplied by the factor
    scaled_matrix = align.SubstitutionMatrix(
        seq.ProteinSequence.alphabet, seq.ProteinSequence.alphabet,
        matrix.score_matrix() * SCALING_FACTOR
    )
    scaled_gap_penalty = tuple(
        penalty * SCALING_FACTOR for penalty in GAP_PENALTY
    )
    scaled_log_evalues = estimate_log_evalues(
        scaled_matrix, scaled_gap_penalty
    )

    # The sample size is relatively low,
    # so a rather large deviation is tolerated
    expected = pytest.approx(scaled_log_evalues.tolist(), rel=0.2)
    assert std_log_evalues.tolist() == expected
Esempio n. 3
0
def test_simple_alignment(gap_penalty, local, band_width):
    """
    Check `align_banded()` against `align_optimal()` on a pair of
    short, highly similar sequences.

    The band spans the diagonals from `-band_width` to `band_width`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Cyclotide C, Uniprot: P86843
    seq1 = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    seq2 = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")

    optimal_alignments = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, local=local, terminal_penalty=False
    )
    # align_banded() returns true semi-global alignments,
    # so the terminal gaps are stripped from the reference
    ref_alignments = [
        align.remove_terminal_gaps(optimal) for optimal in optimal_alignments
    ]

    band = (-band_width, band_width)
    test_alignments = align.align_banded(
        seq1, seq2, matrix, band, gap_penalty=gap_penalty, local=local
    )

    assert len(test_alignments) == len(ref_alignments)
    for banded in test_alignments:
        assert banded in ref_alignments
Esempio n. 4
0
def test_align_optimal_simple(local, term, gap_penalty, input1, input2,
                              expect):
    """
    Check `align_optimal()` with constructed nucleotide test cases.

    Every returned alignment must have a string representation
    contained in `expect` and its stored score must be equal to the
    score recomputed by `score()`.
    """
    first = seq.NucleotideSequence(input1)
    second = seq.NucleotideSequence(input2)
    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    alignments = align.align_optimal(
        first, second, matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, local=local
    )

    # Each result must be one of the expected alignments
    for alignment in alignments:
        assert str(alignment) in expect
    # The separate score function must reproduce the alignment score
    for alignment in alignments:
        recomputed = align.score(
            alignment, matrix,
            gap_penalty=gap_penalty, terminal_penalty=term
        )
        assert recomputed == alignment.score
Esempio n. 5
0
def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
    """
    Test `align_optimal()` function using real world sequences,
    compared to the output of MUSCLE.

    The score of the alignment from `align_optimal()` must be at least
    as high as the score of the MUSCLE alignment, both computed via
    `score()` with penalized terminal gaps.
    On failure both alignments are printed to ease debugging.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]
    alignment = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
    )[0]
    ref_alignment = muscle.MuscleApp.align(
        [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty
    )
    # Check whether the score of the optimal alignments is the same
    # or higher as the MUSCLE alignment
    # Direct alignment comparison is not feasible,
    # since the treatment of terminal gaps is different in MUSCLE
    score = align.score(alignment, matrix, gap_penalty, terminal_penalty=True)
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )
    try:
        assert score >= ref_score
    except AssertionError:
        print("Alignment:")
        print()
        print(alignment)
        print("\n")
        print("Reference alignment:")
        print()
        # Show the MUSCLE alignment here,
        # not the tested alignment a second time
        print(ref_alignment)
        raise
Esempio n. 6
0
    def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
        """
        Generate an alignment between two protein sequences

        Parameters
        ----------
        seq1: str
            The first sequence to be aligned
        seq2: str
            The second sequence to be aligned
        local: bool
            If false, a global alignment is performed
            (based on the Needleman-Wunsch algorithm),
            otherwise a local alignment is performed
            (based on the Smith–Waterman algorithm).
            (Default: True)

        Returns
        -------
        Alignment
            An instance of `cls` created from the first optimal
            alignment.
            Its metadata holds the alignment score, the sequence
            identity, the aligned symbols and the aligned codes.
        """

        import biotite.sequence as seq
        import biotite.sequence.align as align

        # create the default matrix
        # TODO add more options for the choice of matrix
        matrix = align.SubstitutionMatrix.std_protein_matrix()

        alignments = align.align_optimal(
            seq.ProteinSequence(seq1),
            seq.ProteinSequence(seq2),
            matrix,
            local=local,
        )

        # Multiple optimal alignments may be returned,
        # only the first one is used
        alignment = alignments[0]

        return cls(
            alignment=alignment,
            metadata={
                "score": alignment.score,
                "sequence_identity": align.get_sequence_identity(alignment),
                "symbols": align.get_symbols(alignment),
                "codes": align.get_codes(alignment),
            },
        )
Esempio n. 7
0
def test_affine_gap_penalty(local, term, gap_penalty, seed):
    """
    Expect the same alignment results for a linear gap penalty and an
    affine gap penalty with the same gap open and extension penalty.

    Two random nucleotide sequences are generated from `seed` and
    aligned with both penalty variants.
    """
    LENGTH_RANGE = (10, 100)
    MAX_NUMBER = 1000

    np.random.seed(seed)
    sequences = []
    for _ in range(2):
        sequence = seq.NucleotideSequence()
        length = np.random.randint(*LENGTH_RANGE)
        sequence.code = np.random.randint(len(sequence.alphabet), size=length)
        sequences.append(sequence)

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    # Reference: linear gap penalty
    ref_alignments = align.align_optimal(*sequences, matrix, gap_penalty, term,
                                         local, MAX_NUMBER)

    # Test: affine gap penalty with equal open and extension penalty
    test_alignments = align.align_optimal(*sequences, matrix,
                                          (gap_penalty, gap_penalty), term,
                                          local, MAX_NUMBER)

    assert test_alignments[0].score == ref_alignments[0].score
    assert len(test_alignments) == len(ref_alignments)
    # We can only expect to get the same alignments in the test and
    # reference, if we get all optimal alignments
    if len(test_alignments) < MAX_NUMBER:
        for alignment in test_alignments:
            try:
                assert alignment in ref_alignments
            except AssertionError:
                # Only assertion failures are annotated with debug
                # output; other exceptions propagate untouched
                print("Test alignment:")
                print(alignment)
                print()
                print("First reference alignment")
                print(ref_alignments[0])
                raise
Esempio n. 8
0
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Check `align_banded()` against `align_optimal()` using long
    sequences that are compared pairwise.

    The band is expected to be wide enough for `align_banded()` to
    find the optimal alignment(s).
    """
    MAX_NUMBER = 100

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    seq1 = sequences[first_index]
    seq2 = sequences[second_index]

    optimal_alignments = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, local=local, terminal_penalty=False,
        max_number=MAX_NUMBER
    )
    # align_banded() returns true semi-global alignments,
    # so the terminal gaps are stripped from the reference
    ref_alignments = [
        align.remove_terminal_gaps(optimal) for optimal in optimal_alignments
    ]

    # Similar sequences get a relatively small band,
    # all other pairs get the entire search space
    identity = align.get_sequence_identity(ref_alignments[0])
    if identity > 0.5:
        band_width = 100
    else:
        band_width = len(seq1) + len(seq2)
    band = (-band_width, band_width)
    test_alignments = align.align_banded(
        seq1, seq2, matrix, band,
        gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER
    )

    try:
        assert test_alignments[0].score == ref_alignments[0].score
        # The exact alignments are only comparable,
        # if the number of traces was not limited by MAX_NUMBER
        if len(ref_alignments) < MAX_NUMBER:
            assert len(test_alignments) == len(ref_alignments)
            for banded in test_alignments:
                assert banded in ref_alignments
    except AssertionError:
        print("First tested alignment:")
        print()
        print(test_alignments[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(ref_alignments[0])
        raise
Esempio n. 9
0
def test_to_consensus_prot():
    """
    Build a sequence profile from the first optimal alignment of two
    protein sequences and compare its consensus sequence to the
    expected one.
    """
    # Avidin protein sequence
    avidin = seq.ProteinSequence(
        "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
        "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE")
    # Streptavidin protein sequence
    streptavidin = seq.ProteinSequence(
        "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
        "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
        "GNPLDAVQQ")
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_alignment = align.align_optimal(avidin, streptavidin, matrix)[0]

    profile = seq.SequenceProfile.from_alignment(first_alignment)
    expected_consensus = seq.ProteinSequence(
        "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
        "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
        "INIFNPLDAQKE")
    assert expected_consensus == profile.to_consensus()
Esempio n. 10
0
def test_scoring(sequences, gap_penalty, term, seq_indices):
    """
    Check that `score()` reproduces the score stored in an alignment
    created by `align_optimal()`.

    The alignment is printed, if the scores differ.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    first_seq = sequences[first_index]
    second_seq = sequences[second_index]
    optimal_alignments = align.align_optimal(
        first_seq, second_seq, matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, max_number=1
    )
    alignment = optimal_alignments[0]
    try:
        recomputed = align.score(alignment, matrix, gap_penalty, term)
        assert recomputed == alignment.score
    except AssertionError:
        print(alignment)
        raise
Esempio n. 11
0
def test_evalue():
    """
    Compare estimated E-values with empirical counts.

    For each test score the E-value estimated by `EValueEstimator`
    should approximately match the number of sampled random sequence
    pairs whose local alignment reaches at least this score.
    Low scores, i.e. rather high E-values, are required to get
    a reasonable accuracy.
    """
    TEST_SCORES = [30, 40, 50]
    GAP_PENALTY = (-12, -1)
    N_SAMPLES = 10000
    SEQ_LENGTH = 300

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    estimator = align.EValueEstimator.from_samples(
        seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND
    )

    # Draw the symbol codes for all random sequence pairs at once
    np.random.seed(0)
    random_sequence_code = np.random.choice(
        len(seq.ProteinSequence.alphabet),
        size=(N_SAMPLES, 2, SEQ_LENGTH),
        p=BACKGROUND
    )
    # Align each random pair and store the best local alignment score
    sample_scores = np.zeros(N_SAMPLES, dtype=int)
    for i, (code1, code2) in enumerate(random_sequence_code):
        seq1 = seq.ProteinSequence()
        seq2 = seq.ProteinSequence()
        seq1.code = code1
        seq2.code = code2
        best_alignment = align.align_optimal(
            seq1, seq2, matrix,
            local=True, gap_penalty=GAP_PENALTY, max_number=1
        )[0]
        sample_scores[i] = best_alignment.score

    e_values = []
    for score in TEST_SCORES:
        log_evalue = estimator.log_evalue(
            score, SEQ_LENGTH, SEQ_LENGTH * N_SAMPLES
        )
        e_values.append(10**log_evalue)
    counts = []
    for score in TEST_SCORES:
        counts.append(np.count_nonzero(sample_scores >= score))
    assert e_values == pytest.approx(counts, rel=0.5)
Esempio n. 12
0
def test_search_sequence():
    """
    Search the RCSB for sequences similar to the first sequence of
    the `1l2y` structure file and check that every hit fulfills the
    requested minimum sequence identity.

    For each returned ID the FASTA file is fetched and its sequence is
    aligned to the reference sequence without terminal gap penalty.
    """
    IDENTITY_CUTOFF = 0.9
    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(ref_sequence,
                               "protein",
                               min_identity=IDENTITY_CUTOFF)
    test_ids = rcsb.search(query)

    # The substitution matrix is the same for every hit
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(ref_sequence,
                                        test_sequence,
                                        matrix,
                                        terminal_penalty=False)[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
Esempio n. 13
0
def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only):
    """
    Check `align_local_gapped()` against `align_optimal()` on a pair of
    short, highly similar sequences.

    Depending on `score_only`, either only the score or the complete
    set of alignments is compared to the reference.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Cyclotide C, Uniprot: P86843
    seq1 = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    seq2 = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")

    ref_alignments = align.align_optimal(
        seq1, seq2, matrix, gap_penalty=gap_penalty, local=True
    )
    # If the tested alignment extends in one direction only,
    # the reference is cut at the seed position accordingly
    for ref in ref_alignments:
        seed_index = np.where(ref.trace[:, 0] == seed[0])[0][0]
        if direction == "upstream":
            ref.trace = ref.trace[:seed_index + 1]
        elif direction == "downstream":
            ref.trace = ref.trace[seed_index:]
        # The score is recomputed for the (possibly cut) trace
        ref.score = align.score(ref, matrix, gap_penalty)

    test_result = align.align_local_gapped(
        seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction,
        score_only
    )

    if score_only:
        # All optimal alignments share the same score,
        # so comparing with the first reference is sufficient
        assert test_result == ref_alignments[0].score
        return

    assert len(test_result) == len(ref_alignments)
    for test_alignment in test_result:
        assert test_alignment in ref_alignments
Esempio n. 14
0
def sequence_alignment(seq1: str,
                       seq2: str,
                       matrix: str,
                       gap: int,
                       local: bool = False) -> str:
    """
    Align two protein sequences optimally, either globally
    (Needleman-Wunsch) or locally (Smith-Waterman)

    Parameters
    ----------
    seq1,seq2: str
        The sequences to be aligned

    matrix: str
        Passed to `matrices()` to obtain the substitution matrix
        used for scoring

    gap: int or (tuple, dtype=int)
         If an integer is provided, the value is interpreted as general
         gap penalty.
         If a tuple is provided, an affine gap penalty is used: the first
         integer in the tuple is the gap opening penalty, the second
         integer is the gap extension penalty.
         The values need to be negative.

    local : bool, optional, default=False
        Whether to use local alignment (Smith-Waterman) or global
        (Needleman-Wunsch)

    Returns
    -------
    str
        The first optimal alignment of the two sequences.
        NOTE(review): the value is the first element returned by
        `align_optimal()` - confirm whether it is really a `str`
    """

    substitution_matrix = matrices(matrix)
    first_sequence = seq.ProteinSequence(seq1)
    second_sequence = seq.ProteinSequence(seq2)
    optimal_alignments = seq_align.align_optimal(
        first_sequence,
        second_sequence,
        substitution_matrix,
        gap_penalty=gap,
        local=local,
    )
    return optimal_alignments[0]
Esempio n. 15
0
def test_random_alignment(seed, uint8_code):
    """
    Create two randomized sequences and place a conserved region into
    each sequence, where both conserved regions are similar to each
    other.
    The conserved regions only contain point mutations and no indels.
    Expect that the alignment score found by `align_local_ungapped()` is
    equal to the alignment score found by `align_optimal()`.

    `seed` initializes the NumPy random number generator.
    If `uint8_code` is false, the sequences and the matrix are passed
    through `_convert_to_uint16_code()` before alignment.
    """
    MIN_SIZE = 200
    MAX_SIZE = 1000
    MIN_CONSERVED_SIZE = 20
    MAX_CONSERVED_SIZE = 100
    CONSERVED_ENDS = 5
    MUTATION_PROB = 0.1
    THRESHOLD = 100
    
    # All following random draws depend on this seed,
    # hence their order determines the generated test case
    np.random.seed(seed)

    # Create conserved regions
    conserved1 = ProteinSequence()
    conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE+1)
    conserved1.code = np.random.randint(
        # Do not include stop symbol for aesthetic reasons -> -1
        len(conserved1.alphabet)-1,
        size=conserved_len
    )
    conserved2 = ProteinSequence()
    # The second conserved regions is equal to the first one,
    # except a few point mutations
    conserved2.code = conserved1.code.copy()
    # Each position is selected for mutation with probability
    # MUTATION_PROB
    mutation_mask = np.random.choice(
        [False, True],
        size=conserved_len,
        p = [1 - MUTATION_PROB, MUTATION_PROB]
    )
    # The selected positions get a new random symbol code
    conserved2.code[mutation_mask] = np.random.randint(
        len(conserved2.alphabet)-1,
        size=np.count_nonzero(mutation_mask)
    )
    # Flank the conserved regions with equal termini to ensure
    # that the alignment extends from start to end of the region
    conserved2.code[:CONSERVED_ENDS] = conserved1.code[:CONSERVED_ENDS]
    conserved2.code[-CONSERVED_ENDS:] = conserved1.code[-CONSERVED_ENDS:]

    # Create randomized sequences
    seq1 = ProteinSequence()
    seq2 = ProteinSequence()
    # Start position of the conserved region in each sequence
    offset = []
    for sequence, conserved in zip(
        (seq1, seq2), (conserved1, conserved2)
    ):
        sequence.code = np.random.randint(
            len(sequence.alphabet)-1,
            size=np.random.randint(MIN_SIZE, MAX_SIZE+1)
        )
        # Place conserved region randomly within the sequence
        conserved_pos = np.random.randint(0, len(sequence) - len(conserved))
        sequence.code[conserved_pos : conserved_pos + len(conserved)] \
            = conserved.code
        offset.append(conserved_pos)
    # The seed is placed somewhere in the conserved region
    # Note that this rebinds the `seed` parameter, which is no longer
    # needed as random seed at this point
    # The same random shift is added to both offsets, so the seed
    # refers to corresponding positions in both conserved regions
    seed = np.array(offset) + np.random.randint(len(conserved))


    matrix = align.SubstitutionMatrix.std_protein_matrix()
    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)
    
    ref_score = align.align_optimal(
        seq1, seq2, matrix, local=True, max_number=1,
        # High gap penalty to prevent introduction of gaps,
        # since 'align_local_ungapped()' is also not able to place gaps
        gap_penalty=-1000
    )[0].score

    test_alignment = align.align_local_ungapped(
        seq1, seq2, matrix, seed, THRESHOLD
    )

    assert test_alignment.score == ref_score
    # Test if the score is also correctly calculated
    assert align.score(test_alignment, matrix) == ref_score
    ["CAC34569", "ACL82594"], None, "protein", "fasta"
))
# Pick the query and the hit sequence from the FASTA entries
# based on the accession contained in the entry header
for name, sequence in fasta_file.items():
    if "CAC34569" in name:
        query_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        hit_seq = seq.ProteinSequence(sequence)


# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()

# Perform a local pairwise sequence alignment and keep only a single
# optimal alignment
# GAP_PENALTY is defined outside this excerpt
# (presumably an affine gap penalty - TODO confirm)
alignment = align.align_optimal(
    query_seq, hit_seq, matrix,
    local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0]


print(f"Score: {alignment.score}")

# Plot the alignment
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"],
    show_numbers=True, show_line_position=True
)
fig.tight_layout()

########################################################################
# How can you make sure that you observe a true homology and not simply
Esempio n. 17
0
# Download both protein sequences into a single FASTA file
file_name = entrez.fetch_single_file(
    ["CAC34569", "ACL82594"],
    biotite.temp_file("sequences.fasta"),
    "protein",
    "fasta"
)
file = fasta.FastaFile.read(file_name)
# Assign the sequences based on the accession in the entry header
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(
    avidin_seq, streptavidin_seq, matrix,
    gap_penalty=(-10, -1), terminal_penalty=False
)
# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignments[0], matrix=matrix,
    labels=["Avidin", "Streptavidin"],
    show_numbers=True, show_line_position=True
)
fig.tight_layout()

plt.show()
Esempio n. 18
0
# And now create a matrix by directly providing the ndarray
# containing the similarity scores
# (identity matrix in our case)
# `alph` is defined outside this excerpt
# (presumably an alphabet object - TODO confirm)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
print("\n\nIdentity matrix\n")
print(matrix)

########################################################################
# For our protein alignment we will use the standard *BLOSUM62* matrix.

seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("IQLITE")
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Align the same sequence pair locally and globally
# and print every returned alignment
print("\nLocal alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=True)
for ali in alignments:
    print(ali)
print("Global alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=False)
for ali in alignments:
    print(ali)

########################################################################
# The alignment functions return a list of :class:`Alignment` objects.
# This object saves the input sequences together with a so called trace
# - the indices to symbols in these sequences that are aligned to each
# other (*-1* for a gap).
# Additionally the alignment score is stored in this object.
# Furthermore, this object can prettyprint the alignment into a human
# readable form.
Esempio n. 19
0
def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices):
    """
    Test `align_local_gapped()` by comparing the output to
    `align_optimal()`.
    This test uses a set of long sequences, which are pairwise compared.
    The threshold should be chosen sufficiently large so
    `align_local_gapped()` can return the optimal alignment(s).

    The seed for `align_local_gapped()` is taken from the center of the
    first reference alignment.
    Depending on `score_only`, either only the score or the
    alignments themselves are compared to the reference.
    """
    MAX_NUMBER = 100
    # The linear gap penalty for longer gaps easily exceeds
    # a small threshold -> increase threshold for linear penalty
    THRESHOLD = 200 if isinstance(gap_penalty, int) else 50

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]

    ref_alignments = align.align_optimal(seq1,
                                         seq2,
                                         matrix,
                                         gap_penalty=gap_penalty,
                                         local=True,
                                         max_number=MAX_NUMBER)
    # Select the center of the alignment as seed
    trace = ref_alignments[0].trace
    # Ignore gapped columns, as a seed requires a position
    # in both sequences
    trace = trace[(trace != -1).all(axis=1)]
    seed = trace[len(trace) // 2]

    test_result = align.align_local_gapped(seq1, seq2, matrix, seed, THRESHOLD,
                                           gap_penalty, MAX_NUMBER, "both",
                                           score_only)

    if score_only:
        test_score = test_result
        # All optimal alignments have the same score
        assert test_score == ref_alignments[0].score
    else:
        test_alignments = test_result
        # Index of the alignment that is printed on failure
        # It must be defined before the loop below is entered,
        # as the score assertions may already fail: otherwise the
        # failure handler would raise a NameError that hides the
        # actual assertion error
        i = 0
        try:
            assert test_alignments[0].score == ref_alignments[0].score
            # Test if the score is also correctly calculated
            assert align.score(test_alignments[0], matrix, gap_penalty) \
                == ref_alignments[0].score
            if len(ref_alignments) < MAX_NUMBER \
               and len(test_alignments) < MAX_NUMBER:
                # Only test if the exact same alignments were created,
                # if the number of traces was not limited by MAX_NUMBER
                max_ref_length = max(len(ali) for ali in ref_alignments)
                for i, alignment in enumerate(test_alignments):
                    try:
                        assert alignment in ref_alignments
                    except AssertionError:
                        # Edge case:
                        # In rare case the local alignment may be
                        # slightly longer on the upstream side for
                        # 'align_local_gapped()', since the
                        # upstream side is handled in an inverted
                        # manner
                        # However this does not affect the score
                        # Consequently, the exception is ignored
                        # if the alignment is longer than all
                        # reference alignments
                        if len(alignment) <= max_ref_length:
                            raise
        except AssertionError:
            print(f"Missing test alignment at index {i}:")
            print()
            print(test_alignments[i])
            print("\n")
            print("First reference alignment:")
            print()
            print(ref_alignments[0])
            raise
Esempio n. 20
0
    "FP":  ( 788,  806),
    # Transmembrane domain
    "TM":  (1214, 1234),
    # Cytoplasmatic tail
    "CT":  (1269, 1273),
}

# Get RNA sequence coding for spike protein from the reference genome
# The feature whose "gene" qualifier is "S" is selected
for feature in annot_seq.annotation:
    if feature.qual["gene"] == "S":
        orig_spike_seq = annot_seq[feature]

# Locally align the reference spike coding sequence to the variant
# genome to get the B.1.1.7 spike coding sequence
# `matrix` is defined outside this excerpt
alignment = align.align_optimal(
    var_genome, orig_spike_seq, matrix, local=True, max_number=1
)[0]
# The first trace column refers to the variant genome:
# take all of its aligned positions that are not a gap (-1)
var_spike_seq = var_genome[alignment.trace[alignment.trace[:,0] != -1, 0]]

# Obtain protein sequences from RNA sequences
orig_spike_prot_seq = orig_spike_seq.translate(complete=True).remove_stops()
var_spike_prot_seq  =  var_spike_seq.translate(complete=True).remove_stops()

# Align both protein sequences with each other for later comparison
# Only a single optimal alignment is requested
blosum_matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_optimal(
    var_spike_prot_seq, orig_spike_prot_seq, blosum_matrix, max_number=1
)[0]


fig = plt.figure(figsize=(8.0, 10.0))
Esempio n. 21
0
        leul_feature = feature
# Get leuL sequence
# `leul_feature` is selected before this excerpt
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(leul_seq,
                                 se_genome,
                                 matrix,
                                 gap_penalty=-7,
                                 local=True)
# Do the same for reverse complement genome
# Same matrix and gap penalty, so the scores of both strands
# are comparable
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(leul_seq,
                                     se_genome_rev,
                                     matrix,
                                     gap_penalty=-7,
                                     local=True)

########################################################################
# Now that we have both alignments (forward and reverse strand),
# we can check which of them has a higher score.
# We simply take the score of the first alignment in each list.
# Due to the nature of the dynamic programming algorithm, every
Esempio n. 22
0
# Print every FASTA entry (header and sequence string)
# `c_file` and `m_file` are defined outside this excerpt
# The sequence variable is overwritten in each iteration,
# so the last entry of each file is the one used below
for h, s in c_file.items():
    print(h)
    print(s)
    covid_seq = seq.NucleotideSequence(s)
for h, s in m_file.items():
    print(h)
    print(s)
    mers_seq = seq.NucleotideSequence(s)
# Restrict the alignment to the first 100 symbols of each sequence
mini_covid_seq = covid_seq[0:100]
mini_mers_seq = mers_seq[0:100]
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(mini_covid_seq,
                                 mini_mers_seq,
                                 matrix,
                                 gap_penalty=(-10, -1),
                                 terminal_penalty=False)

# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(ax,
                                         alignments[0],
                                         matrix=matrix,
                                         labels=["SARS_Covid", "MERS"],
                                         show_numbers=True,
                                         show_line_position=True)
fig.tight_layout()