def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
    """
    Test `align_optimal()` function using real world sequences,
    compared to the output of MUSCLE.

    `seq_indices` holds the two indices used to pick the sequence pair
    from `sequences`; `gap_penalty` is forwarded unchanged to the
    alignment, the MUSCLE call and the scoring.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]

    # Only a single optimal alignment is requested (max_number=1)
    alignment = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
    )[0]
    ref_alignment = muscle.MuscleApp.align(
        [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty
    )

    # Check whether the score of the optimal alignments is the same
    # or higher as the MUSCLE alignment
    # Direct alignment comparison is not feasible,
    # since the treatment of terminal gaps is different in MUSCLE
    score = align.score(alignment, matrix, gap_penalty, terminal_penalty=True)
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )
    try:
        assert score >= ref_score
    except AssertionError:
        # Show both alignments to make the failure diagnosable
        print("Alignment:")
        print()
        print(alignment)
        print("\n")
        print("Reference alignment:")
        print()
        print(ref_alignment)
        raise
def test_align_multiple(sequences, gap_penalty):
    """
    Test `align_multiple()` function using actual long sequences,
    compared to the output of MUSCLE.
    Both alignment methods are heuristic, the exact same result is not
    expected.
    Just assert that the resulting score is at least 50 % of the score
    of the MUSCLE alignment.

    The test is skipped if the MUSCLE call raises a `VersionError`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # Only the alignment itself is evaluated,
    # the other three return values are not needed here
    test_alignment, _order, _tree, _distances = align.align_multiple(
        sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True
    )
    test_score = align.score(
        test_alignment, matrix, gap_penalty, terminal_penalty=True
    )

    try:
        ref_alignment = muscle.MuscleApp.align(
            sequences, matrix=matrix, gap_penalty=gap_penalty
        )
    except VersionError:
        pytest.skip("Invalid Muscle software version")
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )

    assert test_score >= ref_score * 0.5
def test_align_optimal_simple(local, term, gap_penalty,
                              input1, input2, expect):
    """
    Check `align_optimal()` against constructed test cases:
    each returned alignment must be one of the expected string
    representations and its stored score must match the score
    recomputed by `score()`.
    """
    first, second = (
        seq.NucleotideSequence(symbols) for symbols in (input1, input2)
    )
    substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    results = align.align_optimal(
        first, second, substitution_matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, local=local
    )

    # Every alignment found must be among the expected ones
    for result in results:
        assert str(result) in expect

    # The independent score function must agree with the score
    # attached to each alignment
    for result in results:
        recalculated = align.score(
            result, substitution_matrix,
            gap_penalty=gap_penalty, terminal_penalty=term
        )
        assert recalculated == result.score
def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
                           ref_range1, ref_range2,
                           direction, score_only, uint8_code):
    """
    Check if `align_local_ungapped()` produces correct alignments based
    on simple known examples.

    The reference alignment is built directly from the given index
    ranges of both sequences.
    """
    # If the alignment extends only in one direction, the reference
    # range must start or stop at the seed position
    if direction == "upstream":
        ref_range1, ref_range2 = (
            (ref_range1[0], seed[0] + 1),
            (ref_range2[0], seed[1] + 1),
        )
    elif direction == "downstream":
        ref_range1, ref_range2 = (
            (seed[0], ref_range1[1]),
            (seed[1], ref_range2[1]),
        )

    seq1 = seq_type(seq1)
    seq2 = seq_type(seq2)

    matrix = (
        align.SubstitutionMatrix.std_nucleotide_matrix()
        if seq_type == seq.NucleotideSequence
        else align.SubstitutionMatrix.std_protein_matrix()
    )

    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    # Ungapped reference trace: one column of indices per sequence
    indices1 = np.arange(*ref_range1)
    indices2 = np.arange(*ref_range2)
    ref_alignment = align.Alignment(
        [seq1, seq2], np.stack([indices1, indices2], axis=-1)
    )
    ref_score = align.score(ref_alignment, matrix)
    ref_alignment.score = ref_score

    test_result = align.align_local_ungapped(
        seq1, seq2, matrix, seed, threshold, direction, score_only
    )

    # In score-only mode a plain score is compared,
    # otherwise the complete alignment
    expected = ref_score if score_only else ref_alignment
    assert test_result == expected
def test_scoring(sequences, gap_penalty, term, seq_indices):
    """
    Test `score()` function:
    the score recomputed for an optimal alignment must equal the score
    stored in the alignment by `align_optimal()`.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    first_seq = sequences[first_index]
    second_seq = sequences[second_index]

    optimal = align.align_optimal(
        first_seq, second_seq, matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, max_number=1
    )[0]

    try:
        recalculated = align.score(optimal, matrix, gap_penalty, term)
        assert recalculated == optimal.score
    except AssertionError:
        # Show the offending alignment before failing
        print(optimal)
        raise
def test_simple_alignment(gap_penalty, seed, threshold,
                          direction, score_only):
    """
    Test `align_local_gapped()` by comparing the output to
    `align_optimal()`.
    This test uses a pair of highly similar short sequences.
    """
    # Cyclotide C, Uniprot: P86843
    seq1 = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    seq2 = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    ref_alignments = align.align_optimal(
        seq1, seq2, matrix, gap_penalty=gap_penalty, local=True
    )
    # If the alignment extends only in one direction, cut each
    # reference alignment at the seed and rescore the remainder
    for ref in ref_alignments:
        seed_row = np.flatnonzero(ref.trace[:, 0] == seed[0])[0]
        if direction == "upstream":
            ref.trace = ref.trace[:seed_row + 1]
        elif direction == "downstream":
            ref.trace = ref.trace[seed_row:]
        ref.score = align.score(ref, matrix, gap_penalty)

    test_result = align.align_local_gapped(
        seq1, seq2, matrix, seed, threshold, gap_penalty,
        1000, direction, score_only
    )

    if score_only:
        # All optimal alignments have the same score,
        # so comparing with the first one is sufficient
        assert test_result == ref_alignments[0].score
        return

    assert len(test_result) == len(ref_alignments)
    for found in test_result:
        assert found in ref_alignments
def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices):
    """
    Test `align_local_gapped()` by comparing the output to
    `align_optimal()`.
    This test uses a set of long sequences, which are pairwise compared.
    The threshold should be chosen sufficiently large so
    `align_local_gapped()` can return the optimal alignment(s).

    The seed is taken from the center of the first reference alignment,
    considering only columns without a gap.
    """
    MAX_NUMBER = 100
    # The linear gap penalty for longer gaps easily exceeds
    # a small threshold -> increase threshold for linear penalty
    THRESHOLD = 200 if isinstance(gap_penalty, int) else 50

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]

    ref_alignments = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER
    )
    # Select the center of the alignment as seed
    trace = ref_alignments[0].trace
    # Drop columns containing a gap (-1), the seed must be a position
    # that is present in both sequences
    trace = trace[(trace != -1).all(axis=1)]
    seed = trace[len(trace) // 2]

    test_result = align.align_local_gapped(
        seq1, seq2, matrix, seed, THRESHOLD, gap_penalty,
        MAX_NUMBER, "both", score_only
    )

    if score_only:
        test_score = test_result
        # All optimal alignments have the same score
        assert test_score == ref_alignments[0].score
    else:
        test_alignments = test_result
        # Index of the test alignment that is currently checked
        # It is initialized before the 'try' block, so the diagnostic
        # output in the exception handler also works, if an assertion
        # fails before the loop below is entered
        i = 0
        try:
            assert test_alignments[0].score == ref_alignments[0].score
            # Test if the score is also correctly calculated
            assert align.score(test_alignments[0], matrix, gap_penalty) \
                == ref_alignments[0].score
            if len(ref_alignments) < MAX_NUMBER \
               and len(test_alignments) < MAX_NUMBER:
                # Only test if the exact same alignments were created,
                # if the number of traces was not limited by MAX_NUMBER
                for i, alignment in enumerate(test_alignments):
                    try:
                        assert alignment in ref_alignments
                    except AssertionError:
                        # Edge case:
                        # In rare case the local alignment may be
                        # slightly longer on the upstream side for
                        # the tested function, since the
                        # upstream side is handled in an inverted
                        # manner
                        # However this does not affect the score
                        # Consequently, the exception is ignored
                        # if the alignment is longer than all
                        # reference alignments
                        if len(alignment) <= max(
                            len(ali) for ali in ref_alignments
                        ):
                            raise
        except AssertionError:
            print(f"Missing test alignment at index {i}:")
            print()
            print(test_alignments[i])
            print("\n")
            print("First reference alignment:")
            print()
            print(ref_alignments[0])
            raise
matrix=matrix, symbols_per_line=len(alignments[0])) fig.tight_layout() ######################################################################## # If you are interested in more advanced visualization examples, have a # look at the :doc:`example gallery <../examples/gallery/index>`. # # You can also do some simple analysis on these objects, like # determining the sequence identity or calculating the score. # For further custom analysis, it can be convenient to directly have the # aligned symbol codes instead of the trace. alignment = alignments[0] print("Score: ", alignment.score) print("Recalculated score:", align.score(alignment, matrix=matrix)) print("Sequence identity:", align.get_sequence_identity(alignment)) print("Symbols:") print(align.get_symbols(alignment)) print("symbols codes:") print(align.get_codes(alignment)) ######################################################################## # # .. currentmodule:: biotite.sequence.io.fasta # # You may ask, why should you recalculate the score, when the score has # already been directly calculated via :func:`align_optimal()`. # The answer is, that you might load an alignment from an external # alignment program as FASTA file using :func:`get_alignment()`. #
def test_random_alignment(seed, uint8_code):
    """
    Build two random sequences that each contain a conserved region,
    where the two conserved regions differ only by point mutations
    (no indels).
    The score of the alignment found by `align_local_ungapped()` is
    expected to be equal to the score found by `align_optimal()`.
    """
    MIN_SIZE = 200
    MAX_SIZE = 1000
    MIN_CONSERVED_SIZE = 20
    MAX_CONSERVED_SIZE = 100
    CONSERVED_ENDS = 5
    MUTATION_PROB = 0.1
    THRESHOLD = 100

    np.random.seed(seed)

    # First conserved region: random symbol codes of random length
    conserved1 = ProteinSequence()
    conserved_len = np.random.randint(
        MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1
    )
    # The last symbol code (stop symbol) is excluded
    # for aesthetic reasons -> -1
    conserved1.code = np.random.randint(
        len(conserved1.alphabet) - 1, size=conserved_len
    )

    # Second conserved region: a copy of the first one
    # with a few point mutations
    conserved2 = ProteinSequence()
    conserved2.code = conserved1.code.copy()
    mutation_mask = np.random.choice(
        [False, True],
        size=conserved_len,
        p=[1 - MUTATION_PROB, MUTATION_PROB]
    )
    n_mutations = np.count_nonzero(mutation_mask)
    conserved2.code[mutation_mask] = np.random.randint(
        len(conserved2.alphabet) - 1, size=n_mutations
    )
    # Both regions get identical termini, to ensure that the
    # alignment extends from start to end of the region
    conserved2.code[:CONSERVED_ENDS] = conserved1.code[:CONSERVED_ENDS]
    conserved2.code[-CONSERVED_ENDS:] = conserved1.code[-CONSERVED_ENDS:]

    # Randomized sequences that embed the conserved regions
    seq1 = ProteinSequence()
    seq2 = ProteinSequence()
    offsets = []
    for sequence, conserved in ((seq1, conserved1), (seq2, conserved2)):
        sequence_len = np.random.randint(MIN_SIZE, MAX_SIZE + 1)
        sequence.code = np.random.randint(
            len(sequence.alphabet) - 1, size=sequence_len
        )
        # The conserved region is put at a random position
        # within the sequence
        start = np.random.randint(0, len(sequence) - len(conserved))
        stop = start + len(conserved)
        sequence.code[start:stop] = conserved.code
        offsets.append(start)
    # The seed lies somewhere in the conserved region
    seed_position = np.array(offsets) + np.random.randint(len(conserved2))

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    ref_alignment = align.align_optimal(
        seq1, seq2, matrix, local=True, max_number=1,
        # A high gap penalty prevents the introduction of gaps,
        # as 'align_local_ungapped()' is not able to place gaps either
        gap_penalty=-1000
    )[0]
    ref_score = ref_alignment.score

    test_alignment = align.align_local_ungapped(
        seq1, seq2, matrix, seed_position, THRESHOLD
    )

    assert test_alignment.score == ref_score
    # The separately calculated score must give the same value
    assert align.score(test_alignment, matrix) == ref_score
# The scores are put into an array with the index being the # corresponding position of the HCN1 sequence. matrix = align.SubstitutionMatrix.std_protein_matrix() scores = np.zeros(len(hcn1)) for i in range(len(alignment)): # The column is also an alignment with length 1 column = alignment[i:i + 1] hcn1_index = column.trace[0, 0] if hcn1_index == -1: # Gap in HCN1 row # As similarity score should be analyzed in dependence of the # HCN1 sequence position, alignment columns with a gap in HCN1 # are ignored continue scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5) scores = moving_average(scores, 2 * ma_radius + 1) ######################################################################## # Now the hydropathy and the similarity score can be plotted. figure = plt.figure(figsize=(8.0, 4.0)) ax = figure.add_subplot(111) # Plot hydropathy ax.plot(np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), hydropathies, color=biotite.colors["dimorange"]) ax.axhline(0, color="gray", linewidth=0.5)