def test_align_optimal_symmetry(sequences, local, term, gap_penalty,
                                seq_indices):
    """
    The optimal alignment score must not depend on the order in which
    the two sequences are given to `align_optimal()`.
    """
    first_index, second_index = seq_indices
    seq_a = sequences[first_index]
    seq_b = sequences[second_index]
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Both calls share everything but the sequence order
    shared_kwargs = dict(
        gap_penalty=gap_penalty,
        terminal_penalty=term,
        local=local,
        max_number=1,
    )

    forward = align.align_optimal(
        seq_a, seq_b, substitution_matrix, **shared_kwargs
    )[0]
    swapped = align.align_optimal(
        seq_b, seq_a, substitution_matrix, **shared_kwargs
    )[0]

    # Comparing every trace of both runs would be infeasible,
    # hence only the scores are compared
    assert forward.score == swapped.score
def test_score_scaling(sequences):
    """
    Multiplying the substitution scores and the gap penalties by the
    same constant should leave the resulting E-values unchanged.
    This is checked by aligning real sequences once with the standard
    scoring scheme and once with the scaled one and comparing the
    log E-values computed for both sets of alignment scores.
    """
    SCALING_FACTOR = 1000
    GAP_PENALTY = (-12, -1)
    SEQ_LENGTH = 300

    def neighbor_scores(substitution_matrix, penalty):
        # Score of the first optimal local alignment
        # for each pair of neighboring sequences
        return [
            align.align_optimal(
                sequences[i], sequences[i + 1], substitution_matrix, penalty,
                local=True, max_number=1
            )[0].score
            for i in range(9)
        ]

    alphabet = seq.ProteinSequence.alphabet
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    np.random.seed(0)
    std_estimator = align.EValueEstimator.from_samples(
        alphabet, matrix, GAP_PENALTY, BACKGROUND
    )
    std_log_evalues = std_estimator.log_evalue(
        neighbor_scores(matrix, GAP_PENALTY), SEQ_LENGTH, SEQ_LENGTH
    )

    scaled_matrix = align.SubstitutionMatrix(
        alphabet, alphabet, matrix.score_matrix() * SCALING_FACTOR
    )
    scaled_gap_penalty = tuple(
        penalty * SCALING_FACTOR for penalty in GAP_PENALTY
    )
    scaled_estimator = align.EValueEstimator.from_samples(
        alphabet, scaled_matrix, scaled_gap_penalty, BACKGROUND
    )
    scaled_log_evalues = scaled_estimator.log_evalue(
        neighbor_scores(scaled_matrix, scaled_gap_penalty),
        SEQ_LENGTH, SEQ_LENGTH
    )

    # The sample size is rather small, so a large deviation is tolerated
    expected = pytest.approx(scaled_log_evalues.tolist(), rel=0.2)
    assert std_log_evalues.tolist() == expected
def test_simple_alignment(gap_penalty, local, band_width):
    """
    Check `align_banded()` against `align_optimal()` for a pair of
    short and highly similar sequences.
    """
    # Cyclotide C (Uniprot: P86843) and Cyclotide F (Uniprot: P86846)
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    blosum = align.SubstitutionMatrix.std_protein_matrix()

    optimal = align.align_optimal(
        cyclotide_c, cyclotide_f, blosum,
        gap_penalty=gap_penalty, local=local, terminal_penalty=False
    )
    # align_banded() returns true semi-global alignments,
    # so terminal gaps are stripped from the reference
    expected = [align.remove_terminal_gaps(ali) for ali in optimal]

    band = (-band_width, band_width)
    found = align.align_banded(
        cyclotide_c, cyclotide_f, blosum, band,
        gap_penalty=gap_penalty, local=local
    )

    assert len(found) == len(expected)
    for alignment in found:
        assert alignment in expected
def test_align_optimal_simple(local, term, gap_penalty,
                              input1, input2, expect):
    """
    Check `align_optimal()` with small constructed nucleotide test
    cases and verify that `score()` reproduces the stored scores.
    """
    first = seq.NucleotideSequence(input1)
    second = seq.NucleotideSequence(input2)
    nucleotide_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    results = align.align_optimal(
        first, second, nucleotide_matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, local=local
    )

    # Every returned alignment must be one of the expected ones
    for result in results:
        assert str(result) in expect

    # The stand-alone scoring function must agree with the score
    # attached to each alignment
    for result in results:
        recomputed = align.score(
            result, nucleotide_matrix,
            gap_penalty=gap_penalty, terminal_penalty=term
        )
        assert recomputed == result.score
def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
    """
    Test `align_optimal()` function using real world sequences,
    compared to the output of MUSCLE.

    The alignment from `align_optimal()` must score at least as high
    as the MUSCLE alignment when both are scored with `score()` using
    the same matrix and gap penalty.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]
    alignment = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
    )[0]
    ref_alignment = muscle.MuscleApp.align(
        [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty
    )
    # Check whether the score of the optimal alignments is the same
    # or higher as the MUSCLE alignment
    # Direct alignment comparison is not feasible,
    # since the treatment of terminal gaps is different in MUSCLE
    score = align.score(alignment, matrix, gap_penalty, terminal_penalty=True)
    ref_score = align.score(
        ref_alignment, matrix, gap_penalty, terminal_penalty=True
    )
    try:
        assert score >= ref_score
    except AssertionError:
        # Show both alignments to make the failure diagnosable
        print("Alignment:")
        print()
        print(alignment)
        print("\n")
        print("Reference alignment:")
        print()
        # Fixed: previously the tested alignment was printed a second
        # time here instead of the MUSCLE reference
        print(ref_alignment)
        raise
def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
    """
    Generate an alignment between two sequences

    Both inputs are converted to protein sequences and aligned with
    the standard protein substitution matrix.

    Parameters
    ----------
    seq1: str
        The first sequence to be aligned
    seq2: str
        The second sequence to be aligned
    local: bool
        If false, a global alignment is performed
        (based on the Needleman-Wunsch algorithm),
        otherwise a local alignment is performed
        (based on the Smith–Waterman algorithm).
        (Default: True)

    Returns
    -------
    Alignment
        An instance of `cls` holding the first alignment returned by
        `align_optimal()` and a metadata dictionary with the keys
        ``"score"``, ``"sequence_identity"``, ``"symbols"`` and
        ``"codes"``.
    """
    import biotite.sequence as seq
    import biotite.sequence.align as align

    # create the default matrix
    # TODO add more options for the choice of matrix
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    alignments = align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        matrix,
        local=local,
    )

    # Only the first of the returned alignments is used
    alignment = alignments[0]

    return cls(
        alignment=alignment,
        metadata={
            "score": alignment.score,
            "sequence_identity": align.get_sequence_identity(alignment),
            "symbols": align.get_symbols(alignment),
            "codes": align.get_codes(alignment),
        },
    )
def test_affine_gap_penalty(local, term, gap_penalty, seed):
    """
    Expect the same alignment results for a linear gap penalty and an
    affine gap penalty with the same gap open and extension penalty.

    Two random nucleotide sequences are generated from `seed` and
    aligned once with the linear penalty `gap_penalty` and once with
    the affine penalty `(gap_penalty, gap_penalty)`.
    """
    LENGTH_RANGE = (10, 100)
    MAX_NUMBER = 1000

    np.random.seed(seed)
    sequences = []
    for _ in range(2):
        sequence = seq.NucleotideSequence()
        length = np.random.randint(*LENGTH_RANGE)
        sequence.code = np.random.randint(len(sequence.alphabet), size=length)
        sequences.append(sequence)

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    ref_alignments = align.align_optimal(
        *sequences, matrix, gap_penalty, term, local, MAX_NUMBER
    )

    test_alignments = align.align_optimal(
        *sequences, matrix, (gap_penalty, gap_penalty), term, local,
        MAX_NUMBER
    )

    assert test_alignments[0].score == ref_alignments[0].score
    assert len(test_alignments) == len(ref_alignments)
    # We can only expect to get the same alignments in the test and
    # reference, if we get all optimal alignments
    if len(test_alignments) < MAX_NUMBER:
        for alignment in test_alignments:
            try:
                assert alignment in ref_alignments
            except AssertionError:
                # Only assertion failures get the diagnostic output;
                # any other exception propagates untouched
                print("Test alignment:")
                print(alignment)
                print()
                print("First reference alignment")
                print(ref_alignments[0])
                raise
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Check `align_banded()` against `align_optimal()` for pairs of long
    sequences.
    The band is chosen wide enough that `align_banded()` is expected
    to be able to find the optimal alignment(s).
    """
    MAX_NUMBER = 100

    first_index, second_index = seq_indices
    seq_a = sequences[first_index]
    seq_b = sequences[second_index]
    blosum = align.SubstitutionMatrix.std_protein_matrix()

    optimal = align.align_optimal(
        seq_a, seq_b, blosum,
        gap_penalty=gap_penalty, local=local, terminal_penalty=False,
        max_number=MAX_NUMBER
    )
    # align_banded() returns true semi-global alignments,
    # so terminal gaps are stripped from the reference
    expected = [align.remove_terminal_gaps(ali) for ali in optimal]

    # Similar sequences get a relatively narrow band,
    # otherwise the complete search space is covered
    if align.get_sequence_identity(expected[0]) > 0.5:
        band_width = 100
    else:
        band_width = len(seq_a) + len(seq_b)

    found = align.align_banded(
        seq_a, seq_b, blosum, (-band_width, band_width),
        gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER
    )

    try:
        assert found[0].score == expected[0].score
        # The exact alignments are only comparable,
        # if the number of traces was not capped by MAX_NUMBER
        if len(expected) < MAX_NUMBER:
            assert len(found) == len(expected)
            for alignment in found:
                assert alignment in expected
    except AssertionError:
        print("First tested alignment:")
        print()
        print(found[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(expected[0])
        raise
def test_to_consensus_prot():
    """
    Align two protein sequences, build a sequence profile from the
    first alignment and compare its consensus to a fixed expected
    sequence.
    """
    # Avidin protein sequence
    avidin = seq.ProteinSequence(
        "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
        "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
    )
    # Streptavidin protein sequence
    streptavidin = seq.ProteinSequence(
        "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
        "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
        "GNPLDAVQQ"
    )
    blosum = align.SubstitutionMatrix.std_protein_matrix()
    first_alignment = align.align_optimal(avidin, streptavidin, blosum)[0]

    profile = seq.SequenceProfile.from_alignment(first_alignment)

    expected_consensus = seq.ProteinSequence(
        "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
        "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
        "INIFNPLDAQKE"
    )
    assert expected_consensus == profile.to_consensus()
def test_scoring(sequences, gap_penalty, term, seq_indices):
    """
    The score computed by `score()` must equal the score stored in the
    alignment returned by `align_optimal()`.
    """
    first_index, second_index = seq_indices
    blosum = align.SubstitutionMatrix.std_protein_matrix()
    result = align.align_optimal(
        sequences[first_index], sequences[second_index], blosum,
        gap_penalty=gap_penalty, terminal_penalty=term, max_number=1
    )[0]

    try:
        recomputed = align.score(result, blosum, gap_penalty, term)
        assert recomputed == result.score
    except AssertionError:
        # Show the offending alignment before failing
        print(result)
        raise
def test_evalue():
    """
    Compare estimated E-values with empirical counts: for each test
    score the E-value should approximately equal the number of
    alignments of random sequence pairs that reach at least this score.
    Low scores, which give rather high E-values, are needed for a
    reasonable accuracy.
    """
    TEST_SCORES = [30, 40, 50]
    GAP_PENALTY = (-12, -1)
    N_SAMPLES = 10000
    SEQ_LENGTH = 300

    alphabet = seq.ProteinSequence.alphabet
    blosum = align.SubstitutionMatrix.std_protein_matrix()
    estimator = align.EValueEstimator.from_samples(
        alphabet, blosum, GAP_PENALTY, BACKGROUND
    )

    # Draw the symbol codes for all random sequence pairs at once
    np.random.seed(0)
    random_codes = np.random.choice(
        len(alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND
    )

    # Align each random pair and keep the score of the first alignment
    sample_scores = np.zeros(N_SAMPLES, dtype=int)
    for i, (code1, code2) in enumerate(random_codes):
        seq1 = seq.ProteinSequence()
        seq2 = seq.ProteinSequence()
        seq1.code = code1
        seq2.code = code2
        sample_scores[i] = align.align_optimal(
            seq1, seq2, blosum,
            local=True, gap_penalty=GAP_PENALTY, max_number=1
        )[0].score

    total_length = SEQ_LENGTH * N_SAMPLES
    e_values = []
    counts = []
    for score in TEST_SCORES:
        e_values.append(
            10**estimator.log_evalue(score, SEQ_LENGTH, total_length)
        )
        counts.append(np.count_nonzero(sample_scores >= score))

    assert e_values == pytest.approx(counts, rel=0.5)
def test_search_sequence():
    """
    Search the RCSB for sequences similar to the first sequence of
    the ``1l2y`` test structure and check that every hit reaches the
    requested minimum sequence identity to the query.
    """
    IDENTITY_CUTOFF = 0.9

    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(
        ref_sequence, "protein", min_identity=IDENTITY_CUTOFF
    )
    test_ids = rcsb.search(query)

    # The matrix does not depend on the hit, so it is created only once
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    # 'pdb_id' instead of 'id' to avoid shadowing the builtin
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(
            ref_sequence, test_sequence, matrix, terminal_penalty=False
        )[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
def test_simple_alignment(gap_penalty, seed, threshold,
                          direction, score_only):
    """
    Check `align_local_gapped()` against `align_optimal()` for a pair
    of short and highly similar sequences.
    """
    # Cyclotide C (Uniprot: P86843) and Cyclotide F (Uniprot: P86846)
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    blosum = align.SubstitutionMatrix.std_protein_matrix()

    expected = align.align_optimal(
        cyclotide_c, cyclotide_f, blosum,
        gap_penalty=gap_penalty, local=True
    )
    # If the tested alignment extends into one direction only,
    # each reference alignment is cut at the seed accordingly
    # and its score is recomputed
    for reference in expected:
        first_seq_trace = reference.trace[:, 0]
        seed_index = np.where(first_seq_trace == seed[0])[0][0]
        if direction == "upstream":
            reference.trace = reference.trace[:seed_index + 1]
        elif direction == "downstream":
            reference.trace = reference.trace[seed_index:]
        reference.score = align.score(reference, blosum, gap_penalty)

    test_result = align.align_local_gapped(
        cyclotide_c, cyclotide_f, blosum, seed, threshold, gap_penalty,
        1000, direction, score_only
    )

    if score_only:
        # Every optimal alignment has the same score,
        # so comparing with the first reference is sufficient
        assert test_result == expected[0].score
        return

    assert len(test_result) == len(expected)
    for alignment in test_result:
        assert alignment in expected
def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool = False) -> str:
    """
    Perform an optimal pairwise alignment of two protein sequences,
    either global (Needleman-Wunsch) or local (Smith-Waterman).

    Parameters
    ----------
    seq1, seq2: str
        The sequences to be aligned
    matrix: str
        Selects the substitution matrix used for scoring.
        The value is passed to ``matrices()`` to obtain the actual
        matrix (presumably a matrix name; verify against
        ``matrices()``).
    gap: int or tuple(int, int)
        If an integer is provided, the value is interpreted as
        general gap penalty.
        If a tuple is provided, an affine gap penalty is used:
        the first integer in the tuple is the gap opening penalty,
        the second integer is the gap extension penalty.
        The values need to be negative.
    local : bool, optional, default=False
        Whether to use local alignment (Smith-Waterman)
        or global (Needleman-Wunsch)

    Returns
    -------
    An optimal alignment of the two sequences, i.e. the first element
    of the list returned by ``align_optimal()``.
    NOTE(review): the element is returned as-is without conversion to
    ``str``, so the ``-> str`` annotation looks inaccurate -- confirm
    and adjust the annotation.
    """
    substitution_matrix = matrices(matrix)
    # align_optimal() returns a list of alignments,
    # only the first one is handed back to the caller
    alignments = seq_align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        substitution_matrix,
        gap_penalty=gap,
        local=local,
    )
    return alignments[0]
def test_random_alignment(seed, uint8_code):
    """
    Build two random sequences that each contain a conserved region,
    where the two regions differ only by point mutations (no indels).
    The score of the alignment found by `align_local_ungapped()` is
    expected to equal the score found by `align_optimal()`.
    """
    MIN_SIZE = 200
    MAX_SIZE = 1000
    MIN_CONSERVED_SIZE = 20
    MAX_CONSERVED_SIZE = 100
    CONSERVED_ENDS = 5
    MUTATION_PROB = 0.1
    THRESHOLD = 100

    np.random.seed(seed)

    # First conserved region: random symbols of random length
    region1 = ProteinSequence()
    region_length = np.random.randint(
        MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1
    )
    # The '- 1' leaves out the stop symbol for aesthetic reasons
    region1.code = np.random.randint(
        len(region1.alphabet) - 1, size=region_length
    )

    # Second conserved region: a copy of the first one
    # with a few point mutations
    region2 = ProteinSequence()
    region2.code = region1.code.copy()
    is_mutated = np.random.choice(
        [False, True], size=region_length,
        p=[1 - MUTATION_PROB, MUTATION_PROB]
    )
    region2.code[is_mutated] = np.random.randint(
        len(region2.alphabet) - 1, size=np.count_nonzero(is_mutated)
    )
    # Identical termini ensure that the alignment
    # spans the region from start to end
    region2.code[:CONSERVED_ENDS] = region1.code[:CONSERVED_ENDS]
    region2.code[-CONSERVED_ENDS:] = region1.code[-CONSERVED_ENDS:]

    # Random sequences, each with its conserved region
    # inserted at a random position
    seq1 = ProteinSequence()
    seq2 = ProteinSequence()
    region_starts = []
    for sequence, region in ((seq1, region1), (seq2, region2)):
        sequence_length = np.random.randint(MIN_SIZE, MAX_SIZE + 1)
        sequence.code = np.random.randint(
            len(sequence.alphabet) - 1, size=sequence_length
        )
        start = np.random.randint(0, len(sequence) - len(region))
        sequence.code[start : start + len(region)] = region.code
        region_starts.append(start)
    # The alignment seed lies somewhere in the conserved region
    alignment_seed = np.array(region_starts) \
        + np.random.randint(len(region2))

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    reference = align.align_optimal(
        seq1, seq2, matrix, local=True, max_number=1,
        # A very high gap penalty keeps gaps out of the reference,
        # as 'align_local_ungapped()' cannot place gaps either
        gap_penalty=-1000
    )[0]
    ref_score = reference.score

    test_alignment = align.align_local_ungapped(
        seq1, seq2, matrix, alignment_seed, THRESHOLD
    )

    assert test_alignment.score == ref_score
    # The separately computed score must match as well
    assert align.score(test_alignment, matrix) == ref_score
["CAC34569", "ACL82594"], None, "protein", "fasta" )) for name, sequence in fasta_file.items(): if "CAC34569" in name: query_seq = seq.ProteinSequence(sequence) elif "ACL82594" in name: hit_seq = seq.ProteinSequence(sequence) # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignment = align.align_optimal( query_seq, hit_seq, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] print(f"Score: {alignment.score}") fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"], show_numbers=True, show_line_position=True ) fig.tight_layout() ######################################################################## # How can you make sure that you observe a true homology and not simply
# Fetch both protein sequences into a single FASTA file
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile.read(file_name)
# Assign each sequence based on the accession contained in its header
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(ax, alignments[0], matrix=matrix,
                                         labels=["Avidin", "Streptavidin"],
                                         show_numbers=True,
                                         show_line_position=True)
fig.tight_layout()
plt.show()
# And now create a matrix by directly provding the ndarray # containing the similarity scores # (identity matrix in our case) scores = np.identity(len(alph), dtype=int) matrix = align.SubstitutionMatrix(alph, alph, scores) print("\n\nIdentity matrix\n") print(matrix) ######################################################################## # For our protein alignment we will use the standard *BLOSUM62* matrix. seq1 = seq.ProteinSequence("BIQTITE") seq2 = seq.ProteinSequence("IQLITE") matrix = align.SubstitutionMatrix.std_protein_matrix() print("\nLocal alignment") alignments = align.align_optimal(seq1, seq2, matrix, local=True) for ali in alignments: print(ali) print("Global alignment") alignments = align.align_optimal(seq1, seq2, matrix, local=False) for ali in alignments: print(ali) ######################################################################## # The alignment functions return a list of :class:`Alignment` objects. # This object saves the input sequences together with a so called trace # - the indices to symbols in these sequences that are aligned to each # other (*-1* for a gap). # Additionally the alignment score is stored in this object. # Furthermore, this object can prettyprint the alignment into a human # readable form.
def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices):
    """
    Test `align_local_gapped()` by comparing the output to
    `align_optimal()`.
    This test uses a set of long sequences, which are pairwise compared.
    The threshold should be chosen sufficiently large so
    `align_local_gapped()` can return the optimal alignment(s).
    """
    MAX_NUMBER = 100
    # The linear gap penalty for longer gaps easily exceeds
    # a small threshold -> increase threshold for linear penalty
    THRESHOLD = 200 if isinstance(gap_penalty, int) else 50

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]

    ref_alignments = align.align_optimal(
        seq1, seq2, matrix,
        gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER
    )
    # Select the center of the alignment as seed
    # (only positions without a gap in either sequence are considered)
    trace = ref_alignments[0].trace
    trace = trace[(trace != -1).all(axis=1)]
    seed = trace[len(trace) // 2]

    test_result = align.align_local_gapped(
        seq1, seq2, matrix, seed, THRESHOLD, gap_penalty,
        MAX_NUMBER, "both", score_only
    )

    if score_only:
        test_score = test_result
        # All optimal alignments have the same score
        assert test_score == ref_alignments[0].score
    else:
        test_alignments = test_result
        # Index of the alignment reported on failure.
        # It is initialized here, because the score assertions below
        # may fail before the loop binds it; otherwise the handler
        # would raise a NameError that hides the actual failure.
        i = 0
        try:
            assert test_alignments[0].score == ref_alignments[0].score
            # Test if the score is also correctly calculated
            assert align.score(test_alignments[0], matrix, gap_penalty) \
                == ref_alignments[0].score
            if len(ref_alignments) < MAX_NUMBER \
               and len(test_alignments) < MAX_NUMBER:
                # Only test if the exact same alignments were created,
                # if the number of traces was not limited by MAX_NUMBER
                for i, alignment in enumerate(test_alignments):
                    try:
                        assert alignment in ref_alignments
                    except AssertionError:
                        # Edge case:
                        # In rare cases the tested local alignment may
                        # be slightly longer on the upstream side,
                        # since the upstream side is handled in an
                        # inverted manner
                        # However, this does not affect the score
                        # Consequently, the exception is ignored
                        # if the alignment is longer than all
                        # reference alignments
                        max_ref_length = max(
                            len(ali) for ali in ref_alignments
                        )
                        if len(alignment) <= max_ref_length:
                            raise
        except AssertionError:
            print(f"Missing test alignment at index {i}:")
            print()
            print(test_alignments[i])
            print("\n")
            print("First reference alignment:")
            print()
            print(ref_alignments[0])
            raise
"FP": ( 788, 806), # Transmembrane domain "TM": (1214, 1234), # Cytoplasmatic tail "CT": (1269, 1273), } # Get RNA sequence coding for spike protein from the reference genome for feature in annot_seq.annotation: if feature.qual["gene"] == "S": orig_spike_seq = annot_seq[feature] # Align spike protein sequence to variant genome to get the B.1.1.7 # spike protein sequence alignment = align.align_optimal( var_genome, orig_spike_seq, matrix, local=True, max_number=1 )[0] var_spike_seq = var_genome[alignment.trace[alignment.trace[:,0] != -1, 0]] # Obtain protein sequences from RNA sequences orig_spike_prot_seq = orig_spike_seq.translate(complete=True).remove_stops() var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops() # Align both protein sequences with each other for later comparison blosum_matrix = align.SubstitutionMatrix.std_protein_matrix() alignment = align.align_optimal( var_spike_prot_seq, orig_spike_prot_seq, blosum_matrix, max_number=1 )[0] fig = plt.figure(figsize=(8.0, 10.0))
leul_feature = feature # Get leuL sequence leul_seq = annot_seq[leul_feature] # Download Salmonella enterica genome without annotations file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name) se_genome = fasta.get_sequence(fasta_file) # Find leuL in genome by local alignment matrix = align.SubstitutionMatrix.std_nucleotide_matrix() # Use general gap penalty to save RAM alignments = align.align_optimal(leul_seq, se_genome, matrix, gap_penalty=-7, local=True) # Do the same for reverse complement genome se_genome_rev = se_genome.reverse().complement() rev_alignments = align.align_optimal(leul_seq, se_genome_rev, matrix, gap_penalty=-7, local=True) ######################################################################## # Now that we have both alignments (forward and reverse strand), # we can can check which of them has a higher score. # We simply take the score of the first alignment in each list. # Due to the nature of the dynamic programming algorithm, every
# Print every header/sequence pair of the first file;
# the sequence of the last entry is kept as nucleotide sequence
for h, s in c_file.items():
    print(h)
    print(s)
    covid_seq = seq.NucleotideSequence(s)
# Same for the second file
for h, s in m_file.items():
    print(h)
    print(s)
    mers_seq = seq.NucleotideSequence(s)
# Restrict both sequences to their first 100 symbols
mini_covid_seq = covid_seq[0:100]
mini_mers_seq = mers_seq[0:100]
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(mini_covid_seq, mini_mers_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw the first alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(ax, alignments[0], matrix=matrix,
                                         labels=["SARS_Covid", "MERS"],
                                         show_numbers=True,
                                         show_line_position=True)
fig.tight_layout()