def test_swapping(gap_penalty, local, seed):
    """
    Verify that `align_banded()` yields the 'swapped' alignment when
    the two input sequences are passed in reversed order.

    Parameters
    ----------
    gap_penalty
        Gap penalty forwarded unchanged to `align_banded()`.
    local
        Forwarded unchanged as the ``local`` argument.
    seed
        Seed for the random band limits; also handed to
        `_create_random_pair()` to obtain the sequence pair.
    """
    np.random.seed(seed)
    # Draw the lower limit first, then the upper limit, so the random
    # stream is consumed in a fixed order
    lower_diag = np.random.randint(-30, -10)
    upper_diag = np.random.randint(10, 30)
    band = (lower_diag, upper_diag)

    seq1, seq2 = _create_random_pair(seed)
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # NOTE(review): the identical band is used for both argument orders
    # although it is not symmetric around 0 - presumably the optimal
    # alignment stays close to the main diagonal; confirm
    shared_options = dict(band=band, local=local, gap_penalty=gap_penalty)
    forward_alignments = align.align_banded(
        seq1, seq2, matrix, **shared_options
    )
    swapped_alignments = align.align_banded(
        seq2, seq1, matrix, **shared_options
    )

    has_unique_solutions = (
        len(forward_alignments) == 1 and len(swapped_alignments) == 1
    )
    if not has_unique_solutions:
        # With several optimal alignments it is not easy to assign a
        # swapped alignment to its original counterpart, so such a
        # case is simply not checked.
        # The number of tested seeds should be large enough to still
        # generate a reasonable number of suitable test cases
        return

    forward = forward_alignments[0]
    swapped = swapped_alignments[0]

    # The sequences must appear in exchanged order ...
    assert swapped.sequences[0] == forward.sequences[1]
    assert swapped.sequences[1] == forward.sequences[0]
    # ... and so must the columns of the trace
    assert np.array_equal(swapped.trace, forward.trace[:, ::-1])
def test_simple_alignment(gap_penalty, local, band_width):
    """
    Compare the result of `align_banded()` with the one of
    `align_optimal()` for a pair of short, highly similar sequences.

    Parameters
    ----------
    gap_penalty
        Gap penalty forwarded to both alignment functions.
    local
        Forwarded to both alignment functions as ``local``.
    band_width
        The band spans the diagonals from ``-band_width`` to
        ``band_width``.
    """
    # Cyclotide C, Uniprot: P86843
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    optimal_alignments = align.align_optimal(
        cyclotide_c,
        cyclotide_f,
        matrix,
        gap_penalty=gap_penalty,
        local=local,
        terminal_penalty=False,
    )
    # Strip the terminal gaps from the reference to obtain a true
    # semi-global alignment, as returned by align_banded()
    expected = [
        align.remove_terminal_gaps(optimal) for optimal in optimal_alignments
    ]

    band = (-band_width, band_width)
    banded_alignments = align.align_banded(
        cyclotide_c,
        cyclotide_f,
        matrix,
        band,
        gap_penalty=gap_penalty,
        local=local,
    )

    assert len(banded_alignments) == len(expected)
    for banded in banded_alignments:
        assert banded in expected
def map_sequence(read, diag):
    """
    Align a read to ``orig_genome`` within a band around a diagonal.

    The band reaches ``int(3 * sqrt(len(read) * P_INDEL))`` diagonals
    below and above `diag` (presumably about three standard deviations
    of the expected indel count - confirm against the definition of
    ``P_INDEL``).

    Parameters
    ----------
    read
        The sequence to be mapped; its length determines the band
        extent.
    diag
        The central diagonal of the band, or ``None``.

    Returns
    -------
    The first alignment returned by `align_banded()`, or ``None`` if
    `diag` is ``None``.
    """
    # Evaluated unconditionally, so `read` must support `len()` even
    # when `diag` is None
    max_shift = int(3 * np.sqrt(len(read) * P_INDEL))
    if diag is None:
        return None

    band = (diag - max_shift, diag + max_shift)
    # Only a single alignment is requested, so take that one
    mapped = align.align_banded(
        read, orig_genome, matrix, gap_penalty=-10, band=band, max_number=1
    )
    return mapped[0]
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Compare the result of `align_banded()` with the one of
    `align_optimal()` for pairs taken from a set of long sequences.

    The band is chosen sufficiently large, so that `align_banded()`
    is able to return the optimal alignment(s).

    Parameters
    ----------
    sequences
        Indexable collection of the sequences to choose from.
    gap_penalty
        Gap penalty forwarded to both alignment functions.
    local
        Forwarded to both alignment functions as ``local``.
    seq_indices
        Pair of indices into `sequences` selecting the two sequences
        to be aligned.
    """
    MAX_NUMBER = 100

    matrix = align.SubstitutionMatrix.std_protein_matrix()
    first_index, second_index = seq_indices
    seq1, seq2 = sequences[first_index], sequences[second_index]

    # Terminal gaps are removed from the reference to obtain a true
    # semi-global alignment, as returned by align_banded()
    ref_alignments = [
        align.remove_terminal_gaps(optimal)
        for optimal in align.align_optimal(
            seq1,
            seq2,
            matrix,
            gap_penalty=gap_penalty,
            local=local,
            terminal_penalty=False,
            max_number=MAX_NUMBER,
        )
    ]

    # Similar sequences get a relatively small band width,
    # otherwise the entire search space is used
    if align.get_sequence_identity(ref_alignments[0]) > 0.5:
        band_width = 100
    else:
        band_width = len(seq1) + len(seq2)

    test_alignments = align.align_banded(
        seq1,
        seq2,
        matrix,
        (-band_width, band_width),
        gap_penalty=gap_penalty,
        local=local,
        max_number=MAX_NUMBER,
    )

    try:
        assert test_alignments[0].score == ref_alignments[0].score
        # The exact alignments are only comparable if the number of
        # traces was not limited by MAX_NUMBER
        number_was_limited = len(ref_alignments) >= MAX_NUMBER
        if not number_was_limited:
            assert len(test_alignments) == len(ref_alignments)
            for banded in test_alignments:
                assert banded in ref_alignments
    except AssertionError:
        # Print the first alignment of each set before the failure is
        # propagated
        print("First tested alignment:")
        print()
        print(test_alignments[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(ref_alignments[0])
        raise
def test_large_sequence_mapping(length, excerpt_length, seed):
    """
    Check that an excerpt of a very large sequence is aligned to that
    sequence exactly at the position the excerpt was taken from.

    Parameters
    ----------
    length
        Number of symbols in the randomly generated sequence.
    excerpt_length
        Number of symbols in the excerpt cut out of that sequence.
    seed
        Seed for the random number generator.
    """
    BAND_WIDTH = 100

    np.random.seed(seed)

    # Fill an empty nucleotide sequence with random symbol codes
    sequence = seq.NucleotideSequence()
    sequence.code = np.random.randint(len(sequence.alphabet), size=length)

    excerpt_pos = np.random.randint(len(sequence) - excerpt_length)
    excerpt_end = excerpt_pos + excerpt_length
    excerpt = sequence[excerpt_pos:excerpt_end]

    # The band center is shifted randomly, but by less than the band
    # extent, so the true diagonal stays inside the band
    diagonal = np.random.randint(
        excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH
    )
    band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH)
    print(band)
    print(len(sequence), len(excerpt))

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
    test_alignments = align.align_banded(excerpt, sequence, matrix, band=band)

    # The excerpt should be uniquely mappable to a single location on
    # the long sequence
    assert len(test_alignments) == 1

    # Expected trace: excerpt position i pairs with sequence position
    # excerpt_pos + i, without any gap
    excerpt_indices = np.arange(len(excerpt))
    ref_trace = np.stack(
        [excerpt_indices, excerpt_indices + excerpt_pos], axis=1
    )
    assert np.array_equal(test_alignments[0].trace, ref_trace)
# Now we would like to have a closer look on the difference between the # original and the B.1.1.7 genome. # # Mutations in the B.1.1.7 variant # -------------------------------- # # To get an rough overview about the overall sequence identity between # the genomes and the locations of mutations in the B.1.1.7 variant, # we need to align the original genome to our assembled one. # As both genomes are expected to be highly similar, we can use a banded # alignment again using a very conservative band width. BAND_WIDTH = 1000 genome_alignment = align.align_banded( var_genome, orig_genome, matrix, band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1 )[0] identity = align.get_sequence_identity(genome_alignment, 'all') print(f"Sequence identity: {identity * 100:.2f} %") ######################################################################## # Now we would like to have a closer look at the mutation locations. # To contextualize the locations we plot the mutation frequency along # with the gene locations. # The genomic coordinates for each gene can be extracted from the # already downloaded *GenBank* file of the reference genome. N_BINS = 50 # Get genomic coordinates for all SARS-Cov-2 genes gb_file = gb.GenBankFile.read(orig_genome_file)
# aligned to each other, if :math:`D_L \leq j - i \leq D_U`. # # In our case we center the diagonal band to the diagonal of the match # and use a fixed band width :math:`W = D_U - D_L`. BAND_WIDTH = 4 matrix = SubstitutionMatrix.std_protein_matrix() alignments = [] for query_pos, ref_pos in matches: diagonal = ref_pos - query_pos alignment = align.align_banded( query, reference, matrix, gap_penalty=-5, max_number=1, # Center the band at the match diagonal and extend the band by # one half of the band width in each direction band=(diagonal - BAND_WIDTH // 2, diagonal + BAND_WIDTH // 2))[0] alignments.append(alignment) for alignment in alignments: print(alignment) print("\n") ######################################################################## # 4. Significance evaluation # """""""""""""""""""""""""" # We have obtained two alignments, but which one of them is the # 'correct' one?
# The diagonal of this match can be seen in the figure:
# It is the almost continuous line on the right side.
#
# For the gapped alignment we use :func:`align_banded()`, which reduces
# the alignment search space to a narrow diagonal band.

BAND_WIDTH = 1000

alignments = []
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
for strand, m1_pos, genome_pos in trigger_matches:
    genome = genomic_seqs[strand]
    # The band is centered on the diagonal of the match and extends by
    # BAND_WIDTH in each direction
    diagonal = genome_pos - m1_pos
    alignment = align.align_banded(
        m1_sequence, genome, matrix,
        band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH),
        max_number=1
    )[0]
    # Remember the strand each alignment belongs to
    alignments.append((strand, alignment))
# Choose the (strand, alignment) pair with the highest alignment score.
# The key function must read the score from the pair it is given:
# Referring to the loop variable `alignment` instead would yield the
# same key for every pair, so always the first pair would be selected
strand, best_alignment = max(
    alignments,
    key=lambda strand_alignment: strand_alignment[1].score
)

########################################################################
# For visualization purposes we have to apply a renumbering function
# for the genomic sequence,
# since the indices in the alignment trace refer to the reverse
# complement sequence, but we want the numbers to refer to the original
# genomic sequence.

# Reverse sequence numbering for second sequence (genome) in alignment