Ejemplo n.º 1
0
def test_swapping(gap_penalty, local, seed):
    """
    Verify that `align_banded()` yields the mirrored alignment when the
    two input sequences are passed in swapped order.

    A random band is drawn from the seeded generator and both sequence
    orders are aligned with it.
    If either call returns more than one (or no) alignment, the test
    returns without asserting anything.
    """
    np.random.seed(seed)
    # Draw the lower limit before the upper limit, so the seeded random
    # stream is consumed in a fixed order
    lower_diagonal = np.random.randint(-30, -10)
    upper_diagonal = np.random.randint(10, 30)
    band = (lower_diagonal, upper_diagonal)

    seq1, seq2 = _create_random_pair(seed)
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    # NOTE(review): the identical band is used for both sequence orders,
    # although a diagonal appears to be defined relative to the order of
    # the sequences - confirm that the band does not need to be mirrored
    shared_options = dict(band=band, local=local, gap_penalty=gap_penalty)
    ref_alignments = align.align_banded(seq1, seq2, matrix, **shared_options)
    test_alignments = align.align_banded(seq2, seq1, matrix, **shared_options)

    # With multiple optimal alignments a swapped alignment cannot be
    # assigned easily to its original counterpart.
    # Such cases are skipped: the number of tested seeds should be large
    # enough to leave a reasonable number of suitable test cases
    if not (len(ref_alignments) == 1 and len(test_alignments) == 1):
        return

    ref_alignment, test_alignment = ref_alignments[0], test_alignments[0]

    # The sequences must appear in swapped order...
    assert test_alignment.sequences[0] == ref_alignment.sequences[1]
    assert test_alignment.sequences[1] == ref_alignment.sequences[0]
    # ...and the trace columns must be swapped accordingly
    assert np.array_equal(test_alignment.trace, ref_alignment.trace[:, ::-1])
Ejemplo n.º 2
0
def test_simple_alignment(gap_penalty, local, band_width):
    """
    Compare the alignments from `align_banded()` with those from
    `align_optimal()` for a pair of short and highly similar sequences.

    The band is symmetric and spans ``-band_width`` to ``band_width``.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Cyclotide C, Uniprot: P86843
    seq1 = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    seq2 = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")

    optimal_alignments = align.align_optimal(
        seq1,
        seq2,
        matrix,
        gap_penalty=gap_penalty,
        local=local,
        terminal_penalty=False,
    )
    # align_banded() returns true semi-global alignments,
    # so the terminal gaps are stripped from the reference
    ref_alignments = [
        align.remove_terminal_gaps(alignment)
        for alignment in optimal_alignments
    ]

    test_alignments = align.align_banded(
        seq1,
        seq2,
        matrix,
        (-band_width, band_width),
        gap_penalty=gap_penalty,
        local=local,
    )

    # Same number of alignments and each one is found in the reference
    assert len(test_alignments) == len(ref_alignments)
    for test_alignment in test_alignments:
        assert test_alignment in ref_alignments
Ejemplo n.º 3
0
def map_sequence(read, diag):
    """
    Align `read` to `orig_genome` within a band around diagonal `diag`.

    The band extends `deviation` diagonals to either side of `diag`,
    where `deviation` is ``int(3 * sqrt(len(read) * P_INDEL))``.

    Returns ``None`` if `diag` is ``None``, otherwise the first
    alignment returned by `align_banded()`.
    """
    # NOTE(review): P_INDEL is presumably the per-position indel
    # probability - confirm at its definition
    deviation = int(3 * np.sqrt(len(read) * P_INDEL))
    if diag is None:
        # No diagonal available for this read -> nothing to align
        return None

    band = (diag - deviation, diag + deviation)
    alignments = align.align_banded(
        read, orig_genome, matrix, gap_penalty=-10, band=band, max_number=1
    )
    return alignments[0]
Ejemplo n.º 4
0
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
    """
    Compare the alignments from `align_banded()` with those from
    `align_optimal()` for a pair taken from a set of long sequences.

    `seq_indices` selects the two sequences from `sequences`.
    The band is chosen large enough that `align_banded()` should be
    able to find the optimal alignment(s).
    """
    MAX_NUMBER = 100

    first_index, second_index = seq_indices
    seq1, seq2 = sequences[first_index], sequences[second_index]
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    optimal_alignments = align.align_optimal(
        seq1,
        seq2,
        matrix,
        gap_penalty=gap_penalty,
        local=local,
        terminal_penalty=False,
        max_number=MAX_NUMBER,
    )
    # align_banded() returns true semi-global alignments,
    # so the terminal gaps are stripped from the reference
    ref_alignments = [
        align.remove_terminal_gaps(alignment)
        for alignment in optimal_alignments
    ]

    # Similar sequences get a relatively small band,
    # otherwise the band covers the entire search space
    identity = align.get_sequence_identity(ref_alignments[0])
    if identity > 0.5:
        band_width = 100
    else:
        band_width = len(seq1) + len(seq2)

    test_alignments = align.align_banded(
        seq1,
        seq2,
        matrix,
        (-band_width, band_width),
        gap_penalty=gap_penalty,
        local=local,
        max_number=MAX_NUMBER,
    )

    try:
        assert test_alignments[0].score == ref_alignments[0].score
        # The alignments themselves are only comparable,
        # if the number of traces was not capped by MAX_NUMBER
        if len(ref_alignments) < MAX_NUMBER:
            assert len(test_alignments) == len(ref_alignments)
            for test_alignment in test_alignments:
                assert test_alignment in ref_alignments
    except AssertionError:
        # Show the first alignment of each side for debugging,
        # then let the failure propagate
        print("First tested alignment:")
        print()
        print(test_alignments[0])
        print("\n")
        print("First reference alignment:")
        print()
        print(ref_alignments[0])
        raise
Ejemplo n.º 5
0
def test_large_sequence_mapping(length, excerpt_length, seed):
    """
    Check that an excerpt of a very large random sequence is aligned
    back to exactly the position it was taken from.

    A random nucleotide sequence with `length` symbols is created and
    an excerpt with `excerpt_length` symbols is cut out at a random
    position.
    The band is centered on a diagonal that is randomly displaced from
    the excerpt position by less than the band width.
    """
    BAND_WIDTH = 100

    np.random.seed(seed)

    # Random sequence, created by directly setting the symbol codes
    sequence = seq.NucleotideSequence()
    sequence.code = np.random.randint(len(sequence.alphabet), size=length)

    # Cut the excerpt at a random position
    excerpt_pos = np.random.randint(len(sequence) - excerpt_length)
    excerpt_end = excerpt_pos + excerpt_length
    excerpt = sequence[excerpt_pos:excerpt_end]

    # The band center is displaced from the true diagonal
    diagonal = np.random.randint(
        excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH
    )
    band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH)
    print(band)
    print(len(sequence), len(excerpt))

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
    test_alignments = align.align_banded(excerpt, sequence, matrix, band=band)
    # The excerpt should map uniquely to a single location on the long
    # sequence
    assert len(test_alignments) == 1
    test_trace = test_alignments[0].trace

    # Expected trace: a gapless mapping of each excerpt position to the
    # corresponding position in the long sequence
    excerpt_indices = np.arange(len(excerpt))
    sequence_indices = np.arange(excerpt_pos, len(excerpt) + excerpt_pos)
    ref_trace = np.stack([excerpt_indices, sequence_indices], axis=1)
    assert np.array_equal(test_trace, ref_trace)
Ejemplo n.º 6
0
# Now we would like to have a closer look at the difference between the
# original and the B.1.1.7 genome.
#
# Mutations in the B.1.1.7 variant
# --------------------------------
#
# To get a rough overview of the overall sequence identity between
# the genomes and the locations of mutations in the B.1.1.7 variant,
# we need to align the original genome to our assembled one.
# As both genomes are expected to be highly similar, we can use a banded
# alignment again using a very conservative band width.

# Total width of the band, distributed evenly around the main diagonal
BAND_WIDTH = 1000

# Only a single alignment is requested and taken from the result
genome_alignment = align.align_banded(
    var_genome,
    orig_genome,
    matrix,
    band=(-BAND_WIDTH // 2, BAND_WIDTH // 2),
    max_number=1,
)[0]
identity = align.get_sequence_identity(genome_alignment, "all")
print(f"Sequence identity: {identity * 100:.2f} %")

########################################################################
# Now we would like to have a closer look at the mutation locations.
# To contextualize the locations we plot the mutation frequency along
# with the gene locations.
# The genomic coordinates for each gene can be extracted from the
# already downloaded *GenBank* file of the reference genome.

# NOTE(review): presumably the number of bins for the mutation frequency
# plot described above - its use is not visible in this excerpt, confirm
N_BINS = 50

# Get genomic coordinates for all SARS-Cov-2 genes
# from the GenBank file of the reference genome
gb_file = gb.GenBankFile.read(orig_genome_file)
Ejemplo n.º 7
0
# aligned to each other, if :math:`D_L \leq j - i \leq D_U`.
#
# In our case we center the diagonal band to the diagonal of the match
# and use a fixed band width :math:`W = D_U - D_L`.

# Total width of the band, W = D_U - D_L
BAND_WIDTH = 4

matrix = SubstitutionMatrix.std_protein_matrix()
alignments = []
for query_pos, ref_pos in matches:
    # The diagonal the match lies on
    diagonal = ref_pos - query_pos
    # The band is centered at the match diagonal and reaches one half
    # of the band width into each direction.
    # Only the first alignment of each match is kept
    alignments.append(
        align.align_banded(
            query,
            reference,
            matrix,
            band=(diagonal - BAND_WIDTH // 2, diagonal + BAND_WIDTH // 2),
            gap_penalty=-5,
            max_number=1,
        )[0]
    )

for alignment in alignments:
    print(alignment)
    print("\n")

########################################################################
# 4. Significance evaluation
# """"""""""""""""""""""""""
# We have obtained two alignments, but which one of them is the
# 'correct' one?
Ejemplo n.º 8
0
# The diagonal of this match can be seen in the figure:
# It is the almost continuous line on the right side.
#
# For the gapped alignment we use :func:`align_banded()`, which reduces
# the alignment search space to a narrow diagonal band.

# Number of diagonals the band extends to each side of the match diagonal
BAND_WIDTH = 1000

# Collect one (strand, alignment) pair for each trigger match
alignments = []
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
for strand, m1_pos, genome_pos in trigger_matches:
    genome = genomic_seqs[strand]
    # The diagonal the match lies on
    diagonal = genome_pos - m1_pos
    alignment = align.align_banded(m1_sequence,
                                   genome,
                                   matrix,
                                   band=(diagonal - BAND_WIDTH,
                                         diagonal + BAND_WIDTH),
                                   max_number=1)[0]
    alignments.append((strand, alignment))

# Pick the pair whose alignment has the highest score.
# The key function must evaluate its own argument: reading the loop
# variable `alignment` here would give every candidate the same key,
# so that always the first pair would be selected
strand, best_alignment = max(
    alignments,
    key=lambda strand_alignment: strand_alignment[1].score
)

########################################################################
# For visualization purposes we have to apply a renumbering function
# for the genomic sequence,
# since the indices in the alignment trace refer to the reverse
# complement sequence, but we want the numbers to refer to the original
# genomic sequence.

# Reverse sequence numbering for second sequence (genome) in alignment