Ejemplo n.º 1
0
def test_align_optimal_simple(local, term, gap_penalty, input1, input2,
                              expect):
    """
    Check `align_optimal()` against hand-constructed cases.

    Every returned alignment must render to one of the strings in
    `expect`, and `align.score()` must reproduce the score stored in
    each alignment.
    """
    first, second = (
        seq.NucleotideSequence(text) for text in (input1, input2)
    )
    substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    results = align.align_optimal(
        first, second, substitution_matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, local=local,
    )

    # Each alignment must be one of the expected renderings
    for result in results:
        assert str(result) in expect

    # The standalone score function must agree with the stored score
    for result in results:
        recomputed = align.score(
            result, substitution_matrix,
            gap_penalty=gap_penalty, terminal_penalty=term,
        )
        assert recomputed == result.score
Ejemplo n.º 2
0
def test_alignment_str():
    """
    An alignment built from a trace that was derived from gapped
    strings should render back to exactly those strings.
    """
    expected_lines = ["A-CCTGA----", "----T-ATGCT"]
    sequences = [
        seq.NucleotideSequence("ACCTGA"),
        seq.NucleotideSequence("TATGCT"),
    ]
    alignment = align.Alignment(
        sequences, align.Alignment.trace_from_strings(expected_lines), None
    )
    rendered_lines = str(alignment).split("\n")
    assert rendered_lines == expected_lines
Ejemplo n.º 3
0
def test_align_ungapped():
    """
    Align two six-symbol sequences without gaps and check the score and
    the string representation of the result.
    """
    query = seq.NucleotideSequence("ACCTGA")
    target = seq.NucleotideSequence("ACTGGT")
    alignment = align.align_ungapped(
        query, target, align.SubstitutionMatrix.std_nucleotide_matrix()
    )
    assert alignment.score == 3
    assert str(alignment) == "ACCTGA\nACTGGT"
Ejemplo n.º 4
0
def test_sequence_conversion():
    """
    Exercise the conversion between FASTA files and sequence objects:
    reading a single sequence, round-tripping all sequences, writing a
    single sequence, reading a protein file and rejecting an invalid
    file.
    """
    def read_fasta(file_name):
        # Load a FASTA file from the "sequence" test data directory
        location = os.path.join(data_dir("sequence"), file_name)
        return fasta.FastaFile.read(location)

    nuc_file = read_fasta("nuc.fasta")
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    original = fasta.get_sequences(nuc_file)
    roundtrip_file = fasta.FastaFile()
    fasta.set_sequences(roundtrip_file, original)
    roundtrip = fasta.get_sequences(roundtrip_file)
    # The dictionaries cannot be compared directly: after the round
    # trip the original RNA sequence is guessed to be a protein sequence
    for before, after in zip(original.values(), roundtrip.values()):
        assert str(before) == str(after)

    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    prot_file = read_fasta("prot.fasta")
    # The selenocysteine conversion is expected to emit a warning
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") \
            == fasta.get_sequence(prot_file)

    invalid_file = read_fasta("invalid.fasta")
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
Ejemplo n.º 5
0
def test_find_subsequence():
    """
    Search for a short motif in a nucleotide sequence and verify the
    reported match positions.
    """
    haystack = seq.NucleotideSequence("ATACGCTTGCT")
    needle = seq.NucleotideSequence("GCT")
    positions = list(seq.find_subsequence(haystack, needle))
    assert positions == [4, 8]
Ejemplo n.º 6
0
def test_sequence_conversion():
    """
    Exercise the conversion between FASTA files and sequence objects:
    reading a single sequence, round-tripping all sequences, writing a
    single sequence, reading a protein file and rejecting an invalid
    file.
    """
    def load(file_name):
        # Read a FASTA file from the test data directory into a new
        # `FastaFile` object
        location = os.path.join(data_dir, file_name)
        loaded = fasta.FastaFile()
        loaded.read(location)
        return loaded

    nuc_file = load("nuc.fasta")
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Writing all sequences into a fresh file and reading them back
    # must give an equal dictionary
    original = fasta.get_sequences(nuc_file)
    roundtrip_file = fasta.FastaFile()
    fasta.set_sequences(roundtrip_file, original)
    roundtrip = fasta.get_sequences(roundtrip_file)
    assert original == roundtrip

    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    prot_file = load("prot.fasta")
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    invalid_file = load("invalid.fasta")
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
Ejemplo n.º 7
0
def test_nucleotide_construction():
    """
    A string without ambiguous symbols should yield the unambiguous
    alphabet, a string with ambiguous symbols the ambiguous alphabet.
    In both cases the string representation must equal the input.
    """
    cases = [
        ("AATGCGTTA", seq.NucleotideSequence.alphabet_unamb),
        ("ANNGCBRTAN", seq.NucleotideSequence.alphabet_amb),
    ]
    for text, expected_alphabet in cases:
        dna = seq.NucleotideSequence(text)
        assert dna.get_alphabet() == expected_alphabet
        assert str(dna) == text
Ejemplo n.º 8
0
def test_access_high_level():
    """
    Read all sequences of the nucleotide test file via the high-level
    interface and compare them with the expected header-to-sequence
    mapping.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    parsed = fasta.get_sequences(fasta.FastaFile.read(path))
    # (header, sequence string, second constructor argument)
    expected_entries = [
        ("dna sequence", "ACGCTACGT", False),
        ("another dna sequence", "A", False),
        ("third dna sequence", "ACGT", False),
        ("rna sequence", "ACGT", False),
        ("ambiguous rna sequence", "ACGTNN", True),
    ]
    assert parsed == {
        header: seq.NucleotideSequence(text, ambiguous)
        for header, text, ambiguous in expected_entries
    }
Ejemplo n.º 9
0
def test_write_iter(chars_per_line, n_sequences):
    """
    Test whether :meth:`FastaFile.write()` and
    :meth:`FastaFile.write_iter()` produce the same output file for
    random sequences.

    Parameters
    ----------
    chars_per_line
        Passed both to the :class:`FastaFile` constructor and to
        :meth:`FastaFile.write_iter()`.
    n_sequences : int
        The number of random sequences to generate.
    """
    LENGTH_RANGE = (50, 150)

    # Generate random sequences
    np.random.seed(0)
    sequences = []
    for _ in range(n_sequences):
        seq_length = np.random.randint(*LENGTH_RANGE)
        code = np.random.randint(len(seq.NucleotideSequence.alphabet_unamb),
                                 size=seq_length)
        sequence = seq.NucleotideSequence()
        sequence.code = code
        sequences.append(sequence)

    # Reference: fill a `FastaFile` entry by entry and write it
    fasta_file = fasta.FastaFile(chars_per_line)
    for i, sequence in enumerate(sequences):
        header = f"seq_{i}"
        fasta_file[header] = str(sequence)
    ref_file = io.StringIO()
    fasta_file.write(ref_file)

    # Test: stream the same (header, sequence) pairs via `write_iter()`
    test_file = io.StringIO()
    fasta.FastaFile.write_iter(test_file,
                               ((f"seq_{i}", str(sequence))
                                for i, sequence in enumerate(sequences)),
                               chars_per_line)

    assert test_file.getvalue() == ref_file.getvalue()
Ejemplo n.º 10
0
def test_from_alignment():
    """
    Build a `SequenceProfile` from a two-sequence alignment and compare
    its symbol counts, gap counts and alphabet with known values.
    """
    gapped_strings = ["CGTCAT--", "--TCATGC"]
    sequences = [
        seq.NucleotideSequence("CGTCAT"),
        seq.NucleotideSequence("TCATGC"),
    ]
    alignment = align.Alignment(
        sequences, align.Alignment.trace_from_strings(gapped_strings), None
    )

    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column, one column per alphabet symbol
    expected_symbols = np.array([
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    expected_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    expected_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(expected_symbols, profile.symbols)
    assert np.array_equal(expected_gaps, profile.gaps)
    assert expected_alphabet == profile.alphabet
Ejemplo n.º 11
0
def test_rna_conversion():
    """
    Writing a sequence with ``as_rna=True`` should store ``U`` instead
    of ``T``, while ``as_rna=False`` keeps the sequence unchanged.
    """
    dna = seq.NucleotideSequence("ACGT")
    out_file = fasta.FastaFile()
    for header, rna_flag in (("seq1", False), ("seq2", True)):
        fasta.set_sequence(out_file, dna, header, as_rna=rna_flag)
    assert out_file["seq1"] == "ACGT"
    assert out_file["seq2"] == "ACGU"
Ejemplo n.º 12
0
def test_access():
    """
    Index, iterate over and slice a nucleotide sequence and compare the
    results with the corresponding string operations.
    """
    text = "AATGCGTTA"
    dna = seq.NucleotideSequence(text)
    # Single index
    assert text[2] == dna[2]
    # Iteration yields the symbols in order
    symbols = list(dna)
    assert text == "".join(symbols)
    # Slicing gives a sequence again
    trimmed = dna[3:-2]
    assert "GCGT" == str(trimmed)
Ejemplo n.º 13
0
def test_find_symbol():
    """
    Locate all, the first and the last occurrence of a symbol in a
    nucleotide sequence.
    """
    dna = seq.NucleotideSequence("ATACGCTTGCT")
    target = "T"
    all_positions = list(seq.find_symbol(dna, target))
    assert all_positions == [1, 6, 7, 10]
    assert seq.find_symbol_first(dna, target) == 1
    assert seq.find_symbol_last(dna, target) == 10
Ejemplo n.º 14
0
def test_rna_conversion():
    """
    Writing a sequence with ``as_rna=True`` should store ``U`` instead
    of ``T``, while ``as_rna=False`` keeps the sequence unchanged.
    """
    dna = seq.NucleotideSequence("ACGT")
    quality = np.array([0, 0, 0, 0])
    out_file = fastq.FastqFile(offset="Sanger")
    for header, rna_flag in (("seq1", False), ("seq2", True)):
        fastq.set_sequence(out_file, dna, quality, header, as_rna=rna_flag)
    # The first element of each entry is compared with the sequence
    # string
    assert out_file["seq1"][0] == "ACGT"
    assert out_file["seq2"][0] == "ACGU"
Ejemplo n.º 15
0
def test_translation_met_start():
    """
    Check the 'met_start' parameter of the translation: with ``AAA``
    configured as start codon, each translated protein is expected to
    begin with methionine.
    """
    table = seq.CodonTable.default_table().with_start_codons("AAA")
    dna = seq.NucleotideSequence("GAAACTGAAATAAGAAC")
    translated, _positions = dna.translate(codon_table=table, met_start=True)
    protein_strings = [str(protein) for protein in translated]
    assert protein_strings == ["MLK*", "M*"]
Ejemplo n.º 16
0
def test_to_consensus_nuc_ambiguous():
    """
    The consensus of a profile whose first position has equal counts
    for ``A`` and ``C`` is expected to start with the ambiguous symbol
    ``M``.
    """
    # One row per profile position, one column per alphabet symbol
    counts = np.array([
        [1, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    gap_counts = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    profile = seq.SequenceProfile(
        counts, gap_counts, seq.Alphabet(["A", "C", "G", "T"])
    )

    assert seq.NucleotideSequence("MGTCATGC") == profile.to_consensus()
Ejemplo n.º 17
0
def sample_app():
    """
    Create an `RNAfoldApp` for a fixed sample sequence, call `start()`
    and `join()` on it and return the app object.
    """
    rnafold = RNAfoldApp(
        seq.NucleotideSequence("CGACGTAGATGCTAGCTGACTCGATGC")
    )
    rnafold.start()
    rnafold.join()
    return rnafold
Ejemplo n.º 18
0
def test_frame_translation(dna_str, protein_str_list):
    """
    Translate a DNA sequence with ``complete=False`` and check that the
    set of resulting proteins matches the expected one.
    The reported positions are verified by translating the respective
    slices completely.
    """
    dna = seq.NucleotideSequence(dna_str)
    proteins, positions = dna.translate(complete=False)
    assert len(proteins) == len(protein_str_list)
    assert {str(protein) for protein in proteins} == set(protein_str_list)

    # Each (start, stop) pair must delimit a slice that translates
    # completely into one of the expected proteins
    retranslated = {
        str(dna[start:stop].translate(complete=True))
        for start, stop in positions
    }
    assert retranslated == set(protein_str_list)
Ejemplo n.º 19
0
def random_sequences(k, alphabet):
    """
    Return a list of 10 nucleotide sequences, each with a random
    sequence code of length 1000 drawn from the range of `alphabet`.
    The random number generator is seeded with 0.
    `k` is not used in the body.
    """
    N_SEQS = 10
    SEQ_LENGTH = 1000

    def make_sequence():
        # Start from an empty sequence and assign the code directly
        generated = seq.NucleotideSequence()
        generated.code = np.random.randint(len(alphabet), size=SEQ_LENGTH)
        return generated

    np.random.seed(0)
    return [make_sequence() for _ in range(N_SEQS)]
Ejemplo n.º 20
0
def test_manipulation():
    """
    Modify copies of a sequence via integer index, slice, boolean mask
    and direct code assignment and check the resulting strings.
    """
    template = seq.NucleotideSequence("ACGTA")

    # Replace a single symbol
    by_index = template.copy()
    by_index[2] = "C"
    assert "ACCTA" == str(by_index)

    # Replace a slice with another slice of the same sequence
    by_slice = template.copy()
    by_slice[0:2] = by_slice[3:5]
    assert "TAGTA" == str(by_slice)

    # Replace all positions selected by a boolean mask
    by_mask = template.copy()
    by_mask[np.array([True, False, False, False, True])] = "T"
    assert "TCGTT" == str(by_mask)

    # Assign an integer array to a slice
    by_code = template.copy()
    by_code[1:4] = np.array([0, 1, 2])
    assert "AACGA" == str(by_code)
Ejemplo n.º 21
0
def test_concatenation():
    """
    Concatenating two nucleotide sequences should give the
    concatenation of their strings, also when one of the operands
    consists of ``N`` symbols.
    """
    long_part, short_part, n_part = "AAGTTA", "CGA", "NNN"
    operand_pairs = [
        (long_part, short_part),
        (long_part, n_part),
        (n_part, long_part),
    ]
    for left, right in operand_pairs:
        joined = seq.NucleotideSequence(left) + seq.NucleotideSequence(right)
        assert left + right == str(joined)
Ejemplo n.º 22
0
def test_access(chars_per_line):
    """
    Test the dictionary-like access of a `FastqFile`: length, keys,
    deletion, iteration over values and insertion of a new entry.
    """
    path = os.path.join(data_dir("sequence"), "random.fastq")
    fastq_file = fastq.FastqFile.read(
        path, offset=33, chars_per_line=chars_per_line
    )

    # The file is expected to hold entries "Read:01" to "Read:20"
    assert len(fastq_file) == 20
    assert list(fastq_file.keys()) \
        == [f"Read:{number:02d}" for number in range(1, 21)]

    # Removing one entry leaves the other keys in place
    del fastq_file["Read:05"]
    assert len(fastq_file) == 19
    assert list(fastq_file.keys()) \
        == [f"Read:{number:02d}" for number in range(1, 21) if number != 5]

    for entry_sequence, entry_scores in fastq_file.values():
        assert len(entry_sequence) == len(entry_scores)
        assert (entry_scores >= 0).all()

    # A newly stored entry must be returned unchanged
    new_sequence = seq.NucleotideSequence("ACTCGGT")
    new_scores = np.array([10, 12, 20, 11, 0, 80, 42])
    fastq_file["test"] = new_sequence, new_scores
    stored_sequence, stored_scores = fastq_file["test"]
    assert new_sequence == stored_sequence
    assert np.array_equal(new_scores, stored_scores)
Ejemplo n.º 23
0
def test_nucleotide(simple_matrix, use_custom_matrix):
    """
    Mask repeats in a nucleotide sequence and compare the mask with a
    known example, in which the expected masked positions are written
    in lower case.
    """
    seq_string = "TGCAAGCTATTAGGCTTAGGTCAGTGCttaagcttaggtcagtgcAACATA"
    sequence = seq.NucleotideSequence(seq_string)

    # `None` is passed when no custom matrix is requested
    matrix = simple_matrix if use_custom_matrix else None
    test_mask = TantanApp.mask_repeats(sequence, matrix)

    # Lower case letters mark the reference mask
    ref_mask = [char.islower() for char in seq_string]

    assert len(test_mask) == len(ref_mask)
    assert np.all(test_mask.tolist() == ref_mask)
Ejemplo n.º 24
0
def test_large_sequence_mapping(length, excerpt_length, seed):
    """
    Take an excerpt from a random sequence of the given length and
    check that a banded alignment maps the excerpt back to the
    position it was taken from.
    """
    BAND_WIDTH = 100

    np.random.seed(seed)

    # Random reference sequence and a randomly positioned excerpt of it
    reference = seq.NucleotideSequence()
    reference.code = np.random.randint(len(reference.alphabet), size=length)
    start = np.random.randint(len(reference) - excerpt_length)
    excerpt = reference[start : start + excerpt_length]

    # The band is centered on a diagonal drawn randomly within
    # BAND_WIDTH of the excerpt position
    center = np.random.randint(start - BAND_WIDTH, start + BAND_WIDTH)
    band = (center - BAND_WIDTH, center + BAND_WIDTH)
    print(band)
    print(len(reference), len(excerpt))

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
    found = align.align_banded(excerpt, reference, matrix, band=band)
    # Exactly one location on the long sequence is expected for the
    # excerpt
    assert len(found) == 1
    found_trace = found[0].trace

    # Expected trace: excerpt index i is paired with reference index
    # start + i
    expected_trace = np.stack(
        [
            np.arange(len(excerpt)),
            np.arange(start, len(excerpt) + start),
        ],
        axis=1,
    )
    assert np.array_equal(found_trace, expected_trace)
Ejemplo n.º 25
0
def test_affine_gap_penalty(local, term, gap_penalty, seed):
    """
    Expect the same alignment results for a linear gap penalty and an
    affine gap penalty with the same gap open and extension penalty.

    Parameters
    ----------
    local, term
        Forwarded positionally to `align_optimal()` after the gap
        penalty.
    gap_penalty
        Used directly as linear penalty for the reference and as
        ``(gap_penalty, gap_penalty)`` tuple for the test alignment.
    seed : int
        Seed for the random sequence generation.
    """
    LENGTH_RANGE = (10, 100)
    MAX_NUMBER = 1000

    # Two random sequences with random lengths
    np.random.seed(seed)
    sequences = []
    for _ in range(2):
        sequence = seq.NucleotideSequence()
        length = np.random.randint(*LENGTH_RANGE)
        sequence.code = np.random.randint(len(sequence.alphabet), size=length)
        sequences.append(sequence)

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    ref_alignments = align.align_optimal(*sequences, matrix, gap_penalty, term,
                                         local, MAX_NUMBER)

    test_alignments = align.align_optimal(*sequences, matrix,
                                          (gap_penalty, gap_penalty), term,
                                          local, MAX_NUMBER)

    assert test_alignments[0].score == ref_alignments[0].score
    assert len(test_alignments) == len(ref_alignments)
    # We can only expect to get the same alignments in the test and
    # reference, if we get all optimal alignments
    if len(test_alignments) < MAX_NUMBER:
        for alignment in test_alignments:
            # The diagnostics are part of the assertion message
            # instead of being printed from a bare `except:` clause
            assert alignment in ref_alignments, (
                f"Test alignment:\n{alignment}\n\n"
                f"First reference alignment\n{ref_alignments[0]}"
            )
Ejemplo n.º 26
0
def test_annotated_sequence():
    """
    Test indexing of an `AnnotatedSequence` with integers, slices and
    features, as well as assignment via a feature.
    """
    raw_sequence = seq.NucleotideSequence("ATGGCGTACGATTAGAAAAAAA")
    walker = Feature(
        "misc_feature",
        [Location(1, 2), Location(11, 12)],
        {"note": "walker"},
    )
    poly_a = Feature("misc_feature", [Location(16, 22)], {"note": "poly-A"})
    annot_seq = AnnotatedSequence(Annotation([walker, poly_a]), raw_sequence)

    # Index 2 addresses a different symbol on the annotated sequence
    # than on the underlying sequence
    # NOTE(review): presumably the annotated sequence counts positions
    # from 1 - confirm
    assert annot_seq[2] == "T"
    assert annot_seq.sequence[2] == "G"

    sliced = annot_seq[:16]
    assert sliced.sequence == seq.NucleotideSequence("ATGGCGTACGATTAG")

    # Indexing with a feature gives the symbols at its locations
    assert annot_seq[walker] == seq.NucleotideSequence("ATAT")
    assert annot_seq[poly_a] == seq.NucleotideSequence("AAAAAAA")

    # Assigning via a feature overwrites the symbols at its locations
    annot_seq[walker] = seq.NucleotideSequence("CCCC")
    expected_after = seq.NucleotideSequence("CCGGCGTACGCCTAGAAAAAAA")
    assert annot_seq.sequence == expected_after
Ejemplo n.º 27
0
def test_masking(k, input_mask, ref_output_mask):
    """
    Test the conversion of removal masks into k-mer masks with known
    examples.
    The conversion function is private, so it is checked indirectly
    via the sequence positions that end up in the k-mer table.
    """
    removal_mask = np.array(input_mask, dtype=bool)
    expected_mask = np.array(ref_output_mask, dtype=bool)

    # A sequence with an all-zero code and one position per mask entry
    sequence = seq.NucleotideSequence()
    sequence.code = np.zeros(len(removal_mask))
    table = align.KmerTable.from_sequences(
        k, [sequence], ignore_masks=[removal_mask]
    )

    # Mark every sequence position that appears in the table
    found_mask = np.zeros(len(expected_mask), dtype=bool)
    for kmer in table.get_kmers():
        positions = table[kmer][:, 1]
        found_mask[positions] = True

    assert found_mask.tolist() == expected_mask.tolist()
Ejemplo n.º 28
0
def test_max_table_size(gap_penalty, direction, score_only, should_raise):
    """
    Check that the `max_table_size` parameter of `align_local_gapped()`
    leads to the expected `MemoryError` when the aligned regions become
    too large.
    """
    # The smaller limit is exceeded in this test case, the larger one
    # is not
    max_table_size = 1_000_000 if should_raise else 1_000_000_000

    # A long random sequence is aligned to itself, which effectively
    # gives a global alignment
    np.random.seed(0)
    seq1 = seq.NucleotideSequence()
    seq1.code = np.random.randint(len(seq1.alphabet), size=10000)

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
    # The local alignment is seeded in the center of the sequences
    center = len(seq1) // 2
    seed = (center, center)
    threshold = 100

    call_args = (
        seq1, seq1, matrix, seed, threshold,
        gap_penalty, 1, direction, score_only, max_table_size,
    )

    if should_raise:
        with pytest.raises(MemoryError):
            align.align_local_gapped(*call_args)
        return

    result = align.align_local_gapped(*call_args)
    if not score_only and direction == "both":
        # No gaps are expected, so the alignment is as long as the
        # sequence
        assert len(result[0]) == len(seq1)
Ejemplo n.º 29
0
"""
From A to T - The Sequence subpackage
=====================================

.. currentmodule:: biotite.sequence

:mod:`biotite.sequence` is a *Biotite* subpackage concerning maybe the
most popular data type in computational molecular biology: sequences.
The instantiation can be quite simple as
"""

import biotite.sequence as seq

# Create a nucleotide sequence from a string of symbols and print it
dna = seq.NucleotideSequence("AACTGCTA")
print(dna)

########################################################################
# This example shows :class:`NucleotideSequence` which is a subclass of
# the abstract base class :class:`Sequence`.
# A :class:`NucleotideSequence` accepts an iterable object of strings,
# where each string can be ``'A'``, ``'C'``, ``'G'`` or ``'T'``.
# Each of these letters is called a *symbol*.
#
# In general the sequence implementation in *Biotite* allows for
# *sequences of anything*.
# This means any (immutable and hashable) *Python* object can be used as
# a symbol in a sequence, as long as the object is part of the
# :class:`Alphabet` of the particular :class:`Sequence`.
# An :class:`Alphabet` object simply represents a list of objects that
# are allowed to occur in a :class:`Sequence`.
# The following figure shows how the symbols are stored in a
Ejemplo n.º 30
0
# An alignment is an instance of :class:`BlastAlignment`, a subclass of
# :class:`biotite.sequence.align.Alignment`.
# It contains some additional information as shown above.
# The hit UID can be used to obtain the complete hit sequence via
# :mod:`biotite.database.entrez`.
#
# The next alignment should be a bit more challenging.
# We take a random part of the *E. coli* BL21 genome and distort it a
# little bit.
# Since we still expect a high similarity to the original sequence,
# we decrease the E-value threshold.

import biotite.application.blast as blast
import biotite.sequence as seq

# Query sequence; the preceding text describes it as a slightly
# distorted random part of the E. coli BL21 genome
bl21_seq = seq.NucleotideSequence(
    "CGGAAGCGCTCGGTCTCCTGGCCTTATCAGCCACTGCGCGACGATATGCTCGTCCGTTTCGAAGA")
# NOTE(review): `obey_rules=False` presumably switches off the
# enforcement of the NCBI usage rules - confirm in the `BlastWebApp`
# documentation
app = blast.BlastWebApp("blastn", bl21_seq, obey_rules=False)
# Set the E-value threshold announced in the preceding text
app.set_max_expect_value(0.1)
app.start()
app.join()
alignments = app.get_alignments()
# NOTE(review): assumes the first alignment is the best hit, as the
# variable name suggests - confirm the ordering of `get_alignments()`
best_ali = alignments[0]
print(best_ali)
print()
# Print the additional attributes read from the alignment object
print("HSP position in query: ", best_ali.query_interval)
print("HSP position in hit: ", best_ali.hit_interval)
print("Score: ", best_ali.score)
print("E-value: ", best_ali.e_value)
print("Hit UID: ", best_ali.hit_id)
print("Hit name: ", best_ali.hit_definition)