Esempio n. 1
0
    def test_k_word_frequencies(self):
        """Compare k_word_frequencies output with hand-computed tables."""
        seq = self.b1

        # overlapping = True
        singles = defaultdict(int, {'A': 3 / 7., 'C': 1 / 7.,
                                    'G': 1 / 7., 'T': 2 / 7.})
        self.assertEqual(seq.k_word_frequencies(1, overlapping=True), singles)

        triples = defaultdict(int)
        for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
            triples[word] = 1 / 5.
        self.assertEqual(seq.k_word_frequencies(3, overlapping=True), triples)

        # overlapping = False
        halves = defaultdict(int, {'GAT': 1 / 2., 'TAC': 1 / 2.})
        self.assertEqual(seq.k_word_frequencies(3, overlapping=False), halves)

        whole = defaultdict(int, {'GATTACA': 1.0})
        self.assertEqual(seq.k_word_frequencies(7, overlapping=False), whole)

        # an empty sequence is expected to give an empty table
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         defaultdict(int))
Esempio n. 2
0
    def test_init_varied_input(self):
        """Build sequences from str, list and tuple input and check fields."""
        residues = 'ACCGGXZY'

        # init as string: id and description come back as empty strings
        seq = BiologicalSequence(residues)
        self.assertEqual(str(seq), residues)
        self.assertEqual((seq.id, seq.description), ("", ""))

        # init as string with optional values passed positionally
        seq = BiologicalSequence(residues, 'test-seq-1',
                                 'The first test sequence')
        self.assertEqual(str(seq), residues)
        self.assertEqual(seq.id, "test-seq-1")
        self.assertEqual(seq.description, "The first test sequence")

        # test init as a different string
        self.assertEqual(str(BiologicalSequence('WRRTY')), 'WRRTY')

        # init as list, then as tuple
        for container in (list, tuple):
            seq = BiologicalSequence(container(residues))
            self.assertEqual(str(seq), residues)
            self.assertEqual((seq.id, seq.description), ("", ""))
Esempio n. 3
0
    def test_k_words(self):
        """Check k_words for overlapping and non-overlapping windows.

        Covers k from 1 up to one past the length of ``self.b1``, invalid
        (zero and negative) k, the ``self.b8`` fixture and an empty sequence.
        """
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)), [])

        # overlapping = False
        # (this first case previously passed overlapping=True by mistake, so
        # non-overlapping words of length 1 were never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)), [])

        # error on invalid k; k_words() is called before assertRaises runs,
        # so the ValueError must surface when list() consumes the result
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])
Esempio n. 4
0
 def test_make_nr_foundation_alignment_few(self):
     """Check the sequences produced for the 'few' extension genus fixture."""
     observed = _make_nr_foundation_alignment(
         self.foundation_alignment, self.extension_genus_dic_few)
     expected = [
         BiologicalSequence("AAA---", id="PBB1", description="Phoma"),
         BiologicalSequence("AAG---", id="CBB1", description="Candida"),
     ]
     self.assertEqual(list(observed), expected)
Esempio n. 5
0
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    """Return the mean score over all pairings of two character groups.

    Each character of ``aln1_chars`` is paired with each character of
    ``aln2_chars``. A pairing scores ``gap_substitution_score`` when either
    character is a gap according to ``BiologicalSequence.is_gap``, and
    ``substitution_matrix[c1][c2]`` otherwise. The summed score is divided
    by the number of pairings.

    Raises
    ------
    ValueError
        If the substitution matrix lookup raises ``KeyError``.
    """
    total = 0
    for pair in product(aln1_chars, aln2_chars):
        first, second = pair
        if BiologicalSequence.is_gap(first) or \
                BiologicalSequence.is_gap(second):
            total += gap_substitution_score
            continue
        try:
            total += substitution_matrix[first][second]
        except KeyError:
            # report only the characters missing from the matrix's outer keys
            missing = [c for c in pair if c not in substitution_matrix]
            raise ValueError(
                "One of the sequences contains a character that is "
                "not contained in the substitution matrix. Are you "
                "using an appropriate substitution matrix for your "
                "sequence type (e.g., a nucleotide substitution "
                "matrix does not make sense for aligning protein "
                "sequences)? Does your sequence contain invalid "
                "characters? The offending character(s) is: "
                " %s." % ', '.join(missing))
    pair_count = len(aln1_chars) * len(aln2_chars)
    total /= pair_count
    return total
Esempio n. 6
0
 def setUp(self):
     """Setup for Fasta tests."""
     # raw sequence strings and FASTA-formatted text fixtures
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label = (
         '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
     self.fasta_with_label_lw2 = (
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')
     self.alignment_dict = dict([('1st', 'AAAA'), ('2nd', 'CCCC'),
                                 ('3rd', 'GGGG'), ('4th', 'UUUU')])

     # the same two records as DNASequence and as BiologicalSequence objects
     records = (('ACTCGAGATC', 'seq1'), ('GGCCT', 'seq2'))
     self.sequence_objects_a = [DNASequence(*rec) for rec in records]
     self.sequence_objects_b = [BiologicalSequence(*rec) for rec in records]

     # two gapped DNA sequences wrapped in an Alignment
     self.alignment = Alignment([
         DNASequence("ACC--G-GGTA..", id="seq1"),
         DNASequence("TCC--G-GGCA..", id="seqs2"),
     ])
    def test_k_words(self):
        """Check k_words for overlapping and non-overlapping windows.

        Covers k from 1 up to one past the length of ``self.b1``, invalid
        (zero and negative) k, the ``self.b8`` fixture and an empty sequence.
        """
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (this first case previously passed overlapping=True by mistake, so
        # non-overlapping words of length 1 were never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k; k_words() is called before assertRaises runs,
        # so the ValueError must surface when list() consumes the result
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])
Esempio n. 8
0
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    """Average the substitution score over every cross pairing of characters.

    For each (char from ``aln1_chars``, char from ``aln2_chars``) pairing the
    contribution is ``gap_substitution_score`` if ``BiologicalSequence.is_gap``
    is true for either character, otherwise the ``substitution_matrix`` entry
    for the two characters. The sum is divided by
    ``len(aln1_chars) * len(aln2_chars)``.

    Raises
    ------
    ValueError
        If a character pairing cannot be looked up in the matrix
        (``KeyError``).
    """
    running = 0
    for char_a, char_b in product(aln1_chars, aln2_chars):
        either_is_gap = (BiologicalSequence.is_gap(char_a) or
                         BiologicalSequence.is_gap(char_b))
        if either_is_gap:
            running += gap_substitution_score
        else:
            try:
                running += substitution_matrix[char_a][char_b]
            except KeyError:
                # only characters absent from the outer matrix keys are named
                unknown = [c for c in (char_a, char_b)
                           if c not in substitution_matrix]
                raise ValueError(
                    "One of the sequences contains a character that is "
                    "not contained in the substitution matrix. Are you "
                    "using an appropriate substitution matrix for your "
                    "sequence type (e.g., a nucleotide substitution "
                    "matrix does not make sense for aligning protein "
                    "sequences)? Does your sequence contain invalid "
                    "characters? The offending character(s) is: "
                    " %s." % ', '.join(unknown))
    running /= (len(aln1_chars) * len(aln2_chars))
    return running
    def test_k_word_frequencies(self):
        """Compare k_word_frequencies output with hand-computed tables."""
        seq = self.b1

        # overlapping = True
        singles = defaultdict(int, {'A': 3/7., 'C': 1/7., 'G': 1/7.,
                                    'T': 2/7.})
        self.assertEqual(seq.k_word_frequencies(1, overlapping=True), singles)

        triples = defaultdict(int)
        for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
            triples[word] = 1/5.
        self.assertEqual(seq.k_word_frequencies(3, overlapping=True), triples)

        # overlapping = False
        halves = defaultdict(int, {'GAT': 1/2., 'TAC': 1/2.})
        self.assertEqual(seq.k_word_frequencies(3, overlapping=False), halves)

        whole = defaultdict(int, {'GATTACA': 1.0})
        self.assertEqual(seq.k_word_frequencies(7, overlapping=False), whole)

        # an empty sequence is expected to give an empty table
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         defaultdict(int))
Esempio n. 10
0
 def test_fasta_with_many_fungi(self):
     """Check the three 'Fungi' records expected from the many-fungi fixture."""
     observed = fungi_from_fasta(
         self.fasta_many_fungi, self.accession, self.taxonomy_with_fungi)
     expected = [
         BiologicalSequence(chars, id=accession, description="Fungi")
         for chars, accession in (("GGGG", "AB123"),
                                  ("CCCC", "AB125"),
                                  ("AAAA", "AB126"))
     ]
     self.assertEqual(list(observed), expected)
Esempio n. 11
0
    def test_quality_no_copy(self):
        """Check that a numpy quality array is stored as-is and is read-only."""
        scores = np.array([22, 22, 1])
        seq = BiologicalSequence('ACA', quality=scores)

        # the property must hand back the very same array object
        self.assertIs(seq.quality, scores)

        # writing through the property is rejected ...
        with self.assertRaises(ValueError):
            seq.quality[1] = 42

        # ... and so is writing through the caller's original reference
        with self.assertRaises(ValueError):
            scores[1] = 42
Esempio n. 12
0
    def test_eq(self):
        """Exercise == and != between BiologicalSequence objects."""
        # every fixture equals itself
        for seq in (self.b1, self.b2, self.b3):
            self.assertTrue(seq == seq)

        # distinct fixtures compare unequal
        for left, right in ((self.b1, self.b3),
                            (self.b1, self.b2),
                            (self.b2, self.b3)):
            self.assertTrue(left != right)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        plain = BiologicalSequence('ACGT')
        self.assertTrue(plain == BiologicalSequence('ACGT'))

        id_a = BiologicalSequence('ACGT', id='a')
        id_b = BiologicalSequence('ACGT', id='b')
        self.assertTrue(id_a == id_b)

        desc_c = BiologicalSequence('ACGT', description='c')
        desc_d = BiologicalSequence('ACGT', description='d')
        self.assertTrue(desc_c == desc_d)

        both_ac = BiologicalSequence('ACGT', id='a', description='c')
        both_bd = BiologicalSequence('ACGT', id='b', description='d')
        self.assertTrue(both_ac == both_bd)

        # different type causes sequences to not be equal
        generic = BiologicalSequence('ACGT')
        nucleotide = NucleotideSequence('ACGT')
        self.assertFalse(generic == nucleotide)
Esempio n. 13
0
 def generator():
     """Yield four sequences of mixed types, ids, descriptions and quality."""
     # empty id and description, with quality scores
     yield BiologicalSequence('ACGT', id='', description='', quality=range(4))
     # id containing spaces and tabs
     yield RNASequence('GAU', id='  foo \t\t bar ', description='')
     # description containing newlines
     yield DNASequence('TAG', id='', description='foo\n\n bar\n')
     # everything populated
     yield BiologicalSequence('A', id='foo', description='bar baz',
                              quality=[42])
Esempio n. 14
0
    def test_traceback(self):
        """Run _traceback from several start cells and compare the results.

        NOTE(review): the last two positional arguments passed to _traceback
        look like the start row and column in the matrices - confirm against
        _traceback's signature.
        """
        score_m = [[0, -5, -7, -9], [-5, 2, -3, -5], [-7, -3, 4, -1],
                   [-9, -5, -1, 6], [-11, -7, -3, 1]]
        score_m = np.array(score_m)
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 1, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        # start at bottom-right
        expected = ([BiologicalSequence("ACG-")], [BiologicalSequence("ACGT")],
                    1, 0, 0)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 4, 3)
        self.assertEqual(actual, expected)

        # four sequences in two alignments (same matrices, built afresh)
        score_m = [[0, -5, -7, -9], [-5, 2, -3, -5], [-7, -3, 4, -1],
                   [-9, -5, -1, 6], [-11, -7, -3, 1]]
        score_m = np.array(score_m)
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 1, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        # start at bottom-right
        expected = ([BiologicalSequence("ACG-"),
                     BiologicalSequence("ACG-")],
                    [BiologicalSequence("ACGT"),
                     BiologicalSequence("ACGT")], 1, 0, 0)
        actual = _traceback(tback_m, score_m,
                            Alignment([DNA('ACG', 's1'),
                                       DNA('ACG', 's2')]),
                            Alignment([DNA('ACGT', 's3'),
                                       DNA('ACGT', 's4')]), 4, 3)
        self.assertEqual(actual, expected)

        # start at highest-score
        expected = ([BiologicalSequence("ACG")], [BiologicalSequence("ACG")],
                    6, 0, 0)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 3, 3)
        self.assertEqual(actual, expected)

        # terminate traceback before top-right: this matrix differs from the
        # one above only in the 0 at row 2, column 2
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 0, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        # (a stale plain-string `expected` tuple that was immediately
        # overwritten has been removed here)
        expected = ([BiologicalSequence("G")], [BiologicalSequence("G")], 6, 2,
                    2)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 3, 3)
        self.assertEqual(actual, expected)
Esempio n. 15
0
    def test_quality(self):
        """Check that quality is exposed as a read-only integer numpy array."""
        a = BiologicalSequence('ACA', quality=(22, 22, 1))

        # should get back a read-only numpy array of int dtype
        self.assertIsInstance(a.quality, np.ndarray)
        # compare against the builtin int: `np.int` was only an alias for it,
        # deprecated in NumPy 1.20 and removed in NumPy 1.24
        self.assertEqual(a.quality.dtype, int)
        npt.assert_equal(a.quality, np.array((22, 22, 1)))

        # test that we can't mutate the quality scores
        with self.assertRaises(ValueError):
            a.quality[1] = 42

        # test that we can't set the property
        with self.assertRaises(AttributeError):
            a.quality = (22, 22, 42)
Esempio n. 16
0
 def test_init_with_validation(self):
     """Check that validate=True rejects 'ACC' but accepts '..--..'."""
     self.assertRaises(BiologicalSequenceError,
                       BiologicalSequence,
                       "ACC",
                       validate=True)
     try:
         # no error raised when only allowed characters are passed
         BiologicalSequence("..--..", validate=True)
     except BiologicalSequenceError as e:
         # fail with a descriptive message instead of "False is not true"
         self.fail("validate=True unexpectedly rejected '..--..': %s" % e)
Esempio n. 17
0
    def test_distance(self):
        """Check the default distance and a caller-supplied distance_fn."""
        # note that test_hamming_distance covers default behavior more
        # extensively
        seq = self.b1
        self.assertEqual(seq.distance(seq), 0.0)

        other = BiologicalSequence('GATTACC')
        self.assertEqual(seq.distance(other), 1. / 7)

        # with distance_fn supplied, the result is that function's value
        observed = seq.distance(seq, distance_fn=lambda x, y: 42)
        self.assertEqual(observed, 42)
Esempio n. 18
0
 def setUp(self):
     """Create the sequence fixtures b1..b8 used by the tests."""
     # sequences without id or description
     self.b1 = BiologicalSequence('GATTACA')
     self.b6 = BiologicalSequence('ACGTACGTACGT')

     # sequences carrying an id and/or a description
     self.b2 = BiologicalSequence('ACCGGTACC', id="test-seq-2",
                                  description="A test sequence")
     self.b3 = BiologicalSequence('GREG', id="test-seq-3",
                                  description="A protein sequence")
     self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
     self.b5 = BiologicalSequence('LLPRTEIN',
                                  description="some description")

     # sequences made of, or containing, '.' and '-' characters
     self.b7 = BiologicalSequence('..--..')
     self.b8 = BiologicalSequence('HE..--..LLO')
Esempio n. 19
0
 def setUp(self):
     """Create the sequence fixtures b1..b8, some with quality scores."""
     # fixtures with quality scores (one score per character)
     self.b1 = BiologicalSequence('GATTACA', quality=range(7))
     self.b7 = BiologicalSequence('..--..', quality=range(6))
     self.b8 = BiologicalSequence('HE..--..LLO',
                                  id='hello',
                                  description='gapped hello',
                                  quality=range(11))

     # fixtures without quality scores
     self.b2 = BiologicalSequence('ACCGGTACC', id="test-seq-2",
                                  description="A test sequence")
     self.b3 = BiologicalSequence('GREG', id="test-seq-3",
                                  description="A protein sequence")
     self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
     self.b5 = BiologicalSequence('LLPRTEIN',
                                  description="some description")
     self.b6 = BiologicalSequence('ACGTACGTACGT')
Esempio n. 20
0
    def test_gap_maps(self):
        """Check the pair of index maps returned by gap_maps()."""
        # in sequence with no gaps, the gap_maps are identical
        identity = list(range(7))
        self.assertEqual(self.b1.gap_maps(), (identity, list(identity)))

        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(), ([], [None] * 6))

        degapped_to_gapped = [0, 1, 8, 9, 10]
        gapped_to_degapped = [0, 1] + [None] * 6 + [2, 3, 4]
        self.assertEqual(self.b8.gap_maps(),
                         (degapped_to_gapped, gapped_to_degapped))

        # example from the gap_maps doc string
        doc_example = BiologicalSequence('-ACCGA-TA-')
        self.assertEqual(
            doc_example.gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8], [None, 0, 1, 2, 3, 4, None, 5, 6, None]))
Esempio n. 21
0
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Convert a string, BiologicalSequence or Alignment into an Alignment.

    Strings are wrapped in a ``BiologicalSequence`` first; a single sequence
    becomes a one-element ``Alignment``; an ``Alignment`` is returned
    unchanged unless ``disallow_alignment`` is true.

    Raises
    ------
    TypeError
        If ``seq`` is an ``Alignment`` while ``disallow_alignment`` is true,
        or if ``seq`` is none of the supported types.
    """
    if isinstance(seq, string_types):
        return Alignment([BiologicalSequence(seq)])

    if isinstance(seq, BiologicalSequence):
        return Alignment([seq])

    if not isinstance(seq, Alignment):
        raise TypeError("Unsupported type provided to aligner: %r." %
                        type(seq))

    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. We don't currently support this for
        # local alignment as there is not a clear usecase, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")

    return seq
Esempio n. 22
0
 def gen():
     """Yield two 'foo'/'bar' sequences; only the first has quality scores."""
     shared = {'id': 'foo', 'description': 'bar'}
     yield BiologicalSequence('ACGT', quality=range(4), **shared)
     yield BiologicalSequence('ACG', **shared)
Esempio n. 23
0
    def test_has_quality(self):
        """Check has_quality() with and without quality scores supplied."""
        with_scores = BiologicalSequence('ACA', quality=(5, 4, 67))
        without_scores = BiologicalSequence('ACA')

        self.assertTrue(with_scores.has_quality())
        self.assertFalse(without_scores.has_quality())
Esempio n. 24
0
    def test_equals_true(self):
        """Exercise equals() in the cases where it should return True."""
        # sequences match, all other attributes are not provided
        lhs = BiologicalSequence('ACGT')
        rhs = BiologicalSequence('ACGT')
        self.assertTrue(lhs.equals(rhs))

        # all attributes are provided and match
        lhs = BiologicalSequence('ACGT', id='foo', description='abc',
                                 quality=[1, 2, 3, 4])
        rhs = BiologicalSequence('ACGT', id='foo', description='abc',
                                 quality=[1, 2, 3, 4])
        self.assertTrue(lhs.equals(rhs))

        # ignore type
        lhs = BiologicalSequence('ACGT')
        rhs = DNASequence('ACGT')
        self.assertTrue(lhs.equals(rhs, ignore=['type']))

        # ignore id
        lhs = BiologicalSequence('ACGT', id='foo')
        rhs = BiologicalSequence('ACGT', id='bar')
        self.assertTrue(lhs.equals(rhs, ignore=['id']))

        # ignore description
        lhs = BiologicalSequence('ACGT', description='foo')
        rhs = BiologicalSequence('ACGT', description='bar')
        self.assertTrue(lhs.equals(rhs, ignore=['description']))

        # ignore quality
        lhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        rhs = BiologicalSequence('ACGT', quality=[5, 6, 7, 8])
        self.assertTrue(lhs.equals(rhs, ignore=['quality']))

        # ignore sequence
        lhs = BiologicalSequence('ACGA')
        rhs = BiologicalSequence('ACGT')
        self.assertTrue(lhs.equals(rhs, ignore=['sequence']))

        # ignore everything
        lhs = BiologicalSequence('ACGA', id='foo', description='abc',
                                 quality=[1, 2, 3, 4])
        rhs = DNASequence('ACGT', id='bar', description='def',
                          quality=[5, 6, 7, 8])
        everything = ['quality', 'description', 'id', 'sequence', 'type']
        self.assertTrue(lhs.equals(rhs, ignore=everything))
Esempio n. 25
0
 def gen():
     """Yield one BiologicalSequence per entry of the enclosing `components`.

     Each entry is indexed as (id, description, sequence, quality).
     """
     for entry in components:
         chars, seq_id = entry[2], entry[0]
         desc, scores = entry[1], entry[3]
         yield BiologicalSequence(chars, id=seq_id, description=desc,
                                  quality=scores)
Esempio n. 26
0
 def test_degap(self):
     """Check degap() output for the b1, b7 and b8 fixtures."""
     cases = (
         (self.b1, self.b1),
         (self.b7, BiologicalSequence('')),
         (self.b8, BiologicalSequence('HELLO')),
     )
     for gapped, expected in cases:
         self.assertEqual(gapped.degap(), expected)
Esempio n. 27
0
 def test_fasta_with_fungi(self):
     """Check the single 'Fungi' record expected from the fungi fixture."""
     observed = fungi_from_fasta(
         self.fasta_with_fungi, self.accession, self.taxonomy_with_fungi)
     expected = [BiologicalSequence("ATCG", id="AB21", description="Fungi")]
     self.assertEqual(list(observed), expected)
Esempio n. 28
0
 def blank_seq_gen():
     """Yield a DNA sequence, an empty BiologicalSequence and an RNA one."""
     seqs = (DNASequence('A'), BiologicalSequence(''), RNASequence('GG'))
     for item in seqs:
         yield item
Esempio n. 29
0
class BiologicalSequenceTests(TestCase):

    def setUp(self):
        """Create the sequence fixtures b1..b8 used by the tests."""
        # sequences without id or description
        self.b1 = BiologicalSequence('GATTACA')
        self.b6 = BiologicalSequence('ACGTACGTACGT')

        # sequences carrying an id and/or a description
        self.b2 = BiologicalSequence('ACCGGTACC', id="test-seq-2",
                                     description="A test sequence")
        self.b3 = BiologicalSequence('GREG', id="test-seq-3",
                                     description="A protein sequence")
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence('LLPRTEIN',
                                     description="some description")

        # sequences made of, or containing, '.' and '-' characters
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        """Build sequences from str, list and tuple input and check fields."""
        residues = 'ACCGGXZY'

        # init as string: id and description come back as empty strings
        seq = BiologicalSequence(residues)
        self.assertEqual(str(seq), residues)
        self.assertEqual((seq.id, seq.description), ("", ""))

        # init as string with optional values passed positionally
        seq = BiologicalSequence(residues, 'test-seq-1',
                                 'The first test sequence')
        self.assertEqual(str(seq), residues)
        self.assertEqual(seq.id, "test-seq-1")
        self.assertEqual(seq.description, "The first test sequence")

        # test init as a different string
        self.assertEqual(str(BiologicalSequence('WRRTY')), 'WRRTY')

        # init as list, then as tuple
        for container in (list, tuple):
            seq = BiologicalSequence(container(residues))
            self.assertEqual(str(seq), residues)
            self.assertEqual((seq.id, seq.description), ("", ""))

    def test_init_with_validation(self):
        """Check that validate=True rejects 'ACC' but accepts '..--..'."""
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        try:
            # no error raised when only allowed characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError as e:
            # fail with a descriptive message instead of "False is not true"
            self.fail("validate=True unexpectedly rejected '..--..': %s" % e)

    def test_contains(self):
        """Check that `in` matches characters case-sensitively."""
        seq = self.b1
        self.assertTrue('G' in seq)
        # the lowercase form must not match
        self.assertFalse('g' in seq)

    def test_eq(self):
        """Exercise == and != between BiologicalSequence objects."""
        # every fixture equals itself
        for seq in (self.b1, self.b2, self.b3):
            self.assertTrue(seq == seq)

        # distinct fixtures compare unequal
        for left, right in ((self.b1, self.b3),
                            (self.b1, self.b2),
                            (self.b2, self.b3)):
            self.assertTrue(left != right)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        plain = BiologicalSequence('ACGT')
        self.assertTrue(plain == BiologicalSequence('ACGT'))

        id_a = BiologicalSequence('ACGT', id='a')
        id_b = BiologicalSequence('ACGT', id='b')
        self.assertTrue(id_a == id_b)

        desc_c = BiologicalSequence('ACGT', description='c')
        desc_d = BiologicalSequence('ACGT', description='d')
        self.assertTrue(desc_c == desc_d)

        both_ac = BiologicalSequence('ACGT', id='a', description='c')
        both_bd = BiologicalSequence('ACGT', id='b', description='d')
        self.assertTrue(both_ac == both_bd)

        # different type causes sequences to not be equal
        generic = BiologicalSequence('ACGT')
        nucleotide = NucleotideSequence('ACGT')
        self.assertFalse(generic == nucleotide)

    def test_getitem(self):
        """Check integer, full-slice and reversed-slice indexing of b1."""
        seq = self.b1
        first = seq[0]
        whole = seq[:]
        backwards = seq[::-1]
        self.assertEqual(first, BiologicalSequence('G'))
        self.assertEqual(whole, BiologicalSequence('GATTACA'))
        self.assertEqual(backwards, BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        """Check that indexing at position 42 raises IndexError."""
        seq = self.b1
        with self.assertRaises(IndexError):
            seq[42]

    def test_hash(self):
        """Check that hashing a sequence yields an int."""
        # assertIsInstance reports the offending value and type on failure,
        # unlike assertTrue(isinstance(...)) which only says "False is not true"
        self.assertIsInstance(hash(self.b1), int)

    def test_iter(self):
        """Check that iterating b1 yields 'GATTACA' one character at a time."""
        chars = iter(self.b1)
        for observed, wanted in zip(chars, "GATTACA"):
            self.assertEqual(observed, wanted)

        # once the loop is done the iterator must be exhausted
        self.assertRaises(StopIteration, next, chars)

    def test_k_words(self):
        """Check k_words for overlapping and non-overlapping windows.

        Covers k from 1 up to one past the length of ``self.b1``, invalid
        (zero and negative) k, the ``self.b8`` fixture and an empty sequence.
        """
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (this first case previously passed overlapping=True by mistake, so
        # non-overlapping words of length 1 were never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k; k_words() is called before assertRaises runs,
        # so the ValueError must surface when list() consumes the result
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        """Compare k_word_counts with Counters built from the expected words."""
        # each case: (k, overlapping, expected Counter)
        cases = [
            (1, True, Counter('GATTACA')),
            (3, True, Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])),
            (3, False, Counter(['GAT', 'TAC'])),
            (7, False, Counter(['GATTACA'])),
        ]
        for k, overlapping, expected in cases:
            observed = self.b1.k_word_counts(k, overlapping=overlapping)
            self.assertEqual(observed, expected)

    def test_k_word_frequencies(self):
        """Compare k_word_frequencies output with hand-computed tables."""
        seq = self.b1

        # overlapping = True
        singles = defaultdict(int, {'A': 3/7., 'C': 1/7., 'G': 1/7.,
                                    'T': 2/7.})
        self.assertEqual(seq.k_word_frequencies(1, overlapping=True), singles)

        triples = defaultdict(int)
        for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
            triples[word] = 1/5.
        self.assertEqual(seq.k_word_frequencies(3, overlapping=True), triples)

        # overlapping = False
        halves = defaultdict(int, {'GAT': 1/2., 'TAC': 1/2.})
        self.assertEqual(seq.k_word_frequencies(3, overlapping=False), halves)

        whole = defaultdict(int, {'GATTACA': 1.0})
        self.assertEqual(seq.k_word_frequencies(7, overlapping=False), whole)

        # an empty sequence is expected to give an empty table
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         defaultdict(int))

    def test_len(self):
        """Check len() of the b1, b2 and b3 fixtures."""
        for seq, size in ((self.b1, 7), (self.b2, 9), (self.b3, 4)):
            self.assertEqual(len(seq), size)

    def test_repr(self):
        """Check repr() for the b1 and b6 fixtures."""
        short_repr = repr(self.b1)
        self.assertEqual(short_repr,
                         "<BiologicalSequence: GATTACA (length: 7)>")

        # for b6 the expected text ends the sequence with '...'
        long_repr = repr(self.b6)
        self.assertEqual(long_repr,
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        """Check that reversed(b1) yields 'ACATTAG' character by character."""
        chars = reversed(self.b1)
        for observed, wanted in zip(chars, "ACATTAG"):
            self.assertEqual(observed, wanted)

        # once the loop is done the iterator must be exhausted
        self.assertRaises(StopIteration, next, chars)

    def test_str(self):
        """Check str() of the b1, b2 and b3 fixtures."""
        cases = ((self.b1, "GATTACA"),
                 (self.b2, "ACCGGTACC"),
                 (self.b3, "GREG"))
        for seq, text in cases:
            self.assertEqual(str(seq), text)

    def test_alphabet(self):
        """alphabet() of the first fixture is expected to be an empty set."""
        observed = self.b1.alphabet()
        self.assertEqual(observed, set())

    def test_description(self):
        """description holds the expected text, or "" for the first fixture."""
        cases = ((self.b1, ""),
                 (self.b2, "A test sequence"),
                 (self.b3, "A protein sequence"))
        for seq, text in cases:
            self.assertEqual(seq.description, text)

    def test_gap_alphabet(self):
        """gap_alphabet() is expected to contain exactly '-' and '.'."""
        observed = self.b1.gap_alphabet()
        self.assertEqual(observed, {'-', '.'})

    def test_id(self):
        """id holds the expected identifier, or "" for the first fixture."""
        cases = ((self.b1, ""),
                 (self.b2, "test-seq-2"),
                 (self.b3, "test-seq-3"))
        for seq, identifier in cases:
            self.assertEqual(seq.id, identifier)

    def test_count(self):
        """count() tallies single characters and longer subsequences."""
        for needle, occurrences in (('A', 3), ('T', 2), ('TT', 1)):
            self.assertEqual(self.b1.count(needle), occurrences)

    def test_degap(self):
        """degap() is checked on three fixtures against expected sequences."""
        cases = (
            (self.b1, self.b1),
            (self.b7, BiologicalSequence('')),
            (self.b8, BiologicalSequence('HELLO')),
        )
        for gapped, wanted in cases:
            self.assertEqual(gapped.degap(), wanted)

    def test_distance(self):
        """distance() with its default and with a custom distance_fn."""
        # note that test_hamming_distance covers default behavior more
        # extensively
        seq = self.b1
        self.assertEqual(seq.distance(seq), 0.0)
        one_mismatch = BiologicalSequence('GATTACC')
        self.assertEqual(seq.distance(one_mismatch), 1./7)

        def constant_distance(x, y):
            # ignores its inputs so the result is trivially recognisable
            return 42

        custom = seq.distance(seq, distance_fn=constant_distance)
        self.assertEqual(custom, 42)

    def test_distance_unequal_length(self):
        """distance() on sequences of different length."""
        # Hamming distance (default) requires that sequences are of equal
        # length
        self.assertRaises(BiologicalSequenceError,
                          self.b1.distance, self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def constant_distance(x, y):
            return 42

        observed = self.b1.distance(self.b2, distance_fn=constant_distance)
        self.assertEqual(observed, 42)

    def test_fraction_diff(self):
        """fraction_diff matches the expected fractions to 5 places."""
        # assertAlmostEqual is needed here: with assertEqual the trailing 5
        # is taken as the failure message, not as a number of places, which
        # silently turns the check into an exact float comparison
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        """fraction_same matches the expected fractions to 5 places."""
        seq = self.b1
        self.assertAlmostEqual(seq.fraction_same(seq), 1., places=5)

        one_mismatch = BiologicalSequence('GATTACC')
        self.assertAlmostEqual(seq.fraction_same(one_mismatch), 6. / 7.,
                               places=5)

    def test_gap_maps(self):
        """gap_maps() gives the degapped->gapped and gapped->degapped maps."""
        # in sequence with no gaps, the gap_maps are identical
        identity = list(range(7))
        self.assertEqual(self.b1.gap_maps(), (identity, identity))

        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(), ([], [None] * 6))

        degapped_to_gapped = [0, 1, 8, 9, 10]
        gapped_to_degapped = [0, 1] + [None] * 6 + [2, 3, 4]
        self.assertEqual(self.b8.gap_maps(),
                         (degapped_to_gapped, gapped_to_degapped))

        # example from the gap_maps doc string
        doc_example = BiologicalSequence('-ACCGA-TA-')
        self.assertEqual(
            doc_example.gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8],
             [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        """gap_vector() is a per-position list of booleans."""
        no_gaps = [False] * len(self.b1)
        self.assertEqual(self.b1.gap_vector(), no_gaps)

        all_gaps = [True] * len(self.b7)
        self.assertEqual(self.b7.gap_vector(), all_gaps)

        # two letters, six gap characters, three letters
        mixed = [False] * 2 + [True] * 6 + [False] * 3
        self.assertEqual(self.b8.gap_vector(), mixed)

    def test_unsupported_characters(self):
        """unsupported_characters() is checked on two fixtures."""
        cases = ((self.b1, {'G', 'A', 'T', 'C'}),
                 (self.b7, set()))
        for seq, wanted in cases:
            self.assertEqual(seq.unsupported_characters(), wanted)

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() is truthy for b1, falsy for b7."""
        with_unsupported = self.b1.has_unsupported_characters()
        self.assertTrue(with_unsupported)

        gaps_only = self.b7.has_unsupported_characters()
        self.assertFalse(gaps_only)

    def test_index(self):
        """ index functions as expected """
        for needle, position in (('G', 0), ('A', 1), ('AC', 4)):
            self.assertEqual(self.b1.index(needle), position)

        # a subsequence that is absent raises instead of returning a marker
        with self.assertRaises(ValueError):
            self.b1.index('x')

    def test_is_gap(self):
        """is_gap() accepts '.' and '-' and rejects everything else tried."""
        for gap_char in ('.', '-'):
            self.assertTrue(self.b1.is_gap(gap_char))

        for other in ('A', 'x', ' ', ''):
            self.assertFalse(self.b1.is_gap(other))

    def test_is_gapped(self):
        """is_gapped() is falsy for b1/b2 and truthy for b7/b8."""
        for ungapped in (self.b1, self.b2):
            self.assertFalse(ungapped.is_gapped())

        for gapped in (self.b7, self.b8):
            self.assertTrue(gapped.is_gapped())

    def test_is_valid(self):
        """is_valid() is falsy for b1 and truthy for b7."""
        letters_valid = self.b1.is_valid()
        self.assertFalse(letters_valid)

        gaps_valid = self.b7.is_valid()
        self.assertTrue(gaps_valid)

    def test_to_fasta(self):
        """to_fasta() output for default and alternative parameters."""
        # each case: (sequence, keyword arguments, expected FASTA text)
        cases = [
            (self.b1, {}, ">\nGATTACA\n"),
            (self.b1, {'terminal_character': ""}, ">\nGATTACA"),
            (self.b2, {}, ">test-seq-2 A test sequence\nACCGGTACC\n"),
            (self.b3, {}, ">test-seq-3 A protein sequence\nGREG\n"),
            (self.b4, {}, ">test-seq-4\nPRTEIN\n"),
            (self.b5, {}, "> some description\nLLPRTEIN\n"),
            # alt parameters
            (self.b2, {'field_delimiter': ":"},
             ">test-seq-2:A test sequence\nACCGGTACC\n"),
            (self.b2, {'terminal_character': "!"},
             ">test-seq-2 A test sequence\nACCGGTACC!"),
            (self.b2, {'field_delimiter': ":", 'terminal_character': "!"},
             ">test-seq-2:A test sequence\nACCGGTACC!"),
        ]
        for seq, options, fasta in cases:
            self.assertEqual(seq.to_fasta(**options), fasta)

    def test_upper(self):
        """upper() uppercases letters; the result equals the expected one."""
        metadata = {'id': 'x', 'description': '42'}
        mixed_case = NucleotideSequence('GAt.ACa-', **metadata)
        uppercased = NucleotideSequence('GAT.ACA-', **metadata)
        self.assertEqual(mixed_case.upper(), uppercased)

    def test_lower(self):
        """lower() lowercases letters; the result equals the expected one."""
        metadata = {'id': 'x', 'description': '42'}
        mixed_case = NucleotideSequence('GAt.ACa-', **metadata)
        lowercased = NucleotideSequence('gat.aca-', **metadata)
        self.assertEqual(mixed_case.lower(), lowercased)
Esempio n. 30
0
def quality_filter_sequences(sequences, barcodes):
    """Placeholder: ignore both arguments, return an empty sequence."""
    empty = BiologicalSequence('')
    return empty
Esempio n. 31
0
def demux_sequences(sequences, barcodes, barcode_map):
    """Placeholder generator: ignore all arguments, yield one empty sequence."""
    empty = BiologicalSequence('')
    yield empty
Esempio n. 32
0
 def test_getitem(self):
     """Indexing and slicing give back BiologicalSequence objects."""
     cases = (
         (self.b1[0], 'G'),
         (self.b1[:], 'GATTACA'),
         (self.b1[::-1], 'ACATTAG'),
     )
     for piece, text in cases:
         self.assertEqual(piece, BiologicalSequence(text))
Esempio n. 33
0
class BiologicalSequenceTests(TestCase):
    """Tests for BiologicalSequence, plus NucleotideSequence case changes."""

    def setUp(self):
        """Create the sequence fixtures shared by the tests."""
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence('ACCGGTACC',
                                     id="test-seq-2",
                                     description="A test sequence")
        self.b3 = BiologicalSequence('GREG',
                                     id="test-seq-3",
                                     description="A protein sequence")
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence('LLPRTEIN',
                                     description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        """The constructor accepts str, list and tuple input."""
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence('ACCGGXZY', 'test-seq-1',
                               'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        """validate=True raises on "ACC" but accepts a gap-only sequence."""
        self.assertRaises(BiologicalSequenceError,
                          BiologicalSequence,
                          "ACC",
                          validate=True)
        try:
            # no error raised when only allow characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # self.fail reports why the test failed, unlike assertTrue(False)
            self.fail("BiologicalSequenceError raised for a sequence made "
                      "up of gap characters only")

    def test_contains(self):
        """The `in` operator is case sensitive."""
        self.assertTrue('G' in self.b1)
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        """== and != compare sequence content and type."""
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') == BiologicalSequence('ACGT',
                                                                     id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') == BiologicalSequence(
                'ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        """Indexing and slicing give back BiologicalSequence objects."""
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        """Indexing past the end raises IndexError."""
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        """hash() of a sequence is an int."""
        self.assertTrue(isinstance(hash(self.b1), int))

    def test_iter(self):
        """iter() yields the characters in order, then stops."""
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        """k_words() with and without overlap, and with invalid k."""
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)), [])

        # overlapping = False
        # (this first check used to pass overlapping=True, so the k=1
        # non-overlapping case was never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)), [])

        # error on invalid k
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        """k_word_counts() is compared against Counter objects."""
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True), expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True), expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False), expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False), expected)

    def test_k_word_frequencies(self):
        """k_word_frequencies() is compared against hand-built tables."""
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3 / 7.
        expected['C'] = 1 / 7.
        expected['G'] = 1 / 7.
        expected['T'] = 2 / 7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1 / 5.
        expected['ATT'] = 1 / 5.
        expected['TTA'] = 1 / 5.
        expected['TAC'] = 1 / 5.
        expected['ACA'] = 1 / 5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1 / 2.
        expected['TAC'] = 1 / 2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        """len() gives the number of characters."""
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        """repr() shows the class name, the sequence text and the length."""
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        """reversed() yields the characters back to front, then stops."""
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        """str() gives back the raw sequence characters."""
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        """alphabet() of the base class is expected to be empty."""
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        """description defaults to "" and keeps the value passed in."""
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        """gap_alphabet() is expected to contain exactly '-' and '.'."""
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        """id defaults to "" and keeps the value passed in."""
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        """count() tallies single characters and longer subsequences."""
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        """degap() removes the gap characters from a sequence."""
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        """distance() with its default and with a custom distance_fn."""
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1. / 7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(self.b1.distance(self.b1, distance_fn=dumb_distance),
                         42)

    def test_distance_unequal_length(self):
        """distance() on sequences of different length."""
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42

        self.assertEqual(self.b1.distance(self.b2, distance_fn=dumb_distance),
                         42)

    def test_fraction_diff(self):
        """fraction_diff matches the expected fractions to 5 places."""
        # assertAlmostEqual is needed here: with assertEqual the trailing 5
        # is taken as the failure message, not as a number of places, which
        # silently turns the check into an exact float comparison
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        """fraction_same matches the expected fractions to 5 places."""
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        """gap_maps() gives the degapped->gapped and gapped->degapped maps."""
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(
            BiologicalSequence('-ACCGA-TA-').gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8], [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        """gap_vector() is a per-position list of booleans."""
        self.assertEqual(self.b1.gap_vector(), [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(), [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(), [
            False, False, True, True, True, True, True, True, False, False,
            False
        ])

    def test_unsupported_characters(self):
        """unsupported_characters() is checked on two fixtures."""
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() is truthy for b1, falsy for b7."""
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        """is_gap() accepts '.' and '-' and rejects everything else tried."""
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        """is_gapped() is truthy only for the fixtures containing gaps."""
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        """is_valid() is falsy for b1 and truthy for b7."""
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        """to_fasta() output for default and alternative parameters."""
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""), ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(), ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(), "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        """upper() uppercases letters; the result equals the expected one."""
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x', description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        """lower() lowercases letters; the result equals the expected one."""
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x', description='42')
        self.assertEqual(b.lower(), expected)
Esempio n. 34
0
 def setUp(self):
     """Create the sequence fixtures shared by the tests."""
     # sequence without any metadata
     self.b1 = BiologicalSequence('GATTACA')
     # sequences carrying both an id and a description
     self.b2 = BiologicalSequence(
         'ACCGGTACC', description="A test sequence", id="test-seq-2")
     self.b3 = BiologicalSequence(
         'GREG', description="A protein sequence", id="test-seq-3")
     # sequences carrying only one of the two metadata fields
     self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
     self.b5 = BiologicalSequence(
         'LLPRTEIN', description="some description")
     self.b6 = BiologicalSequence('ACGTACGTACGT')
     # sequences containing the '.' and '-' characters
     self.b7 = BiologicalSequence('..--..')
     self.b8 = BiologicalSequence('HE..--..LLO')
Esempio n. 35
0
class BiologicalSequenceTests(TestCase):

    def setUp(self):
        """Create the sequence fixtures shared by the tests."""
        # sequence with quality scores but no other metadata
        self.b1 = BiologicalSequence('GATTACA', quality=range(7))
        # sequences carrying an id and/or a description
        self.b2 = BiologicalSequence('ACCGGTACC',
                                     description="A test sequence",
                                     id="test-seq-2")
        self.b3 = BiologicalSequence('GREG',
                                     description="A protein sequence",
                                     id="test-seq-3")
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence('LLPRTEIN',
                                     description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        # sequences containing the '.' and '-' characters
        self.b7 = BiologicalSequence('..--..', quality=range(6))
        self.b8 = BiologicalSequence('HE..--..LLO',
                                     quality=range(11),
                                     description='gapped hello',
                                     id='hello')

    def test_init_varied_input(self):
        """The constructor accepts str, list and tuple input."""
        def check_fields(seq, text, seq_id, description):
            self.assertEqual(str(seq), text)
            self.assertEqual(seq.id, seq_id)
            self.assertEqual(seq.description, description)

        # init as string
        check_fields(BiologicalSequence('ACCGGXZY'), 'ACCGGXZY', "", "")

        # init as string with optional values
        with_metadata = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        check_fields(with_metadata, 'ACCGGXZY', "test-seq-1",
                     "The first test sequence")

        # test init as a different string
        other = BiologicalSequence('WRRTY')
        self.assertEqual(str(other), 'WRRTY')

        # init as list
        check_fields(BiologicalSequence(list('ACCGGXZY')),
                     'ACCGGXZY', "", "")

        # init as tuple
        check_fields(BiologicalSequence(tuple('ACCGGXZY')),
                     'ACCGGXZY', "", "")

    def test_init_with_validation(self):
        """validate=True raises on "ACC" but accepts a gap-only sequence."""
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        try:
            # no error raised when only allow characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # self.fail reports why the test failed, unlike assertTrue(False)
            self.fail("BiologicalSequenceError raised for a sequence made "
                      "up of gap characters only")

    def test_init_with_invalid_quality(self):
        """Invalid quality scores are rejected by the constructor."""
        # assertRaisesRegexp is the Python 2.7 spelling; it was renamed to
        # assertRaisesRegex in Python 3.2 and the old alias was removed in
        # Python 3.12, so use whichever one this interpreter provides
        assert_raises_regex = getattr(self, 'assertRaisesRegex', None)
        if assert_raises_regex is None:
            assert_raises_regex = self.assertRaisesRegexp

        # invalid dtype
        with self.assertRaises(TypeError):
            BiologicalSequence('ACGT', quality=[2, 3, 4.1, 5])

        # wrong number of dimensions (2-D)
        with assert_raises_regex(BiologicalSequenceError, '1-D'):
            BiologicalSequence('ACGT', quality=[[2, 3], [4, 5]])

        # wrong number of elements
        # (raw string: '\(' is an invalid escape in a normal string literal)
        with assert_raises_regex(BiologicalSequenceError, r'\(3\).*\(4\)'):
            BiologicalSequence('ACGT', quality=[2, 3, 4])

    def test_contains(self):
        """The `in` operator is case sensitive."""
        self.assertIn('G', self.b1)
        self.assertNotIn('g', self.b1)

    def test_eq_and_ne(self):
        """== and != compare sequence content and type only."""
        # every fixture equals itself
        for seq in (self.b1, self.b2, self.b3):
            self.assertTrue(seq == seq)

        # fixtures with different content are unequal
        unequal_pairs = ((self.b1, self.b3),
                         (self.b1, self.b2),
                         (self.b2, self.b3))
        for left, right in unequal_pairs:
            self.assertTrue(left != right)

        # identical sequences of the same type are equal, even if they have
        # different ids, descriptions, and/or quality
        metadata_pairs = [
            ({}, {}),
            ({'id': 'a'}, {'id': 'b'}),
            ({'description': 'c'}, {'description': 'd'}),
            ({'id': 'a', 'description': 'c'},
             {'id': 'b', 'description': 'd'}),
            ({'id': 'a', 'description': 'c', 'quality': [1, 2, 3, 4]},
             {'id': 'b', 'description': 'd', 'quality': [5, 6, 7, 8]}),
        ]
        for left_kwargs, right_kwargs in metadata_pairs:
            self.assertTrue(BiologicalSequence('ACGT', **left_kwargs) ==
                            BiologicalSequence('ACGT', **right_kwargs))

        # different type causes sequences to not be equal
        base = BiologicalSequence('ACGT')
        nucleotide = NucleotideSequence('ACGT')
        self.assertFalse(base == nucleotide)

    def test_getitem(self):
        """Indexing and slicing keep id, description and sliced quality."""
        # use equals method to ensure that id, description, and sliced
        # quality are correctly propagated to the resulting sequence
        single = BiologicalSequence('G', quality=(0,))
        self.assertTrue(self.b1[0].equals(single))

        full_copy = BiologicalSequence('GATTACA', quality=range(7))
        self.assertTrue(self.b1[:].equals(full_copy))

        backwards = BiologicalSequence('ACATTAG', quality=range(7)[::-1])
        self.assertTrue(self.b1[::-1].equals(backwards))

        # test a sequence without quality scores
        metadata = {'id': 'foo', 'description': 'bar'}
        no_quality = BiologicalSequence('ACGT', **metadata)
        tail = BiologicalSequence('GT', **metadata)
        self.assertTrue(no_quality[2:].equals(tail))

    def test_getitem_out_of_range(self):
        """Indexing past the end raises IndexError."""
        self.assertRaises(IndexError, lambda: self.b1[42])

    def test_hash(self):
        """hash() of a sequence is an int."""
        hashed = hash(self.b1)
        self.assertIsInstance(hashed, int)

    def test_iter(self):
        """iter() yields the characters in order, then stops."""
        forwards = iter(self.b1)
        for observed, wanted in zip(forwards, "GATTACA"):
            self.assertEqual(observed, wanted)

        # the iterator must be exhausted once the loop is done
        with self.assertRaises(StopIteration):
            next(forwards)

    def test_k_words(self):
        """k_words() with and without overlap, and with invalid k."""
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (this first check used to pass overlapping=True, so the k=1
        # non-overlapping case was never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        """k_word_counts() is compared against Counter objects."""
        seq = self.b1

        # overlapping = True
        self.assertEqual(seq.k_word_counts(1, overlapping=True),
                         Counter('GATTACA'))
        self.assertEqual(seq.k_word_counts(3, overlapping=True),
                         Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA']))

        # overlapping = False
        self.assertEqual(seq.k_word_counts(3, overlapping=False),
                         Counter(['GAT', 'TAC']))
        self.assertEqual(seq.k_word_counts(7, overlapping=False),
                         Counter(['GATTACA']))

    def test_k_word_frequencies(self):
        """Compare k_word_frequencies output with hand-built tables."""
        def table(*entries):
            # expectations are expressed as defaultdict(int) objects
            freqs = defaultdict(int)
            for word, fraction in entries:
                freqs[word] = fraction
            return freqs

        seq = self.b1

        # overlapping = True
        self.assertEqual(
            seq.k_word_frequencies(1, overlapping=True),
            table(('A', 3/7.), ('C', 1/7.), ('G', 1/7.), ('T', 2/7.)))
        fifth = 1/5.
        self.assertEqual(
            seq.k_word_frequencies(3, overlapping=True),
            table(('GAT', fifth), ('ATT', fifth), ('TTA', fifth),
                  ('TAC', fifth), ('ACA', fifth)))

        # overlapping = False
        self.assertEqual(
            seq.k_word_frequencies(3, overlapping=False),
            table(('GAT', 1/2.), ('TAC', 1/2.)))
        self.assertEqual(
            seq.k_word_frequencies(7, overlapping=False),
            table(('GATTACA', 1.0)))

        # an empty sequence is expected to give an empty table
        empty = BiologicalSequence('')
        self.assertEqual(
            empty.k_word_frequencies(1, overlapping=False), table())

    def test_len(self):
        """len() gives the number of characters."""
        for seq, size in ((self.b1, 7), (self.b2, 9), (self.b3, 4)):
            self.assertEqual(len(seq), size)

    def test_repr(self):
        """repr() shows the class name, the sequence text and the length."""
        cases = (
            (self.b1, "<BiologicalSequence: GATTACA (length: 7)>"),
            # the 12-character fixture is expected to be cut short with "..."
            (self.b6, "<BiologicalSequence: ACGTACGTAC... (length: 12)>"),
        )
        for seq, text in cases:
            self.assertEqual(repr(seq), text)

    def test_reversed(self):
        """reversed() yields the characters back to front, then stops."""
        backwards = reversed(self.b1)
        for observed, wanted in zip(backwards, "ACATTAG"):
            self.assertEqual(observed, wanted)

        # the iterator must be exhausted once the loop is done
        with self.assertRaises(StopIteration):
            next(backwards)

    def test_str(self):
        # str() of each fixture must be its raw sequence characters
        cases = (
            (self.b1, "GATTACA"),
            (self.b2, "ACCGGTACC"),
            (self.b3, "GREG"),
        )
        for seq, expected_text in cases:
            self.assertEqual(str(seq), expected_text)

    def test_alphabet(self):
        # the alphabet reported for this fixture is expected to be empty
        no_characters = set()
        self.assertEqual(self.b1.alphabet(), no_characters)

    def test_gap_alphabet(self):
        # dash and dot are the two expected gap characters
        expected_gaps = {'-', '.'}
        self.assertEqual(self.b1.gap_alphabet(), expected_gaps)

    def test_sequence(self):
        # the ``sequence`` attribute exposes the raw characters of a fixture
        cases = (
            (self.b1, "GATTACA"),
            (self.b2, "ACCGGTACC"),
            (self.b3, "GREG"),
        )
        for seq, expected_text in cases:
            self.assertEqual(seq.sequence, expected_text)

    def test_id(self):
        # b1 is expected to have an empty id; b2 and b3 carry explicit ones
        cases = (
            (self.b1, ""),
            (self.b2, "test-seq-2"),
            (self.b3, "test-seq-3"),
        )
        for seq, expected_id in cases:
            self.assertEqual(seq.id, expected_id)

    def test_description(self):
        # b1 is expected to have an empty description; b2 and b3 carry text
        cases = (
            (self.b1, ""),
            (self.b2, "A test sequence"),
            (self.b3, "A protein sequence"),
        )
        for seq, expected_description in cases:
            self.assertEqual(seq.description, expected_description)

    def test_quality(self):
        a = BiologicalSequence('ACA', quality=(22, 22, 1))

        # should get back a read-only numpy array of int dtype
        self.assertIsInstance(a.quality, np.ndarray)
        # ``np.int`` was only an alias of the builtin ``int`` (deprecated in
        # NumPy 1.20, removed in 1.24), so compare against ``int`` directly
        self.assertEqual(a.quality.dtype, int)
        npt.assert_equal(a.quality, np.array((22, 22, 1)))

        # test that we can't mutate the quality scores
        with self.assertRaises(ValueError):
            a.quality[1] = 42

        # test that we can't set the property
        with self.assertRaises(AttributeError):
            a.quality = (22, 22, 42)

    def test_quality_not_provided(self):
        # a sequence built without scores must report ``None`` for quality
        unscored = BiologicalSequence('ACA')
        self.assertIsNone(unscored.quality)

    def test_quality_scalar(self):
        # a scalar quality score is expected to come back as a 1-element
        # integer array
        b = BiologicalSequence('G', quality=2)

        self.assertIsInstance(b.quality, np.ndarray)
        # ``np.int`` was only an alias of the builtin ``int`` (deprecated in
        # NumPy 1.20, removed in 1.24), so compare against ``int`` directly
        self.assertEqual(b.quality.dtype, int)
        self.assertEqual(b.quality.shape, (1,))
        npt.assert_equal(b.quality, np.array([2]))

    def test_quality_empty(self):
        # an empty quality list is expected to come back as a zero-length
        # integer array
        b = BiologicalSequence('', quality=[])

        self.assertIsInstance(b.quality, np.ndarray)
        # ``np.int`` was only an alias of the builtin ``int`` (deprecated in
        # NumPy 1.20, removed in 1.24), so compare against ``int`` directly
        self.assertEqual(b.quality.dtype, int)
        self.assertEqual(b.quality.shape, (0,))
        npt.assert_equal(b.quality, np.array([]))

    def test_quality_no_copy(self):
        # an array passed as quality must be stored as-is (same object)
        scores = np.array([22, 22, 1])
        seq = BiologicalSequence('ACA', quality=scores)
        self.assertIs(seq.quality, scores)

        # writing through the sequence's attribute must be rejected
        with self.assertRaises(ValueError):
            seq.quality[1] = 42

        # writing through the caller's own reference must be rejected too
        with self.assertRaises(ValueError):
            scores[1] = 42

    def test_has_quality(self):
        # has_quality() reflects whether scores were supplied at construction
        scored = BiologicalSequence('ACA', quality=(5, 4, 67))
        unscored = BiologicalSequence('ACA')

        self.assertTrue(scored.has_quality())
        self.assertFalse(unscored.has_quality())

    def test_equals_true(self):
        # sequences match, all other attributes are not provided
        plain = BiologicalSequence('ACGT')
        self.assertTrue(plain.equals(BiologicalSequence('ACGT')))

        # all attributes are provided and match
        first = BiologicalSequence('ACGT', id='foo', description='abc',
                                   quality=[1, 2, 3, 4])
        second = BiologicalSequence('ACGT', id='foo', description='abc',
                                    quality=[1, 2, 3, 4])
        self.assertTrue(first.equals(second))

        # a difference confined to a single attribute must be tolerated when
        # that attribute is named in ``ignore``
        single_attribute_cases = (
            ('type',
             BiologicalSequence('ACGT'),
             DNASequence('ACGT')),
            ('id',
             BiologicalSequence('ACGT', id='foo'),
             BiologicalSequence('ACGT', id='bar')),
            ('description',
             BiologicalSequence('ACGT', description='foo'),
             BiologicalSequence('ACGT', description='bar')),
            ('quality',
             BiologicalSequence('ACGT', quality=[1, 2, 3, 4]),
             BiologicalSequence('ACGT', quality=[5, 6, 7, 8])),
            ('sequence',
             BiologicalSequence('ACGA'),
             BiologicalSequence('ACGT')),
        )
        for ignored, first, second in single_attribute_cases:
            self.assertTrue(first.equals(second, ignore=[ignored]))

        # ignore everything
        first = BiologicalSequence('ACGA', id='foo', description='abc',
                                   quality=[1, 2, 3, 4])
        second = DNASequence('ACGT', id='bar', description='def',
                             quality=[5, 6, 7, 8])
        everything = ['quality', 'description', 'id', 'sequence', 'type']
        self.assertTrue(first.equals(second, ignore=everything))

    def test_equals_false(self):
        # type mismatch: every other differing attribute is ignored, so only
        # the class difference can make the comparison fail
        base = BiologicalSequence('ACGT', id='foo', description='abc',
                                  quality=[1, 2, 3, 4])
        nucleotide = NucleotideSequence('ACGT', id='bar', description='def',
                                        quality=[5, 6, 7, 8])
        ignored = ['quality', 'description', 'id']
        self.assertFalse(base.equals(nucleotide, ignore=ignored))

        # with nothing ignored, a difference in any one attribute must make
        # the two sequences unequal
        mismatched_pairs = (
            # id mismatch
            (BiologicalSequence('ACGT', id='foo'),
             BiologicalSequence('ACGT', id='bar')),
            # description mismatch
            (BiologicalSequence('ACGT', description='foo'),
             BiologicalSequence('ACGT', description='bar')),
            # quality mismatch (both provided)
            (BiologicalSequence('ACGT', quality=[1, 2, 3, 4]),
             BiologicalSequence('ACGT', quality=[1, 2, 3, 5])),
            # quality mismatch (one provided)
            (BiologicalSequence('ACGT', quality=[1, 2, 3, 4]),
             BiologicalSequence('ACGT')),
            # sequence mismatch
            (BiologicalSequence('ACGT'),
             BiologicalSequence('TGCA')),
        )
        for first, second in mismatched_pairs:
            self.assertFalse(first.equals(second))

    def test_count(self):
        # count() must work for single characters and longer subsequences
        cases = (('A', 3), ('T', 2), ('TT', 1))
        for subsequence, expected_count in cases:
            self.assertEqual(self.b1.count(subsequence), expected_count)

    def test_degap(self):
        # equals() is used for every comparison so that id, description and
        # the filtered quality scores are checked along with the characters

        # no filtering, has quality
        degapped = self.b1.degap()
        self.assertTrue(degapped.equals(self.b1))

        # no filtering, doesn't have quality
        degapped = self.b2.degap()
        self.assertTrue(degapped.equals(self.b2))

        # everything is filtered, has quality
        all_removed = BiologicalSequence('', quality=[])
        self.assertTrue(self.b7.degap().equals(all_removed))

        # some filtering, has quality
        gaps_removed = BiologicalSequence('HELLO', id='hello',
                                          description='gapped hello',
                                          quality=[0, 1, 8, 9, 10])
        self.assertTrue(self.b8.degap().equals(gaps_removed))

    def test_distance(self):
        # the default metric is exercised in more depth by
        # test_hamming_distance; only two spot checks are made here
        to_itself = self.b1.distance(self.b1)
        self.assertEqual(to_itself, 0.0)

        one_mismatch = BiologicalSequence('GATTACC')
        self.assertEqual(self.b1.distance(one_mismatch), 1./7)

        # a caller-supplied metric must be used in place of the default
        def constant_distance(x, y):
            return 42

        custom = self.b1.distance(self.b1, distance_fn=constant_distance)
        self.assertEqual(custom, 42)

    def test_distance_unequal_length(self):
        # a custom metric decides for itself whether lengths must agree
        def constant_distance(x, y):
            return 42

        # the default (Hamming) metric refuses sequences of different length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # the same pair is accepted once a metric without that requirement
        # is supplied
        custom = self.b1.distance(self.b2, distance_fn=constant_distance)
        self.assertEqual(custom, 42)

    def test_fraction_diff(self):
        # The results are floats, so compare to 5 decimal places.  The third
        # positional argument of assertEqual is the failure message, not a
        # precision, hence assertAlmostEqual is required for the 5 to count.
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        # results are floats, so they are compared to 5 decimal places
        places = 5

        identical = self.b1.fraction_same(self.b1)
        self.assertAlmostEqual(identical, 1., places)

        # 'GATTACC' differs from the fixture in one of seven positions
        one_mismatch = self.b1.fraction_same(BiologicalSequence('GATTACC'))
        self.assertAlmostEqual(one_mismatch, 6. / 7., places)

    def test_gap_maps(self):
        # gap_maps() returns a pair: degapped->gapped positions first, then
        # gapped->degapped positions with None at every gap

        # without gaps both maps are the identity
        identity = [0, 1, 2, 3, 4, 5, 6]
        self.assertEqual(self.b1.gap_maps(), (identity, list(identity)))

        # with nothing but gaps the degapped sequence has length 0, so the
        # first map is empty and the second is None throughout
        only_gaps = ([], [None] * 6)
        self.assertEqual(self.b7.gap_maps(), only_gaps)

        partly_gapped = (
            [0, 1, 8, 9, 10],
            [0, 1, None, None, None, None, None, None, 2, 3, 4],
        )
        self.assertEqual(self.b8.gap_maps(), partly_gapped)

        # example from the gap_maps doc string
        doc_example = BiologicalSequence('-ACCGA-TA-')
        doc_expected = (
            [1, 2, 3, 4, 5, 7, 8],
            [None, 0, 1, 2, 3, 4, None, 5, 6, None],
        )
        self.assertEqual(doc_example.gap_maps(), doc_expected)

    def test_gap_vector(self):
        # one boolean per position, True wherever a gap is expected
        no_gaps = [False] * len(self.b1)
        self.assertEqual(self.b1.gap_vector(), no_gaps)

        only_gaps = [True] * len(self.b7)
        self.assertEqual(self.b7.gap_vector(), only_gaps)

        # two characters, six gaps, then three characters
        mixed = [False] * 2 + [True] * 6 + [False] * 3
        self.assertEqual(self.b8.gap_vector(), mixed)

    def test_unsupported_characters(self):
        # every distinct character of b1 is expected to be reported, while
        # b7 is expected to contain none
        all_of_b1 = {'G', 'A', 'T', 'C'}
        self.assertEqual(self.b1.unsupported_characters(), all_of_b1)

        nothing = set()
        self.assertEqual(self.b7.unsupported_characters(), nothing)

    def test_has_unsupported_characters(self):
        # b1 is expected to be flagged, b7 is not
        flagged = self.b1.has_unsupported_characters()
        self.assertTrue(flagged)

        flagged = self.b7.has_unsupported_characters()
        self.assertFalse(flagged)

    def test_index(self):
        """ index locates the first occurrence of a subsequence """
        cases = (('G', 0), ('A', 1), ('AC', 4))
        for subsequence, expected_position in cases:
            self.assertEqual(self.b1.index(subsequence), expected_position)

        # a subsequence that is absent must raise rather than return
        with self.assertRaises(ValueError):
            self.b1.index('x')

    def test_is_gap(self):
        # dot and dash are the only characters expected to count as gaps
        for gap in ('.', '-'):
            self.assertTrue(self.b1.is_gap(gap))

        # letters, a blank and the empty string must not count as gaps
        for not_a_gap in ('A', 'x', ' ', ''):
            self.assertFalse(self.b1.is_gap(not_a_gap))

    def test_is_gapped(self):
        # b1 and b2 are expected to be gap-free
        for ungapped in (self.b1, self.b2):
            self.assertFalse(ungapped.is_gapped())

        # b7 and b8 are expected to contain gap characters
        for gapped in (self.b7, self.b8):
            self.assertTrue(gapped.is_gapped())

    def test_is_valid(self):
        # b1 is expected to be invalid, b7 valid
        verdict = self.b1.is_valid()
        self.assertFalse(verdict)

        verdict = self.b7.is_valid()
        self.assertTrue(verdict)

    def test_to_fasta(self):
        # default formatting: ">id description", newline, sequence, newline
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        unterminated = self.b1.to_fasta(terminal_character="")
        self.assertEqual(unterminated, ">\nGATTACA")

        default_cases = (
            (self.b2, ">test-seq-2 A test sequence\nACCGGTACC\n"),
            (self.b3, ">test-seq-3 A protein sequence\nGREG\n"),
            (self.b4, ">test-seq-4\nPRTEIN\n"),
            (self.b5, "> some description\nLLPRTEIN\n"),
        )
        for seq, expected_record in default_cases:
            self.assertEqual(seq.to_fasta(), expected_record)

        # alt parameters
        colon_delimited = self.b2.to_fasta(field_delimiter=":")
        self.assertEqual(colon_delimited,
                         ">test-seq-2:A test sequence\nACCGGTACC\n")

        bang_terminated = self.b2.to_fasta(terminal_character="!")
        self.assertEqual(bang_terminated,
                         ">test-seq-2 A test sequence\nACCGGTACC!")

        both_changed = self.b2.to_fasta(field_delimiter=":",
                                        terminal_character="!")
        self.assertEqual(both_changed,
                         ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        # equals() is used so that id, description and quality are checked
        # to have been carried over to the upper-cased sequence
        mixed_case = NucleotideSequence('GAt.ACa-', id='x', description='42',
                                        quality=range(8))
        all_upper = NucleotideSequence('GAT.ACA-', id='x', description='42',
                                       quality=range(8))
        converted = mixed_case.upper()
        self.assertTrue(converted.equals(all_upper))

    def test_lower(self):
        # equals() is used so that id, description and quality are checked
        # to have been carried over to the lower-cased sequence
        mixed_case = NucleotideSequence('GAt.ACa-', id='x', description='42',
                                        quality=range(8))
        all_lower = NucleotideSequence('gat.aca-', id='x', description='42',
                                       quality=range(8))
        converted = mixed_case.lower()
        self.assertTrue(converted.equals(all_lower))
Esempio n. 36
0
 def missing_qual_gen():
     # Yields three sequences in a fixed order; the middle one is built
     # without quality scores while its neighbours carry them.
     sequences = (
         RNASequence('A', quality=[42]),
         BiologicalSequence('AG'),
         DNASequence('GG', quality=[41, 40]),
     )
     for sequence in sequences:
         yield sequence
Esempio n. 37
0
def _traceback(traceback_matrix,
               score_matrix,
               aln1,
               aln2,
               start_row,
               start_col,
               gap_character='-'):
    """Rebuild two aligned sequence sets by walking a traceback matrix.

    Starting at ``[start_row, start_col]``, the codes stored in
    ``traceback_matrix`` are followed until the alignment-end code is
    reached.  Characters are collected back to front and reversed at the
    end.

    Parameters
    ----------
    traceback_matrix : indexable by ``[row, col]``
        Each visited cell must hold one of the ``_traceback_encoding``
        codes for 'match', 'vertical-gap', 'horizontal-gap' or
        'alignment-end'.
    score_matrix : indexable by ``[row, col]``
        Only the cell at the start position is read; it is returned as the
        best score.
    aln1, aln2 : alignment-like
        Must provide ``sequence_count()``, iteration over their sequences
        and integer indexing.  Matrix columns index into ``aln1`` and
        matrix rows index into ``aln2`` (both offset by one).
    start_row, start_col : int
        Cell at which the walk begins.
    gap_character : str, optional
        Character inserted wherever one side of the alignment is gapped.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, current_col,
        current_row)`` where the first two items are lists of
        ``BiologicalSequence`` and the last two are the column and row at
        which the walk stopped.

    Raises
    ------
    ValueError
        If a visited cell holds a code that is not a known traceback code.
    """
    # cache the traceback codes so the loop compares against local names
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments: one character list per sequence
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for _ in range(aln1_sequence_count)]

    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for _ in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            # consume one position from both alignments (diagonal move)
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # gap in aln1, consume one position from aln2 (move up a row)
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
        elif current_value == hgap:
            # consume one position from aln1, gap in aln2 (move left a col)
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            # nothing to record; the loop condition ends the walk
            continue
        else:
            raise ValueError("Invalid value in traceback matrix: %s" %
                             current_value)

    # characters were collected end-to-start, so reverse before joining
    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = BiologicalSequence(aligned_seq, id=seq_id)

    # fallback ids for aln2 continue numbering after those of aln1
    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = BiologicalSequence(aligned_seq, id=seq_id)

    return (aligned_seqs1, aligned_seqs2, best_score, current_col, current_row)
Esempio n. 38
0
    def test_equals_false(self):
        # type mismatch: the other differing attributes are ignored, which
        # leaves the class difference as the only reason to be unequal
        generic = BiologicalSequence('ACGT', id='foo', description='abc',
                                     quality=[1, 2, 3, 4])
        nucleotide = NucleotideSequence('ACGT', id='bar', description='def',
                                        quality=[5, 6, 7, 8])
        skipped = ['quality', 'description', 'id']
        self.assertFalse(generic.equals(nucleotide, ignore=skipped))

        # id mismatch
        lhs = BiologicalSequence('ACGT', id='foo')
        rhs = BiologicalSequence('ACGT', id='bar')
        self.assertFalse(lhs.equals(rhs))

        # description mismatch
        lhs = BiologicalSequence('ACGT', description='foo')
        rhs = BiologicalSequence('ACGT', description='bar')
        self.assertFalse(lhs.equals(rhs))

        # quality mismatch (both provided)
        lhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        rhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 5])
        self.assertFalse(lhs.equals(rhs))

        # quality mismatch (one provided)
        lhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        rhs = BiologicalSequence('ACGT')
        self.assertFalse(lhs.equals(rhs))

        # sequence mismatch
        lhs = BiologicalSequence('ACGT')
        rhs = BiologicalSequence('TGCA')
        self.assertFalse(lhs.equals(rhs))
Esempio n. 39
0
 def test_fraction_diff(self):
     # The results are floats, so compare to 5 decimal places.  The third
     # positional argument of assertEqual is the failure message, not a
     # precision, hence assertAlmostEqual is required for the 5 to count.
     self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
     self.assertAlmostEqual(
         self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)
Esempio n. 40
0
 def test_fraction_same(self):
     # results are floats, so they are compared to 5 decimal places
     places = 5

     identical = self.b1.fraction_same(self.b1)
     self.assertAlmostEqual(identical, 1., places)

     # 'GATTACC' differs from the fixture in one of seven positions
     one_mismatch = self.b1.fraction_same(BiologicalSequence('GATTACC'))
     self.assertAlmostEqual(one_mismatch, 6. / 7., places)