def test_k_word_frequencies(self):
    """Check k_word_frequencies against hand-computed frequency tables."""
    # overlapping k-words: 7 single characters in 'GATTACA'
    expected = defaultdict(
        int, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    observed = self.b1.k_word_frequencies(1, overlapping=True)
    self.assertEqual(observed, expected)

    # five overlapping 3-words, each seen once
    expected = defaultdict(
        int, dict.fromkeys(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'], 1 / 5.))
    observed = self.b1.k_word_frequencies(3, overlapping=True)
    self.assertEqual(observed, expected)

    # non-overlapping k-words
    expected = defaultdict(int, dict.fromkeys(['GAT', 'TAC'], 1 / 2.))
    observed = self.b1.k_word_frequencies(3, overlapping=False)
    self.assertEqual(observed, expected)

    expected = defaultdict(int, {'GATTACA': 1.0})
    observed = self.b1.k_word_frequencies(7, overlapping=False)
    self.assertEqual(observed, expected)

    # an empty sequence is expected to give an empty table
    expected = defaultdict(int)
    empty = BiologicalSequence('')
    observed = empty.k_word_frequencies(1, overlapping=False)
    self.assertEqual(observed, expected)
def test_init_varied_input(self):
    """Construct sequences from str, list and tuple input."""
    raw = 'ACCGGXZY'

    # init as string: id and description are expected to be empty
    seq = BiologicalSequence(raw)
    self.assertEqual(str(seq), 'ACCGGXZY')
    self.assertEqual(seq.id, "")
    self.assertEqual(seq.description, "")

    # init as string with id and description passed positionally
    seq = BiologicalSequence(raw, 'test-seq-1', 'The first test sequence')
    self.assertEqual(str(seq), 'ACCGGXZY')
    self.assertEqual(seq.id, "test-seq-1")
    self.assertEqual(seq.description, "The first test sequence")

    # init as a different string
    seq = BiologicalSequence('WRRTY')
    self.assertEqual(str(seq), 'WRRTY')

    # init as list, then as tuple
    for container in (list, tuple):
        seq = BiologicalSequence(container(raw))
        self.assertEqual(str(seq), 'ACCGGXZY')
        self.assertEqual(seq.id, "")
        self.assertEqual(seq.description, "")
def test_k_words(self):
    """Check the k-words yielded by k_words for several k and both modes.

    Covers overlapping and non-overlapping extraction on ``self.b1``
    ('GATTACA'), invalid ``k`` values, a gapped sequence and an empty
    sequence.
    """
    # overlapping = True
    self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                     ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
    self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                     ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
    self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                     ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
    self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                     ['GATTACA'])
    self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                     [])

    # overlapping = False
    # This call previously passed overlapping=True (copy-paste slip), so
    # the k=1 non-overlapping case was never actually tested.
    self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                     ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
    self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                     ['GA', 'TT', 'AC'])
    self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                     ['GAT', 'TAC'])
    self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                     ['GATTACA'])
    self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                     [])

    # error on invalid k; list() is what consumes the returned iterable,
    # so the ValueError is expected on iteration
    self.assertRaises(ValueError, list, self.b1.k_words(0))
    self.assertRaises(ValueError, list, self.b1.k_words(-42))

    # tests with different sequences
    self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                     ['HE.', '.--', '..L'])
    b = BiologicalSequence('')
    self.assertEqual(list(b.k_words(3)), [])
def test_make_nr_foundation_alignment_few(self):
    """Check _make_nr_foundation_alignment with the 'few' genus fixture."""
    expected = [
        BiologicalSequence("AAA---", id="PBB1", description="Phoma"),
        BiologicalSequence("AAG---", id="CBB1", description="Candida"),
    ]
    observed = _make_nr_foundation_alignment(
        self.foundation_alignment, self.extension_genus_dic_few)
    # the result is materialised with list() before comparison
    self.assertEqual(list(observed), expected)
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    """Return the mean pairwise score between two groups of characters.

    Every character in ``aln1_chars`` is paired with every character in
    ``aln2_chars``. A pair in which either character is a gap (according
    to ``BiologicalSequence.is_gap``) contributes
    ``gap_substitution_score``; any other pair contributes
    ``substitution_matrix[char1][char2]``. The summed score is divided by
    the number of pairs.

    Raises
    ------
    ValueError
        If the substitution matrix lookup raises ``KeyError``.
    """
    total = 0
    for char1, char2 in product(aln1_chars, aln2_chars):
        has_gap = (BiologicalSequence.is_gap(char1) or
                   BiologicalSequence.is_gap(char2))
        if has_gap:
            total += gap_substitution_score
            continue

        try:
            total += substitution_matrix[char1][char2]
        except KeyError:
            # report only the characters missing from the matrix's
            # top-level keys
            missing = [char for char in (char1, char2)
                       if char not in substitution_matrix]
            raise ValueError(
                "One of the sequences contains a character that is "
                "not contained in the substitution matrix. Are you "
                "using an appropriate substitution matrix for your "
                "sequence type (e.g., a nucleotide substitution "
                "matrix does not make sense for aligning protein "
                "sequences)? Does your sequence contain invalid "
                "characters? The offending character(s) is: "
                " %s." % ', '.join(missing))

    pair_count = len(aln1_chars) * len(aln2_chars)
    total /= pair_count
    return total
def setUp(self):
    """Build the string, FASTA, dict and sequence fixtures for Fasta tests."""
    # raw sequence strings and their FASTA renderings
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
    self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
    self.fasta_with_label = (
        '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
    # same records, with each sequence split over two lines
    self.fasta_with_label_lw2 = (
        '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')

    self.alignment_dict = {
        '1st': 'AAAA',
        '2nd': 'CCCC',
        '3rd': 'GGGG',
        '4th': 'UUUU',
    }

    # the same two records as DNASequence and as BiologicalSequence
    self.sequence_objects_a = [
        DNASequence('ACTCGAGATC', 'seq1'),
        DNASequence('GGCCT', 'seq2'),
    ]
    self.sequence_objects_b = [
        BiologicalSequence('ACTCGAGATC', 'seq1'),
        BiologicalSequence('GGCCT', 'seq2'),
    ]

    aligned_seqs = [
        DNASequence("ACC--G-GGTA..", id="seq1"),
        DNASequence("TCC--G-GGCA..", id="seqs2"),
    ]
    self.alignment = Alignment(aligned_seqs)
def test_k_word_frequencies(self):
    """Compare k_word_frequencies output with expected tables."""
    # (k, overlapping, expected frequencies) for self.b1
    cases = [
        # overlapping = True
        (1, True, {'A': 3/7., 'C': 1/7., 'G': 1/7., 'T': 2/7.}),
        (3, True, {'GAT': 1/5., 'ATT': 1/5., 'TTA': 1/5., 'TAC': 1/5.,
                   'ACA': 1/5.}),
        # overlapping = False
        (3, False, {'GAT': 1/2., 'TAC': 1/2.}),
        (7, False, {'GATTACA': 1.0}),
    ]
    for k, overlapping, frequencies in cases:
        expected = defaultdict(int)
        expected.update(frequencies)
        self.assertEqual(
            self.b1.k_word_frequencies(k, overlapping=overlapping),
            expected)

    # empty sequence: compared against an empty defaultdict
    empty = BiologicalSequence('')
    self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                     defaultdict(int))
def test_fasta_with_many_fungi(self):
    """Check fungi_from_fasta on the many-fungi FASTA fixture."""
    expected = [
        BiologicalSequence(seq, id=seq_id, description="Fungi")
        for seq, seq_id in (("GGGG", "AB123"),
                            ("CCCC", "AB125"),
                            ("AAAA", "AB126"))
    ]
    observed = fungi_from_fasta(self.fasta_many_fungi,
                                self.accession,
                                self.taxonomy_with_fungi)
    self.assertEqual(list(observed), expected)
def test_quality_no_copy(self):
    """A numpy array passed as quality is stored as the same object."""
    scores = np.array([22, 22, 1])
    seq = BiologicalSequence('ACA', quality=scores)

    # identity, not just equality: no copy was made
    self.assertIs(seq.quality, scores)

    # writing through either reference is expected to raise ValueError
    with self.assertRaises(ValueError):
        seq.quality[1] = 42

    with self.assertRaises(ValueError):
        scores[1] = 42
def test_eq(self):
    """Exercise == and != between sequence objects."""
    # each fixture equals itself
    for seq in (self.b1, self.b2, self.b3):
        self.assertTrue(seq == seq)

    # distinct fixtures compare unequal
    for left, right in ((self.b1, self.b3),
                        (self.b1, self.b2),
                        (self.b2, self.b3)):
        self.assertTrue(left != right)

    # identical sequences of the same type are equal, even if they have
    # different ids and/or descriptions
    same_sequence_pairs = [
        (BiologicalSequence('ACGT'),
         BiologicalSequence('ACGT')),
        (BiologicalSequence('ACGT', id='a'),
         BiologicalSequence('ACGT', id='b')),
        (BiologicalSequence('ACGT', description='c'),
         BiologicalSequence('ACGT', description='d')),
        (BiologicalSequence('ACGT', id='a', description='c'),
         BiologicalSequence('ACGT', id='b', description='d')),
    ]
    for left, right in same_sequence_pairs:
        self.assertTrue(left == right)

    # different type causes sequences to not be equal
    generic = BiologicalSequence('ACGT')
    nucleotide = NucleotideSequence('ACGT')
    self.assertFalse(generic == nucleotide)
def generator():
    """Yield four sequences of mixed type with varied id/description text."""
    # (class, sequence, keyword arguments), in the order they are yielded
    specs = (
        (BiologicalSequence, 'ACGT',
         dict(id='', description='', quality=range(4))),
        (RNASequence, 'GAU',
         dict(id=' foo \t\t bar ', description='')),
        (DNASequence, 'TAG',
         dict(id='', description='foo\n\n bar\n')),
        (BiologicalSequence, 'A',
         dict(id='foo', description='bar baz', quality=[42])),
    )
    for seq_class, sequence, kwargs in specs:
        yield seq_class(sequence, **kwargs)
def test_traceback(self):
    """Check _traceback output for several start cells and inputs.

    Each expected value is a 5-tuple: two lists of BiologicalSequence
    objects followed by three integers, exactly as compared against the
    return value of ``_traceback``.
    """
    score_m = [[0, -5, -7, -9],
               [-5, 2, -3, -5],
               [-7, -3, 4, -1],
               [-9, -5, -1, 6],
               [-11, -7, -3, 1]]
    score_m = np.array(score_m)
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 1, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)

    # start at bottom-right
    expected = ([BiologicalSequence("ACG-")],
                [BiologicalSequence("ACGT")],
                1, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]), 4, 3)
    self.assertEqual(actual, expected)

    # four sequences in two alignments (matrices rebuilt with the same
    # values as above)
    score_m = [[0, -5, -7, -9],
               [-5, 2, -3, -5],
               [-7, -3, 4, -1],
               [-9, -5, -1, 6],
               [-11, -7, -3, 1]]
    score_m = np.array(score_m)
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 1, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)

    # start at bottom-right
    expected = ([BiologicalSequence("ACG-"),
                 BiologicalSequence("ACG-")],
                [BiologicalSequence("ACGT"),
                 BiologicalSequence("ACGT")],
                1, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG', 's1'), DNA('ACG', 's2')]),
                        Alignment([DNA('ACGT', 's3'), DNA('ACGT', 's4')]),
                        4, 3)
    self.assertEqual(actual, expected)

    # start at highest-score
    expected = ([BiologicalSequence("ACG")],
                [BiologicalSequence("ACG")],
                6, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]), 3, 3)
    self.assertEqual(actual, expected)

    # terminate traceback early: cell [2][2] of tback_m is now 0 instead
    # of 1 (presumably the 0 is what ends the traceback -- confirm
    # against _traceback)
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 0, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)
    # A dead `expected = ("G", "G", 6, 2, 2)` assignment used to sit here;
    # it was overwritten immediately and never compared.
    expected = ([BiologicalSequence("G")],
                [BiologicalSequence("G")],
                6, 2, 2)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]), 3, 3)
    self.assertEqual(actual, expected)
def test_quality(self):
    """Quality scores are exposed as a read-only integer numpy array."""
    a = BiologicalSequence('ACA', quality=(22, 22, 1))

    # should get back a read-only numpy array of int dtype
    self.assertIsInstance(a.quality, np.ndarray)
    # `np.int` was only an alias for the builtin `int`; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so compare against `int` directly.
    self.assertEqual(a.quality.dtype, int)
    npt.assert_equal(a.quality, np.array((22, 22, 1)))

    # test that we can't mutate the quality scores
    with self.assertRaises(ValueError):
        a.quality[1] = 42

    # test that we can't set the property
    with self.assertRaises(AttributeError):
        a.quality = (22, 22, 42)
def test_init_with_validation(self):
    """validate=True rejects 'ACC' and accepts a gap-only sequence."""
    self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                      validate=True)

    try:
        # no error raised when only allowed characters are passed
        BiologicalSequence("..--..", validate=True)
    except BiologicalSequenceError:
        # self.fail reports a meaningful message, unlike assertTrue(False)
        self.fail("BiologicalSequenceError raised for a sequence "
                  "containing only gap characters.")
def test_distance(self):
    """Check distance() with the default and a custom distance function."""
    # note that test_hamming_distance covers default behavior more
    # extensively
    self_distance = self.b1.distance(self.b1)
    self.assertEqual(self_distance, 0.0)

    one_mismatch = BiologicalSequence('GATTACC')
    self.assertEqual(self.b1.distance(one_mismatch), 1. / 7)

    # a user-supplied function replaces the default metric
    def constant_distance(x, y):
        return 42

    custom = self.b1.distance(self.b1, distance_fn=constant_distance)
    self.assertEqual(custom, 42)
def setUp(self):
    """Create the eight BiologicalSequence fixtures b1..b8."""
    # plain sequence, no metadata
    self.b1 = BiologicalSequence('GATTACA')

    # sequences carrying id and/or description
    self.b2 = BiologicalSequence('ACCGGTACC',
                                 id="test-seq-2",
                                 description="A test sequence")
    self.b3 = BiologicalSequence('GREG',
                                 id="test-seq-3",
                                 description="A protein sequence")
    self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
    self.b5 = BiologicalSequence('LLPRTEIN',
                                 description="some description")

    # a 12-character sequence
    self.b6 = BiologicalSequence('ACGTACGTACGT')

    # sequences containing '.' and '-' characters
    self.b7 = BiologicalSequence('..--..')
    self.b8 = BiologicalSequence('HE..--..LLO')
def setUp(self):
    """Create fixtures b1..b8; b1, b7 and b8 also carry quality scores."""
    # quality is range(len(sequence)) wherever it is given
    self.b1 = BiologicalSequence('GATTACA', quality=range(7))

    # sequences carrying id and/or description but no quality
    self.b2 = BiologicalSequence('ACCGGTACC',
                                 id="test-seq-2",
                                 description="A test sequence")
    self.b3 = BiologicalSequence('GREG',
                                 id="test-seq-3",
                                 description="A protein sequence")
    self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
    self.b5 = BiologicalSequence('LLPRTEIN',
                                 description="some description")
    self.b6 = BiologicalSequence('ACGTACGTACGT')

    # sequences containing '.' and '-' characters
    self.b7 = BiologicalSequence('..--..', quality=range(6))
    self.b8 = BiologicalSequence('HE..--..LLO',
                                 id='hello',
                                 description='gapped hello',
                                 quality=range(11))
def test_gap_maps(self):
    """Check the (degapped->gapped, gapped->degapped) maps of gap_maps."""
    # in sequence with no gaps, the gap_maps are identical
    positions = [0, 1, 2, 3, 4, 5, 6]
    self.assertEqual(self.b1.gap_maps(), (positions, list(positions)))

    # in sequence with all gaps, the map of degapped to gapped is the empty
    # list (bc its length is 0), and the map of gapped to degapped is all
    # None
    self.assertEqual(self.b7.gap_maps(), ([], [None] * 6))

    # 'HE..--..LLO': six gap positions between 'HE' and 'LLO'
    degapped_to_gapped = [0, 1, 8, 9, 10]
    gapped_to_degapped = [0, 1] + [None] * 6 + [2, 3, 4]
    self.assertEqual(self.b8.gap_maps(),
                     (degapped_to_gapped, gapped_to_degapped))

    # example from the gap_maps doc string
    doc_example = BiologicalSequence('-ACCGA-TA-')
    degapped_to_gapped = [1, 2, 3, 4, 5, 7, 8]
    gapped_to_degapped = [None, 0, 1, 2, 3, 4, None, 5, 6, None]
    self.assertEqual(doc_example.gap_maps(),
                     (degapped_to_gapped, gapped_to_degapped))
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Return ``seq`` as an Alignment object.

    Strings are wrapped in a BiologicalSequence and then in a one-sequence
    Alignment; BiologicalSequence objects are wrapped in a one-sequence
    Alignment; Alignment objects are returned unchanged unless
    ``disallow_alignment`` is true.

    Raises
    ------
    TypeError
        If ``seq`` is an Alignment and ``disallow_alignment`` is true, or
        if ``seq`` is none of the supported types.
    """
    if isinstance(seq, string_types):
        wrapped = BiologicalSequence(seq)
        return Alignment([wrapped])

    if isinstance(seq, BiologicalSequence):
        return Alignment([seq])

    if not isinstance(seq, Alignment):
        raise TypeError("Unsupported type provided to aligner: %r." %
                        type(seq))

    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. We don't currently support this for
        # local alignment as there is not a clear usecase, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")

    return seq
def gen():
    """Yield two same-labelled sequences; only the first has quality."""
    label = dict(id='foo', description='bar')

    with_quality = dict(label, quality=range(4))
    yield BiologicalSequence('ACGT', **with_quality)

    yield BiologicalSequence('ACG', **label)
def test_has_quality(self):
    """has_quality() is true only when quality scores were supplied."""
    with_scores = BiologicalSequence('ACA', quality=(5, 4, 67))
    self.assertTrue(with_scores.has_quality())

    without_scores = BiologicalSequence('ACA')
    self.assertFalse(without_scores.has_quality())
def test_equals_true(self):
    """equals() returns True when all non-ignored attributes match."""
    # sequences match, all other attributes are not provided
    left = BiologicalSequence('ACGT')
    right = BiologicalSequence('ACGT')
    self.assertTrue(left.equals(right))

    # all attributes are provided and match
    attrs = dict(id='foo', description='abc', quality=[1, 2, 3, 4])
    left = BiologicalSequence('ACGT', **attrs)
    right = BiologicalSequence('ACGT', **attrs)
    self.assertTrue(left.equals(right))

    # ignore type
    left = BiologicalSequence('ACGT')
    right = DNASequence('ACGT')
    self.assertTrue(left.equals(right, ignore=['type']))

    # ignore id
    left = BiologicalSequence('ACGT', id='foo')
    right = BiologicalSequence('ACGT', id='bar')
    self.assertTrue(left.equals(right, ignore=['id']))

    # ignore description
    left = BiologicalSequence('ACGT', description='foo')
    right = BiologicalSequence('ACGT', description='bar')
    self.assertTrue(left.equals(right, ignore=['description']))

    # ignore quality
    left = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
    right = BiologicalSequence('ACGT', quality=[5, 6, 7, 8])
    self.assertTrue(left.equals(right, ignore=['quality']))

    # ignore sequence
    left = BiologicalSequence('ACGA')
    right = BiologicalSequence('ACGT')
    self.assertTrue(left.equals(right, ignore=['sequence']))

    # ignore everything
    left = BiologicalSequence('ACGA', id='foo', description='abc',
                              quality=[1, 2, 3, 4])
    right = DNASequence('ACGT', id='bar', description='def',
                        quality=[5, 6, 7, 8])
    everything = ['quality', 'description', 'id', 'sequence', 'type']
    self.assertTrue(left.equals(right, ignore=everything))
def gen():
    """Yield one BiologicalSequence per entry of ``components``.

    Each entry is indexed as (id, description, sequence, quality).
    """
    for record in components:
        seq_id = record[0]
        description = record[1]
        sequence = record[2]
        quality = record[3]
        yield BiologicalSequence(sequence, id=seq_id,
                                 description=description, quality=quality)
def test_degap(self):
    """degap() drops '.' and '-' characters from the sequence."""
    # (input fixture, expected degapped sequence)
    cases = (
        (self.b1, self.b1),
        (self.b7, BiologicalSequence('')),
        (self.b8, BiologicalSequence('HELLO')),
    )
    for gapped, expected in cases:
        self.assertEqual(gapped.degap(), expected)
def test_fasta_with_fungi(self):
    """Check fungi_from_fasta on the single-fungus FASTA fixture."""
    expected = [
        BiologicalSequence("ATCG", id="AB21", description="Fungi"),
    ]
    observed = fungi_from_fasta(self.fasta_with_fungi,
                                self.accession,
                                self.taxonomy_with_fungi)
    self.assertEqual(list(observed), expected)
def blank_seq_gen():
    """Yield three sequences; the middle one is an empty sequence."""
    sequences = (
        DNASequence('A'),
        BiologicalSequence(''),
        RNASequence('GG'),
    )
    for sequence in sequences:
        yield sequence
class BiologicalSequenceTests(TestCase):
    """Unit tests for the BiologicalSequence class."""

    def setUp(self):
        """Create the eight BiologicalSequence fixtures b1..b8."""
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', id="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', id="test-seq-3", description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError, BiologicalSequence,
                          "ACC", validate=True)
        try:
            # no error raised when only allowed characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # self.fail reports a meaningful message, unlike
            # assertTrue(False)
            self.fail("BiologicalSequenceError raised for a sequence "
                      "containing only gap characters.")

    def test_contains(self):
        self.assertTrue('G' in self.b1)
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        self.assertTrue(isinstance(hash(self.b1), int))

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # This call previously passed overlapping=True (copy-paste slip),
        # so the k=1 non-overlapping case was never actually tested.
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3/7.
        expected['C'] = 1/7.
        expected['G'] = 1/7.
        expected['T'] = 2/7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1/5.
        expected['ATT'] = 1/5.
        expected['TTA'] = 1/5.
        expected['TAC'] = 1/5.
        expected['ACA'] = 1/5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1/2.
        expected['TAC'] = 1/2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1./7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_distance_unequal_length(self):
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42
        self.assertEqual(
            self.b1.distance(self.b2, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # These were assertEqual(x, y, 5): the third positional argument of
        # assertEqual is `msg`, so the intended 5-place tolerance was never
        # applied. assertAlmostEqual takes `places` in that position and
        # matches test_fraction_same.
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')),
            1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')),
            6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to
        # degapped is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3,
                           4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x',
                                      description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x',
                                      description='42')
        self.assertEqual(b.lower(), expected)
def quality_filter_sequences(sequences, barcodes):
    """Return an empty BiologicalSequence; both arguments are unused.

    NOTE(review): looks like a placeholder implementation -- confirm.
    """
    empty_sequence = BiologicalSequence('')
    return empty_sequence
def demux_sequences(sequences, barcodes, barcode_map):
    """Yield a single empty BiologicalSequence; all arguments are unused.

    NOTE(review): looks like a placeholder implementation -- confirm.
    """
    empty_sequence = BiologicalSequence('')
    yield empty_sequence
def test_getitem(self):
    """Indexing and slicing return BiologicalSequence objects."""
    # single position
    first = self.b1[0]
    self.assertEqual(first, BiologicalSequence('G'))

    # full slice
    whole = self.b1[:]
    self.assertEqual(whole, BiologicalSequence('GATTACA'))

    # reversed slice
    backwards = self.b1[::-1]
    self.assertEqual(backwards, BiologicalSequence('ACATTAG'))
class BiologicalSequenceTests(TestCase):
    """Tests for ``BiologicalSequence``.

    ``setUp`` builds the fixtures shared by the tests: ``b1``-``b6`` are
    letter-only sequences (some carrying an id and/or description), ``b7``
    consists solely of gap characters and ``b8`` mixes letters with gaps.
    """

    def setUp(self):
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', id="test-seq-2", description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', id="test-seq-3", description="A protein sequence")
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        try:
            # no error raised when only allowed characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # fail with an explanatory message instead of a bare
            # "False is not true"
            self.fail("BiologicalSequenceError raised for a sequence made "
                      "up only of gap characters")

    def test_contains(self):
        self.assertIn('G', self.b1)
        self.assertNotIn('g', self.b1)

    def test_eq(self):
        # == and != are spelled out on purpose: the operators themselves
        # are what is under test here
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        self.assertIsInstance(hash(self.b1), int)

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was seen
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (for k=1 the expected words match the overlapping case, but the
        # non-overlapping mode is what has to be exercised here)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3 / 7.
        expected['C'] = 1 / 7.
        expected['G'] = 1 / 7.
        expected['T'] = 2 / 7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1 / 5.
        expected['ATT'] = 1 / 5.
        expected['TTA'] = 1 / 5.
        expected['TAC'] = 1 / 5.
        expected['ACA'] = 1 / 5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1 / 2.
        expected['TAC'] = 1 / 2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was seen
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1. / 7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_distance_unequal_length(self):
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b2, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # assertAlmostEqual, not assertEqual: the trailing 5 is the number
        # of decimal places to compare (assertEqual would take it as the
        # failure message)
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to degapped
        # is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3,
                           4]))

        # example from the gap_maps doc string
        self.assertEqual(
            BiologicalSequence('-ACCGA-TA-').gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8],
             [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(), [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(), [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True, True, True,
                          False, False, False])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x', description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x', description='42')
        self.assertEqual(b.lower(), expected)
def setUp(self):
    """Create the eight sequences (``b1``-``b8``) shared by the tests."""
    make_seq = BiologicalSequence

    # plain sequence, no id or description
    self.b1 = make_seq('GATTACA')

    # sequences carrying an id and/or a description
    self.b2 = make_seq(
        'ACCGGTACC',
        id="test-seq-2",
        description="A test sequence",
    )
    self.b3 = make_seq(
        'GREG',
        id="test-seq-3",
        description="A protein sequence",
    )
    self.b4 = make_seq('PRTEIN', id="test-seq-4")
    self.b5 = make_seq('LLPRTEIN', description="some description")

    # a longer sequence
    self.b6 = make_seq('ACGTACGTACGT')

    # sequences containing '.' and '-' characters
    self.b7 = make_seq('..--..')
    self.b8 = make_seq('HE..--..LLO')
class BiologicalSequenceTests(TestCase):
    """Tests for ``BiologicalSequence``, including quality-score handling.

    ``setUp`` builds the fixtures shared by the tests: ``b1``-``b6`` are
    letter-only sequences (``b1`` also carries quality scores), ``b7``
    consists solely of gap characters and ``b8`` mixes letters with gaps;
    both ``b7`` and ``b8`` carry quality scores.
    """

    def setUp(self):
        self.b1 = BiologicalSequence('GATTACA', quality=range(7))
        self.b2 = BiologicalSequence(
            'ACCGGTACC', id="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', id="test-seq-3", description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..', quality=range(6))
        self.b8 = BiologicalSequence('HE..--..LLO', id='hello',
                                     description='gapped hello',
                                     quality=range(11))

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        try:
            # no error raised when only allowed characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # fail with an explanatory message instead of a bare
            # "False is not true"
            self.fail("BiologicalSequenceError raised for a sequence made "
                      "up only of gap characters")

    def test_init_with_invalid_quality(self):
        # invalid dtype
        with self.assertRaises(TypeError):
            BiologicalSequence('ACGT', quality=[2, 3, 4.1, 5])

        # wrong number of dimensions (2-D)
        with self.assertRaisesRegexp(BiologicalSequenceError, '1-D'):
            BiologicalSequence('ACGT', quality=[[2, 3], [4, 5]])

        # wrong number of elements
        # (raw string: same pattern, without relying on '\(' surviving as
        # an unrecognised escape sequence)
        with self.assertRaisesRegexp(BiologicalSequenceError,
                                     r'\(3\).*\(4\)'):
            BiologicalSequence('ACGT', quality=[2, 3, 4])

    def test_contains(self):
        self.assertIn('G', self.b1)
        self.assertNotIn('g', self.b1)

    def test_eq_and_ne(self):
        # == and != are spelled out on purpose: the operators themselves
        # are what is under test here
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids, descriptions, and/or quality
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c',
                               quality=[1, 2, 3, 4]) ==
            BiologicalSequence('ACGT', id='b', description='d',
                               quality=[5, 6, 7, 8]))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        # use equals method to ensure that id, description, and sliced
        # quality are correctly propagated to the resulting sequence
        self.assertTrue(self.b1[0].equals(
            BiologicalSequence('G', quality=(0,))))
        self.assertTrue(self.b1[:].equals(
            BiologicalSequence('GATTACA', quality=range(7))))
        self.assertTrue(self.b1[::-1].equals(
            BiologicalSequence('ACATTAG', quality=range(7)[::-1])))

        # test a sequence without quality scores
        b = BiologicalSequence('ACGT', id='foo', description='bar')
        self.assertTrue(b[2:].equals(
            BiologicalSequence('GT', id='foo', description='bar')))

    def test_getitem_out_of_range(self):
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        self.assertIsInstance(hash(self.b1), int)

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was seen
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (for k=1 the expected words match the overlapping case, but the
        # non-overlapping mode is what has to be exercised here)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3/7.
        expected['C'] = 1/7.
        expected['G'] = 1/7.
        expected['T'] = 2/7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1/5.
        expected['ATT'] = 1/5.
        expected['TTA'] = 1/5.
        expected['TAC'] = 1/5.
        expected['ACA'] = 1/5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1/2.
        expected['TAC'] = 1/2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was seen
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_sequence(self):
        self.assertEqual(self.b1.sequence, "GATTACA")
        self.assertEqual(self.b2.sequence, "ACCGGTACC")
        self.assertEqual(self.b3.sequence, "GREG")

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_quality(self):
        a = BiologicalSequence('ACA', quality=(22, 22, 1))

        # should get back a read-only numpy array of int dtype
        # (builtin ``int`` is what the removed ``np.int`` alias referred to)
        self.assertIsInstance(a.quality, np.ndarray)
        self.assertEqual(a.quality.dtype, int)
        npt.assert_equal(a.quality, np.array((22, 22, 1)))

        # test that we can't mutate the quality scores
        with self.assertRaises(ValueError):
            a.quality[1] = 42

        # test that we can't set the property
        with self.assertRaises(AttributeError):
            a.quality = (22, 22, 42)

    def test_quality_not_provided(self):
        b = BiologicalSequence('ACA')
        self.assertIs(b.quality, None)

    def test_quality_scalar(self):
        b = BiologicalSequence('G', quality=2)

        self.assertIsInstance(b.quality, np.ndarray)
        self.assertEqual(b.quality.dtype, int)
        self.assertEqual(b.quality.shape, (1,))
        npt.assert_equal(b.quality, np.array([2]))

    def test_quality_empty(self):
        b = BiologicalSequence('', quality=[])

        self.assertIsInstance(b.quality, np.ndarray)
        self.assertEqual(b.quality.dtype, int)
        self.assertEqual(b.quality.shape, (0,))
        npt.assert_equal(b.quality, np.array([]))

    def test_quality_no_copy(self):
        qual = np.array([22, 22, 1])
        a = BiologicalSequence('ACA', quality=qual)
        self.assertIs(a.quality, qual)

        # the very same array object is stored, so both names are expected
        # to refuse writes
        with self.assertRaises(ValueError):
            a.quality[1] = 42

        with self.assertRaises(ValueError):
            qual[1] = 42

    def test_has_quality(self):
        a = BiologicalSequence('ACA', quality=(5, 4, 67))
        self.assertTrue(a.has_quality())

        b = BiologicalSequence('ACA')
        self.assertFalse(b.has_quality())

    def test_equals_true(self):
        # sequences match, all other attributes are not provided
        self.assertTrue(
            BiologicalSequence('ACGT').equals(BiologicalSequence('ACGT')))

        # all attributes are provided and match
        a = BiologicalSequence('ACGT', id='foo', description='abc',
                               quality=[1, 2, 3, 4])
        b = BiologicalSequence('ACGT', id='foo', description='abc',
                               quality=[1, 2, 3, 4])
        self.assertTrue(a.equals(b))

        # ignore type
        a = BiologicalSequence('ACGT')
        b = DNASequence('ACGT')
        self.assertTrue(a.equals(b, ignore=['type']))

        # ignore id
        a = BiologicalSequence('ACGT', id='foo')
        b = BiologicalSequence('ACGT', id='bar')
        self.assertTrue(a.equals(b, ignore=['id']))

        # ignore description
        a = BiologicalSequence('ACGT', description='foo')
        b = BiologicalSequence('ACGT', description='bar')
        self.assertTrue(a.equals(b, ignore=['description']))

        # ignore quality
        a = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        b = BiologicalSequence('ACGT', quality=[5, 6, 7, 8])
        self.assertTrue(a.equals(b, ignore=['quality']))

        # ignore sequence
        a = BiologicalSequence('ACGA')
        b = BiologicalSequence('ACGT')
        self.assertTrue(a.equals(b, ignore=['sequence']))

        # ignore everything
        a = BiologicalSequence('ACGA', id='foo', description='abc',
                               quality=[1, 2, 3, 4])
        b = DNASequence('ACGT', id='bar', description='def',
                        quality=[5, 6, 7, 8])
        self.assertTrue(a.equals(b, ignore=['quality', 'description', 'id',
                                            'sequence', 'type']))

    def test_equals_false(self):
        # type mismatch
        a = BiologicalSequence('ACGT', id='foo', description='abc',
                               quality=[1, 2, 3, 4])
        b = NucleotideSequence('ACGT', id='bar', description='def',
                               quality=[5, 6, 7, 8])
        self.assertFalse(a.equals(b, ignore=['quality', 'description', 'id']))

        # id mismatch
        a = BiologicalSequence('ACGT', id='foo')
        b = BiologicalSequence('ACGT', id='bar')
        self.assertFalse(a.equals(b))

        # description mismatch
        a = BiologicalSequence('ACGT', description='foo')
        b = BiologicalSequence('ACGT', description='bar')
        self.assertFalse(a.equals(b))

        # quality mismatch (both provided)
        a = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        b = BiologicalSequence('ACGT', quality=[1, 2, 3, 5])
        self.assertFalse(a.equals(b))

        # quality mismatch (one provided)
        a = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
        b = BiologicalSequence('ACGT')
        self.assertFalse(a.equals(b))

        # sequence mismatch
        a = BiologicalSequence('ACGT')
        b = BiologicalSequence('TGCA')
        self.assertFalse(a.equals(b))

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        # use equals method to ensure that id, description, and filtered
        # quality are correctly propagated to the resulting sequence

        # no filtering, has quality
        self.assertTrue(self.b1.degap().equals(self.b1))

        # no filtering, doesn't have quality
        self.assertTrue(self.b2.degap().equals(self.b2))

        # everything is filtered, has quality
        self.assertTrue(self.b7.degap().equals(
            BiologicalSequence('', quality=[])))

        # some filtering, has quality
        self.assertTrue(self.b8.degap().equals(
            BiologicalSequence('HELLO', id='hello',
                               description='gapped hello',
                               quality=[0, 1, 8, 9, 10])))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1./7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_distance_unequal_length(self):
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b2, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # assertAlmostEqual, not assertEqual: the trailing 5 is the number
        # of decimal places to compare (assertEqual would take it as the
        # failure message)
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to degapped
        # is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3,
                           4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(), [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(), [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True, True, True,
                          False, False, False])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42',
                               quality=range(8))
        expected = NucleotideSequence('GAT.ACA-', id='x',
                                      description='42', quality=range(8))
        # use equals method to ensure that id, description, and quality are
        # correctly propagated to the resulting sequence
        self.assertTrue(b.upper().equals(expected))

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42',
                               quality=range(8))
        expected = NucleotideSequence('gat.aca-', id='x',
                                      description='42', quality=range(8))
        # use equals method to ensure that id, description, and quality are
        # correctly propagated to the resulting sequence
        self.assertTrue(b.lower().equals(expected))
def missing_qual_gen():
    """Yield three sequences, the second built without a quality argument.

    In order: an ``RNASequence`` with quality, a ``BiologicalSequence``
    without quality, and a ``DNASequence`` with quality.  All three are
    constructed before the first one is yielded.
    """
    seqs = (
        RNASequence('A', quality=[42]),
        BiologicalSequence('AG'),
        DNASequence('GG', quality=[41, 40]),
    )
    for item in seqs:
        yield item
def _traceback(traceback_matrix, score_matrix, aln1, aln2, start_row,
               start_col, gap_character='-'):
    """Follow the traceback matrix from a start cell and build the alignment.

    Parameters
    ----------
    traceback_matrix
        Matrix indexed as ``[row, col]`` whose cells hold values from
        ``_traceback_encoding`` ('match', 'vertical-gap', 'horizontal-gap'
        or 'alignment-end').
    score_matrix
        Matrix indexed as ``[row, col]``; only the start cell is read.
    aln1, aln2
        Collections of sequences providing ``sequence_count()``, iteration
        and integer indexing.  Columns of the matrices address positions in
        ``aln1``; rows address positions in ``aln2``.
    start_row, start_col
        Cell at which the traceback begins.
    gap_character : str, optional
        Character inserted wherever a gap is introduced.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, current_col,
        current_row)``: two lists of ``BiologicalSequence`` objects, the
        score found at the start cell, and the column and row at which the
        traceback stopped.

    Raises
    ------
    ValueError
        If a cell holds a value that is not one of the four encodings.
    """
    # cache the encoding values for simpler reference below
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments: one list of characters per sequence
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for _ in range(aln1_sequence_count)]

    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for _ in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            # consume one position from both inputs
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # gap in aln1, consume one position from aln2; honour the
            # caller-supplied gap character rather than a hard-coded '-'
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
        elif current_value == hgap:
            # consume one position from aln1, gap in aln2
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            # the loop condition ends the traceback on the next check
            continue
        else:
            raise ValueError(
                "Invalid value in traceback matrix: %s" % current_value)

    # characters were collected end-to-start, so reverse before joining
    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = BiologicalSequence(aligned_seq, id=seq_id)

    # the positional index passed for aln2 continues after aln1's count
    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = BiologicalSequence(aligned_seq, id=seq_id)

    return (aligned_seqs1, aligned_seqs2, best_score,
            current_col, current_row)
def test_equals_false(self):
    """``equals`` returns False for each kind of mismatch."""
    # type mismatch: the remaining differences are ignored explicitly
    ignored = ['quality', 'description', 'id']
    generic = BiologicalSequence(
        'ACGT', id='foo', description='abc', quality=[1, 2, 3, 4])
    nucleotide = NucleotideSequence(
        'ACGT', id='bar', description='def', quality=[5, 6, 7, 8])
    self.assertFalse(generic.equals(nucleotide, ignore=ignored))

    # id mismatch
    lhs = BiologicalSequence('ACGT', id='foo')
    rhs = BiologicalSequence('ACGT', id='bar')
    self.assertFalse(lhs.equals(rhs))

    # description mismatch
    lhs = BiologicalSequence('ACGT', description='foo')
    rhs = BiologicalSequence('ACGT', description='bar')
    self.assertFalse(lhs.equals(rhs))

    # quality mismatch (both provided)
    lhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
    rhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 5])
    self.assertFalse(lhs.equals(rhs))

    # quality mismatch (one provided)
    lhs = BiologicalSequence('ACGT', quality=[1, 2, 3, 4])
    rhs = BiologicalSequence('ACGT')
    self.assertFalse(lhs.equals(rhs))

    # sequence mismatch
    lhs = BiologicalSequence('ACGT')
    rhs = BiologicalSequence('TGCA')
    self.assertFalse(lhs.equals(rhs))
def test_fraction_diff(self):
    """``fraction_diff`` is 0 against itself and 1/7 for one mismatch."""
    # assertAlmostEqual, not assertEqual: the trailing 5 is the number of
    # decimal places to compare (assertEqual would take it as the failure
    # message)
    self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
    self.assertAlmostEqual(
        self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)
def test_fraction_same(self):
    """``fraction_same`` is 1 against itself and 6/7 for one mismatch."""
    # compared to five decimal places in both cases
    identical = self.b1.fraction_same(self.b1)
    self.assertAlmostEqual(identical, 1., 5)

    one_mismatch = self.b1.fraction_same(BiologicalSequence('GATTACC'))
    self.assertAlmostEqual(one_mismatch, 6. / 7., 5)