def test_reindex_non_empty(self):
    # Take one MSA through reindexing by callable, by metadata key, by an
    # explicit keys iterable, and finally with no arguments, checking the
    # resulting keys after each step.
    def fresh_seqs():
        return [DNA('ACG', metadata={'id': 1}),
                DNA('AAA', metadata={'id': 2})]

    msa = TabularMSA(fresh_seqs(), key=str)
    npt.assert_array_equal(msa.keys, np.array(['ACG', 'AAA']))

    msa.reindex(key='id')
    expected = TabularMSA(fresh_seqs(), key='id')
    self.assertEqual(msa, expected)
    npt.assert_array_equal(msa.keys, np.array([1, 2]))

    msa.reindex(keys=iter('ab'))
    expected = TabularMSA(fresh_seqs(), keys=iter('ab'))
    self.assertEqual(msa, expected)
    npt.assert_array_equal(msa.keys, np.array(['a', 'b']))

    # Reindexing without arguments is expected to drop the keys.
    msa.reindex()
    self.assertFalse(msa.has_keys())
def test_constructor_not_monomorphic(self):
    # Each case pairs the regex the TypeError message must match with a
    # factory for the heterogeneous input; the input is built inside the
    # assertion context, as before.
    cases = (
        ('mixed types.*RNA.*DNA',
         lambda: [DNA(''), RNA('')]),
        ('mixed types.*float.*Protein',
         lambda: [Protein(''), Protein(''), 42.0, Protein('')]),
    )
    for pattern, make_seqs in cases:
        with six.assertRaisesRegex(self, TypeError, pattern):
            TabularMSA(make_seqs())
def test_eq_handles_missing_metadata_efficiently(self):
    # Comparing two metadata-free MSAs must leave the private _metadata
    # attribute unset (None) on both operands.
    left = TabularMSA([DNA('ACGT')])
    right = TabularMSA([DNA('ACGT')])

    self.assertReallyEqual(left, right)

    for operand in (left, right):
        self.assertIsNone(operand._metadata)
def test_sort_multiple_sequences_no_msa_keys_on_metadata_key(self):
    # Sort an MSA without keys on the 'id' metadata key, ascending and
    # then descending, and compare against the expected row order.
    def build(rows):
        return TabularMSA([RNA(chars, metadata={'id': seq_id})
                           for chars, seq_id in rows])

    unsorted = [('UCA', 41), ('AAA', 44), ('GAC', -1), ('GAC', 42)]
    ascending = [('GAC', -1), ('UCA', 41), ('GAC', 42), ('AAA', 44)]
    descending = [('AAA', 44), ('GAC', 42), ('UCA', 41), ('GAC', -1)]

    msa = build(unsorted)
    msa.sort(key='id')
    self.assertEqual(msa, build(ascending))

    msa = build(unsorted)
    msa.sort(key='id', reverse=True)
    self.assertEqual(msa, build(descending))
def test_sort_on_unorderable_msa_keys(self):
    # Sorting on keys that include an Unorderable instance must raise
    # TypeError and leave the MSA equal to a freshly built copy.
    unorderable = Unorderable()
    msa = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, unorderable])

    with self.assertRaises(TypeError):
        msa.sort()

    untouched = TabularMSA([DNA('AAA'), DNA('ACG')],
                           keys=[42, unorderable])
    self.assertEqual(msa, untouched)
def test_sort_multiple_sequences_with_msa_keys_on_metadata_key(self):
    # Sort an MSA that has keys on the '#' metadata key; the keys are
    # expected to travel with their sequences in both directions.
    def build(rows):
        seqs = [DNA(chars, metadata={'#': number})
                for chars, number, _ in rows]
        return TabularMSA(seqs, keys=[key for _, _, key in rows])

    # (sequence, '#' metadata value, MSA key)
    tca = ('TCA', 41.2, None)
    aaa = ('AAA', 44.5, ('hello', 'world'))
    gac = ('GAC', 42.999, True)

    msa = build([tca, aaa, gac])
    msa.sort(key='#')
    self.assertEqual(msa, build([tca, gac, aaa]))

    msa = build([tca, aaa, gac])
    msa.sort(key='#', reverse=True)
    self.assertEqual(msa, build([aaa, gac, tca]))
def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    # One sequence is roughly three times as long as the other. Toggling
    # penalize_terminal_gaps must change both the alignment and its score.
    seq1 = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
    seq2 = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
    scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                   match_score=5, mismatch_score=-4)

    # The longer sequence is aligned identically in both runs.
    aligned_seq2 = ("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                    "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")

    # Terminal gaps not penalized.
    obs_msa, obs_score, _ = global_pairwise_align_nucleotide(
        seq1, seq2, penalize_terminal_gaps=False, **scoring)
    expected = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTTG-------------------------"),
        DNA(aligned_seq2)])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 131.0)

    # Terminal gaps penalized.
    obs_msa, obs_score, _ = global_pairwise_align_nucleotide(
        seq1, seq2, penalize_terminal_gaps=True, **scoring)
    expected = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTT-------------------------G"),
        DNA(aligned_seq2)])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 97.0)
def test_constructor_invalid_dtype(self):
    # Inputs whose first element is not an acceptable sequence type must
    # raise TypeError with a message matching the paired regex. Inputs
    # are built inside the assertion context, as before.
    cases = (
        ('sequence.*alphabet.*Sequence', lambda: [Sequence('')]),
        ('sequence.*alphabet.*int', lambda: [42, DNA('')]),
    )
    for pattern, make_input in cases:
        with six.assertRaisesRegex(self, TypeError, pattern):
            TabularMSA(make_input())
def test_local_pairwise_align_protein(self):
    # Checks local_pairwise_align_protein on plain sequences with two gap
    # penalty settings, on sequences carrying metadata, and on inputs
    # that must be rejected with TypeError.
    obs_msa, obs_score, obs_start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), gap_open_penalty=10.,
        gap_extend_penalty=5.)

    self.assertEqual(obs_msa, TabularMSA([Protein("AWGHE"),
                                          Protein("AW-HE")]))
    self.assertEqual(obs_score, 26.0)
    self.assertEqual(obs_start_end, [(4, 8), (1, 4)])

    obs_msa, obs_score, obs_start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), gap_open_penalty=5.,
        gap_extend_penalty=0.5)

    self.assertEqual(obs_msa, TabularMSA([Protein("AWGHE-E"),
                                          Protein("AW-HEAE")]))
    self.assertEqual(obs_score, 32.0)
    self.assertEqual(obs_start_end, [(4, 9), (1, 6)])

    # Protein sequences with metadata. The expected sequences must carry
    # the inputs' metadata; comparing against metadata-free sequences
    # would not exercise metadata handling at all.
    obs_msa, obs_score, obs_start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE", metadata={'id': "s1"}),
        Protein("PAWHEAE", metadata={'id': "s2"}),
        gap_open_penalty=10., gap_extend_penalty=5.)

    self.assertEqual(
        obs_msa,
        TabularMSA([Protein("AWGHE", metadata={'id': "s1"}),
                    Protein("AW-HE", metadata={'id': "s2"})]))
    self.assertEqual(obs_score, 26.0)
    self.assertEqual(obs_start_end, [(4, 8), (1, 4)])

    # Fails when either input is passed as a TabularMSA
    self.assertRaises(TypeError, local_pairwise_align_protein,
                      TabularMSA(
                          [Protein("HEAGAWGHEE", metadata={'id': "s1"})]),
                      Protein("PAWHEAE", metadata={'id': "s2"}),
                      gap_open_penalty=10., gap_extend_penalty=5.)
    self.assertRaises(TypeError, local_pairwise_align_protein,
                      Protein("HEAGAWGHEE", metadata={'id': "s1"}),
                      TabularMSA(
                          [Protein("PAWHEAE", metadata={'id': "s2"})]),
                      gap_open_penalty=10., gap_extend_penalty=5.)

    # TypeError on invalid input
    self.assertRaises(TypeError, local_pairwise_align_protein,
                      42, Protein("HEAGAWGHEE"))
    self.assertRaises(TypeError, local_pairwise_align_protein,
                      Protein("HEAGAWGHEE"), 42)
def test_sort_single_sequence_on_msa_keys(self):
    # A one-sequence MSA must compare equal to a fresh copy after
    # sorting with default arguments and with reverse=True.
    for sort_kwargs in ({}, {'reverse': True}):
        msa = TabularMSA([DNA('ACGT')], keys=[42])
        msa.sort(**sort_kwargs)
        self.assertEqual(msa, TabularMSA([DNA('ACGT')], keys=[42]))
def test_compute_score_and_traceback_matrices_invalid(self):
    # If a sequence contains a character that is missing from the
    # substitution matrix, an informative error should be raised.
    m = make_identity_substitution_matrix(2, -1)

    # Inputs are built before the guarded call so that only the call
    # itself is covered by the ValueError expectation.
    aln_with_unknown_char = TabularMSA([DNA('AWG', metadata={'id': 'id'})])
    aln_plain = TabularMSA([DNA('ACGT', metadata={'id': 'id'})])

    with self.assertRaises(ValueError):
        _compute_score_and_traceback_matrices(
            aln_with_unknown_char, aln_plain, 5, 2, m)
def test_sort_single_sequence_no_msa_keys_on_metadata_key(self):
    # A one-sequence MSA without keys must compare equal to a fresh copy
    # after sorting on the 'id' metadata key in either direction.
    for sort_kwargs in ({'key': 'id'}, {'key': 'id', 'reverse': True}):
        msa = TabularMSA([RNA('UCA', metadata={'id': 42})])
        msa.sort(**sort_kwargs)
        self.assertEqual(
            msa, TabularMSA([RNA('UCA', metadata={'id': 42})]))
def test_sort_single_sequence_with_msa_keys_on_callable_key(self):
    # A one-sequence MSA with keys must compare equal to a fresh copy
    # after sorting with the callable key `str` in either direction.
    for sort_kwargs in ({'key': str}, {'key': str, 'reverse': True}):
        msa = TabularMSA([RNA('UCA')], keys=['foo'])
        msa.sort(**sort_kwargs)
        self.assertEqual(msa, TabularMSA([RNA('UCA')], keys=['foo']))
def test_sort_empty_with_msa_keys_on_metadata_key(self):
    # Sorting an empty keyed MSA on a metadata key must succeed and
    # leave it equal to a fresh empty keyed MSA, in either direction.
    for sort_kwargs in ({'key': 'id'}, {'key': 'id', 'reverse': True}):
        msa = TabularMSA([], keys=[])
        msa.sort(**sort_kwargs)
        self.assertEqual(msa, TabularMSA([], keys=[]))
def test_sort_empty_with_msa_keys_on_callable_key(self):
    # Sorting an empty keyed MSA with the callable key `str` must
    # succeed and leave it equal to a fresh empty keyed MSA.
    for sort_kwargs in ({'key': str}, {'key': str, 'reverse': True}):
        msa = TabularMSA([], keys=[])
        msa.sort(**sort_kwargs)
        self.assertEqual(msa, TabularMSA([], keys=[]))
def test_sort_empty_on_msa_keys(self):
    # Sorting an empty keyed MSA with default arguments and with
    # reverse=True must leave it equal to a fresh empty keyed MSA.
    for sort_kwargs in ({}, {'reverse': True}):
        msa = TabularMSA([], keys=[])
        msa.sort(**sort_kwargs)
        self.assertEqual(msa, TabularMSA([], keys=[]))
def test_sort_multiple_sequences_no_msa_keys_on_callable_key(self):
    # Sort a keyless MSA with the callable key `str`; each case lists
    # the sort arguments and the expected sequence order.
    cases = (
        ({'key': str}, ('UCA', 'UCC', 'UCG')),
        ({'key': str, 'reverse': True}, ('UCG', 'UCC', 'UCA')),
    )
    for sort_kwargs, expected_order in cases:
        msa = TabularMSA([RNA('UCC'), RNA('UCG'), RNA('UCA')])
        msa.sort(**sort_kwargs)
        expected = TabularMSA([RNA(chars) for chars in expected_order])
        self.assertEqual(msa, expected)
def test_global_pairwise_align_dtype_mismatch(self):
    # Aligning DNA input (bare sequence or TabularMSA) against an RNA
    # TabularMSA must raise TypeError with the dtype-mismatch message.
    expected_message = r"same dtype: 'DNA' != 'RNA'"
    first_inputs = (
        lambda: DNA('ACGT'),
        lambda: TabularMSA([DNA('ACGT')]),
    )
    for make_first in first_inputs:
        with self.assertRaisesRegex(TypeError, expected_message):
            global_pairwise_align(make_first(),
                                  TabularMSA([RNA('ACGU')]),
                                  1.0, 1.0, {})
def test_global_pairwise_align_nucleotide(self):
    # Checks global_pairwise_align_nucleotide on plain sequences with
    # two gap-open penalties, on sequences with metadata, on a
    # TabularMSA aligned against a sequence, and on invalid inputs.
    scoring = dict(gap_extend_penalty=0.5, match_score=5,
                   mismatch_score=-4)

    # Plain sequences, gap_open_penalty=5.
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=5., **scoring)
    expected = TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                           DNA("GAACTTTGAC---GTAAC")])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 41.0)
    self.assertEqual(obs_start_end, [(0, 16), (0, 14)])

    # Plain sequences, gap_open_penalty=10.
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=10., **scoring)
    expected = TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                           DNA("GAACTTTGAC---GTAAC")])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 32.0)
    self.assertEqual(obs_start_end, [(0, 16), (0, 14)])

    # DNA sequences with metadata
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
        gap_open_penalty=10., **scoring)
    expected = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"})])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 32.0)
    self.assertEqual(obs_start_end, [(0, 16), (0, 14)])

    # Align one DNA sequence and one TabularMSA, score computed manually
    two_seq_msa = TabularMSA([
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GACCATGACCAGGTACC", metadata={'id': "s2"})])
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        two_seq_msa,
        DNA("GAACTTTGACGTAAC", metadata={'id': "s3"}),
        gap_open_penalty=10., **scoring)
    expected = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"})])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 27.5)
    self.assertEqual(obs_start_end, [(0, 16), (0, 14)])

    # TypeError on invalid input. The valid operand is built before the
    # guarded call so only the alignment call is covered.
    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(42, valid)

    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(valid, 42)
def test_reversed(self):
    # reversed() over an empty MSA yields nothing; over a non-empty MSA
    # it yields the sequences in reverse order.
    empty_iter = reversed(TabularMSA([]))
    with self.assertRaises(StopIteration):
        next(empty_iter)

    seqs = [DNA(''), DNA('', metadata={'id': 42})]
    observed = list(reversed(TabularMSA(seqs)))
    self.assertEqual(observed, seqs[::-1])

    seqs = [DNA('AAA'), DNA('GCT')]
    observed = list(reversed(TabularMSA(seqs)))
    self.assertEqual(observed, seqs[::-1])
def test_iter(self):
    # iter() over an empty MSA yields nothing; over a non-empty MSA it
    # yields the sequences in their original order.
    empty_iter = iter(TabularMSA([]))
    with self.assertRaises(StopIteration):
        next(empty_iter)

    seqs = [DNA(''), DNA('')]
    observed = list(iter(TabularMSA(seqs)))
    self.assertEqual(observed, seqs)

    seqs = [DNA('AAA'), DNA('GCT')]
    observed = list(iter(TabularMSA(seqs)))
    self.assertEqual(observed, seqs)
def test_dtype(self):
    # dtype is None for an empty MSA and the sequence class otherwise.
    empty_dtype = TabularMSA([]).dtype
    self.assertIsNone(empty_dtype)

    protein_dtype = TabularMSA([Protein('')]).dtype
    self.assertIs(protein_dtype, Protein)

    # Assigning to or deleting dtype must raise AttributeError.
    with self.assertRaises(AttributeError):
        setattr(TabularMSA([]), 'dtype', DNA)

    with self.assertRaises(AttributeError):
        delattr(TabularMSA([]), 'dtype')
def test_init_matrices_sw(self):
    # For a 3-column and a 4-column alignment, _init_matrices_sw is
    # expected to return a 5x4 all-zero score matrix and a traceback
    # matrix with 0 along the first row and column and -1 elsewhere.
    expected_score_m = np.zeros((5, 4))
    expected_tback_m = [[0, 0, 0, 0]]
    expected_tback_m.extend([0, -1, -1, -1] for _ in range(4))

    aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
    aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])
    actual_score_m, actual_tback_m = _init_matrices_sw(aln1, aln2, 5, 2)

    np.testing.assert_array_equal(actual_score_m, expected_score_m)
    np.testing.assert_array_equal(actual_tback_m, expected_tback_m)
def test_constructor_no_metadata(self):
    # An MSA built without a metadata argument reports no metadata, even
    # when its sequences carry their own metadata.
    sequence_lists = (
        [],
        [DNA('', metadata={'id': 42})],
        [DNA('AGC', metadata={'id': 42}),
         DNA('---', metadata={'id': 43})],
    )
    for seqs in sequence_lists:
        self.assertFalse(TabularMSA(seqs).has_metadata())
def test_has_metadata(self):
    # has_metadata() is False when metadata is missing or empty and
    # True for any non-empty metadata dict.
    msa = TabularMSA([])
    self.assertFalse(msa.has_metadata())
    # Handles metadata efficiently: the query must not create the
    # private _metadata attribute's value.
    self.assertIsNone(msa._metadata)

    self.assertFalse(TabularMSA([], metadata={}).has_metadata())

    for non_empty in ({'': ''}, {'foo': 42}):
        self.assertTrue(
            TabularMSA([], metadata=non_empty).has_metadata())
def test_sort_reverse_sorted(self):
    # Start from keys in the opposite order to the requested sort; the
    # rows must end up as A, G, T with the listed keys. Each case is
    # (initial keys, sort arguments, expected keys).
    cases = (
        ([3, 2, 1], {}, [1, 2, 3]),
        ([1, 2, 3], {'reverse': True}, [3, 2, 1]),
    )
    for initial_keys, sort_kwargs, expected_keys in cases:
        msa = TabularMSA([DNA('T'), DNA('G'), DNA('A')],
                         keys=initial_keys)
        msa.sort(**sort_kwargs)
        expected = TabularMSA([DNA('A'), DNA('G'), DNA('T')],
                              keys=expected_keys)
        self.assertEqual(msa, expected)
def test_constructor_non_empty_with_keys(self):
    # Build a 3x3 MSA keyed by a callable, then one keyed by an explicit
    # iterator, and check dtype, shape, keys and iteration order.
    seqs = [DNA(chars) for chars in ('ACG', 'CGA', 'GTT')]

    keyed_by_callable = TabularMSA(seqs, key=str)
    self.assertIs(keyed_by_callable.dtype, DNA)
    self.assertEqual(keyed_by_callable.shape, (3, 3))
    npt.assert_array_equal(keyed_by_callable.keys,
                           np.array(['ACG', 'CGA', 'GTT']))
    self.assertEqual(list(keyed_by_callable), seqs)

    keyed_by_iterator = TabularMSA(seqs, keys=iter([42, 43, 44]))
    npt.assert_array_equal(keyed_by_iterator.keys,
                           np.array([42, 43, 44]))
def test_init_matrices_nw(self):
    # For a 3-column and a 4-column alignment with arguments 5 and 2,
    # check the initial score and traceback matrices returned by
    # _init_matrices_nw against hand-written expectations.
    first_column_scores = (-5, -7, -9, -11)
    expected_score_m = [[0, -5, -7, -9]]
    expected_score_m.extend([score, 0, 0, 0]
                            for score in first_column_scores)

    expected_tback_m = [[0, 3, 3, 3]]
    expected_tback_m.extend([2, -1, -1, -1] for _ in range(4))

    aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
    aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])
    actual_score_m, actual_tback_m = _init_matrices_nw(aln1, aln2, 5, 2)

    np.testing.assert_array_equal(actual_score_m, expected_score_m)
    np.testing.assert_array_equal(actual_tback_m, expected_tback_m)
def test_constructor_with_metadata(self):
    # Metadata passed to the constructor is exposed via .metadata,
    # whatever sequences the MSA holds.
    sequence_lists = (
        [],
        [DNA('', metadata={'id': 42})],
        [DNA('AGC'), DNA('---')],
    )
    for seqs in sequence_lists:
        msa = TabularMSA(seqs, metadata={'foo': 'bar'})
        self.assertEqual(msa.metadata, {'foo': 'bar'})
def test_local_pairwise_align_nucleotide(self):
    # Checks local_pairwise_align_nucleotide on plain sequences with two
    # gap penalty settings, on sequences with metadata, and on inputs
    # that must be rejected with TypeError.
    match_scoring = dict(match_score=5, mismatch_score=-4)
    strict_gaps = dict(gap_open_penalty=10., gap_extend_penalty=5.)

    # Plain sequences, lenient gap penalties.
    obs_msa, obs_score, obs_start_end = local_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=5., gap_extend_penalty=0.5, **match_scoring)
    expected = TabularMSA([DNA("ACCTTGACCAGGTACC"),
                           DNA("ACTTTGAC---GTAAC")])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 41.0)
    self.assertEqual(obs_start_end, [(1, 16), (2, 14)])

    # Plain sequences, strict gap penalties.
    obs_msa, obs_score, obs_start_end = local_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=10., gap_extend_penalty=5., **match_scoring)
    expected = TabularMSA([DNA("ACCTTGAC"), DNA("ACTTTGAC")])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 31.0)
    self.assertEqual(obs_start_end, [(1, 8), (2, 9)])

    # DNA sequences with metadata
    obs_msa, obs_score, obs_start_end = local_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
        gap_open_penalty=10., gap_extend_penalty=5., **match_scoring)
    expected = TabularMSA([DNA("ACCTTGAC", metadata={'id': "s1"}),
                           DNA("ACTTTGAC", metadata={'id': "s2"})])
    self.assertEqual(obs_msa, expected)
    self.assertEqual(obs_score, 31.0)
    self.assertEqual(obs_start_end, [(1, 8), (2, 9)])

    # Fails when either input is passed as a TabularMSA. Operands are
    # built before the guarded call so only the alignment call is
    # covered by the TypeError expectation.
    msa_operand = TabularMSA(
        [DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"})])
    seq_operand = DNA("GAACTTTGACGTAAC", metadata={'id': "s2"})
    with self.assertRaises(TypeError):
        local_pairwise_align_nucleotide(
            msa_operand, seq_operand, **strict_gaps, **match_scoring)

    seq_operand = DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"})
    msa_operand = TabularMSA(
        [DNA("GAACTTTGACGTAAC", metadata={'id': "s2"})])
    with self.assertRaises(TypeError):
        local_pairwise_align_nucleotide(
            seq_operand, msa_operand, **strict_gaps, **match_scoring)

    # TypeError on invalid input
    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        local_pairwise_align_nucleotide(42, valid)

    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        local_pairwise_align_nucleotide(valid, 42)