def test_keys_setter_non_empty(self):
    # Keys can be assigned to an MSA that starts without any, and then
    # replaced by a second assignment.
    alignment = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')])
    self.assertFalse(alignment.has_keys())

    assignments = (
        (range(3), [0, 1, 2]),
        (range(3, 6), [3, 4, 5]),
    )
    for new_keys, expected in assignments:
        alignment.keys = new_keys
        npt.assert_array_equal(alignment.keys, np.array(expected))
def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    # One sequence is about 3x the length of the other. Toggling
    # penalize_terminal_gaps is expected to yield different alignments
    # and different alignment scores.
    short_seq = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
    long_seq = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
    scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                   match_score=5, mismatch_score=-4)

    # terminal gaps not penalized
    msa_unpenalized, score_unpenalized, _ = \
        global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=False, **scoring)
    expected_unpenalized = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTTG-------------------------"),
        DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA"),
    ])
    self.assertEqual(msa_unpenalized, expected_unpenalized)
    self.assertEqual(score_unpenalized, 131.0)

    # terminal gaps penalized
    msa_penalized, score_penalized, _ = \
        global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=True, **scoring)
    expected_penalized = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTT-------------------------G"),
        DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA"),
    ])
    self.assertEqual(msa_penalized, expected_penalized)
    self.assertEqual(score_penalized, 97.0)
def test_constructor_not_monomorphic(self):
    # Mixing element types in one MSA must raise a TypeError that names
    # the offending types.
    dna_then_rna = [DNA(''), RNA('')]
    with six.assertRaisesRegex(self, TypeError,
                               'mixed types.*RNA.*DNA'):
        TabularMSA(dna_then_rna)

    proteins_with_float = [Protein(''), Protein(''), 42.0, Protein('')]
    with six.assertRaisesRegex(self, TypeError,
                               'mixed types.*float.*Protein'):
        TabularMSA(proteins_with_float)
def test_to_dict_non_empty(self):
    # to_dict() maps each key (taken from the 'id' metadata) to its
    # sequence.
    first = Protein('PAW', metadata={'id': 42})
    second = Protein('WAP', metadata={'id': -999})
    alignment = TabularMSA([first, second], key='id')

    expected = {42: first, -999: second}
    self.assertEqual(alignment.to_dict(), expected)
def test_sort_on_unorderable_msa_keys(self):
    # Sorting on MSA keys that cannot be ordered raises TypeError and
    # leaves the MSA as it was.
    bad_key = Unorderable()
    alignment = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, bad_key])

    with self.assertRaises(TypeError):
        alignment.sort()

    untouched = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, bad_key])
    self.assertEqual(alignment, untouched)
def test_constructor_invalid_dtype(self):
    # Elements that are not alphabet-aware sequences are rejected with a
    # TypeError mentioning the bad type.
    plain_sequence_only = [Sequence('')]
    with six.assertRaisesRegex(self, TypeError,
                               'sequence.*alphabet.*Sequence'):
        TabularMSA(plain_sequence_only)

    int_then_dna = [42, DNA('')]
    with six.assertRaisesRegex(self, TypeError,
                               'sequence.*alphabet.*int'):
        TabularMSA(int_then_dna)
def test_sort_on_invalid_key(self):
    # Sorting on a metadata key the sequences do not have raises
    # KeyError and leaves the MSA as it was.
    alignment = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, 43])

    with self.assertRaises(KeyError):
        alignment.sort(key='id')

    untouched = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, 43])
    self.assertEqual(alignment, untouched)
def test_sort_multiple_sequences_no_msa_keys_on_metadata_key(self):
    # Sort an MSA without keys on the 'id' metadata of its sequences,
    # in both directions.
    def build(records):
        return TabularMSA([RNA(chars, metadata={'id': seq_id})
                           for chars, seq_id in records])

    unsorted = [('UCA', 41), ('AAA', 44), ('GAC', -1), ('GAC', 42)]

    alignment = build(unsorted)
    alignment.sort(key='id')
    self.assertEqual(
        alignment,
        build([('GAC', -1), ('UCA', 41), ('GAC', 42), ('AAA', 44)]))

    alignment = build(unsorted)
    alignment.sort(key='id', reverse=True)
    self.assertEqual(
        alignment,
        build([('AAA', 44), ('GAC', 42), ('UCA', 41), ('GAC', -1)]))
def test_eq_handles_missing_metadata_efficiently(self):
    # Comparing two MSAs that have no metadata must not populate the
    # private _metadata attribute on either of them.
    left = TabularMSA([DNA('ACGT')])
    right = TabularMSA([DNA('ACGT')])

    self.assertReallyEqual(left, right)

    for alignment in (left, right):
        self.assertIsNone(alignment._metadata)
def test_sort_multiple_sequences_with_msa_keys_on_metadata_key(self):
    # Sort on the '#' metadata of the sequences; the MSA keys are
    # expected to be reordered together with their sequences.
    def build(records, keys):
        seqs = [DNA(chars, metadata={'#': number})
                for chars, number in records]
        return TabularMSA(seqs, keys=keys)

    unsorted = [('TCA', 41.2), ('AAA', 44.5), ('GAC', 42.999)]

    alignment = build(unsorted, [None, ('hello', 'world'), True])
    alignment.sort(key='#')
    self.assertEqual(
        alignment,
        build([('TCA', 41.2), ('GAC', 42.999), ('AAA', 44.5)],
              [None, True, ('hello', 'world')]))

    alignment = build(unsorted, [None, ('hello', 'world'), True])
    alignment.sort(key='#', reverse=True)
    self.assertEqual(
        alignment,
        build([('AAA', 44.5), ('GAC', 42.999), ('TCA', 41.2)],
              [('hello', 'world'), True, None]))
def test_local_pairwise_align_protein(self):
    # Local protein alignment under two gap-penalty settings, with
    # metadata-bearing inputs, and with inputs that must be rejected.
    high_penalties = dict(gap_open_penalty=10., gap_extend_penalty=5.)
    low_penalties = dict(gap_open_penalty=5., gap_extend_penalty=0.5)

    msa, score, start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), **high_penalties)
    self.assertEqual(msa, TabularMSA([Protein("AWGHE"),
                                      Protein("AW-HE")]))
    self.assertEqual(score, 26.0)
    self.assertEqual(start_end, [(4, 8), (1, 4)])

    msa, score, start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), **low_penalties)
    self.assertEqual(msa, TabularMSA([Protein("AWGHE-E"),
                                      Protein("AW-HEAE")]))
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, [(4, 9), (1, 6)])

    # Protein sequences with metadata
    msa, score, start_end = local_pairwise_align_protein(
        Protein("HEAGAWGHEE", metadata={'id': "s1"}),
        Protein("PAWHEAE", metadata={'id': "s2"}),
        **high_penalties)
    self.assertEqual(msa, TabularMSA([Protein("AWGHE"),
                                      Protein("AW-HE")]))
    self.assertEqual(score, 26.0)
    self.assertEqual(start_end, [(4, 8), (1, 4)])

    # Fails when either input is passed as a TabularMSA
    msa_first = TabularMSA([Protein("HEAGAWGHEE", metadata={'id': "s1"})])
    seq_second = Protein("PAWHEAE", metadata={'id': "s2"})
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(msa_first, seq_second,
                                     **high_penalties)

    seq_first = Protein("HEAGAWGHEE", metadata={'id': "s1"})
    msa_second = TabularMSA([Protein("PAWHEAE", metadata={'id': "s2"})])
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(seq_first, msa_second,
                                     **high_penalties)

    # TypeError on invalid input
    valid = Protein("HEAGAWGHEE")
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(42, valid)

    valid = Protein("HEAGAWGHEE")
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(valid, 42)
def test_sort_single_sequence_with_msa_keys_on_callable_key(self):
    # A single-sequence MSA with keys is unchanged by sorting on a
    # callable, with or without reverse=True.
    for extra_kwargs in ({}, {'reverse': True}):
        alignment = TabularMSA([RNA('UCA')], keys=['foo'])
        alignment.sort(key=str, **extra_kwargs)
        self.assertEqual(alignment,
                         TabularMSA([RNA('UCA')], keys=['foo']))
def test_sort_empty_on_msa_keys(self):
    # Sorting an empty keyed MSA on its keys leaves it empty, with or
    # without reverse=True.
    for extra_kwargs in ({}, {'reverse': True}):
        alignment = TabularMSA([], keys=[])
        alignment.sort(**extra_kwargs)
        self.assertEqual(alignment, TabularMSA([], keys=[]))
def test_sort_single_sequence_on_msa_keys(self):
    # A single-sequence MSA is unchanged by sorting on its keys, with or
    # without reverse=True.
    for extra_kwargs in ({}, {'reverse': True}):
        alignment = TabularMSA([DNA('ACGT')], keys=[42])
        alignment.sort(**extra_kwargs)
        self.assertEqual(alignment,
                         TabularMSA([DNA('ACGT')], keys=[42]))
def test_sort_empty_with_msa_keys_on_metadata_key(self):
    # Sorting an empty keyed MSA on a metadata key leaves it empty, with
    # or without reverse=True.
    for extra_kwargs in ({}, {'reverse': True}):
        alignment = TabularMSA([], keys=[])
        alignment.sort(key='id', **extra_kwargs)
        self.assertEqual(alignment, TabularMSA([], keys=[]))
def test_global_pairwise_align_dtype_mismatch(self):
    # Aligning DNA input (a sequence or an MSA) against an RNA MSA must
    # raise a TypeError reporting the dtype mismatch.
    for dna_input in (DNA('ACGT'), TabularMSA([DNA('ACGT')])):
        with self.assertRaisesRegex(TypeError,
                                    r"same dtype: 'DNA' != 'RNA'"):
            global_pairwise_align(dna_input,
                                  TabularMSA([RNA('ACGU')]),
                                  1.0, 1.0, {})
def test_sort_single_sequence_no_msa_keys_on_metadata_key(self):
    # A single-sequence MSA without keys is unchanged by sorting on a
    # metadata key, with or without reverse=True.
    def single():
        return TabularMSA([RNA('UCA', metadata={'id': 42})])

    for extra_kwargs in ({}, {'reverse': True}):
        alignment = single()
        alignment.sort(key='id', **extra_kwargs)
        self.assertEqual(alignment, single())
def test_sort_multiple_sequences_no_msa_keys_on_callable_key(self):
    # Sort an MSA without keys using str() of each sequence as the sort
    # key, in both directions.
    def build(*chars):
        return TabularMSA([RNA(c) for c in chars])

    ascending = build('UCC', 'UCG', 'UCA')
    ascending.sort(key=str)
    self.assertEqual(ascending, build('UCA', 'UCC', 'UCG'))

    descending = build('UCC', 'UCG', 'UCA')
    descending.sort(key=str, reverse=True)
    self.assertEqual(descending, build('UCG', 'UCC', 'UCA'))
def test_sort_empty_with_msa_keys_on_callable_key(self):
    # Sorting an empty keyed MSA with a callable key leaves it empty,
    # with or without reverse=True.
    for extra_kwargs in ({}, {'reverse': True}):
        alignment = TabularMSA([], keys=[])
        alignment.sort(key=str, **extra_kwargs)
        self.assertEqual(alignment, TabularMSA([], keys=[]))
def test_compute_score_and_traceback_matrices_invalid(self):
    # If a sequence contains a character that is not in the substitution
    # matrix, an informative error should be raised.
    substitution_matrix = make_identity_substitution_matrix(2, -1)
    with_unknown_char = TabularMSA([DNA('AWG', metadata={'id': 'id'})])
    reference = TabularMSA([DNA('ACGT', metadata={'id': 'id'})])

    with self.assertRaises(ValueError):
        _compute_score_and_traceback_matrices(
            with_unknown_char, reference, 5, 2, substitution_matrix)
def test_global_pairwise_align_nucleotide(self):
    # Global nucleotide alignment with two gap-open penalties, with
    # metadata-bearing inputs, with an MSA as one input, and with
    # inputs that must be rejected.
    base_scoring = dict(gap_extend_penalty=0.5, match_score=5,
                        mismatch_score=-4)
    expected_start_end = [(0, 16), (0, 14)]

    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=5., **base_scoring)
    self.assertEqual(msa, TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                                      DNA("GAACTTTGAC---GTAAC")]))
    self.assertEqual(score, 41.0)
    self.assertEqual(start_end, expected_start_end)

    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=10., **base_scoring)
    self.assertEqual(msa, TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                                      DNA("GAACTTTGAC---GTAAC")]))
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, expected_start_end)

    # DNA sequences with metadata
    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
        gap_open_penalty=10., **base_scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, expected_start_end)

    # Align one DNA sequence and one TabularMSA, score computed manually
    two_row_msa = TabularMSA([
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GACCATGACCAGGTACC", metadata={'id': "s2"}),
    ])
    msa, score, start_end = global_pairwise_align_nucleotide(
        two_row_msa,
        DNA("GAACTTTGACGTAAC", metadata={'id': "s3"}),
        gap_open_penalty=10., **base_scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 27.5)
    self.assertEqual(start_end, expected_start_end)

    # TypeError on invalid input
    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(42, valid)

    valid = DNA("ACGT")
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(valid, 42)
def test_metadata_setter_invalid_type(self):
    # Assigning anything other than a dict to .metadata raises TypeError
    # and keeps the metadata that was already set.
    alignment = TabularMSA([Protein('PAW')], metadata={123: 456})
    not_dicts = (None, 0, 'a', ('f', 'o', 'o'), np.array([]),
                 pd.DataFrame())

    for candidate in not_dicts:
        with six.assertRaisesRegex(self, TypeError,
                                   'metadata must be a dict'):
            alignment.metadata = candidate
        self.assertEqual(alignment.metadata, {123: 456})
def test_dtype(self):
    # dtype is None for an empty MSA, the sequence class otherwise, and
    # can be neither assigned nor deleted.
    empty = TabularMSA([])
    self.assertIsNone(empty.dtype)

    proteins = TabularMSA([Protein('')])
    self.assertIs(proteins.dtype, Protein)

    assign_target = TabularMSA([])
    with self.assertRaises(AttributeError):
        assign_target.dtype = DNA

    delete_target = TabularMSA([])
    with self.assertRaises(AttributeError):
        del delete_target.dtype
def test_iter(self):
    # Iterating an empty MSA is immediately exhausted; otherwise the
    # sequences come back in construction order.
    empty_iterator = iter(TabularMSA([]))
    with self.assertRaises(StopIteration):
        next(empty_iterator)

    for seqs in ([DNA(''), DNA('')], [DNA('AAA'), DNA('GCT')]):
        self.assertEqual(list(iter(TabularMSA(seqs))), seqs)
def test_reversed(self):
    # reversed() on an empty MSA is immediately exhausted; otherwise the
    # sequences come back in reverse construction order.
    empty_reversed = reversed(TabularMSA([]))
    with self.assertRaises(StopIteration):
        next(empty_reversed)

    cases = (
        [DNA(''), DNA('', metadata={'id': 42})],
        [DNA('AAA'), DNA('GCT')],
    )
    for seqs in cases:
        backwards = seqs[::-1]
        self.assertEqual(list(reversed(TabularMSA(seqs))), backwards)
def test_constructor_no_metadata(self):
    # An MSA built without MSA-level metadata reports none, even when
    # its sequences carry metadata of their own.
    sequence_lists = (
        [],
        [DNA('', metadata={'id': 42})],
        [DNA('AGC', metadata={'id': 42}),
         DNA('---', metadata={'id': 43})],
    )
    for seqs in sequence_lists:
        self.assertFalse(TabularMSA(seqs).has_metadata())
def test_init_matrices_sw(self):
    # _init_matrices_sw is expected to return an all-zero 5x4 score
    # matrix and a traceback matrix whose first row and column are 0
    # with -1 everywhere else.
    score_expected = np.zeros((5, 4))
    traceback_expected = [[0, 0, 0, 0]] + [[0, -1, -1, -1]] * 4

    aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
    aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])
    score_actual, traceback_actual = _init_matrices_sw(aln1, aln2, 5, 2)

    np.testing.assert_array_equal(score_actual, score_expected)
    np.testing.assert_array_equal(traceback_actual, traceback_expected)
def test_reindex_key_and_keys_both_provided(self):
    # reindex() rejects being given both `key` and `keys`, and the
    # existing keys survive the failed call.
    alignment = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
    initial_keys = np.array(['ACGT', 'TGCA'])
    npt.assert_array_equal(alignment.keys, initial_keys)

    with six.assertRaisesRegex(self, ValueError, 'both.*key.*keys'):
        alignment.reindex(key=str, keys=['a', 'b'])

    # original state is maintained
    npt.assert_array_equal(alignment.keys, initial_keys)
def test_reindex_makes_copy_of_keys(self):
    # reindex(keys=...) stores its own read-only copy: the caller's
    # array stays writeable and later edits to it do not show up in
    # the MSA.
    alignment = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')])
    caller_keys = np.asarray([1, 2, 3])

    alignment.reindex(keys=caller_keys)
    npt.assert_array_equal(alignment.keys, np.array([1, 2, 3]))

    self.assertFalse(alignment.keys.flags.writeable)
    self.assertTrue(caller_keys.flags.writeable)

    caller_keys[1] = 42
    npt.assert_array_equal(alignment.keys, np.array([1, 2, 3]))
def test_has_metadata(self):
    # has_metadata() is False for missing or empty metadata and True
    # for any non-empty dict.
    bare = TabularMSA([])
    self.assertFalse(bare.has_metadata())
    # Handles metadata efficiently.
    self.assertIsNone(bare._metadata)

    self.assertFalse(TabularMSA([], metadata={}).has_metadata())

    for non_empty in ({'': ''}, {'foo': 42}):
        self.assertTrue(
            TabularMSA([], metadata=non_empty).has_metadata())
def test_keys_setter_non_hashable_keys(self):
    # Assigning non-hashable keys raises TypeError, and the existing
    # keys survive the failed assignment.
    alignment = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
    initial_keys = np.array(['ACGT', 'TGCA'])
    npt.assert_array_equal(alignment.keys, initial_keys)

    with self.assertRaises(TypeError):
        alignment.keys = [[42], [42]]

    # original state is maintained
    npt.assert_array_equal(alignment.keys, initial_keys)
def test_init_matrices_nw(self):
    # Check the initial score and traceback matrices that
    # _init_matrices_nw returns for a 3-long and a 4-long alignment
    # with the arguments 5 and 2.
    score_expected = [
        [0, -5, -7, -9],
        [-5, 0, 0, 0],
        [-7, 0, 0, 0],
        [-9, 0, 0, 0],
        [-11, 0, 0, 0],
    ]
    traceback_expected = [[0, 3, 3, 3]] + [[2, -1, -1, -1]] * 4

    aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
    aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])
    score_actual, traceback_actual = _init_matrices_nw(aln1, aln2, 5, 2)

    np.testing.assert_array_equal(score_actual, score_expected)
    np.testing.assert_array_equal(traceback_actual, traceback_expected)
def test_keys_setter_non_unique_keys(self):
    # Assigning duplicate keys raises UniqueError naming the duplicate,
    # and the existing keys survive the failed assignment.
    alignment = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
    initial_keys = np.array(['ACGT', 'TGCA'])
    npt.assert_array_equal(alignment.keys, initial_keys)

    with six.assertRaisesRegex(self, UniqueError,
                               'Duplicate keys:.*42'):
        alignment.keys = [42, 42]

    # original state is maintained
    npt.assert_array_equal(alignment.keys, initial_keys)
def test_from_dict_to_dict_roundtrip(self):
    # from_dict() followed by to_dict() gives back an equal dict whose
    # values are the very same sequence objects.
    empty = {}
    self.assertEqual(TabularMSA.from_dict(empty).to_dict(), empty)

    # can roundtrip even with mixed key types
    source = {'a': DNA('CAT'), 42: DNA('TAG')}
    roundtripped = TabularMSA.from_dict(source).to_dict()
    self.assertEqual(roundtripped, source)
    for key in ('a', 42):
        self.assertIs(source[key], roundtripped[key])
def test_reindex_keys_length_mismatch(self):
    # reindex() rejects a keys iterable whose length differs from the
    # number of sequences, and the existing keys survive.
    alignment = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
    initial_keys = np.array(['ACGT', 'TGCA'])
    npt.assert_array_equal(alignment.keys, initial_keys)

    no_keys = iter([])
    with six.assertRaisesRegex(
            self, ValueError,
            'Number.*keys.*number.*sequences: 0 != 2'):
        alignment.reindex(keys=no_keys)

    # original state is maintained
    npt.assert_array_equal(alignment.keys, initial_keys)
def test_sort_on_unorderable_key(self):
    # Sorting on metadata values that cannot be ordered raises
    # TypeError and leaves the MSA as it was.
    bad_value = Unorderable()

    def build():
        return TabularMSA([
            DNA('AAA', metadata={'id': 42}),
            DNA('ACG', metadata={'id': bad_value}),
        ], keys=[42, 43])

    alignment = build()
    with self.assertRaises(TypeError):
        alignment.sort(key='id')

    self.assertEqual(alignment, build())
def test_sort_on_key_with_all_repeats(self):
    # Every sequence has the same str() sort key; the MSA is expected
    # to keep its original order.
    def build():
        seqs = [DNA('TTT', metadata={'id': label})
                for label in ('a', 'b', 'c')]
        return TabularMSA(seqs, keys=range(3))

    alignment = build()
    alignment.sort(key=str)
    self.assertEqual(alignment, build())
def test_metadata_getter(self):
    # Reading .metadata populates the private _metadata (None before
    # the first read) and returns a dict that can be mutated in place.
    lazy = TabularMSA([])
    self.assertIsNone(lazy._metadata)
    self.assertEqual(lazy.metadata, {})
    self.assertIsNotNone(lazy._metadata)
    self.assertIsInstance(lazy.metadata, dict)

    populated = TabularMSA(
        [], metadata={42: 'foo', ('hello', 'world'): 43})
    self.assertEqual(populated.metadata,
                     {42: 'foo', ('hello', 'world'): 43})
    self.assertIsInstance(populated.metadata, dict)

    populated.metadata[42] = 'bar'
    self.assertEqual(populated.metadata,
                     {42: 'bar', ('hello', 'world'): 43})
def filter_positions(alignment_fh, maximum_gap_frequency,
                     maximum_position_entropy):
    """Filter gaps and high entropy positions from an alignment.

    The alignment is read from ``alignment_fh`` (used as a context
    manager around the read) as DNA first. If that read raises
    ``ValueError``, the handle is rewound with ``seek(0)`` and read again
    as RNA. The result is then passed through ``_filter_gap_positions``
    with ``maximum_gap_frequency`` and ``_filter_high_entropy_positions``
    with ``maximum_position_entropy``, in that order, and returned.
    """
    with alignment_fh:
        try:
            msa = TabularMSA.read(alignment_fh, constructor=DNA)
        except ValueError:
            # Not readable as DNA: rewind and retry as RNA.
            alignment_fh.seek(0)
            msa = TabularMSA.read(alignment_fh, constructor=RNA)

    without_gaps = _filter_gap_positions(msa, maximum_gap_frequency)
    return _filter_high_entropy_positions(without_gaps,
                                          maximum_position_entropy)
def test_metadata_setter_makes_shallow_copy(self):
    # The setter copies the dict itself (rebinding a key in the source
    # is not seen) but shares the values (mutating a contained list is).
    alignment = TabularMSA([RNA('-.-'), RNA('.-.')])
    source = {'foo': 'bar', 42: []}

    alignment.metadata = source
    self.assertEqual(alignment.metadata, source)
    self.assertIsNot(alignment.metadata, source)

    source['foo'] = 'baz'
    self.assertEqual(alignment.metadata, {'foo': 'bar', 42: []})

    source[42].append(True)
    self.assertEqual(alignment.metadata, {'foo': 'bar', 42: [True]})
def test_reformat_treepuzzle(self):
    """Test functionality of reformat_treepuzzle()."""
    species_tree = TreeNode.read(self.species_tree_fp, format='newick')
    gene_tree = TreeNode.read(self.gene_tree_3_fp, format='newick')
    trees_out_fp = join(self.working_dir, "joined_trees.nwk")
    msa_out_fp = join(self.working_dir, "gene_tree_3.phy")

    reformat_treepuzzle(gene_tree,
                        species_tree,
                        self.msa_fa_3_fp,
                        trees_out_fp,
                        msa_out_fp)

    # Expected content of the joined-trees file: two newick lines.
    expected_tree_lines = [
        "(((((((SE001:2.1494877,SE010:1.08661):3.7761166,SE008:"
        "0.86305436):0.21024487,(SE006:0.56704221,SE009:0.5014676):"
        "0.90294223):0.20542323,SE005:3.0992506):0.37145632,SE004:"
        "1.8129133):0.72933621,SE003:1.737411):0.24447835,(SE002:"
        "1.6606127,SE007:0.70000178):1.6331374);\n",
        "(((((((SE001:2.1494876,SE010:2.1494876):"
        "3.7761166,SE008:5.9256042):0.2102448,(SE006:"
        "5.2329068,SE009:5.2329068):0.9029422):0.2054233,"
        "SE005:6.3412723):0.3714563,SE004:6.7127286):"
        "0.7293362,SE003:7.4420648):0.2444784,SE002:"
        "7.6865432);\n"]
    with open(trees_out_fp, 'r') as trees_file:
        actual_tree_lines = trees_file.readlines()
    self.assertListEqual(expected_tree_lines, actual_tree_lines)

    # The written alignment is read back as Protein and its index
    # labels are compared.
    written_msa = TabularMSA.read(msa_out_fp, constructor=Protein)
    expected_labels = [u'SE001', u'SE002', u'SE003', u'SE004', u'SE005',
                       u'SE006', u'SE008', u'SE009', u'SE010']
    actual_labels = list(written_msa.index)
    self.assertListEqual(expected_labels, actual_labels)
def test_sort_on_key_with_some_repeats(self):
    # Several sequences share the same 'id'; the expected result keeps
    # them in their original relative order (keys 0, 1, 4).
    def build(records, keys):
        seqs = [DNA(chars, metadata={'id': seq_id})
                for chars, seq_id in records]
        return TabularMSA(seqs, keys=keys)

    alignment = build(
        [('TCCG', 10), ('TAGG', 10), ('GGGG', 8), ('ACGT', 0),
         ('TAGG', 10)],
        range(5))
    alignment.sort(key='id')

    expected = build(
        [('ACGT', 0), ('GGGG', 8), ('TCCG', 10), ('TAGG', 10),
         ('TAGG', 10)],
        [3, 2, 0, 1, 4])
    self.assertEqual(alignment, expected)
def aln_distmat(alignment, reps=3):
    """Calculate pairwise distances from a MSA of genomes.

    ``alignment`` is read as a DNA ``TabularMSA`` and re-indexed with the
    ``"id"`` minter. A ``DistanceMatrix`` is then built from the
    ``values`` of every sequence using the ``hamming`` metric, keyed by
    the MSA index, and returned.

    ``reps`` is accepted but not used by this function.
    """
    msa = TabularMSA.read(alignment, constructor=DNA)
    msa.reassign_index(minter="id")

    sequence_values = [record.values for record in msa]
    return DistanceMatrix.from_iterable(sequence_values,
                                        metric=hamming,
                                        keys=msa.index)
def filter_positions(alignment_fh, maximum_gap_frequency,
                     maximum_position_entropy):
    """Filter gaps and high entropy positions from an alignment.

    The alignment is read from ``alignment_fh`` as DNA, passed through
    ``_filter_gap_positions`` with ``maximum_gap_frequency`` and then
    through ``_filter_high_entropy_positions`` with
    ``maximum_position_entropy``. The filtered alignment is returned.
    """
    msa = TabularMSA.read(alignment_fh, constructor=DNA)
    without_gaps = _filter_gap_positions(msa, maximum_gap_frequency)
    return _filter_high_entropy_positions(without_gaps,
                                          maximum_position_entropy)
def test_keys_update_subset_of_keys(self):
    # keys can be copied, modified, then re-set; the MSA keeps its own
    # read-only copy that later edits to the caller's array don't touch
    alignment = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')], key=str)
    npt.assert_array_equal(alignment.keys, np.array(['AC', 'AG', 'AT']))

    edited = alignment.keys.copy()
    edited[1] = 42
    alignment.keys = edited

    expected = np.array(['AC', 42, 'AT'], dtype=object)
    npt.assert_array_equal(alignment.keys, expected)

    self.assertFalse(alignment.keys.flags.writeable)
    self.assertTrue(edited.flags.writeable)

    edited[1] = 'GG'
    npt.assert_array_equal(alignment.keys,
                           np.array(['AC', 42, 'AT'], dtype=object))
def test_from_dict_multiple_sequences(self):
    # from_dict() uses the dict keys as MSA keys.
    by_key = {1: DNA('ACG'), 2: DNA('GGG'), 3: DNA('TAG')}
    alignment = TabularMSA.from_dict(by_key)

    # Sort because order is arbitrary.
    alignment.sort()

    expected = TabularMSA([DNA('ACG'), DNA('GGG'), DNA('TAG')],
                          keys=[1, 2, 3])
    self.assertEqual(alignment, expected)
def test_metadata_setter(self):
    # Setting a non-empty dict makes has_metadata() True; setting an
    # empty dict makes it False again.
    alignment = TabularMSA([DNA('A-A'), DNA('A-G')])
    self.assertFalse(alignment.has_metadata())

    greeting = {'hello': 'world'}
    alignment.metadata = greeting
    self.assertTrue(alignment.has_metadata())
    self.assertEqual(alignment.metadata, {'hello': 'world'})

    alignment.metadata = {}
    self.assertFalse(alignment.has_metadata())
def test_has_keys(self):
    # has_keys() is False when no key/keys were given at construction
    # and True otherwise, whatever the number of sequences.
    for seqs in ([], [DNA('')], [DNA('ACG'), DNA('GCA')]):
        self.assertFalse(TabularMSA(seqs).has_keys())

    for seqs in ([], [DNA('')]):
        self.assertTrue(TabularMSA(seqs, key=str).has_keys())

    keyed_on_metadata = TabularMSA(
        [DNA('ACG', metadata={'id': 1}),
         DNA('GCA', metadata={'id': 2})],
        key='id')
    self.assertTrue(keyed_on_metadata.has_keys())

    # reindex() with a key adds keys; reindex() with no arguments
    # removes them again.
    alignment = TabularMSA([])
    self.assertFalse(alignment.has_keys())
    alignment.reindex(key=str)
    self.assertTrue(alignment.has_keys())
    alignment.reindex()
    self.assertFalse(alignment.has_keys())