def setUp(self):
    self.taxon_set1_data_paths = [
        pathmap.tree_source_path("pythonidae.annotated.nexml"),
        pathmap.char_source_path("pythonidae_continuous.chars.nexml"),
        pathmap.tree_source_path("pythonidae.annotated.nexml"),
        pathmap.char_source_path("pythonidae_continuous.chars.nexml"),
    ]
    self.taxon_set1_len = 33
    self.taxon_set2_data_paths = [
        pathmap.tree_source_path("treebase_s373.xml"),
    ]

def testBoundTaxonNamespaceDefault(self):
    d = dendropy.DataSet()
    t = dendropy.TaxonNamespace()
    d.attach_taxon_namespace(t)
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertIs(d.taxon_namespaces[0], d.attached_taxon_namespace)
    d.read(
        path=pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        schema="nexus")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(path=pathmap.tree_source_path('pythonidae.mle.nex'), schema="nexus")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(
        path=pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        schema="newick")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(
        path=pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        schema="fasta",
        data_type="protein")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 147)

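# Illustrative sketch (an assumption, not part of the original test module):
# the attach/read/detach workflow exercised by the tests above. While a
# TaxonNamespace is attached, every source read into the DataSet is mapped
# onto that namespace; after detaching, later reads get their own namespaces.
# The ``paths_and_schemas`` argument is a hypothetical helper parameter.
def _example_attached_namespace_reads(paths_and_schemas):
    import dendropy
    ds = dendropy.DataSet()
    tns = dendropy.TaxonNamespace()
    ds.attach_taxon_namespace(tns)
    for path, schema in paths_and_schemas:
        ds.read(path=path, schema=schema)
    ds.detach_taxon_namespace()
    return ds
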
def testAttachTaxonNamespaceOnGet(self):
    t = dendropy.TaxonNamespace()
    d = dendropy.DataSet.get_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus",
        taxon_namespace=t)
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertIsNot(d.attached_taxon_namespace, None)
    self.assertIs(d.taxon_namespaces[0], d.attached_taxon_namespace)
    self.assertIs(d.attached_taxon_namespace, t)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(path=pathmap.tree_source_path('pythonidae.mle.nex'), schema="nexus")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(
        path=pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        schema="newick")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.detach_taxon_namespace()
    d.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        schema="fasta",
        data_type="protein")
    self.assertEqual(len(d.taxon_namespaces), 2)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    self.assertEqual(len(d.taxon_namespaces[1]), 114)

def test_basic_read(self):
    src_filename = "standard-test-chars-multiple-char-blocks.1.basic.nexus"
    src_path = pathmap.char_source_path(src_filename)
    ds = dendropy.DataSet()
    result = ds.read(path=src_path, schema="nexus")
    self.assertEqual(result, (1, 0, 4))
    self.verify_dataset(ds)

class SinglePopTest(dendropytest.ExtendedTestCase):

    data = dendropy.DnaCharacterMatrix.get_from_path(
        pathmap.char_source_path('COII_Apes.nex'), schema="nexus")

    def test_num_segregating_sites(self):
        self.assertEqual(
            popgenstat.num_segregating_sites(self.data, ignore_uncertain=True),
            183)

    def test_average_number_of_pairwise_differences(self):
        self.assertAlmostEqual(
            popgenstat.average_number_of_pairwise_differences(
                self.data, ignore_uncertain=True),
            62.75000, 4)

    def test_nucleotide_diversity(self):
        self.assertAlmostEqual(
            popgenstat.nucleotide_diversity(self.data, ignore_uncertain=True),
            0.09174, 4)

    def test_tajimas_d(self):
        self.assertAlmostEqual(
            popgenstat.tajimas_d(self.data, ignore_uncertain=True),
            1.12467, 4)

    def test_wattersons_theta(self):
        self.assertAlmostEqual(
            popgenstat.wattersons_theta(self.data, ignore_uncertain=True),
            49.00528, 4)

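# Illustrative sketch (an assumption, not part of the original test module):
# computing the same single-population summary statistics directly on an
# alignment, using the popgenstat calls verified above. The default file name
# is a placeholder; the ``dendropy.calculate.popgenstat`` import path follows
# the DendroPy 4 layout.
def _example_single_population_summary(path="my_alignment.nex"):
    import dendropy
    from dendropy.calculate import popgenstat
    seqs = dendropy.DnaCharacterMatrix.get_from_path(path, schema="nexus")
    return {
        "num_segregating_sites": popgenstat.num_segregating_sites(seqs, ignore_uncertain=True),
        "nucleotide_diversity": popgenstat.nucleotide_diversity(seqs, ignore_uncertain=True),
        "tajimas_d": popgenstat.tajimas_d(seqs, ignore_uncertain=True),
        "wattersons_theta": popgenstat.wattersons_theta(seqs, ignore_uncertain=True),
    }
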
def testBindAndUnbind(self):
    d = dendropy.DataSet(attach_taxon_set=True)
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertIs(d.taxon_sets[0], d.attached_taxon_set)
    d.read_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus")
    _LOG.info(d.taxon_sets[0].description(2))
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(pathmap.tree_source_path('pythonidae.mle.nex'), "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(
        pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        "newick")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.detach_taxon_set()
    d.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 2)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    self.assertEqual(len(d.taxon_sets[1]), 114)

def testBindToSpecifiedTaxonSet(self):
    d = dendropy.DataSet()
    t = dendropy.TaxonSet()
    d.attach_taxon_set(t)
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertIs(d.taxon_sets[0], d.attached_taxon_set)
    self.assertIs(d.attached_taxon_set, t)
    d.read_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(pathmap.tree_source_path('pythonidae.mle.nex'), "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(
        pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        "newick")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.detach_taxon_set()
    d.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 2)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    self.assertEqual(len(d.taxon_sets[1]), 114)

def verify_subsets(self, src_filename, expected_sets):
    """
    ``src_filename`` -- name of file containing full data and charsets
        statement.
    ``expected_sets`` -- dictionary with keys = label of charset, and
        values = name of file with the subset of characters corresponding
        to the charset.
    """
    src_data = dendropy.DnaCharacterMatrix.get_from_path(
        pathmap.char_source_path(src_filename),
        'nexus')
    state_alphabet = src_data.default_state_alphabet
    self.assertEqual(len(src_data.character_subsets), len(expected_sets))
    for label, expected_data_file in expected_sets.items():
        _LOG.debug(label)
        self.assertTrue(label in src_data.character_subsets)
        result_subset = src_data.export_character_subset(label)
        expected_subset = dendropy.DnaCharacterMatrix.get_from_path(
            pathmap.char_source_path(expected_data_file),
            'nexus')
        # confirm subset is correct
        self.compare_distinct_char_matrix(
            result_subset,
            expected_subset,
            taxon_namespace_scoped=False,
        )
        # mutate new and confirm that old remains unchanged
        e1_symbols = src_data[0].symbols_as_string()
        r1 = result_subset[0]
        dummy_state = state_alphabet["A"]
        for idx in range(len(r1)):
            r1[idx].value = dummy_state
        self.assertEqual(e1_symbols, src_data[0].symbols_as_string())
        # mutate old and confirm that new remains unchanged
        r2_symbols = result_subset[1].symbols_as_string()
        e2 = src_data[1]
        dummy_state = state_alphabet["A"]
        for idx in range(len(e2)):
            e2[idx].value = dummy_state
        self.assertEqual(r2_symbols, result_subset[1].symbols_as_string())

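# Illustrative sketch (an assumption, not part of the original test module):
# exporting a character subset defined by a NEXUS charsets block as an
# independent matrix, mirroring the calls verified above. The file name and
# charset label are placeholders.
def _example_export_character_subset(path="data_with_charsets.nex", label="coding"):
    import dendropy
    full = dendropy.DnaCharacterMatrix.get_from_path(path, "nexus")
    # character subsets parsed from the charsets statement are keyed by label
    assert label in full.character_subsets
    subset = full.export_character_subset(label)
    return subset
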
def test_basic_nexus_chars(self):
    for src_filename, matrix_type, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        d1 = matrix_type.get_from_path(src_path, "nexus")
        s = self.write_out_validate_equal_and_return(
            d1, "nexus", {})
        d2 = matrix_type.get_from_string(s, "nexus")
        self.verify_char_matrix(d2, src_matrix_checker_type)

def test_get_single(self):
    for src_filename, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        ds = dendropy.DataSet.get_from_path(src_path, "nexus")
        self.assertEqual(len(ds.char_matrices), 1)
        self.assertEqual(len(ds.taxon_namespaces), 1)
        self.assertIs(ds.char_matrices[0].taxon_namespace, ds.taxon_namespaces[0])
        self.verify_char_matrix(ds.char_matrices[0], src_matrix_checker_type)

def test_basic_fasta_chars(self):
    for src_filename, matrix_type, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        d1 = matrix_type.get_from_path(src_path, "fasta")
        for wrap in (True, False):
            s = self.write_out_validate_equal_and_return(
                d1, "fasta", {"wrap": wrap})
            d2 = matrix_type.get_from_string(s, "fasta")
            self.verify_char_matrix(d2, src_matrix_checker_type)

def testAsStrReading(self):
    dataset = dendropy.DataSet(
        stream=open(pathmap.char_source_path("bad_names.fasta"), "rU"),
        schema='fasta',
        data_type='dna',
        row_type='str')
    taxon_set = dataset.taxon_sets[0]
    labels = [i.label for i in taxon_set]
    expected = [
        'a Bad name',
        'another',
        'a Badn,ame',
        'a nothe++-_=+r',
        'an!@#$o^&*()}{_ther',
    ]
    self.assertEqual(labels, expected)

def test_read_successive_unattached_taxon_namespace(self):
    ds = dendropy.DataSet()
    for src_idx, (src_filename, src_matrix_checker_type) in enumerate(self.__class__.srcs):
        src_path = pathmap.char_source_path(src_filename)
        result = ds.read(path=src_path, schema="nexus")
        self.assertEqual(result, (1, 0, 1))
        self.assertEqual(len(ds.char_matrices), src_idx + 1)
        self.assertEqual(len(ds.taxon_namespaces), src_idx + 1)
        self.assertIs(ds.char_matrices[src_idx].taxon_namespace, ds.taxon_namespaces[src_idx])
        self.verify_char_matrix(ds.char_matrices[src_idx], src_matrix_checker_type)

def test_read_single(self):
    for src_filename, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        ds = dendropy.DataSet()
        result = ds.read(path=src_path, schema="nexus")
        self.assertEqual(result, (1, 0, 1))
        self.assertEqual(len(ds.char_matrices), 1)
        self.assertEqual(len(ds.taxon_namespaces), 1)
        self.assertIs(ds.char_matrices[0].taxon_namespace, ds.taxon_namespaces[0])
        self.verify_char_matrix(ds.char_matrices[0], src_matrix_checker_type)

def test_basic_fasta(self):
    src_path = pathmap.char_source_path("standard-test-chars-rna.fasta")
    self.verify_get_from(
        matrix_type=dendropy.RnaCharacterMatrix,
        src_filepath=src_path,
        schema="fasta",
        factory_kwargs={},
        check_taxon_annotations=False,
        check_matrix_annotations=False,
        check_sequence_annotations=False,
        check_column_annotations=False,
        check_cell_annotations=False)

def test_basic_nexml_chars(self):
    for src_filename, matrix_type, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        d1 = matrix_type.get_from_path(src_path, "nexml")
        # for markup_as_sequences in (True, False):
        for markup_as_sequences in (False,):
            s = self.write_out_validate_equal_and_return(
                d1, "nexml", {"markup_as_sequences": markup_as_sequences})
            # if not markup_as_sequences:
            #     print(s)
            d2 = matrix_type.get_from_string(s, "nexml")
            self.verify_char_matrix(d2, src_matrix_checker_type)

def test_basic_fasta(self):
    src_path = pathmap.char_source_path("standard-test-chars-protein.fasta")
    self.verify_get_from(
        matrix_type=dendropy.ProteinCharacterMatrix,
        src_filepath=src_path,
        schema="fasta",
        factory_kwargs={},
        check_taxon_annotations=False,
        check_matrix_annotations=False,
        check_sequence_annotations=False,
        check_column_annotations=False,
        check_cell_annotations=False)

def testMultiTaxonNamespace(self):
    d = dendropy.DataSet()
    d.read(
        path=pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        schema="nexus")
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    d.read(path=pathmap.tree_source_path('pythonidae.mle.nex'), schema="nexus")
    self.assertEqual(len(d.taxon_namespaces), 2)
    self.assertEqual(len(d.taxon_namespaces[1]), 33)
    d.read(
        path=pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        schema="newick")
    self.assertEqual(len(d.taxon_namespaces), 3)
    self.assertEqual(len(d.taxon_namespaces[2]), 33)
    d.read(
        path=pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        schema="fasta",
        data_type="protein")
    self.assertEqual(len(d.taxon_namespaces), 4)
    self.assertEqual(len(d.taxon_namespaces[3]), 114)

def testMultiTaxonSet(self):
    d = dendropy.DataSet()
    d.read_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(pathmap.tree_source_path('pythonidae.mle.nex'), "nexus")
    self.assertEqual(len(d.taxon_sets), 2)
    self.assertEqual(len(d.taxon_sets[1]), 33)
    d.read_from_path(
        pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        "newick")
    self.assertEqual(len(d.taxon_sets), 3)
    self.assertEqual(len(d.taxon_sets[2]), 33)
    d.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 4)
    self.assertEqual(len(d.taxon_sets[3]), 114)

def test_basic_phylip_chars(self):
    for src_filename, matrix_type, src_matrix_checker_type in self.__class__.srcs:
        src_path = pathmap.char_source_path(src_filename)
        d1 = matrix_type.get_from_path(src_path, "phylip")
        for strict in (True, False):
            for spaces_to_underscores in (True, False):
                for force_unique_taxon_labels in (True, False):
                    s = self.write_out_validate_equal_and_return(
                        d1, "phylip", {
                            "strict": strict,
                            "spaces_to_underscores": spaces_to_underscores,
                            "force_unique_taxon_labels": force_unique_taxon_labels,
                        })
                    d2 = matrix_type.get_from_string(s, "phylip")
                    self.verify_char_matrix(d2, src_matrix_checker_type)

def testPScore(self):
    expected_scores = [
        370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370,
        370, 370, 370, 671, 670, 678, 687, 633, 675, 689, 668, 652, 644,
    ]
    dataset = dendropy.DataSet(
        stream=open(pathmap.char_source_path("apternodus.chars.nexus"), "rU"),
        schema='NEXUS')
    dataset.read(
        stream=open(pathmap.tree_source_path("apternodus.tre"), "rU"),
        schema='NEXUS',
        taxon_set=dataset.taxon_sets[0])
    char_mat = dataset.char_matrices[0]
    taxa_to_state_set_map = char_mat.create_taxon_to_state_set_map()
    tree_list = dataset.tree_lists[0]
    self.assertEqual(len(expected_scores), len(tree_list))
    for n, tree in enumerate(tree_list):
        node_list = tree.postorder_node_iter()
        pscore = fitch_down_pass(node_list, taxa_to_state_set_map=taxa_to_state_set_map)
        self.assertEqual(expected_scores[n], pscore)

def testBoundTaxonSetDefault(self):
    d = dendropy.DataSet(attach_taxon_set=True)
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertIs(d.taxon_sets[0], d.attached_taxon_set)
    d.read_from_path(
        pathmap.mixed_source_path('reference_single_taxonset_dataset.nex'),
        "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(pathmap.tree_source_path('pythonidae.mle.nex'), "nexus")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(
        pathmap.tree_source_path('pythonidae.reference-trees.newick'),
        "newick")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    d.read_from_path(
        pathmap.char_source_path('caenophidia_mos.chars.fasta'),
        "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertEqual(len(d.taxon_sets[0]), 147)

def test_basic_phylip(self):
    src_filenames = [
        "standard-test-chars-protein.relaxed.phylip",
    ]
    for src_idx, src_filename in enumerate(src_filenames):
        # print(src_idx, src_filename)
        src_path = pathmap.char_source_path(src_filename)
        self.verify_get_from(
            matrix_type=dendropy.ProteinCharacterMatrix,
            src_filepath=src_path,
            schema="phylip",
            factory_kwargs={},
            check_taxon_annotations=False,
            check_matrix_annotations=False,
            check_sequence_annotations=False,
            check_column_annotations=False,
            check_cell_annotations=False)

def test_basic_nexus(self):
    src_filenames = [
        "standard-test-chars-continuous.mesquite.nexus",
        "standard-test-chars-continuous.mesquite.interleaved.nexus",
    ]
    for src_idx, src_filename in enumerate(src_filenames):
        # print(src_idx, src_filename)
        src_path = pathmap.char_source_path(src_filename)
        self.verify_get_from(
            matrix_type=dendropy.ContinuousCharacterMatrix,
            src_filepath=src_path,
            schema="nexus",
            factory_kwargs={},
            check_taxon_annotations=False,
            check_matrix_annotations=False,
            check_sequence_annotations=False,
            check_column_annotations=False,
            check_cell_annotations=False)

def test_basic_phylip(self):
    src_filenames = [
        ("standard-test-chars-continuous.relaxed.phylip", {}),
        ("standard-test-chars-continuous.interleaved.phylip", {"interleaved": True}),
    ]
    for src_idx, (src_filename, kwargs) in enumerate(src_filenames):
        # print(src_idx, src_filename)
        src_path = pathmap.char_source_path(src_filename)
        self.verify_get_from(
            matrix_type=dendropy.ContinuousCharacterMatrix,
            src_filepath=src_path,
            schema="phylip",
            factory_kwargs=kwargs,
            check_taxon_annotations=False,
            check_matrix_annotations=False,
            check_sequence_annotations=False,
            check_column_annotations=False,
            check_cell_annotations=False)

def testPopulationPairSummaryStatistics(self):
    seqs = dendropy.DnaCharacterMatrix.get_from_path(
        pathmap.char_source_path('orti.nex'), schema="nexus")
    p1 = []
    p2 = []
    for idx, t in enumerate(seqs.taxon_namespace):
        if t.label.startswith('EPAC'):
            p1.append(seqs[t])
        else:
            p2.append(seqs[t])
    pp = popgenstat.PopulationPairSummaryStatistics(p1, p2)
    self.assertAlmostEqual(pp.average_number_of_pairwise_differences, 11.28063, 4)
    self.assertAlmostEqual(pp.average_number_of_pairwise_differences_between, 16.119047619, 4)
    self.assertAlmostEqual(pp.average_number_of_pairwise_differences_within, 10.2191697192, 4)
    self.assertAlmostEqual(pp.average_number_of_pairwise_differences_net, 5.89987789988, 4)
    self.assertEqual(pp.num_segregating_sites, 29)
    self.assertAlmostEqual(pp.wattersons_theta, 7.85734688643, 4)
    self.assertAlmostEqual(pp.tajimas_d, 1.65318627677, 4)
    self.assertAlmostEqual(pp.wakeleys_psi, 0.8034976, 2)

def test_basic_nexml(self):
    src_filenames = [
        "standard-test-chars-rna.as_cells.nexml",
        "standard-test-chars-rna.as_seqs.nexml",
    ]
    for src_idx, src_filename in enumerate(src_filenames):
        # print(src_idx, src_filename)
        src_path = pathmap.char_source_path(src_filename)
        self.verify_get_from(
            matrix_type=dendropy.RnaCharacterMatrix,
            src_filepath=src_path,
            schema="nexml",
            factory_kwargs={},
            check_taxon_annotations=False,
            check_matrix_annotations=False,
            check_sequence_annotations=False,
            check_column_annotations=False,
            check_cell_annotations=False)

def verify_pscores(self, char_fname, trees_fname, gaps_as_missing, expected_scores):
    dataset = dendropy.DataSet.get_from_path(
        pathmap.char_source_path(char_fname),
        "nexus")
    dataset.read_from_path(
        pathmap.tree_source_path(trees_fname),
        schema='NEXUS',
        taxon_namespace=dataset.taxon_namespaces[0])
    char_mat = dataset.char_matrices[0]
    # sa = char_mat.default_state_alphabet
    # for x in sa:
    #     print("{}: {}".format(x, x.is_gap_state))
    # for x in sa:
    #     print("{}\t{}\t{}\t\t\t\t{}".format(
    #         x, x._index, x.fundamental_indexes,
    #         x.fundamental_indexes_with_gaps_as_missing))
    taxon_state_sets_map = char_mat.taxon_state_sets_map(gaps_as_missing=gaps_as_missing)
    tree_list = dataset.tree_lists[0]
    self.assertEqual(len(expected_scores), len(tree_list))
    for n, tree in enumerate(tree_list):
        node_list = tree.postorder_node_iter()
        pscore = fitch_down_pass(node_list, taxon_state_sets_map=taxon_state_sets_map)
        # print("{} vs. {}".format(expected_scores[n], pscore))
        self.assertEqual(expected_scores[n], pscore)

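# Illustrative sketch (an assumption, not part of the original test module):
# the taxon_state_sets_map / fitch_down_pass workflow used by the helper
# above, outside the unittest harness. File names are placeholders, and the
# import path for fitch_down_pass (dendropy.model.parsimony in DendroPy 4)
# is assumed.
def _example_fitch_down_pass(chars_path="chars.nex", trees_path="trees.nex"):
    import dendropy
    from dendropy.model.parsimony import fitch_down_pass
    ds = dendropy.DataSet.get_from_path(chars_path, "nexus")
    ds.read_from_path(trees_path, schema="nexus", taxon_namespace=ds.taxon_namespaces[0])
    char_mat = ds.char_matrices[0]
    state_sets = char_mat.taxon_state_sets_map(gaps_as_missing=True)
    scores = []
    for tree in ds.tree_lists[0]:
        scores.append(
            fitch_down_pass(tree.postorder_node_iter(), taxon_state_sets_map=state_sets))
    return scores
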
def testFullCopyTaxonSet(self):
    src_path = pathmap.char_source_path("crotaphytus_bicinctores.cytb.aligned.nexml")
    d = dendropy.DataSet.get_from_path(src_path, "nexml")
    taxon_set1 = d.taxon_sets[0]
    taxon_set2 = taxon_set1.fullcopy()
    self.assertTrue(taxon_set1 is not taxon_set2)
    self.assertDistinctButEqualTaxonSet(taxon_set1, taxon_set2, distinct_taxon_objects=True)
    self.assertEqual(len(taxon_set1), len(taxon_set2))
    for idx, taxon1 in enumerate(taxon_set1):
        taxon2 = taxon_set2[idx]
        self.assertTrue(taxon1 is not taxon2)
        self.assertEqual(taxon1.label, taxon2.label)
        for a_idx, a1 in enumerate(taxon1.annotations):
            a2 = taxon2.annotations[a_idx]
            self.assertTrue(a1 is not a2)
            self.assertEqual(a1.name, a2.name)
            self.assertEqual(a1.value, a2.value)
    self.assertEqual(len(taxon_set1.annotations), len(taxon_set2.annotations))
    for idx, a1 in enumerate(taxon_set1.annotations):
        a2 = taxon_set2.annotations[idx]
        self.assertTrue(a1 is not a2)
        self.assertEqual(a1.name, a2.name)
        self.assertEqual(a1.value, a2.value)

def verify_pscores(self, trees_fname, chars_fname, matrix_type, gaps_as_missing,
        expected_scores, expected_per_site_scores):
    taxon_namespace = dendropy.TaxonNamespace()
    chars = matrix_type.get(
        path=pathmap.char_source_path(chars_fname),
        schema="nexus",
        taxon_namespace=taxon_namespace)
    trees = dendropy.TreeList.get(
        path=pathmap.tree_source_path(trees_fname),
        schema="nexus",
        taxon_namespace=taxon_namespace)
    self.assertEqual(len(expected_scores), len(trees))
    for tree_idx, tree in enumerate(trees):
        score_by_character_list = []
        pscore = treescore.parsimony_score(
            tree,
            chars,
            gaps_as_missing=gaps_as_missing,
            score_by_character_list=score_by_character_list)
        self.assertEqual(pscore, expected_scores[tree_idx])
        self.assertEqual(len(score_by_character_list),
                         len(expected_per_site_scores[tree_idx]))
        for obs, exp in zip(score_by_character_list,
                            expected_per_site_scores[tree_idx]):
            self.assertEqual(obs, exp)
        self.assertEqual(sum(score_by_character_list), pscore)
        # just to be sure it works without passing in `score_by_character_list`:
        pscore = treescore.parsimony_score(
            tree,
            chars,
            gaps_as_missing=gaps_as_missing)
        self.assertEqual(pscore, expected_scores[tree_idx])

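# Illustrative sketch (an assumption, not part of the original test module):
# scoring a set of trees against a character matrix with
# treescore.parsimony_score(), as exercised above. File names and the use of
# StandardCharacterMatrix are placeholders; both sources must share one
# TaxonNamespace so that taxa map correctly.
def _example_parsimony_scores(chars_path="chars.nex", trees_path="trees.nex"):
    import dendropy
    from dendropy.calculate import treescore  # DendroPy 4 module layout
    tns = dendropy.TaxonNamespace()
    chars = dendropy.StandardCharacterMatrix.get(
        path=chars_path, schema="nexus", taxon_namespace=tns)
    trees = dendropy.TreeList.get(
        path=trees_path, schema="nexus", taxon_namespace=tns)
    results = []
    for tree in trees:
        per_site = []
        score = treescore.parsimony_score(
            tree, chars, gaps_as_missing=True, score_by_character_list=per_site)
        results.append((score, per_site))
    return results
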
def test_basic_get(self):
    src_filename = "standard-test-chars-multiple-char-blocks.1.basic.nexus"
    src_path = pathmap.char_source_path(src_filename)
    ds = dendropy.DataSet.get_from_path(src_path, "nexus")
    self.verify_dataset(ds)

def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-f", "--target-file",
        type=str,
        dest="target_files",
        default=[],
        action="append",
        help="Path to file to be tokenized; option may be specified multiple times for multiple files.")
    parser.add_argument(
        "-t", "--target-type",
        type=str,
        dest="target_types",
        default=[],
        choices=["trees", "chars", "all"],
        action="append",
        help="Input data file types (default='all' if '-f'/'--file' argument not given); option may be specified multiple times.")
    parser.add_argument(
        "-r", "--repeat",
        type=int,
        default=10,
        help="Repeat each tokenization this number of times (default=%(default)s).")
    parser.add_argument(
        "--delimited-output",
        action="store_true",
        default=False,
        help="Output in tab-delimited instead of aligned format")
    args = parser.parse_args()
    messenger = messaging.ConsoleMessenger(name="-benchmark")
    src_descs = []
    src_paths = []
    results = []
    if args.target_files:
        for f in args.target_files:
            ff = os.path.expanduser(os.path.expandvars(f))
            src_paths.append(ff)
            src_descs.append(("User", f))
    if not args.target_types and not args.target_files:
        messenger.info("No sources specified: adding default benchmark target set")
        args.target_types = ["all"]
    if "all" in args.target_types or "trees" in args.target_types:
        for f in TREE_FILENAMES:
            ff = pathmap.tree_source_path(f)
            src_paths.append(ff)
            src_descs.append(("Trees", f))
    if "all" in args.target_types or "chars" in args.target_types:
        for f in CHAR_FILENAMES:
            ff = pathmap.char_source_path(f)
            src_paths.append(ff)
            src_descs.append(("Alignment", f))
    for src_path, src_desc in zip(src_paths, src_descs):
        messenger.info("Processing: '{}'".format(src_desc[1]))
        t = timeit.Timer(tokenizing_fn_factory([src_path]))
        result = min(t.repeat(args.repeat, 1))
        messenger.info("Best time (of {} repetitions): {:.10f} seconds".format(
            args.repeat, result))
        results.append(result)
    messenger.info("Benchmarking complete: all files processed")
    if args.delimited_output:
        result_template = "{}\t{}\t{:.10f}\n"
        header_template = "{}\t{}\t{}\n"
    else:
        max_len1 = max(len(r[0]) for r in src_descs)
        max_len2 = max(len(r[1]) for r in src_descs)
        col1 = "{{:{}}}".format(max_len1)
        col2 = "{{:{}}}".format(max_len2)
        result_template = "[" + col1 + "] " + col2 + " {:.10f}\n"
        header_template = col1 + " " + col2 + " {}\n"
    sys.stdout.write(header_template.format("Type", "File", "Seconds"))
    for result, src_desc in zip(results, src_descs):
        sys.stdout.write(result_template.format(src_desc[0], src_desc[1], result))

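# Standard script entry-point guard (assumed; not shown in the excerpt above),
# so that the benchmark runs only when the module is executed directly.
if __name__ == "__main__":
    main()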