def setup_module():
    """Fill the module-level tree path and reference caches.

    For every schema in ``schema_extension_map`` the path of each titled
    tree file is resolved through ``pathmap.tree_source_path`` and stored
    in ``_TREE_FILEPATHS[schema][title]``. The JSON file of every title is
    then parsed into ``_TREE_REFERENCES``; titles containing "annotated"
    additionally have their "nexus-metadata-comments" file parsed into
    ``_NEXUS_METADATA_COMMENTS``.
    """
    for schema in schema_extension_map:
        extension = schema_extension_map[schema]
        paths_by_title = {}
        _TREE_FILEPATHS[schema] = paths_by_title
        for title in tree_file_titles:
            file_name = "{}.{}".format(title, extension)
            paths_by_title[title] = pathmap.tree_source_path(file_name)
    for title in tree_file_titles:
        with open(_TREE_FILEPATHS["json"][title]) as reference_file:
            _TREE_REFERENCES[title] = json.load(reference_file)
        if "annotated" not in title:
            continue
        # Only annotated tree files carry a metadata-comment reference.
        comments_path = _TREE_FILEPATHS["nexus-metadata-comments"][title]
        with open(comments_path) as comments_file:
            _NEXUS_METADATA_COMMENTS[title] = json.load(comments_file)
def test_out_of_range_collection_offset_newick_get(self):
    # Reading the Newick file with collection_offset=1 must raise
    # IndexError, whether the source is given as a path, an open stream
    # or a string.
    title = 'dendropy-test-trees-n33-unrooted-x10a'
    path = self.schema_tree_filepaths[title]
    with open(path, "r") as handle:
        newick_text = handle.read()
    with open(path, "r") as stream:
        readers = (
            (dendropy.Tree.get_from_path, path),
            (dendropy.Tree.get_from_stream, stream),
            (dendropy.Tree.get_from_string, newick_text),
        )
        for read_tree, source in readers:
            self.assertRaises(
                IndexError,
                read_tree,
                source,
                "newick",
                collection_offset=1,
                tree_offset=0)
def test_out_of_range_tree_offset_newick_get(self):
    # A tree_offset equal to the number of trees recorded for the file is
    # one past the last valid index, so every reading approach (path,
    # stream, string) must raise IndexError.
    title = 'dendropy-test-trees-n33-unrooted-x10a'
    path = self.schema_tree_filepaths[title]
    reference = standard_file_test_trees._TREE_REFERENCES[title]
    first_invalid_offset = reference["num_trees"]
    with open(path, "r") as handle:
        newick_text = handle.read()
    with open(path, "r") as stream:
        readers = (
            (dendropy.Tree.get_from_path, path),
            (dendropy.Tree.get_from_stream, stream),
            (dendropy.Tree.get_from_string, newick_text),
        )
        for read_tree, source in readers:
            self.assertRaises(
                IndexError,
                read_tree,
                source,
                "newick",
                collection_offset=0,
                tree_offset=first_invalid_offset)
def get_splits_reference(splits_filename, splits_dir=None, key_column_index=0):
    """Load a tab-delimited splits reference file into an ordered mapping.

    Each data row must hold exactly five tab-separated columns:

        0 : PAUP* bipartition string representation '....**...' etc.
        1 : unnormalized split bitmask (for rooted trees) == leafset_bitmask
            for all trees and split_bitmask for rooted trees
        2 : normalized split bitmask (for unrooted trees) == split_bitmask
            for unrooted trees
        3 : (weighted) counts
        4 : (weighted) frequencies

    Everything from the first ``#`` on a row is ignored. Rows that are
    empty or contain only whitespace once the comment is removed (blank
    lines, indented comments) are skipped.

    Args:
        splits_filename: Name of the splits file.
        splits_dir: Directory holding the file; when None the path is
            resolved through ``pathmap.splits_source_path``.
        key_column_index: Index (0-4) of the column whose converted value
            is used as the key of the returned mapping.

    Returns:
        A ``collections.OrderedDict`` in file order. Each value is a dict
        with the keys "bipartition_string", "unnormalized_split_bitmask",
        "normalized_split_bitmask", "count" and "frequency" (column 4
        divided by 100).

    Raises:
        AssertionError: If a data row does not have exactly five columns.
    """
    if splits_dir is not None:
        splits_filepath = os.path.join(splits_dir, splits_filename)
    else:
        splits_filepath = pathmap.splits_source_path(splits_filename)
    d = collections.OrderedDict()
    with open(splits_filepath, "r") as src:
        for row in src:
            content = row.split("#")[0]
            # Strip only for the emptiness test: a bare "\n" is truthy and
            # would otherwise reach the column-count assertion below.
            if not content.strip():
                continue
            fields = content.split("\t")
            assert len(fields) == 5, "{}: {}".format(content, fields)
            # Convert each raw column with the converter at the same index.
            fields = [
                _SPLITS_REFERENCE_FIELD_TYPES[idx](raw)
                for idx, raw in enumerate(fields)
            ]
            key = fields[key_column_index]
            d[key] = {
                "bipartition_string": fields[0],
                "unnormalized_split_bitmask": fields[1],
                "normalized_split_bitmask": fields[2],
                "count": fields[3],
                "frequency": fields[4] / 100,
            }
    return d
def get_splits_reference(
        splits_filename,
        splits_dir=None,
        key_column_index=0):
    """Read a splits reference table and return it keyed by one column.

    The file is tab-delimited with five columns per data row:

        0 : PAUP* bipartition string representation '....**...' etc.
        1 : unnormalized split bitmask (for rooted trees) == leafset_bitmask
            for all trees and split_bitmask for rooted trees
        2 : normalized split bitmask (for unrooted trees) == split_bitmask
            for unrooted trees
        3 : (weighted) counts
        4 : (weighted) frequencies

    Text following a ``#`` is treated as a comment. Rows with no content
    other than whitespace before the comment marker are ignored, so blank
    lines and indented comment lines are allowed.

    Args:
        splits_filename: Name of the splits file.
        splits_dir: Directory containing the file. If None, the location
            is obtained from ``pathmap.splits_source_path``.
        key_column_index: Which of the five converted columns supplies the
            dictionary key.

    Returns:
        ``collections.OrderedDict`` preserving file order, mapping the key
        column to a dict with entries "bipartition_string",
        "unnormalized_split_bitmask", "normalized_split_bitmask", "count"
        and "frequency" (the fifth column divided by 100).

    Raises:
        AssertionError: When a data row does not split into five columns.
    """
    if splits_dir is not None:
        splits_filepath = os.path.join(splits_dir, splits_filename)
    else:
        splits_filepath = pathmap.splits_source_path(splits_filename)
    d = collections.OrderedDict()
    with open(splits_filepath, "r") as src:
        for row in src:
            content = row.split("#")[0]
            if not content.strip():
                # Blank / whitespace-only / comment-only row: the original
                # truthiness test let "\n" through to the assertion.
                continue
            fields = content.split("\t")
            assert len(fields) == 5, "{}: {}".format(content, fields)
            fields = [
                _SPLITS_REFERENCE_FIELD_TYPES[idx](raw)
                for idx, raw in enumerate(fields)
            ]
            d[fields[key_column_index]] = {
                "bipartition_string": fields[0],
                "unnormalized_split_bitmask": fields[1],
                "normalized_split_bitmask": fields[2],
                "count": fields[3],
                "frequency": fields[4] / 100,
            }
    return d
def test_basic_parsing(self):
    # Parse the curated Newick string through Tree.get() from a path, an
    # open file and a string, for every combination of the three
    # suppress_* reader options. Each option is tried as unspecified
    # (None), False and True; when unspecified the expected defaults are
    # suppress_internal_node_taxa=True, suppress_leaf_node_taxa=False and
    # suppress_edge_lengths=False.
    newick_text = self.get_newick_string()
    reader_kwargs = {}

    def configure(option, value, default):
        # None means "do not pass the option": drop any value left over
        # from a previous iteration and expect the reader default.
        if value is None:
            reader_kwargs.pop(option, None)
            return default
        reader_kwargs[option] = value
        return value

    tristate = (None, False, True)
    with pathmap.SandboxedFile() as tempf:
        tempf.write(newick_text)
        tempf.flush()
        tree_filepath = tempf.name
        for internal_setting in tristate:
            expected_internal = configure(
                "suppress_internal_node_taxa", internal_setting, True)
            for leaf_setting in tristate:
                expected_leaf = configure(
                    "suppress_leaf_node_taxa", leaf_setting, False)
                for edge_setting in tristate:
                    expected_edge = configure(
                        "suppress_edge_lengths", edge_setting, False)
                    with open(tree_filepath, "r") as tree_stream:
                        sources = (
                            {"path": tree_filepath},
                            {"file": tree_stream},
                            {"data": newick_text},
                        )
                        for get_kwargs in sources:
                            get_kwargs.update(reader_kwargs)
                            get_kwargs["schema"] = "newick"
                            parsed = dendropy.Tree.get(**get_kwargs)
                            self.verify_curated_tree(
                                parsed,
                                suppress_internal_node_taxa=expected_internal,
                                suppress_leaf_node_taxa=expected_leaf,
                                suppress_edge_lengths=expected_edge)
def test_tree_offset_newick_get(self):
    # Read single trees at a variety of offsets from a multi-tree Newick
    # file and compare each against the reference tree at that index.
    #
    # Twelve distinct candidate offsets are used: the four boundary values
    # (0, n-1, -1, -n), random positive offsets until there are 8, then
    # random negative offsets until there are 12. Negative candidates are
    # translated to the equivalent non-negative index before the read;
    # candidates below -n are mapped to 0.
    tree_file_title = "dendropy-test-trees-n33-unrooted-x100a"
    tree_reference = standard_file_test_trees._TREE_REFERENCES[tree_file_title]
    expected_number_of_trees = tree_reference["num_trees"]
    tree_offsets = {
        0,
        expected_number_of_trees - 1,
        -1,
        -expected_number_of_trees,
    }
    while len(tree_offsets) < 8:
        tree_offsets.add(random.randint(1, expected_number_of_trees - 2))
    while len(tree_offsets) < 12:
        tree_offsets.add(random.randint(-expected_number_of_trees - 2, -2))
    tree_filepath = self.schema_tree_filepaths[tree_file_title]
    with open(tree_filepath, "r") as src:
        tree_string = src.read()
    for candidate_offset in tree_offsets:
        if candidate_offset >= 0:
            tree_offset = candidate_offset
        elif abs(candidate_offset) > expected_number_of_trees:
            tree_offset = 0
        else:
            tree_offset = expected_number_of_trees + candidate_offset
        # A fresh stream per offset: the previous read consumed it.
        with open(tree_filepath, "r") as tree_stream:
            approaches = (
                (dendropy.Tree.get_from_path, tree_filepath),
                (dendropy.Tree.get_from_stream, tree_stream),
                (dendropy.Tree.get_from_string, tree_string),
            )
            for method, src in approaches:
                tree = method(
                    src,
                    "newick",
                    collection_offset=0,
                    tree_offset=tree_offset,
                    suppress_internal_node_taxa=True,
                    suppress_leaf_node_taxa=False,
                    rooting="default-unrooted")
                self.compare_to_reference_by_title_and_index(
                    tree=tree,
                    tree_file_title=tree_file_title,
                    reference_tree_idx=tree_offset)
def test_out_of_range_tree_offset_newick_get(self):
    # Asking for the tree at index num_trees (one past the last tree
    # recorded in the reference data) must raise IndexError for all three
    # ways of supplying the Newick source.
    file_title = 'dendropy-test-trees-n33-unrooted-x10a'
    filepath = self.schema_tree_filepaths[file_title]
    tree_count = standard_file_test_trees._TREE_REFERENCES[file_title]["num_trees"]
    with open(filepath, "r") as text_source:
        file_text = text_source.read()
    with open(filepath, "r") as open_stream:
        for getter, source in (
                (dendropy.Tree.get_from_path, filepath),
                (dendropy.Tree.get_from_stream, open_stream),
                (dendropy.Tree.get_from_string, file_text)):
            self.assertRaises(
                IndexError,
                getter,
                source,
                "newick",
                collection_offset=0,
                tree_offset=tree_count)
def test_tree_offset_without_collection_offset_newick_get(self):
    # Passing tree_offset without collection_offset must still return the
    # tree at that index; checked for path, stream and string sources
    # against the reference tree with the same index.
    tree_file_title = 'dendropy-test-trees-n33-unrooted-x10a'
    tree_filepath = self.schema_tree_filepaths[tree_file_title]
    tree_offset = 2
    with open(tree_filepath, "r") as src:
        tree_string = src.read()
    with open(tree_filepath, "r") as tree_stream:
        approaches = (
            (dendropy.Tree.get_from_path, tree_filepath),
            (dendropy.Tree.get_from_stream, tree_stream),
            (dendropy.Tree.get_from_string, tree_string),
        )
        for method, src in approaches:
            tree = method(src, "newick", tree_offset=tree_offset)
            self.compare_to_reference_by_title_and_index(
                tree=tree,
                tree_file_title=tree_file_title,
                reference_tree_idx=tree_offset)
def test_tree_offset_newick_get(self):
    # Fetch individual trees by offset from a multi-tree Newick file via
    # path, stream and string, and check each against the reference tree
    # at the same index.
    #
    # Offsets tested: the boundaries 0, n-1, -1 and -n, plus random
    # positive values (up to 8 distinct offsets) and random negative
    # values (up to 12 distinct offsets). Before reading, a negative
    # candidate is converted to its non-negative equivalent; one whose
    # magnitude exceeds n becomes 0.
    tree_file_title = "dendropy-test-trees-n33-unrooted-x100a"
    tree_reference = standard_file_test_trees._TREE_REFERENCES[tree_file_title]
    expected_number_of_trees = tree_reference["num_trees"]
    tree_offsets = {0, expected_number_of_trees - 1, -1, -expected_number_of_trees}
    while len(tree_offsets) < 8:
        tree_offsets.add(random.randint(1, expected_number_of_trees - 2))
    while len(tree_offsets) < 12:
        tree_offsets.add(random.randint(-expected_number_of_trees - 2, -2))
    tree_filepath = self.schema_tree_filepaths[tree_file_title]
    with open(tree_filepath, "r") as src:
        tree_string = src.read()
    for requested_offset in tree_offsets:
        tree_offset = requested_offset
        if requested_offset < 0:
            if abs(requested_offset) > expected_number_of_trees:
                tree_offset = 0
            else:
                tree_offset = expected_number_of_trees + requested_offset
        # Reopen the file for each offset so the stream starts at the top.
        with open(tree_filepath, "r") as tree_stream:
            approaches = (
                (dendropy.Tree.get_from_path, tree_filepath),
                (dendropy.Tree.get_from_stream, tree_stream),
                (dendropy.Tree.get_from_string, tree_string),
            )
            for method, src in approaches:
                tree = method(
                    src,
                    "newick",
                    collection_offset=0,
                    tree_offset=tree_offset,
                    suppress_internal_node_taxa=True,
                    suppress_leaf_node_taxa=False,
                    rooting="default-unrooted")
                self.compare_to_reference_by_title_and_index(
                    tree=tree,
                    tree_file_title=tree_file_title,
                    reference_tree_idx=tree_offset)
def test_read_metadata(self):
    # Read the annotated Newick files as TreeLists with comment metadata
    # extraction enabled, from a path, an open stream and a string, and
    # verify each result against the standard reference trees.
    annotated_titles = (
        "dendropy-test-trees-multifurcating-rooted-annotated",
        "dendropy-test-trees-n33-unrooted-annotated-x10a",
    )
    for title in annotated_titles:
        path = standard_file_test_trees._TREE_FILEPATHS["newick"][title]
        with open(path, "r") as handle:
            newick_text = handle.read()
        with open(path, "r") as stream:
            readers = (
                (dendropy.TreeList.get_from_path, path),
                (dendropy.TreeList.get_from_stream, stream),
                (dendropy.TreeList.get_from_string, newick_text),
            )
            for read_trees, source in readers:
                trees = read_trees(
                    source,
                    "newick",
                    extract_comment_metadata=True)
                self.verify_standard_trees(
                    tree_list=trees,
                    tree_file_title=title)
def test_read_metadata(self):
    # With extract_comment_metadata=True, the annotated Newick files must
    # load into TreeLists that pass verify_standard_trees(), regardless of
    # whether the source is a path, a stream or a string.
    for file_title in (
            "dendropy-test-trees-multifurcating-rooted-annotated",
            "dendropy-test-trees-n33-unrooted-annotated-x10a"):
        filepath = standard_file_test_trees._TREE_FILEPATHS["newick"][file_title]
        with open(filepath, "r") as text_source:
            file_text = text_source.read()
        with open(filepath, "r") as open_stream:
            for getter, source in (
                    (dendropy.TreeList.get_from_path, filepath),
                    (dendropy.TreeList.get_from_stream, open_stream),
                    (dendropy.TreeList.get_from_string, file_text)):
                loaded = getter(source, "newick", extract_comment_metadata=True)
                self.verify_standard_trees(
                    tree_list=loaded,
                    tree_file_title=file_title)
def check(self, title, src_prefix, to_retain=False):
    """Prune or retain taxa on source trees and compare with reference trees.

    Loads ``<src_prefix>.pre-pruned.nex`` (source trees) and
    ``<src_prefix>.paup-pruned.nex`` (reference trees; presumably produced
    with PAUP* -- confirm against the fixture provenance). One row of
    whitespace-separated taxon indexes per tree list is read from
    ``<src_prefix>.retained_taxa.txt`` or ``<src_prefix>.pruned_taxa.txt``.
    Every source tree is then modified in place and must be at symmetric
    difference 0 from the reference tree at the same position.

    Args:
        title: Label included in the debug log message for each tree.
        src_prefix: Common file name prefix of the fixture files, resolved
            through ``pathmap.tree_source_path``.
        to_retain: If True, the listed taxa are kept (``retain_taxa``) and
            the ".retained_taxa.txt" file is used; otherwise the listed
            taxa are removed (``prune_taxa``) and ".pruned_taxa.txt" is
            used.
    """
    input_ds = dendropy.DataSet.get_from_path(
        src=pathmap.tree_source_path(src_prefix + ".pre-pruned.nex"),
        schema='nexus')
    tns1 = dendropy.TaxonNamespace()
    input_ds.attach_taxon_namespace(tns1)
    input_taxa = input_ds.taxon_namespaces[0]
    output_ds = dendropy.DataSet.get_from_path(
        src=pathmap.tree_source_path(src_prefix + ".paup-pruned.nex"),
        schema='nexus',
        taxon_namespace=input_taxa)
    tns2 = dendropy.TaxonNamespace()
    output_ds.attach_taxon_namespace(tns2)
    if to_retain:
        taxa_filename = src_prefix + ".retained_taxa.txt"
    else:
        taxa_filename = src_prefix + ".pruned_taxa.txt"
    # Read the index rows up front and close the file immediately, so the
    # handle is not leaked when an assertion below fails.
    with open(pathmap.tree_source_path(taxa_filename), "r") as taxf:
        taxon_idxs_list = [[int(i) for i in row.split()] for row in taxf]
    for set_idx, src_trees in enumerate(input_ds.tree_lists):
        ref_trees = output_ds.tree_lists[set_idx]
        taxon_idxs = taxon_idxs_list[set_idx]
        sub_taxa = [src_trees.taxon_namespace[i] for i in taxon_idxs]
        for tree_idx, src_tree in enumerate(src_trees):
            _LOG.debug("%s Set %d/%d, Tree %d/%d" % (
                title,
                set_idx + 1,
                len(input_ds.tree_lists),
                tree_idx + 1,
                len(src_trees)))
            ref_tree = ref_trees[tree_idx]
            if to_retain:
                src_tree.retain_taxa(sub_taxa)
            else:
                src_tree.prune_taxa(sub_taxa)
            self.assertEqual(
                treecompare.symmetric_difference(src_tree, ref_tree), 0)
def test_tree_offset_without_collection_offset_newick_get(self):
    # A tree_offset given without a collection_offset must select the tree
    # at that index. Verified for path, stream and string sources by
    # comparing against the reference tree with the same index.
    tree_file_title = 'dendropy-test-trees-n33-unrooted-x10a'
    tree_filepath = self.schema_tree_filepaths[tree_file_title]
    tree_offset = 2
    with open(tree_filepath, "r") as src:
        tree_string = src.read()
    with open(tree_filepath, "r") as tree_stream:
        approaches = (
            (dendropy.Tree.get_from_path, tree_filepath),
            (dendropy.Tree.get_from_stream, tree_stream),
            (dendropy.Tree.get_from_string, tree_string),
        )
        for method, src in approaches:
            self.compare_to_reference_by_title_and_index(
                tree=method(src, "newick", tree_offset=tree_offset),
                tree_file_title=tree_file_title,
                reference_tree_idx=tree_offset)
def test_unsupported_keyword_arguments(self):
    # Unknown reader keyword arguments must be rejected with TypeError by
    # all three reading approaches (path, stream, string).
    tree_filepath = pathmap.tree_source_path('dendropy-test-trees-n12-x2.newick')
    tree_string = self.get_newick_string()
    reader_kwargs = {
        "suppress_internal_taxa": True,  # should be suppress_internal_node_taxa
        "gobbledegook": False,
    }
    with open(tree_filepath, "r") as tree_stream:
        approaches = (
            (dendropy.Tree.get_from_path, tree_filepath),
            (dendropy.Tree.get_from_stream, tree_stream),
            (dendropy.Tree.get_from_string, tree_string),
        )
        for method, src in approaches:
            with self.assertRaises(TypeError):
                # The result is deliberately not bound: the call is
                # expected to raise before returning anything.
                method(src, "newick", **reader_kwargs)
def test_unsupported_keyword_arguments(self):
    # Reader keyword arguments that the Newick reader does not recognise
    # must raise TypeError, whichever way the source is supplied.
    tree_filepath = pathmap.tree_source_path(
        'dendropy-test-trees-n12-x2.newick')
    tree_string = self.get_newick_string()
    reader_kwargs = {
        # Misspelling of suppress_internal_node_taxa, on purpose.
        "suppress_internal_taxa": True,
        "gobbledegook": False,
    }
    with open(tree_filepath, "r") as tree_stream:
        approaches = (
            (dendropy.Tree.get_from_path, tree_filepath),
            (dendropy.Tree.get_from_stream, tree_stream),
            (dendropy.Tree.get_from_string, tree_string),
        )
        for method, src in approaches:
            # No assignment of the return value: the call must raise.
            with self.assertRaises(TypeError):
                method(src, "newick", **reader_kwargs)
def iterate_over_file(self, current_file):
    """Yield the items produced by ``_yield_items_from_stream`` for one source.

    While iterating, ``self._current_file`` holds the stream being read
    and ``self._current_file_name`` its name. After the source is
    exhausted ``self._current_file`` is reset to None; the name is left
    in place.

    Args:
        current_file: Either a file path (anything for which
            ``textprocessing.is_str_type`` is true), which is opened for
            reading, or an already-open file-like object. For a file-like
            object the recorded name is its ``name`` attribute, or None if
            it has none.

    Yields:
        Each item from ``self._yield_items_from_stream``.

    Note:
        A stream that supports the context-manager protocol is used in a
        ``with`` block while being read. This includes file objects
        supplied by the caller, not only those opened here.
    """
    if textprocessing.is_str_type(current_file):
        self._current_file = open(current_file, "r")
        self._current_file_name = current_file
    else:
        self._current_file = current_file
        # Read the name from the object just stored. The original looked
        # it up via ``self.current_file``, which silently produced None
        # whenever that public attribute was unavailable.
        self._current_file_name = getattr(self._current_file, "name", None)
    if hasattr(self._current_file, "__exit__"):
        with self._current_file:
            for item in self._yield_items_from_stream(stream=self._current_file):
                yield item
    else:
        # StringIO does not support ``with``
        for item in self._yield_items_from_stream(stream=self._current_file):
            yield item
    self._current_file = None
def test_basic_parsing(self):
    # Exercise Tree.get() on the curated Newick string for all 27
    # combinations of suppress_internal_node_taxa, suppress_leaf_node_taxa
    # and suppress_edge_lengths, each being omitted (None), False or True.
    # An omitted option is expected to behave as True for internal node
    # taxa and as False for the other two. Every combination is read from
    # a path, an open file and a string.
    tree_text = self.get_newick_string()
    reader_kwargs = {}
    choices = (None, False, True)

    def set_option(name, choice, value_when_omitted):
        # Keep reader_kwargs in step with the current choice and return
        # the behaviour the parsed tree is expected to show.
        if choice is not None:
            reader_kwargs[name] = choice
            return choice
        reader_kwargs.pop(name, None)
        return value_when_omitted

    with pathmap.SandboxedFile() as tempf:
        tempf.write(tree_text)
        tempf.flush()
        sandbox_path = tempf.name
        for internal_choice in choices:
            want_internal_suppressed = set_option(
                "suppress_internal_node_taxa", internal_choice, True)
            for leaf_choice in choices:
                want_leaf_suppressed = set_option(
                    "suppress_leaf_node_taxa", leaf_choice, False)
                for edge_choice in choices:
                    want_edges_suppressed = set_option(
                        "suppress_edge_lengths", edge_choice, False)
                    with open(sandbox_path, "r") as open_stream:
                        for call_kwargs in (
                                {"path": sandbox_path},
                                {"file": open_stream},
                                {"data": tree_text}):
                            call_kwargs.update(reader_kwargs)
                            call_kwargs["schema"] = "newick"
                            result = dendropy.Tree.get(**call_kwargs)
                            self.verify_curated_tree(
                                result,
                                suppress_internal_node_taxa=want_internal_suppressed,
                                suppress_leaf_node_taxa=want_leaf_suppressed,
                                suppress_edge_lengths=want_edges_suppressed)
def test_basic(self):
    # Stream trees from several NEXUS files with Tree.yield_from_files()
    # and check that (a) the iterator reports the file each tree came
    # from, (b) the total number of trees matches the reference data,
    # (c) every tree uses the supplied TaxonNamespace and (d) every tree
    # matches its reference.
    #
    # NOTE: the multifurcating and n10 tree-shape fixtures were commented
    # out of this list in the original. The last title is listed twice,
    # as in the original.
    tree_file_titles = [
        "dendropy-test-trees-n12-x2",
        "dendropy-test-trees-n33-unrooted-x10a",
        "dendropy-test-trees-n33-unrooted-x10b",
        "dendropy-test-trees-n33-unrooted-annotated-x10a",
        "dendropy-test-trees-n33-unrooted-annotated-x10a",
    ]
    expected_file_names = []
    expected_tree_references = []
    tree_files = []
    for tree_file_title in tree_file_titles:
        tree_filepath = self.schema_tree_filepaths[tree_file_title]
        # Sources are always passed as paths. The original carried a
        # permanently disabled branch (``if False and idx % 2 == 0``,
        # with ``idx`` undefined) that would have passed open file
        # objects for some entries.
        tree_files.append(tree_filepath)
        file_references = self.tree_references[tree_file_title]
        for tree_idx in range(file_references["num_trees"]):
            expected_file_names.append(tree_filepath)
            expected_tree_references.append(file_references[str(tree_idx)])
    collected_trees = []
    tns = dendropy.TaxonNamespace()
    tree_sources = dendropy.Tree.yield_from_files(
        files=tree_files,
        schema="nexus",
        taxon_namespace=tns)
    for tree_idx, tree in enumerate(tree_sources):
        self.assertEqual(
            tree_sources.current_file_name,
            expected_file_names[tree_idx])
        tree.current_file_name = tree_sources.current_file_name
        collected_trees.append(tree)
    self.assertEqual(len(collected_trees), len(expected_tree_references))
    for tree, ref_tree in zip(collected_trees, expected_tree_references):
        self.assertIs(tree.taxon_namespace, tns)
        self.compare_to_reference_tree(tree, ref_tree)
def read_expected_sfs(self, filename):
    """Return the comma-separated integers stored in ``filename``.

    The file is located through ``pathmap.char_source_path``. Its whole
    content is read, surrounding whitespace is stripped, and each
    comma-separated entry is converted with ``int``.
    """
    with open(pathmap.char_source_path(filename)) as src:
        raw_entries = src.read().strip().split(",")
        return list(map(int, raw_entries))