def test_construct_from_another_with_simple_annotations(self):
    """Copies of a Taxon carrying plain annotations are distinct but equivalent.

    Exercises the three copy routes used here: the copy constructor,
    ``copy.deepcopy`` and ``clone(2)``.
    """
    original = Taxon("a")
    for name, value in (("a", 0), ("b", 1), ("c", 3)):
        original.annotations.add_new(name, value)
    duplicates = (
        Taxon(original),
        copy.deepcopy(original),
        original.clone(2),
    )
    for duplicate in duplicates:
        # A copy is a different, non-equal object that carries the same label.
        self.assertIsNot(original, duplicate)
        self.assertNotEqual(original, duplicate)
        self.assertEqual(original.label, duplicate.label)
        # Both sides expose annotations, and the same number of them.
        self.assertTrue(hasattr(original, "annotations"))
        self.assertTrue(hasattr(duplicate, "annotations"))
        self.assertEqual(len(original.annotations), len(duplicate.annotations))
        self.compare_distinct_annotables(original, duplicate)
def simulateTreeTopology(n):
    """Simulate a random binary tree topology with ``n`` leaves.

    Starting from a single root, a uniformly chosen current leaf is split
    into two children ``n - 1`` times.  Leaves are then labelled with a
    random permutation of ``1..n`` and given ``time = 0``.

    Returns a ``(tree, split_order)`` pair, where ``split_order`` lists the
    nodes in the order in which they were split.
    """
    root = Node()
    tips = [root]
    split_order = []
    tree = Tree(seed_node=root)
    for step in range(n - 1):
        # At this point there are step + 1 tips; pick one of them to split.
        pick = randint(0, step)
        parent = tips[pick]
        left, right = Node(), Node()
        parent.add_child(left)
        parent.add_child(right)
        # The left child takes the parent's slot; the right one is appended.
        tips[pick] = left
        tips.append(right)
        split_order.append(parent)
    labels = list(range(1, n + 1))
    shuffle(labels)
    for position, tip in enumerate(tips):
        tip.taxon = Taxon(label=str(labels[position]))
        tip.time = 0
    return tree, split_order
def test_basic_adding(self):
    """Adding taxa one at a time grows the namespace by one each time."""
    namespace = TaxonNamespace()
    self.assertEqual(len(namespace), 0)
    for expected_size, label in enumerate(self.str_labels, start=1):
        namespace.add_taxon(Taxon(label=label))
        self.assertEqual(len(namespace), expected_size)
    self.validate_taxon_concepts(namespace, self.str_labels)
def test_basic_adding_to_immutable(self):
    """An immutable namespace rejects every addition and stays empty."""
    namespace = TaxonNamespace()
    self.assertEqual(len(namespace), 0)
    namespace.is_mutable = False
    for label in self.str_labels:
        with self.assertRaises(TypeError):
            namespace.add_taxon(Taxon(label=label))
        # The failed addition must not have changed the namespace.
        self.assertEqual(len(namespace), 0)
def leave_only(self, to_keep_dict):
    '''
    Removes terminal nodes associated with Taxon objects whose labels are not a key in to_keep_dict.
    For each item in to_keep_dict that matches, increment its value by 1 (allows tracking how many we have)
    We have to be careful with non-branching twigs (a series of unifurcations) because we want the tip
    to be left, and the rest pruned (so that a single species is left on the tree)

    Outline of what the code below does:
      1. Look up the Taxon objects whose labels are keys of to_keep_dict.
      2. Walk the tree in postorder and set a `keep` attribute on the parent
         of every node that is itself flagged or carries a kept taxon.
      3. Give every leaf whose taxon is not kept an `spp` count of 1 and
         accumulate these counts upwards through parents that are not flagged `keep`.
      4. Where accumulation stops and the count is above 1, append '#<count>'
         to the taxon label (creating a Taxon if needed) and drop the node's children.
      5. Prune leaves without taxa and call check_list_against_taxa.

    NOTE(review): no counter is incremented in this method itself; presumably
    check_list_against_taxa updates to_keep_dict -- confirm.
    '''
    to_keep = self.taxon_namespace.get_taxa(labels=to_keep_dict.keys())
    #flag up the routes to keep
    for nd in self.postorder_node_iter():
        if hasattr(nd, 'keep') or (nd.taxon and nd.taxon in to_keep):
            if (nd.parent_node):
                nd.parent_node.keep = True
    # every taxon in the namespace that was not asked for
    to_prune = set([t for t in self.taxon_namespace if t not in to_keep])
    # each unwanted leaf starts out representing exactly one species
    for lf in self.leaf_node_iter(lambda n: n.taxon in to_prune):
        lf.spp = 1
    for nd in self.postorder_node_iter():
        if (hasattr(nd, 'spp')):
            if (nd.parent_node and not hasattr(nd.parent_node, 'keep')):
                # parent is not on a kept route: pass the species count upwards
                if (hasattr(nd.parent_node, 'spp')):
                    nd.parent_node.spp += nd.spp
                else:
                    nd.parent_node.spp = nd.spp
            else:
                # the count stops here (parent is kept, or there is no parent)
                if (nd.spp > 1):
                    if (nd.taxon):
                        nd.taxon.label += '#%d' % (nd.spp)
                    else:
                        if (nd.label):
                            nd.taxon = Taxon(label=nd.label + '#%d' % (nd.spp))
                        else:
                            warn(
                                "Couldn't find a label for a node with multiple descendants. Using label 'unknown'."
                            )
                            nd.taxon = Taxon(label='unknown#%d' % (nd.spp))
                    # NOTE(review): nesting reconstructed from a collapsed source line;
                    # confirm this call belongs inside the `nd.spp > 1` branch.
                    nd.set_child_nodes([])
    self.prune_leaves_without_taxa(update_bipartitions=False,
                                   suppress_unifurcations=False)
    check_list_against_taxa(self, to_keep_dict)
def test_discard_taxon_label(self):
    """Discarding a label removes every taxon carrying it, keeping order."""
    all_taxa = [Taxon(label) for label in self.str_labels]
    namespace = TaxonNamespace(all_taxa)
    remaining = list(all_taxa)
    for label in set(self.str_labels):
        namespace.discard_taxon_label(label)
        # Mirror the removal in the reference list.
        for taxon in all_taxa:
            if taxon.label != label:
                continue
            if taxon in remaining:
                remaining.remove(taxon)
        self.assertEqual(len(namespace), len(remaining))
        for actual, wanted in zip(namespace, remaining):
            self.assertIs(actual, wanted)
def test_remove_taxon(self):
    """Removing taxa one by one leaves exactly the not-yet-removed ones.

    After each removal, every taxon that was originally added is checked:
    those still expected must be in the namespace, and those already
    removed must not be.  (Iterating over the full original list is what
    makes the ``assertNotIn`` branch reachable.)
    """
    taxa = [Taxon(s) for s in self.str_labels]
    tns = TaxonNamespace(taxa)
    expected = taxa[:]
    for taxon in taxa:
        tns.remove_taxon(taxon)
        expected.remove(taxon)
        self.assertEqual(len(tns), len(expected))
        for candidate in taxa:
            if candidate in expected:
                self.assertIn(candidate, tns)
            else:
                self.assertNotIn(candidate, tns)
def test_discard_taxon_label_case_insensitive(self):
    """With default settings, upper-cased labels still match on discard."""
    upper_labels = [label.upper() for label in self.str_labels]
    assert upper_labels
    assert upper_labels != self.str_labels
    all_taxa = [Taxon(label) for label in self.str_labels]
    namespace = TaxonNamespace(all_taxa)
    remaining = list(all_taxa)
    # default: case-insensitive
    for label in set(upper_labels):
        namespace.discard_taxon_label(label)
        target = label.upper()
        for taxon in all_taxa:
            if taxon.label.upper() != target:
                continue
            if taxon in remaining:
                remaining.remove(taxon)
        self.assertEqual(len(namespace), len(remaining))
        for actual, wanted in zip(namespace, remaining):
            self.assertIs(actual, wanted)
def test_discard_taxon_label_case_sensitive(self):
    """In case-sensitive mode, discarding a wrongly-cased label is a silent no-op."""
    upper_labels = [label.upper() for label in self.str_labels]
    assert upper_labels
    assert upper_labels != self.str_labels
    namespace = TaxonNamespace([Taxon(label) for label in self.str_labels])
    # test: case sensitive
    namespace.is_case_sensitive = True
    for label in set(upper_labels):
        if label == label.lower():
            # Labels without cased characters would match either way; skip them.
            continue
        size_before = len(namespace)
        try:
            namespace.discard_taxon_label(label)
        except LookupError:
            self.fail()
        else:
            self.assertEqual(len(namespace), size_before)
def test_remove_taxon_label_case_insensitive(self):
    """remove_taxon_label fails on case mismatch only in case-sensitive mode."""
    upper_labels = [label.upper() for label in self.str_labels]
    assert upper_labels
    assert upper_labels != self.str_labels
    all_taxa = [Taxon(label) for label in self.str_labels]
    namespace = TaxonNamespace(all_taxa)
    remaining = list(all_taxa)
    for label in set(upper_labels):
        has_cased_chars = label != label.lower()
        if has_cased_chars:
            # Case-sensitive lookup of the upper-cased label must fail.
            with self.assertRaises(LookupError):
                namespace.is_case_sensitive = True
                namespace.remove_taxon_label(label)
        # The same lookup succeeds once case is ignored.
        namespace.is_case_sensitive = False
        namespace.remove_taxon_label(label)
        target = label.upper()
        for taxon in all_taxa:
            if taxon.label.upper() != target:
                continue
            if taxon in remaining:
                remaining.remove(taxon)
        self.assertEqual(len(namespace), len(remaining))
        for actual, wanted in zip(namespace, remaining):
            self.assertIs(actual, wanted)
def test_construct_from_another_with_complex_annotations(self):
    """Attribute-bound annotations follow the copy, not the original.

    The second annotation is bound to the ``label`` attribute of the taxon,
    so after copying, each taxon's annotation must track its own label.
    """
    original = Taxon("a")
    original.annotations.add_new("a", 0)
    bound = original.annotations.add_new(
        "b", (original, "label"), is_attribute=True)
    bound.annotations.add_new("c", 3)
    duplicates = (
        Taxon(original),
        copy.deepcopy(original),
        original.clone(2),
    )
    for duplicate in duplicates:
        self.assertIsNot(original, duplicate)
        self.assertNotEqual(original, duplicate)
        self.assertEqual(original.label, duplicate.label)
        self.assertTrue(hasattr(original, "annotations"))
        self.assertTrue(hasattr(duplicate, "annotations"))
        self.assertEqual(len(original.annotations), len(duplicate.annotations))
        self.compare_distinct_annotables(original, duplicate)

        # Relabelling the original must not leak into the copy ...
        original.label = "x"
        self.assertEqual(original.annotations[1].value, "x")
        self.assertEqual(duplicate.annotations[1].value, "a")

        # ... and relabelling the copy must not leak into the original.
        duplicate.label = "z"
        self.assertEqual(original.annotations[1].value, "x")
        self.assertEqual(duplicate.annotations[1].value, "z")

        # Restore the label so the next copy starts from the same state.
        original.label = "a"
def test_no_contains_taxa(self):
    """Fresh taxa with matching labels are not members of the namespace."""
    namespace = TaxonNamespace(self.taxa)
    lookalikes = []
    for member in self.taxa:
        lookalikes.append(Taxon(label=member.label))
    for lookalike in lookalikes:
        self.assertNotIn(lookalike, namespace)
def test_construct_from_another(self):
    """Every copy route yields a distinct, non-equal taxon with the same label."""
    original = Taxon("a")
    duplicates = (
        Taxon(original),
        copy.deepcopy(original),
        original.clone(2),
    )
    for duplicate in duplicates:
        self.assertIsNot(original, duplicate)
        self.assertNotEqual(original, duplicate)
        self.assertEqual(original.label, duplicate.label)
def setUp(self):
    """Create a labelled namespace ("T1") from a list containing duplicate labels."""
    # One single-character label per taxon; duplicates are intentional.
    self.str_labels = list("aabcde___zzz")
    self.taxa = []
    for label in self.str_labels:
        self.taxa.append(Taxon(label))
    self.tns1 = TaxonNamespace(self.taxa, label="T1")
def main(args):
    """Re-root every tree in a newick file and write the result.

    ``args`` is an ``argv``-style list:

    * ``args[1]`` -- path of the newick tree file
    * ``args[2]`` -- comma-separated outgroups (each a ``+``-separated list
      of taxa, with ``_`` standing for a space), or ``-m`` for midpoint rooting
    * ``args[3]`` -- optional ``-mrca`` / ``-mrca-dummy``
    * ``args[4]`` -- optional output path; defaults to the input name with a
      trailing ``unrooted`` replaced by ``rooted`` (or ``.rooted`` appended)
    * ``args[5]`` -- optional ``-igerr``: report and skip trees in which no
      outgroup is found instead of raising

    Prints the usage text and exits with status 1 when the tree file or the
    outgroup list is missing.  Raises ``KeyError`` when no outgroup is found
    in a tree and ``-igerr`` was not given.
    """
    # Both the tree file and the outgroup list are required (args[2] is read
    # unconditionally below).
    if len(args) < 3:
        print((
            "USAGE: %s [tree_file] [outgroups] [-mrca -mrca-dummy (optional)] "
            "[output name (optional)] [-igerr (optional)] "
            "-- tree_file: a path to the newick tree file "
            "-- outgroups: a list of outgroups, separated by comma. "
            "The script goes through the list of outgroups. "
            "If the outgroup is found in the tree, the tree is rooted at that outgroup. "
            "Otherwise, the next outgroup in the list is used. "
            "Each element in the comma-delimited list is itself a + delimited list of taxa. "
            "By default the script makes sure that this list of taxa are monophyletic "
            "in the tree and roots the tree at the node leading to the clade "
            "represented by outgroups given in the + delimited list. "
            "Alternatively, you can specify -m which will result in mid-point rooting. "
            "Example: HUMAN,ANOCA,STRCA+TINMA first tries to root at HUMAN, "
            "if not present, tries to use ANOCA, "
            "if not present, tries to root at parent of STRCA and TINMA "
            "which need to be monophyletic. If not monophyletic, roots at STRCA. "
            "-- (optional) -mrca: using this option the mono-phyletic requirement "
            "is relaxed and always the mrca of the + delimited list of outgroups is used. "
            "-- (optional) -mrca-dummy: is like -mrca, but also adds a dummy taxon "
            "as outgroup to the root. "
        ) % args[0])
        sys.exit(1)

    treeName = args[1]
    outgroups = [x.replace("_", " ") for x in args[2].split(",")]
    use_mrca = len(args) > 3 and args[3] in ("-mrca", "-mrca-dummy")
    add_dummy = len(args) > 3 and args[3] == "-mrca-dummy"
    if len(args) > 4:
        resultsFile = args[4]
    elif treeName.endswith("unrooted"):
        # "<name>unrooted" -> "<name>rooted": drop the 9 trailing characters
        # and append ".rooted".
        resultsFile = "%s.rooted" % treeName[:-9]
    else:
        resultsFile = "%s.rooted" % treeName
    ignore = len(args) > 5 and args[5] == "-igerr"

    print("Reading input trees %s ..." % treeName, end=' ')
    trees = dendropy.TreeList.get_from_path(treeName, schema='newick')
    print("%d tree(s) found" % len(trees))

    for i, tree in enumerate(trees, start=1):
        tree.encode_bipartitions()
        print(".")

        # Remember the label attached to each internal bipartition so it can
        # be restored after re-rooting moves the nodes around.
        sl = {}
        for n in tree.internal_nodes():
            sl[n.edge.bipartition.normalize(
                bitmask=n.edge.bipartition._split_bitmask)] = n.label

        if outgroups[0] == "-m":
            print("Midpoint rooting ... ")
            tree.reroot_at_midpoint(update_bipartitions=True)
        else:
            mrca = None
            for outgroup in outgroups:
                outs = outgroup.split("+")
                outns = []
                for out in outs:
                    n = tree.find_node_with_taxon_label(out)
                    if n is None:
                        print("outgroup not found %s," % out, end=' ')
                        continue
                    outns.append(n.taxon)
                if len(outns) != 0:
                    # Find an ingroup and root the tree there
                    # NOTE(review): if every leaf is an outgroup, `ingroup`
                    # is left unset (or stale from a previous tree).
                    for n in tree.leaf_iter():
                        if n.taxon not in outns:
                            ingroup = n
                            break
                    # reroot at an ingroup, so that outgroups form
                    # monophyletic groups, if possible
                    if ingroup.edge.length is not None:
                        tree.reroot_at_edge(ingroup.edge,
                                            update_bipartitions=True,
                                            length1=ingroup.edge.length / 2,
                                            length2=ingroup.edge.length / 2)
                    else:
                        tree.reroot_at_edge(ingroup.edge,
                                            update_bipartitions=True)
                    mrca = tree.mrca(taxa=outns)
                    break

            if mrca is None:
                if ignore:
                    print("Outgroups not found: %s" % outgroups,
                          file=sys.stderr)
                    continue
                else:
                    raise KeyError("Outgroups not found %d: %s" %
                                   (i, outgroups))

            # if not mono-phyletic, then use the first
            if not use_mrca and len(mrca.leaf_nodes()) != len(outns):
                print("selected set is not monophyletic. Using %s instead. " %
                      outns[0],
                      file=sys.stderr)
                mrca = tree.find_node_with_taxon_label(outns[0].label)

            if mrca.parent_node is None:
                print("Already rooted at the root.", file=sys.stderr)
            elif mrca.edge.length is not None:
                if ingroup.edge.length is not None:
                    tree.reroot_at_edge(mrca.edge,
                                        update_bipartitions=True,
                                        length1=mrca.edge.length / 2,
                                        length2=mrca.edge.length / 2)
                else:
                    tree.reroot_at_edge(mrca.edge, update_bipartitions=True)
            else:
                tree.reroot_at_edge(mrca.edge, update_bipartitions=True)

            # NOTE(review): nesting reconstructed from a collapsed source;
            # confirm the dummy outgroup is meant only for outgroup rooting.
            if add_dummy:
                dummy = tree.seed_node.new_child(taxon=Taxon(label="outgroup"),
                                                 edge_length=1)
                tree.reroot_at_edge(dummy.edge, update_bipartitions=True)

        # This is to fix internal node labels when treated as support values
        for n in tree.internal_nodes():
            n.label = sl.get(
                n.edge.bipartition.normalize(
                    n.edge.bipartition._split_bitmask), '')

    print("writing results to %s" % resultsFile, file=sys.stderr)
    # Use a context manager so the output file is flushed and closed.
    with open(resultsFile, 'w') as results_handle:
        trees.write(file=results_handle,
                    schema='newick',
                    suppress_internal_taxon_labels=False,
                    suppress_rooting=True)
def get_super_tree(self, superTree_method, **args):
    """Summarise ``self.data['trees']`` into one tree wrapped in a ``MetaTree``.

    ``superTree_method`` selects how the branch list is obtained:

    * ``'MCC'`` -- branches of the complete tree whose clades are best supported
    * ``'consensus'`` -- mutually compatible clades, most frequent first
    * a path to an existing file -- topology read from that nexus/newick file
    * anything else -- treated as the label of one of the loaded trees

    ``'ASTRID'`` is accepted by the dispatch test but has no implementation
    here, so requesting it raises ``KeyError``.

    Each branch record is ``[barcode, posterior, nodes]`` optionally followed
    by ``age, edge_length``; ``nodes`` holds ``[weight, tree_index, node]``
    entries for every input-tree node with that barcode.
    """
    def parse_trees(**args):
        # Index every node of every input tree by its clade barcode.
        n_tree, n_branch = float(len(self.data['trees'])), {}
        for mt_id, mt in enumerate(self.data['trees']):
            # Weight: squared fraction of all taxa present in this tree.
            w = (float(len(mt.tre.leaf_nodes())) / len(self.data['taxa']))**2
            for node in mt.tre.preorder_node_iter():
                n_branch.setdefault(node.barcode, []).append([w, mt_id, node])
        return n_tree, n_branch

    def consensus(self, **args):
        n_tree, n_branch = parse_trees(**args)
        # dict.items() works on both Python 2 and 3 (iteritems is Python 2 only).
        n_branch = sorted([[len(v) / n_tree, k, v]
                           for k, v in n_branch.items()],
                          reverse=True)
        consensus_tree = []
        for posterior, branch, nodes in n_branch:
            for cbr, _, _ in consensus_tree:
                b1, b2 = sorted([branch, cbr])
                # Compatible only if b1 is nested in b2 or disjoint from it.
                if not (((b1 & b2) == b1) or ((b1 & (~b2)) == b1)):
                    branch = 0
                    break
            if branch:
                consensus_tree.append([branch, posterior, nodes])
        return sorted(consensus_tree, reverse=True)

    def MCC(self, **args):
        n_tree, n_branch = parse_trees(**args)
        for mt_id, mt in enumerate(self.data['trees']):
            # Only trees containing every taxon are scored here.
            if len(mt.tre.leaf_nodes()) == len(self.data['taxa']):
                mt.score = np.sum([
                    len(n_branch[node.barcode])
                    for node in mt.tre.preorder_node_iter()
                ])
        tre = max(self.data['trees'], key=lambda x: x.score).tre
        return [[
            n.barcode,
            len(n_branch[n.barcode]) / n_tree, n_branch[n.barcode]
        ] for n in tre.preorder_node_iter()]

    def load_subtree(self, treeLabel, **args):
        n_tree, n_branch = parse_trees(**args)
        # NOTE(review): `tre` stays unbound if no tree carries this label.
        for mt_id, mt in enumerate(self.data['trees']):
            if mt.tre.label == treeLabel:
                tre = mt.tre
                break
        return [[
            n.barcode,
            len(n_branch[n.barcode]) / n_tree, n_branch[n.barcode], n.age,
            n.edge_length
        ] for n in tre.preorder_node_iter()]

    #def ASTRID(self, **args) :
    #from dendropy import PhylogeneticDistanceMatrix

    def load_tree(self, consFile=None, **args):
        n_tree, n_branch = parse_trees(**args)
        # Sniff the format from the first line of the file.
        with open(consFile) as fin:
            schema = 'nexus' if fin.readline().upper().startswith(
                '#NEXUS') else 'newick'
        # Only the first tree in the file is used.
        for tre in Tree.yield_from_files([consFile], schema=schema):
            break
        internal_id = n_taxa = len(self.data['taxa'])
        # One bit per taxon; object dtype keeps arbitrary-precision integers.
        digit_code = np.power(2, np.arange(n_taxa, dtype='object'))
        for node in tre.postorder_node_iter():
            if node.is_leaf():
                node.id = self.data['taxa'][node.taxon.label]
                node.barcode = digit_code[node.id]
            else:
                node.id, internal_id = internal_id, internal_id + 1
                node.barcode = sum([c.barcode for c in node.child_nodes()])
        tre.seed_node.age = tre.seed_node.distance_from_tip()
        for node in tre.preorder_node_iter():
            if node.parent_node:
                node.age = node.parent_node.age - node.edge_length
        return [[
            n.barcode,
            len(n_branch.get(n.barcode, [])) / n_tree,
            n_branch.get(n.barcode, []), n.age, n.edge_length
        ] for n in tre.preorder_node_iter()]

    if superTree_method in ('MCC', 'ASTRID', 'consensus'):
        # Explicit dispatch table; 'ASTRID' has no entry and raises KeyError,
        # exactly as the former locals() lookup did.
        builders = {'MCC': MCC, 'consensus': consensus}
        branches = builders[superTree_method](self, **args)
    elif os.path.isfile(superTree_method):
        branches = load_tree(self, consFile=superTree_method, **args)
    else:
        branches = load_subtree(self, treeLabel=superTree_method, **args)

    supertree = Tree()
    sn = supertree.seed_node
    sn.barcode, sn.posterior = branches[0][0], branches[0][1]
    # Use the supplied age when present, otherwise the weighted mean age of
    # the matching input-tree nodes.
    sn.age = branches[0][3] if len(branches[0]) > 3 else np.sum(
        [n[2].age * n[0]
         for n in branches[0][2]]) / np.sum([n[0] for n in branches[0][2]])
    sn.contain = [[b[0], b[1], b[2].id] for b in branches[0][2]]

    for br in branches[1:]:
        cbr, posterior, nodes = br[:3]
        # Climb until the current node's clade contains the new clade.
        while (sn.barcode & cbr) != cbr:
            sn = sn.parent_node
        new_node = Node() if len(nodes) == 0 or (
            not nodes[0][2].taxon) else Node(taxon=Taxon(
                label=nodes[0][2].taxon.label))
        sn.add_child(new_node)
        sn = new_node
        sn.barcode, sn.posterior = cbr, posterior
        sn.contain = [[b[0], b[1], b[2].id] for b in nodes]
        if len(br) <= 3:
            # No age/length supplied: take weighted means over matching nodes.
            sn.edge_length = 0.0 if len(nodes) == 0 else np.sum(
                [n[2].edge_length * n[0]
                 for n in nodes]) / np.sum([n[0] for n in nodes])
            sn.age = sn.parent_node.age if len(nodes) == 0 else np.sum(
                [n[2].age * n[0]
                 for n in nodes]) / np.sum([n[0] for n in nodes])
        else:
            sn.age, sn.edge_length = br[3:]

    # Leaves take their taxon index; internal nodes are numbered after them.
    internal_id = len(self.data['taxa'])
    for node in supertree.postorder_node_iter():
        if node.is_leaf():
            node.id = self.data['taxa'][node.taxon.label]
        else:
            node.id = internal_id
            internal_id += 1
    return MetaTree(supertree)
def rdf2dendropyTree(file_obj=None, data=None):
    '''
    Parses the content (a `file_obj` file object, or `data` which is parsed
    with format 'xml') into a dendropyTree.

    Uses the 'has_Parent' term in http://www.evolutionaryontology.org/cdao/1.0/cdao.owl#
    to construct and return a rooted dendropy.Tree object

    Relies on rdflib and dendropy.

    Raises ValueError if more than one node without a parent is found.
    Returns None if no node without a parent is found (e.g. no has_Parent
    triples in the graph).
    '''
    from dendropy import Node, Tree, Edge, TaxonSet, Taxon
    graph = rdflib.Graph()
    if file_obj:
        graph.parse(file=file_obj)
    else:
        graph.parse(data=data, format='xml')
    # maps id() of an rdflib term to the dendropy Node created for it
    nd_dict = {}
    has_parent_predicate = OBO_PREFIX + HAS_PARENT_PREDICATE
    if _DEBUGGING:
        out = open('parse_rdf.txt', 'w')
    taxon_set = TaxonSet()
    OBO = rdflib.Namespace(u"http://purl.obolibrary.org/obo/")
    # nodes seen as a parent but not (yet) as a child: the root candidates
    parentless = set()
    for s, p, o in graph.triples((None, OBO[HAS_PARENT_PREDICATE], None)):
        # the object of the triple is the parent, the subject is the child
        parent = nd_dict.get(id(o))
        if parent is None:
            #print 'Parent o.value = ', o.value(rdflib.RDF.nodeID)
            raw_o = o
            o = rdflib.resource.Resource(graph, o)
            o_tu = o.value(OBO[REPRESENTS_TU_PREDICATE])
            if o_tu:
                o_label = o_tu.value(rdflib.RDFS.label)
                t = Taxon(label=o_label)
                taxon_set.append(t)
                parent = Node(taxon=t)
            else:
                parent = Node()
            nd_dict[id(raw_o)] = parent
            parentless.add(parent)
        child = nd_dict.get(id(s))
        if child is None:
            raw_s = s
            s = rdflib.resource.Resource(graph, s)
            s_tu = s.value(OBO[REPRESENTS_TU_PREDICATE])
            if s_tu:
                s_label = s_tu.value(rdflib.RDFS.label)
                t = Taxon(label=s_label)
                taxon_set.append(t)
                child = Node(taxon=t)
            else:
                child = Node()
            nd_dict[id(raw_s)] = child
        else:
            # a node first seen as a parent now has a parent of its own
            if child in parentless:
                parentless.remove(child)
        parent.add_child(child)

        if _DEBUGGING:
            out.write('%s %s %s\n' % (str(s), p, o))
            out.write('%s\n' % (str(parentless)))
    if _DEBUGGING:
        out.close()
    if len(parentless) != 1:
        message = "Expecting to find exactly Node (an object of a has_Parent triple) in the graph without a parent. Found %d" % len(
            parentless)
        CUTOFF_FOR_LISTING_PARENTLESS_NODES = 1 + len(
            parentless
        )  # we might want to put in a magic number here to suppress really long output
        if len(parentless) > 0 and len(
                parentless) < CUTOFF_FOR_LISTING_PARENTLESS_NODES:
            message += ":\n "
            for i in parentless:
                if i.label:
                    message += "\n " + i.label
                else:
                    message += "\n <unlabeled>" + str(id(i))
            raise ValueError(message)
        else:
            # only reachable when parentless is empty, given the cutoff above
            return None
    tree = Tree(taxon_set=taxon_set)
    tree.seed_node = list(parentless)[0]
    tree.is_rooted = True
    return tree
def test_simple_copy(self):
    """Shallow copying a Taxon (copy.copy or clone(0)) is rejected with TypeError."""
    taxon = Taxon("a")
    self.assertRaises(TypeError, copy.copy, taxon)
    self.assertRaises(TypeError, taxon.clone, 0)
def test_taxon_namespace_scoped_copy(self):
    """A namespace-scoped copy of a Taxon is the very same object."""
    taxon = Taxon("a")
    via_clone = taxon.clone(1)
    via_scoped_copy = taxon.taxon_namespace_scoped_copy()
    self.assertIs(via_clone, taxon)
    self.assertIs(via_scoped_copy, taxon)
def setUp(self):
    """Prepare duplicate-containing labels plus three uniquely named taxa."""
    # One single-character label per entry; duplicates are intentional.
    self.str_labels = list("aabcde___zzz")
    self.taxa = [Taxon(name) for name in ("t1", "t2", "t3")]
    self.taxa_labels = []
    for taxon in self.taxa:
        self.taxa_labels.append(taxon.label)
# Usage: <script> TREE_FILE INFO_FILE OUT_FILE
#
# Each non-blank line of INFO_FILE is whitespace separated: the first field
# names a leaf already in the tree, the remaining fields name taxa to add
# next to it.  The existing leaf is pushed down below a new internal node
# (which takes over the leaf's edge length) and every added taxon becomes a
# zero-length sibling under that node.
treefile = argv[1]
infofile = argv[2]
outfile = argv[3]

with open(infofile, 'r') as f:
    mapping = {}
    for line in f:
        taxa = line.split()
        if not taxa:
            # Blank (or whitespace-only) lines carry no mapping; skip them
            # instead of failing on taxa[0].
            continue
        mapping[taxa[0]] = taxa[1:]

myTree = Tree.get_from_path(treefile, "newick")

# Snapshot the leaves first: the loop below adds new leaves to the tree.
leaves = list(myTree.leaf_node_iter())
for node in leaves:
    if node.taxon.label in mapping:
        # Insert a new internal node between the leaf and its parent.
        new_node = Node(edge_length=node.edge_length)
        pNode = node.parent_node
        pNode.remove_child(node)
        pNode.add_child(new_node)
        new_node.add_child(node)
        pNode = new_node
        # Hang every mapped taxon off that internal node.
        for taxon_name in mapping[node.taxon.label]:
            new_taxon = Taxon(label=taxon_name)
            myTree.taxon_namespace.add_taxon(new_taxon)
            new_node = Node(edge_length=0, taxon=new_taxon)
            pNode.add_child(new_node)

myTree.write_to_path(outfile, "newick")
def setUp(self):
    """Create two separate Taxon objects that share the label "a"."""
    self.t1, self.t2 = Taxon("a"), Taxon("a")
# Build a taxonomy tree from comma-separated "child,parent" records.
# The first record supplies the root; every later record attaches a new
# node below the already-registered parent named in its second field.
# Double quotes are stripped from the names.
line = lines.readline()
results = line.strip().split(',')
root_label = results[0].replace("\"", "")
tree = Tree()
root = Node()
root.__dict__['label'] = root_label
nodes_dict[root_label] = root
# Names whose node label is blanked before writing; '1' is always included.
prune = ['1']
#Add root node to tree
tree.__dict__['_seed_node'].add_child(root)
for line in lines:
    results = line.strip().split(',')
    node_label = results[0].replace("\"", "")
    node = Node()
    node.__dict__['label'] = node_label
    node.taxon = Taxon(node_label)
    nodes_dict[node_label] = node
    parent_label = results[1].replace("\"", "")
    nodes_dict[parent_label].add_child(node)
    if node_label not in species:
        prune.append(node_label)
for taxa in prune:
    nodes_dict[taxa].label = ''
# tree.delete_outdegree_one_nodes()
tree.suppress_unifurcations()
# Context manager guarantees the output file is closed even if the write fails.
with open(taxonomyTree, 'w') as output:
    output.write(str(tree) + ";")
lines.close()
required=False, help="Output Newick tree file") args = parser.parse_args() tree = Tree.get_from_path(args.i, schema="newick", rooting="force-unrooted") namespace = tree.taxon_namespace labels = namespace.labels() regex = re.compile("(.+) .+ .+") species = [ match.group(1) for label in labels for match in [regex.match(label)] if match ] species_set = set(species) species = list(species_set) newNamespace = dendropy.datamodel.taxonmodel.TaxonNamespace() for specie in species: regex = re.compile(specie + " .+ .+") leaves = [ match.group(0) for label in labels for match in [regex.match(label)] if match ] mrca_node = tree.mrca(taxon_labels=leaves) del mrca_node._child_nodes[:] taxon = Taxon(specie) mrca_node.taxon = taxon newNamespace.add_taxon(taxon) tree.taxon_namespace = newNamespace tree.write(path=args.o, schema="newick", suppress_rooting=True)