def _add_observed_isotypes(
    tree: ete3.Tree,
    newidmap: Dict[str, str],
    isotype_order: Sequence[str],
    weight_matrix: Optional[Sequence[Sequence[float]]] = None,
):
    """Attach an ``isotype`` feature to every node of ``tree``, in place.

    The work happens in three passes:

    1. Each observed internal node (``abundance > 0`` and not a leaf) gets a
       new leaf child carrying its name, sequence and abundance; the internal
       node's own abundance is reset to 0.
    2. Each leaf with abundance 0 receives the ambiguous isotype ``"?"``.
       Each observed leaf is looked up by name in ``newidmap``; when the
       entry holds several isotypes, one child leaf per isotype is added
       below it (abundance = number of ids for that isotype) and the leaf's
       abundance is reset to 0; otherwise the single isotype is assigned to
       the leaf itself.
    3. Every node that is still internal receives the ambiguous isotype.

    Args:
        tree: tree whose nodes expose ``abundance``, ``sequence`` and ``name``.
        newidmap: mapping indexed by node name; each entry is used as a
            mapping from isotype name to a collection of sequence ids.
        isotype_order: forwarded to ``IsotypeTemplate``.
        weight_matrix: forwarded to ``IsotypeTemplate``.

    Warns:
        When an observed leaf name is missing from ``newidmap``, or when its
        entry contains the ``"?"`` key.
    """
    make_isotype = IsotypeTemplate(isotype_order, weight_matrix=weight_matrix).new

    # Pass 1: push observed internal nodes down into a dedicated leaf.
    # The descendants are snapshotted because children are added on the fly.
    for node in list(tree.iter_descendants()):
        if not node.abundance > 0 or node.is_leaf():
            continue
        observed_leaf = ete3.TreeNode(name=node.name)
        observed_leaf.add_features(
            sequence=node.sequence,
            abundance=node.abundance,
        )
        node.abundance = 0
        node.add_child(child=observed_leaf)

    # Pass 2: explode leaves that stand for several isotypes.
    for node in list(tree.get_leaves()):
        if node.abundance == 0:
            node.add_feature("isotype", make_isotype("?"))
            continue

        try:
            thisnode_isotypemap = newidmap[node.name]
        except KeyError as e:
            warnings.warn(
                f"The sequence name {e} labels an observed node, but no mapping to an original sequence ID was found."
                " Isotype will be assumed ambiguous.")
            thisnode_isotypemap = {
                "?": {f"Unknown_id_{n+1}" for n in range(node.abundance)}
            }

        if "?" in thisnode_isotypemap:
            warnings.warn(
                f"The sequence name {node.name} labels an observed node, and corresponds to sequence IDs for "
                "which no observed isotype was provided. "
                f" Isotype will be assumed ambiguous for: {', '.join(thisnode_isotypemap['?'])}"
            )

        if not len(thisnode_isotypemap) > 1:
            # A single isotype: label the leaf itself.
            sole_isotype = list(thisnode_isotypemap.keys())[0]
            node.isotype = make_isotype(sole_isotype)
            continue

        for isotype, cell_ids in thisnode_isotypemap.items():
            # The new node must hang below this leaf (not become its sibling)
            # to preserve max parsimony in case node.up has a different
            # sequence from node.
            per_isotype_leaf = ete3.TreeNode(name=node.name)
            per_isotype_leaf.add_features(
                abundance=len(cell_ids),
                sequence=node.sequence,
                isotype=make_isotype(isotype),
            )
            node.add_child(child=per_isotype_leaf)
        node.abundance = 0

    # Pass 3: everything that is not a leaf is ancestral, hence ambiguous.
    for node in tree.traverse():
        if node.is_leaf():
            continue
        node.add_feature("isotype", make_isotype("?"))
def create_nodes(node_names):
    """Return one fresh ``ete3.TreeNode`` per entry of ``node_names``.

    The returned list follows the order of ``node_names``; each node is
    created with ``name`` set to the corresponding entry.
    """
    return [ete3.TreeNode(name=label) for label in node_names]
def constrain_main(args):
    """Build a constraint tree for the species listed in ``args.species_list``.

    The tree source is selected by ``args.backbone``:

    * ``'ncbi'``: the tree is built from lineages via ``get_lineages`` /
      ``get_taxid_counts`` / ``taxid2tree``.
    * a value ending in ``'user'``: the tree is read from ``args.infile`` and
      underscores in node names are replaced by spaces.
    * ``'ncbi_apgiv'``: the tree is loaded from the packaged resource
      ``data_tree/apgiv.nwk``.

    For the two non-``'ncbi'`` sources the tree is then initialized, matched
    against the labels, pruned of non-matching leaves, polytomized and
    stripped of singleton nodes. In every case spaces in node names are
    finally replaced by underscores and the tree is written with
    ``write_tree(..., format=9)``.

    Args:
        args: namespace providing ``species_list``, ``backbone`` and,
            depending on the backbone, ``rank``, ``infile`` and ``format``.

    Raises:
        ValueError: if ``args.backbone`` is none of the supported values
            (previously this surfaced as an ``UnboundLocalError`` on ``tree``).
    """
    labels = read_item_per_line_file(args.species_list)
    if args.backbone == 'ncbi':
        lineages = get_lineages(labels=labels, rank=args.rank)
        taxid_counts = get_taxid_counts(lineages)
        tree = taxid2tree(lineages, taxid_counts)
    else:
        if args.backbone.endswith('user'):
            tree = read_tree(args.infile, args.format)
            for node in tree.traverse():
                node.name = node.name.replace('_', ' ')
        elif args.backbone == 'ncbi_apgiv':
            file_path = 'data_tree/apgiv.nwk'
            nwk_string = pkg_resources.resource_string(
                __name__, file_path).decode("utf-8")
            tree = ete3.TreeNode(newick=nwk_string, format=0,
                                 quoted_node_names=True)
        else:
            # Fail with an explicit message instead of an unbound `tree`.
            raise ValueError(
                "Unsupported backbone: {!r}".format(args.backbone))
        tree = initialize_tree(tree)
        tree = match_taxa(tree, labels, args.backbone)
        tree = delete_nomatch_leaves(tree)
        tree = polytomize_one2many_matches(tree)
        tree = remove_singleton(tree, verbose=False,
                                preserve_branch_length=False)
    for node in tree.traverse():
        node.name = node.name.replace(' ', '_')
    write_tree(tree, args, format=9)
def visualize(self):
    """Return an ``ete3.TreeNode`` named after ``self.splitFeature``.

    The node receives, as children, the ``visualize()`` result of every
    value in ``self.leafes`` followed by every value in ``self.branches``.
    """
    subtree = ete3.TreeNode(name=self.splitFeature)
    for leaf in self.leafes.values():
        subtree.add_child(leaf.visualize())
    for branch in self.branches.values():
        subtree.add_child(branch.visualize())
    return subtree
def main(name='phytree_show', args=None):
    """Entry point of the ``phytree_show`` sub-command.

    When ``name`` is ``'phytree_show'`` the options returned by
    ``getargs(args)`` are used to load a tree, configure an
    ``ete3.TreeStyle`` and either render the tree to ``out_file`` or open
    the interactive viewer. For any other ``name`` nothing is done.

    Args:
        name: sub-command name; only ``'phytree_show'`` triggers any work.
        args: forwarded unchanged to ``getargs``.

    Returns:
        Always ``0``.
    """
    myname = 'phytree_show'
    if name != myname:
        return 0

    (nwkfile, tree_format, tree_mode, show_leaf_name, show_branch_length,
     show_branch_support, align_leaf_name, hide_inner_node, save_plot,
     show_inner_name, out_file) = getargs(args)

    tree = ete3.TreeNode(nwkfile, format=tree_format)
    tree_style = ete3.TreeStyle()
    tree_style.mode = tree_mode
    tree_style.show_leaf_name = show_leaf_name
    tree_style.show_branch_length = show_branch_length
    # The option was parsed but never applied to the style before.
    tree_style.show_branch_support = show_branch_support

    if align_leaf_name:
        # Aligned names are drawn by _align_leaf_name, so the default
        # leaf-name rendering is switched off.
        tree_style.show_leaf_name = False
        _align_leaf_name(tree)
    if hide_inner_node:
        _hide_inner_node(tree)
    if show_inner_name:
        _show_inner_name(tree, tree_format)

    try:
        if save_plot:
            tree.render(out_file, tree_style=tree_style)
        else:
            tree.show(tree_style=tree_style)
    except Exception as err:
        # Best effort: rendering problems are reported, not raised.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('opps...', err)
    return 0
def traverse_tree(circle_node, parent_node):
    """Recursively mirror the children of ``circle_node`` under ``parent_node``.

    For every child returned by ``circle_node.get_children()`` a new
    ``ete3.TreeNode`` is created with the child's ``name`` and with ``dist``
    taken from the child's ``base_dist``; it is attached to ``parent_node``
    and the child's own children are then mirrored below it.

    Returns:
        Always ``0``.
    """
    for circle_child in circle_node.get_children():
        mirrored = ete3.TreeNode()
        mirrored.name, mirrored.dist = circle_child.name, circle_child.base_dist
        parent_node.add_child(mirrored)
        traverse_tree(circle_child, mirrored)
    return 0
def sample(treestring, n=20):
    """Collect the distinct disambiguations of a tree over ``n`` seeds.

    The tree is parsed from ``treestring`` (newick ``format=1``). For each
    seed ``0 .. n-1`` a copy of the tree is disambiguated with
    ``pps.disambiguate`` using the state of ``random`` after seeding, and the
    ``treeprint`` rendering of the result is added to a set.

    Fix: the tree is now built from the ``treestring`` argument; it was
    previously built from the module-level ``newick_tree2`` regardless of
    what the caller passed.

    Args:
        treestring: newick description of the tree to sample from.
        n: number of seeds to try.

    Returns:
        The set of ``treeprint`` outputs obtained.
    """
    ogt = ete3.TreeNode(newick=treestring, format=1)
    print(ogt.sequence)
    newickset = set()
    for seed in range(n):
        candidate = ogt.copy()
        random.seed(seed)
        candidate = pps.disambiguate(candidate,
                                     random_state=random.getstate())
        newickset.add(treeprint(candidate))
    return newickset
def break_branches(tree, step=1):
    """Split branches longer than ``step`` by inserting intermediate nodes.

    For every child of every node yielded by ``tree.traverse()``, the child's
    ``dist`` is replaced by ``dist % step`` and ``dist // step`` new
    ``ete3.TreeNode(dist=step)`` nodes are stacked above it; the top of that
    stack is then attached to the original parent. The tree is modified in
    place and nothing is returned.
    """
    for parent in tree.traverse():
        for child in parent.get_children():
            n_breaks, remainder = divmod(child.dist, step)
            child.dist = remainder
            top = child
            for _ in range(int(n_breaks)):
                # The new segment becomes the parent of the current top,
                # moving one step rootward.
                segment = ete3.TreeNode(dist=step)
                segment.add_child(top.detach())
                top = segment
            if n_breaks:
                parent.add_child(top)
def make_profile_tree(self):
    """Build an ``ete3`` tree mirroring the circle-node hierarchy.

    The root circle node is obtained from ``self.get_root_circle_node()``.
    The returned root has ``dist`` 0 and the root circle node's name; every
    descendant circle node is mirrored by a ``TreeNode`` with the same name
    and ``dist`` set to the circle node's ``base_dist``.

    Returns:
        The root ``ete3.TreeNode`` of the new tree.
    """
    def _mirror_children(source, target):
        # Depth-first copy of `source`'s children below `target`.
        for source_child in source.get_children():
            twin = ete3.TreeNode()
            twin.name, twin.dist = source_child.name, source_child.base_dist
            target.add_child(twin)
            _mirror_children(source_child, twin)
        return 0

    top_circle = self.get_root_circle_node()
    root = ete3.TreeNode()
    root.dist = 0
    root.name = top_circle.name
    _mirror_children(top_circle, root)
    return root
def alelabel_to_events(label, current, number_to_names):
    """Turn an event label into a chain of nodes, one node per event.

    ``label`` is consumed from the left with ``REGEX_EVENT``. Each event adds
    one node below ``current`` (or creates a fresh ``ete3.TreeNode`` when
    ``current`` is ``None``), which then becomes the new ``current``. Events
    matching ``REGEX_TRANSFER`` set the ``D``/``T`` features according to
    their type prefix (``'D@'``, ``'T@'``, ``'@'``, ``'Tb@'``); any other
    event is recorded with ``D=0``. Every node gets an ``S`` feature: the
    branch looked up in ``number_to_names`` (falling back to the branch
    itself).

    Fixes over the previous version:
      * an empty ``label`` no longer fails on the unbound ``branch`` in the
        final debug message;
      * a label that ``REGEX_EVENT`` cannot consume raises ``ValueError``
        instead of an opaque ``AttributeError`` (no match) or an endless
        loop (zero-width match).

    Args:
        label: event string to parse.
        current: node to extend, or ``None`` to start a new chain.
        number_to_names: mapping from branch identifier to species name.

    Returns:
        ``(current, n_new)``: the last node created (or the input ``current``
        when ``label`` is empty) and the number of nodes created.

    Raises:
        ValueError: if the remaining label cannot be parsed as an event.
    """
    parent_S = None if current is None else current.S
    n_new = 0  # Only used for the debug message and the return value.
    branch = None  # Stays None when `label` is empty.

    while label:
        logger.debug("label='%s'", label)
        event_match = REGEX_EVENT.match(label)
        if event_match is None or event_match.end() == 0:
            # Without progress the loop would never terminate.
            raise ValueError(
                "Cannot parse an event from label %r" % (label,))
        event = event_match.group()
        if event.startswith('.'):
            event = event[1:]
        label = label[event_match.end():]

        try:
            current = current.add_child()
        except AttributeError:
            assert current is None, "parent node should be ete3.TreeNode or None."
            current = ete3.TreeNode()
        n_new += 1

        m = REGEX_TRANSFER.match(event)
        if not m:
            # Speciation and loss, i.e. transmission to one single
            # descendant species.
            current.add_features(D=0)
            branch = event
        else:
            typ, _time, branch = m.groups()
            if typ == 'D@':
                current.add_features(D=2)
            elif typ == 'T@':
                # Transfer out.
                current.add_features(D=11, T=1)
            elif typ == '@':
                # Transfer into that branch: represented simply as a
                # speciation node.
                current.add_features(D=0, T=-1)
            elif typ == 'Tb@':
                # Possibly a gene divergence in non represented lineages;
                # duplication or speciation would both do.
                current.add_features(D=12, T=0)

        current.add_feature('S', number_to_names.get(branch, branch))

    logger.debug('From %r: created %d new nodes leading to species %s',
                 parent_S, n_new, branch)
    return current, n_new
def parentdata_to_ete3(df, dist_column='dist', root_value=None):
    """Convert parent/child tabular data into a list of ``ete3`` trees.

    Nodes are visited in the order produced by
    ``roll_leafwards_indices(df, root_value=root_value)``, which yields
    ``(nodename, children)`` pairs. A node that was not previously created as
    somebody's child starts a new tree and is appended to the result.

    Args:
        df: table indexed by node name.
        dist_column: column holding the branch length; NaN lengths are
            replaced by 0. When ``None`` every branch length is 1.
        root_value: forwarded to ``roll_leafwards_indices``.

    Returns:
        The list of root nodes, in order of creation.
    """
    def get_dist(node):
        if dist_column is None:
            return 1
        dist = df.loc[node, dist_column]
        # NaN creates an error with ete3.Tree.show()
        return 0 if isnan(dist) else dist

    roots = []
    awaiting = {}  # child name -> node already attached to its parent

    for nodename, children in roll_leafwards_indices(df, root_value=root_value):
        node = awaiting.pop(nodename, None)
        if node is None:
            # Never seen as a child: this is the root of a new tree.
            node = ete3.TreeNode(name=nodename, dist=get_dist(nodename))
            roots.append(node)

        for ch in children:
            if ch in awaiting:
                logger.error('Node name already used: %r (override).', ch)
            attached = node.add_child(name=ch, dist=get_dist(ch))
            awaiting[ch] = attached
            logger.debug('node %r %r -> child %r dist[%r]=%s', nodename, node,
                         ch, dist_column, attached.dist)
    return roots
def extendparalogy(paralog_packs: list, taxon: str, parent_paralogy=None) -> None: """'Speciate' the paralogy, i.e divide the `paralog_packs` into their descendant paralog_packs in each child_taxon, and call itself again on each descendant paralogy""" nonlocal callnb, indent #debug callnb += 1 #debug indent += 1 #debug pdbug(indent, taxon, 'call:', callnb, prefix='# ') # Create a node object representing the paralogy, if any. Attach to parent if any. para_size = len(paralog_packs) if para_size > 1: para_name = '|'.join('-'.join(ch.name for ch in pack) for pack in paralog_packs) current_paralogy = ete3.TreeNode(name=para_name) current_paralogy.add_feature("S", taxon) current_paralogy.add_feature("P", 1) # It is a paralogy #current_paralogy.add_feature("D", ("Y" if getattr(parent_paralogy, 'S', None)==taxon else "N")) #optional #parent_paralogy.add_feature("D", ) else: #assert len(paralog_packs[0]) == 1 # Not sure if it should happen current_paralogy = make_singleton_node( list(paralog_packs[0])[0].name, taxon) pdbug(indent, "No paralogy") if current_paralogy is not None: if parent_paralogy is None: current_paralogy.add_feature('D', 'Y') paralogies.append(current_paralogy) pdbug(indent, 'Create new paralogy', current_paralogy.name) else: if parent_paralogy.D == "Y" and all( len(p) == 1 for p in paralog_packs): current_paralogy.add_feature('A', 1) else: current_paralogy.add_feature('A', 0) # It is a "sub-paralogy" (steming from a main one by gene dupli) parent_paralogy.add_child(current_paralogy) pdbug(indent, ("dup" if parent_paralogy.D == "Y" else "spe") + '-extend paralogy from', parent_paralogy.name) #pdbug(indent, current_paralogy.get_tree_root() if current_paralogy else None, prefix='> ') pdbug(indent, 'paralog_packs:', paralog_packs) # The descendant paralog_pack in each species after the speciation: paralog_packs_after_speciation = {} # {ch: [set()] * para_size} #speciated_taxa = set() # Empty paralogs if genes reached a speciation node. 
# Otherwise, replace the node by the duplication descendants and start a new paralogy seen_paralogs = set() #debug for pack_i, paralog_pack in enumerate(paralog_packs): has_sub_paralogies = len(paralog_pack) > 1 #pdbug(indent, pack_i, paralog_pack, prefix='* ') while paralog_pack: paralog = paralog_pack.pop( ) # if isinstance(paralog_pack, set) else paralog = paralog_pack children_taxa = [ get_taxon(ch, ancgene2sp, ensembl_version) for ch in paralog.children ] event = infer_gene_event(paralog, taxon, set(children_taxa)) #pdbug(indent, repr(paralog), event+':', paralog.children, prefix=' ** ') if event == 'dup': #if para_size > 1 / current_paralogy is not None pdbug(indent, 'Dup!') if current_paralogy is not None: current_paralogy.add_feature('D', 'Y') #if current_paralogy.P: # # Only update the pack when we are in a paralogy. # # If not, the current branch can be dropped, because # # each paralog is already going to be checked in # # subsequent calls. paralog_pack.update(paralog.children) #if callnb == 32: import ipdb; ipdb.set_trace(context=1) if not has_sub_paralogies: assert len(set( children_taxa)) == 1, "Missing speciation nodes" child_taxon = children_taxa[0] extendparalogy( [set((ch, )) for ch in paralog.children], children_taxa[0], current_paralogy) if include_singleton_branches and not getattr( parent_paralogy, 'P', 0): # It miraculously worked. I don't know why. # This is needed with option `include_singleton_branches=True`: # it avoids drawing a "duplicate" branch for singleton genes left from a new paralogy node. break extendparalogy(paralog_packs, children_taxa[0], current_paralogy) # Seems to work okay, but I suspect some paralogs are # checked several times. #break has_sub_paralogies = True indent -= 1 #debug #also extendparalogy of the current paralogy (current_paralogy) #if has_sub_paralogies: # paralog_pack.update(paralog.children) #extendparalogy #has_sub_paralogies = True else: # What if it is a leaf? The paralog is just popped out. 
for child_taxon, speciated_paralog in zip( children_taxa, paralog.children): speciated_paralog_packs = paralog_packs_after_speciation.setdefault( child_taxon, [set() for p in range(para_size)]) #if child_taxon not in paralog_packs_after_speciation: # paralog_packs_after_speciation[child_taxon] = [set() for i * para_size speciated_paralog_packs[pack_i].add(speciated_paralog) #pdbug(indent, ' ** Add speciated paralog:', repr(speciated_paralog), # '->', pack_i, child_taxon) assert speciated_paralog not in seen_paralogs #debug seen_paralogs.add(speciated_paralog) #debug assert taxon not in paralog_packs_after_speciation, \ "Intermediate speciation nodes are missing at: %s -> %s" % \ (taxon, tuple(paralogs_after_speciation.keys())) #if para_size > 1 or no dup: #if there hasn't been a complete update of the paralog_pack # if there was a dup, this is redundant. for child_taxon, speciated_paralog_packs in paralog_packs_after_speciation.items( ): speciated_paralog_packs = [ pack for pack in speciated_paralog_packs if pack ] redundant_nodes = (len(speciated_paralog_packs) > 1 and set.intersection(*speciated_paralog_packs)) assert not redundant_nodes, paralog_packs_after_speciation pdbug(indent, 'Spe!') if current_paralogy is not None: current_paralogy.add_feature('D', 'N') extendparalogy(speciated_paralog_packs, child_taxon, current_paralogy) indent -= 1 #debug
def build_tree(tablesoup, recurs=0, _recurs_count=0):
    """Convert a 'clade' HTML table into a list of ``ete3.TreeNode`` objects.

    Each ``<tr>`` directly under the table's ``<tbody>`` is inspected through
    its first ``<td>``:

    * class ``clade-label``: a new node is appended to the result. Its
      content comes from the sibling ``<td class="clade-leaf">``: either a
      nested ``<table class="clade">`` (converted recursively) or a leaf.
    * class ``clade-slabel``: the cell text is appended to the ``info``
      feature of the most recent node.
    * anything else: a warning is logged.

    Args:
        tablesoup: parsed table element (exposes ``findChild`` etc.;
            presumably a BeautifulSoup tag -- confirm against the caller).
        recurs: maximum depth of linked pages to follow from leaf links.
        _recurs_count: current depth; internal, incremented on recursion
            into a linked page.

    Returns:
        The list of nodes built from the rows (empty when there is no
        ``<tbody>``).
    """
    tbody = tablesoup.findChild('tbody', recursive=False)  # not findChild
    nodes = []
    if tbody is not None:
        for row in tbody.findChildren('tr', recursive=False):
            cell0 = row.findChild('td', recursive=False)
            tagclass = cell0.get('class', [])
            if 'clade-label' in tagclass:
                cladename = cell0.get_text().strip()
                nodes.append(ete3.TreeNode(name=cladename))
                nodes[-1].add_feature('info', [])
                nodes[-1].add_feature('wikipedia_page_depth', _recurs_count)
                if 'dashed' in cell0.get('style', ()):
                    # This branch is controversial
                    nodes[-1].support = 0.5
                cladeleaf = cell0.find_next_sibling('td', class_='clade-leaf')
                child_clade = cladeleaf.findChild('table',
                                                  class_='clade',
                                                  recursive=False)
                if child_clade is not None:
                    # Nested clade on the same page: same depth counter.
                    for child_node in build_tree(child_clade, recurs,
                                                 _recurs_count):
                        nodes[-1].add_child(child=child_node)
                else:
                    leafname = cladeleaf.get_text().strip()
                    #not_img = lambda tag: "image" not in tag.get('class', '')
                    leaflink = cladeleaf.find('a')
                    leafimg = cladeleaf.find('img')
                    if not nodes[-1].name:
                        # Update the preceding node, which is actually just the leading branch.
                        nodes[-1].name = leafname
                    else:
                        nodes[-1].add_child(name=leafname)
                    if leaflink:
                        nodes[-1].add_feature('link',
                                              leaflink.get('href', ''))
                        # Later non-image links in the same cell are only reported.
                        otherlinks = [
                            l for l in leaflink.find_next_siblings('a')
                            if 'image' not in l.get('class', '')
                        ]
                        if otherlinks:
                            logger.warning("Alternative leaf links: " + \
                                    ";".join("%r class=%r" % (l.get_text(), l.get('class')) for l in otherlinks))
                    if leafimg:
                        nodes[-1].add_feature('img', leafimg['src'])
                        nodes[-1].add_feature('imgwidth', leafimg['width'])
                        nodes[-1].add_feature('imgheight', leafimg['height'])
                    # Follow the leaf link to another page while the depth
                    # budget allows it; links containing 'redlink=1' are skipped.
                    if _recurs_count < recurs and leaflink and 'redlink=1' not in leaflink[
                            'href']:
                        href = leaflink['href']
                        if not href.startswith('https://'):
                            href = WIKIPEDIA_URL + href
                        hreftreesoups = get_wiki_tree(url=href)
                        if hreftreesoups:
                            logger.info("Recursing into %r from %r",
                                        leafname,
                                        tablesoup.find_parent('[document]')\
                                                .findChild('title').get_text().strip()
                                        )
                            logger.info(
                                "Found %d phylogenetic trees (at depth %d).",
                                len(hreftreesoups), _recurs_count + 1)
                            leaflinktext = leaflink.get_text().strip()
                            # Search the linked page's trees for the first
                            # internal node matching this leaf, and graft its
                            # children here. The for/else pairs implement
                            # "stop at the first match in any tree".
                            for hreftreesoup in hreftreesoups:
                                for hreftree in build_tree(
                                        hreftreesoup, recurs,
                                        _recurs_count + 1):
                                    matched_node = find_matching_node(
                                        hreftree, leaflinktext,
                                        *leafname.split('/'))
                                    logger.debug("Matched node: %r",
                                                 matched_node)
                                    if matched_node and not matched_node.is_leaf(
                                    ):
                                        for leafchild in matched_node.children:
                                            nodes[-1].add_child(
                                                child=leafchild)
                                        break
                                else:
                                    continue  # next hreftreesoup if no match
                                break
                            else:
                                # No tree of the linked page yielded a match.
                                logger.warning(
                                    "Corresponding internal node (%r/%r) not found.",
                                    leafname, leaflinktext)
            elif 'clade-slabel' in tagclass:
                # NOTE(review): indexes nodes[-1] without checking that a
                # 'clade-label' row was seen first -- confirm rows always
                # come in that order.
                nodes[-1].info.append(cell0.get_text().strip())
            else:
                logger.warning(
                    "Unexpected class of cell in the row under %r: %r",
                    (nodes[-1].name if nodes else None), tagclass)
    return nodes
def gen_test_tree():
    """Build the fixed 21-node test tree and return its root.

    The root is named ``'root'`` with ``dist`` 0; the remaining nodes are
    named ``'b'`` .. ``'u'``. Every internal node has exactly two children,
    attached in the order listed in ``edges`` below.
    """
    def _new_node(label, length):
        node = ete3.TreeNode()
        node.name = label
        node.dist = length
        return node

    # (parent name, child name, child dist), in attachment order.
    edges = (
        ('root', 'b', 2), ('root', 'c', 1),
        ('b', 'd', 2), ('b', 'e', 2),
        ('c', 'f', 1), ('c', 'g', 1),
        ('d', 'h', 1), ('d', 'i', 1),
        ('h', 'j', 1), ('h', 'k', 1),
        ('e', 'l', 1), ('e', 'm', 2),
        ('m', 'n', 1), ('m', 'o', 1),
        ('f', 'p', 1), ('f', 'q', 2),
        ('q', 'r', 1), ('q', 's', 1),
        ('s', 't', 1), ('s', 'u', 1),
    )

    root = _new_node('root', 0)
    by_name = {'root': root}
    for parent_label, child_label, length in edges:
        child = _new_node(child_label, length)
        by_name[child_label] = child
        by_name[parent_label].add_child(child)
    return root
def duplicateOrthoTree(trees1, trees2, distDuplication=0.005) :
    """Merge pairs of aligned trees by grafting the second onto the first.

    ``alignTrees([trees1, trees2])`` provides groups of ``[w, t1, t2]``
    records. A list of events is drawn once: the leaf names of
    ``result[1][0][1]`` in random order, each with an exponentially
    distributed distance (scale ``distDuplication``). For each record the two
    trees are copied and, for each event, a new node is inserted above the
    matching lineage of the first tree and the matching lineage of the second
    tree is moved under that node. Each record is then replaced in place by
    ``[w, merged_tree]``.

    Leaf names are split on the first ``'_'``; the prefix is used to pair
    leaves of the two trees.

    Returns:
        ``result``, with every record rewritten as ``[w, tree]``.
    """
    result = alignTrees([trees1, trees2])
    # Events: [name prefix, full leaf name, distance], in random leaf order.
    tips = np.random.permutation(result[1][0][1].get_leaf_names())
    dists = np.random.exponential(distDuplication, size=len(tips))
    events = [[t.split('_', 1)[0], t, d] for t, d in zip(tips.tolist(), dists)]
    for res in result :
        for r in res :
            w, t1, t2 = r
            tr, ts = t1.copy(), t2.copy()
            # n1 groups the leaves of tr by integer prefix; it is not read
            # again in this function.
            n1 = {}
            for n in tr.get_leaves() :
                o = int(n.name.split('_', 1)[0])
                if o not in n1 :
                    n1[o] = [n]
                else :
                    n1[o].append(n)
            # Prefix -> leaf name in the second tree.
            n2 = { n.split('_', 1)[0]:n for n in ts.get_leaf_names() }
            for o, cn1, dist in events :
                c2 = ts.get_leaves_by_name(n2[o])
                if not c2 :
                    continue
                c2 = c2[0]
                c1 = tr.get_leaves_by_name(cn1)[0]
                # Walk rootward while the remaining distance exceeds the
                # current branch; d1/d2 end up as the residual distance.
                d1 = dist
                while d1 - c1.dist > 0 and c1.up :
                    d1, c1 = d1 - c1.dist, c1.up
                d2 = dist
                while d2 - c2.dist > 0 and c2.up :
                    d2, c2 = d2 - c2.dist, c2.up
                # Insert the new node n between c1 and its former parent p.
                p, l = c1.up, c1.dist
                n = ete3.TreeNode()
                if p is not None :
                    p.remove_child(c1)
                n.add_child(c1)
                c1.up, c1.dist = n, d1
                if p is not None :
                    p.add_child(n)
                    n.dist = l - c1.dist
                else :
                    # c1 was the root: n becomes the root of the first tree.
                    tr = n
                # The lookups below discard their result; they fail with
                # IndexError if an event leaf is no longer found in tr.
                for _, x, _ in events :
                    tr.get_leaves_by_name(x)[0]
                # Move c2 (from the second tree) under n.
                p2 = c2.up
                if p2 is not None :
                    p2.remove_child(c2)
                n.add_child(c2)
                c2.up, c2.dist = n, d2
                if p2 is not None :
                    gp = p2.up
                    if gp :
                        # Remove p2 and hand its remaining children to gp,
                        # adding p2's branch length to each.
                        gp.remove_child(p2)
                        for c in p2.get_children() :
                            p2.remove_child(c)
                            gp.add_child(c)
                            c.up, c.dist = gp, c.dist + p2.dist
                    elif len(p2.get_children()) == 1 :
                        # p2 was the root with one child left: that child
                        # becomes the root of the second tree.
                        ts = p2.get_children()[0]
                        ts.up = None
                else :
                    # c2 had no parent, so the whole second tree was moved.
                    break
            r[:] = [w, tr]
    return result
def create_nj_tree(node_names, distance_matrix):
    """Create a tree using the neighbor joining algorithm.

    Algorithm from: "Biological sequence analysis: Probabilistic models of
    proteins and nucleic acids" by Durbin, Eddy, Krogh and Mitchinson (1998).

    While more than two nodes remain, the pair (i, j) with the smallest
    D-matrix entry is joined under a new node k; rows/columns i and j are
    removed from the distance matrix and a row/column for k is added. The two
    final nodes are attached to a new root, each at half their distance.

    Fix: the search for the minimal pair now skips the diagonal. Selecting
    ``i == j`` would attach the same node twice under ``k`` and delete an
    unrelated entry from the working list.

    Args:
        node_names: names of the leaves, in the order of the matrix rows.
        distance_matrix: pairwise distances, indexable as ``m[i][j]`` and
            providing ``copy()``.

    Returns:
        The root ``ete3.TreeNode`` of the tree.
    """
    # Working list of nodes still to be joined. Nodes are ete3.TreeNode
    # instances so the tree can easily be visualized.
    l_nodes = create_nodes(node_names)
    dist_mat = distance_matrix.copy()

    while len(l_nodes) > 2:
        # "Pick a pair i, j in L for which D(i,j) is minimal"
        D_matrix = create_D_matrix(dist_mat)
        minimum = sys.float_info.max
        # Fallback pair; valid because at least three nodes remain.
        i_min, j_min = 0, 1
        size = len(dist_mat)
        for i in range(size):
            for j in range(size):
                if i != j and D_matrix[i][j] < minimum:
                    minimum = D_matrix[i][j]
                    i_min, j_min = i, j

        # "Define a new node k and set d(k,m) = 1/2 * (d(i,m) + d(j,m) - d(i,j)),
        # for all m in L except i, j": drop i and j, then append k.
        new_dist_mat = remove_row_col(dist_mat, i_min, j_min)
        new_dist_list = create_combined_dist_list(dist_mat, i_min, j_min)
        new_dist_mat = add_row_col(new_dist_mat, new_dist_list)

        # Create k with i and j as children and set the edge lengths.
        k = ete3.TreeNode()
        dist_i_k = get_edge_length(dist_mat, i_min, j_min)
        dist_j_k = dist_mat[i_min][j_min] - dist_i_k
        k.add_child(child=l_nodes[i_min], dist=dist_i_k)
        k.add_child(child=l_nodes[j_min], dist=dist_j_k)

        # Remove i and j, add k. After deleting i, the index of j shifts
        # down by one when it was located after i.
        del l_nodes[i_min]
        if i_min > j_min:
            del l_nodes[j_min]
        else:
            del l_nodes[j_min - 1]
        l_nodes.append(k)

        dist_mat = new_dist_mat

    # Only two nodes remain: join them under a new root, splitting their
    # distance evenly.
    half = dist_mat[0][1] / 2
    root = ete3.TreeNode(dist=0)
    root.add_child(child=l_nodes[0], dist=half)
    root.add_child(child=l_nodes[1], dist=half)
    return root
def visualize(self):
    """Return a childless ``ete3.TreeNode`` named after ``self.decision``."""
    leaf_node = ete3.TreeNode(name=self.decision)
    return leaf_node
def make_singleton_node(nodename, taxon):
    """Create a node named ``nodename`` with features ``S=taxon`` and ``P=1``.

    ``P`` is the flag for "paralogy branch" (the original author was unsure
    whether it should be set here).
    """
    singleton = ete3.TreeNode(name=nodename)
    singleton.add_features(S=taxon, P=1)
    return singleton