def __init__(self, newick = None, text_array = None,
             fdist=clustvalidation.default_dist):
    """Initialise a clustering tree node.

    Args:
        newick: Newick input forwarded to ``TreeNode.__init__``.
        text_array: When truthy, handed to ``link_to_arraytable``.
        fdist: Distance function registered through
            ``set_distance_function``; it is only registered when
            ``newick`` is truthy. Defaults to
            ``clustvalidation.default_dist`` (according to the note that
            accompanied the original code: spearman_dist when the scipy
            module is loaded, euclidean_dist otherwise).
    """
    # The base initialiser sets up the basic tree features and loads the
    # newick, if one was given.
    TreeNode.__init__(self, newick)

    # Private attributes all start out unset.
    self._fdist = self._silhouette = None
    self._intercluster_dist = self._intracluster_dist = None
    self._profile = self._std_profile = None

    # Cluster-specific feature names.
    for feature_name in ("intercluster_dist", "intracluster_dist",
                         "silhouette", "profile", "deviation"):
        self.features.add(feature_name)

    # Attach the array data to the tree when it was supplied.
    if text_array:
        self.link_to_arraytable(text_array)

    if newick:
        self.set_distance_function(fdist)
def validate_monophylies(tree: ete3.TreeNode, clades: dict, force_check=False):
    """Print a monophyly report for every clade and return an exit status.

    Args:
        tree: Tree queried through ``check_monophyly``, ``traverse`` and
            ``get_common_ancestor``.
        clades: Mapping of clade name to the leaf names expected in it.
        force_check: When true, the detailed report is printed (and the
            status set to 1) even for clades reported as monophyletic.

    Returns:
        0 if every clade took the "OK" path, otherwise 1.
    """
    status = 0
    for clade, leaves in clades.items():
        is_mono, clade_type, _ = tree.check_monophyly(leaves, 'name')
        if is_mono and not force_check:
            print('Clade %s: OK (%d leaves).' % (clade, len(leaves)))
            continue

        status = 1
        print('Clade %s: NO (%s):' % (clade, clade_type))

        # First node, in preorder, whose name begins with the clade name.
        named_node = next(
            (candidate for candidate in tree.traverse('preorder')
             if candidate.name.startswith(clade)),
            None,
        )
        if named_node is None:
            print(" * Not found: '^%s.*'" % clade)
        else:
            found_leaves = named_node.get_leaf_names()
            print(' * Found node %s' % named_node.name)
            print(' with extra leaves: ', ' '.join(set(found_leaves).difference(leaves)))
            print(' and missing leaves: ', ' '.join(set(leaves).difference(found_leaves)))

        mrca = tree.get_common_ancestor(leaves)
        print(' * MRCA is', mrca.name)
    return status
def recreate_tree(tree, num_layers=None, color=True):
    """Build a new tree with the same topology and node names as ``tree``.

    Only the name (and, optionally, the colour) of each node is carried
    over; every copied node gets size 10 and the 'sphere' shape.

    Args:
        tree: Root of the tree to mirror; nodes expose ``name``,
            ``children`` and, when ``color`` is true, ``color``.
        num_layers: If not None, stop after this many layers of children
            have been copied.
        color: When true, ``img_style['fgcolor']`` of each copy is set from
            the ``color`` attribute of the source node.

    Returns:
        The root ``TreeNode`` of the copy.
    """
    def styled_twin(source):
        # One place for the styling shared by the root and all descendants.
        twin = TreeNode(name=source.name)
        twin.img_style['size'] = 10
        if color:
            twin.img_style['fgcolor'] = source.color
        twin.img_style['shape'] = 'sphere'
        return twin

    root_copy = styled_twin(tree)
    # Each entry pairs a source node with its already-created copy.
    frontier = [(tree, root_copy)]
    depth = 0
    while frontier:
        upcoming = []
        for source, duplicate in frontier:
            for kid in source.children:
                kid_copy = styled_twin(kid)
                duplicate.add_child(kid_copy)
                upcoming.append((kid, kid_copy))
        frontier = upcoming
        depth += 1
        if num_layers is not None and depth == num_layers:
            break
    return root_copy
def compare(self, tree2, method='identity'):
    """Compare this tree to another tree.

    Args:
        tree2: Object exposing a ``tree`` attribute, like ``self``.
        method: One of

            * ``'identity'`` - compare the sorted lists of
              (sequence, frequency, parent sequence) over all nodes.
            * ``'MRCA'`` - for every pair of taxa (nodes of ``self.tree``
              with a truthy ``frequency``), take the hamming distance
              between the common-ancestor sequence in ``self.tree`` and in
              ``tree2.tree``; return the summed distance divided by the
              summed length of the ``self.tree`` ancestor sequences.
            * ``'RF'`` - Robinson-Foulds result (first element) computed on
              deep copies in which every node with ``frequency > 0`` gets
              an extra child carrying its sequence.

    Returns:
        ``bool`` for 'identity'; a number for 'MRCA' and 'RF'.

    Raises:
        ValueError: If ``method`` is not one of the names above.
    """
    if method == 'identity':
        def node_triples(root):
            return sorted(
                (node.sequence,
                 node.frequency,
                 node.up.sequence if node.up is not None else None)
                for node in root.traverse())

        # Trees are identical when these sorted lists match exactly.
        return node_triples(self.tree) == node_triples(tree2.tree)

    if method == 'MRCA':
        # numpy.zeros is what the deprecated top-level ``scipy.zeros`` alias
        # pointed to; importing it here keeps the block self-contained.
        import numpy as np

        taxa = [node.sequence for node in self.tree.traverse() if node.frequency]
        n_taxa = len(taxa)

        # Resolve every taxon once per tree instead of once per pair. The
        # built-in next() works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``iterator.next()`` method.
        true_nodes = [next(self.tree.iter_search_nodes(sequence=seq))
                      for seq in taxa]
        other_nodes = [next(tree2.tree.iter_search_nodes(sequence=seq))
                       for seq in taxa]

        d = np.zeros(shape=(n_taxa, n_taxa))
        sum_sites = np.zeros(shape=(n_taxa, n_taxa))
        for i in range(n_taxa):
            for j in range(i + 1, n_taxa):
                MRCA_true = self.tree.get_common_ancestor(
                    (true_nodes[i], true_nodes[j])).sequence
                MRCA = tree2.tree.get_common_ancestor(
                    (other_nodes[i], other_nodes[j])).sequence
                d[i, j] = hamming_distance(MRCA_true, MRCA)
                sum_sites[i, j] = len(MRCA_true)
        # NOTE: with fewer than two taxa both sums are zero, so the ratio is
        # not a meaningful number.
        return d.sum() / sum_sites.sum()

    if method == 'RF':
        tree1_copy = self.tree.copy(method='deepcopy')
        tree2_copy = tree2.tree.copy(method='deepcopy')
        for treex in (tree1_copy, tree2_copy):
            # Snapshot the traversal because children are added while looping.
            for node in list(treex.traverse()):
                if node.frequency > 0:
                    child = TreeNode()
                    child.add_feature('sequence', node.sequence)
                    node.add_child(child)
        try:
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True)[0]
        except Exception:
            # Best-effort retry that tolerates duplicated attribute values.
            # ``Exception`` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True,
                                              allow_dup=True)[0]

    raise ValueError('invalid distance method: ' + method)
def _collapse_tree_by_sequence_and_isotype(tree: ete3.TreeNode):
    """Merge each node into its parent when sequence and isotype both match.

    The tree is modified in place. A first pass stores on ``dist`` whether a
    node differs from its parent in sequence or isotype. A second pass
    deletes every node whose ``dist`` compares equal to 0, after adding its
    abundance to the parent and giving the parent its name.
    """
    # Pass 1: flag every edge along which sequence or isotype changes.
    for node in tree.iter_descendants():
        parent = node.up
        node.dist = (parent.sequence != node.sequence
                     or parent.isotype != node.isotype)

    # Pass 2: fold unchanged nodes into their parents.
    for node in tree.iter_descendants():
        if node.dist == 0:
            parent = node.up
            parent.abundance += node.abundance
            parent.name = node.name
            node.delete(prevent_nondicotomic=False)
def read_tree(infile, format, quiet=False):
    """Load a tree from ``infile`` or, when ``infile`` is '-', from stdin.

    Args:
        infile: Value passed to ``TreeNode`` as ``newick``; the special
            value '-' means "use the first line of standard input".
        format: Forwarded to ``TreeNode`` as ``format``.
        quiet: When false, the number of leaves is written to stderr.

    Returns:
        The ``TreeNode`` that was built (with ``quoted_node_names=True``).
    """
    newick_source = sys.stdin.readlines()[0] if infile == '-' else infile
    tree = TreeNode(newick=newick_source, format=format, quoted_node_names=True)
    if not quiet:
        leaf_total = sum(1 for node in tree.traverse() if node.is_leaf())
        sys.stderr.write('number of leaves in input tree: {:,}\n'.format(leaf_total))
    return tree
def simplify_tree(self, tree):
    """Recursively simplify ``tree`` in place.

    ``self._simplify_tree`` is applied first. A node labelled 'Arg1',
    'Arg2', 'Conn' or 'none' has its children replaced by the result of
    ``self.get_leave_node``. Otherwise each child is either simplified
    recursively (when ``self.deeperthan1`` says so) or wrapped in a new
    ``TreeNode`` that carries the child's label.
    """
    # The value returned here was never used by the original code.
    self._simplify_tree(tree)

    if tree.label in ('Arg1', 'Arg2', 'Conn', 'none'):
        tree.children = self.get_leave_node(tree)
        return

    for position, child in enumerate(tree.children):
        if self.deeperthan1(child):
            self.simplify_tree(child)
            continue
        wrapper = TreeNode()
        wrapper.children = [child]
        wrapper.label = child.label
        tree.children[position] = wrapper
def __init__(self, newick=None, format=0, dist=None, support=None, name=None):
    """Default initialiser for the TreeClass.

    Every argument is passed straight through, by keyword, to
    ``TreeNode.__init__``. (Original rationale: this works better than
    wrapping the entire class.)
    """
    init_kwargs = dict(
        newick=newick,
        format=format,
        dist=dist,
        support=support,
        name=name,
    )
    TreeNode.__init__(self, **init_kwargs)
def DFS_get_tree(root, par_node):
    """Depth-first construction of the subtree hanging below ``par_node``.

    ``get_info(root)`` yields either a one-element result (name only; the
    node is a leaf) or a three-element result (name, left, right). In the
    latter case two fresh ``TreeNode`` children are attached and filled in
    recursively.

    Returns:
        None for a leaf (or for any other result length); otherwise the
        pair of return values of the two recursive calls.
    """
    info = get_info(root)
    par_node.name = info[0]
    field_count = len(info)

    if field_count == 1:
        # Leaf reached: nothing further to attach.
        return None

    if field_count == 3:
        _, left_source, right_source = info
        left_node = TreeNode()
        right_node = TreeNode()
        par_node.add_child(left_node)
        par_node.add_child(right_node)
        return (DFS_get_tree(left_source, left_node),
                DFS_get_tree(right_source, right_node))

    return None
def isotype_tree(
    tree: ete3.TreeNode,
    newidmap: Dict[str, Dict[str, str]],
    isotype_names: Sequence[str],
    weight_matrix: Optional[Sequence[Sequence[float]]] = None,
) -> ete3.TreeNode:
    """Return an isotype-annotated copy of ``tree``.

    Isotypes are added so that isotype switching is minimized and switching
    order is obeyed:

    * Observed isotypes are added to each observed node of the collapsed
      trees output by gctree inference. If cells with the same sequence but
      different isotypes are observed, collapsed tree nodes must be
      "exploded" into new nodes with the appropriate isotypes and
      abundances. Each unique sequence ID generated by gctree is prepended
      to its observed isotype, and a new `isotyped.idmap` mapping these new
      sequence IDs to original sequence IDs is written in the output
      directory.
    * Isotypes of unobserved ancestral genotypes are resolved in a way that
      minimizes isotype switching and obeys isotype switching order. If the
      observed isotypes of an observed internal node and its children
      violate switching order, the observed internal node is replaced with
      an unobserved node with the same sequence, and the observed internal
      node is placed as a child leaf. This always allows switching order
      conflicts to be resolved, and should usually increase the isotype
      transitions required in the resulting tree.

    After annotation, nodes identical to their parent in sequence and
    isotype are collapsed, each node name gets its isotype appended, and
    each non-root ``dist`` is set to the hamming distance to the parent.

    Args:
        tree: ete3 Tree. It is copied; the copy is what gets modified.
        newidmap: mapping of sequence IDs to isotypes, such as that output
            by :meth:`utils.explode_idmap`.
        isotype_names: list or other sequence of isotype names observed, in
            correct switching order.
        weight_matrix: forwarded unchanged to ``_add_observed_isotypes``.

    Returns:
        A new ete3 Tree whose nodes have isotype annotations in the
        attribute ``isotype``. Node names in this tree also contain isotype
        names.
    """
    annotated = tree.copy()

    _add_observed_isotypes(
        annotated, newidmap, isotype_names, weight_matrix=weight_matrix
    )
    _disambiguate_isotype(annotated)
    _collapse_tree_by_sequence_and_isotype(annotated)

    # Suffix each node name with its isotype.
    for node in annotated.traverse():
        node.name = " ".join((str(node.name), str(node.isotype)))

    # Branch lengths become sequence distances to the parent.
    for node in annotated.iter_descendants():
        parent = node.up
        node.dist = hamming_distance(parent.sequence, node.sequence)

    return annotated
def get_clade_count(tree:TreeNode, clades: List[str], alternate_names: Dict[str, str]) -> Dict[str, int]:
    """Count the leaves of ``tree`` that fall into each clade.

    A leaf belongs to the first known prefix (clade names first, then the
    alternate names) that its name starts with; leaves matching no prefix
    are counted under "other".

    Args:
        - tree (TreeNode): the tree whose leaves are counted
        - clades (List[str]): clade names; a clade name is expected at the
          start of the sequence name
        - alternate_names (Dict[str, str]): maps an alternate prefix found
          in sequence names to the desired clade name

    Returns:
        - Dict[str, int]: maps each clade label to the number of leaves
          carrying it
    """
    # Each clade maps to itself; alternate spellings map to their target.
    prefix_to_label = {clade: clade for clade in clades}
    prefix_to_label.update(alternate_names)

    def label_for(leaf):
        for prefix, label in prefix_to_label.items():
            if leaf.name.startswith(prefix):
                return label
        return "other"

    leaf_labels = [label_for(leaf) for leaf in tree.get_leaves()]
    return pd.Series(data=leaf_labels, dtype=str).value_counts().to_dict()
def parameterised_test(mutDict, insertionDict, mutations, expected_output):
    """Run ``writeGenomeShortIndels`` on one node and check its effects.

    A node carrying ``mutations`` is written to a ``MockFile``. The call is
    expected to apply and then undo the mutations, so ``mutDict`` and
    ``insertionDict`` must compare equal to their pre-call ``.copy()``
    snapshots, and the data written must equal ``expected_output``.
    """
    sink = MockFile()
    test_node = TreeNode(name="test_node")
    test_node.mutations = mutations

    # Snapshots taken before the call under test.
    mut_before, ins_before = mutDict.copy(), insertionDict.copy()

    genome_tree.writeGenomeShortIndels(
        node=test_node,
        file=sink,
        mutDict=mutDict,
        insertionDict=insertionDict,
    )

    # Both dictionaries must be the same before and after printing.
    assert mutDict == mut_before
    assert insertionDict == ins_before
    assert sink.written_data == expected_output
def initialize_pathogen_tree(self):
    """Initialize one pathogen lineage per host tip.

    Each lineage is a ``TreeNode`` named ``<host tip name>_P`` with dist 0;
    the height at which the lineage starts and its host tip are stored as
    features. Lineages at height 0 go to ``self.extant_p``, all others to
    ``self.not_yet_sampled_p``.

    TODO: relax this assumption - needs some way to input
    """
    # Reset containers.
    self.extant_p = []           # pathogen lineages that have not coalesced
    self.not_yet_sampled_p = []  # pathogen lineages higher in the tree

    for host_tip in self.hosttree.get_leaves():
        pnode = TreeNode(name=host_tip.name + '_P', dist=0)
        pnode.add_features(height=host_tip.height, host=host_tip)
        bucket = self.extant_p if host_tip.height == 0 else self.not_yet_sampled_p
        bucket.append(pnode)
def _convert_biotree_to_etetree(bio_tree):
    """Convert ``bio_tree`` into a ``TreeNode`` by way of newick text.

    The tree is written with ``write_newick`` into an in-memory buffer,
    every ``Inner<digits>:`` label is reduced to a bare ``:``, and the
    resulting text is parsed by ``TreeNode``.
    """
    buffer = io.StringIO()
    write_newick([bio_tree], buffer)
    labelled_newick = buffer.getvalue()
    stripped_newick = re.sub("Inner[0-9]+:", ":", labelled_newick)
    return TreeNode(stripped_newick)
def add_tree_layer(tree, leaves, clusters, proportions, child_coords, prop_filter):
    '''
    Add one more layer of cluster nodes below the current leaves.

    tree: tree that we want to add an additional layer to; returned as
        passed in
    leaves: current leaves of the tree, indexed by parent cluster id
    clusters: indexable collection of the cluster ids in the child layer
    proportions: nested dictionary, child id -> {parent id -> proportion of
        cells contained in the parent that are also contained in the child}
    child_coords: coordinate data for each child id
    prop_filter: proportion that must be exceeded for an edge between
        clusters to be created

    returns: tuple of (tree, dictionary of child id -> new TreeNode)
    '''
    child_nodes = {}
    for position in range(len(clusters)):
        cluster_id = clusters[position]
        fresh_node = TreeNode(name=cluster_id)
        child_nodes[cluster_id] = fresh_node
        # Attach coordinate data and the cluster id to the node.
        fresh_node.add_features(coords=child_coords[cluster_id])
        fresh_node.add_features(cluster_id=cluster_id)

    for child_id, parent_shares in proportions.items():
        # Only the parent with the largest share is considered, so a child
        # is never attached to more than one parent node.
        best_parent_id = max(parent_shares, key=parent_shares.get)
        if parent_shares[best_parent_id] > prop_filter:
            leaves[best_parent_id].add_child(child_nodes[child_id])

    return tree, child_nodes
def layout_lift(node: TreeNode, levels: int = 3) -> None:
    """Layout implementation for a tree node

    Adds a rotated text label to the node and assigns a ``NodeStyle`` whose
    colour, size and shape are derived from the node attributes ``e``,
    ``u``, ``Hd``, ``Ch`` and ``Sq``.
    NOTE(review): the meaning of these attributes is not visible in this
    block; only the way they are compared is documented below.

    Parameters
    ----------
    node : TreeNode
        the root of the taxonomy tree / sub-tree
    levels : int
        a number of tree levels to draw; the node name is shown only when
        ``int(node.e) < levels`` or ``node.Hd == "1"``

    Returns
    -------
    None
    """
    # Label is empty unless the node passes the level / Hd test.
    name = TextFace(node.name if (int(node.e) < levels or node.Hd == "1") else "", tight_text=True)
    name.rotation = 270
    node.add_face(name, column=0, position="branch-right")
    nst = NodeStyle()
    # Foreground colour by ranges of float(node.u):
    # (0, .2] light green, (.2, .4] green, above .4 dark green, else red.
    if .2 >= float(node.u) > 0:
        nst["fgcolor"] = "#90ee90"
    elif .4 >= float(node.u) > .2:
        nst["fgcolor"] = "green"
    elif float(node.u) > .4:
        nst["fgcolor"] = "#004000"
    else:
        nst["fgcolor"] = "red"
    if node.Hd == "0":
        nst["size"] = 20
        nst["shape"] = "square"
    else:
        # NOTE(review): both branches below assign the same size and shape.
        if node.Ch == "1":
            nst["size"] = 40
            nst["shape"] = "circle"
        else:
            nst["size"] = 40
            nst["shape"] = "circle"
    # NOTE(review): nesting level of this check should be confirmed; placed
    # here it turns the square of an Hd == "0" node into a circle.
    if node.Sq == "1":
        nst["shape"] = "circle"
    node.set_style(nst)
def p_toArbre(self):
    """Build the tree node for this element.

    The returned node is named "main()" and has three children, in order:
    a node named after ``str(self.sons[0])``, the result of
    ``self.sons[1].c_toArbre()`` and the result of
    ``self.sons[2].e_toArbre()``.
    """
    root = TreeNode()
    root.name = "main()"

    label_node = TreeNode()
    label_node.name = str(self.sons[0])

    branches = (
        label_node,
        self.sons[1].c_toArbre(),
        self.sons[2].e_toArbre(),
    )
    for branch in branches:
        root.add_child(branch)
    return root
def discover_children(object=None):
    """
    Discovers all children defined in the thrift_spec of an instance of a
    thrift auto-generated class.

    For every entry of ``object.obj.thrift_spec`` a ``TreeNode`` is created,
    named after element 1 of the entry, annotated with ``t_parent``,
    ``t_name`` and ``t_type`` (element 2 of the entry), and attached to
    ``object`` as a child.

    :param object: The treenode object to search to discover the children.
    :return: The discovered children, wrapped in treenodes.
    """
    discovered = []
    for spec in object.obj.thrift_spec.values():
        field_name = spec[1]
        child = TreeNode(name=field_name)
        child.add_features(t_parent=object.obj, t_name=field_name, t_type=spec[2])
        object.add_child(child)
        discovered.append(child)
    return discovered
def __init__(self, newick=None, alignment=None, alg_format="fasta",
             sp_naming_function=_parse_species, format=0, **kargs):
    """Initialise a phylogenetic tree node.

    Args:
        newick: Newick input forwarded to ``TreeNode.__init__``.
        alignment: When truthy, linked through ``link_to_alignment``.
        alg_format: Alignment format passed along with ``alignment``.
        sp_naming_function: Registered via
            ``set_species_naming_function``, only when ``newick`` is truthy.
        format: Forwarded to ``TreeNode.__init__``.
        **kargs: Extra keyword arguments forwarded to ``TreeNode.__init__``.
    """
    # _update names?
    self._name, self._species = "NoName", "Unknown"
    self._speciesFunction = None

    # Caution! The native __init__ has to be called only after
    # _speciesFunction has been set to None.
    TreeNode.__init__(self, newick=newick, format=format, **kargs)

    # This runs only after the whole tree has been read, because the
    # 'alignment' argument is not passed to the PhyloNode constructor
    # during parsing.
    if alignment:
        self.link_to_alignment(alignment, alg_format)

    if newick:
        self.set_species_naming_function(sp_naming_function)
def color_taxon(node:TreeNode, color_marker: str, offset: int):
    """
    Sets the node style of [node] from colour information in its name.

    The name is split on [color_marker]. When the marker occurs, the text
    after its last occurrence is cut with ``[:offset]`` and used as a
    colour, prefixed with "#" if it does not already start with one.
    Nothing happens when the marker is absent.

    Args:
        - node(TreeNode): the leaf to set node_style for
        - color_marker(str): the delimiter after which the colour
          information follows; the colour is expected in hex format,
          either FFFFFF or #FFFFFF
        - offset(int): end index of the slice applied to the colour text.
          NOTE(review): a positive value keeps the first [offset]
          characters, a negative value drops characters from the back -
          confirm which one callers pass.
    """
    pieces = node.name.split(color_marker)
    if len(pieces) <= 1:
        return

    color = pieces[-1][:offset]
    hex_color = color if color.startswith("#") else f"#{color}"
    node.img_style = node_style(color=hex_color)
def isotype_parsimony(tree: ete3.TreeNode) -> float:
    """Sum :meth:`isotype_distance` over every edge of an isotyped tree.

    If no weight matrix was provided during isotyping of the tree, then the
    return value of this function is the number of isotype transitions
    along edges in the tree.
    """
    total = 0
    for node in tree.iter_descendants():
        total = total + isotype_distance(node.up.isotype, node.isotype)
    return total
def root_on(tree: TreeNode, clade: str, clade_details: List, clade_total:int, clades: List[str], alternate_names: Dict[str, str]) -> TreeNode:
    """
    Roots the given tree on a clade.

    The outgroup node is chosen by ``get_max_ancestor`` and applied with
    ``tree.set_outgroup``.

    Args:
        - tree (TreeNode): The tree that will be rooted
        - clade (str): The clade to root the tree on. Tree must contain
          the clade.
        - clade_details (List): Contains [color, density, coverage] in
          that order
        - clade_total (int): The total number of leaves in the tree
          belonging to [clade]
        - clades (List[str]): List containing the clade names. Clade names
          should occur at the start of the sequence name.
        - alternate_names (Dict[str, str]): Maps alternate name found in
          sequence to desired name.

    Returns:
        - TreeNode: the same ``tree`` object, with the clade as outgroup
    """
    outgroup = get_max_ancestor(
        tree, clade, clade_details, clade_total, clades, alternate_names
    )
    tree.set_outgroup(outgroup)
    return tree
def layout_raw(node: TreeNode, tight_mode: bool = True) -> None:
    """Layout implementation for a tree node

    The node name is drawn rotated by 270 degrees at "branch-right": one
    text face per space-separated segment (one column each) in tight mode,
    otherwise a single face in column 0. The node itself is styled as a
    black circle of size 20.

    Parameters
    ----------
    node : TreeNode
        the root of the taxonomy tree / sub-tree
    tight_mode : bool, default=True
        a mode to print node names more tightly

    Returns
    -------
    None
    """
    labels = node.name.split(' ') if tight_mode else [node.name]
    for column, label in enumerate(labels):
        face = TextFace(label, tight_text=True)
        face.rotation = 270
        node.add_face(face, column=column, position="branch-right")

    style = NodeStyle()
    style["fgcolor"] = "black"
    style["size"] = 20
    style["shape"] = "circle"
    node.set_style(style)
def add_tree_to_distribution(self, tree):
    """
    Add the bipartitions of a tree to the CCP distribution.

    Takes:
        - tree (ete3.Tree): phylogenetic tree. A root with exactly three
          children is rearranged in place into a binary root first.

    Side effects:
        - on the first observed tree, every leaf name is registered through
          ``get_leaf_id``
        - every node is passed, in postorder, to
          ``add_tree_branch_to_distribution``
        - ``nb_observation`` is incremented

    Raises:
        SystemExit: with code 1, after printing a message, when any node
            has more than two children.
    """
    if len(tree.children) == 3:
        # Special unrooted case where the tree begins with a trifurcation:
        # the trifurcation is artificially removed by regrouping the second
        # and third children under a new internal node.
        joint = TreeNode()
        # Both children are fetched before detaching, because detaching
        # the first one would shift the position of the other.
        second = tree.children[1]
        third = tree.children[2]
        second.detach()
        third.detach()
        tree.add_child(joint)
        joint.add_child(second)
        joint.add_child(third)

    for node in tree.traverse():
        if len(node.children) > 2:
            # print() with a single argument behaves the same on Python 2
            # and Python 3; the former print statements were a SyntaxError
            # on Python 3.
            print("multifurcation detected! Please provide bifurcating trees.")
            print("exiting now")
            # Same exception that exit(1) raises, without depending on the
            # site-provided ``exit`` helper.
            raise SystemExit(1)

    if self.nb_observation == 0:
        # No tree has been observed yet: add all the leaves to the CCP.
        for leaf_name in tree.get_leaf_names():
            self.get_leaf_id(leaf_name)

    # Register each branch of the tree.
    for node in tree.traverse("postorder"):
        self.add_tree_branch_to_distribution(node)

    self.nb_observation += 1
    return
def simulate(self):
    '''
    Simulate a collapsed tree given params.

    Replaces the existing ``tree`` data member with the simulation result
    and returns self.

    Raises ValueError when ``self.params`` is None.
    '''
    if self.params is None:
        raise ValueError('params must be defined for simulation')

    # Initiate by running a LeavesAndClades simulation to get the number of
    # clones (self.c) and mutants (self.m) in the root node of the
    # collapsed tree.
    LeavesAndClades.simulate(self)

    self.tree = TreeNode()
    self.tree.add_feature('frequency', self.c)

    if self.m == 0:
        return self

    # One recursively simulated subtree per mutant, attached at distance 1.
    for _ in range(self.m):
        sub_simulation = CollapsedTree(params=self.params, frame=self.frame)
        subtree = sub_simulation.simulate().tree
        subtree.dist = 1
        self.tree.add_child(subtree)
    return self
def coalesce_paths(self, child_paths, t0):
    """
    Create a new TreeNode as the parent of two extant pathogen lineages.

    The new lineage is appended to ``self.extant_p``; both children get
    their ``up`` and ``dist`` set, are removed from ``self.extant_p`` and
    appended to ``self.not_extant_p``.

    :param child_paths:  Two TreeNodes of the pathogen tree, both extant
                         and in the same host.
    :param t0:  Time of pathogen coalescence as height; must be greater
                than the height of both children.
    :return:  TreeNode object for the new pathogen lineage.
    """
    assert len(child_paths) == 2, 'Can only coalesce 2 pathogen lineages at a time'
    first, second = child_paths

    assert first in self.extant_p and second in self.extant_p, 'Both pathogen lineages must be extant'
    assert first.host == second.host, 'Can only coalesce pathogen lineages in the same host'
    shared_host = first.host

    assert first.height < t0 and second.height < t0, \
        'Pathogen lineage heights %f %f cannot exceed coalescent event %f' % (first.height, second.height, t0)

    # Create the new pathogen lineage.
    merged_name = '_'.join([x.name for x in child_paths])
    merged = TreeNode(name=merged_name, dist=0)
    merged.add_features(host=shared_host, height=t0)

    # Cast child_paths as a list because ete3.Tree.children requires it.
    merged.children = list(child_paths)
    self.extant_p.append(merged)

    # Coalesced pathogen lineages are no longer extant.
    for lineage in child_paths:
        lineage.up = merged
        # The height was stored when the lineage node was created.
        lineage.dist = t0 - lineage.height
        self.extant_p.remove(lineage)
        self.not_extant_p.append(lineage)

    return merged
def get_qualifying_nodes (tree: TreeNode,clade: Union[str, List[str]], clade_total: int, clade_details: List, clades: List[str] = None, alternate_names: Dict[str, str] = None, node_cache = None, ):
    """
    Collect the internal nodes whose density and coverage are within bounds.

    For each non-leaf node the clade distribution is taken from
    [node_cache] when one is given, otherwise computed with
    ``get_clade_count``. ``get_details`` turns it into (density, coverage),
    which are checked against the inclusive bounds stored in
    [clade_details] at MIN_DENSITY / MAX_DENSITY / MIN_COV / MAX_COV.

    Returns:
        - dict mapping each qualifying node to the tuple
          (density, coverage, density + coverage)
    """
    qualifying = {}
    for node in tree.traverse():
        if node.is_leaf():
            continue

        if node_cache is None:
            distribution = get_clade_count(node, clades= clades, alternate_names=alternate_names)
        else:
            distribution = node_cache[node]

        density, coverage = get_details(distribution, clade, clade_total)

        if not (clade_details[MIN_DENSITY] <= density <= clade_details[MAX_DENSITY]):
            continue
        if clade_details[MIN_COV] <= coverage <= clade_details[MAX_COV]:
            qualifying[node] = (density, coverage, density + coverage)
    return qualifying
def copy_forest(forest, features=None):
    """Copy every tree of ``forest`` into fresh ``TreeNode`` objects.

    For each node, ``dist``, ``support`` and ``name`` are copied, together
    with every requested feature that the node actually has.

    Args:
        forest: Collection of tree roots.
        features: Names of the features to copy. When empty or None, the
            ``features`` attribute of the first tree is used.

    Returns:
        List with one copied root per input tree, in input order. An empty
        forest yields an empty list.
    """
    if not features:
        # The fallback needs a first tree; without one there is nothing to
        # copy, so return early instead of failing on forest[0].
        if len(forest) == 0:
            return []
        features = forest[0].features
    features = set(features)

    copied_forest = []
    for tree in forest:
        copied_tree = TreeNode()
        copied_forest.append(copied_tree)
        # Iterative walk over (source node, copy) pairs.
        todo = [(tree, copied_tree)]
        while todo:
            n, copied_n = todo.pop()
            copied_n.dist = n.dist
            copied_n.support = n.support
            copied_n.name = n.name
            for f in features:
                if hasattr(n, f):
                    copied_n.add_feature(f, getattr(n, f))
            for c in n.children:
                # add_child() without arguments supplies the new child node.
                todo.append((c, copied_n.add_child()))
    return copied_forest
def color_taxons(tree: TreeNode, color_marker: str, offset:int):
    """
    Sets the node style for all nodes in [tree].

    Every node first receives the default ``node_style()``; leaves are then
    passed to ``color_taxon`` so that colour information in their names can
    be applied.

    Args:
        - tree(TreeNode): the input tree to set node_style for
        - color_marker(str): the delimiter used to split the taxon sequence
          name; forwarded to ``color_taxon``
        - offset(int): forwarded to ``color_taxon``
    """
    for node in tree.traverse():
        node.img_style = node_style()
        if not node.is_leaf():
            continue
        color_taxon(node, color_marker=color_marker, offset= offset)
def get_node_details(tree: TreeNode, clades:List[str], alternate_names: Dict[str, str]) -> Dict[TreeNode, Dict[str, int]]:
    """
    Computes the clade distribution at each internal node of the tree.

    Args:
        - tree (TreeNode): The tree to analyse
        - clades (List[str]): List containing the clade names. Clade names
          should occur at the start of the sequence name.
        - alternate_names (Dict[str, str]): Maps alternate name found in
          sequence to desired name.

    Returns:
        - Dict[TreeNode, Dict[str, int]]: Dictionary mapping each non-leaf
          node to the clade distribution under it.
    """
    return {
        node: get_clade_count(node, clades= clades, alternate_names=alternate_names)
        for node in tree.traverse()
        if not node.is_leaf()
    }
def creation_by_words(self, words):
    """
    Creation of a tree based on separate words in the word list

    Every entry is split into tokens. The longest token prefix already
    registered in ``self.name2node`` becomes the attachment point (the new
    root when there is none), and one node per remaining token is chained
    below it and registered in ``self.name2node`` under its full name.

    :type words: list
    :return: the root of the newly created tree
    """
    # Creates an empty tree
    tree = Tree()
    tree.name = ""

    # set() makes sure there are no duplicates.
    for word in set(words):
        # If no similar words exist, the word is added to the base of tree.
        target = tree

        # The tokens get their own name so the collection being iterated is
        # not rebound inside the loop.
        tokens = split(r'[\s-]+|:[\\/]{2}', word)
        if self.is_reversed:
            tokens = list(reversed(tokens))

        # Find relatives in the tree: try the longest prefix first.
        # range() replaces the Python-2-only xrange().
        root = ''
        pos = 0
        for pos in range(len(tokens), -1, -1):
            root = ' '.join(tokens[:pos])
            if root in self.name2node:
                target = self.name2node[root]
                break

        # Add new nodes as necessary
        fullname = root
        for wd in tokens[pos:]:
            fullname = (fullname + ' ' + wd).strip()
            new_node = TreeNode(name=wd.strip(), dist=target.dist + 1)
            target.add_child(new_node)
            self.name2node[fullname] = new_node
            target = new_node

    return tree
def __init__(self, newick=None, format=0, dist=None, support=None, name=None):
    """Default init for the TreeClass.

    All five arguments are forwarded by keyword to ``TreeNode.__init__``.
    (Original rationale: this works better than wrapping the entire class.)
    """
    forwarded = {
        "newick": newick,
        "format": format,
        "dist": dist,
        "support": support,
        "name": name,
    }
    TreeNode.__init__(self, **forwarded)