def _newick_to_tree_node(fh, convert_underscores=True):
    """Build a ``TreeNode`` tree from a stream of newick tokens.

    Tokens are pulled from ``_tokenize_newick`` and consumed one at a time.
    A stack of ``(node, depth)`` pairs tracks the nodes that are still open;
    when a ``)`` is seen, every node at the current depth is popped and
    attached as a child of the node left on top of the stack.

    Parameters
    ----------
    fh
        The newick source, passed through unchanged to ``_tokenize_newick``.
    convert_underscores : bool, optional
        Forwarded unchanged to ``_tokenize_newick``.

    Returns
    -------
    TreeNode
        The root of the parsed tree. Parsing stops at the first ``;`` that
        closes a balanced tree.

    Raises
    ------
    NewickFormatError
        If a branch length is not numeric, if parentheses are unbalanced,
        if a node would receive children twice, or if the token stream ends
        without a terminating ``;`` on a balanced tree.
    """
    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label. Labels are applied one token late: the previous
        # token is a label only if it is not one of the structural tokens.
        # (The empty initial `last_token` is "in" any string, so it is
        # skipped here as well.)
        if last_token not in '(,):':
            if not next_is_distance:
                tree_stack[-1][0].name = last_token if last_token else None
            else:
                next_is_distance = False
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)
        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            # A ')' at depth 0 has no matching '('. Without the depth check
            # an input such as "a,b);" would pop every node (including the
            # root) and fail with IndexError instead of a format error.
            if current_depth < 1 or len(tree_stack) < 2:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.append(node)
            # Nodes were popped right-to-left; restore left-to-right order.
            children.reverse()
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root
            # Nodes are still open: fall through to the error below.
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
def generate_skbio_tree(classification, existing_tree=None):
    """Add the taxa of one classification to a scikit-bio taxonomy tree.

    Parameters
    ----------
    classification
        Object whose ``results()['table']`` is an iterable of dict-like rows
        with ``'tax_id'`` and ``'parent_tax_id'`` keys and optional
        ``'name'`` and ``'rank'`` keys.
    existing_tree : skbio.tree.TreeNode, optional
        Tree to extend in place. When omitted, a new tree is started from a
        node named ``'1'`` with ``tax_name`` ``'Root'``.

    Returns
    -------
    skbio.tree.TreeNode
        The tree (either ``existing_tree`` or the newly created root).

    Raises
    ------
    AssertionError
        If some nodes never got attached because their parent was not seen.
    """
    from skbio.tree import MissingNodeError, TreeNode

    rows = classification.results()['table']

    if existing_tree is not None:
        tree = existing_tree
    else:
        tree = TreeNode(name='1', length=1)
        tree.tax_name = 'Root'
        tree.rank = 'no rank'

    # Nodes whose parent is not in the tree yet, grouped by that parent's id.
    unlinked = defaultdict(list)

    for row in rows:
        tax_id = row['tax_id']

        # Nothing to do for taxa that are already part of the tree.
        try:
            tree.find(tax_id)
        except MissingNodeError:
            pass
        else:
            continue

        # Look for the parent. Waiting children are merged in only when the
        # parent exists, so that no sub-trees are built inside `unlinked`.
        parent_id = row['parent_tax_id']
        try:
            parent = tree.find(parent_id)
            children = _merge_unlinked(tax_id, unlinked)
        except MissingNodeError:
            parent = children = None

        new_node = TreeNode(name=tax_id, length=1, children=children)
        new_node.tax_name = row.get('name', '')
        new_node.rank = row.get('rank', 'no rank')

        # Attach now if possible, otherwise park the node until its parent
        # shows up.
        if parent is None:
            unlinked[parent_id].append(new_node)
        else:
            parent.append(new_node)

    assert not unlinked, 'some unlinked nodes were not included in the tree'
    return tree
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """Compute a UniFrac beta diversity matrix for a set of classifications.

    UniFrac is a beta diversity metric that accounts for how closely related
    the community members are. The weighted variant uses abundances; the
    unweighted variant only uses presence.

    Parameters
    ----------
    classifications
        Indexable collection of classification objects.
    weighted : bool, optional
        Select ``'weighted_unifrac'`` (True) or ``'unweighted_unifrac'``.
    field : str, optional
        Must be one of ``ACCEPTABLE_FIELDS``; forwarded to ``beta_counts``.
    rank : str, optional
        Forwarded to ``beta_counts`` and used to prune the tree.
    strict : bool, optional
        When true, every classification must share the job id of the first.

    Returns
    -------
    The result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    OneCodexException
        If ``strict`` is set and the job ids differ.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # Grow a single taxonomy tree covering every classification.
    tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        tree = generate_skbio_tree(classification, existing_tree=tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the real root is hung below a placeholder node
    fake_root = TreeNode(name='fake root')
    fake_root.rank = 'no rank'
    fake_root.append(tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(fake_root, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=fake_root, otu_ids=tax_ids)
def _build_trees(clade_counts, edge_lengths, support_attr): """Construct the trees with support Parameters ---------- clade_counts : dict Keyed by the frozenset of the clade and valued by the support edge_lengths : dict Keyed by the frozenset of the clade and valued by the weighted length support_attr : str The name of the attribute to hold the support value Returns ------- list of TreeNode A list of the constructed trees """ nodes = {} queue = [(len(clade), clade) for clade in clade_counts] while queue: # The values within the queue are updated on each iteration, so it # doesn't look like an insertion sort will make sense unfortunately queue.sort() (clade_size, clade) = queue.pop(0) new_queue = [] # search for ancestors of clade for (_, ancestor) in queue: if clade.issubset(ancestor): # update ancestor such that, in the following example: # ancestor == {1, 2, 3, 4} # clade == {2, 3} # new_ancestor == {1, {2, 3}, 4} new_ancestor = (ancestor - clade) | frozenset([clade]) # update references for counts and lengths clade_counts[new_ancestor] = clade_counts.pop(ancestor) edge_lengths[new_ancestor] = edge_lengths.pop(ancestor) ancestor = new_ancestor new_queue.append((len(ancestor), ancestor)) # if the clade is a tip, then we have a name if clade_size == 1: name = list(clade)[0] else: name = None # the clade will not be in nodes if it is a tip children = [nodes.pop(c) for c in clade if c in nodes] length = edge_lengths[clade] node = TreeNode(children=children, length=length, name=name) setattr(node, support_attr, clade_counts[clade]) nodes[clade] = node queue = new_queue return list(nodes.values())
def unifrac(self, weighted=True, rank="auto"):
    """Calculate the UniFrac beta diversity metric.

    UniFrac accounts for the relatedness of community members. The weighted
    form considers abundances, the unweighted form only presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If the data appear to be normalized; raw read counts are required.
    """
    import skbio.diversity

    # needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)

    # One list of per-taxon values for every row of the frame.
    counts = [df.loc[sample_id].tolist() for sample_id in df.index]
    tax_ids = df.keys().tolist()

    full_tree = self.tree_build()
    pruned_tree = self.tree_prune_rank(full_tree, rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the pruned tree is hung below a placeholder root
    from skbio.tree import TreeNode

    fake_root = TreeNode(name="fake root")
    fake_root.rank = "no rank"
    fake_root.append(pruned_tree)

    # then finally run the calculation and return
    metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric, counts, df.index.tolist(), tree=fake_root, otu_ids=tax_ids
    )
def tree_build(self):
    """Build a tree from the taxonomy data present in this object.

    This is designed for use with `ClassificationsDataFrame` or
    `SampleCollection`.

    Every row of ``self.taxonomy`` becomes a node of length 1 carrying
    ``tax_name``, ``rank`` and ``parent_tax_id`` attributes. Nodes are then
    attached to their parents; a warning is issued for any node other than
    ``"1"`` whose parent is absent.

    Returns
    -------
    `skbio.tree.TreeNode`, the node for tax ID ``"1"``, i.e. the root of a
    tree that contains all the taxa in the current analysis and their
    parents leading back to the root node.
    """
    from skbio.tree import TreeNode

    # First pass: one detached node per taxon.
    by_tax_id = {}
    for tax_id in self.taxonomy.index:
        taxon = TreeNode(name=tax_id, length=1)
        taxon.tax_name = self.taxonomy["name"][tax_id]
        taxon.rank = self.taxonomy["rank"][tax_id]
        taxon.parent_tax_id = self.taxonomy["parent_tax_id"][tax_id]
        by_tax_id[tax_id] = taxon

    # Second pass: hook every node onto its parent.
    for tax_id in self.taxonomy.index:
        child = by_tax_id[tax_id]
        try:
            parent = by_tax_id[child.parent_tax_id]
        except KeyError:
            # The root legitimately has no parent; anything else is odd.
            if tax_id != "1":
                warnings.warn(
                    "tax_id={} has parent_tax_id={} which is not in tree".format(
                        tax_id, child.parent_tax_id
                    )
                )
        else:
            parent.append(child)

    return by_tax_id["1"]
def deserialize(st: str, words: Optional[List] = None,
                convert_underscores: bool = True) -> Tuple[TreeNode, List]:
    """Read a newick string into a TreeNode and record the nesting order.

    The newick labels are expected to be integers. While the tree is parsed,
    a nested list mirroring the tree structure is built alongside it: each
    labelled node contributes its value, and each closed pair of parentheses
    contributes one inner list holding the values of its children. For
    example, a tree joining ``'aab'`` and ``'aac'`` first, then ``'abc'``,
    then ``'xyz'`` nests as ``[[['abc', ['aab', 'aac']], 'xyz']]``.

    Parameters
    ----------
    st : str
        The string to recreate the tree from; passed to
        ``_tokenize_newick``.
    words : list, optional
        If the serialized tree was created using indices instead of labels,
        the original column can be passed in. Each label ``i`` is then
        replaced by ``Sequence(words[i])``; otherwise the integer label
        itself is used. ``None`` and empty collections both mean "no words".
    convert_underscores : bool (default = True)
        Flag to convert underscores as per the newick tokenizer.

    Returns
    -------
    tuple
        ``(root, stack)`` where ``root`` is the parsed ``TreeNode`` and
        ``stack`` is the bookkeeping stack used for the nesting order. On
        success it holds a single ``(order, 0)`` pair whose first item is
        the nested list described above.

    Raises
    ------
    NewickFormatError
        If a branch length is not numeric, if parentheses are unbalanced,
        if a node would receive children twice, or if the input ends
        without a terminating ``;`` on a balanced tree.
    ValueError
        If a node label cannot be converted with ``int``.
    """
    # `if words` is ambiguous for array-likes such as a pandas column, so
    # the check is made explicit once, up front.
    use_words = words is not None and len(words) > 0

    tree_stack = []
    current_depth = 0
    last_token = ''
    root = TreeNode()
    tree_stack.append((root, current_depth))
    next_is_distance = False
    combo = []
    # Runs in lock-step with `tree_stack`: one list of values per open node.
    my_stack = [(combo, current_depth)]
    for token in _tokenize_newick(st, convert_underscores=convert_underscores):
        # Check for a label. The previous token is a label or a branch
        # length only if it is not a structural token (the empty initial
        # `last_token` is "in" any string, so it is skipped too).
        if last_token not in '(,):':
            if next_is_distance:
                # The previous token was a branch length; it has already
                # been stored as a float and must not be treated as a label
                # or recorded in the nesting order.
                next_is_distance = False
            else:
                index = int(last_token)
                val = Sequence(words[index]) if use_words else index
                tree_stack[-1][0].name = val
                my_stack[-1][0].append(val)
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)
        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
            my_stack.append((list(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
            my_stack.append((list(), current_depth))
        elif token == ')':
            # A ')' at depth 0 has no matching '('. Without the depth check
            # an input such as "0,1);" would pop every node (including the
            # root) and fail with IndexError instead of a format error.
            if current_depth < 1 or len(tree_stack) < 2:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            my_children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.append(node)
                node_values, _ = my_stack.pop()
                my_children.extend(node_values)
            # Everything was collected right-to-left; flip both lists. This
            # yields the same order as prepending each item one by one.
            children.reverse()
            my_children.reverse()
            parent = tree_stack[-1][0]
            my_parent = my_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            my_parent.append(my_children)
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root, my_stack
            # Nodes are still open: fall through to the error below.
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
def unifrac(self, weighted=True, rank=Rank.Auto):
    """Calculate the UniFrac beta diversity metric.

    UniFrac accounts for the relatedness of community members. The weighted
    form considers abundances, the unweighted form only presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.
    """
    import skbio.diversity

    df = self.to_df(rank=rank, normalize=self._guess_normalized())
    # Read the rank before `df` is rebound below.
    # NOTE(review): presumably the scaled frame does not keep `ocx_rank` -- confirm.
    ocx_rank = df.ocx_rank

    # The scikit-bio implementations of phylogenetic metrics require integer
    # counts, so normalized data are scaled up by a large constant.
    if self._guess_normalized():
        df = df * 10e9

    tax_ids = df.keys().tolist()

    full_tree = self.tree_build()
    pruned_tree = self.tree_prune_rank(full_tree, rank=ocx_rank)

    # `scikit-bio` requires that the tree root has no more than 2
    # children, otherwise it considers it "unrooted".
    #
    # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
    #
    # Our taxonomy root regularly has more than 2 children, so a fake
    # parent of `root` is added to the tree here.
    from skbio.tree import TreeNode

    fake_root = TreeNode(name="fake root")
    fake_root.rank = "no rank"
    fake_root.append(pruned_tree)

    # then finally run the calculation and return; only the weighted metric
    # is given the extra `normalized` flag
    extra_kwargs = {"tree": fake_root, "otu_ids": tax_ids}
    if weighted:
        metric = BetaDiversityMetric.WeightedUnifrac
        extra_kwargs["normalized"] = True
    else:
        metric = BetaDiversityMetric.UnweightedUnifrac

    return skbio.diversity.beta_diversity(metric, df, df.index, **extra_kwargs)
def cluster(feature_matrix, prob_features, names=None, result_constructor=None):
    """Agglomeratively merge rows of a feature matrix into a binary tree.

    Starting from one leaf per row, the pair of rows with the smallest
    distance is merged repeatedly. A merged pair is replaced by the
    element-wise product of its two rows (referred to as the "lca" below),
    and distances to it are recomputed. The last two remaining entries are
    joined under the root.

    The distance between rows ``u`` and ``v`` is
    ``(-(u + v - 2*u*v) * log(prob_features)).sum()``.

    Parameters
    ----------
    feature_matrix
        Object exposing ``.values`` as a 2-D array with one row per item.
        It is deep-copied, so the caller's object is not modified.
    prob_features
        Per-feature probabilities; their logarithm weights the distance.
        Entries whose log is ``-inf`` are replaced by ``-10000``.
    names : sequence, optional
        One hashable label per row. Defaults to ``0 .. n_rows - 1``.
        Internal nodes are labelled with fresh integers starting above the
        number of rows.
    result_constructor : optional
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(root, info)`` where ``root`` is the root ``TreeNode`` and
        ``info`` is a dict with keys ``'Ds'`` (the distance matrix of every
        iteration, with its lower triangle and diagonal set to ``inf``),
        ``'joins'`` (the pair of labels merged at each step) and ``'lcas'``
        (merged feature row keyed by the new node's integer label).
    """
    fm = copy.deepcopy(feature_matrix)
    fm = fm.values
    Ds = []
    joins = []
    if names is None:
        names = np.arange(fm.shape[0])
    # Keep the labels in a plain list. Storing them in a numpy array and
    # appending the integer ids of new nodes coerces those ids to strings
    # whenever the caller's labels are strings, which then breaks the
    # `tree_nodes` lookups (they are keyed by the integer id).
    names = list(names)

    log_prob_features = np.log(prob_features)
    # HACK: replace log(0) == -inf by a large finite penalty.
    log_prob_features[-log_prob_features == np.inf] = -10000

    # Compute the distance matrix
    D = squareform(pdist(
        fm, lambda u, v: (-(u + v - 2 * u * v) * log_prob_features).sum()))

    tree_nodes = {name: TreeNode(name=str(name)) for name in names}

    print('Starting with {0} nodes'.format(len(D)))
    new_name = len(D)
    new_lcas = {}
    while len(D) > 2:
        Ds.append(D)
        # Mask the lower triangle and the diagonal so that a site is never
        # merged with itself and each pair is only considered once. This
        # modifies the matrix just stored in `Ds` as well.
        D[np.tril_indices(D.shape[0], 0)] = np.inf
        # The argmin (i, j) identifies the two sites to be merged.
        min_i, min_j = np.unravel_index(np.argmin(D, axis=None), D.shape)
        joins.append((names[min_i], names[min_j]))

        # Create a new TreeNode from the merged children
        new_name += 1
        child_i = tree_nodes[names[min_i]]
        child_j = tree_nodes[names[min_j]]
        new_node = TreeNode(name=str(new_name), length=None, parent=None,
                            children=[child_i, child_j])
        child_i.parent = new_node
        child_j.parent = new_node
        tree_nodes[new_name] = new_node

        # Drop the two merged labels (highest index first so the other
        # index stays valid) and add the label of the new node at the end.
        for idx in sorted({int(min_i), int(min_j)}, reverse=True):
            del names[idx]
        names.append(new_name)

        # Replace rows i and j of the feature matrix with lca(i, j).
        lca = fm[min_i] * fm[min_j]
        fm = np.delete(fm, [min_i, min_j], axis=0)
        fm = np.vstack([fm, lca])
        new_lcas[new_name] = lca

        # Replace the distances of every site k to i or j with the distance
        # of k to lca(i, j), stored in the last row and column.
        D = np.delete(np.delete(D, [min_i, min_j], axis=0),
                      [min_i, min_j], axis=1)
        new_D = np.zeros((fm.shape[0], fm.shape[0]))
        new_D[:-1, :-1] = D
        new_D_row = -((fm + fm[-1] - 2 * fm * fm[-1])
                      * log_prob_features).sum(1)
        new_D[-1, :] = new_D_row
        new_D[:, -1] = new_D_row
        D = new_D

    new_name += 1
    # Merge the last two remaining sites to complete the tree
    child1, child2 = tree_nodes[names[0]], tree_nodes[names[1]]
    root = TreeNode(name=str(new_name), children=[child1, child2])
    child1.parent = root
    child2.parent = root
    return root, {'Ds': Ds, 'joins': joins, 'lcas': new_lcas}
def _cn2tn(cn, names):
    """Recursively convert a binary cluster node into a ``TreeNode``.

    Parameters
    ----------
    cn
        Node exposing ``is_leaf()``, ``id``, ``dist``, ``left`` and
        ``right``.
        NOTE(review): presumably a ``scipy.cluster.hierarchy.ClusterNode``
        -- confirm against the caller.
    names
        Lookup from a leaf's ``id`` to the name given to its tree node.

    Returns
    -------
    TreeNode
        A node with length ``cn.dist``. Leaves are named ``names[cn.id]``;
        internal nodes are named ``cn.id`` and have the converted left and
        right subtrees as children, in that order.
    """
    if not cn.is_leaf():
        # Convert the left subtree first, then the right one.
        subtrees = [_cn2tn(branch, names) for branch in (cn.left, cn.right)]
        return TreeNode(name=cn.id, length=cn.dist, children=subtrees)
    return TreeNode(name=names[cn.id], length=cn.dist)