def annotate_tree_depths(tree: CassiopeiaTree) -> Dict[int, List[str]]:
    """Annotates every node with its depth and its number of triplets.

    Sets two attributes on each node of the tree, modifying it in place:

    - ``"depth"``: 0 for the root, otherwise the depth of the parent plus one.
    - ``"number_of_triplets"``: the number of ways to pick three leaves from
      below the node's children, minus the picks whose three leaves all fall
      under one single child.

    Args:
        tree: A CassiopeiaTree.

    Returns:
        A dictionary mapping each depth to the list of nodes at that depth,
        in the order the traversal visits them.
    """
    depth_to_nodes = defaultdict(list)

    # Parents must be annotated before their children, hence postorder=False.
    for node in tree.depth_first_traverse_nodes(
        source=tree.root, postorder=False
    ):
        if tree.is_root(node):
            depth = 0
        else:
            depth = tree.get_attribute(tree.parent(node), "depth") + 1
        tree.set_attribute(node, "depth", depth)
        depth_to_nodes[depth].append(node)

        # Compute each child's clade size once and reuse it for both terms.
        clade_sizes = [
            len(tree.leaves_in_subtree(child)) for child in tree.children(node)
        ]
        # Triplets drawn entirely from one child are not counted at this node.
        within_single_child = sum(nCr(size, 3) for size in clade_sizes)
        tree.set_attribute(
            node,
            "number_of_triplets",
            nCr(sum(clade_sizes), 3) - within_single_child,
        )

    return depth_to_nodes
def compute_expansion_pvalues(
    tree: CassiopeiaTree,
    min_clade_size: int = 10,
    min_depth: int = 1,
    copy: bool = False,
) -> Union[CassiopeiaTree, None]:
    """Call expansion pvalues on a tree.

    Uses the methodology described in Yang, Jones et al, BioRxiv (2021) to
    assess the expansion probability of a given subclade of a phylogeny.
    Mathematical treatment of the coalescent probability is described in
    Griffiths and Tavare, Stochastic Models (1998).

    The probability computed corresponds to the probability that, under a
    simple neutral coalescent model, a given subclade contains the observed
    number of cells; in other words, a one-sided p-value. Often, if the
    probability is less than some threshold (e.g., 0.05), this might indicate
    that there exists some subclade under this node to which this expansion
    probability can be attributed (i.e. the null hypothesis that the subclade
    is undergoing neutral drift can be rejected).

    This function sets the attribute "expansion_pvalue" on every node of the
    tree. The value is initialized to 1.0 and overwritten only for nodes that
    pass both the ``min_clade_size`` and ``min_depth`` filters.

    On a typical balanced tree, this function will perform in O(n log n)
    time, but can be up to O(n^3) on highly unbalanced trees. A future
    endeavor may be to implement the function in O(n) time.

    Args:
        tree: CassiopeiaTree
        min_clade_size: Minimum number of leaves in a subtree to be
            considered.
        min_depth: Minimum depth of clade to be considered. The root has
            depth 0 and every other node has the depth of its parent plus
            one; branch lengths are not used.
        copy: If True, annotate and return a copy of the tree instead of
            modifying the input tree.

    Returns:
        None if ``copy`` is False (the input tree is annotated in place);
        otherwise the annotated copy of the tree.
    """
    tree = tree.copy() if copy else tree

    # Instantiate the attribute on every node and record node depths. The
    # traversal is not postorder, so a parent's depth is known before its
    # children are reached.
    _depths = {}
    for node in tree.depth_first_traverse_nodes(postorder=False):
        tree.set_attribute(node, "expansion_pvalue", 1.0)
        if tree.is_root(node):
            _depths[node] = 0
        else:
            _depths[node] = _depths[tree.parent(node)] + 1

    for node in tree.depth_first_traverse_nodes(postorder=False):
        children = tree.children(node)
        n = len(tree.leaves_in_subtree(node))
        k = len(children)

        for child in children:
            # Size of the candidate clade, computed once per child.
            b = len(tree.leaves_in_subtree(child))
            if b < min_clade_size or _depths[child] < min_depth:
                continue

            # this value below is a simplification of the quantity:
            # sum[simple_coalescent_probability(n, b2, k) for \
            #   b2 in range(b, n - k + 2)]
            p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)
            tree.set_attribute(child, "expansion_pvalue", p)

    return tree if copy else None
def percolate(
    self,
    character_matrix: pd.DataFrame,
    samples: List[str],
    priors: Optional[Dict[int, Dict[int, float]]] = None,
    weights: Optional[Dict[int, Dict[int, float]]] = None,
    missing_state_indicator: int = -1,
) -> Tuple[List[str], List[str]]:
    """Partitions the samples by percolating a pairwise similarity graph.

    A graph is built with one node per sample. Two samples are connected
    when ``self.similarity_function`` scores the pair strictly above
    ``self.threshold``. While the graph is a single connected component, all
    edges sharing the smallest remaining similarity are removed together.

    If more than two connected components result, they are merged: the LCA
    character vector of every component is computed, a CassiopeiaTree is
    built on those vectors and solved with ``self.joining_solver``, and the
    first non-unifurcating split below the root of that tree decides which
    components end up together.

    Args:
        character_matrix: Character matrix
        samples: A list of samples to partition
        priors: A dictionary storing the probability of each character
            mutating to a particular state. Passed to the tree built on the
            component LCAs.
        weights: Weighting of each (character, state) pair. Typically a
            transformation of the priors. Passed to the similarity function.
        missing_state_indicator: Character representing missing data.

    Returns:
        The groups of sample names. When no pair of samples passes the
        similarity threshold this is the tuple ``(samples, [])``; otherwise
        it is a list with one list of sample names per group.
    """
    node_ids = solver_utilities.convert_sample_names_to_indices(
        character_matrix.index, samples
    )
    states = character_matrix.to_numpy()

    graph = nx.Graph()
    graph.add_nodes_from(node_ids)

    # Bucket the retained edges by similarity so that ties can be removed
    # together during percolation.
    edges_by_weight = defaultdict(list)
    for u, v in itertools.combinations(node_ids, 2):
        score = self.similarity_function(
            states[u, :],
            states[v, :],
            missing_state_indicator,
            weights,
        )
        if score > self.threshold:
            edges_by_weight[score].append((u, v))
            graph.add_edge(u, v)

    if len(graph.edges) == 0:
        return samples, []

    components = list(nx.connected_components(graph))

    # Sorted in descending order, so pop() yields the smallest weight left.
    remaining_weights = sorted(edges_by_weight, reverse=True)
    while len(components) <= 1:
        lightest = remaining_weights.pop()
        for u, v in edges_by_weight[lightest]:
            graph.remove_edge(u, v)
        components = list(nx.connected_components(graph))

    if len(components) > 2:
        # Summarize every component by the LCA of its members' characters.
        component_to_nodes = {}
        lcas = {}
        for position, component in enumerate(components):
            label = f"component{position}"
            members = list(component)
            component_to_nodes[label] = members
            member_rows = [list(row) for row in states[members, :]]
            lcas[label] = data_utilities.get_lca_characters(
                member_rows, missing_state_indicator
            )

        # Cluster the components by solving a tree over their LCAs.
        lca_tree = CassiopeiaTree(
            pd.DataFrame.from_dict(lcas, orient="index"),
            missing_state_indicator=missing_state_indicator,
            priors=priors,
        )
        self.joining_solver.solve(lca_tree, collapse_mutationless_edges=False)

        # Walk down from the root past unifurcations; the first real split
        # defines the clusters of components.
        clusters = []
        current = lca_tree.root
        while not clusters:
            successors = lca_tree.children(current)
            if len(successors) != 1:
                clusters.extend(
                    lca_tree.leaves_in_subtree(child) for child in successors
                )
            else:
                current = successors[0]

        # Replace every component label by the sample indices it stands for.
        partition_sides = [
            list(
                itertools.chain.from_iterable(
                    component_to_nodes[label] for label in cluster
                )
            )
            for cluster in clusters
        ]
    else:
        partition_sides = [list(component) for component in components]

    # Translate row indices back to the names used in the character matrix.
    names = list(character_matrix.index)
    return [[names[index] for index in side] for side in partition_sides]
def sample_triplet_at_depth(
    tree: CassiopeiaTree,
    depth: int,
    depth_to_nodes: Optional[Dict[int, List[str]]] = None,
) -> Tuple[Tuple[str, str, str], str]:
    """Samples a triplet at a given depth.

    Samples a triplet of leaves such that the depth of the LCA of the triplet
    is at the specified depth. The tree is expected to carry the "depth" and
    "number_of_triplets" node attributes read by this function.

    Args:
        tree: CassiopeiaTree
        depth: Depth at which to sample the triplet
        depth_to_nodes: An optional dictionary that maps a depth to the nodes
            that appear at that depth. This speeds up the function
            considerably.

    Returns:
        A pair ``(triplet, outgroup)``. ``triplet`` is a tuple of three leaf
        names. ``outgroup`` is the name of the outgroup leaf (which is also
        the third entry of ``triplet``) when two of the leaves were drawn
        from the same daughter clade, and the string "None" when the three
        leaves were drawn from three different daughter clades.
    """
    # NOTE(review): if no triplets exist at this depth the normalization
    # below divides by zero; callers presumably check beforehand - confirm.
    if depth_to_nodes is None:
        candidate_nodes = tree.filter_nodes(
            lambda x: tree.get_attribute(x, "depth") == depth
        )
    else:
        candidate_nodes = depth_to_nodes[depth]

    # Sample a node from this depth with probability proportional to the
    # number of triplets underneath it.
    triplet_counts = [
        tree.get_attribute(v, "number_of_triplets") for v in candidate_nodes
    ]
    total_triplets = sum(triplet_counts)
    probs = [count / total_triplets for count in triplet_counts]
    node = np.random.choice(candidate_nodes, size=1, replace=False, p=probs)[0]

    # Look up the leaves of every daughter clade once, instead of once per
    # combination of clades.
    children = list(tree.children(node))
    clade_leaves = {child: tree.leaves_in_subtree(child) for child in children}
    clade_sizes = {child: len(leaves) for child, leaves in clade_leaves.items()}

    # Generate the probabilities to sample each combination of 3 daughter
    # clades, proportional to the number of triplets each combination
    # yields. Choices include all ways to choose 3 different daughter clades
    # or 2 leaves from one daughter clade and 1 from another.
    combos = []
    counts = []
    for i, j, k in itertools.combinations_with_replacement(children, 3):
        if i == j == k:
            # All three leaves in one clade: their LCA lies below this node.
            continue
        # Combinations are emitted in positional order, so a repeated clade
        # always shows up as i == j or j == k; i == k would imply i == j == k.
        if i == j:
            count = nCr(clade_sizes[i], 2) * clade_sizes[k]
        elif j == k:
            count = nCr(clade_sizes[j], 2) * clade_sizes[i]
        else:
            count = clade_sizes[i] * clade_sizes[j] * clade_sizes[k]
        combos.append((i, j, k))
        counts.append(count)

    denom = sum(counts)
    probs = [count / denom for count in counts]

    # Choose daughter clades.
    ind = np.random.choice(range(len(combos)), size=1, replace=False, p=probs)[0]
    i, j, k = combos[ind]

    if i == j:
        paired_clade, single_clade = i, k
    elif j == k:
        paired_clade, single_clade = j, i
    else:
        # Three distinct clades: one leaf from each, no outgroup.
        return (
            (
                str(np.random.choice(clade_leaves[i])),
                str(np.random.choice(clade_leaves[j])),
                str(np.random.choice(clade_leaves[k])),
            ),
            "None",
        )

    # The in-group is two distinct leaves of the repeated clade; the outgroup
    # is one leaf of the other clade.
    in_group = np.random.choice(clade_leaves[paired_clade], 2, replace=False)
    out_group = str(np.random.choice(clade_leaves[single_clade]))
    return (str(in_group[0]), str(in_group[1]), out_group), out_group