from collections import defaultdict
import itertools
import math
from typing import Dict, List, Optional, Tuple, Union

import networkx as nx
import numpy as np
import pandas as pd

from cassiopeia.data import CassiopeiaTree

# `nCr`/`nCk` below are binomial-coefficient helpers (e.g., `math.comb` or
# `scipy.special.comb(..., exact=True)`); `solver_utilities` and
# `data_utilities` are Cassiopeia-internal modules assumed to be importable.


def annotate_tree_depths(tree: CassiopeiaTree) -> Dict[int, List[str]]:
    """Annotates tree depth at every node.

    Adds two attributes to every node: "depth" (the number of edges between
    the node and the root) and "number_of_triplets" (the number of leaf
    triplets whose latest common ancestor is that node). Modifies the tree
    in place.

    Args:
        tree: A CassiopeiaTree

    Returns:
        A dictionary mapping depth to the list of nodes at that depth.
    """

    depth_to_nodes = defaultdict(list)
    for n in tree.depth_first_traverse_nodes(source=tree.root,
                                             postorder=False):
        if tree.is_root(n):
            tree.set_attribute(n, "depth", 0)
        else:
            tree.set_attribute(n, "depth",
                               tree.get_attribute(tree.parent(n), "depth") + 1)

        depth_to_nodes[tree.get_attribute(n, "depth")].append(n)

        number_of_leaves = 0
        correction = 0
        for child in tree.children(n):
            number_of_leaves += len(tree.leaves_in_subtree(child))
            correction += nCr(len(tree.leaves_in_subtree(child)), 3)

        tree.set_attribute(n, "number_of_triplets",
                           nCr(number_of_leaves, 3) - correction)

    return depth_to_nodes
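# A minimal standalone sketch (not Cassiopeia API) of the triplet bookkeeping
# above: the number of triplets whose LCA is a given node is C(n, 3) over the
# n leaves below it, minus all triplets falling entirely within one child
# clade. Toy example: a node whose children subtend 2 and 3 leaves.
_child_clade_sizes = [2, 3]
_n_leaves = sum(_child_clade_sizes)
_triplets_at_node = math.comb(_n_leaves, 3) - sum(
    math.comb(size, 3) for size in _child_clade_sizes
)
# C(5, 3) - [C(2, 3) + C(3, 3)] = 10 - (0 + 1) = 9
assert _triplets_at_node == 9

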
def compute_expansion_pvalues(
    tree: CassiopeiaTree,
    min_clade_size: int = 10,
    min_depth: int = 1,
    copy: bool = False,
) -> Union[CassiopeiaTree, None]:
    """Call expansion pvalues on a tree.

    Uses the methodology described in Yang, Jones et al, BioRxiv (2021) to
    assess the expansion probability of a given subclade of a phylogeny.
    Mathematical treatment of the coalescent probability is described in
    Griffiths and Tavare, Stochastic Models (1998).

    The probability computed corresponds to the probability that, under a simple
    neutral coalescent model, a given subclade contains the observed number of
    cells; in other words, a one-sided p-value. Often, if the probability is
    less than some threshold (e.g., 0.05), this might indicate that there
    exists some subclade under this node to which this expansion probability
    can be attributed (i.e., the null hypothesis that the subclade is
    undergoing neutral drift can be rejected).

    This function will add an attribute "expansion_pvalue" to the tree, and
    return None unless :param:`copy` is set to True.

    On a typical balanced tree, this function will perform in O(n log n) time,
    but can be up to O(n^3) on highly unbalanced trees. A future endeavor may
    be to implement the function in O(n) time.

    Args:
        tree: CassiopeiaTree
        min_clade_size: Minimum number of leaves in a subtree to be considered.
        min_depth: Minimum depth of clade to be considered. Depth is measured
            in number of nodes from the root, not branch lengths.
        copy: If True, operate on and return a copy of the tree.

    Returns:
        If copy is set to False, returns the tree with attributes added
            in place. Else, returns a new CassiopeiaTree.
    """

    tree = tree.copy() if copy else tree

    # instantiate attributes
    _depths = {}
    for node in tree.depth_first_traverse_nodes(postorder=False):
        tree.set_attribute(node, "expansion_pvalue", 1.0)

        if tree.is_root(node):
            _depths[node] = 0
        else:
            _depths[node] = _depths[tree.parent(node)] + 1

    for node in tree.depth_first_traverse_nodes(postorder=False):

        n = len(tree.leaves_in_subtree(node))

        k = len(tree.children(node))
        for c in tree.children(node):

            if len(tree.leaves_in_subtree(c)) < min_clade_size:
                continue

            depth = _depths[c]
            if depth < min_depth:
                continue

            b = len(tree.leaves_in_subtree(c))

            # this value below is a simplification of the quantity:
            # sum[simple_coalescent_probability(n, b2, k) for \
            #   b2 in range(b, n - k + 2)]
            p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)

            tree.set_attribute(c, "expansion_pvalue", p)

    return tree if copy else None
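
# Hedged numeric check (not Cassiopeia API) of the simplification noted in the
# comment above `p`: under the Griffiths and Tavare (1998) coalescent, the
# probability that one of k children of a node with n leaves subtends exactly
# b2 leaves is C(n - b2 - 1, k - 2) / C(n - 1, k - 1). Summing the tail
# b2 in [b, n - k + 1] telescopes (hockey-stick identity) to
# C(n - b, k - 1) / C(n - 1, k - 1), the closed form used for `p`.
def simple_coalescent_probability(n: int, b2: int, k: int) -> float:
    # Stand-in for the helper referenced in the comment above.
    return math.comb(n - b2 - 1, k - 2) / math.comb(n - 1, k - 1)


_n, _b, _k = 40, 12, 3
_tail_sum = sum(
    simple_coalescent_probability(_n, b2, _k) for b2 in range(_b, _n - _k + 2)
)
_closed_form = math.comb(_n - _b, _k - 1) / math.comb(_n - 1, _k - 1)
assert abs(_tail_sum - _closed_form) < 1e-12

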
    # `percolate` is excerpted from a solver class; it relies on the instance
    # attributes `self.similarity_function`, `self.threshold`, and
    # `self.joining_solver`.
    def percolate(
        self,
        character_matrix: pd.DataFrame,
        samples: List[str],
        priors: Optional[Dict[int, Dict[int, float]]] = None,
        weights: Optional[Dict[int, Dict[int, float]]] = None,
        missing_state_indicator: int = -1,
    ) -> Tuple[List[str], List[str]]:
        """The function used by the percolation algorithm to partition the
        set of samples in two.
        First, a pairwise similarity graph is generated with samples as nodes
        such that edges between a pair of nodes is some provided function on
        the number of character/state mutations shared. Then, the algorithm
        removes the minimum edge (in the case of ties all are removed) until
        the graph is split into multiple connected components. If there are more
        than two connected components, the procedure joins them until two remain.
        This is done by inferring the mutations of the LCA of each sample set
        obeying Camin-Sokal Parsimony, and then clustering the groups of samples
        based on their LCAs. The provided solver is used to cluster the groups
        into two clusters.
        Args:
            character_matrix: Character matrix
            samples: A list of samples to partition
            priors: A dictionary storing the probability of each character
                mutating to a particular state.
            weights: Weighting of each (character, state) pair. Typically a
                transformation of the priors.
            missing_state_indicator: Character representing missing data.

        Returns:
            A tuple of lists, representing the left and right partition groups.
        """
        sample_indices = solver_utilities.convert_sample_names_to_indices(
            character_matrix.index, samples)
        unique_character_array = character_matrix.to_numpy()

        G = nx.Graph()
        G.add_nodes_from(sample_indices)

        # Add edge weights into the similarity graph
        edge_weight_buckets = defaultdict(list)
        for i, j in itertools.combinations(sample_indices, 2):
            similarity = self.similarity_function(
                unique_character_array[i, :],
                unique_character_array[j, :],
                missing_state_indicator,
                weights,
            )
            if similarity > self.threshold:
                edge_weight_buckets[similarity].append((i, j))
                G.add_edge(i, j)

        if len(G.edges) == 0:
            return samples, []

        connected_components = list(nx.connected_components(G))
        sorted_edge_weights = sorted(edge_weight_buckets, reverse=True)

        # Percolate the similarity graph by continuously removing the
        # minimum-weight edges until at least two connected components exist
        while len(connected_components) <= 1:
            min_weight = sorted_edge_weights.pop()
            for edge in edge_weight_buckets[min_weight]:
                G.remove_edge(edge[0], edge[1])
            connected_components = list(nx.connected_components(G))

        # If the number of connected components > 2, merge components by
        # joining the most similar LCAs of each component until
        # only 2 remain
        partition_sides = []

        if len(connected_components) > 2:
            for c in range(len(connected_components)):
                connected_components[c] = list(connected_components[c])
            lcas = {}
            component_to_nodes = {}
            # Find the LCA of the nodes in each connected component
            for ind in range(len(connected_components)):
                component_identifier = "component" + str(ind)
                component_to_nodes[
                    component_identifier] = connected_components[ind]
                character_vectors = [
                    list(i) for i in list(unique_character_array[
                        connected_components[ind], :])
                ]
                lcas[component_identifier] = data_utilities.get_lca_characters(
                    character_vectors, missing_state_indicator)
            # Build a tree on the LCA characters to cluster the components
            lca_tree = CassiopeiaTree(
                pd.DataFrame.from_dict(lcas, orient="index"),
                missing_state_indicator=missing_state_indicator,
                priors=priors,
            )

            self.joining_solver.solve(lca_tree,
                                      collapse_mutationless_edges=False)
            grouped_components = []

            # Take the split at the root as the clusters of components
            # in the split, ignoring unifurcations
            current_node = lca_tree.root
            while len(grouped_components) == 0:
                successors = lca_tree.children(current_node)
                if len(successors) == 1:
                    current_node = successors[0]
                else:
                    for i in successors:
                        grouped_components.append(
                            lca_tree.leaves_in_subtree(i))

            # For each component in each cluster, take the nodes in that
            # component to form the final split
            for cluster in grouped_components:
                sample_index_group = []
                for component in cluster:
                    sample_index_group.extend(component_to_nodes[component])
                partition_sides.append(sample_index_group)
        else:
            for c in range(len(connected_components)):
                partition_sides.append(list(connected_components[c]))

        # Convert from component indices back to the sample names in the
        # original character matrix
        sample_names = list(character_matrix.index)
        partition_named = []
        for sample_index_group in partition_sides:
            sample_name_group = []
            for sample_index in sample_index_group:
                sample_name_group.append(sample_names[sample_index])
            partition_named.append(sample_name_group)

        return partition_named
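
# A minimal example (hypothetical, not Cassiopeia's packaged similarity
# functions) of a similarity function with the signature `percolate` expects:
# it counts shared, non-missing mutated states between two character vectors,
# optionally weighting each (character, state) pair.
def shared_mutation_similarity(
    s1: np.ndarray,
    s2: np.ndarray,
    missing_state_indicator: int = -1,
    weights: Optional[Dict[int, Dict[int, float]]] = None,
) -> float:
    similarity = 0.0
    for char_index, (state1, state2) in enumerate(zip(s1, s2)):
        # Skip unmutated (0) and missing entries; count shared mutations.
        if state1 == state2 and state1 not in (0, missing_state_indicator):
            similarity += weights[char_index][state1] if weights else 1.0
    return similarity

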
def sample_triplet_at_depth(
    tree: CassiopeiaTree,
    depth: int,
    depth_to_nodes: Optional[Dict[int, List[str]]] = None,
) -> Tuple[Tuple[str, str, str], str]:
    """Samples a triplet at a given depth.

    Samples a triplet of leaves such that the depth of the LCA of the triplet
    is at the specified depth.

    Args:
        tree: CassiopeiaTree
        depth: Depth at which to sample the triplet
        depth_to_nodes: An optional dictionary that maps a depth to the nodes
            that appear at that depth. This speeds up the function considerably.

    Returns:
        A tuple of three leaves corresponding to the sampled triplet, and the
            name of the outgroup of the triplet ("None" if the three leaves
            come from three distinct daughter clades).
    """

    if depth_to_nodes is None:
        candidate_nodes = tree.filter_nodes(
            lambda x: tree.get_attribute(x, "depth") == depth)
    else:
        candidate_nodes = depth_to_nodes[depth]

    total_triplets = sum(
        [tree.get_attribute(v, "number_of_triplets") for v in candidate_nodes])

    # sample a node from this depth with probability proportional to the
    # number of triplets underneath it
    probs = [
        tree.get_attribute(v, "number_of_triplets") / total_triplets
        for v in candidate_nodes
    ]
    node = np.random.choice(candidate_nodes, size=1, replace=False, p=probs)[0]

    # Generate the probabilities of sampling each combination of 3 daughter
    # clades, proportional to the number of triplets spanning each
    # combination. Choices include all ways to choose 3 different daughter
    # clades, or 2 leaves from one daughter clade and 1 from another
    probs = []
    combos = []
    denom = 0
    for (i, j, k) in itertools.combinations_with_replacement(
            list(tree.children(node)), 3):

        if i == j and j == k:
            continue

        combos.append((i, j, k))

        size_of_i = len(tree.leaves_in_subtree(i))
        size_of_j = len(tree.leaves_in_subtree(j))
        size_of_k = len(tree.leaves_in_subtree(k))

        val = 0
        if i == j:
            val = nCr(size_of_i, 2) * size_of_k
        elif j == k:
            val = nCr(size_of_j, 2) * size_of_i
        elif i == k:
            val = nCr(size_of_k, 2) * size_of_j
        else:
            val = size_of_i * size_of_j * size_of_k
        probs.append(val)
        denom += val

    probs = [val / denom for val in probs]

    # choose daughter clades
    ind = np.random.choice(range(len(combos)), size=1, replace=False,
                           p=probs)[0]
    (i, j, k) = combos[ind]

    if i == j:
        in_group = np.random.choice(tree.leaves_in_subtree(i),
                                    2,
                                    replace=False)
        out_group = np.random.choice(tree.leaves_in_subtree(k))
    elif j == k:
        in_group = np.random.choice(tree.leaves_in_subtree(j),
                                    2,
                                    replace=False)
        out_group = np.random.choice(tree.leaves_in_subtree(i))
    elif i == k:
        in_group = np.random.choice(tree.leaves_in_subtree(k),
                                    2,
                                    replace=False)
        out_group = np.random.choice(tree.leaves_in_subtree(j))
    else:

        return (
            (
                str(np.random.choice(tree.leaves_in_subtree(i))),
                str(np.random.choice(tree.leaves_in_subtree(j))),
                str(np.random.choice(tree.leaves_in_subtree(k))),
            ),
            "None",
        )

    return (str(in_group[0]), str(in_group[1]), str(out_group)), str(out_group)
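
# End-to-end usage sketch (assumes CassiopeiaTree accepts a networkx DiGraph
# topology via its `tree` keyword): annotate_tree_depths must run first so
# that the "depth" and "number_of_triplets" attributes exist before calling
# sample_triplet_at_depth.
_topology = nx.DiGraph()
_topology.add_edges_from([
    ("root", "A"), ("root", "B"),
    ("A", "a1"), ("A", "a2"), ("A", "a3"),
    ("B", "b1"), ("B", "b2"),
])
_toy_tree = CassiopeiaTree(tree=_topology)

_depth_to_nodes = annotate_tree_depths(_toy_tree)
# Sample a triplet whose LCA is at depth 0, i.e. the root.
_triplet, _outgroup = sample_triplet_at_depth(
    _toy_tree, depth=0, depth_to_nodes=_depth_to_nodes
)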