def fitch_hartigan_top_down(
    cassiopeia_tree: CassiopeiaTree,
    root: Optional[str] = None,
    state_key: str = "S1",
    label_key: str = "label",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Run Fitch-Hartigan top-down refinement

    Runs the Fitch-Hartigan top-down algorithm which selects an optimal solution
    from the tree rooted at the specified root.

    Args:
        cassiopeia_tree: CassiopeiaTree that has been processed with the
            Fitch-Hartigan bottom-up algorithm.
        root: Root from which to begin this refinement. Only the subtree below
            this node will be considered.
        state_key: Attribute key that stores the Fitch-Hartigan ancestral
            states.
        label_key: Key to add that stores the maximum-parsimony assignment
            inferred from the Fitch-Hartigan top-down refinement.
        copy: Modify the tree in place or not.

    Returns:
        A new CassiopeiaTree if the copy is set to True, else None.

    Raises:
        A CassiopeiaTreeError if Fitch-Hartigan bottom-up has not been called
        or if the state_key does not exist for a node.
    """

    # assign root
    root = cassiopeia_tree.root if (root is None) else root

    cassiopeia_tree = cassiopeia_tree.copy() if copy else cassiopeia_tree

    for node in cassiopeia_tree.depth_first_traverse_nodes(source=root,
                                                           postorder=False):

        if node == root:
            root_states = cassiopeia_tree.get_attribute(root, state_key)
            cassiopeia_tree.set_attribute(root, label_key,
                                          np.random.choice(root_states))
            continue

        parent = cassiopeia_tree.parent(node)
        parent_label = cassiopeia_tree.get_attribute(parent, label_key)
        optimal_node_states = cassiopeia_tree.get_attribute(node, state_key)

        if parent_label in optimal_node_states:
            cassiopeia_tree.set_attribute(node, label_key, parent_label)

        else:
            cassiopeia_tree.set_attribute(
                node, label_key, np.random.choice(optimal_node_states))

    return cassiopeia_tree if copy else None
def score_small_parsimony(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    infer_ancestral_states: bool = True,
    label_key: Optional[str] = "label",
) -> int:
    """Computes the small-parsimony of the tree.

    Using the meta data stored in the specified cell meta column, compute the
    parsimony score of the tree.

    Args:
        cassiopeia_tree: CassiopeiaTree object with cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Node to treat as the root. Only the subtree below
            this node will be considered.
        infer_ancestral_states: Whether or not ancestral states must be inferred
            (this will be False if `fitch_hartigan` has already been called on
            the tree.)
        label_key: If ancestral states have already been inferred, this key
            indicates the name of the attribute they're stored in.

    Returns:
        The parsimony score.

    Raises:
        CassiopeiaError if label_key has not been populated.
    """

    cassiopeia_tree = cassiopeia_tree.copy()

    if infer_ancestral_states:
        fitch_hartigan(cassiopeia_tree, meta_item, root, label_key=label_key)

    parsimony = 0
    for (parent,
         child) in cassiopeia_tree.depth_first_traverse_edges(source=root):

        try:
            if cassiopeia_tree.get_attribute(
                    parent, label_key) != cassiopeia_tree.get_attribute(
                        child, label_key):
                parsimony += 1
        except CassiopeiaTreeError:
            raise CassiopeiaError(f"{label_key} does not exist for a node, "
                                  "try running Fitch-Hartigan or passing "
                                  "infer_ancestral_states=True.")
    return parsimony
def fitch_hartigan(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    state_key: str = "S1",
    label_key: str = "label",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Run the Fitch-Hartigan algorithm.
    
    Performs the full Fitch-Hartigan small parsimony algorithm which, given
    a set of states for the leaves, infers the most-parsimonious set of states
    and returns a random solution that satisfies the maximum-parsimony
    criterion. The solution will be stored in the label key specified by the
    user (by default 'label'). This function will modify the tree in place
    if `copy=False`.

    Args:
        cassiopeia_tree: CassiopeiaTree that has been processed with the
            Fitch-Hartigan bottom-up algorithm.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Root from which to begin this refinement. Only the subtree below
            this node will be considered.
        state_key: Attribute key that stores the Fitch-Hartigan ancestral
            states.
        label_key: Key to add that stores the maximum-parsimony assignment
            inferred from the Fitch-Hartigan top-down refinement.
        copy: Modify the tree in place or not.
    
    Returns:
        A new CassiopeiaTree if the copy is set to True, else None.
    """

    cassiopeia_tree = cassiopeia_tree.copy() if copy else cassiopeia_tree

    fitch_hartigan_bottom_up(cassiopeia_tree, meta_item, state_key)

    fitch_hartigan_top_down(cassiopeia_tree, root, state_key, label_key)

    return cassiopeia_tree if copy else None
def fitch_hartigan_bottom_up(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    add_key: str = "S1",
    copy: bool = False,
) -> Optional[CassiopeiaTree]:
    """Performs Fitch-Hartigan bottom-up ancestral reconstruction.

    Performs the bottom-up phase of the Fitch-Hartigan small parsimony
    algorithm. A new attribute called "S1" will be added to each node
    storing the optimal set of ancestral states inferred from this bottom-up 
    algorithm. If copy is False, the tree will be modified in place.
     

    Args:
        cassiopeia_tree: CassiopeiaTree object with cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        add_key: Key to add for bottom-up reconstruction
        copy: Modify the tree in place or not.

    Returns:
        A new CassiopeiaTree if the copy is set to True, else None.

    Raises:
        CassiopeiaError if the tree does not have the specified meta data
            or the meta data is not categorical.
    """

    if meta_item not in cassiopeia_tree.cell_meta.columns:
        raise CassiopeiaError(
            "Meta item does not exist in the cassiopeia tree")

    meta = cassiopeia_tree.cell_meta[meta_item]

    if is_numeric_dtype(meta):
        raise CassiopeiaError("Meta item is not a categorical variable.")

    if not is_categorical_dtype(meta):
        meta = meta.astype("category")

    cassiopeia_tree = cassiopeia_tree.copy() if copy else cassiopeia_tree

    for node in cassiopeia_tree.depth_first_traverse_nodes():

        if cassiopeia_tree.is_leaf(node):
            cassiopeia_tree.set_attribute(node, add_key, [meta.loc[node]])

        else:
            children = cassiopeia_tree.children(node)
            if len(children) == 1:
                child_assignment = cassiopeia_tree.get_attribute(
                    children[0], add_key)
                cassiopeia_tree.set_attribute(node, add_key,
                                              [child_assignment])

            all_labels = np.concatenate([
                cassiopeia_tree.get_attribute(child, add_key)
                for child in children
            ])
            states, frequencies = np.unique(all_labels, return_counts=True)

            S1 = states[np.where(frequencies == np.max(frequencies))]
            cassiopeia_tree.set_attribute(node, add_key, S1)

    return cassiopeia_tree if copy else None
def fitch_count(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    infer_ancestral_states: bool = True,
    state_key: str = "S1",
    unique_states: Optional[List[str]] = None,
):
    """Runs the FitchCount algorithm.

    Performs the FitchCount algorithm for inferring the number of times that
    two states transition to one another across all equally-parsimonious
    solutions returned by the Fitch-Hartigan algorithm. The original algorithm
    was described in Quinn, Jones, et al, Science (2021). The output is an 
    MxM count matrix, where the values indicate the number of times that
    m1 transitioned to m2 along an edge in a Fitch-Hartigan solution.
    To obtain probabilities P(m1 -> m2), divide each row by its row-sum.

    This procedure will only work on categorical data and will otherwise raise
    an error.

    Args:
        cassiopeia_tree: CassiopeiaTree object with a tree and cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Node to treat as the root. Only the subtree below this node will
            be considered for the procedure.
        infer_ancestral_states: Whether or not to initialize the ancestral state
            sets with Fitch-Hartigan.
        state_key: If ancestral state sets have already been created, then this
            argument specifies what the attribute name is in the CassiopeiaTree
        unique_states: State space that can be optionally provided by the user.
            If this is not provided, we take the unique values in
            `cell_meta[meta_item]` to be the state space.

    Returns:
        An MxM count matrix indicating the number of edges that contained a
            transition between two states across all equally parsimonious
            solutions returned by Fitch-Hartigan.
    """
    cassiopeia_tree = cassiopeia_tree.copy()

    if unique_states is None:
        unique_states = cassiopeia_tree.cell_meta[meta_item].unique()
    else:
        if (len(
                np.setdiff1d(cassiopeia_tree.cell_meta[meta_item].unique(),
                             unique_states)) > 0):
            raise FitchCountError("Specified state space does not span the set"
                                  " of states that appear in the meta data.")

    if root != cassiopeia_tree.root:
        cassiopeia_tree.subset_clade(root)

    if infer_ancestral_states:
        fitch_hartigan_bottom_up(cassiopeia_tree, meta_item, add_key=state_key)

    # create mapping from nodes to integers
    bfs_postorder = [cassiopeia_tree.root]
    for (_, e1) in cassiopeia_tree.breadth_first_traverse_edges():
        bfs_postorder.append(e1)

    node_to_i = dict(zip(bfs_postorder, range(len(bfs_postorder))))
    label_to_j = dict(zip(unique_states, range(len(unique_states))))

    N = _N_fitch_count(cassiopeia_tree, unique_states, node_to_i, label_to_j,
                       state_key)

    C = _C_fitch_count(cassiopeia_tree, N, unique_states, node_to_i,
                       label_to_j, state_key)

    M = pd.DataFrame(np.zeros((N.shape[1], N.shape[1])))
    M.columns = unique_states
    M.index = unique_states

    # create count matrix
    for s1 in unique_states:
        for s2 in unique_states:
            M.loc[s1, s2] = np.sum(C[node_to_i[cassiopeia_tree.root], :,
                                     label_to_j[s1], label_to_j[s2], ])

    return M
Exemple #6
0
def compute_expansion_pvalues(
    tree: CassiopeiaTree,
    min_clade_size: int = 10,
    min_depth: int = 1,
    copy: bool = False,
) -> Union[CassiopeiaTree, None]:
    """Call expansion pvalues on a tree.

    Uses the methodology described in Yang, Jones et al, BioRxiv (2021) to
    assess the expansion probability of a given subclade of a phylogeny.
    Mathematical treatment of the coalescent probability is described in
    Griffiths and Tavare, Stochastic Models (1998).

    The probability computed corresponds to the probability that, under a simple
    neutral coalescent model, a given subclade contains the observed number of
    cells; in other words, a one-sided p-value. Often, if the probability is
    less than some threshold (e.g., 0.05), this might indicate that there exists
    some subclade under this node that to which this expansion probability can
    be attributed (i.e. the null hypothesis that the subclade is undergoing 
    neutral drift can be rejected).

    This function will add an attribute "expansion_pvalue" to the tree, and
    return None unless :param:`copy` is set to True.

    On a typical balanced tree, this function will perform in O(n log n) time, 
    but can be up to O(n^3) on highly unbalanced trees. A future endeavor may 
    be to impelement the function in O(n) time.

    Args:
        tree: CassiopeiaTree
        min_clade_size: Minimum number of leaves in a subtree to be considered.
        min_depth: Minimum depth of clade to be considered. Depth is measured
            in number of nodes from the root, not branch lengths.
        copy: Return copy.

    Returns:
        If copy is set to False, returns the tree with attributes added
            in place. Else, returns a new CassiopeiaTree.
    """

    tree = tree.copy() if copy else tree

    # instantiate attributes
    _depths = {}
    for node in tree.depth_first_traverse_nodes(postorder=False):
        tree.set_attribute(node, "expansion_pvalue", 1.0)

        if tree.is_root(node):
            _depths[node] = 0
        else:
            _depths[node] = _depths[tree.parent(node)] + 1

    for node in tree.depth_first_traverse_nodes(postorder=False):

        n = len(tree.leaves_in_subtree(node))

        k = len(tree.children(node))
        for c in tree.children(node):

            if len(tree.leaves_in_subtree(c)) < min_clade_size:
                continue

            depth = _depths[c]
            if depth < min_depth:
                continue

            b = len(tree.leaves_in_subtree(c))

            # this value below is a simplification of the quantity:
            # sum[simple_coalescent_probability(n, b2, k) for \
            #   b2 in range(b, n - k + 2)]
            p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)

            tree.set_attribute(c, "expansion_pvalue", p)

    return tree if copy else None