Ejemplo n.º 1
0
    def get_dissimilarity_map(self,
                              cassiopeia_tree: CassiopeiaTree,
                              layer: Optional[str] = None) -> pd.DataFrame:
        """Prepares the tree and hands back its sample-to-sample map.

        This is the top-level accessor for the matrix that the solve method
        works from: the pairwise comparison between samples that is consulted
        to pick which samples to merge and that is revised at each iteration
        of the solve. Preparation is delegated to `setup_dissimilarity_map`,
        after which the map stored on the tree is read back.

        Derived classes are expected to override this method when their
        algorithm works on a similarity map or on some other
        algorithm-specific comparison between samples, so the result is not
        restricted to being a dissimilarity map.

        Args:
            cassiopeia_tree: Tree object that the map is prepared on and
                read from.
            layer: Layer holding the character matrix to solve with. When
                None, the default character matrix of the CassiopeiaTree is
                used.

        Returns:
            pd.DataFrame: The matrix used for the rest of the solve method.
        """
        # Ensure the tree carries everything the solver needs before reading.
        self.setup_dissimilarity_map(cassiopeia_tree, layer)

        return cassiopeia_tree.get_dissimilarity_map()
Ejemplo n.º 2
0
    def setup_root_finder(self, cassiopeia_tree: CassiopeiaTree) -> None:
        """Gives the implicit rooting strategy for the SNJ Solver.

        By default, the SpectralNeighborJoining algorithm returns an
        unrooted tree. To root this tree, an implicit root of all zeros is
        added to a copy of the character matrix, and the tree's
        `root_sample_name` is set to "root". The dissimilarity map is then
        brought up to date: if the tree has no dissimilarity map yet, the
        whole map is computed from the rooted character matrix; otherwise
        only the dissimilarities between the new root and every existing
        sample are calculated and stored. See 'setup_root_finder' in
        NeighborJoiningSolver.

        The tree's character matrix is only swapped for the rooted version
        temporarily; the original (un-rooted) matrix is put back before this
        method returns, including when the computation fails. On failure the
        previous `root_sample_name` is restored as well.

        Args:
            cassiopeia_tree: Input CassiopeiaTree to `solve`

        Raises:
            DistanceSolver.DistanceSolverError: If the solver has no
                dissimilarity function, in which case an implicit root
                cannot be added. The tree is not modified in this case.
        """
        # Validate before touching the tree so a misconfigured solver does
        # not leave the tree with a root name and a rooted character matrix.
        if self.dissimilarity_function is None:
            raise DistanceSolver.DistanceSolverError(
                "Please specify a dissimilarity function to add an implicit "
                "root, or specify an explicit root"
            )

        character_matrix = cassiopeia_tree.character_matrix.copy()
        rooted_character_matrix = character_matrix.copy()

        # The implicit root is a sample with state 0 at every character.
        root = [0] * rooted_character_matrix.shape[1]
        rooted_character_matrix.loc["root"] = root

        previous_root_name = cassiopeia_tree.root_sample_name
        cassiopeia_tree.root_sample_name = "root"
        cassiopeia_tree.character_matrix = rooted_character_matrix

        try:
            if cassiopeia_tree.get_dissimilarity_map() is None:
                cassiopeia_tree.compute_dissimilarity_map(
                    self.dissimilarity_function, self.prior_transformation
                )
            else:
                # Neither the transformed priors nor the root's states depend
                # on the leaf, so they are computed once rather than per leaf.
                weights = None
                if cassiopeia_tree.priors:
                    weights = solver_utilities.transform_priors(
                        cassiopeia_tree.priors, self.prior_transformation
                    )
                root_states = rooted_character_matrix.loc["root"].values
                missing_state = cassiopeia_tree.missing_state_indicator

                dissimilarity = {"root": 0}
                for leaf in character_matrix.index:
                    dissimilarity[leaf] = self.dissimilarity_function(
                        root_states,
                        rooted_character_matrix.loc[leaf].values,
                        missing_state,
                        weights,
                    )
                cassiopeia_tree.set_dissimilarity("root", dissimilarity)
        except Exception:
            # No root entry was recorded, so do not advertise one.
            cassiopeia_tree.root_sample_name = previous_root_name
            raise
        finally:
            # Always hand the tree back with its original character matrix.
            cassiopeia_tree.character_matrix = character_matrix
Ejemplo n.º 3
0
    def setup_dissimilarity_map(self,
                                cassiopeia_tree: CassiopeiaTree,
                                layer: Optional[str] = None) -> None:
        """Prepares the input tree so that the solver can run on it.

        Two things are ensured, both by modifying the CassiopeiaTree in
        place: the tree has a root sample (an implicit one is added through
        `setup_root_finder` when the solver was asked to add a root), and the
        tree has a dissimilarity map (computed with the solver's
        dissimilarity function when none is present).

        Args:
            cassiopeia_tree: Input CassiopeiaTree to `solve`.
            layer: Layer holding the character matrix to solve with. When
                None, the default character matrix of the CassiopeiaTree is
                used.

        Raises:
            A `DistanceSolverError` if the tree has no root sample and the
                solver is not configured to add one, or if the tree has no
                dissimilarity map and the solver has no dissimilarity
                function to compute one with.
        """
        # Without an explicit root sample the solver must supply an implicit
        # one; that step also takes care of the root's dissimilarities.
        root_missing = cassiopeia_tree.root_sample_name is None
        if root_missing and not self.add_root:
            raise DistanceSolverError(
                "Please specify an explicit root sample in the Cassiopeia Tree"
                " or specify the solver to add an implicit root")
        if root_missing:
            self.setup_root_finder(cassiopeia_tree)

        # Nothing more to do when the tree already carries a map.
        if cassiopeia_tree.get_dissimilarity_map() is not None:
            return

        if self.dissimilarity_function is None:
            raise DistanceSolverError(
                "Please specify a dissimilarity function or populate the "
                "CassiopeiaTree object with a dissimilarity map")

        cassiopeia_tree.compute_dissimilarity_map(
            self.dissimilarity_function, self.prior_transformation, layer)
Ejemplo n.º 4
0
def compute_cophenetic_correlation(
    tree: CassiopeiaTree,
    weights: Optional[pd.DataFrame] = None,
    dissimilarity_map: Optional[pd.DataFrame] = None,
    dissimilarity_function: Optional[
        Callable[[np.array, np.array, int, Dict[int, Dict[int, float]]], float]
    ] = dissimilarity_functions.weighted_hamming_distance,
) -> Tuple[float, float]:
    """Computes the cophenetic correlation of a lineage.

    The cophenetic correlation is the Pearson correlation between the
    phylogenetic distance separating two leaves and the dissimilarity
    between their characters, taken over all pairs of leaves.

    When neither the weight matrix nor the dissimilarity map is supplied,
    the documented cost is O(mn^2 + n^2logn + n^2): O(mn^2) for the
    dissimilarity map, O(n^2 logn) for the phylogenetic distances, and
    O(n^2) for the Pearson correlation over the n^2 entries (n = number of
    leaves; m = number of characters).

    Args:
        tree: CassiopeiaTree
        weights: Phylogenetic weights matrix. When omitted, it is obtained
            from `compute_phylogenetic_weight_matrix`.
        dissimilarity_map: Dissimilarity matrix between samples. When
            omitted, the map stored on the tree is used, and if the tree has
            none, `tree.compute_dissimilarity_map` is called first.
        dissimilarity_function: Dissimilarity function handed to
            `tree.compute_dissimilarity_map`. Only used when no map is
            passed in and the tree does not already hold one.

    Returns:
        The cophenetic correlation value and significance for the tree.
    """
    # Phylogenetic distances: caller-supplied, otherwise derived from the tree.
    phylogenetic_distances = weights
    if phylogenetic_distances is None:
        phylogenetic_distances = compute_phylogenetic_weight_matrix(tree)

    # Character dissimilarities: caller-supplied, then the tree's stored map,
    # and only as a last resort a freshly computed one.
    character_dissimilarities = dissimilarity_map
    if character_dissimilarities is None:
        character_dissimilarities = tree.get_dissimilarity_map()
    if character_dissimilarities is None:
        tree.compute_dissimilarity_map(
            dissimilarity_function=dissimilarity_function
        )
        character_dissimilarities = tree.get_dissimilarity_map()

    # Put both matrices in the same leaf order so their entries line up.
    leaf_order = tree.leaves
    aligned = [
        matrix.loc[leaf_order, leaf_order]
        for matrix in (phylogenetic_distances, character_dissimilarities)
    ]

    # Pearson correlation is taken over the condensed (one entry per pair)
    # form of each square matrix.
    condensed_phylogenetic, condensed_dissimilarity = map(
        spatial.distance.squareform, aligned
    )

    return stats.pearsonr(condensed_phylogenetic, condensed_dissimilarity)