Example #1
0
    def test_evolutionary_coupling_custom_dissimilarity_map(self):
        """Evolutionary coupling computed from a user-supplied dissimilarity map.

        The inter-cluster distances obtained with the same map are Z-scored
        by hand against stored background statistics and compared with the
        output of ``cas.tl.compute_evolutionary_coupling``.
        """
        labels = ["D", "F", "G", "H", "I", "J"]
        pairwise_dissimilarities = {
            "D": [0.0, 0.5, 1.2, 0.4, 0.5, 0.6],
            "F": [0.5, 0.0, 3.0, 1.1, 3.0, 0.1],
            "G": [1.2, 3.0, 0.0, 0.8, 0.2, 0.8],
            "H": [0.4, 1.1, 0.8, 0.0, 2.0, 2.1],
            "I": [0.5, 3.0, 0.2, 2.0, 0.0, 0.1],
            "J": [0.6, 0.1, 1.8, 2.1, 0.1, 0.0],
        }
        weight_matrix = pd.DataFrame.from_dict(
            pairwise_dissimilarities,
            orient="index",
            columns=labels,
        )

        rng = np.random.RandomState(1231234)
        observed_coupling = cas.tl.compute_evolutionary_coupling(
            self.tree,
            meta_variable="CellType",
            random_state=rng,
            minimum_proportion=0.0,
            number_of_shuffles=10,
            dissimilarity_map=weight_matrix,
        )

        cluster_distances = data_utilities.compute_inter_cluster_distances(
            self.tree,
            meta_item="CellType",
            dissimilarity_map=weight_matrix,
        )

        # Background obtained with the seed above and 10 shuffles, stored as
        # (state1, state2) -> (mean, sd).
        background_stats = {
            ("TypeB", "TypeB"): (0.695, 0.5456418239101545),
            ("TypeB", "TypeA"): (1.0000000000000002, 0.281291663580704),
            ("TypeB", "TypeC"): (1.0925, 0.44763964301656745),
            ("TypeA", "TypeB"): (1.0000000000000002, 0.3148412298286232),
            ("TypeA", "TypeA"): (0.63, 0.4550824101193101),
            ("TypeA", "TypeC"): (1.2349999999999999, 0.391503512117069),
            ("TypeC", "TypeB"): (1.0675000000000001, 0.4493119740225047),
            ("TypeC", "TypeA"): (1.26, 0.41791147387933725),
            ("TypeC", "TypeC"): (0.4699999999999999, 0.41424630354415953),
        }

        # Z-score every observed distance against its stored background.
        expected_coupling = cluster_distances.copy()
        for row_state in expected_coupling.index:
            for col_state in expected_coupling.columns:
                mean, sd = background_stats[(row_state, col_state)]
                observed = cluster_distances.loc[row_state, col_state]
                expected_coupling.loc[row_state,
                                      col_state] = (observed - mean) / sd

        pd.testing.assert_frame_equal(
            expected_coupling,
            observed_coupling,
            atol=0.001,
        )
Example #2
0
    def test_inter_cluster_distance_custom_input(self):
        """Inter-cluster distances from explicit meta data and dissimilarities.

        The tree is built without cell meta; the cluster labels and the
        pairwise dissimilarities are both handed to
        ``compute_inter_cluster_distances`` directly.
        """
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        branches = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        for parent, child, branch_length in branches:
            topology.add_edge(parent, child, length=branch_length)

        cell_annotations = {
            "A": ["TypeA", 10],
            "B": ["TypeA", 5],
            "C": ["TypeB", 3],
            "D": ["TypeB", 22],
        }
        meta_data = pd.DataFrame.from_dict(
            cell_annotations,
            orient="index",
            columns=["CellType", "nUMI"],
        )

        pairwise_dissimilarities = {
            "A": [0.0, 0.5, 1.2, 0.4],
            "B": [0.5, 0.0, 3.0, 1.1],
            "C": [1.2, 3.0, 0.0, 0.8],
            "D": [0.4, 1.1, 0.8, 0.0],
        }
        weight_matrix = pd.DataFrame.from_dict(
            pairwise_dissimilarities,
            orient="index",
            columns=["A", "B", "C", "D"],
        )

        cas_tree = CassiopeiaTree(tree=topology)

        observed_distances = data_utilities.compute_inter_cluster_distances(
            cas_tree,
            meta_data=meta_data["CellType"],
            dissimilarity_map=weight_matrix,
        )

        cluster_names = ["TypeA", "TypeB"]
        expected_distances = pd.DataFrame.from_dict(
            {
                "TypeA": [0.25, 1.425],
                "TypeB": [1.425, 0.4],
            },
            orient="index",
            columns=cluster_names,
        )

        pd.testing.assert_frame_equal(
            expected_distances,
            observed_distances,
            check_exact=False,
            atol=0.001,
        )
Example #3
0
    def test_evolutionary_coupling_basic(self):
        """Evolutionary coupling with default distances, plus an error case.

        The observed inter-cluster distances are Z-scored by hand against
        stored background statistics and compared with the function output;
        the "nUMI" column is then expected to raise a ``CassiopeiaError``.
        """
        rng = np.random.RandomState(1231234)
        observed_coupling = cas.tl.compute_evolutionary_coupling(
            self.tree,
            meta_variable="CellType",
            random_state=rng,
            minimum_proportion=0.0,
            number_of_shuffles=10,
        )

        cluster_distances = data_utilities.compute_inter_cluster_distances(
            self.tree,
            meta_item="CellType",
        )

        # Background obtained with the seed above and 10 shuffles, stored as
        # (state1, state2) -> (mean, sd).
        background_stats = {
            ("TypeA", "TypeA"): (1.7, 0.6000000000000001),
            ("TypeA", "TypeB"): (3.55, 0.4716990566028302),
            ("TypeA", "TypeC"): (3.55, 0.4716990566028302),
            ("TypeB", "TypeA"): (3.55, 0.4716990566028302),
            ("TypeB", "TypeB"): (2.0, 0.5),
            ("TypeB", "TypeC"): (3.65, 0.45),
            ("TypeC", "TypeA"): (3.55, 0.4716990566028302),
            ("TypeC", "TypeB"): (3.65, 0.45),
            ("TypeC", "TypeC"): (1.8, 0.5567764362830022),
        }

        # Z-score every observed distance against its stored background.
        expected_coupling = cluster_distances.copy()
        for row_state in expected_coupling.index:
            for col_state in expected_coupling.columns:
                mean, sd = background_stats[(row_state, col_state)]
                observed = cluster_distances.loc[row_state, col_state]
                expected_coupling.loc[row_state,
                                      col_state] = (observed - mean) / sd

        pd.testing.assert_frame_equal(
            expected_coupling,
            observed_coupling,
            atol=0.001,
        )

        # A numerical meta variable must be rejected.
        with self.assertRaises(CassiopeiaError):
            cas.tl.compute_evolutionary_coupling(self.tree, "nUMI")
Example #4
0
    def test_evolutionary_coupling_minimum_proportion(self):
        """Categories below ``minimum_proportion`` are excluded from the result.

        Cell "J" is relabelled "TypeD" before the coupling is computed; only
        "TypeA" and "TypeB" are then expected in the returned matrix, whose
        values are checked against hand-computed Z-scores.
        """
        self.tree.cell_meta.loc["J", "CellType"] = "TypeD"

        rng = np.random.RandomState(1231234)
        observed_coupling = cas.tl.compute_evolutionary_coupling(
            self.tree,
            meta_variable="CellType",
            random_state=rng,
            minimum_proportion=1 / 6,  # This will drop types C and D
            number_of_shuffles=10,
        )

        # Neither TypeC nor TypeD may appear on either axis.
        retained_types = ["TypeA", "TypeB"]
        self.assertCountEqual(retained_types, observed_coupling.index)
        self.assertCountEqual(retained_types, observed_coupling.columns)

        # Verify the coupling values themselves on the retained categories.
        all_distances = data_utilities.compute_inter_cluster_distances(
            self.tree,
            meta_item="CellType",
        )
        cluster_distances = all_distances.loc[retained_types, retained_types]

        # (state1, state2) -> (mean, sd) of the background.
        background_stats = {
            ("TypeB", "TypeB"): (1.4, 0.19999999999999998),
            ("TypeB", "TypeA"): (2.6, 0.19999999999999998),
            ("TypeA", "TypeB"): (2.6, 0.19999999999999998),
            ("TypeA", "TypeA"): (1.4, 0.19999999999999998),
        }

        expected_coupling = cluster_distances.copy()
        for row_state in expected_coupling.index:
            for col_state in expected_coupling.columns:
                mean, sd = background_stats[(row_state, col_state)]
                observed = cluster_distances.loc[row_state, col_state]
                expected_coupling.loc[row_state,
                                      col_state] = (observed - mean) / sd

        # Align row/column order with the expectation before comparing.
        observed_coupling = observed_coupling.loc[retained_types,
                                                  retained_types]
        pd.testing.assert_frame_equal(
            expected_coupling,
            observed_coupling,
            atol=0.001,
        )
Example #5
0
    def test_inter_cluster_distance_basic(self):
        """Inter-cluster distances read from the tree's own cell meta.

        A small tree with edge lengths and a "CellType" annotation is
        built, the resulting distance table is compared with fixed values,
        and the "nUMI" column is expected to raise a ``CassiopeiaError``.
        """
        topology = nx.DiGraph()
        topology.add_nodes_from(["A", "B", "C", "D", "E", "F"])
        branches = [
            ("F", "A", 0.1),
            ("F", "B", 0.2),
            ("F", "E", 0.5),
            ("E", "C", 0.3),
            ("E", "D", 0.4),
        ]
        for parent, child, branch_length in branches:
            topology.add_edge(parent, child, length=branch_length)

        cell_annotations = {
            "A": ["TypeA", 10],
            "B": ["TypeA", 5],
            "C": ["TypeB", 3],
            "D": ["TypeB", 22],
        }
        meta_data = pd.DataFrame.from_dict(
            cell_annotations,
            orient="index",
            columns=["CellType", "nUMI"],
        )

        cas_tree = CassiopeiaTree(tree=topology, cell_meta=meta_data)

        observed_distances = data_utilities.compute_inter_cluster_distances(
            cas_tree,
            meta_item="CellType",
        )

        expected_distances = pd.DataFrame.from_dict(
            {
                "TypeA": [0.15, 1.0],
                "TypeB": [1.0, 0.35],
            },
            orient="index",
            columns=["TypeA", "TypeB"],
        )

        pd.testing.assert_frame_equal(expected_distances, observed_distances)

        # A numerical meta item must be rejected.
        with self.assertRaises(CassiopeiaError):
            data_utilities.compute_inter_cluster_distances(cas_tree, "nUMI")
Example #6
0
def compute_evolutionary_coupling(
    tree: CassiopeiaTree,
    meta_variable: str,
    minimum_proportion: float = 0.05,
    number_of_shuffles: int = 500,
    random_state: Optional[np.random.RandomState] = None,
    dissimilarity_map: Optional[pd.DataFrame] = None,
    cluster_comparison_function: Callable = data_utilities.
    net_relatedness_index,
    **comparison_kwargs,
) -> pd.DataFrame:
    """Computes Evolutionary Coupling of categorical variables.

    Using the methodology described in Yang, Jones et al, BioRxiv (2021), this
    function will compute the "evolutionary coupling" statistic between values
    that a categorical variable can take on with the tree. For example, this
    categorical variable can be a "cell type", and this function will compute
    the evolutionary couplings between all types of cell types. This indicates
    how closely related these cell types are to one another.

    Briefly, this statistic is the Z-normalized mean distance between categories
    in the specified categorical variable: the observed inter-cluster distance
    minus the mean of the shuffled background, divided by the background's
    standard deviation. Note that empirical nulls that have a standard
    deviation of 0 lead to non-finite entries (NaN, or +/-inf when the observed
    distance differs from the background mean) in the resulting evolutionary
    coupling matrix.

    The computational complexity of this function is
    O(n^2 log n + (B+1)(K^2 * O(distance_function))) for a tree with n leaves,
    a variable with K categories, and B random shuffles.

    Args:
        tree: CassiopeiaTree
        meta_variable: Column in `tree.cell_meta` that stores a categorical
            variable with K categories.
        minimum_proportion: Minimum proportion of cells that a category needs
            to appear in to be considered. A category is kept only when its
            cell count is strictly greater than
            `int(len(tree.leaves) * minimum_proportion)`. Values <= 0 disable
            the filter.
        number_of_shuffles: Number of times to shuffle the data to compute the
            empirical Z score.
        random_state: Numpy random state to parameterize the shuffling. If
            None, `np.random.permutation` (numpy's global state) is used.
        dissimilarity_map: A precomputed dissimilarity map between all leaves.
            If None, the phylogenetic weight matrix of `tree` is computed and
            used instead.
        cluster_comparison_function: A function for comparing the mean distance
            between groups. By default, this is the Net Relatedness Index.
        **comparison_kwargs: Extra arguments to pass to the cluster comparison
            function.

    Returns:
        A K x K evolutionary coupling dataframe.
    """

    W = (data_utilities.compute_phylogenetic_weight_matrix(tree)
         if dissimilarity_map is None else dissimilarity_map)

    meta_data = tree.cell_meta[meta_variable]

    # subset meta data by minimum proportion
    if minimum_proportion > 0:
        filter_threshold = int(len(tree.leaves) * minimum_proportion)
        category_frequencies = meta_data.value_counts()
        passing_categories = category_frequencies[
            category_frequencies > filter_threshold].index.values
        meta_data = meta_data[meta_data.isin(passing_categories)]
        # keep the dissimilarity map aligned with the retained cells
        W = W.loc[meta_data.index.values, meta_data.index.values]

    # compute inter-cluster distances
    inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
        tree,
        meta_data=meta_data,
        dissimilarity_map=W,
        distance_function=cluster_comparison_function,
        **comparison_kwargs,
    )

    # Resolve the source of randomness once, with an explicit None check
    # rather than relying on the truthiness of the random state object.
    permute = (np.random.permutation
               if random_state is None else random_state.permutation)

    # compute background for Z-scoring: each shuffle reassigns the category
    # labels to a random permutation of the cells and recomputes distances
    background = defaultdict(list)
    for _ in tqdm(range(number_of_shuffles),
                  desc="Creating empirical background"):
        permuted_assignments = meta_data.copy()
        permuted_assignments.index = permute(meta_data.index.values)
        background_distances = data_utilities.compute_inter_cluster_distances(
            tree,
            meta_data=permuted_assignments,
            dissimilarity_map=W,
            distance_function=cluster_comparison_function,
            **comparison_kwargs,
        )
        for s1 in background_distances.index:
            for s2 in background_distances.columns:
                background[(s1, s2)].append(background_distances.loc[s1, s2])

    # Z-normalize every observed distance against its empirical background
    Z_scores = inter_cluster_distances.copy()
    for s1 in Z_scores.index:
        for s2 in Z_scores.columns:
            null_distances = background[(s1, s2)]
            mean = np.mean(null_distances)
            sd = np.std(null_distances)

            Z_scores.loc[s1, s2] = (inter_cluster_distances.loc[s1, s2] -
                                    mean) / sd

    return Z_scores