Example #1
0
 def test_hand_solvable_problem_2(self, name, solver):
     """
     Single-branch tree (0->1) whose maximum likelihood fit is known by hand.

     The child has two mutated characters and one unmutated character, i.e.:
         root [state = '000']
         |
         v
         child [state = '011']
     The optimization problem is:
         max_{r * t0} log(exp(-r * t0)) + 2 * log(1 - exp(-r * t0))
     Its solution is r * t0 = ln(3) ~ 1.098, where the objective equals
     -ln(3) + 2 * ln(2 / 3) ~ -1.910.
     (The test expects the depth of the tree to be 1, so r * t0 = r * 1 is
     the mutation rate.)
     """
     # Build the two-node topology and attach the observed character states.
     topology = nx.DiGraph()
     topology.add_nodes_from(["0", "1"])
     topology.add_edge("0", "1")
     tree = CassiopeiaTree(tree=topology)
     tree.set_all_character_states({"0": [0, 0, 0], "1": [0, 1, 1]})

     # Fit branch lengths with the solver under test.
     estimator = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     estimator.estimate_branch_lengths(tree)

     # The only branch should span the whole unit-depth tree.
     self.assertAlmostEqual(
         tree.get_branch_length("0", "1"), 1.0, places=3
     )
     self.assertAlmostEqual(tree.get_time("1"), 1.0, places=3)
     self.assertAlmostEqual(tree.get_time("0"), 0.0, places=3)

     # Hand-derived optimum: rate ln(3), log-likelihood ~ -1.910.
     self.assertAlmostEqual(estimator.mutation_rate, np.log(3), places=3)
     self.assertAlmostEqual(estimator.log_likelihood, -1.910, places=3)
Example #2
0
 def test_on_simulated_data(self, name, solver):
     """
     Run the estimator on data simulated under the model it assumes.

     Character data is overlaid on a seven-node binary tree with known node
     times by a Cas9 simulator (200 single-site cassettes, mutation rate
     1.5, numpy seed fixed to 1). The estimated node times and mutation
     rate should be close to the ground truth.
     """
     topology = nx.DiGraph()
     topology.add_nodes_from(["0", "1", "2", "3", "4", "5", "6"])
     topology.add_edges_from(
         [
             ("0", "1"),
             ("0", "2"),
             ("1", "3"),
             ("1", "4"),
             ("2", "5"),
             ("2", "6"),
         ]
     )
     tree = CassiopeiaTree(tree=topology)
     true_times = {
         "0": 0,
         "1": 0.1,
         "2": 0.9,
         "3": 1.0,
         "4": 1.0,
         "5": 1.0,
         "6": 1.0,
     }
     tree.set_times(true_times)

     # Fix the global numpy seed so the simulated data is reproducible.
     np.random.seed(1)
     simulator = Cas9LineageTracingDataSimulator(
         number_of_cassettes=200,
         size_of_cassette=1,
         mutation_rate=1.5,
     )
     simulator.overlay_data(tree)

     estimator = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
     estimator.estimate_branch_lengths(tree)

     # (node, lower bound, upper bound) for each estimated node time.
     expected_time_ranges = [
         ("1", 0.05, 0.15),
         ("2", 0.8, 1.0),
         ("3", 0.9, 1.1),
         ("4", 0.9, 1.1),
         ("5", 0.9, 1.1),
         ("6", 0.9, 1.1),
     ]
     for node, lower, upper in expected_time_ranges:
         self.assertTrue(lower < tree.get_time(node) < upper)
     self.assertTrue(1.4 < estimator.mutation_rate < 1.6)
     self.assertAlmostEqual(tree.get_time("0"), 0.0, places=3)
    def subsample_leaves(self,
                         tree: CassiopeiaTree,
                         keep_singular_root_edge: bool = True
                         ) -> CassiopeiaTree:
        """Uniformly subsample the leaves of a tree.

        Works on a deep copy of ``tree``: a uniformly random set of leaves
        (drawn without replacement) is removed from the copy together with
        the lineages that only led to them, unifurcations are collapsed, and
        the node times of the surviving nodes are copied over from the
        original tree. The input tree itself is not passed to any mutating
        call here.

        The sample size is the subsampler's fixed number of leaves when one
        was configured, otherwise ``int(tree.n_cell * ratio)``.

        The single edge leading from the root, if the pruned tree has one,
        can be kept. This edge is often used to represent the time that the
        root lives before any divisions occur in the phylogeny, and is useful
        where branch lengths are critical, like simulating ground truth
        phylogenies or estimating branch lengths.

        Args:
            tree: The CassiopeiaTree for which to subsample leaves
            keep_singular_root_edge: Whether to keep (rather than collapse)
                the single edge leading from the root of the subsampled
                tree, if it exists

        Returns:
            A new CassiopeiaTree that is the induced subtree on a sample of
                the leaves in the given tree

        Raises:
            LeafSubsamplerError if the sample size is <= 0, or larger than the
                number of leaves in the tree
        """
        sampling_ratio, fixed_count = self.__ratio, self.__number_of_leaves
        if fixed_count is None:
            sample_size = int(tree.n_cell * sampling_ratio)
        else:
            sample_size = fixed_count

        if sample_size <= 0:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is <= 0.")
        if sample_size > tree.n_cell:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is greater than the "
                "number of leaves in the given tree.")

        # Sampling which leaves to keep is done by choosing which to drop.
        drop_count = len(tree.leaves) - sample_size
        pruned = copy.deepcopy(tree)
        dropped_leaves = np.random.choice(pruned.leaves,
                                          size=drop_count,
                                          replace=False)
        pruned.remove_leaves_and_prune_lineages(dropped_leaves)

        # When the root has exactly one child and the root edge should be
        # kept, start collapsing from that child instead of the default
        # source so the root edge is not part of the collapse.
        root_children = pruned.children(pruned.root)
        if len(root_children) == 1 and keep_singular_root_edge:
            collapse_source = root_children[0]
        else:
            collapse_source = None
        pruned.collapse_unifurcations(source=collapse_source)

        # Re-annotate the surviving nodes with their original times.
        pruned.set_times(
            {node: tree.get_time(node) for node in pruned.nodes})
        return pruned
    def test_against_closed_form_solution_small(
        self,
        name,
        sampling_probability,
        birth_rate,
        many_characters,
        discretization_level,
    ):
        """
        Check the Bayesian estimator against numerical reference values on a
        tiny tree.

        The tree is 0 -> 1 -> {2, 3}, so "1" is the only internal node below
        the root. For such a tree the likelihood of the data and the
        posterior age of node "1" are cheap to compute independently, and
        the model's log likelihood, log joints, posterior and posterior mean
        are compared with those reference values. The test is run for
        different model hyperparameters, particularly the birth rate and
        sampling probability.
        """
        # Ground truth topology and character states.
        topology = nx.DiGraph()
        topology.add_nodes_from(["0", "1", "2", "3"])
        topology.add_edges_from([("0", "1"), ("1", "2"), ("1", "3")])
        tree = CassiopeiaTree(tree=topology)
        if many_characters:
            character_states = {
                "0": [0, 0, 0, 0, 0, 0, 0, 0, 0],
                "1": [0, 1, 0, 0, 0, 0, 1, -1, 0],
                "2": [0, -1, 0, 1, 1, 0, 1, -1, -1],
                "3": [0, -1, -1, 1, 0, 0, -1, -1, -1],
            }
        else:
            character_states = {"0": [0], "1": [1], "2": [-1], "3": [1]}
        tree.set_all_character_states(character_states)

        # Hyperparameters shared by the model and every reference helper.
        mutation_rate = 0.3  # This is kind of arbitrary; not super relevant.
        process_params = {
            "mutation_rate": mutation_rate,
            "birth_rate": birth_rate,
            "sampling_probability": sampling_probability,
        }

        # Estimate branch lengths.
        model = IIDExponentialBayesian(
            discretization_level=discretization_level, **process_params
        )
        model.estimate_branch_lengths(tree)

        # The log likelihood must agree with the log-sum-exp of the joints
        # of the age of vertex 1.
        rel_err = relative_error(
            -model.log_likelihood, -logsumexp(model.log_joints("1"))
        )
        self.assertLessEqual(rel_err, 0.01)

        # The log likelihood must agree with its numerical computation.
        numerical_log_likelihood = calc_numerical_log_likelihood(
            tree=tree, **process_params
        )
        rel_err = relative_error(
            -model.log_likelihood, -numerical_log_likelihood
        )
        self.assertLessEqual(rel_err, 0.01)

        # The _whole_ array of log joints P(t_v = t, X, T) must agree with
        # its numerical computation. The first and last 50 grid points are
        # excluded from the comparison.
        numerical_log_joints = calc_numerical_log_joints(
            tree=tree,
            node="1",
            discretization_level=discretization_level,
            **process_params,
        )
        interior = slice(50, -50)
        np.testing.assert_array_almost_equal(
            model.log_joints("1")[interior],
            numerical_log_joints[interior],
            decimal=1,
        )

        # The posterior over times must agree with the numerical posterior.
        # Debugging tip: plotting model.posterior_time("1") next to
        # numerical_posterior should give two very similar curves.
        numerical_posterior = numerical_posterior_time(
            tree=tree,
            node="1",
            discretization_level=discretization_level,
            **process_params,
        )
        absolute_gap = np.abs(model.posterior_time("1") - numerical_posterior)
        total_variation = np.sum(absolute_gap)
        self.assertLessEqual(total_variation, 0.03)

        # The posterior mean must agree with the numerical posterior mean,
        # computed on the grid 0, 1/n, ..., 1 with n = discretization_level.
        grid_indices = np.arange(discretization_level + 1)
        numerical_posterior_mean = np.sum(
            numerical_posterior * grid_indices / discretization_level
        )
        posterior_mean = tree.get_time("1")
        rel_err = relative_error(posterior_mean, numerical_posterior_mean)
        self.assertLessEqual(rel_err, 0.01)
def calc_exact_log_full_joint(
    tree: CassiopeiaTree,
    mutation_rate: float,
    birth_rate: float,
    sampling_probability: float,
) -> float:
    """
    Exact log full joint probability computation.

    This method is used for testing the implementation of the model.

    The log full joint probability density of the observed tree topology,
    state vectors, and branch lengths. In other words:
    log P(branch lengths, character states, tree topology)
    Integrating this function allows computing the marginals and hence
    the posteriors of the times of any internal node in the tree.

    The value is accumulated edge by edge: every edge contributes a birth
    process (with subsampling) term and a mutation process term.

    Note that using this function to obtain marginals is only fast enough
    for small trees: presumably that integration is what scales
    exponentially with the number of internal nodes (a single evaluation
    here is one pass over the edges).

    Args:
        tree: The CassiopeiaTree containing the tree topology, node times
            and all character states. It is deep-copied, so the caller's
            object is left untouched.
        mutation_rate: The mutation rate of the model.
        birth_rate: The birth rate of the model.
        sampling_probability: The sampling probability of the model.

    Returns:
        log P(branch lengths, character states, tree topology), or -inf
        when the mutation term of some edge evaluates to nan.
    """
    tree = deepcopy(tree)
    ll = 0.0
    lam = birth_rate
    r = mutation_rate
    q_inv = (1.0 - sampling_probability) / sampling_probability
    lg = np.log
    e = np.exp
    T = tree.get_max_depth_of_tree()
    # Loop invariants: the tree is not modified while iterating its edges.
    root_time = tree.get_time(tree.root)
    leaves = set(tree.leaves)
    for (parent, child) in tree.edges:
        t = tree.get_branch_length(parent, child)
        # Birth process with subsampling likelihood.
        # h / h_tilde: max depth minus the time of the parent / child,
        # shifted by the root time.
        h = T - tree.get_time(parent) + root_time
        h_tilde = T - tree.get_time(child) + root_time
        if child in leaves:
            # "Easy" case: the edge ends at a leaf, which is expected to
            # sit exactly at the maximum depth.
            assert h_tilde == 0
            ll += (2.0 * lg(q_inv + 1.0) + lam * h -
                   2.0 * lg(q_inv + e(lam * h)) + lg(sampling_probability))
        else:
            ll += (lg(lam) + lam * h - 2.0 * lg(q_inv + e(lam * h)) +
                   2.0 * lg(q_inv + e(lam * h_tilde)) - lam * h_tilde)
        # Mutation process likelihood
        cuts = len(
            tree.get_mutations_along_edge(parent,
                                          child,
                                          treat_missing_as_mutations=False))
        uncuts = tree.get_character_states(child).count(0)
        # Care must be taken here, we might get a nan: e.g. when t * r is 0
        # the log is -inf, and -inf * 0 cuts is nan.
        log_cut_term = lg(1 - e(-t * r)) * cuts
        if np.isnan(log_cut_term):
            return -np.inf
        ll += ((-t * r) * uncuts + log_cut_term +
               lg(binom(cuts + uncuts, cuts)))
    return ll
Example #6
0
    def overlay_data(self, tree: CassiopeiaTree):
        """Overlays Cas9-based lineage tracing data onto the CassiopeiaTree.

        Character states are simulated from the root down (pre-order
        traversal) and written to ``tree`` with ``set_all_character_states``:

        * the root starts with every character uncut (state 0);
        * on each edge of duration ``t``, every still-uncut site is cut with
          probability ``1 - exp(-t * rate)`` using that site's rate from
          ``mutation_rate_per_character``;
        * cuts are optionally collapsed per cassette, then states are
          introduced at the remaining cut sites;
        * cassettes are heritably silenced with probability
          ``1 - exp(-t * heritable_silencing_rate)``;
        * finally, stochastic silencing is applied to the leaves only.

        If ``random_seed`` is set, the global numpy random state is seeded
        with it first. If ``mutation_priors`` is unset, normalized priors are
        drawn once from ``state_generating_distribution`` and stored on the
        instance for this and all future simulations.

        Args:
            tree: Input CassiopeiaTree
        """
        if self.random_seed is not None:
            np.random.seed(self.random_seed)

        # Create the state priors if they don't exist. They are stored on the
        # instance and reused by every later simulation.
        if self.mutation_priors is None:
            self.mutation_priors = {}
            weights = [
                self.state_generating_distribution()
                for _ in range(self.number_of_states)
            ]
            normalizer = np.sum(weights)
            # States are numbered from 1; 0 denotes an uncut site.
            for state, weight in enumerate(weights, start=1):
                self.mutation_priors[state] = weight / normalizer

        n_characters = self.number_of_cassettes * self.size_of_cassette

        # Every node starts with a placeholder row of -1s that is replaced
        # as the traversal reaches it.
        states_by_node = {node: [-1] * n_characters for node in tree.nodes}

        for node in tree.depth_first_traverse_nodes(tree.root, postorder=False):

            if tree.is_root(node):
                states_by_node[node] = [0] * n_characters
                continue

            parent = tree.parent(node)
            elapsed = tree.get_time(node) - tree.get_time(parent)

            # NOTE(review): this is the parent's own list, not a copy; the
            # helpers below presumably return new lists -- confirm.
            states = states_by_node[parent]
            uncut_sites = [
                index for index, state in enumerate(states) if state == 0
            ]

            # Draw one uniform per uncut site, in site order.
            fresh_cuts = []
            for site in uncut_sites:
                site_rate = self.mutation_rate_per_character[site]
                cut_probability = 1 - (np.exp(-elapsed * site_rate))

                if np.random.uniform() < cut_probability:
                    fresh_cuts.append(site)

            # Collapse cuts that are on the same cassette.
            if self.collapse_sites_on_cassette and self.size_of_cassette > 1:
                states, cuts_to_fill = self.collapse_sites(states, fresh_cuts)
            else:
                cuts_to_fill = fresh_cuts

            # Introduce new states at cut sites.
            states = self.introduce_states(states, cuts_to_fill)

            # Heritable silencing of cassettes along this edge.
            heritable_probability = 1 - (
                np.exp(-elapsed * self.heritable_silencing_rate)
            )
            states_by_node[node] = self.silence_cassettes(
                states,
                heritable_probability,
                self.heritable_missing_data_state,
            )

        # Apply stochastic silencing to the leaves only.
        for leaf in tree.leaves:
            states_by_node[leaf] = self.silence_cassettes(
                states_by_node[leaf],
                self.stochastic_silencing_rate,
                self.stochastic_missing_data_state,
            )

        tree.set_all_character_states(states_by_node)