def test_hand_solvable_problem_2(self, name, solver):
    """
    Single-branch tree (0 -> 1) with two mutated characters and one
    unmutated character, i.e.:

        root [state = '000']
        |
        v
        child [state = '011']

    The answer can be derived by hand. The log-likelihood, as a function
    of r * t0, is:

        log(exp(-r * t0)) + 2 * log(1 - exp(-r * t0))

    It is maximized at r * t0 = ln(3) ~ 1.098, where it takes the value
    -ln(3) + 2 * ln(2 / 3) ~ -1.910.

    (Because the depth of the tree is fixed to 1, r * t0 = r * 1 is the
    mutation rate.)
    """
    # Build the two-node topology and attach the character states.
    topology = nx.DiGraph()
    topology.add_nodes_from(["0", "1"])
    topology.add_edge("0", "1")
    tree = CassiopeiaTree(tree=topology)
    character_states = {"0": [0, 0, 0], "1": [0, 1, 1]}
    tree.set_all_character_states(character_states)

    # Fit branch lengths with the solver under test.
    estimator = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
    estimator.estimate_branch_lengths(tree)

    # The single branch spans the whole (unit-depth) tree.
    self.assertAlmostEqual(tree.get_branch_length("0", "1"), 1.0, places=3)
    for node, expected_time in (("1", 1.0), ("0", 0.0)):
        self.assertAlmostEqual(tree.get_time(node), expected_time, places=3)

    # Hand-derived optimum and optimal log-likelihood.
    self.assertAlmostEqual(estimator.mutation_rate, np.log(3), places=3)
    self.assertAlmostEqual(estimator.log_likelihood, -1.910, places=3)
def test_on_simulated_data(self, name, solver):
    """
    Simulate data under the model's own assumptions and check that the
    estimated node times and mutation rate are close to the ground truth.
    """
    # Ground-truth topology: a root with two internal children, each of
    # which has two leaves.
    topology = nx.DiGraph()
    topology.add_nodes_from(["0", "1", "2", "3", "4", "5", "6"])
    topology.add_edges_from(
        [
            ("0", "1"),
            ("0", "2"),
            ("1", "3"),
            ("1", "4"),
            ("2", "5"),
            ("2", "6"),
        ]
    )
    tree = CassiopeiaTree(tree=topology)
    true_times = {
        "0": 0,
        "1": 0.1,
        "2": 0.9,
        "3": 1.0,
        "4": 1.0,
        "5": 1.0,
        "6": 1.0,
    }
    tree.set_times(true_times)

    # Seed before building the simulator so the run is reproducible.
    np.random.seed(1)
    simulator = Cas9LineageTracingDataSimulator(
        number_of_cassettes=200,
        size_of_cassette=1,
        mutation_rate=1.5,
    )
    simulator.overlay_data(tree)

    model = IIDExponentialMLE(minimum_branch_length=1e-4, solver=solver)
    model.estimate_branch_lengths(tree)

    # Internal nodes should be recovered near their true times.
    self.assertTrue(0.05 < tree.get_time("1") < 0.15)
    self.assertTrue(0.8 < tree.get_time("2") < 1.0)
    # All leaves were simulated at time 1.0.
    for leaf in ("3", "4", "5", "6"):
        self.assertTrue(0.9 < tree.get_time(leaf) < 1.1)
    # The simulator used a mutation rate of 1.5.
    self.assertTrue(1.4 < model.mutation_rate < 1.6)
    self.assertAlmostEqual(tree.get_time("0"), 0.0, places=3)
def subsample_leaves(
    self, tree: CassiopeiaTree, keep_singular_root_edge: bool = True
) -> CassiopeiaTree:
    """Uniformly subsample the leaves of a tree.

    Draws a uniform random sample (without replacement) of the leaves of
    the given CassiopeiaTree and returns the tree pruned down to the
    lineages leading to the sampled leaves (the "induced subtree" on the
    sample). The work is done on a deep copy of `tree`; node times in the
    result are copied over from `tree`.

    If, after pruning, the root has exactly one child, the edge from the
    root to that child can be kept instead of being collapsed. This edge
    is often used to represent the time the root lives before any division
    occurs in the phylogeny, which matters when branch lengths are
    critical (e.g. simulating ground truth phylogenies or estimating
    branch lengths).

    Args:
        tree: The CassiopeiaTree for which to subsample leaves
        keep_singular_root_edge: Whether to keep (rather than collapse)
            the single edge leading from the root in the subsample, if it
            exists

    Returns:
        A new CassiopeiaTree that is the induced subtree on a sample of
            the leaves in the given tree

    Raises:
        LeafSubsamplerError if the sample size is <= 0, or larger than the
            number of leaves in the tree
    """
    sampling_ratio = self.__ratio
    requested_leaves = self.__number_of_leaves

    # An explicit leaf count takes precedence over the ratio.
    if requested_leaves is None:
        n_subsample = int(tree.n_cell * sampling_ratio)
    else:
        n_subsample = requested_leaves

    if n_subsample <= 0:
        raise LeafSubsamplerError(
            "Specified number of leaves sampled is <= 0."
        )
    if n_subsample > tree.n_cell:
        raise LeafSubsamplerError(
            "Specified number of leaves sampled is greater than the number"
            " of leaves in the given tree."
        )

    # Sampling which leaves to keep is done by choosing which to remove.
    n_remove = len(tree.leaves) - n_subsample
    subsampled_tree = copy.deepcopy(tree)
    leaf_remove = np.random.choice(
        subsampled_tree.leaves, n_remove, replace=False
    )
    subsampled_tree.remove_leaves_and_prune_lineages(leaf_remove)

    # Keep the singular root edge if it exists and is indicated to be
    # kept: collapsing then starts from the root's only child.
    root_children = subsampled_tree.children(subsampled_tree.root)
    collapse_source = None
    if len(root_children) == 1 and keep_singular_root_edge:
        collapse_source = root_children[0]
    subsampled_tree.collapse_unifurcations(source=collapse_source)

    # Copy and annotate branch lengths and times from the original tree.
    node_times = {
        node: tree.get_time(node) for node in subsampled_tree.nodes
    }
    subsampled_tree.set_times(node_times)

    return subsampled_tree
def test_against_closed_form_solution_small(
    self,
    name,
    sampling_probability,
    birth_rate,
    many_characters,
    discretization_level,
):
    """
    On a small tree with a single internal node, the likelihood of the
    data and the posterior age of that internal node can be computed
    numerically without the model. The model's values are checked against
    those reference values for several hyperparameter settings, notably
    the birth rate and the sampling probability.
    """
    # Ground truth topology: 0 -> 1 -> {2, 3}; node "1" is the only
    # internal node.
    topology = nx.DiGraph()
    topology.add_nodes_from(["0", "1", "2", "3"])
    topology.add_edges_from([("0", "1"), ("1", "2"), ("1", "3")])
    tree = CassiopeiaTree(tree=topology)

    if many_characters:
        character_states = {
            "0": [0, 0, 0, 0, 0, 0, 0, 0, 0],
            "1": [0, 1, 0, 0, 0, 0, 1, -1, 0],
            "2": [0, -1, 0, 1, 1, 0, 1, -1, -1],
            "3": [0, -1, -1, 1, 0, 0, -1, -1, -1],
        }
    else:
        character_states = {"0": [0], "1": [1], "2": [-1], "3": [1]}
    tree.set_all_character_states(character_states)

    # Estimate branch lengths. The mutation rate is somewhat arbitrary.
    mutation_rate = 0.3
    model = IIDExponentialBayesian(
        mutation_rate=mutation_rate,
        birth_rate=birth_rate,
        sampling_probability=sampling_probability,
        discretization_level=discretization_level,
    )
    model.estimate_branch_lengths(tree)

    # Keyword arguments shared by every numerical reference computation.
    shared = dict(
        tree=tree,
        mutation_rate=mutation_rate,
        birth_rate=birth_rate,
        sampling_probability=sampling_probability,
    )

    # 1. Model log likelihood vs. the log-sum-exp of the joints of the age
    #    of vertex 1.
    log_likelihood_from_joints = logsumexp(model.log_joints("1"))
    re = relative_error(-model.log_likelihood, -log_likelihood_from_joints)
    self.assertLessEqual(re, 0.01)

    # 2. Model log likelihood vs. its numerical computation.
    numerical_log_likelihood = calc_numerical_log_likelihood(**shared)
    re = relative_error(-model.log_likelihood, -numerical_log_likelihood)
    self.assertLessEqual(re, 0.01)

    # 3. The _whole_ array of log joints P(t_v = t, X, T) vs. its
    #    numerical computation; the first and last 50 grid points are
    #    excluded from the comparison.
    numerical_log_joints = calc_numerical_log_joints(
        node="1",
        discretization_level=discretization_level,
        **shared,
    )
    np.testing.assert_array_almost_equal(
        model.log_joints("1")[50:-50],
        numerical_log_joints[50:-50],
        decimal=1,
    )

    # 4. Model posterior over times vs. the numerical posterior.
    #    (For debugging: plotting model.posterior_time("1") next to
    #    numerical_posterior should give two very similar curves.)
    numerical_posterior = numerical_posterior_time(
        node="1",
        discretization_level=discretization_level,
        **shared,
    )
    absolute_gap = np.abs(model.posterior_time("1") - numerical_posterior)
    total_variation = np.sum(absolute_gap)
    self.assertLessEqual(total_variation, 0.03)

    # 5. Posterior mean vs. the numerical posterior mean.
    grid_indices = np.arange(discretization_level + 1)
    numerical_posterior_mean = np.sum(
        numerical_posterior * grid_indices / discretization_level
    )
    posterior_mean = tree.get_time("1")
    re = relative_error(posterior_mean, numerical_posterior_mean)
    self.assertLessEqual(re, 0.01)
def calc_exact_log_full_joint(
    tree: CassiopeiaTree,
    mutation_rate: float,
    birth_rate: float,
    sampling_probability: float,
) -> float:
    """
    Exact log full joint probability computation.

    This method is used for testing the implementation of the model.

    The log full joint probability density of the observed tree topology,
    state vectors, and branch lengths. In other words:
    log P(branch lengths, character states, tree topology)
    Integrating this function allows computing the marginals and hence
    the posteriors of the times of any internal node in the tree.

    The result is the sum, over all edges of the tree, of a birth process
    (with subsampling) term and a mutation process term.

    Note that this method is only fast enough for small trees. Its run
    time scales exponentially with the number of internal nodes of the
    tree.

    Args:
        tree: The CassiopeiaTree containing the tree topology, node times
            and all character states. It is deep-copied before use.
        mutation_rate: The mutation rate of the model.
        birth_rate: The birth rate of the model.
        sampling_probability: The sampling probability of the model.

    Returns:
        log P(branch lengths, character states, tree topology), or
        -np.inf if the mutation term of some edge evaluates to NaN.

    Raises:
        AssertionError: If a leaf is not at the maximum depth of the tree
            (checked with exact floating point equality).
    """
    tree = deepcopy(tree)

    lam = birth_rate
    r = mutation_rate
    q_inv = (1.0 - sampling_probability) / sampling_probability
    lg = np.log
    e = np.exp

    # Quantities that do not depend on the edge being visited.
    tree_depth = tree.get_max_depth_of_tree()
    root_time = tree.get_time(tree.root)
    leaves = set(tree.leaves)

    ll = 0.0
    for parent, child in tree.edges:
        t = tree.get_branch_length(parent, child)

        # Birth process with subsampling likelihood.
        # h / h_tilde: time remaining until the maximum depth of the tree,
        # measured from the parent / child of the edge.
        h = tree_depth - tree.get_time(parent) + root_time
        h_tilde = tree_depth - tree.get_time(child) + root_time
        if child in leaves:
            # "Easy" case: the edge ends at a sampled leaf.
            assert h_tilde == 0
            ll += (
                2.0 * lg(q_inv + 1.0)
                + lam * h
                - 2.0 * lg(q_inv + e(lam * h))
                + lg(sampling_probability)
            )
        else:
            ll += (
                lg(lam)
                + lam * h
                - 2.0 * lg(q_inv + e(lam * h))
                + 2.0 * lg(q_inv + e(lam * h_tilde))
                - lam * h_tilde
            )

        # Mutation process likelihood.
        cuts = len(
            tree.get_mutations_along_edge(
                parent, child, treat_missing_as_mutations=False
            )
        )
        uncuts = tree.get_character_states(child).count(0)
        # Care must be taken here: log(0) * 0 is NaN, which happens e.g.
        # for a zero-length branch with no cuts.
        cut_term = lg(1 - e(-t * r)) * cuts
        if np.isnan(cut_term):
            return -np.inf
        ll += (-t * r) * uncuts + cut_term + lg(binom(cuts + uncuts, cuts))
    return ll
def overlay_data(self, tree: CassiopeiaTree):
    """Overlays Cas9-based lineage tracing data onto the CassiopeiaTree.

    Character states are simulated from the root downwards: every
    non-root node starts from its parent's states, acquires new cuts at
    still-uncut sites with probability 1 - exp(-life_time * rate), and is
    then subject to heritable silencing. Leaves additionally undergo
    stochastic silencing. The resulting states are written to `tree` with
    `set_all_character_states`.

    Side effects: seeds the global numpy RNG when `self.random_seed` is
    set, and populates `self.mutation_priors` if it is still None.

    Args:
        tree: Input CassiopeiaTree
    """
    if self.random_seed is not None:
        np.random.seed(self.random_seed)

    # Create state priors if they don't exist. They are stored on the
    # instance and reused by all future simulations.
    if self.mutation_priors is None:
        self.mutation_priors = {}
        weights = [
            self.state_generating_distribution()
            for _ in range(self.number_of_states)
        ]
        normalizer = np.sum(weights)
        # States are numbered from 1.
        for state, weight in enumerate(weights, start=1):
            self.mutation_priors[state] = weight / normalizer

    number_of_characters = self.number_of_cassettes * self.size_of_cassette

    # Initialize every node's character states to -1.
    character_matrix = {
        node: [-1] * number_of_characters for node in tree.nodes
    }

    # Pre-order traversal, so a parent is always simulated before its
    # children.
    for node in tree.depth_first_traverse_nodes(tree.root, postorder=False):
        if tree.is_root(node):
            # The root is entirely uncut.
            character_matrix[node] = [0] * number_of_characters
        else:
            parent = tree.parent(node)
            life_time = tree.get_time(node) - tree.get_time(parent)

            # NOTE(review): assumes the helper methods below return new
            # arrays rather than mutating the parent's list - confirm.
            character_array = character_matrix[parent]
            open_sites = [
                index
                for index, state in enumerate(character_array)
                if state == 0
            ]

            # One uniform draw per open site, in site order.
            new_cuts = []
            for site in open_sites:
                rate = self.mutation_rate_per_character[site]
                cut_probability = 1 - (np.exp(-life_time * rate))
                if np.random.uniform() < cut_probability:
                    new_cuts.append(site)

            # Collapse cuts that are on the same cassette.
            cuts_remaining = new_cuts
            if self.collapse_sites_on_cassette and self.size_of_cassette > 1:
                character_array, cuts_remaining = self.collapse_sites(
                    character_array, new_cuts
                )

            # Introduce new states at cut sites.
            character_array = self.introduce_states(
                character_array, cuts_remaining
            )

            # Silence cassettes (heritable).
            silencing_probability = 1 - (
                np.exp(-life_time * self.heritable_silencing_rate)
            )
            character_matrix[node] = self.silence_cassettes(
                character_array,
                silencing_probability,
                self.heritable_missing_data_state,
            )

    # Apply stochastic silencing to the leaves only.
    for leaf in tree.leaves:
        character_matrix[leaf] = self.silence_cassettes(
            character_matrix[leaf],
            self.stochastic_silencing_rate,
            self.stochastic_missing_data_state,
        )

    tree.set_all_character_states(character_matrix)