def overlay_data(
    self,
    tree: CassiopeiaTree,
    attribute_key: str = "spatial",
):
    """Overlays spatial data onto the CassiopeiaTree via Brownian motion.

    The root is placed at the origin. Every other node is placed at its
    parent's location plus a normally distributed displacement whose
    standard deviation, in each dimension, is
    ``sqrt(2 * self.diffusion_coefficient * branch_length)``.

    When ``self.scale_unit_area`` is truthy, all coordinates are shifted so
    the smallest value in each dimension is 0 and then divided by the single
    largest coordinate, so every value lies in [0, 1]. If all nodes share
    the same location (largest coordinate is 0 after shifting) the scaling
    step is skipped, leaving every coordinate at 0 instead of NaN.

    The resulting coordinates are stored as a tuple in the node attribute
    ``attribute_key`` of every node, and as columns of ``tree.cell_meta``
    for the leaves.

    Args:
        tree: The CassiopeiaTree to overlay spatial data on to.
        attribute_key: The name of the attribute to save the coordinates as.
            This also serves as the prefix of the coordinates saved into
            the `cell_meta` attribute as `{attribute_key}_i` where i is
            an integer from 0...`dim-1`.
    """
    # Using numpy arrays instead of tuples for easy vector operations
    locations = {tree.root: np.zeros(self.dim)}
    for parent, child in tree.depth_first_traverse_edges(source=tree.root):
        branch_length = tree.get_branch_length(parent, child)
        displacement = np.random.normal(
            scale=np.sqrt(2 * self.diffusion_coefficient * branch_length),
            size=self.dim,
        )
        locations[child] = locations[parent] + displacement

    # Scale if desired
    if self.scale_unit_area:
        # Python dictionaries preserve insertion order (since 3.6), so row i
        # of this array belongs to the i-th key of `locations`.
        all_coordinates = np.array(list(locations.values()))

        # Shift each dimension so that the smallest value is at 0.
        all_coordinates -= all_coordinates.min(axis=0)

        # Scale all dimensions (by the same value) so that all values are
        # between [0, 1]. We don't scale each dimension separately because
        # we want to retain the shape of the distribution.
        largest = all_coordinates.max()
        # A maximum of 0 means every node is at the same point; dividing
        # would turn all coordinates into NaN, so leave them at 0.
        if largest > 0:
            all_coordinates /= largest

        locations = dict(zip(locations, all_coordinates))

    # Set node attributes
    for node, loc in locations.items():
        tree.set_attribute(node, attribute_key, tuple(loc))

    # Set cell meta: extend a copy of the existing table, or start a new one
    # indexed by the leaves when the tree has none.
    cell_meta = (
        tree.cell_meta.copy()
        if tree.cell_meta is not None
        else pd.DataFrame(index=tree.leaves)
    )
    columns = [f"{attribute_key}_{i}" for i in range(self.dim)]
    cell_meta[columns] = np.nan
    for leaf in tree.leaves:
        cell_meta.loc[leaf, columns] = locations[leaf]
    tree.cell_meta = cell_meta
def score_small_parsimony(
    cassiopeia_tree: CassiopeiaTree,
    meta_item: str,
    root: Optional[str] = None,
    infer_ancestral_states: bool = True,
    label_key: Optional[str] = "label",
) -> int:
    """Computes the small-parsimony of the tree.

    Using the meta data stored in the specified cell meta column, compute the
    parsimony score of the tree. The score is the number of edges whose
    parent and child carry different values in the `label_key` attribute.

    Args:
        cassiopeia_tree: CassiopeiaTree object with cell meta data.
        meta_item: A column in the CassiopeiaTree cell meta corresponding to a
            categorical variable.
        root: Node to treat as the root. Only the subtree below this node
            will be considered.
        infer_ancestral_states: Whether or not ancestral states must be
            inferred (this will be False if `fitch_hartigan` has already been
            called on the tree.)
        label_key: If ancestral states have already been inferred, this key
            indicates the name of the attribute they're stored in.

    Returns:
        The parsimony score.

    Raises:
        CassiopeiaError if label_key has not been populated. The underlying
            CassiopeiaTreeError is attached as the cause.
    """
    # Operate on a copy; the caller's object is never handed to
    # `fitch_hartigan`.
    cassiopeia_tree = cassiopeia_tree.copy()

    if infer_ancestral_states:
        fitch_hartigan(cassiopeia_tree, meta_item, root, label_key=label_key)

    parsimony = 0
    for parent, child in cassiopeia_tree.depth_first_traverse_edges(source=root):
        # Only the attribute lookups can signal a missing label, so keep the
        # `try` body limited to them.
        try:
            parent_label = cassiopeia_tree.get_attribute(parent, label_key)
            child_label = cassiopeia_tree.get_attribute(child, label_key)
        except CassiopeiaTreeError as error:
            raise CassiopeiaError(
                f"{label_key} does not exist for a node, "
                "try running Fitch-Hartigan or passing "
                "infer_ancestral_states=True."
            ) from error

        if parent_label != child_label:
            parsimony += 1

    return parsimony
def create_clade_colors(
    tree: CassiopeiaTree, clade_colors: Dict[str, Tuple[float, float, float]]
) -> Tuple[
    Dict[str, Tuple[float, float, float]],
    Dict[Tuple[str, str], Tuple[float, float, float]],
]:
    """Assign colors to nodes and branches by clade.

    Clades are processed from largest to smallest (by number of nodes
    returned by the traversal from the clade's node), so when clades overlap
    the color of the smaller clade overrides that of the larger one. A
    `PlottingWarning` is issued when any overlap is detected. An empty
    `clade_colors` yields two empty dictionaries.

    Args:
        tree: The CassiopeiaTree.
        clade_colors: Dictionary containing internal node-color mappings. These
            colors will be used to color all the paths from this node to the
            leaves the provided color.

    Returns:
        Two dictionaries. The first contains the node colors, and the second
            contains the branch colors.
    """
    # Nodes reached by a depth-first traversal from each clade's node.
    descendants = {
        node: set(tree.depth_first_traverse_nodes(node)) for node in clade_colors
    }

    # If no node is shared between clades, the size of the union equals the
    # sum of the individual sizes. `set().union(...)` (rather than
    # `set.union(...)`) also works when there are no clades at all.
    covered = set().union(*descendants.values())
    if len(covered) != sum(len(d) for d in descendants.values()):
        warnings.warn(
            "Some clades specified with `clade_colors` are overlapping. "
            "Colors may be overridden.",
            PlottingWarning,
        )

    # Color by largest clade first
    node_colors = {}
    branch_colors = {}
    for node in sorted(
        descendants, key=lambda x: len(descendants[x]), reverse=True
    ):
        color = clade_colors[node]
        for n1, n2 in tree.depth_first_traverse_edges(node):
            node_colors[n1] = node_colors[n2] = color
            branch_colors[(n1, n2)] = color
    return node_colors, branch_colors
def estimate_branch_lengths(self, tree: CassiopeiaTree) -> None:
    r"""
    MLE under a model of IID memoryless CRISPR/Cas9 mutations.

    One optimization variable is created per node. The root's variable is
    fixed at 0 and all leaves are constrained to share the same value; that
    shared leaf value is stored as the estimated mutation rate, and each
    node's time is its variable divided by that rate. The estimated times
    are written to the tree with `tree.set_times`, and the log-likelihood
    is stored on `self._log_likelihood` (`-inf` if it evaluates to NaN).

    The only caveat is that this method raises an IIDExponentialMLEError
    if the underlying convex optimization solver fails, or a
    ValueError if the character matrix is degenerate (fully mutated,
    or fully unmutated).

    Args:
        tree: The CassiopeiaTree whose branch lengths are estimated. It is
            modified in place.

    Raises:
        IIDExponentialMLEError: If the solver raises, returns no solution,
            or returns a mutation rate outside [1e-8, 15.0].
        ValueError: If the character matrix has no mutations or is fully
            mutated.
    """
    # Extract parameters
    minimum_branch_length = self._minimum_branch_length
    solver = self._solver
    verbose = self._verbose

    # # # # # Check that the character has at least one mutation # # # # #
    if (tree.character_matrix == 0).all().all():
        raise ValueError(
            "The character matrix has no mutations. Please check your data."
        )

    # # # # # Check that the character is not saturated # # # # #
    if (tree.character_matrix != 0).all().all():
        raise ValueError(
            "The character matrix is fully mutated. The MLE does not "
            "exist. Please check your data."
        )

    # # # # # Create variables of the optimization problem # # # # #
    r_X_t_variables = {
        node_id: cp.Variable(name=f"r_X_t_{node_id}")
        for node_id in tree.nodes
    }

    # # # # # Create constraints of the optimization problem # # # # #
    a_leaf = tree.leaves[0]
    root = tree.root
    root_has_time_0_constraint = [r_X_t_variables[root] == 0]
    # The minimum branch length is expressed relative to the leaf value, so
    # it scales with the overall depth of the tree.
    minimum_branch_length_constraints = [
        r_X_t_variables[child]
        >= r_X_t_variables[parent]
        + minimum_branch_length * r_X_t_variables[a_leaf]
        for (parent, child) in tree.edges
    ]
    ultrametric_constraints = [
        r_X_t_variables[leaf] == r_X_t_variables[a_leaf]
        for leaf in tree.leaves
        if leaf != a_leaf
    ]
    all_constraints = (
        root_has_time_0_constraint
        + minimum_branch_length_constraints
        + ultrametric_constraints
    )

    # # # # # Compute the log-likelihood # # # # #
    log_likelihood = 0
    for (parent, child) in tree.edges:
        edge_length = r_X_t_variables[child] - r_X_t_variables[parent]
        num_unmutated = len(
            tree.get_unmutated_characters_along_edge(parent, child)
        )
        num_mutated = len(
            tree.get_mutations_along_edge(
                parent, child, treat_missing_as_mutations=False
            )
        )
        log_likelihood += num_unmutated * (-edge_length)
        log_likelihood += num_mutated * cp.log(
            1 - cp.exp(-edge_length - 1e-5)  # We add eps for stability.
        )

    # # # # # Solve the problem # # # # #
    obj = cp.Maximize(log_likelihood)
    prob = cp.Problem(obj, all_constraints)
    try:
        prob.solve(solver=solver, verbose=verbose)
    except cp.SolverError as error:  # pragma: no cover
        raise IIDExponentialMLEError("Third-party solver failed") from error

    # # # # # Extract the mutation rate # # # # #
    leaf_value = r_X_t_variables[a_leaf].value
    # Without a solution the variable has no value; report that as the
    # documented solver failure instead of a TypeError from `float(None)`.
    if leaf_value is None:
        raise IIDExponentialMLEError("The solver did not produce a solution.")
    self._mutation_rate = float(leaf_value)
    if self._mutation_rate < 1e-8 or self._mutation_rate > 15.0:
        raise IIDExponentialMLEError(
            "The solver failed when it shouldn't have."
        )

    # # # # # Extract the log-likelihood # # # # #
    log_likelihood = float(log_likelihood.value)
    if np.isnan(log_likelihood):
        log_likelihood = -np.inf
    self._log_likelihood = log_likelihood

    # # # # # Populate the tree with the estimated branch lengths # # # # #
    times = {
        node: float(r_X_t_variables[node].value) / self._mutation_rate
        for node in tree.nodes
    }
    # Make sure that the root has time 0 (avoid epsilons)
    times[tree.root] = 0.0
    # We smooth out epsilons that might make a parent's time greater
    # than its child (which can happen if minimum_branch_length=0)
    for (parent, child) in tree.depth_first_traverse_edges():
        times[child] = max(times[parent], times[child])
    tree.set_times(times)
def calculate_parsimony(
    tree: CassiopeiaTree,
    infer_ancestral_characters: bool = False,
    treat_missing_as_mutation: bool = False,
) -> int:
    """
    Counts the mutations that have occurred along the edges of a tree.

    The parsimony is the total number of mutations reported by
    `tree.get_mutations_along_edge` over every edge of the tree, based on
    the character state annotations at the nodes.

    If `infer_ancestral_characters` is True,
    `tree.reconstruct_ancestral_characters` is called first so that the
    internal nodes' character states are inferred from the states at the
    leaves (by Camin-Sokal Parsimony). Use
    `tree.set_character_states_at_leaves` to use a different layer to infer
    ancestral states. Otherwise, the annotations currently present at the
    internal nodes are used.

    `treat_missing_as_mutation` is forwarded to
    `tree.get_mutations_along_edge`: when True, transitions from a
    non-missing state to a missing state are counted, otherwise they are
    not.

    Args:
        tree: The tree to calculate parsimony over
        infer_ancestral_characters: Whether to infer the ancestral
            characters states of the tree
        treat_missing_as_mutation: Whether to treat missing states as
            mutations

    Returns:
        The number of mutations that have occurred on the tree

    Raises:
        TreeMetricError if the root or any other node has an empty list of
            character states
    """
    # The same message is used for the root and for any other internal node.
    empty_internal_message = (
        "Character states empty at internal node. Annotate"
        " character states or infer ancestral characters by"
        " setting infer_ancestral_characters=True."
    )

    if infer_ancestral_characters:
        tree.reconstruct_ancestral_characters()

    if tree.get_character_states(tree.root) == []:
        raise TreeMetricError(empty_internal_message)

    total_mutations = 0
    for parent, child in tree.depth_first_traverse_edges():
        if tree.get_character_states(child) == []:
            if not tree.is_leaf(child):
                raise TreeMetricError(empty_internal_message)
            raise TreeMetricError(
                "Character states have not been initialized at leaves."
                " Use set_character_states_at_leaves or populate_tree"
                " with the character matrix that specifies the leaf"
                " character states."
            )

        edge_mutations = tree.get_mutations_along_edge(
            parent, child, treat_missing_as_mutation
        )
        total_mutations += len(edge_mutations)

    return total_mutations