def apply_mutation_to_state(x):
    """Maps every entry of a column to its encoded state.

    Each value is looked up in ``mutation_to_state[x.name]``. Values flagged
    by ``is_ambiguous_state`` are iterated and translated element-wise into a
    tuple; every other value is translated directly.

    NOTE(review): ``mutation_to_state`` and ``is_ambiguous_state`` are not
    defined in this function and must be available in the enclosing scope.

    Args:
        x: A column-like object exposing ``.values`` (the entries) and
            ``.name`` (the key into ``mutation_to_state``) -- presumably a
            pandas Series; confirm against the caller.

    Returns:
        A list with one encoded entry per value in ``x.values``.
    """
    return [
        tuple(mutation_to_state[x.name][member] for member in entry)
        if is_ambiguous_state(entry)
        else mutation_to_state[x.name][entry]
        for entry in x.values
    ]
def get_lca_characters(
    vecs: List[Union[List[int], List[Tuple[int, ...]]]],
    missing_state_indicator: int,
) -> List[int]:
    """Reconstructs the LCA character vector under Camin-Sokal parsimony.

    At each position the states of all input vectors are pooled (ambiguous
    states contribute each of their members). If exactly one distinct state
    remains -- either outright, or after discarding the missing-state
    indicator -- that state is assigned to the LCA. Consequently a position
    where every vector is missing is imputed as missing, and any position
    with conflicting non-missing states is left at 0.

    Args:
        vecs: Character vectors to summarize; all must share one length.
        missing_state_indicator: The state that denotes missing data.

    Returns:
        The character vector of the LCA, as a list.

    Raises:
        AssertionError: If the vectors do not all have the same length.
    """
    n_characters = len(vecs[0])
    for vector in vecs:
        assert len(vector) == n_characters

    lca_vec = [0] * n_characters
    for position in range(n_characters):
        # Pool every state observed at this position.
        observed = set()
        for vector in vecs:
            state = vector[position]
            if is_ambiguous_state(state):
                observed.update(state)
            else:
                observed.add(state)

        # Missing data only matters when it is the sole observation.
        if len(observed) > 1:
            observed.discard(missing_state_indicator)

        if len(observed) == 1:
            lca_vec[position] = next(iter(observed))

    return lca_vec
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
):
    """Infers a tree with Cassiopeia-ILP.

    Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
    in the provided CassiopeiaTree.

    The procedure is: compute the root as the LCA of all unique samples,
    infer a potential graph bounded by the maximum LCA distance, formulate
    and solve a Steiner-tree ILP on that graph, post-process the best
    solution, and finally populate ``cassiopeia_tree`` with it.

    The file handler used for progress logging is always detached from the
    logger and closed before this method returns, including on the
    single-sample early return and when an error is raised.

    Args:
        cassiopeia_tree: Input CassiopeiaTree
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: Location to log progress.

    Raises:
        ILPSolverError: If the solver is weighted but the tree carries no
            priors, or if the character matrix contains ambiguous states.
    """
    if self.weighted and not cassiopeia_tree.priors:
        raise ILPSolverError(
            "Specify prior probabilities in the CassiopeiaTree for weighted"
            " analysis.")

    # setup logfile config
    handler = logging.FileHandler(logfile)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Everything below runs under try/finally so the handler never leaks:
    # previously the early return and any raised error left it attached
    # (duplicating log output on later calls) and its file left open.
    try:
        logger.info("Solving tree with the following parameters.")
        logger.info(f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}")
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise ILPSolverError("Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            ))

        logger.info(f"Phylogenetic root: {root}")

        # The process ID is a digest of the root's character states.
        pid = hashlib.md5("|".join([str(r) for r in root
                                   ]).encode("utf-8")).hexdigest()

        targets = [tuple(t) for t in unique_character_matrix.values.tolist()]

        # A single unique sample needs no optimization: the tree is the
        # root with the sample names attached.
        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance to
        # the root from each sample
        if (self.maximum_potential_graph_lca_distance
                is not None) and (self.maximum_potential_graph_lca_distance
                                  > 0):
            max_lca_distance = self.maximum_potential_graph_lca_distance
        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.
                    missing_state_indicator,
                ) for u in targets
            ]

            for dist_a, dist_b in itertools.combinations(lca_distances, 2):
                max_lca_distance = max(max_lca_distance, dist_a + dist_b + 1)

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model; nodes are relabelled to integer
        # ids for the model and mapped back afterwards
        nodes = list(potential_graph.nodes())
        encoder = {node: index for index, node in enumerate(nodes)}
        decoder = {index: node for node, index in encoder.items()}

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = [encoder[target] for target in targets]
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets)

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile)

        # select best model and post process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root)

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix))

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {}
        for i in cassiopeia_tree.internal_nodes:
            internal_node_rename[i] = next(node_name_generator)
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
    finally:
        logger.removeHandler(handler)
        handler.close()
def convert_lineage_profile_to_character_matrix(
    lineage_profile: pd.DataFrame,
    indel_priors: Optional[pd.DataFrame] = None,
    missing_allele_indicator: Optional[str] = None,
    missing_state_indicator: int = -1,
) -> Tuple[pd.DataFrame, Dict[int, Dict[int, float]], Dict[int, Dict[int,
                                                                      str]]]:
    """Converts a lineage profile to a character matrix.

    Takes in a lineage profile summarizing the explicit indel identities
    observed at each cut site in a cell and converts this into a character
    matrix where the indels are abstracted into integers.

    Within each column, states are assigned as follows: alleles equal to
    "Missing" or "NC" (and NaN entries, which are filled with "Missing")
    map to ``missing_state_indicator``; alleles containing "none"
    (case-insensitive) map to 0; every other allele receives the next
    unused positive integer, in order of first appearance.

    Note:
        The lineage profile is converted directly into a character matrix,
        without performing any collapsing of duplicate states. Instead, this
        should have been done in the previous step, when calling
        :func:`convert_alleletable_to_lineage_profile`.

    Args:
        lineage_profile: Lineage profile. The input frame is not modified.
        indel_priors: Dataframe mapping indels to prior probabilities. It is
            indexed by indel and read through its "freq" column.
        missing_allele_indicator: An allele that is being used to represent
            missing data.
        missing_state_indicator: State to indicate missing data

    Returns:
        A character matrix, prior probability dictionary, and mapping from
            character/state pairs to indel identities.
    """
    prior_probs = defaultdict(dict)
    indel_to_charstate = defaultdict(dict)

    # fillna returns a new frame, so the caller's lineage profile is never
    # touched by the relabelling below.
    lineage_profile = lineage_profile.fillna("Missing")
    if missing_allele_indicator:
        lineage_profile = lineage_profile.replace(
            {missing_allele_indicator: "Missing"})

    columns = [f"r{i}" for i in range(lineage_profile.shape[1])]
    lineage_profile.columns = columns

    mutation_to_state = defaultdict(dict)

    for c, col in enumerate(columns):
        indel_to_charstate[c] = {}
        next_state = 1

        for indels in lineage_profile[col].factorize()[1].values:
            # Treat unambiguous entries as a one-element ambiguous entry so
            # both cases share the loop below.
            if not is_ambiguous_state(indels):
                indels = (indels, )

            for indel in indels:
                if indel in ("Missing", "NC"):
                    # Honor the caller's indicator (previously a hard-coded
                    # -1 that silently ignored the parameter).
                    mutation_to_state[col][indel] = missing_state_indicator

                elif "none" in indel.lower():
                    mutation_to_state[col][indel] = 0

                elif indel not in mutation_to_state[col]:
                    state = next_state
                    next_state += 1
                    mutation_to_state[col][indel] = state
                    indel_to_charstate[c][state] = indel

                    if indel_priors is not None:
                        prob = np.mean(indel_priors.loc[indel]["freq"])
                        prior_probs[c][state] = float(prob)

    # Helper function to apply to lineage profile
    def apply_mutation_to_state(x):
        state_lookup = mutation_to_state[x.name]
        return [
            tuple(state_lookup[_v] for _v in v)
            if is_ambiguous_state(v) else state_lookup[v] for v in x.values
        ]

    character_matrix = lineage_profile.apply(apply_mutation_to_state, axis=0)
    character_matrix.index = lineage_profile.index
    character_matrix.columns = columns

    return character_matrix, prior_probs, indel_to_charstate
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
):
    """Implements a top-down greedy solving procedure.

    The procedure recursively splits a set of samples to build a tree. At
    each partition of the samples, an ancestral node is created and each
    side of the partition is placed as a daughter clade of that node. This
    continues until each side of the partition is comprised only of single
    samples. If an algorithm cannot produce a split on a set of samples,
    then those samples are placed as sister nodes and the procedure
    terminates, generating a polytomy in the tree. This function will
    populate a tree inside the input CassiopeiaTree.

    Args:
        cassiopeia_tree: CassiopeiaTree storing a character matrix and
            priors.
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: File location to log output. Not currently used.

    Raises:
        GreedySolverError: If the character matrix contains ambiguous
            states.
    """

    # A helper function that builds the subtree given a set of samples
    def _solve(
        samples: List[Union[str, int]],
        tree: nx.DiGraph,
        unique_character_matrix: pd.DataFrame,
        weights: Dict[int, Dict[int, float]],
        missing_state_indicator: int,
    ):
        if len(samples) == 1:
            return samples[0]

        # Finds the best partition of the set given the split criteria
        clades = list(
            self.perform_split(
                unique_character_matrix,
                samples,
                weights,
                missing_state_indicator,
            ))

        # Generates a root for this subtree with a unique int identifier
        root = next(node_name_generator)
        tree.add_node(root)

        # Drop empty clades. This builds a new list rather than removing
        # during iteration, which skips the element following each removal
        # and could leave empty clades behind.
        clades = [clade for clade in clades if len(clade) > 0]

        # If unable to return a split, generate a polytomy and return
        if len(clades) == 1:
            for sample in clades[0]:
                tree.add_edge(root, sample)
            return root

        # Recursively generate the subtrees for each daughter clade
        for clade in clades:
            child = _solve(
                clade,
                tree,
                unique_character_matrix,
                weights,
                missing_state_indicator,
            )
            tree.add_edge(root, child)
        return root

    # Shared by _solve (through its closure) and the duplicate-appending
    # step below.
    node_name_generator = solver_utilities.node_name_generator()

    weights = None
    if cassiopeia_tree.priors:
        weights = solver_utilities.transform_priors(
            cassiopeia_tree.priors, self.prior_transformation)

    # extract character matrix
    if layer:
        character_matrix = cassiopeia_tree.layers[layer].copy()
    else:
        character_matrix = cassiopeia_tree.character_matrix.copy()

    # Raise exception if the character matrix has ambiguous states.
    if any(
            is_ambiguous_state(state)
            for state in character_matrix.values.flatten()):
        raise GreedySolverError(
            "Solver does not support ambiguous states.")

    # The tree is solved over unique rows only; duplicates are re-attached
    # afterwards.
    unique_character_matrix = character_matrix.drop_duplicates()

    tree = nx.DiGraph()
    tree.add_nodes_from(list(unique_character_matrix.index))

    _solve(
        list(unique_character_matrix.index),
        tree,
        unique_character_matrix,
        weights,
        cassiopeia_tree.missing_state_indicator,
    )

    # Append duplicate samples
    duplicates_tree = self.__add_duplicates_to_tree(
        tree, character_matrix, node_name_generator)
    cassiopeia_tree.populate_tree(duplicates_tree, layer=layer)

    # Collapse mutationless edges
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True)