def test_lca_characters_ambiguous(self):
    """Check the LCA character vector when one input holds tuple states.

    The first vector encodes every state as a tuple, while the remaining
    vectors use plain integers with -1 marking missing data.
    """
    tuple_encoded = [(1, 1), (0, 2), (3,), (4,), (5,)]
    character_vectors = [
        tuple_encoded,
        [1, -1, -1, 3, -1],
        [1, 2, 3, 2, -1],
    ]

    lca = data_utilities.get_lca_characters(
        character_vectors, missing_state_indicator=-1
    )

    expected = [1, 0, 3, 0, 5]
    self.assertEqual(lca, expected)
def assess_cutoff(
    self,
    samples: List[str],
    character_matrix: pd.DataFrame,
    missing_state_indicator: int = -1,
) -> bool:
    """Decide whether a clade has reached the hybrid cutoff.

    When ``self.cell_cutoff`` is set, the clade size is compared against it.
    Otherwise the LCA character vector of the clade is computed and the
    largest Hamming distance from that LCA to any sample is compared
    against ``self.lca_cutoff``.

    Args:
        samples: A list of samples in a clade.
        character_matrix: Character matrix, indexed by sample name.
        missing_state_indicator: Indicator for missing data.

    Returns:
        True if the cutoff is reached, False if not.
    """
    # Size-based criterion takes over whenever a cell cutoff is configured.
    if self.cell_cutoff is not None:
        return bool(len(samples) <= self.cell_cutoff)

    # Distance-based criterion: how far is the farthest sample from the LCA?
    clade_states = character_matrix.loc[samples].values.tolist()
    lca_states = np.array(
        data_utilities.get_lca_characters(
            clade_states, missing_state_indicator
        )
    )
    distances_to_lca = []
    for sample in samples:
        sample_states = character_matrix.loc[sample].values
        distances_to_lca.append(
            dissimilarity_functions.hamming_distance(
                lca_states, sample_states
            )
        )

    return bool(np.max(distances_to_lca) <= self.lca_cutoff)
def apply_bottom_solver(
    self,
    cassiopeia_tree: CassiopeiaTree,
    root: int,
    samples: List[str],
    logfile: str = "stdout.log",
    layer: Optional[str] = None,
) -> Tuple[nx.DiGraph, int]:
    """Apply the bottom solver to subproblems.

    A private method for solving subproblems identified by the top-down
    solver with the more precise bottom solver for this instantiation of
    the HybridSolver. A new CassiopeiaTree restricted to ``samples`` is
    built, solved with ``self.bottom_solver``, and the resulting topology
    is attached beneath ``root``.

    NOTE(review): ``logfile`` is forwarded to the bottom solver unchanged;
    no per-subproblem logfile name is derived here. Confirm whether a
    unique logfile per subproblem is expected by callers.

    Args:
        cassiopeia_tree: CassiopeiaTree for the entire dataset. This
            will be subsetted with respect to the samples specified.
        root: Identifier of the root in the master tree
        samples: A list of samples for which to infer a tree.
        logfile: Location for logging output, passed to the bottom solver.
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.

    Returns:
        A tree in the form of a Networkx graph and the original root
            identifier
    """
    # A single sample needs no inference: hang it directly off the root.
    if len(samples) == 1:
        subproblem_tree = nx.DiGraph()
        subproblem_tree.add_edge(root, samples[0])
        return subproblem_tree, root

    if layer:
        character_matrix = cassiopeia_tree.layers[layer].copy()
    else:
        character_matrix = cassiopeia_tree.character_matrix.copy()

    subproblem_character_matrix = character_matrix.loc[samples]

    subtree = CassiopeiaTree(
        subproblem_character_matrix,
        missing_state_indicator=cassiopeia_tree.missing_state_indicator,
        priors=cassiopeia_tree.priors,
    )
    self.bottom_solver.solve(subtree, logfile=logfile)

    subproblem_tree = subtree.get_tree_topology()

    # The solved subtree's root is the only node without a parent; connect
    # it to the identifier used for this subproblem in the master tree.
    subproblem_root = [
        n for n in subproblem_tree if subproblem_tree.in_degree(n) == 0
    ][0]
    subproblem_tree.add_edge(root, subproblem_root)
    return subproblem_tree, root
def update_similarity_map_and_character_matrix(
    self,
    character_matrix: pd.DataFrame,
    similarity_function: Callable[
        [np.array, np.array, int, Dict[int, Dict[int, float]]], float
    ],
    similarity_map: pd.DataFrame,
    cherry: Tuple[str, str],
    new_node: str,
    missing_state_indicator: int = -1,
    weights=None,
) -> pd.DataFrame:
    """Rebuild the similarity map after two nodes are joined into a cherry.

    The LCA character vector of the two joined nodes is appended to
    ``character_matrix`` under ``new_node``, and the similarity map is
    extended with that node. Both members of the cherry are then removed
    from the returned similarity map and, in place, from
    ``character_matrix``.

    Args:
        character_matrix: Contains the character information for all nodes,
            updated in place as nodes are joined and new internal LCA nodes
            are added
        similarity_function: A similarity function
        similarity_map: A similarity map to update
        cherry: A tuple of indices in the similarity map that are joining
        new_node: New node name, to be added to the updated similarity map
        missing_state_indicator: Character representing missing data
        weights: Weighting of each (character, state) pair. Typically a
            transformation of the priors.

    Returns:
        A new similarity map, updated with the new node
    """
    first, second = cherry[0], cherry[1]

    # Positional rows of the two joined nodes in the character matrix.
    row_first = np.where(character_matrix.index == first)[0][0]
    row_second = np.where(character_matrix.index == second)[0][0]

    # Snapshot taken before the new node is appended, so the array passed
    # to the similarity update does not contain the LCA row.
    characters = character_matrix.to_numpy(copy=True)
    similarities = similarity_map.to_numpy()

    lca = data_utilities.get_lca_characters(
        [characters[row_first, :], characters[row_second, :]],
        missing_state_indicator,
    )
    character_matrix.loc[new_node] = lca

    updated_similarities = self.__update_similarity_map(
        characters,
        similarities,
        np.array(lca),
        similarity_function,
        missing_state_indicator,
        weights,
    )

    node_names = list(similarity_map.index)
    node_names.append(new_node)
    rebuilt_map = pd.DataFrame(
        updated_similarities, index=node_names, columns=node_names
    )

    # drop out cherry from similarity map and character matrix
    joined = [first, second]
    rebuilt_map.drop(columns=joined, index=joined, inplace=True)
    character_matrix.drop(index=joined, inplace=True)

    return rebuilt_map
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
):
    """Infers a tree with Cassiopeia-ILP.

    Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
    in the provided CassiopeiaTree.

    Args:
        cassiopeia_tree: Input CassiopeiaTree
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal
            states inferred by Camin-Sokal parsimony. In scoring accuracy,
            this removes artifacts caused by arbitrarily resolving
            polytomies.
        logfile: Location to log progress.

    Raises:
        ILPSolverError: If the solver is weighted but the tree carries no
            priors, or if the character matrix contains ambiguous states.
    """
    if self.weighted and not cassiopeia_tree.priors:
        raise ILPSolverError(
            "Specify prior probabilities in the CassiopeiaTree for weighted"
            " analysis.")

    # setup logfile config
    handler = logging.FileHandler(logfile)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Everything below runs with the file handler attached. The finally
    # clause guarantees it is detached and closed on every exit path (the
    # single-sample early return and any raised error included), so the
    # module logger does not accumulate stale handlers across calls.
    try:
        logger.info("Solving tree with the following parameters.")
        logger.info(
            f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}")
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise ILPSolverError(
                "Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            ))

        logger.info(f"Phylogenetic root: {root}")

        # md5 is only used to derive a stable identifier from the root
        # states, not for anything security-sensitive.
        pid = hashlib.md5(
            "|".join([str(r) for r in root]).encode("utf-8")).hexdigest()

        targets = [
            tuple(t) for t in unique_character_matrix.values.tolist()
        ]

        # A single unique state vector needs no optimization: the tree is
        # just the root with the samples attached.
        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance
        # to the root from each sample
        if (self.maximum_potential_graph_lca_distance is not None) and (
                self.maximum_potential_graph_lca_distance > 0):
            max_lca_distance = self.maximum_potential_graph_lca_distance
        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.
                    missing_state_indicator,
                ) for u in targets
            ]

            # The largest pairwise value of d_i + d_j + 1 is reached by the
            # two largest distances, so there is no need to enumerate
            # every pair of samples.
            if len(lca_distances) >= 2:
                two_largest = sorted(lca_distances)[-2:]
                max_lca_distance = max(
                    max_lca_distance, two_largest[0] + two_largest[1] + 1)

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model; nodes are relabeled to integers
        # for the model and mapped back afterwards
        nodes = list(potential_graph.nodes())
        encoder = {node: index for index, node in enumerate(nodes)}
        decoder = {index: node for node, index in encoder.items()}

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = [encoder[target] for target in targets]
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets)

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile)

        # select best model and post process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root)

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix))

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {
            node: next(node_name_generator)
            for node in cassiopeia_tree.internal_nodes
        }
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
    finally:
        logger.removeHandler(handler)
        handler.close()
def test_lca_characters(self):
    """Check the LCA character vector of three integer state vectors.

    -1 is passed as the missing-state indicator.
    """
    character_vectors = [
        [1, 0, 3, 4, 5],
        [1, -1, -1, 3, -1],
        [1, 2, 3, 2, -1],
    ]

    lca = data_utilities.get_lca_characters(
        character_vectors, missing_state_indicator=-1
    )

    expected = [1, 0, 3, 0, 5]
    self.assertEqual(lca, expected)
def percolate(
    self,
    character_matrix: pd.DataFrame,
    samples: List[str],
    priors: Optional[Dict[int, Dict[int, float]]] = None,
    weights: Optional[Dict[int, Dict[int, float]]] = None,
    missing_state_indicator: int = -1,
) -> Tuple[List[str], List[str]]:
    """Partition the samples by percolating a pairwise similarity graph.

    A graph is built with the samples as nodes and an edge between every
    pair whose similarity (as given by ``self.similarity_function``)
    exceeds ``self.threshold``. Edges are then removed, lowest similarity
    first and all ties at once, until the graph has more than one
    connected component.

    If more than two components result, the LCA character vector of each
    component is computed, a tree is built over those LCAs with
    ``self.joining_solver``, and the components are grouped according to
    the first split below the tree's root that is not a unifurcation.

    Args:
        character_matrix: Character matrix
        samples: A list of samples to partition
        priors: A dictionary storing the probability of each character
            mutating to a particular state.
        weights: Weighting of each (character, state) pair. Typically a
            transformation of the priors.
        missing_state_indicator: Character representing missing data.

    Returns:
        ``(samples, [])`` if no pair of samples exceeds the similarity
            threshold; otherwise a list of sample-name groups, one per
            side of the partition.
    """
    sample_indices = solver_utilities.convert_sample_names_to_indices(
        character_matrix.index, samples)
    unique_character_array = character_matrix.to_numpy()

    G = nx.Graph()
    G.add_nodes_from(sample_indices)

    # Add edge weights into the similarity graph, bucketing the edges by
    # their similarity so that ties can be removed together later
    edges_by_similarity = defaultdict(list)
    for left, right in itertools.combinations(sample_indices, 2):
        similarity = self.similarity_function(
            unique_character_array[left, :],
            unique_character_array[right, :],
            missing_state_indicator,
            weights,
        )
        if similarity > self.threshold:
            edges_by_similarity[similarity].append((left, right))
            G.add_edge(left, right)

    if len(G.edges) == 0:
        return samples, []

    connected_components = list(nx.connected_components(G))
    # Descending order, so popping from the end yields the minimum weight
    descending_weights = sorted(edges_by_similarity, reverse=True)

    # Percolate the similarity graph by continuously removing the minimum
    # edge until at least two components exists
    while len(connected_components) <= 1:
        weakest = descending_weights.pop()
        for node_a, node_b in edges_by_similarity[weakest]:
            G.remove_edge(node_a, node_b)
        connected_components = list(nx.connected_components(G))

    # If the number of connected components > 2, merge components by
    # joining the most similar LCAs of each component until
    # only 2 remain
    if len(connected_components) > 2:
        connected_components = [
            list(component) for component in connected_components
        ]

        # Find the LCA of the nodes in each connected component
        lcas = {}
        component_to_nodes = {}
        for position, members in enumerate(connected_components):
            component_identifier = f"component{position}"
            component_to_nodes[component_identifier] = members
            character_vectors = [
                list(row) for row in unique_character_array[members, :]
            ]
            lcas[component_identifier] = data_utilities.get_lca_characters(
                character_vectors, missing_state_indicator)

        # Build a tree on the LCA characters to cluster the components
        lca_tree = CassiopeiaTree(
            pd.DataFrame.from_dict(lcas, orient="index"),
            missing_state_indicator=missing_state_indicator,
            priors=priors,
        )
        self.joining_solver.solve(lca_tree,
                                  collapse_mutationless_edges=False)

        # Take the split at the root as the clusters of components
        # in the split, ignoring unifurcations
        grouped_components = []
        current_node = lca_tree.root
        while not grouped_components:
            successors = lca_tree.children(current_node)
            if len(successors) == 1:
                current_node = successors[0]
                continue
            grouped_components.extend(
                lca_tree.leaves_in_subtree(child) for child in successors)

        # For each component in each cluster, take the nodes in that
        # component to form the final split
        partition_sides = [
            [
                node for component in cluster
                for node in component_to_nodes[component]
            ]
            for cluster in grouped_components
        ]
    else:
        partition_sides = [
            list(component) for component in connected_components
        ]

    # Convert from component indices back to the sample names in the
    # original character matrix
    sample_names = list(character_matrix.index)
    partition_named = [
        [sample_names[sample_index] for sample_index in side]
        for side in partition_sides
    ]

    return partition_named