def test_create_similarity_map(self):
    """Check two entries of the similarity map built with negative-log weights.

    The map is computed from the character matrix of ``self.pp_tree_priors``
    using ``hamming_similarity_without_missing`` and the priors transformed
    with the "negative_log" scheme, then expanded to a square DataFrame
    indexed by sample name.
    """
    cm = self.pp_tree_priors.character_matrix.copy()
    nl_weights = solver_utilities.transform_priors(
        self.pp_tree_priors.priors, "negative_log"
    )

    # compute_dissimilarity_map returns a condensed vector, which squareform
    # expands into the full symmetric matrix.
    condensed = data_utilities.compute_dissimilarity_map(
        cm.to_numpy(),
        cm.shape[0],
        dissimilarity_functions.hamming_similarity_without_missing,
        nl_weights,
        self.pp_tree_priors.missing_state_indicator,
    )
    square = scipy.spatial.distance.squareform(condensed)
    similarity_map = pd.DataFrame(square, index=cm.index, columns=cm.index)

    # (sample, sample) -> expected similarity
    expected_by_pair = {
        ("a", "b"): -np.log(0.5) - np.log(0.8),
        ("a", "e"): -np.log(0.1),
    }
    for (first, second), expected_similarity in expected_by_pair.items():
        self.assertEqual(
            similarity_map.loc[first, second], expected_similarity
        )
def setUp(self):
    """Create the character vectors, priors and transformed weights fixtures."""
    # Two integer-state vectors; s1 contains a single -1 entry.
    self.s1 = np.array([0, 1, 0, -1, 1, 2])
    self.s2 = np.array([1, 1, 0, 0, 1, 3])

    # A vector in which every entry is -1.
    self.all_missing = np.array([-1] * 6)

    # A vector whose entries are tuples of states rather than single states.
    self.ambiguous = [(0,), (-1, 0), (0,), (-1, 0), (1,), (1,)]

    # character index -> {state -> prior probability}
    self.priors = {
        0: {1: 0.5, 2: 0.5},
        1: {1: 0.5, 2: 0.5},
        2: {1: 0.25, 2: 0.75},
        3: {1: 0.3, 2: 0.7},
        4: {1: 0.4, 2: 0.6},
        5: {1: 0.1, 2: 0.05, 3: 0.85},
    }

    # Priors with zero / negative values, used to exercise error handling.
    self.badpriors = {0: {1: 0}, 1: {1: -1, 2: -1.5}}

    # One set of weights per supported transformation name.
    transformations = (
        ("nlweights", "negative_log"),
        ("iweights", "inverse"),
        ("sqiweights", "square_root_inverse"),
    )
    for attribute, transformation in transformations:
        setattr(
            self,
            attribute,
            solver_utilities.transform_priors(self.priors, transformation),
        )
def setup_root_finder(self, cassiopeia_tree: CassiopeiaTree) -> None:
    """Gives the implicit rooting strategy for the SNJ Solver.

    By default, the SpectralNeighborJoining algorithm returns an unrooted
    tree. To root this tree, an implicit root of all zeros is added to the
    character matrix. Then, the dissimilarity map is recalculated using
    the updated character matrix. If the tree already has a computed
    dissimilarity map, only the new similarities are calculated. See
    'setup_root_finder' in NeighborJoiningSolver.

    The rooted character matrix is only installed on the tree temporarily;
    the original character matrix is put back before returning, including
    when the dissimilarity computation raises.

    Args:
        cassiopeia_tree: Input CassiopeiaTree to `solve`

    Raises:
        DistanceSolver.DistanceSolverError: If no dissimilarity function
            was specified on the solver. The tree is left untouched in
            that case.
    """
    # Validate first so that a failure does not leave the tree with a
    # rooted character matrix and a root sample name already set.
    if self.dissimilarity_function is None:
        raise DistanceSolver.DistanceSolverError(
            "Please specify a dissimilarity function to add an implicit "
            "root, or specify an explicit root"
        )

    character_matrix = cassiopeia_tree.character_matrix.copy()
    rooted_character_matrix = character_matrix.copy()

    # The implicit root is a sample with state 0 at every character.
    rooted_character_matrix.loc["root"] = (
        [0] * rooted_character_matrix.shape[1]
    )
    cassiopeia_tree.root_sample_name = "root"
    cassiopeia_tree.character_matrix = rooted_character_matrix

    try:
        dissimilarity_map = cassiopeia_tree.get_dissimilarity_map()
        if dissimilarity_map is None:
            cassiopeia_tree.compute_dissimilarity_map(
                self.dissimilarity_function, self.prior_transformation
            )
        else:
            # The weights do not depend on the leaf, so compute them once
            # instead of once per leaf.
            weights = None
            if cassiopeia_tree.priors:
                weights = solver_utilities.transform_priors(
                    cassiopeia_tree.priors, self.prior_transformation
                )

            root_states = rooted_character_matrix.loc["root"].values
            dissimilarity = {"root": 0}
            for leaf in character_matrix.index:
                dissimilarity[leaf] = self.dissimilarity_function(
                    root_states,
                    rooted_character_matrix.loc[leaf].values,
                    cassiopeia_tree.missing_state_indicator,
                    weights,
                )
            cassiopeia_tree.set_dissimilarity("root", dissimilarity)
    finally:
        # Always restore the un-rooted character matrix.
        cassiopeia_tree.character_matrix = character_matrix
def test_average_missing_data_priors(self):
    """Check where `assign_missing_average` places sample "c3" given weights.

    "c3" is passed as the only sample to be assigned; with the weights
    derived from `priors`, the test expects it to be appended to the left
    set ("c1", "c2") and the right set ("c4", "c5") to be unchanged.
    """
    character_names = ["a", "b", "c", "d", "e"]
    states_by_sample = {
        "c1": [-1, 4, 0, 2, 2],
        "c2": [4, 4, 0, 2, 0],
        "c3": [4, 0, 1, -1, -1],
        "c4": [5, 0, 1, 2, -1],
        "c5": [5, 0, 1, 2, -1],
    }
    cm = pd.DataFrame.from_dict(
        states_by_sample, orient="index", columns=character_names
    )

    # character index -> {state -> prior probability}
    priors = {
        0: {4: 0.5, 5: 0.5},
        1: {4: 1},
        2: {1: 1},
        3: {2: 1},
        4: {2: 1},
    }
    weights = solver_utilities.transform_priors(priors)

    left_samples = ["c1", "c2"]
    right_samples = ["c4", "c5"]
    unassigned_samples = ["c3"]
    left_set, right_set = missing_data_methods.assign_missing_average(
        cm, -1, left_samples, right_samples, unassigned_samples, weights
    )

    self.assertEqual(left_set, ["c1", "c2", "c3"])
    self.assertEqual(right_set, ["c4", "c5"])
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """The general hybrid solver routine.

    The hybrid solver proceeds by clustering together cells using the
    algorithm stored in the top_solver until a criteria is reached. Once
    this criteria is reached, the bottom_solver is applied to each
    subproblem left over from the "greedy" clustering.

    Args:
        cassiopeia_tree: CassiopeiaTree that stores the character matrix
            and priors for reconstruction.
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal states
            inferred by Camin-Sokal parsimony. In scoring accuracy, this
            removes artifacts caused by arbitrarily resolving polytomies.
        logfile: Location to log progress. Each subproblem is given its own
            file name, formed from the part of `logfile` before ".log"
            followed by "-<k>.log", where k counts subproblems from 1.
    """
    node_name_generator = solver_utilities.node_name_generator()

    if layer:
        character_matrix = cassiopeia_tree.layers[layer].copy()
    else:
        character_matrix = cassiopeia_tree.character_matrix.copy()
    unique_character_matrix = character_matrix.drop_duplicates()

    weights = None
    if cassiopeia_tree.priors:
        weights = solver_utilities.transform_priors(
            cassiopeia_tree.priors, self.prior_transformation
        )

    tree = nx.DiGraph()
    # call top-down solver until a desired cutoff is reached.
    _, subproblems, tree = self.apply_top_solver(
        unique_character_matrix,
        list(unique_character_matrix.index),
        tree,
        node_name_generator,
        weights=weights,
        missing_state_indicator=cassiopeia_tree.missing_state_indicator,
    )

    # One argument tuple per subproblem, each with its own numbered logfile.
    logfile_prefix = logfile.split(".log")[0]
    bottom_solver_arguments = [
        (
            cassiopeia_tree,
            subproblem[0],
            subproblem[1],
            f"{logfile_prefix}-{subproblem_number}.log",
            layer,
        )
        for subproblem_number, subproblem in enumerate(subproblems, start=1)
    ]

    # multi-threaded bottom solver approach
    with multiprocessing.Pool(processes=self.threads) as pool:
        results = list(
            tqdm(
                pool.starmap(
                    self.apply_bottom_solver, bottom_solver_arguments
                ),
                total=len(subproblems),
            )
        )

    for result in results:
        subproblem_tree, subproblem_root = result[0], result[1]

        # check that the only overlapping name is the root, else
        # add a new name so that we don't get edges across the tree.
        # Membership is tested against the graph itself rather than a
        # freshly built list of its nodes, which avoids a linear scan
        # for every node of every subproblem.
        mapping = {}
        for n in subproblem_tree:
            if n in tree and n != subproblem_root:
                mapping[n] = next(node_name_generator)

        subproblem_tree = nx.relabel_nodes(subproblem_tree, mapping)

        tree = nx.compose(tree, subproblem_tree)

    # append sample names to the solution and populate the tree
    samples_tree = self.__add_duplicates_to_tree_and_remove_spurious_leaves(
        tree, character_matrix, node_name_generator
    )

    cassiopeia_tree.populate_tree(samples_tree, layer=layer)
    cassiopeia_tree.collapse_unifurcations()

    # collapse mutationless edges
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True
        )
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """Solves a tree for the SharedMutationJoiningSolver.

    An n x n similarity matrix over all pairs of samples is computed with
    the solver's similarity function applied to the character vectors.
    Samples are then joined iteratively: at each step the pair returned by
    `find_cherry` is attached to a newly named parent node, and the
    similarity map and character matrix are updated to replace the pair by
    that parent. Joining stops once a single entry remains in the map.

    The function will update the `tree` attribute of the input
    CassiopeiaTree.

    Args:
        cassiopeia_tree: CassiopeiaTree object to be populated
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal states
            inferred by Camin-Sokal parsimony. In scoring accuracy, this
            removes artifacts caused by arbitrarily resolving polytomies.
        logfile: Location to write standard out. Not currently used.
    """
    name_generator = solver_utilities.node_name_generator()

    character_matrix = (
        cassiopeia_tree.layers[layer].copy()
        if layer
        else cassiopeia_tree.character_matrix.copy()
    )

    weights = (
        solver_utilities.transform_priors(
            cassiopeia_tree.priors, self.prior_transformation
        )
        if cassiopeia_tree.priors
        else None
    )

    # Condensed pairwise similarities, expanded to a labelled square frame.
    condensed_similarities = data_utilities.compute_dissimilarity_map(
        character_matrix.to_numpy(),
        character_matrix.shape[0],
        self.similarity_function,
        weights,
        cassiopeia_tree.missing_state_indicator,
    )
    similarity_map = pd.DataFrame(
        scipy.spatial.distance.squareform(condensed_similarities),
        index=character_matrix.index,
        columns=character_matrix.index,
    )

    # Copy the weights into numba typed dictionaries (left empty when there
    # are no weights) for use with the numba-compiled similarity function.
    nb_weights = numba.typed.Dict.empty(
        numba.types.int64,
        numba.types.DictType(numba.types.int64, numba.types.float64),
    )
    if weights:
        for character, state_weights in weights.items():
            typed_state_weights = numba.typed.Dict.empty(
                numba.types.int64, numba.types.float64
            )
            for state, prior in state_weights.items():
                typed_state_weights[state] = prior
            nb_weights[character] = typed_state_weights

    # Start from a graph in which every sample is an isolated node.
    tree = nx.DiGraph()
    tree.add_nodes_from(similarity_map.index)

    while similarity_map.shape[0] > 1:
        # Positions of the pair to join, translated to their labels.
        i, j = self.find_cherry(similarity_map.values)
        joined_pair = (similarity_map.index[i], similarity_map.index[j])

        parent = next(name_generator)
        tree.add_node(parent)
        tree.add_edges_from([(parent, child) for child in joined_pair])

        similarity_map = self.update_similarity_map_and_character_matrix(
            character_matrix,
            self.nb_similarity_function,
            similarity_map,
            joined_pair,
            parent,
            cassiopeia_tree.missing_state_indicator,
            nb_weights,
        )

    cassiopeia_tree.populate_tree(tree, layer=layer)

    # Optionally collapse mutationless edges.
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True
        )
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """Infers a tree with Cassiopeia-ILP.

    Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
    in the provided CassiopeiaTree.

    A file handler writing to `logfile` is attached to the module logger
    for the duration of the call; it is removed and closed on every exit
    path, including the single-sample early return and any exception.

    Args:
        cassiopeia_tree: Input CassiopeiaTree
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal states
            inferred by Camin-Sokal parsimony. In scoring accuracy, this
            removes artifacts caused by arbitrarily resolving polytomies.
        logfile: Location to log progress.

    Raises:
        ILPSolverError: If the solver is weighted but the tree has no
            priors, or if the character matrix contains ambiguous states.
    """
    if self.weighted and not cassiopeia_tree.priors:
        raise ILPSolverError(
            "Specify prior probabilities in the CassiopeiaTree for weighted"
            " analysis."
        )

    # setup logfile config
    handler = logging.FileHandler(logfile)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    try:
        logger.info("Solving tree with the following parameters.")
        logger.info(f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}"
        )
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        if any(
            is_ambiguous_state(state)
            for state in character_matrix.values.flatten()
        ):
            raise ILPSolverError("Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation
            )

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            )
        )

        logger.info(f"Phylogenetic root: {root}")

        pid = hashlib.md5(
            "|".join([str(r) for r in root]).encode("utf-8")
        ).hexdigest()

        targets = [tuple(t) for t in unique_character_matrix.values.tolist()]

        # A single unique sample needs no ILP: the tree is just the root.
        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix
                )
            )
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance to
        # the root from each sample
        if (self.maximum_potential_graph_lca_distance is not None) and (
            self.maximum_potential_graph_lca_distance > 0
        ):
            max_lca_distance = self.maximum_potential_graph_lca_distance
        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.missing_state_indicator,
                )
                for u in targets
            ]

            # The maximum of d_i + d_j + 1 over all pairs i != j is reached
            # by the two largest distances, so no pairwise scan is needed.
            if len(lca_distances) >= 2:
                largest, second_largest = sorted(
                    lca_distances, reverse=True
                )[:2]
                max_lca_distance = max(
                    max_lca_distance, largest + second_largest + 1
                )

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model; nodes are encoded as integers
        # for the model and decoded again once a solution is selected.
        nodes = list(potential_graph.nodes())
        encoder = {node: index for index, node in enumerate(nodes)}
        decoder = {index: node for node, index in encoder.items()}

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = [encoder[target] for target in targets]
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets
        )

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile
        )

        # select best model and post process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root
        )

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix
            )
        )

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {}
        for i in cassiopeia_tree.internal_nodes:
            internal_node_rename[i] = next(node_name_generator)
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True
            )
    finally:
        # Detach and close the handler on every exit path so repeated calls
        # do not accumulate handlers or leave the log file open.
        logger.removeHandler(handler)
        handler.close()
def test_bad_prior_transformations(self):
    """Expect a PriorTransformationError when transforming `self.badpriors`."""
    self.assertRaises(
        solver_utilities.PriorTransformationError,
        solver_utilities.transform_priors,
        self.badpriors,
        "negative_log",
    )
def solve(
    self,
    cassiopeia_tree: CassiopeiaTree,
    layer: Optional[str] = None,
    collapse_mutationless_edges: bool = False,
    logfile: str = "stdout.log",
) -> None:
    """Implements a solving procedure for the Percolation Algorithm.

    The procedure recursively splits a set of samples to build a tree. At
    each partition of the samples produced by the percolation procedure,
    an ancestral node is created and each side of the partition is placed
    as a daughter clade of that node. This continues until each side of
    the partition is comprised only of single samples. If an algorithm
    cannot produce a split on a set of samples, then those samples are
    placed as sister nodes and the procedure terminates, generating a
    polytomy in the tree. This function will populate a tree inside the
    input CassiopeiaTree.

    Args:
        cassiopeia_tree: CassiopeiaTree storing a character matrix and
            priors.
        layer: Layer storing the character matrix for solving. If None, the
            default character matrix is used in the CassiopeiaTree.
        collapse_mutationless_edges: Indicates if the final reconstructed
            tree should collapse mutationless edges based on internal states
            inferred by Camin-Sokal parsimony. In scoring accuracy, this
            removes artifacts caused by arbitrarily resolving polytomies.
        logfile: Location to write standard out. Not currently used.
    """
    node_name_generator = solver_utilities.node_name_generator()

    # A helper function that builds the subtree given a set of samples
    def _solve(
        samples: List[Union[str, int]],
        tree: nx.DiGraph,
        unique_character_matrix: pd.DataFrame,
        priors: Optional[Dict[int, Dict[int, float]]],
        weights: Optional[Dict[int, Dict[int, float]]],
        missing_state_indicator: int,
    ):
        """Adds the subtree over `samples` to `tree` and returns its root."""
        if len(samples) == 1:
            return samples[0]

        # Partitions the set of samples by percolating a similarity graph
        clades = list(
            self.percolate(
                unique_character_matrix,
                samples,
                priors,
                weights,
                missing_state_indicator,
            )
        )

        # Generates a root for this subtree with a unique int identifier
        root = next(node_name_generator)
        tree.add_node(root)

        # Drop every empty clade. Building a new list, instead of calling
        # `remove` while iterating, ensures consecutive empty clades are
        # all discarded rather than every other one being skipped.
        clades = [clade for clade in clades if len(clade) > 0]

        # If unable to return a split, generate a polytomy and return
        if len(clades) == 1:
            for sample in clades[0]:
                tree.add_edge(root, sample)
            return root

        # Recursively generate the subtrees for each daughter clade
        for clade in clades:
            child = _solve(
                clade,
                tree,
                unique_character_matrix,
                priors,
                weights,
                missing_state_indicator,
            )
            tree.add_edge(root, child)
        return root

    weights = None
    priors = None
    if cassiopeia_tree.priors:
        weights = solver_utilities.transform_priors(
            cassiopeia_tree.priors, self.prior_transformation
        )
        priors = cassiopeia_tree.priors

    # extract character matrix
    if layer:
        character_matrix = cassiopeia_tree.layers[layer].copy()
    else:
        character_matrix = cassiopeia_tree.character_matrix.copy()
    unique_character_matrix = character_matrix.drop_duplicates()

    tree = nx.DiGraph()
    tree.add_nodes_from(list(unique_character_matrix.index))

    _solve(
        list(unique_character_matrix.index),
        tree,
        unique_character_matrix,
        priors,
        weights,
        cassiopeia_tree.missing_state_indicator,
    )

    # Append duplicate samples
    duplicates_tree = self.__add_duplicates_to_tree(
        tree, character_matrix, node_name_generator
    )
    cassiopeia_tree.populate_tree(duplicates_tree, layer=layer)

    # Collapse mutationless edges
    if collapse_mutationless_edges:
        cassiopeia_tree.collapse_mutationless_edges(
            infer_ancestral_characters=True
        )