예제 #1
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ) -> None:
        """Solves a tree for a general bottom-up distance-based solver routine.

        The general solver routine proceeds by iteratively finding pairs of
        samples to join together into a "cherry" and then reform the
        dissimilarity matrix with respect to this new cherry. The implementation
        of how to find cherries and update the dissimilarity map is left to
        subclasses of DistanceSolver. The function will update the `tree`
        attribute of the input CassiopeiaTree.

        Args:
            cassiopeia_tree: CassiopeiaTree object to be populated
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: File location to log output. Not currently used.
        """
        node_name_generator = solver_utilities.node_name_generator()

        dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)

        N = dissimilarity_map.shape[0]

        # instantiate a dissimilarity map that can be updated as we join
        # together nodes.
        _dissimilarity_map = dissimilarity_map.copy()

        # instantiate a tree where all samples appear as leaves.
        tree = nx.Graph()
        tree.add_nodes_from(_dissimilarity_map.index)

        while N > 2:

            i, j = self.find_cherry(_dissimilarity_map.to_numpy())

            # get indices in the dissimilarity matrix to join
            node_i, node_j = (
                _dissimilarity_map.index[i],
                _dissimilarity_map.index[j],
            )

            new_node_name = next(node_name_generator)
            tree.add_node(new_node_name)
            tree.add_edges_from([(new_node_name, node_i),
                                 (new_node_name, node_j)])

            _dissimilarity_map = self.update_dissimilarity_map(
                _dissimilarity_map, (node_i, node_j), new_node_name)

            N = _dissimilarity_map.shape[0]

        tree = self.root_tree(
            tree,
            cassiopeia_tree.root_sample_name,
            _dissimilarity_map.index.values,
        )

        # remove root from character matrix before populating tree
        if (cassiopeia_tree.root_sample_name
                in cassiopeia_tree.character_matrix.index):
            cassiopeia_tree.character_matrix = (
                cassiopeia_tree.character_matrix.drop(
                    index=cassiopeia_tree.root_sample_name))

        cassiopeia_tree.populate_tree(tree, layer=layer)
        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
예제 #2
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """The general hybrid solver routine.

        The hybrid solver proceeds by clustering together cells using the
        algorithm stored in the top_solver until a criteria is reached. Once
        this criteria is reached, the bottom_solver is applied to each
        subproblem left over from the "greedy" clustering.

        Args:
            cassiopeia_tree: CassiopeiaTree that stores the character matrix
                and priors for reconstruction.
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to log progress.
        """
        node_name_generator = solver_utilities.node_name_generator()

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        tree = nx.DiGraph()
        # call top-down solver until a desired cutoff is reached.
        _, subproblems, tree = self.apply_top_solver(
            unique_character_matrix,
            list(unique_character_matrix.index),
            tree,
            node_name_generator,
            weights=weights,
            missing_state_indicator=cassiopeia_tree.missing_state_indicator,
        )

        logfile_names = iter([i for i in range(1, len(subproblems) + 1)])

        # multi-threaded bottom solver approach
        with multiprocessing.Pool(processes=self.threads) as pool:

            results = list(
                tqdm(
                    pool.starmap(
                        self.apply_bottom_solver,
                        [(
                            cassiopeia_tree,
                            subproblem[0],
                            subproblem[1],
                            f"{logfile.split('.log')[0]}-"
                            f"{next(logfile_names)}.log",
                            layer,
                        ) for subproblem in subproblems],
                    ),
                    total=len(subproblems),
                ))

        for result in results:

            subproblem_tree, subproblem_root = result[0], result[1]

            # check that the only overlapping name is the root, else
            # add a new name so that we don't get edges across the tree
            existing_nodes = [n for n in tree]

            mapping = {}
            for n in subproblem_tree:
                if n in existing_nodes and n != subproblem_root:
                    mapping[n] = next(node_name_generator)

            subproblem_tree = nx.relabel_nodes(subproblem_tree, mapping)

            tree = nx.compose(tree, subproblem_tree)

        # append sample names to the solution and populate the tree
        samples_tree = self.__add_duplicates_to_tree_and_remove_spurious_leaves(
            tree, character_matrix, node_name_generator)

        cassiopeia_tree.populate_tree(samples_tree, layer=layer)
        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
예제 #3
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """Infers a tree with Cassiopeia-ILP.

        Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
        in the provided CassiopeiaTree.

        Args:
            cassiopeia_tree: Input CassiopeiaTree
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to log progress.
        """

        if self.weighted and not cassiopeia_tree.priors:
            raise ILPSolverError(
                "Specify prior probabilities in the CassiopeiaTree for weighted"
                " analysis.")

        # setup logfile config
        handler = logging.FileHandler(logfile)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)
        logger.info("Solving tree with the following parameters.")
        logger.info(f"Convergence time limit: {self.convergence_time_limit}")
        logger.info(
            f"Convergence iteration limit: {self.convergence_iteration_limit}")
        logger.info(
            f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
        )
        logger.info(
            f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
        )
        logger.info(f"MIP gap: {self.mip_gap}")

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()
        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise ILPSolverError("Solver does not support ambiguous states.")

        unique_character_matrix = character_matrix.drop_duplicates()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # find the root of the tree & generate process ID
        root = tuple(
            data_utilities.get_lca_characters(
                unique_character_matrix.values.tolist(),
                cassiopeia_tree.missing_state_indicator,
            ))

        logger.info(f"Phylogenetic root: {root}")

        pid = hashlib.md5("|".join([str(r) for r in root
                                    ]).encode("utf-8")).hexdigest()

        targets = [tuple(t) for t in unique_character_matrix.values.tolist()]

        if unique_character_matrix.shape[0] == 1:
            optimal_solution = nx.DiGraph()
            optimal_solution.add_node(root)
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))
            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
            return

        # determine diameter of the dataset by evaluating maximum distance to
        # the root from each sample
        if (self.maximum_potential_graph_lca_distance is not None) and (
                self.maximum_potential_graph_lca_distance > 0):
            max_lca_distance = self.maximum_potential_graph_lca_distance

        else:
            max_lca_distance = 0
            lca_distances = [
                dissimilarity_functions.hamming_distance(
                    root,
                    np.array(u),
                    ignore_missing_state=True,
                    missing_state_indicator=cassiopeia_tree.
                    missing_state_indicator,
                ) for u in targets
            ]

            for (i, j) in itertools.combinations(range(len(lca_distances)), 2):
                max_lca_distance = max(max_lca_distance,
                                       lca_distances[i] + lca_distances[j] + 1)

        # infer the potential graph
        potential_graph = self.infer_potential_graph(
            unique_character_matrix,
            pid,
            max_lca_distance,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # generate Steiner Tree ILP model
        nodes = list(potential_graph.nodes())
        encoder = dict(zip(nodes, list(range(len(nodes)))))
        decoder = dict((v, k) for k, v in encoder.items())

        _potential_graph = nx.relabel_nodes(potential_graph, encoder)
        _targets = list(map(lambda x: encoder[x], targets))
        _root = encoder[root]

        model, edge_variables = self.generate_steiner_model(
            _potential_graph, _root, _targets)

        # solve the ILP problem and return a set of proposed solutions
        proposed_solutions = self.solve_steiner_instance(
            model, edge_variables, _potential_graph, pid, logfile)

        # select best model and post process the solution
        optimal_solution = proposed_solutions[0]
        optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

        optimal_solution = self.post_process_steiner_solution(
            optimal_solution, root)

        # append sample names to the solution and populate the tree
        optimal_solution = (
            self.__append_sample_names_and_remove_spurious_leaves(
                optimal_solution, character_matrix))

        cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

        # rename internal nodes such that they are not tuples
        node_name_generator = solver_utilities.node_name_generator()
        internal_node_rename = {}
        for i in cassiopeia_tree.internal_nodes:
            internal_node_rename[i] = next(node_name_generator)
        cassiopeia_tree.relabel_nodes(internal_node_rename)

        cassiopeia_tree.collapse_unifurcations()

        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)
        logger.removeHandler(handler)
예제 #4
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ) -> None:
        """Solves a tree for the SharedMutationJoiningSolver.

        The solver routine calculates an n x n similarity matrix of all
        pairwise sample similarities based on a provided similarity function on
        the character vectors. The general solver routine proceeds by
        iteratively finding pairs of samples to join together into a "cherry"
        until all samples are joined. At each iterative step, the two samples
        with the most shared character/state mutations are joined. Then, an LCA
        node with a character vector containing only the mutations shared by the
        joined samples is added to the sample pool, and the similarity matrix is
        updated with respect to the new LCA node. The function will update the
        `tree` attribute of the input CassiopeiaTree.

        Args:
            cassiopeia_tree: CassiopeiaTree object to be populated
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to write standard out. Not currently used.
        """

        node_name_generator = solver_utilities.node_name_generator()

        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation
            )

        similarity_map = data_utilities.compute_dissimilarity_map(
            character_matrix.to_numpy(),
            character_matrix.shape[0],
            self.similarity_function,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        similarity_map = scipy.spatial.distance.squareform(similarity_map)

        similarity_map = pd.DataFrame(
            similarity_map,
            index=character_matrix.index,
            columns=character_matrix.index,
        )

        N = similarity_map.shape[0]

        # Numba-ize the similarity function and weights
        nb_weights = numba.typed.Dict.empty(
            numba.types.int64,
            numba.types.DictType(numba.types.int64, numba.types.float64),
        )
        if weights:
            for k, v in weights.items():
                nb_char_weights = numba.typed.Dict.empty(
                    numba.types.int64, numba.types.float64
                )
                for state, prior in v.items():
                    nb_char_weights[state] = prior
                nb_weights[k] = nb_char_weights

        # instantiate a tree where all samples appear as leaves.
        tree = nx.DiGraph()
        tree.add_nodes_from(similarity_map.index)

        while N > 1:

            i, j = self.find_cherry(similarity_map.values)

            # get indices in the similarity matrix to join
            node_i, node_j = (similarity_map.index[i], similarity_map.index[j])

            new_node_name = next(node_name_generator)
            tree.add_node(new_node_name)
            tree.add_edges_from(
                [(new_node_name, node_i), (new_node_name, node_j)]
            )

            similarity_map = self.update_similarity_map_and_character_matrix(
                character_matrix,
                self.nb_similarity_function,
                similarity_map,
                (node_i, node_j),
                new_node_name,
                cassiopeia_tree.missing_state_indicator,
                nb_weights,
            )

            N = similarity_map.shape[0]

        cassiopeia_tree.populate_tree(tree, layer=layer)
        # collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True
            )
예제 #5
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """Implements a solving procedure for the Percolation Algorithm.
        The procedure recursively splits a set of samples to build a tree. At
        each partition of the samples produced by the percolation procedure,
        an ancestral node is created and each side of the partition is placed
        as a daughter clade of that node. This continues until each side of
        the partition is comprised only of single samples. If an algorithm
        cannot produce a split on a set of samples, then those samples are
        placed as sister nodes and the procedure terminates, generating a
        polytomy in the tree. This function will populate a tree inside the
        input CassiopeiaTree.

        Args:
            cassiopeia_tree: CassiopeiaTree storing a character matrix and
                priors.
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to write standard out. Not currently used.
        """

        node_name_generator = solver_utilities.node_name_generator()

        # A helper function that builds the subtree given a set of samples
        def _solve(
            samples: List[Union[str, int]],
            tree: nx.DiGraph,
            unique_character_matrix: pd.DataFrame,
            priors: Dict[int, Dict[int, float]],
            weights: Dict[int, Dict[int, float]],
            missing_state_indicator: int,
        ):

            if len(samples) == 1:
                return samples[0]
            # Partitions the set of samples by percolating a similarity graph
            clades = list(
                self.percolate(
                    unique_character_matrix,
                    samples,
                    priors,
                    weights,
                    missing_state_indicator,
                ))
            # Generates a root for this subtree with a unique int identifier
            root = next(node_name_generator)
            tree.add_node(root)

            for clade in clades:
                if len(clade) == 0:
                    clades.remove(clade)

            # If unable to return a split, generate a polytomy and return
            if len(clades) == 1:
                for clade in clades[0]:
                    tree.add_edge(root, clade)
                return root
            # Recursively generate the subtrees for each daughter clade
            for clade in clades:
                child = _solve(
                    clade,
                    tree,
                    unique_character_matrix,
                    priors,
                    weights,
                    missing_state_indicator,
                )
                tree.add_edge(root, child)
            return root

        weights = None
        priors = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)
            priors = cassiopeia_tree.priors

        # extract character matrix
        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()
        unique_character_matrix = character_matrix.drop_duplicates()

        tree = nx.DiGraph()
        tree.add_nodes_from(list(unique_character_matrix.index))

        _solve(
            list(unique_character_matrix.index),
            tree,
            unique_character_matrix,
            priors,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # Append duplicate samples
        duplicates_tree = self.__add_duplicates_to_tree(
            tree, character_matrix, node_name_generator)
        cassiopeia_tree.populate_tree(duplicates_tree, layer=layer)

        # Collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)