예제 #1
0
 def apply_mutation_to_state(x):
     """Translate one column of alleles into character states.

     Every entry of ``x.values`` is looked up in
     ``mutation_to_state[x.name]``. Entries flagged by
     ``is_ambiguous_state`` are translated element by element and returned
     as a tuple; all other entries are translated with a single lookup.

     Returns:
         A list holding the translated state (or tuple of states) for each
         entry, in the same order as ``x.values``.
     """
     return [
         tuple(mutation_to_state[x.name][allele] for allele in entry)
         if is_ambiguous_state(entry)
         else mutation_to_state[x.name][entry]
         for entry in x.values
     ]
예제 #2
0
def get_lca_characters(
    vecs: List[Union[List[int], List[Tuple[int, ...]]]],
    missing_state_indicator: int,
) -> List[int]:
    """Reconstructs the LCA character vector of several character vectors.

    Positions are resolved independently. At each position the distinct
    states seen across all vectors are pooled (ambiguous states contribute
    each of their member states). If exactly one distinct state is seen, it
    is used. Otherwise the missing indicator is ignored, and if exactly one
    state remains that state is used. In every other case the position is
    set to 0.

    Args:
        vecs: Character vectors to summarize; all must have equal length
        missing_state_indicator: The character representing missing values

    Returns:
        A list with one reconstructed character per position

    """
    n_characters = len(vecs[0])
    # All vectors must describe the same number of characters.
    assert all(len(vec) == n_characters for vec in vecs)

    lca_vec = []
    for position in range(n_characters):
        observed = set()
        for vec in vecs:
            state = vec[position]
            if is_ambiguous_state(state):
                observed.update(state)
            else:
                observed.add(state)

        # Missing data only matters when it is the sole state observed;
        # otherwise it is dropped before checking for consensus.
        if len(observed) > 1:
            observed.discard(missing_state_indicator)

        lca_vec.append(next(iter(observed)) if len(observed) == 1 else 0)
    return lca_vec
예제 #3
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """Infers a tree with Cassiopeia-ILP.

        Solves a tree using the Cassiopeia-ILP algorithm and populates a tree
        in the provided CassiopeiaTree.

        A file handler writing to ``logfile`` is attached to the module logger
        for the duration of the call. It is detached and closed on every exit
        path (normal completion, the single-sample early return, and any
        raised exception) so repeated calls do not accumulate handlers.

        Args:
            cassiopeia_tree: Input CassiopeiaTree
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: Location to log progress.

        Raises:
            ILPSolverError: If the solver is weighted but the tree carries no
                priors, or if the character matrix contains ambiguous states.
        """

        if self.weighted and not cassiopeia_tree.priors:
            raise ILPSolverError(
                "Specify prior probabilities in the CassiopeiaTree for weighted"
                " analysis.")

        # setup logfile config
        handler = logging.FileHandler(logfile)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)

        # Everything after attaching the handler runs under try/finally so the
        # handler is always detached and its file closed, even on early return
        # or error.
        try:
            logger.info("Solving tree with the following parameters.")
            logger.info(
                f"Convergence time limit: {self.convergence_time_limit}")
            logger.info(
                f"Convergence iteration limit: {self.convergence_iteration_limit}"
            )
            logger.info(
                f"Max potential graph layer size: {self.maximum_potential_graph_layer_size}"
            )
            logger.info(
                f"Max potential graph lca distance: {self.maximum_potential_graph_lca_distance}"
            )
            logger.info(f"MIP gap: {self.mip_gap}")

            if layer:
                character_matrix = cassiopeia_tree.layers[layer].copy()
            else:
                character_matrix = cassiopeia_tree.character_matrix.copy()
            if any(
                    is_ambiguous_state(state)
                    for state in character_matrix.values.flatten()):
                raise ILPSolverError(
                    "Solver does not support ambiguous states.")

            unique_character_matrix = character_matrix.drop_duplicates()

            weights = None
            if cassiopeia_tree.priors:
                weights = solver_utilities.transform_priors(
                    cassiopeia_tree.priors, self.prior_transformation)

            # find the root of the tree & generate process ID
            root = tuple(
                data_utilities.get_lca_characters(
                    unique_character_matrix.values.tolist(),
                    cassiopeia_tree.missing_state_indicator,
                ))

            logger.info(f"Phylogenetic root: {root}")

            # the process ID is a hash of the root's character states
            pid = hashlib.md5("|".join([str(r) for r in root
                                        ]).encode("utf-8")).hexdigest()

            targets = [
                tuple(t) for t in unique_character_matrix.values.tolist()
            ]

            # a single unique sample needs no optimization: the tree is just
            # the root with the sample names attached
            if unique_character_matrix.shape[0] == 1:
                optimal_solution = nx.DiGraph()
                optimal_solution.add_node(root)
                optimal_solution = (
                    self.__append_sample_names_and_remove_spurious_leaves(
                        optimal_solution, character_matrix))
                cassiopeia_tree.populate_tree(optimal_solution, layer=layer)
                return

            # determine diameter of the dataset by evaluating maximum distance
            # to the root from each sample
            if (self.maximum_potential_graph_lca_distance is not None) and (
                    self.maximum_potential_graph_lca_distance > 0):
                max_lca_distance = self.maximum_potential_graph_lca_distance

            else:
                max_lca_distance = 0
                lca_distances = [
                    dissimilarity_functions.hamming_distance(
                        root,
                        np.array(u),
                        ignore_missing_state=True,
                        missing_state_indicator=cassiopeia_tree.
                        missing_state_indicator,
                    ) for u in targets
                ]

                for (i, j) in itertools.combinations(
                        range(len(lca_distances)), 2):
                    max_lca_distance = max(
                        max_lca_distance,
                        lca_distances[i] + lca_distances[j] + 1)

            # infer the potential graph
            potential_graph = self.infer_potential_graph(
                unique_character_matrix,
                pid,
                max_lca_distance,
                weights,
                cassiopeia_tree.missing_state_indicator,
            )

            # generate Steiner Tree ILP model; nodes are relabeled to integer
            # ids for the model and mapped back with ``decoder`` afterwards
            nodes = list(potential_graph.nodes())
            encoder = dict(zip(nodes, list(range(len(nodes)))))
            decoder = dict((v, k) for k, v in encoder.items())

            _potential_graph = nx.relabel_nodes(potential_graph, encoder)
            _targets = list(map(lambda x: encoder[x], targets))
            _root = encoder[root]

            model, edge_variables = self.generate_steiner_model(
                _potential_graph, _root, _targets)

            # solve the ILP problem and return a set of proposed solutions
            proposed_solutions = self.solve_steiner_instance(
                model, edge_variables, _potential_graph, pid, logfile)

            # select best model and post process the solution
            optimal_solution = proposed_solutions[0]
            optimal_solution = nx.relabel_nodes(optimal_solution, decoder)

            optimal_solution = self.post_process_steiner_solution(
                optimal_solution, root)

            # append sample names to the solution and populate the tree
            optimal_solution = (
                self.__append_sample_names_and_remove_spurious_leaves(
                    optimal_solution, character_matrix))

            cassiopeia_tree.populate_tree(optimal_solution, layer=layer)

            # rename internal nodes such that they are not tuples
            node_name_generator = solver_utilities.node_name_generator()
            internal_node_rename = {}
            for i in cassiopeia_tree.internal_nodes:
                internal_node_rename[i] = next(node_name_generator)
            cassiopeia_tree.relabel_nodes(internal_node_rename)

            cassiopeia_tree.collapse_unifurcations()

            # collapse mutationless edges
            if collapse_mutationless_edges:
                cassiopeia_tree.collapse_mutationless_edges(
                    infer_ancestral_characters=True)
        finally:
            logger.removeHandler(handler)
            handler.close()
예제 #4
0
def convert_lineage_profile_to_character_matrix(
    lineage_profile: pd.DataFrame,
    indel_priors: Optional[pd.DataFrame] = None,
    missing_allele_indicator: Optional[str] = None,
    missing_state_indicator: int = -1,
) -> Tuple[pd.DataFrame, Dict[int, Dict[int, float]], Dict[int, Dict[int,
                                                                     str]]]:
    """Converts a lineage profile to a character matrix.

    Takes in a lineage profile summarizing the explicit indel identities
    observed at each cut site in a cell and converts this into a character
    matrix where the indels are abstracted into integers.

    Within each column, alleles equal to "Missing" or "NC" map to
    ``missing_state_indicator``, alleles containing "none" (case-insensitive)
    map to 0, and every other allele receives the next unused positive
    integer for that column in order of first appearance. NaN entries and
    entries equal to ``missing_allele_indicator`` are treated as "Missing".

    Note:
        The lineage profile is converted directly into a character matrix,
        without performing any collapsing of duplicate states. Instead, this
        should have been done in the previous step, when calling
        :func:`convert_alleletable_to_lineage_profile`.

    Args:
        lineage_profile: Lineage profile
        indel_priors: Dataframe mapping indels to prior probabilities; the
            mean of its "freq" entry for an indel is used as that indel's
            prior
        missing_allele_indicator: An allele that is being used to represent
            missing data.
        missing_state_indicator: State to indicate missing data

    Returns:
        A character matrix, prior probability dictionary, and mapping from
            character/state pairs to indel identities.
    """

    prior_probs = defaultdict(dict)
    indel_to_charstate = defaultdict(dict)

    # fillna/replace return new frames, so the caller's frame is never mutated
    lineage_profile = lineage_profile.fillna("Missing")
    if missing_allele_indicator:
        lineage_profile = lineage_profile.replace(
            {missing_allele_indicator: "Missing"})

    lineage_profile.columns = [
        f"r{i}" for i in range(lineage_profile.shape[1])
    ]

    mutation_to_state = defaultdict(dict)

    for c, col in enumerate(lineage_profile.columns):
        # every character gets an entry, even if it has no mutated states
        indel_to_charstate[c] = {}
        next_state = 1

        for indels in lineage_profile[col].factorize()[1].values:
            # treat unambiguous alleles as a one-element collection so both
            # cases share the same loop
            if not is_ambiguous_state(indels):
                indels = (indels, )

            for indel in indels:
                if indel == "Missing" or indel == "NC":
                    # honor the caller-supplied indicator instead of a
                    # hard-coded -1
                    mutation_to_state[col][indel] = missing_state_indicator

                elif "none" in indel.lower():
                    mutation_to_state[col][indel] = 0

                elif indel not in mutation_to_state[col]:
                    state = next_state
                    next_state += 1

                    mutation_to_state[col][indel] = state
                    indel_to_charstate[c][state] = indel

                    if indel_priors is not None:
                        prob = np.mean(indel_priors.loc[indel]["freq"])
                        prior_probs[c][state] = float(prob)

    # Helper function to apply to lineage profile
    def apply_mutation_to_state(x):
        column = []
        for v in x.values:
            if is_ambiguous_state(v):
                column.append(tuple(mutation_to_state[x.name][_v] for _v in v))
            else:
                column.append(mutation_to_state[x.name][v])
        return column

    character_matrix = lineage_profile.apply(apply_mutation_to_state, axis=0)

    character_matrix.index = lineage_profile.index
    character_matrix.columns = [
        f"r{i}" for i in range(lineage_profile.shape[1])
    ]

    return character_matrix, prior_probs, indel_to_charstate
예제 #5
0
    def solve(
        self,
        cassiopeia_tree: CassiopeiaTree,
        layer: Optional[str] = None,
        collapse_mutationless_edges: bool = False,
        logfile: str = "stdout.log",
    ):
        """Implements a top-down greedy solving procedure.

        The procedure recursively splits a set of samples to build a tree. At
        each partition of the samples, an ancestral node is created and each
        side of the partition is placed as a daughter clade of that node. This
        continues until each side of the partition is comprised only of single
        samples. If an algorithm cannot produce a split on a set of samples,
        then those samples are placed as sister nodes and the procedure
        terminates, generating a polytomy in the tree. This function will
        populate a tree inside the input CassiopeiaTree.

        Args:
            cassiopeia_tree: CassiopeiaTree storing a character matrix and
                priors.
            layer: Layer storing the character matrix for solving. If None, the
                default character matrix is used in the CassiopeiaTree.
            collapse_mutationless_edges: Indicates if the final reconstructed
                tree should collapse mutationless edges based on internal states
                inferred by Camin-Sokal parsimony. In scoring accuracy, this
                removes artifacts caused by arbitrarily resolving polytomies.
            logfile: File location to log output. Not currently used.

        Raises:
            GreedySolverError: If the character matrix contains ambiguous
                states.
        """

        # A helper function that builds the subtree given a set of samples
        def _solve(
            samples: List[Union[str, int]],
            tree: nx.DiGraph,
            unique_character_matrix: pd.DataFrame,
            weights: Dict[int, Dict[int, float]],
            missing_state_indicator: int,
        ):
            if len(samples) == 1:
                return samples[0]
            # Finds the best partition of the set given the split criteria
            clades = self.perform_split(
                unique_character_matrix,
                samples,
                weights,
                missing_state_indicator,
            )
            # Drop empty sides of the partition. Build a new list rather than
            # removing while iterating, which would skip the element following
            # each removal and could leave empty clades behind.
            clades = [clade for clade in clades if len(clade) > 0]

            # Generates a root for this subtree with a unique int identifier
            root = next(node_name_generator)
            tree.add_node(root)

            # If unable to return a split, generate a polytomy and return
            if len(clades) == 1:
                for clade in clades[0]:
                    tree.add_edge(root, clade)
                return root
            # Recursively generate the subtrees for each daughter clade
            for clade in clades:
                child = _solve(
                    clade,
                    tree,
                    unique_character_matrix,
                    weights,
                    missing_state_indicator,
                )
                tree.add_edge(root, child)
            return root

        node_name_generator = solver_utilities.node_name_generator()

        weights = None
        if cassiopeia_tree.priors:
            weights = solver_utilities.transform_priors(
                cassiopeia_tree.priors, self.prior_transformation)

        # extract character matrix
        if layer:
            character_matrix = cassiopeia_tree.layers[layer].copy()
        else:
            character_matrix = cassiopeia_tree.character_matrix.copy()

        # Raise exception if the character matrix has ambiguous states.
        if any(
                is_ambiguous_state(state)
                for state in character_matrix.values.flatten()):
            raise GreedySolverError(
                "Solver does not support ambiguous states.")

        # Solve on unique rows only; duplicates are re-attached afterwards
        unique_character_matrix = character_matrix.drop_duplicates()

        tree = nx.DiGraph()
        tree.add_nodes_from(list(unique_character_matrix.index))

        _solve(
            list(unique_character_matrix.index),
            tree,
            unique_character_matrix,
            weights,
            cassiopeia_tree.missing_state_indicator,
        )

        # Append duplicate samples
        duplicates_tree = self.__add_duplicates_to_tree(
            tree, character_matrix, node_name_generator)
        cassiopeia_tree.populate_tree(duplicates_tree, layer=layer)

        # Collapse mutationless edges
        if collapse_mutationless_edges:
            cassiopeia_tree.collapse_mutationless_edges(
                infer_ancestral_characters=True)