Example #1
# Imports this snippet relies on; helpers such as prune_unique_alleles,
# root_finder, get_edge_length, build_potential_graph_from_base_graph,
# find_neighbors, greedy_build, and the Steiner/ILP utilities come from the
# surrounding Cassiopeia module and are not shown here.
import hashlib

import networkx as nx


def find_good_gurobi_subgraph(
    root,
    targets,
    node_name_dict,
    prior_probabilities,
    time_limit,
    num_threads,
    max_neighborhood_size,
    seed=None,
    num_iter=-1,
    weighted=False,
    n_neighbors=10,
):
    """
	Sub-Function used for multi-threading in hybrid method

	:param root:
		Sub-root of the subgraph that is attempted to be reconstructed
	:param targets:
		List of sub-targets for a given subroot where each node is in the form 'Ch1|Ch2|....|Chn'
	:param prior_probabilities:
		A nested dictionary containing prior probabilities for [character][state] mappings
		where characters are in the form of integers, and states are in the form of strings,
		and values are the probability of mutation from the '0' state.
	:param time_limit:
		Length of time allowed for ILP convergence.
	:param num_threads:
		Number of threads to be used during ILP solving.
	:param max_neighborhood_size:
		Maximum size of potential graph allowed.
	:return:
		Optimal ilp subgraph for a given subset of nodes in the time limit allowed.
	"""

    if weighted:
        assert prior_probabilities is not None

    # deterministic id for this subproblem, derived from the sub-root string
    pid = hashlib.md5(root.encode("utf-8")).hexdigest()

    print(
        "Started new thread for: " + str(root) + " (num targets = " +
        str(len(targets)) + ") , pid = " + str(pid),
        flush=True,
    )

    if len(set(targets)) == 1:
        graph = nx.DiGraph()
        graph.add_node(node_name_dict[root])
        return [graph], root, pid, {}

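    # prune alleles that appear in only one target; they are re-attached by
    # post_process_ILP (via pruned_to_orig), keeping the potential graph small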
    proot, targets_pruned, pruned_to_orig = prune_unique_alleles(root, targets)

    lca = root_finder(targets_pruned)

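    # upper-bound the LCA width of the potential graph: the longest path
    # between two targets through their LCA is d(lca, ti) + d(lca, tj) + 1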
    distances = [get_edge_length(lca, t) for t in targets_pruned]
    widths = [0]
    for i in range(len(distances)):
        for j in range(i + 1, len(distances)):
            widths.append(distances[i] + distances[j] + 1)

    max_lca = max(widths)

    (
        potential_network_priors,
        lca_dist,
        graph_sizes,
    ) = build_potential_graph_from_base_graph(
        targets_pruned,
        proot,
        priors=prior_probabilities,
        max_neighborhood_size=max_neighborhood_size,
        pid=pid,
        weighted=weighted,
        lca_dist=max_lca,
    )

    # network was too large to compute, so just run greedy on it
    if potential_network_priors is None:
        neighbors, distances = find_neighbors(targets, n_neighbors=n_neighbors)
        subgraph = greedy_build(targets,
                                neighbors,
                                distances,
                                priors=prior_probabilities,
                                cell_cutoff=-1)[0]
        subgraph = nx.relabel_nodes(subgraph, node_name_dict)
        print("Max Neighborhood Exceeded", flush=True)
        return [subgraph], root, pid, graph_sizes

    print("Potential Graph built with maximum LCA of " + str(lca_dist) +
          " (pid: " + str(pid) + "). Proceeding to solver.")

    # remove self-loop edges; materialize the list first so the graph is not
    # mutated while it is being iterated
    potential_network_priors.remove_edges_from(
        list(nx.selfloop_edges(potential_network_priors)))

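    # encode nodes as consecutive integers for the ILP; the decoder restores
    # the original character-string labels after solving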
    nodes = list(potential_network_priors.nodes())
    encoder = dict(zip(nodes, list(range(len(nodes)))))
    decoder = dict((v, k) for k, v in encoder.items())

    assert len(encoder) == len(decoder)

    _potential_network = nx.relabel_nodes(potential_network_priors, encoder)
    # materialize as a list; a map object would be exhausted after one pass
    _targets = [encoder[x] for x in targets_pruned]

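    # formulate maximum parsimony as a Steiner-tree ILP rooted at the pruned
    # sub-root, with the pruned targets as terminals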
    model, edge_variables = generate_mSteiner_model(_potential_network,
                                                    encoder[proot], _targets)
    subgraphs = solve_steiner_instance(
        model,
        _potential_network,
        edge_variables,
        MIPGap=0.01,
        detailed_output=False,
        time_limit=time_limit,
        num_threads=num_threads,
        seed=seed,
        num_iter=num_iter,
    )

    all_subgraphs = []
    for subgraph in subgraphs:

        subgraph = nx.relabel_nodes(subgraph, decoder)

        subgraph = post_process_ILP(subgraph, root, pruned_to_orig, proot,
                                    targets, node_name_dict, pid)

        all_subgraphs.append(subgraph)

    r_name = root
    if root in node_name_dict:
        r_name = node_name_dict[root]

    return all_subgraphs, r_name, pid, graph_sizes
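
A minimal usage sketch for the function above (not from the original source): it assumes Gurobi is installed and the surrounding Cassiopeia helpers such as root_finder are importable; the character strings and node names are hypothetical.

targets = ["1|0|0", "1|1|0", "1|0|2", "1|1|2"]   # hypothetical cells
root = root_finder(targets)                      # sub-root = LCA of the targets
node_name_dict = {t: t + "_cell" + str(i) + "_target"
                  for i, t in enumerate(targets)}

subgraphs, r_name, pid, graph_sizes = find_good_gurobi_subgraph(
    root,
    targets,
    node_name_dict,
    prior_probabilities=None,   # no priors -> unweighted parsimony
    time_limit=1800,            # seconds allowed for ILP convergence
    num_threads=1,
    max_neighborhood_size=10000,
)
best = subgraphs[0]             # a networkx DiGraph over relabeled nodes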
Example #2
# Imports this snippet relies on; Node, Cassiopeia_Tree, root_finder,
# find_neighbors, greedy_build, and find_good_gurobi_subgraph come from the
# surrounding Cassiopeia module and are not shown here.
import concurrent.futures
import multiprocessing
import random

import networkx as nx
import numpy as np
from tqdm import tqdm


def solve_lineage_instance(
    _target_nodes,
    prior_probabilities=None,
    method="hybrid",
    threads=8,
    hybrid_cell_cutoff=200,
    hybrid_lca_cutoff=None,
    time_limit=1800,
    max_neighborhood_size=10000,
    seed=None,
    num_iter=-1,
    weighted_ilp=False,
    fuzzy=False,
    probabilistic=False,
    plot_diagnostics=True,
    maximum_alt_solutions=100,
    greedy_minimum_allele_rep=1.0,
    n_neighbors=10,
    missing_data_mode="lookahead",
    lookahead_depth=3,
):
    """
	Aggregated lineage solving method, which given a set of target nodes, will find the maximum parsimony tree
	accounting the given target nodes

	:param target_nodes:
		A list of target nodes, where each node is in the form 'Ch1|Ch2|....|Chn'
	:param prior_probabilities:
		A nested dictionary containing prior probabilities for [character][state] mappings
		where characters are in the form of integers, and states are in the form of strings,
		and values are the probability of mutation from the '0' state.
	:param method:
		The method used for solving the problem ['ilp, 'hybrid', 'greedy']
			- ilp: Attempts to solve the problem based on steiner tree on the potential graph
				   (Recommended for instances with several hundred samples at most)
			- greedy: Runs a greedy algorithm to find the maximum parsimony tree based on choosing the most occurring split in a
				   top down fasion (Algorithm scales to any number of samples)
			- hybrid: Runs the greedy algorithm until there are less than hybrid_subset_cutoff samples left in each leaf of the
				   tree, and then returns a series of small instance ilp is then run on these smaller instances, and the
				   resulting graph is created by merging the smaller instances with the greedy top-down tree
	:param threads:
		The number of threads to use in parallel for the hybrid algorithm
	:param hybrid_subset_cutoff:
		The maximum number of nodes allowed before the greedy algorithm terminates for a given leaf node
	:return:
		A reconstructed subgraph representing the nodes
	"""

    if method == "hybrid":
        assert (hybrid_cell_cutoff is None or hybrid_lca_cutoff is None
                ), "You can only use one type of cutoff in Hybrid"

    target_nodes = [
        n.get_character_string() + "_" + n.name for n in _target_nodes
    ]

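    # map each character string to a unique "<chars>_<name>_target" label so
    # that target leaves can be recognized after relabeling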
    node_name_dict = dict(
        zip(
            [n.split("_")[0] for n in target_nodes],
            [n + "_target" for n in target_nodes],
        ))

    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # strip the name identifier for now; node_name_dict re-attaches it later
    target_nodes = [n.split("_")[0] for n in target_nodes]

    master_root = root_finder(target_nodes)
    if method == "ilp":

        subgraphs, r, pid, graph_sizes = find_good_gurobi_subgraph(
            master_root,
            target_nodes,
            node_name_dict,
            prior_probabilities,
            time_limit,
            1,
            max_neighborhood_size,
            seed=seed,
            num_iter=num_iter,
            weighted=weighted_ilp,
            n_neighbors=n_neighbors,
        )

        subgraph = subgraphs[0]

        rdict = {}
        target_seen = []

        for n in subgraph:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)

            if len(spl) == 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True

            if len(spl) > 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True
                nn.pid = spl[-1]

            if nn.is_target:
                target_seen.append(nn.char_string)

            rdict[n] = nn

        state_tree = nx.relabel_nodes(subgraph, rdict)

        return (
            Cassiopeia_Tree(method="ilp",
                            network=state_tree,
                            name="Cassiopeia_state_tree"),
            graph_sizes,
        )

    if method == "hybrid":

        neighbors, distances = None, None
        if missing_data_mode == "knn":
            print("Computing neighbors for imputing missing values...")
            neighbors, distances = find_neighbors(target_nodes,
                                                  n_neighbors=n_neighbors)

        network, target_sets = greedy_build(
            target_nodes,
            neighbors,
            distances,
            priors=prior_probabilities,
            cell_cutoff=hybrid_cell_cutoff,
            lca_cutoff=hybrid_lca_cutoff,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            minimum_allele_rep=greedy_minimum_allele_rep,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        print(
            "Using " + str(min(multiprocessing.cpu_count(), threads)) +
            " threads, " + str(multiprocessing.cpu_count()) + " available.",
            flush=True,
        )
        executor = concurrent.futures.ProcessPoolExecutor(
            min(multiprocessing.cpu_count(), threads))
        print("Sending off Target Sets: " + str(len(target_sets)), flush=True)

        # just in case you've hit a target node during the greedy reconstruction, append name at this stage
        # so the composition step doesn't get confused when trying to join to the root.
        network = nx.relabel_nodes(network, node_name_dict)

        futures = [
            executor.submit(
                find_good_gurobi_subgraph,
                root,
                targets,
                node_name_dict,
                prior_probabilities,
                time_limit,
                1,
                max_neighborhood_size,
                seed,
                num_iter,
                weighted_ilp,
                n_neighbors,
            ) for root, targets in target_sets
        ]

        concurrent.futures.wait(futures)

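        # keep an unmodified copy of the greedy scaffold; alternative
        # subproblem solutions are later re-composed onto this base network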
        base_network = network.copy()
        base_rdict = {}
        for n in base_network:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)
            if len(spl) > 1:
                nn.pid = spl[1]
            if spl[0] in node_name_dict:
                nn.is_target = True

            base_rdict[n] = nn

        base_network = nx.relabel_nodes(base_network, base_rdict)

        num_solutions = 1  # keep track of number of possible solutions
        potential_graph_sizes = []
        all_res = []
        alt_solutions = {}

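        # collect each subproblem's solutions, tag internal nodes with the
        # subproblem pid to avoid name collisions, and compose the first
        # solution of each subproblem into the master network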
        for future in futures:
            results, r, pid, graph_sizes = future.result()
            potential_graph_sizes.append(graph_sizes)

            subproblem_solutions = []
            for res in results:
                new_names = {}
                for n in res:
                    if res.in_degree(n) == 0 or n == r:
                        new_names[n] = n
                    else:
                        new_names[n] = n + "_" + str(pid)
                res = nx.relabel_nodes(res, new_names)
                subproblem_solutions.append(res)

            num_solutions *= len(subproblem_solutions)
            all_res.append(subproblem_solutions)

            rt = [
                n for n in subproblem_solutions[0]
                if subproblem_solutions[0].in_degree(n) == 0
            ][0]
            alt_solutions[base_rdict[rt]] = subproblem_solutions

            network = nx.compose(network, subproblem_solutions[0])

        rdict = {}
        target_seen = []

        for n in network:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)

            if len(spl) == 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True

            if len(spl) > 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True
                nn.pid = spl[-1]

            if nn.is_target:
                target_seen.append(nn.char_string)

            rdict[n] = nn

        state_tree = nx.relabel_nodes(network, rdict)

        # create alternative solutions
        pbar = tqdm(total=len(alt_solutions.keys()),
                    desc="Enumerating alternative solutions")
        for r in alt_solutions.keys():
            soln_list = []

            for res in alt_solutions[r]:

                rdict = {}
                for n in res:
                    spl = n.split("_")
                    nn = Node(n, spl[0].split("|"), is_target=False)

                    if len(spl) > 2:
                        nn.pid = spl[-1]

                    rdict[n] = nn

                res = nx.relabel_nodes(res, rdict)
                soln_list.append(res)

            alt_solutions[r] = soln_list

            pbar.update(1)  # update progress bar

        pbar.close()

        # alternative solutions are stored per sub-root in alt_solutions; an
        # earlier (disabled) implementation instead enumerated up to
        # maximum_alt_solutions fully composed networks by sampling one
        # subproblem solution per subproblem.
        return (
            Cassiopeia_Tree(
                method="hybrid",
                network=state_tree,
                name="Cassiopeia_state_tree",
                alternative_solutions=alt_solutions,
                base_network=base_network,
            ),
            potential_graph_sizes,
        )

    if method == "greedy":

        neighbors, distances = None, None
        if missing_data_mode == "knn":
            print("Computing neighbors for imputing missing values...")
            neighbors, distances = find_neighbors(target_nodes,
                                                  n_neighbors=n_neighbors)

        graph = greedy_build(
            target_nodes,
            neighbors,
            distances,
            priors=prior_probabilities,
            cell_cutoff=-1,
            lca_cutoff=None,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            minimum_allele_rep=greedy_minimum_allele_rep,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )[0]

        rdict = {}
        for n in graph:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)
            if len(spl) > 1:
                nn.pid = spl[1]
            if spl[0] in node_name_dict and len(spl) == 1:
                nn.is_target = True
            rdict[n] = nn

        state_tree = nx.relabel_nodes(graph, rdict)

        return (
            Cassiopeia_Tree(method="greedy",
                            network=state_tree,
                            name="Cassiopeia_state_tree"),
            None,
        )

    else:
        raise Exception(
            "Please specify one of the following methods: ilp, hybrid, greedy")