def add_sampled_network(self):
    # Create networkx DiGraph to represent true_tree
    tree = nx.DiGraph()

    cell_record = self.get_cell_record()
    keep_labels = self.get_node_labels()
    parent_ix_levels = self.get_parent_child_map()

    # Create nodes representing the leaves
    record = cell_record[0]
    prev_level = [
        Node(label, format_char_vec(record[i]))
        for i, label in enumerate(keep_labels[0])
    ]

    for level, mapping in enumerate(parent_ix_levels):
        # Construct edges from this level to the next level
        level_labels = keep_labels[level + 1]
        record = cell_record[level + 1]

        current_level = []
        for child in mapping:
            # Create a child node
            child_node = Node(level_labels[child], format_char_vec(record[child]))
            parent = prev_level[mapping[child]]

            tree.add_edges_from([(parent, child_node)])
            current_level.append(child_node)

        # Current level finished adding to tree, move on to lower level
        prev_level = current_level

    self.true_network = tree
def extend_dummy_branches(G, max_depth):
    """
    Converts the tree to an ultrametric tree by adding in dummy nodes and
    branches & extending true leaves to the max depth.

    :param G: Input tree
    :param max_depth: Depth to extend leaves to.
    :returns: Ultrametric tree with dummy edges/nodes.
    """

    leaves = [n for n in G.nodes if G.out_degree(n) == 0]

    for n in leaves:
        new_node_iter = 1
        while G.nodes[n]["depth"] < max_depth:
            d = G.nodes[n]["depth"]

            new_node = Node('state-node', n.get_character_vec())

            parents = list(G.predecessors(n))
            for p in parents:
                G.remove_edge(p, n)
                G.add_edge(p, new_node)
            G.add_edge(new_node, n)

            G.nodes[new_node]["depth"] = d
            G.nodes[n]["depth"] = d + 1

            new_node_iter += 1

    return G
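# A minimal, self-contained sketch of the splice pattern used by
# extend_dummy_branches: each leaf shallower than max_depth gets a chain of
# dummy ancestors inserted between it and its parent until all leaves sit at
# the same depth. `ToyNode` is a hypothetical stand-in for cassiopeia's Node,
# used only so this example runs without the package installed.
import networkx as nx


class ToyNode:
    def __init__(self, name, char_vec):
        self.name = name
        self.char_vec = char_vec

    def get_character_vec(self):
        return self.char_vec


def _demo_extend_dummy_branches():
    G = nx.DiGraph()
    root = ToyNode("root", [0, 0])
    mid = ToyNode("mid", [1, 0])
    a = ToyNode("a", [1, 0])   # leaf at depth 1, needs extending
    b = ToyNode("b", [1, 2])   # leaf already at max depth
    G.add_edges_from([(root, a), (root, mid), (mid, b)])
    G.nodes[root]["depth"] = 0
    G.nodes[mid]["depth"] = 1
    G.nodes[a]["depth"] = 1
    G.nodes[b]["depth"] = 2

    max_depth = 2
    for leaf in [n for n in G.nodes if G.out_degree(n) == 0]:
        while G.nodes[leaf]["depth"] < max_depth:
            d = G.nodes[leaf]["depth"]
            dummy = ToyNode("state-node", leaf.get_character_vec())
            # splice the dummy node between the leaf and its parent(s)
            for p in list(G.predecessors(leaf)):
                G.remove_edge(p, leaf)
                G.add_edge(p, dummy)
            G.add_edge(dummy, leaf)
            G.nodes[dummy]["depth"] = d
            G.nodes[leaf]["depth"] = d + 1

    # every leaf now sits at depth == max_depth
    assert all(G.nodes[n]["depth"] == max_depth
               for n in G.nodes if G.out_degree(n) == 0)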
def cassiopeia_reconstruct(simulation):
    print('Reconstructing Cassiopeia Tree')

    priors = None
    character_matrix = simulation.get_final_cells()

    # Cassiopeia takes a string dataframe
    cm = character_matrix.replace(np.nan, -1)
    cm = cm.astype(int).astype(str).replace('-1', '-')
    cm_uniq = cm.drop_duplicates(inplace=False)

    target_nodes = cm_uniq.values.tolist()
    target_nodes = list(map(lambda x, n: Node(n, x), target_nodes, cm_uniq.index))

    t = time.time()
    reconstructed_network_greedy = solve_lineage_instance(
        target_nodes, method="greedy", prior_probabilities=priors)
    cass_time = time.time() - t

    cass_network = reconstructed_network_greedy[0]

    true_tree = simulation.get_cleaned_tree()
    cass_tree, duplicates = utilities.convert_nx_to_tree(cass_network.network)
    our_score = utilities.triplets_correct(true_tree, cass_tree)
    print('Cassiopeia:', our_score)

    return cass_tree, {'our_score': our_score}
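# Small illustration (with made-up data) of the character-matrix formatting that
# cassiopeia_reconstruct performs before building target nodes: missing values
# become '-', every other entry becomes a string state, and duplicate rows are
# dropped so only unique character strings are passed to the solver.
import numpy as np
import pandas as pd

character_matrix = pd.DataFrame(
    [[1, 0, np.nan], [1, 0, np.nan], [2, 1, 0]],
    index=["cellA", "cellB", "cellC"],
    columns=["r1", "r2", "r3"],
)

cm = character_matrix.replace(np.nan, -1)
cm = cm.astype(int).astype(str).replace('-1', '-')
cm_uniq = cm.drop_duplicates(inplace=False)

# cellA and cellB have identical rows ('1', '0', '-'), so only one remains.
print(cm_uniq)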
def convert_tree_to_nx(tree):
    """
    Convert a binary tree, represented as an skbio TreeNode, to a networkx DiGraph.
    """
    from cassiopeia.TreeSolver.Node import Node

    network = nx.DiGraph()

    level_nodes = [tree]
    level_nx = [Node(x.name, is_target=False) for x in level_nodes]
    level = 0
    stop = False

    while not stop:
        successor_nodes = []
        successor_nx = []

        for i, tree_node in enumerate(level_nodes):
            # Retrieve the Cassiopeia node created when this skbio node was
            # visited as a child on the previous level
            node = level_nx[i]

            for child_node in tree_node.children:
                # Create a Cassiopeia Node for each child and add it to the DiGraph.
                # If the child is a leaf, then we need to add a character vector
                if child_node.is_tip():
                    child = Node(
                        child_node.name,
                        character_vec=child_node.get_character_matrix()
                        .replace(-1, '-')
                        .values.reshape(-1)
                        .tolist(),
                        is_target=True,
                    )
                else:
                    child = Node(child_node.name, is_target=False)

                network.add_edges_from([(node, child)])
                successor_nodes.append(child_node)
                successor_nx.append(child)

        # Now the successor level is the current level for the next iteration
        level_nodes = successor_nodes
        level_nx = successor_nx

        if len(level_nodes) == 0:
            stop = True

    return network
def add_redundant_leaves(G, cm):
    """
    To fairly take into account sample purity, we'll add back in 'redundant' leaves
    (i.e. leaves that were removed because of non-unique character strings).

    :param G: Input graph
    :param cm: Character matrix pandas Dataframe
    :return: Graph with redundant samples added back on.
    """

    # create lookup value for duplicates
    if 'lookup' not in cm.columns:
        cm["lookup"] = cm.astype('str').apply('|'.join, axis=1)

    net_nodes = np.intersect1d(cm.index, [n.name for n in G])
    uniq = cm.loc[net_nodes]

    if uniq.shape == cm.shape:
        return G

    # find all non-unique character states in cm
    # nonuniq = np.setdiff1d(cm.index, np.array(uniq))
    nonuniq = np.setdiff1d(cm.index, uniq.index)

    for n in nonuniq:
        new_node = str(n)

        try:
            _leaf = uniq.index[uniq["lookup"] == cm.loc[n]["lookup"]][0]

            new_node = Node(str(n), cm.loc[n].values, is_target=True)

            parents = list(G.predecessors(_leaf))
            for p in parents:
                G.add_edge(p, new_node)

        except:
            continue

    return G
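# The '|'-joined "lookup" column is how add_redundant_leaves (and
# assign_samples_to_charstrings below) detect cells whose character states are
# identical. A tiny example with made-up data:
import pandas as pd

cm = pd.DataFrame(
    [[1, 0, 2], [1, 0, 2], [0, 3, 2]],
    index=["cellA", "cellB", "cellC"],
)
cm["lookup"] = cm.astype('str').apply('|'.join, axis=1)

# cellA and cellB collapse to the same key, so cellB is "redundant" and would
# be re-attached next to cellA's leaf in the reconstructed tree.
print(cm["lookup"].tolist())   # ['1|0|2', '1|0|2', '0|3|2']
duplicated = cm.index[cm["lookup"].duplicated()]
print(list(duplicated))        # ['cellB']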
def assign_samples_to_charstrings(G, cm):
    """
    Preprocessing step if sample names are not in the tree. Assigns sample names to
    the appropriate character states in the phylogeny.

    :param G: Input graph.
    :param cm: Character matrix pandas Dataframe.
    :return: Networkx Graph object as a tree with samples mapped onto the tree.
    """

    new_nodes = []
    new_edges = []
    nodes_to_remove = []

    root = [n for n in G if G.in_degree(n) == 0][0]

    if 'lookup' not in cm.columns:
        cm["lookup"] = cm.astype(str).apply(lambda x: "|".join(x), axis=1)

    for n in G:
        if n.get_character_string() in cm['lookup'].values and n.is_target:
            n.is_target = False

            sub_cm = cm.loc[cm["lookup"] == n.get_character_string()]
            _nodes = sub_cm.apply(
                lambda x: Node(x.name, x.values[:-1], is_target=True), axis=1
            )  # make sure to do up to [:-1] b/c you don't want the lookup in your character vec

            if len(_nodes) == 0:
                continue

            for new_node in _nodes:
                new_nodes.append(new_node)
                new_edges.append((n, new_node))

    G.add_nodes_from(new_nodes)
    G.add_edges_from(new_edges)

    return G
def main():
    """
    Takes in a pickled true network, an algorithm, and an optional output file,
    reconstructs a tree, and writes the result as a pickled Cassiopeia_Tree.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="pickled true network (Cassiopeia_Tree or networkx DiGraph)")
    parser.add_argument("-nj", "--neighbor-joining", action="store_true", default=False)
    parser.add_argument("--neighbor_joining_weighted", action="store_true", default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff", type=int, default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help="Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit", type=int, default=-1, help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal", "-cs", action="store_true", default=False)
    parser.add_argument("--verbose", action="store_true", default=False, help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp", type=str, default=None, help="optional output file")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None

    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads

    max_neighborhood_size = args.max_neighborhood_size

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    score_triplets = not args.no_triplets

    prior_probs = None
    if args.mutation_map != "":
        prior_probs = pic.load(open(args.mutation_map, "rb"))

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    target_nodes_uniq = []
    seen_charstrings = []
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.append(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " + str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq, method="greedy", prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        pic.dump(net, open(outfp, "wb"))

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " + str(len(target_nodes_uniq)) + " Cells")
            # report whichever cutoff is active (cell count or LCA distance)
            cutoff = cell_cutoff if cell_cutoff is not None else lca_cutoff
            print("Parameters: ILP on sets of " + str(cutoff) + " cells, " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )
        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        pic.dump(net, open(outfp, "wb"))

    elif args.ilp:

        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP allowed " + str(time_limit) + "s to complete optimization")

        # Note: the pure ILP method takes no hybrid cutoff, so none is passed here.
        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]
        # reconstructed_network_ilp = nx.relabel_nodes(reconstructed_network_ilp, string_to_sample)
        if outfp is None:
            outfp = name.replace("true", "ilp")
        pic.dump(net, open(outfp, "wb"))

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " + str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile + " --relaxed"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        aln = AlignIO.read(infile, "phylip-relaxed")
        aln = unique_alignments(aln)

        t0 = time.time()
        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)
        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # convert labels to characters for writing to file
        i = 0
        rndict = {}
        for n in nj_net:
            if n.name is None:
                rndict[n] = Node("state-node", [])
                # n.name = "internal" + str(i)
                # i += 1
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)

        # convert labels to strings, not Bio.Phylo.Clade objects
        # c2str = map(lambda x: x.name, list(nj_net.nodes()))
        # c2strdict = dict(zip(list(nj_net.nodes()), c2str))
        # nj_net = nx.relabel_nodes(nj_net, c2strdict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)
        cm_lookup = dict(
            zip(
                list(cm.apply(lambda x: "|".join([str(k) for k in x.values]), axis=1)),
                cm.index.values,
            ))

        nj_net = fill_in_tree(nj_net, cm)
        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)

        if outfp is None:
            outfp = name.replace("true", "nj")
        pic.dump(nj_net, open(outfp, "wb"))
        # Phylo.write(tree, out, 'newick')

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids

        dm = sp.spatial.distance.squareform(dm)
        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = dict(
            zip(
                list(cm_uniq.apply(lambda x: "|".join([str(k) for k in x.values]), axis=1)),
                cm_uniq.index.values,
            ))

        rdict = {}
        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True
            else:
                n.is_target = False

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)

        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        pic.dump(nj_net, open(outfp, "wb"))

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        samples_to_cells = {}
        indices = []
        for i, n in zip(range(len(target_nodes_uniq)), target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"

        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        pi = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(pi.pid, 0)

        weights = construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"

        # run phylip mix with camin-sokal
        responses = "." + stem + ".temp.txt"
        FH = open(responses, "w")
        current_dir = os.getcwd()
        FH.write(infile + "\n")
        FH.write("F\n" + outfile + "\n")
        FH.write("P\n")
        FH.write("W\n")
        FH.write("Y\n")
        FH.write(weights_fn + "\n")
        FH.write("F\n" + outtree + "\n")
        FH.close()

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        FH = open(responses, "w")
        FH.write(outtree + "\n")
        FH.write("F\n" + consense_outfile + "\n")
        FH.write("Y\n")
        FH.write("F\n" + consense_outtree + "\n")
        FH.close()

        if verbose:
            print("Computing Consensus Tree, elapsed time: " + str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        p2 = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p2.pid, 0)

        newick_str = ""
        with open(consense_outtree, "r") as f:
            for l in f:
                l = l.strip()
                newick_str += l

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)
        cs_net = tree_collapse2(cs_net)

        cm_lookup = dict(
            zip(
                list(cm.apply(lambda x: "|".join([str(k) for k in x.values]), axis=1)),
                cm.index.values,
            ))

        for n in cs_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)

        if outfp is None:
            outfp = name.replace("true", "cs")
        pic.dump(cs_net, open(outfp, "wb"))

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    else:
        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )
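# Hypothetical command line for the CLI above (the script filename is an
# assumption; the flags are the ones defined by its argparse block). This would
# reconstruct a hybrid tree from a pickled true network, switching to ILP on
# subproblems of at most 80 cells:
#
#   python reconstruct_tree.py lg1_true_network.pkl --hybrid --cutoff 80 \
#       --time_limit 1800 --num_threads 4 --verbose
#
# The reconstructed Cassiopeia_Tree is pickled to a file derived from the input
# name ("true" replaced by "hybrid") unless --out_fp is supplied.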
def main(): """ Takes in a character matrix, an algorithm, and an output file and returns a tree in newick format. """ parser = argparse.ArgumentParser() parser.add_argument("char_fp", type=str, help="character_matrix") parser.add_argument("out_fp", type=str, help="output file name") parser.add_argument("-nj", "--neighbor-joining", action="store_true", default=False) parser.add_argument("--neighbor_joining_weighted", action="store_true", default=False) parser.add_argument("--ilp", action="store_true", default=False) parser.add_argument("--hybrid", action="store_true", default=False) parser.add_argument("--cutoff", type=int, default=80, help="Cutoff for ILP during Hybrid algorithm") parser.add_argument( "--hybrid_lca_mode", action="store_true", help= "Use LCA distances to transition in hybrid mode, instead of number of cells", ) parser.add_argument("--time_limit", type=int, default=1500, help="Time limit for ILP convergence") parser.add_argument("--greedy", "-g", action="store_true", default=False) parser.add_argument("--camin-sokal", "-cs", action="store_true", default=False) parser.add_argument("--verbose", action="store_true", default=False, help="output verbosity") parser.add_argument("--mutation_map", type=str, default="") parser.add_argument("--num_threads", type=int, default=1) parser.add_argument("--max_neighborhood_size", type=int, default=10000) parser.add_argument("--weighted_ilp", "-w", action="store_true", default=False) parser.add_argument("--greedy_min_allele_rep", type=float, default=1.0) parser.add_argument("--fuzzy_greedy", action="store_true", default=False) parser.add_argument("--multinomial_greedy", action="store_true", default=False) parser.add_argument("--num_neighbors", default=10) parser.add_argument("--num_alternative_solutions", default=100, type=int) parser.add_argument("--greedy_missing_data_mode", default="lookahead", type=str) parser.add_argument("--greedy_lookahead_depth", default=3, type=int) args = parser.parse_args() char_fp = args.char_fp out_fp = args.out_fp verbose = args.verbose lca_mode = args.hybrid_lca_mode if lca_mode: lca_cutoff = args.cutoff cell_cutoff = None else: cell_cutoff = args.cutoff lca_cutoff = None time_limit = args.time_limit num_threads = args.num_threads n_neighbors = args.num_neighbors num_alt_soln = args.num_alternative_solutions max_neighborhood_size = args.max_neighborhood_size missing_data_mode = args.greedy_missing_data_mode lookahead_depth = args.greedy_lookahead_depth if missing_data_mode not in ["knn", "lookahead", "avg", "modified_avg"]: raise Exception("Greedy missing data mode not recognized") stem = "".join(char_fp.split(".")[:-1]) cm = pd.read_csv(char_fp, sep="\t", index_col=0, dtype=str) cm_uniq = cm.drop_duplicates(inplace=False) cm_lookup = list(cm.apply(lambda x: "|".join(x.values), axis=1)) newick = "" prior_probs = None if args.mutation_map != "": prior_probs = read_mutation_map(args.mutation_map) weighted_ilp = args.weighted_ilp if prior_probs is None and weighted_ilp: raise Exception( "If you'd like to use weighted ILP reconstructions, you need to provide a mutation map (i.e. 
prior probabilities)" ) greedy_min_allele_rep = args.greedy_min_allele_rep fuzzy = args.fuzzy_greedy probabilistic = args.multinomial_greedy if args.greedy: target_nodes = list( cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1)) if verbose: print("Read in " + str(cm.shape[0]) + " Cells") print("Running Greedy Algorithm on " + str(len(target_nodes)) + " Unique States") reconstructed_network_greedy, potential_graph_sizes = solve_lineage_instance( target_nodes, method="greedy", prior_probabilities=prior_probs, greedy_minimum_allele_rep=greedy_min_allele_rep, fuzzy=fuzzy, probabilistic=probabilistic, n_neighbors=n_neighbors, missing_data_mode=missing_data_mode, lookahead_depth=lookahead_depth, ) net = reconstructed_network_greedy.get_network() out_stem = "".join(out_fp.split(".")[:-1]) pic.dump(reconstructed_network_greedy, open(out_stem + ".pkl", "wb")) newick = reconstructed_network_greedy.get_newick() with open(out_fp, "w") as f: f.write(newick) root = [n for n in net if net.in_degree(n) == 0][0] # score parsimony score = 0 for e in nx.dfs_edges(net, source=root): score += e[0].get_mut_length(e[1]) print("Parsimony: " + str(score)) elif args.hybrid: target_nodes = list( cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1)) if verbose: print("Running Hybrid Algorithm on " + str(len(target_nodes)) + " Cells") if lca_mode: print( "Parameters: ILP on sets of cells with a maximum LCA distance of " + str(lca_cutoff) + " with " + str(time_limit) + "s to complete optimization") else: print("Parameters: ILP on sets of " + str(cell_cutoff) + " cells with " + str(time_limit) + "s to complete optimization") # string_to_sample = dict(zip(target_nodes, cm_uniq.index)) # target_nodes = list(map(lambda x, n: x + "_" + n, target_nodes, cm_uniq.index)) print("running algorithm...") reconstructed_network_hybrid, potential_graph_sizes = solve_lineage_instance( target_nodes, method="hybrid", hybrid_cell_cutoff=cell_cutoff, hybrid_lca_cutoff=lca_cutoff, prior_probabilities=prior_probs, time_limit=time_limit, threads=num_threads, max_neighborhood_size=max_neighborhood_size, weighted_ilp=weighted_ilp, greedy_minimum_allele_rep=greedy_min_allele_rep, fuzzy=fuzzy, probabilistic=probabilistic, n_neighbors=n_neighbors, maximum_alt_solutions=num_alt_soln, missing_data_mode=missing_data_mode, lookahead_depth=lookahead_depth, ) net = reconstructed_network_hybrid.get_network() if verbose: print("Writing the tree to output...") out_stem = "".join(out_fp.split(".")[:-1]) pic.dump(reconstructed_network_hybrid, open(out_stem + ".pkl", "wb")) newick = reconstructed_network_hybrid.get_newick() with open(out_fp, "w") as f: f.write(newick) ## plot out diagnostic potential graph sizes h = plt.figure(figsize=(10, 10)) for i in range(len(potential_graph_sizes)): try: x, y = ( [k for k in potential_graph_sizes[i].keys()], [ potential_graph_sizes[i][k] for k in potential_graph_sizes[i].keys() ], ) plt.plot(x, y) except: continue # plt.xlim(0, int(cutoff)) plt.xlabel("LCA Distance") plt.ylabel("Size of Potential Graph") plt.savefig(out_stem + "_potentialgraphsizes.pdf") # score parsimony score = 0 for e in net.edges(): score += e[0].get_mut_length(e[1]) print("Parsimony: " + str(score)) elif args.ilp: target_nodes = list( cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1)) if verbose: print("Running ILP Algorithm on " + str(len(target_nodes)) + " Unique Cells") print("Paramters: ILP allowed " + str(time_limit) + "s to complete optimization") reconstructed_network_ilp, potential_graph_sizes = solve_lineage_instance( 
target_nodes, method="ilp", prior_probabilities=prior_probs, time_limit=time_limit, max_neighborhood_size=max_neighborhood_size, weighted_ilp=weighted_ilp, maximum_alt_solutions=num_alt_soln, ) net = reconstructed_network_ilp.get_network() root = [n for n in net if net.in_degree(n) == 0][0] # score parsimony score = 0 for e in nx.dfs_edges(net, source=root): score += e[0].get_mut_length(e[1]) print("Parsimony: " + str(score)) newick = reconstructed_network_ilp.get_newick() if verbose: print("Writing the tree to output...") out_stem = "".join(out_fp.split(".")[:-1]) pic.dump(reconstructed_network_ilp, open(out_stem + ".pkl", "wb")) with open(out_fp, "w") as f: f.write(newick) h = plt.figure(figsize=(10, 10)) for i in range(len(potential_graph_sizes)): try: x, y = ( [k for k in potential_graph_sizes[i].keys()], [ potential_graph_sizes[i][k] for k in potential_graph_sizes[i].keys() ], ) plt.plot(x, y) except: continue # plt.xlim(0, int(cutoff)) plt.xlabel("LCA Distance") plt.ylabel("Size of Potential Graph") plt.savefig(out_stem + "_potentialgraphsizes.pdf") elif args.neighbor_joining: out_stem = "".join(out_fp.split(".")[:-1]) ret_tree = run_nj_naive(cm_uniq, stem, verbose) pic.dump(ret_tree, open(out_stem + ".pkl", "wb")) newick = ret_tree.get_newick() with open(out_fp, "w") as f: f.write(newick) elif args.neighbor_joining_weighted: out_stem = "".join(out_fp.split(".")[:-1]) ret_tree = run_nj_weighted(cm_uniq, prior_probs, verbose) pic.dump(ret_tree, open(out_stem + ".pkl", "wb")) newick = ret_tree.get_newick() with open(out_fp, "w") as f: f.write(newick) elif args.camin_sokal: out_stem = "".join(out_fp.split(".")[:-1]) ret_tree = run_camin_sokal(cm_uniq, stem, verbose) pic.dump(ret_tree, open(out_stem + ".pkl", "wb")) newick = convert_network_to_newick_format(ret_tree.get_network()) # newick = ret_tree.get_newick() with open(out_fp, "w") as f: f.write(newick) elif alg == "--max-likelihood" or alg == "-ml": # cells = cm.index # samples = [("s" + str(i)) for i in range(len(cells))] # samples_to_cells = dict(zip(samples, cells)) # cm.index = list(range(len(cells))) if verbose: print("Running Maximum Likelihood on " + str(cm.shape[0]) + " Unique Cells") infile = stem + "infile.txt" fn = stem + "phylo.txt" cm.to_csv(fn, sep="\t") script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py" cmd = "python3.6 " + str( script) + " " + fn + " " + infile + " --relaxed" p = subprocess.Popen(cmd, shell=True) pid, ecode = os.waitpid(p.pid, 0) os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " + out_fp) tree = Phylo.parse(out_fp, "newick").next() ml_net = Phylo.to_networkx(tree) i = 0 for n in ml_net: if n.name is None: n.name = "internal" + str(i) i += 1 c2str = map(lambda x: str(x), ml_net.nodes()) c2strdict = dict(zip(ml_net.nodes(), c2str)) ml_net = nx.relabel_nodes(ml_net, c2strdict) out_stem = "".join(out_fp.split(".")[:-1]) pic.dump(ml_net, open(out_stem + ".pkl", "wb")) os.system("rm " + infile) os.system("rm " + fn) else: raise Exception( "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal" )
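# Hypothetical command line for the character-matrix CLI above (script name
# assumed; flags are those defined by its argparse block): run the greedy
# solver on a tab-separated character matrix, writing the newick string to
# greedy_tree.txt and the pickled Cassiopeia_Tree to greedy_tree.pkl.
#
#   python reconstruct_tree_from_charmat.py character_matrix.txt greedy_tree.txt \
#       --greedy --verbose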
def solve_lineage_instance( _target_nodes, prior_probabilities=None, method="hybrid", threads=8, hybrid_cell_cutoff=200, hybrid_lca_cutoff=None, time_limit=1800, max_neighborhood_size=10000, seed=None, num_iter=-1, weighted_ilp=False, fuzzy=False, probabilistic=False, plot_diagnostics=True, maximum_alt_solutions=100, greedy_minimum_allele_rep=1.0, n_neighbors=10, missing_data_mode="lookahead", lookahead_depth=3, ): """ Aggregated lineage solving method, which given a set of target nodes, will find the maximum parsimony tree accounting the given target nodes :param target_nodes: A list of target nodes, where each node is in the form 'Ch1|Ch2|....|Chn' :param prior_probabilities: A nested dictionary containing prior probabilities for [character][state] mappings where characters are in the form of integers, and states are in the form of strings, and values are the probability of mutation from the '0' state. :param method: The method used for solving the problem ['ilp, 'hybrid', 'greedy'] - ilp: Attempts to solve the problem based on steiner tree on the potential graph (Recommended for instances with several hundred samples at most) - greedy: Runs a greedy algorithm to find the maximum parsimony tree based on choosing the most occurring split in a top down fasion (Algorithm scales to any number of samples) - hybrid: Runs the greedy algorithm until there are less than hybrid_subset_cutoff samples left in each leaf of the tree, and then returns a series of small instance ilp is then run on these smaller instances, and the resulting graph is created by merging the smaller instances with the greedy top-down tree :param threads: The number of threads to use in parallel for the hybrid algorithm :param hybrid_subset_cutoff: The maximum number of nodes allowed before the greedy algorithm terminates for a given leaf node :return: A reconstructed subgraph representing the nodes """ if method == "hybrid": assert (hybrid_cell_cutoff is None or hybrid_lca_cutoff is None ), "You can only use one type of cutoff in Hybrid" target_nodes = [ n.get_character_string() + "_" + n.name for n in _target_nodes ] node_name_dict = dict( zip( [n.split("_")[0] for n in target_nodes], [n + "_target" for n in target_nodes], )) if seed is not None: np.random.seed(seed) random.seed(seed) # clip identifier for now, but make sure to add later target_nodes = [n.split("_")[0] for n in target_nodes] # target_nodes = list(set(target_nodes)) master_root = root_finder(target_nodes) if method == "ilp": subgraphs, r, pid, graph_sizes = find_good_gurobi_subgraph( master_root, target_nodes, node_name_dict, prior_probabilities, time_limit, 1, max_neighborhood_size, seed=seed, num_iter=num_iter, weighted=weighted_ilp, n_neighbors=n_neighbors, ) subgraph = subgraphs[0] rdict = {} target_seen = [] for n in subgraph: spl = n.split("_") nn = Node(n, spl[0].split("|"), is_target=False) if len(spl) == 2: if "target" in n and nn.char_string not in target_seen: nn.is_target = True if len(spl) > 2: if "target" in n and nn.char_string not in target_seen: nn.is_target = True nn.pid = spl[-1] if nn.is_target: target_seen.append(nn.char_string) rdict[n] = nn state_tree = nx.relabel_nodes(subgraph, rdict) return ( Cassiopeia_Tree(method="ilp", network=state_tree, name="Cassiopeia_state_tree"), graph_sizes, ) if method == "hybrid": neighbors, distances = None, None if missing_data_mode == "knn": print("Computing neighbors for imputing missing values...") neighbors, distances = find_neighbors(target_nodes, n_neighbors=n_neighbors) network, target_sets = 
greedy_build( target_nodes, neighbors, distances, priors=prior_probabilities, cell_cutoff=hybrid_cell_cutoff, lca_cutoff=hybrid_lca_cutoff, fuzzy=fuzzy, probabilistic=probabilistic, minimum_allele_rep=greedy_minimum_allele_rep, missing_data_mode=missing_data_mode, lookahead_depth=lookahead_depth, ) print( "Using " + str(min(multiprocessing.cpu_count(), threads)) + " threads, " + str(multiprocessing.cpu_count()) + " available.", flush=True, ) executor = concurrent.futures.ProcessPoolExecutor( min(multiprocessing.cpu_count(), threads)) print("Sending off Target Sets: " + str(len(target_sets)), flush=True) # just in case you've hit a target node during the greedy reconstruction, append name at this stage # so the composition step doesn't get confused when trying to join to the root. network = nx.relabel_nodes(network, node_name_dict) futures = [ executor.submit( find_good_gurobi_subgraph, root, targets, node_name_dict, prior_probabilities, time_limit, 1, max_neighborhood_size, seed, num_iter, weighted_ilp, n_neighbors, ) for root, targets in target_sets ] concurrent.futures.wait(futures) base_network = network.copy() base_rdict = {} for n in base_network: spl = n.split("_") nn = Node(n, spl[0].split("|"), is_target=False) if len(spl) > 1: nn.pid = spl[1] if spl[0] in node_name_dict: nn.is_target = True base_rdict[n] = nn base_network = nx.relabel_nodes(base_network, base_rdict) num_solutions = 1 # keep track of number of possible solutions potential_graph_sizes = [] all_res = [] alt_solutions = {} for future in futures: results, r, pid, graph_sizes = future.result() potential_graph_sizes.append(graph_sizes) subproblem_solutions = [] for res in results: new_names = {} for n in res: if res.in_degree(n) == 0 or n == r: new_names[n] = n else: new_names[n] = n + "_" + str(pid) res = nx.relabel_nodes(res, new_names) subproblem_solutions.append(res) num_solutions *= len(subproblem_solutions) all_res.append(subproblem_solutions) rt = [ n for n in subproblem_solutions[0] if subproblem_solutions[0].in_degree(n) == 0 ][0] alt_solutions[base_rdict[rt]] = subproblem_solutions network = nx.compose(network, subproblem_solutions[0]) rdict = {} target_seen = [] for n in network: spl = n.split("_") nn = Node(n, spl[0].split("|"), is_target=False) if len(spl) == 2: if "target" in n and nn.char_string not in target_seen: nn.is_target = True if len(spl) > 2: if "target" in n and nn.char_string not in target_seen: nn.is_target = True nn.pid = spl[-1] if nn.is_target: target_seen.append(nn.char_string) rdict[n] = nn state_tree = nx.relabel_nodes(network, rdict) # create alternative solutions pbar = tqdm(total=len(alt_solutions.keys()), desc="Enumerating alternative solutions") for r in alt_solutions.keys(): soln_list = [] # get original target char strings # sub_targets = [n.char_string for n in state_tree.successors(r) if n.is_target] for res in alt_solutions[r]: rdict = {} for n in res: spl = n.split("_") nn = Node(n, spl[0].split("|"), is_target=False) if len(spl) > 2: nn.pid = spl[-1] rdict[n] = nn res = nx.relabel_nodes(res, rdict) soln_list.append(res) alt_solutions[r] = soln_list pbar.update(1) # update progress bar # iterate through all possible solutions # alt_solutions = [] # if num_solutions > 1: # num_considered_solutions = 0 # sol_identifiers = [] # keep track of solutions already sampled # # we'll sample maximum_alt_solutions from the set of possible solutions # pbar = tqdm( # total=maximum_alt_solutions, desc="Enumerating alternative solutions" # ) # while num_considered_solutions < 
min(num_solutions, maximum_alt_solutions): # current_sol = [] # for res_list in all_res: # current_sol.append(np.random.choice(len(res_list))) # if tuple(current_sol) not in sol_identifiers: # new_network = base_network.copy() # for i in range(len(current_sol)): # res_list = all_res[i] # net = res_list[current_sol[i]] # new_network = nx.compose(new_network, net) # rdict = {} # target_seen = [] # for n in new_network: # spl = n.split("_") # nn = Node("state-node", spl[0].split("|"), is_target=False) # if len(spl) == 2: # if "target" in n and n not in target_seen: # nn.is_target = True # if len(spl) > 2: # if 'target' in n and n not in target_seen: # nn.is_target = True # nn.pid = spl[-1] # if nn.is_target: # target_seen.append(nn.char_string) # rdict[n] = nn # new_network = nx.relabel_nodes(new_network, rdict) # alt_solutions.append(new_network) # sol_identifiers.append(tuple(current_sol)) # num_considered_solutions += 1 # pbar.update(1) # update progress bar return ( Cassiopeia_Tree( method="hybrid", network=state_tree, name="Cassiopeia_state_tree", alternative_solutions=alt_solutions, base_network=base_network, ), potential_graph_sizes, ) if method == "greedy": neighbors, distances = None, None if missing_data_mode == "knn": print("Computing neighbors for imputing missing values...") neighbors, distances = find_neighbors(target_nodes, n_neighbors=n_neighbors) graph = greedy_build( target_nodes, neighbors, distances, priors=prior_probabilities, cell_cutoff=-1, lca_cutoff=None, fuzzy=fuzzy, probabilistic=probabilistic, minimum_allele_rep=greedy_minimum_allele_rep, missing_data_mode=missing_data_mode, lookahead_depth=lookahead_depth, )[0] rdict = {} for n in graph: spl = n.split("_") nn = Node(n, spl[0].split("|"), is_target=False) if len(spl) > 1: nn.pid = spl[1] if spl[0] in node_name_dict and len(spl) == 1: nn.is_target = True rdict[n] = nn state_tree = nx.relabel_nodes(graph, rdict) return ( Cassiopeia_Tree(method="greedy", network=state_tree, name="Cassiopeia_state_tree"), None, ) else: raise Exception( "Please specify one of the following methods: ilp, hybrid, greedy")
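# A hedged usage sketch for solve_lineage_instance, assuming cassiopeia's Node
# class is importable and that this function is in scope; the cells and
# character states below are made up. With method="greedy" the second return
# value is None; with "ilp" or "hybrid" it holds potential-graph diagnostics.
from cassiopeia.TreeSolver.Node import Node

targets = [
    Node("cellA", ["1", "0", "2"]),
    Node("cellB", ["1", "0", "0"]),
    Node("cellC", ["2", "1", "0"]),
]

state_tree, diagnostics = solve_lineage_instance(targets, method="greedy")
network = state_tree.get_network()
print(network.number_of_nodes(), network.number_of_edges())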
def run_nj_naive(cm_uniq, stem, verbose=True):

    if verbose:
        print("Running Neighbor-Joining on " + str(cm_uniq.shape[0]) + " Unique Cells")

    cm_lookup = list(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    fn = stem + "phylo.txt"
    infile = stem + "infile.txt"

    cm_uniq.to_csv(fn, sep='\t')

    script = (SCLT_PATH / 'TreeSolver' / 'binarize_multistate_charmat.py')
    cmd = "python3.6 " + str(script) + " " + fn + " " + infile + " --relaxed"
    p = subprocess.Popen(cmd, shell=True)
    pid, ecode = os.waitpid(p.pid, 0)

    aln = AlignIO.read(infile, "phylip-relaxed")

    calculator = DistanceCalculator('identity')
    constructor = DistanceTreeConstructor(calculator, 'nj')

    tree = constructor.build_tree(aln)
    tree.root_at_midpoint()

    nj_net = Phylo.to_networkx(tree)

    # convert labels to characters for writing to file
    rndict = {}
    for n in nj_net:
        if n.name is None:
            rndict[n] = Node('state-node', [])
        elif n.name in cm_uniq.index:
            rndict[n] = Node(n.name, cm_uniq.loc[n.name].values)

    # convert labels to strings, not Bio.Phylo.Clade objects
    # c2str = map(lambda x: x.name, list(nj_net.nodes()))
    # c2strdict = dict(zip(list(nj_net.nodes()), c2str))
    nj_net = nx.relabel_nodes(nj_net, rndict)

    # nj_net = fill_in_tree(nj_net, cm_uniq)
    # nj_net = tree_collapse2(nj_net)

    rdict = {}
    for n in nj_net:
        if nj_net.out_degree(n) == 0 and n.char_string in cm_lookup:
            n.is_target = True
        else:
            n.is_target = False

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining', network=state_tree,
                               name='Cassiopeia_state_tree')

    os.system("rm " + infile)
    os.system("rm " + fn)

    return ret_tree
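# Minimal illustration of the Biopython neighbor-joining calls used above, on a
# tiny hand-made alignment (the sequences are made up; in run_nj_naive the
# alignment comes from the binarized character matrix written by the helper
# script).
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

aln = MultipleSeqAlignment([
    SeqRecord(Seq("10110"), id="cellA"),
    SeqRecord(Seq("10010"), id="cellB"),
    SeqRecord(Seq("01011"), id="cellC"),
])

calculator = DistanceCalculator("identity")
constructor = DistanceTreeConstructor(calculator, "nj")
tree = constructor.build_tree(aln)
tree.root_at_midpoint()
print(tree)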
def prune_and_clean_leaves(G):
    """
    Prune off leaves that don't correspond to samples and clean up the names on
    leaves (i.e. only keep the sample labels and remove character states or
    post-processing hashes.)

    :param G: Networkx Graph as a tree
    :return: Pruned and cleaned tree as a Networkx object.
    """

    new_nodes = []
    new_edges = []

    def prune_leaves(G):

        nodes_to_remove = []

        root = [n for n in G if G.in_degree(n) == 0][0]

        # first remove paths to leaves that don't correspond to samples
        _leaves = [n for n in G if G.out_degree(n) == 0]

        for n in _leaves:
            # we detect leaves that are not targets by the `is_target` attribute.
            if not n.is_target:
                nodes_to_remove.append(n)

        return nodes_to_remove

    nodes_to_remove = prune_leaves(G)
    while len(nodes_to_remove) > 0:
        for n in set(nodes_to_remove):
            G.remove_node(n)
        nodes_to_remove = prune_leaves(G)

    # remove character strings from node name
    # node_dict = {}
    # for n in tqdm(G.nodes, desc="removing character strings from sample names"):
    #     spl = n.split("_")
    #     if "|" in spl[0] and "target" in n:
    #         nn = "_".join(spl[1:])
    #         node_dict[n] = nn
    # G = nx.relabel_nodes(G, node_dict)

    for n in G.nodes:
        # spl = n.split("_")
        if n.is_target:
            # if spl[-1] == "target":
            #     name = "_".join(spl[:-1])
            # else:
            #     name = "_".join(spl[:-2])

            # if this target is a leaf, just rename it
            # else we must add an extra 'redundant' leaf here
            if G.out_degree(n) != 0:
                # node_dict2[n] = name
                if n.name == 'state-node':
                    n.is_target = False
                else:
                    n.is_target = False
                    new_node = Node(n.name, n.get_character_vec(), is_target=True)
                    # else:
                    new_nodes.append(new_node)
                    new_edges.append((n, new_node))

    G.add_nodes_from(new_nodes)
    G.add_edges_from(new_edges)

    # G = nx.relabel_nodes(G, node_dict2)
    return G
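# Toy sketch of the iterated pruning loop in prune_and_clean_leaves, using plain
# string nodes and a set of "target" labels instead of Node.is_target (made-up
# example, not the package's data structures). Removing a non-target leaf can
# expose a new non-target leaf, which is why the pruning repeats until no
# non-target leaves remain.
import networkx as nx

G = nx.DiGraph()
G.add_edges_from([("root", "x"), ("x", "cellA"), ("x", "internal"), ("root", "cellB")])
targets = {"cellA", "cellB"}

def non_target_leaves(G):
    return [n for n in G if G.out_degree(n) == 0 and n not in targets]

to_remove = non_target_leaves(G)
while to_remove:
    G.remove_nodes_from(set(to_remove))
    to_remove = non_target_leaves(G)

# "internal" is dropped; once it is gone, "x" still has a target child, so it stays.
print(sorted(G.nodes))   # ['cellA', 'cellB', 'root', 'x']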