def test_ilp_simple():
    """Solve a ten-node instance with the ILP method and check the result.

    The reconstructed network must have exactly one root (a node with
    in-degree 0), must contain as many target nodes as were supplied, and
    every target must be reachable from the root.
    """
    nodes = [
        Node('a', [1, 0, 0, 0, 0]),
        Node('b', [1, 0, 0, 1, 0]),
        Node('c', [1, 0, 0, 2, 0]),
        Node('d', [1, 2, 0, 1, 0]),
        Node('e', [1, 1, 0, 1, 0]),
        Node('f', [1, 0, 3, 2, 0]),
        Node('g', [0, 0, 0, 0, 1]),
        Node('h', [0, 1, 0, 0, 1]),
        Node('i', [0, 1, 2, 0, 1]),
        Node('j', [0, 1, 1, 0, 1]),
    ]

    # Solver output is diverted into a scratch file while the solver runs.
    # sys.stdout must be put back afterwards: otherwise it keeps pointing at
    # the (closed) scratch file and any later print raises ValueError.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            tree = ls.solve_lineage_instance(nodes, method="ilp")
    finally:
        sys.stdout = saved_stdout
        # Remove the scratch file even when the solver raised.
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    net = tree.get_network()

    roots = [n for n in net if net.in_degree(n) == 0]
    assert len(roots) == 1
    root = roots[0]

    targets = [n for n in net if n.is_target]
    assert len(targets) == len(nodes)

    for t in targets:
        assert nx.has_path(net, root, t)
def test_on_sim_greedy():
    """Run the greedy solver on the leaves of the pickled simulated network.

    Loads ``test/data/sim_net.pkl``, rebuilds a fresh ``Node`` for every leaf
    and checks that the reconstruction has a single root, contains every
    target, connects the root to every target, and has no node with more
    than one parent.
    """
    # Use a context manager so the pickle file handle is closed promptly
    # instead of being left for the garbage collector.
    with open("test/data/sim_net.pkl", "rb") as fh:
        stree = pic.load(fh)

    target_nodes = [
        Node(leaf.name, leaf.get_character_vec())
        for leaf in stree.get_leaves()
    ]

    rtree = ls.solve_lineage_instance(target_nodes, method="greedy")
    rnet = rtree.get_network()

    roots = [n for n in rnet if rnet.in_degree(n) == 0]
    assert len(roots) == 1
    root = roots[0]

    targets = [n for n in rnet if n.is_target]
    assert len(targets) == len(target_nodes)

    for t in targets:
        assert nx.has_path(rnet, root, t)

    multi_parents = [n for n in rnet if rnet.in_degree(n) > 1]
    assert len(multi_parents) == 0
def test_ilp_parallel_evo():
    """Solve a seven-node instance containing '-' entries with the ILP method.

    The reconstructed network must have exactly one root, contain every
    target, connect the root to every target, and contain no node with more
    than one parent.
    """
    nodes = [
        Node('a', [1, 1, 2, 0]),
        Node('b', [1, 1, 3, 0]),
        Node('c', [2, 1, 1, 0]),
        Node('d', [2, 1, 3, 0]),
        Node('e', [1, 3, 1, '-']),
        # NOTE(review): the last state here is the string '1' while the other
        # states are ints -- kept as-is; confirm this is intended.
        Node('f', [1, '-', '-', '1']),
        Node('g', [1, 1, 0, 2]),
    ]

    # Solver output is diverted into a scratch file while the solver runs.
    # sys.stdout must be put back afterwards: otherwise it keeps pointing at
    # the (closed) scratch file and any later print raises ValueError.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            tree = ls.solve_lineage_instance(nodes, method='ilp')
    finally:
        sys.stdout = saved_stdout
        # Remove the scratch file even when the solver raised.
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    net = tree.get_network()

    roots = [n for n in net if net.in_degree(n) == 0]
    assert len(roots) == 1
    root = roots[0]

    targets = [n for n in net if n.is_target]
    assert len(targets) == len(nodes)

    for t in targets:
        assert nx.has_path(net, root, t)

    multi_parents = [n for n in net if net.in_degree(n) > 1]
    assert len(multi_parents) == 0
def test_on_sim_hybrid():
    """Run the hybrid solver on the leaves of the pickled simulated network.

    Loads ``test/data/sim_net.pkl``, rebuilds a fresh ``Node`` for every leaf
    and checks that the reconstruction has a single root, contains every
    target, connects the root to every target, and has no node with more
    than one parent.
    """
    # Use a context manager so the pickle file handle is closed promptly.
    with open("test/data/sim_net.pkl", "rb") as fh:
        stree = pic.load(fh)

    target_nodes = [
        Node(leaf.name, leaf.get_character_vec())
        for leaf in stree.get_leaves()
    ]

    # Solver output is diverted into a scratch file while the solver runs.
    # sys.stdout must be put back afterwards: otherwise it keeps pointing at
    # the (closed) scratch file and any later print raises ValueError.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            rtree = ls.solve_lineage_instance(
                target_nodes,
                method="hybrid",
                hybrid_subset_cutoff=200,
                time_limit=100,
                max_neighborhood_size=500,
                threads=4,
            )
    finally:
        sys.stdout = saved_stdout
        # Remove the scratch file even when the solver raised.
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    rnet = rtree.get_network()

    roots = [n for n in rnet if rnet.in_degree(n) == 0]
    assert len(roots) == 1
    root = roots[0]

    targets = [n for n in rnet if n.is_target]
    assert len(targets) == len(target_nodes)

    for t in targets:
        assert nx.has_path(rnet, root, t)

    multi_parents = [n for n in rnet if rnet.in_degree(n) > 1]
    assert len(multi_parents) == 0
def _build_arg_parser():
    """Build the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    # Accepted for command-line compatibility; nothing in main() reads it.
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # Parsed as int so a user-supplied value has the same type as the
    # (integer) default; it used to be declared as str.
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")
    return parser


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file handle afterwards."""
    with open(path, "wb") as fh:
        pic.dump(obj, fh)


def _charstring_lookup(char_matrix):
    """Map each row's "|"-joined values to that row's index label."""
    joined_rows = char_matrix.apply(
        lambda row: "|".join([str(k) for k in row.values]), axis=1)
    return dict(zip(list(joined_rows), char_matrix.index.values))


def _remove_quietly(*paths):
    """Delete each path, ignoring files that are already gone."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Best-effort cleanup: a missing temporary file is not an error.
            pass


def main():
    """
    Reconstruct a tree from the leaves of a pickled network.

    Reads the network at the positional ``netfp`` argument with
    ``nx.read_gpickle``, keeps one leaf per distinct character string, runs
    the first algorithm selected on the command line (checked in the order
    greedy, hybrid, ilp, neighbor-joining, weighted neighbor-joining,
    camin-sokal) and pickles the result to ``--out_fp``.

    When ``--out_fp`` is not given, the output name is the input file name
    with "true" replaced by an algorithm tag.
    NOTE(review): if the input name does not contain "true" the output name
    equals the input file name -- confirm this cannot clobber the input.

    Raises:
        Exception: if no algorithm flag was supplied.
    """
    args = _build_arg_parser().parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # --cutoff is interpreted either as an LCA cutoff or as a cell-count
    # cutoff, depending on --hybrid_lca_mode.
    if args.hybrid_lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None

    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    prior_probs = None
    if args.mutation_map != "":
        # NOTE: unpickling executes arbitrary code; only pass trusted files.
        with open(args.mutation_map, "rb") as fh:
            prior_probs = pic.load(fh)

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)
    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    # Keep the first leaf seen for every distinct character string.
    target_nodes_uniq = []
    seen_charstrings = set()
    for leaf in get_leaves_of_tree(true_network):
        if leaf.char_string not in seen_charstrings:
            seen_charstrings.add(leaf.char_string)
            target_nodes_uniq.append(leaf)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        _dump_pickle(net, outfp)

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            # The original referenced an undefined name `cutoff` here.
            print("Parameters: ILP on sets of " + str(args.cutoff) +
                  " cells " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        _dump_pickle(net, outfp)

    elif args.ilp:

        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes_uniq)) +
                  " Cells")
            print("Parameters: ILP on sets of " + str(args.cutoff) +
                  " cells " + str(time_limit) +
                  "s to complete optimization")

        # The original passed an undefined name `cutoff` here (NameError).
        # NOTE(review): the hybrid branch passes hybrid_cell_cutoff /
        # hybrid_lca_cutoff instead of hybrid_subset_cutoff -- confirm which
        # keyword solve_lineage_instance actually accepts.
        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=args.cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]

        if outfp is None:
            outfp = name.replace("true", "ilp")
        _dump_pickle(net, outfp)

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        # Convert the character matrix in `fn` into the file `infile` that is
        # read back below as a relaxed-PHYLIP alignment.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        subprocess.run(cmd, shell=True)

        aln = AlignIO.read(infile, "phylip-relaxed")
        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")
        tree = constructor.build_tree(aln)
        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # Swap every node of the converted tree for a Node object; unnamed
        # nodes get the placeholder name "state-node".
        rndict = {
            clade: Node("state-node" if clade.name is None else clade.name, [])
            for clade in nj_net
        }
        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)
        cm_lookup = _charstring_lookup(cm)

        nj_net = fill_in_tree(nj_net, cm)
        nj_net = tree_collapse(nj_net)

        # Flag nodes whose character string matches a row of the matrix.
        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        _dump_pickle(nj_net, outfp)

        _remove_quietly(infile, fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids

        dm = sp.spatial.distance.squareform(dm)
        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)
        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = _charstring_lookup(cm_uniq)

        # Unlike the other branches, this one also clears the flag on
        # non-matching nodes.
        for n in nj_net:
            n.is_target = n.char_string in cm_lookup

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        _dump_pickle(nj_net, outfp)

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Rename every leaf to its position in the list, remembering the
        # original names so they can be restored on the reconstructed tree.
        samples_to_cells = {}
        indices = []
        for i, n in enumerate(target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        subprocess.run(cmd, shell=True)

        # Return value is unused; weights_fn is handed to `mix` below, so
        # this call presumably writes that file -- TODO confirm.
        construct_weights(infile, weights_fn)

        # NOTE(review): presumably these make PHYLIP prompt for alternative
        # output names (answered with "F" in the response file) -- confirm.
        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"

        # run phylip mix with camin-sokal; the response file is fed to the
        # program on stdin.
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("W\n")
            FH.write("Y\n")
            FH.write(weights_fn + "\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        subprocess.run(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        # Reuse the response file for the consense run.
        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        subprocess.run(cmd, shell=True)

        # Join the consensus tree file into one newick string.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(line.strip() for line in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        # Restore the original cell names.
        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)
        cs_net = tree_collapse2(cs_net)

        cm_lookup = _charstring_lookup(cm)

        # Flag nodes whose character string matches a row of the matrix.
        for n in cs_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        _dump_pickle(cs_net, outfp)

        _remove_quietly(outfile, responses, outtree, consense_outfile,
                        infile, fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )