def make_tree_figure(wanted_seqs, trop_dict, tree_file):
    """Draw the tree stored in ``tree_file`` with leaves coloured by ``trop_dict``.

    ``get_pairwise_distances`` is called first with ``tree_file`` as keyword
    argument (presumably it produces the Newick file that is read back here --
    confirm against that helper).  The tree is converted to a networkx graph,
    unnamed nodes are relabelled ``Clade-<n>`` and the graph is drawn with the
    graphviz ``twopi`` layout.

    Parameters
    ----------
    wanted_seqs :
        Passed unchanged to ``get_pairwise_distances``.
    trop_dict : mapping
        Maps every named node to a flag; truthy nodes are drawn green,
        falsy nodes red.  A missing name raises ``KeyError``.
    tree_file : str
        Path of the Newick tree file.
    """
    # The returned matrix is not used; the call is kept for whatever it does
    # with ``tree_file`` before the file is read below.
    get_pairwise_distances(wanted_seqs, tree_file=tree_file)

    # Read through a context manager so the handle is closed deterministically.
    with open(tree_file) as handle:
        tree = Phylo.read(handle, 'newick')
    net = Phylo.to_networkx(tree)

    # Replace Clade objects by string labels; unnamed clades get a counter.
    node_mapping = {}
    clade = 1
    for node in net.nodes():
        if node.name is None:
            node_mapping[node] = 'Clade-%i' % clade
            clade += 1
        else:
            node_mapping[node] = node.name
    new_net = networkx.relabel_nodes(net, node_mapping)

    colors = []
    for node in new_net.nodes():
        if node.startswith('Clade'):
            colors.append('w')
        elif trop_dict[node]:
            colors.append('g')
        else:
            # The original tested ``not trop_dict[node]`` here and kept a
            # third branch that could never run; a plain else is equivalent.
            colors.append('r')

    pos = networkx.graphviz_layout(new_net, 'twopi')
    networkx.draw_networkx(new_net, pos, with_labels=False, node_color=colors)
def nkew(): with open("rosalind_nkew.txt") as f: lines = map(lambda l: l.strip(), f.readlines()) lines = [line for line in lines if line] for i in xrange(len(lines)/2): handle = StringIO.StringIO(lines[2*i]) tree = Phylo.read(handle, "newick") names = lines[2*i+1].split() t = Phylo.to_networkx(tree) # create weighted tree wt = networkx.Graph() for node in t.nodes(): wt.add_node(node) for key,vals in t.edge.items(): for val in vals: wt.add_edge(key, val, weight=vals[val]['weight']) #pos = networkx.spring_layout(wt) #networkx.draw(wt, pos) #networkx.draw_networkx_edge_labels(wt, pos) #plt.show() na = [node for node in wt.nodes() if node.name == names[0]][0] nb = [node for node in wt.nodes() if node.name == names[1]][0] print int(networkx.shortest_path_length(wt, na, nb, 'weight')), print ""
def get_tree(tree_file, name_tree):
    """Read a Newick tree and return its named edges with their weights.

    Internal nodes are renamed ``N0``, ``N1``, ... in the order returned by
    ``get_nonterminals()``.  Each leaf takes the name of the leaf at the same
    position in the tree read from ``name_tree``.  Every clade's
    ``branch_length`` is overwritten with its ``confidence`` value before the
    tree is converted to a networkx graph.

    Parameters
    ----------
    tree_file : str
        Path of the Newick file holding the tree.
    name_tree : str
        Path of a Newick file whose terminal names are copied onto the leaves
        (presumably the same topology as ``tree_file`` -- confirm with the
        caller).  Having fewer terminals than ``tree_file`` raises
        ``IndexError``.

    Returns
    -------
    edge_to_blen : dict
        Maps ``(u.name, v.name)`` for every graph edge ``(u, v)`` to the
        edge's ``'weight'`` attribute.
    edge_list : list
        The keys of ``edge_to_blen`` sorted by the integer that follows the
        first character of the first node name (assumes that name has the
        ``N<k>`` form).
    """
    # Context managers close both files as soon as parsing is done.
    with open(tree_file, 'r') as handle:
        tree = Phylo.read(handle, "newick")
    with open(name_tree, 'r') as handle:
        tree_name = Phylo.read(handle, "newick")

    # Set node number for nonterminal nodes.
    for index, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(index)
        clade.branch_length = clade.confidence

    # Fetch both leaf lists once instead of re-walking the trees per leaf.
    terminals = tree.get_terminals()
    name_terminals = tree_name.get_terminals()
    for index, clade in enumerate(terminals):
        clade.branch_length = clade.confidence
        clade.name = name_terminals[index].name

    tree_phy = tree.as_phyloxml(rooted='True')
    tree_nx = Phylo.to_networkx(tree_phy)

    # data=True exposes the branch length as the 'weight' attribute.
    edge_to_blen = {}
    for u, v, data in tree_nx.edges(data=True):
        edge_to_blen[(u.name, v.name)] = data['weight']

    # sorted() works on Python 2 and 3; dict.keys().sort() is Python 2 only.
    edge_list = sorted(edge_to_blen, key=lambda edge: int(edge[0][1:]))
    return edge_to_blen, edge_list
def get_tree(newicktree):
    """Read a Newick tree and return its leaf names and out-group branch.

    Internal nodes are renamed ``N0``, ``N1``, ... in the order returned by
    ``get_nonterminals()``.  The tree is converted to a networkx graph and
    its edges are collected as ``(u.name, v.name)`` pairs.

    Parameters
    ----------
    newicktree :
        File name or handle accepted by ``Phylo.read``.

    Returns
    -------
    leaves : list
        Names of the nodes with degree 1 (built from a set, so the order is
        unspecified).
    out_group_branch : tuple
        The first edge that starts at ``'N0'`` and whose second node name,
        minus its first character, is not all digits.

    Raises
    ------
    IndexError
        If no edge qualifies as out-group branch.
    AssertionError
        If the number of remaining (internal) branches is not one less than
        the number of internal nodes derived from the edge count.
    """
    tree = Phylo.read(newicktree, "newick")

    # Set node number for nonterminal nodes.
    for index, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(index)

    tree_phy = tree.as_phyloxml(rooted='True')
    tree_nx = Phylo.to_networkx(tree_phy)

    # data=True exposes the branch length as the 'weight' attribute.
    T = nx.DiGraph()
    edge_to_blen = {}
    for u, v, data in tree_nx.edges(data=True):
        edge = (u.name, v.name)
        T.add_edge(*edge)
        edge_to_blen[edge] = data['weight']

    # dict() accepts both the plain dict returned by networkx 1.x and the
    # DegreeView returned by networkx 2.x and later.
    leaves = set(v for v, degree in dict(T.degree()).items() if degree == 1)

    # Floor division keeps the counts integral on Python 3 as well.
    n_edge = len(edge_to_blen)       # number of edges
    n_leaves = n_edge // 2 + 1       # number of leaves
    n_internal = n_leaves - 1        # number of internal nodes

    # A leaf branch goes from an 'N<digits>' node to a node whose name tail
    # is not purely numeric.
    leaf_branch = [edge for edge in edge_to_blen
                   if edge[0][0] == 'N'
                   and edge[0][1:].isdigit()
                   and not edge[1][1:].isdigit()]
    out_group_branch = [edge for edge in leaf_branch
                        if edge[0] == 'N0' and not edge[1][1:].isdigit()][0]

    leaf_branch_set = set(leaf_branch)
    internal_branch = [edge for edge in edge_to_blen
                       if edge not in leaf_branch_set]
    # Check that the number of internal branches is one less than the number
    # of internal nodes.
    assert len(internal_branch) == n_internal - 1

    return list(leaves), out_group_branch
def iterate(n_iters):
    """Run the sampler for ``n_iters`` steps, then plot and export the result.

    Uses the module-level ``sampler``, ``likelihoods``, ``fontsize``, ``X``,
    ``pca`` and ``y``.  After sampling it

    * plots the collected log likelihoods to ``unconstrained-likelihoods.png``,
    * draws a copy of the final tree with ``plot_tree_2d``,
    * replaces every leaf's ``point`` with ``y[point]``,
    * writes the tree as ``unconstrained-tree.png``, ``unconstrained-tree.nwk``
      and ``unconstrained-tree.dot``, and finally shows all figures.

    Parameters
    ----------
    n_iters : int
        Number of ``sampler.sample()`` calls to perform.
    """
    # range() instead of xrange() keeps the loop valid on Python 2 and 3.
    for _ in tqdm(range(n_iters)):
        sampler.sample()
        likelihoods.append(sampler.tree.marg_log_likelihood())

    plt.figure()
    plt.xlabel("Iterations", fontsize=fontsize)
    plt.ylabel("Data Log Likelihood", fontsize=fontsize)
    plt.plot(likelihoods)
    plt.legend(loc='best', fontsize=12)
    plt.savefig('unconstrained-likelihoods.png', bbox_inches='tight')

    final_tree = sampler.tree.copy()

    plt.figure()
    plot_tree_2d(final_tree, X, pca)

    # Swap each leaf's stored point for the matching entry of ``y``.
    for node in final_tree.dfs():
        if node.is_leaf():
            node.point = y[node.point]

    plt.figure()
    newick = final_tree.to_newick()
    tree = Phylo.read(StringIO(newick), 'newick')

    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig('unconstrained-tree.png', bbox_inches='tight')

    graph = Phylo.to_networkx(tree)
    with open('unconstrained-tree.nwk', 'w') as fp:
        # Same bytes as the former ``print >>fp, newick,`` (no trailing
        # newline), without the Python-2-only print statement.
        fp.write(newick)
    nx.write_dot(graph, 'unconstrained-tree.dot')
    plt.show()
def get_character_table(t):
    """Print a 0/1 character table for the Newick string ``t``.

    For every pair of adjacent nodes whose adjacency-matrix rows both sum to
    3, the connecting edge is removed temporarily and the members of the
    first two connected components are recorded as 1 and 0 respectively for
    each terminal.  One line of digits is printed per recorded split, with
    columns ordered by sorted terminal name.

    NOTE(review): relies on Python 2 (``print`` statement, ``xrange``,
    indexable ``dict.items()``) and on a networkx version in which
    ``adjacency_matrix`` supports ``len()`` and ``connected_components``
    returns an indexable sequence -- confirm the pinned versions.
    """
    # terminal name -> list of 0/1 values, one per split found below
    chars = dict()
    char_matrix = []
    t = Phylo.read(StringIO(t), 'newick')
    for c in list(t.get_terminals()):
        chars[c.name] = []
    net = Phylo.to_networkx(t)
    adj_matrix = networkx.adjacency_matrix(net)
    # String form of every node, in node iteration order; used to map a
    # matrix index back to a key of ``chars``.
    tchars = []
    for node in net.nodes(data=True):
        tchars.append(str(node[0]))
    for m in range(len(adj_matrix)):
        # Row sum of 3: presumably an internal node with three neighbours
        # (assumes unit edge weights -- TODO confirm).
        if adj_matrix[m,:].sum() == 3:
            for i in range(m):
                if (i != m) and (adj_matrix[i,:].sum() == 3) \
                   and (adj_matrix[i,m] == adj_matrix[m,i]) and (adj_matrix[i,m] == 1):
                    # Cut the edge i-m, split the graph, then restore it.
                    adj_matrix[i,m] = 0
                    adj_matrix[m,i] = 0
                    net = networkx.from_numpy_matrix(adj_matrix)
                    test1 = networkx.connected_components(net)
                    for item in test1[0]:
                        try:
                            chars[tchars[int(item)]].append(1)
                        except:
                            # Nodes whose label is not a key of ``chars``
                            # are skipped.  NOTE(review): the bare except
                            # also hides every other error.
                            continue
                    for item in test1[1]:
                        try:
                            chars[tchars[int(item)]].append(0)
                        except:
                            continue
                    adj_matrix[i,m] = 1
                    adj_matrix[m,i] = 1
    # Allocate one row per split and one column per terminal.
    for i in xrange(len(chars.items()[0][1])):
        char_matrix.append([])
        for j in xrange(len(chars)):
            char_matrix[i].append(0)
    # Fill column ``nn`` with the values of the nn-th terminal in name order.
    nn = 0
    for _, v in sorted(chars.items()) :
        for j in range(len(v)):
            char_matrix[j][nn] = v[j]
        nn += 1
    # Emit each row as a string of digits.
    for i in xrange(len(char_matrix)):
        str1 = ""
        for j in xrange(len(char_matrix[i])):
            str1 += str(int(char_matrix[i][j]))
        print str1
def nwck(): with open("rosalind_nwck.txt") as f: lines = map(lambda l: l.strip(), f.readlines()) lines = [line for line in lines if line] for i in xrange(len(lines)/2): handle = StringIO(lines[2*i]) tree = Phylo.read(handle, "newick") names = lines[2*i+1].split() t = Phylo.to_networkx(tree) na = [node for node in t.nodes() if node.name == names[0]][0] nb = [node for node in t.nodes() if node.name == names[1]][0] print len(networkx.shortest_path(t, na, nb))-1, print ""
def tree_from_random(list_of_scores):
    """Build a random guide tree for MGA.

    Parameters
    ----------
    list_of_scores : pairwise alignment scores, used only to obtain the graph
        names. Example for three graphs a, b, c:
        [["a", "b", 2], ["a", "c", 4], ["b", "c", 3]]

    Output
    ------
    Guide_tree object wrapping the networkx form of a UPGMA tree that was
    built from a random score matrix over the graph names.
    """
    graph_names = Guide_tree_Generator.make_graph_list(list_of_scores)
    random_matrix = Guide_tree_Generator.random_score_matrix(graph_names)
    upgma_tree = DistanceTreeConstructor().upgma(random_matrix)
    return Guide_tree(Phylo.to_networkx(upgma_tree))
def get_pairwise_distances(npalign, tree_file=None, seq_file=None):
    """Return a matrix of path sums between the leaves of a MUSCLE tree.

    ``npalign`` is written as FASTA with ``fasta_write``, ``muscle`` is run
    with ``-tree2`` to produce a Newick tree, and for every ordered pair of
    terminals the ``branch_length`` of every clade on the shortest path
    between them is summed.

    Parameters
    ----------
    npalign :
        Alignment passed to ``fasta_write``; ``npalign.shape[0]`` sets the
        size of the returned matrix.
    tree_file : str, optional
        Path the tree is written to.  A temporary file is used when omitted.
    seq_file : str, optional
        When given, the FASTA data is written to ``/tmp/tmp.fasta`` instead
        of a temporary file.  NOTE(review): the value itself is never used
        as a path -- confirm whether the file should be written to
        ``seq_file``.

    Returns
    -------
    numpy.ndarray or None
        Square matrix indexed by the integer that follows the first ``-`` in
        each terminal name (assumes names of the form ``<prefix>-<index>``,
        presumably produced by ``fasta_write``).  ``None`` when ``muscle``
        fails or the tree cannot be read.
    """
    if seq_file is None:
        fasta_handle = NTF(mode="w")
    else:
        fasta_handle = open("/tmp/tmp.fasta", "w")
    try:
        if tree_file is None:
            tree_handle = NTF()
        else:
            tree_handle = open(tree_file, "w")
        try:
            fasta_write(fasta_handle, npalign)
            # Make sure the data is on disk before muscle opens the file.
            fasta_handle.flush()
            os.fsync(fasta_handle.fileno())

            # An argument list avoids re-splitting the interpolated paths,
            # so file names containing spaces or quotes stay intact.
            cmdlist = ["muscle",
                       "-in", fasta_handle.name,
                       "-tree2", tree_handle.name,
                       "-gapopen", "-2.9"]
            try:
                check_output(cmdlist)
                with open(tree_handle.name) as result:
                    tree = Phylo.read(result, "newick")
            except (CalledProcessError, ValueError, RuntimeError):
                # muscle failed, or no usable tree was produced
                return None

            net = Phylo.to_networkx(tree)
            # networkx >= 2 returns a generator here; dict() also accepts
            # the dict returned by networkx 1.x.
            dmat = dict(networkx.all_pairs_shortest_path(net))
            terminals = tree.get_terminals()
            dists = np.zeros((npalign.shape[0], npalign.shape[0]))
            for t1, t2 in product(terminals, terminals):
                path = dmat[t1][t2]
                # NOTE(review): this adds the branch length of every clade
                # on the path, including the one where the path turns
                # around; tree.distance(t1, t2) may be what is intended.
                dist = sum(c.branch_length for c in path)
                i1 = int(t1.name.split("-")[1])
                i2 = int(t2.name.split("-")[1])
                dists[i1, i2] = dist
            return dists
        finally:
            # Closing also removes the temporary tree file, if one was used.
            tree_handle.close()
    finally:
        fasta_handle.close()
def tree_from_newick(path):
    """Build a Guide_tree object from a user supplied newick tree.

    Parameters
    ----------
    path : path to the newick file describing the desired alignment order
        for MGA; only the first tree in the file is used

    Output
    ------
    Guide_tree object when the tree is binary; otherwise a message is
    printed and None is returned
    """
    parsed_trees = Phylo.parse(path, 'newick', rooted=True)
    first_tree = next(parsed_trees)
    guide_tree = Guide_tree(Phylo.to_networkx(first_tree))

    is_binary = Guide_tree_Generator.is_binary_tree(guide_tree) == True
    if not is_binary:
        print(
            "The input is not a binary tree. Please enter a binary tree to get a guide tree"
        )
        return None
    return guide_tree
def character_table2(tree):
    '''Return the nontrivial splits of a Bio.Phylo tree as binary strings.

    Each edge joining two unnamed nodes is removed in turn; a leaf is marked
    1 when it stays connected to the alphabetically first leaf and 0
    otherwise.  Leaves are ordered by name.  The result is a set holding one
    string per such edge (duplicates collapse).'''
    leaves = sorted(tree.get_terminals(), key=lambda x: x.name)
    n = len(leaves)
    G = Phylo.to_networkx(tree)

    # Edges between two unnamed nodes are the candidate nontrivial splits.
    internal_edges = [(u, v) for u, v in G.edges()
                      if (u.name is None) and (v.name is None)]
    table = np.zeros((len(internal_edges), n), dtype=int)

    for row, edge in enumerate(internal_edges):
        G.remove_edge(*edge)
        side = nx.node_connected_component(G, leaves[0])
        for col in range(n):
            table[row, col] = int(leaves[col] in side)
        # Put the edge back before testing the next one.
        G.add_edge(*edge)

    return {''.join(str(bit) for bit in bits) for bits in table}
def generar_arbol(especie, indice):
    """Create the dendrogram and graph images for a set of homologous sequences.

    Nothing is done when both images already exist.  Otherwise the FASTA
    file ``./static/seq/Homologos/<especie><indice>.fasta`` is aligned with
    MUSCLE, a neighbor-joining tree is built from BLOSUM62 distances, and
    two PNG files are written under ``./static/img/bio/``:
    ``<especie><indice>.png`` (dendrogram) and ``<especie><indice>g.png``
    (networkx graph).

    Parameters
    ----------
    especie : str
        Species prefix of the file names.
    indice : str or int
        Index appended to ``especie``.  It is converted with ``str()``
        everywhere, so an integer no longer raises ``TypeError`` when the
        image paths are built.
    """
    nombre = especie + str(indice)
    tree_path = './static/img/bio/' + nombre + '.png'
    graph_path = './static/img/bio/' + nombre + 'g.png'

    # Both images are already there: nothing to regenerate.
    if path.exists(tree_path) and path.exists(graph_path):
        return

    seq_path = './static/seq/Homologos/'
    fasta_file = seq_path + nombre + '.fasta'
    aln_file = seq_path + nombre + '.aln'

    # Run MUSCLE to align the homologous sequences (Clustal output).
    cli = MuscleCommandline(input=fasta_file, out=aln_file, clw=True)
    cli()

    with open(aln_file, "r") as aln:
        alineamiento = AlignIO.read(aln, "clustal")

    # BLOSUM62 for proteins
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alineamiento)
    constructor = DistanceTreeConstructor(calculator)

    # Neighbor joining
    nj = constructor.nj(dm)
    net = Phylo.to_networkx(nj)
    pos1 = nx.nx_pydot.pydot_layout(net, prog='dot')

    # Draw the dendrogram
    Phylo.draw(nj)
    pylab.savefig(tree_path, format='png')
    pylab.clf()

    # Draw the graph
    nx.draw(net, pos=pos1, with_labels=True)
    pylab.savefig(graph_path, format='png')
    pylab.clf()
def main():
    """Score a reconstructed tree against the true network and print a TSV row.

    Command line arguments: ``true_net`` (gpickle of the true network),
    ``r_net`` (reconstructed tree, pickle or Newick), ``alg``, ``typ`` and
    the optional flag ``--modified``.

    The reconstructed network is loaded, relabelled so its leaves can be
    compared with those of the true network, and scored with
    ``score_triplets``.  One tab separated line is printed:
    ``param, run, score, alg, typ, 0``, where ``param`` and ``run`` are
    taken from the ``_``-separated parts of the true network's file name
    (third from last part, and last part up to its first dot).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("true_net", type=str)
    parser.add_argument("r_net", type=str)
    parser.add_argument("alg", type=str)
    parser.add_argument("typ", type=str)
    parser.add_argument("--modified", action="store_true", default=False)
    args = parser.parse_args()

    true_netfp = args.true_net
    reconstructed_fp = args.r_net
    alg = args.alg
    t = args.typ
    modified = args.modified

    # The true network's file name needs at least three '_'-separated parts.
    name = true_netfp.split("/")[-1]
    spl = name.split("_")
    param = spl[-3]
    run = spl[-1].split(".")[0]

    # File extension of the reconstructed tree selects the loader below.
    name2 = reconstructed_fp.split("/")[-1]
    spl2 = name2.split("_")
    ending = spl2[-1].split(".")[-1]

    true_network = nx.read_gpickle(true_netfp)
    target_nodes = get_leaves_of_tree(true_network, clip_identifier=True)
    target_nodes_original_network = get_leaves_of_tree(
        true_network, clip_identifier=False)

    if ending == "pkl" or ending == "pickle":
        # NOTE(review): unpickling executes arbitrary code; only use this on
        # trusted files.  The context manager closes the handle afterwards.
        with open(reconstructed_fp, "rb") as handle:
            reconstructed_network = pic.load(handle, encoding="latin1")

        # Keep only the part of every label that precedes the first '_'.
        nodes = [n for n in reconstructed_network.nodes()]
        encoder = {n: n.split("_")[0] for n in nodes}
        reconstructed_network = nx.relabel_nodes(reconstructed_network, encoder)
    else:
        # Map "s<identifier>" sample names to the clipped leaf labels.  A
        # list is used so the keys can be iterated more than once on
        # Python 3 (a map object is exhausted after one pass).
        k = ["s" + x.split("_")[-1] for x in target_nodes_original_network]
        s_to_char = dict(zip(k, target_nodes))

        reconstructed_tree = next(Phylo.parse(reconstructed_fp, "newick"))
        reconstructed_tree.rooted = True
        reconstructed_network = Phylo.to_networkx(reconstructed_tree)

        # Give unnamed nodes unique names "i1", "i2", ...
        i = 1
        for n in reconstructed_network:
            if n.name is None:
                n.name = "i" + str(i)
                i += 1

        # Convert labels to strings, not Bio.Phylo.Clade objects.
        c2strdict = {n: n.name for n in reconstructed_network.nodes()}
        reconstructed_network = nx.relabel_nodes(reconstructed_network, c2strdict)

        # Convert labels to characters for the triplets-correct analysis.
        reconstructed_network = nx.relabel_nodes(reconstructed_network, s_to_char)

    tot_tp = score_triplets(true_network, reconstructed_network,
                            number_of_trials=50000, modified=modified)

    print(
        str(param) + "\t" + str(run) + "\t" + str(tot_tp) + "\t" + alg + "\t" +
        t + "\t" + str(0))
# Plot the recorded log likelihoods over all iterations.
plt.figure()
plt.xlim([0, n_iters + constraint_add])
plt.xlabel("Iterations", fontsize=fontsize)
plt.ylabel("Data Log Likelihood", fontsize=fontsize)
plt.plot(likelihoods)
plt.legend(loc="best", fontsize=12)
plt.savefig("online-likelihoods.png", bbox_inches="tight")

# Work on a copy of the sampler's tree from here on.
final_tree = sampler.tree.copy()

plt.figure()
plot_tree_2d(final_tree, X, pca)

# Swap each leaf's stored point for the matching entry of ``y``.
for node in final_tree.dfs():
    if node.is_leaf():
        node.point = y[node.point]

newick = final_tree.to_newick()
tree = Phylo.read(StringIO(newick), "newick")

plt.figure()
Phylo.draw_graphviz(tree, prog="neato")
plt.savefig("tree.png", bbox_inches="tight")

graph = Phylo.to_networkx(tree)
with open("tree.nwk", "w") as fp:
    # Same bytes as the former ``print >> fp, newick,`` (no trailing
    # newline), without the Python-2-only print statement.
    fp.write(newick)
nx.write_dot(graph, "tree.dot")
plt.show()
def plot_phylo(C_raw, F, list_funcs, len_kegg, comp_p, pattern, threshold=0.05):
    """
    Build the phylogeny of components, analyze the pathways and plot them.

    A neighbor-joining tree is built over the selected components.  Pathway
    values of the unnamed internal ("Steiner", ``S<k>``) nodes are obtained
    by solving one linear system per pathway, using inverse branch lengths
    as edge weights.  The tree is then drawn with matplotlib and the rows of
    a LaTeX table listing the largest pathway changes along each traversed
    edge are printed.

    Parameters
    ----------
    C_raw: matrix
        gene module values of all components, each row a gene module, each
        column a component.
    F: matrix
        portions of each component in each sample, each row a component,
        each column a sample.
    list_funcs: list of str
        functions of each gene module.
    len_kegg: int
        number of KEGG cancer related pathways.
    comp_p: int
        index of the most abundant component in the primary samples.
    pattern: list of int
        components that need to be considered for constructing the phylogeny.
    threshold: float
        threshold to define whether a component is primary or metastatic.

    NOTE(review): node indices are parsed from the second character of each
    node name only (``name[1]``), so names are assumed to carry a
    single-digit index -- confirm for inputs with ten or more components.
    """
    assert comp_p in pattern

    is_p, is_m = get_ary_pm_comp(F, threshold=threshold)
    labels = get_labels_comp(F, is_p, is_m)
    labels = [labels[idx] for idx in pattern]
    C = C_raw[:, pattern]

    # build up phylogeny of components
    W = pairwise_distances(C.T)
    # for numerical stability
    W = (W + W.T) / 2.0
    dm = DistanceMatrix(W, labels)
    newick_str = nj(dm, result_constructor=str)
    tree = Phylo.read(StringIO(newick_str), "newick")
    tree.ladderize()  # Flip branches so deeper clades are displayed at top

    # initialize the graph:
    # pathway of leaves, name of steiner nodes, branch length of root
    G = Phylo.to_networkx(tree)
    idx = 1
    for node in G.nodes():
        if node.name is not None:
            node.pathway = C_raw[:, int(node.name[1]) - 1]
        else:
            node.name = "S" + str(idx)
            idx += 1
            node.pathway = [0]
        if node.branch_length is None:
            node.branch_length = 0

    dim_path = C.shape[0]
    edges = G.edges(data=True)
    # number of steiner nodes
    n_s = C.shape[1] - 2
    mat_Q = np.zeros((n_s, n_s), dtype=float)
    ary_c = [np.zeros(n_s, dtype=float) for _ in range(dim_path)]

    for edge in edges:
        # The 'weight' attribute holds a length; its inverse is the weight.
        wt = edge[2]["weight"]
        wt = 1.0 / wt
        if (edge[0].name[0] == "S") and (edge[1].name[0] == "S"):
            # Steiner-Steiner edge: contributes to both rows of mat_Q.
            nodes, nodet = edge[0], edge[1]
            ids, idt = int(nodes.name[1]) - 1, int(nodet.name[1]) - 1
            mat_Q[ids, ids] += wt
            mat_Q[ids, idt] -= wt
            mat_Q[idt, idt] += wt
            mat_Q[idt, ids] -= wt
        else:
            # Steiner-component edge, in either orientation.
            nodes, nodec = None, None
            if (edge[0].name[0] == "S") and (edge[1].name[0] == "C"):
                nodes, nodec = edge[0], edge[1]
            elif (edge[0].name[0] == "C") and (edge[1].name[0] == "S"):
                nodes, nodec = edge[1], edge[0]
            else:
                # NOTE(review): after this message the next line fails on
                # ``None.name``; behaviour is kept as it was.
                print("error")
            ids, idc = int(nodes.name[1]) - 1, int(nodec.name[1]) - 1
            mat_Q[ids, ids] += wt
            for idx_path in range(dim_path):
                ary_c[idx_path][ids] += wt * (nodec.pathway[idx_path])

    # Solve one linear system per pathway for the Steiner-node values.
    s_pathway = []
    for idx_path in range(dim_path):
        tmp = np.linalg.solve(mat_Q, ary_c[idx_path])
        s_pathway.append(tmp)
    # num_pathways x num_steiner_nodes
    S = np.asarray(s_pathway, dtype=float)
    for node in G.nodes():
        if node.name[0] == "S":
            node.pathway = S[:, int(node.name[1]) - 1]

    min_weight, max_weight = get_min_max_weight_edges(G)

    root_name = [l for l in labels if int(l[1]) - 1 == comp_p][0]
    # Leaves ``root`` bound to the node named ``root_name`` (or to the last
    # node iterated when no node matches).
    for root in G.nodes():
        if root.name == root_name:
            break

    set_traverse = []
    plot_nodes = []
    cur_pos = [0, 0, root.name]
    xgrain = 1.0
    strings = []
    set_traverse, strings, plot_nodes = iter_func(
        root_name, root, set_traverse, list_funcs, G, strings, plot_nodes,
        cur_pos, xgrain, min_weight, max_weight)

    # node name -> [x, y] drawing position
    node2pos = {v[2]: [v[0], v[1]] for v in plot_nodes}

    sns.set_style("white")
    fig = plt.figure(figsize=(7, 6))
    ax0 = plt.subplot(1, 1, 1)
    xmin = 0
    xmax = 0
    ymin = 0
    ymax = 0
    min_linewidth = 8
    max_linewidth = 20

    # Edges: line width grows with the inverse branch length.
    for edge in G.edges(data=True):
        ax0.plot(
            [node2pos[edge[0].name][0], node2pos[edge[1].name][0]],
            [node2pos[edge[0].name][1], node2pos[edge[1].name][1]],
            "-",
            color="gray",
            linewidth=min_linewidth + (max_linewidth - min_linewidth)
            * (1.0 / edge[2]["weight"] - min_weight) / max_weight,
            alpha=0.3)

    # Nodes: colour by the first / last character of the node name.
    for node in node2pos.keys():
        pos = node2pos[node]
        if node[0] == "S":
            color = "gray"
        elif node[-1] == "P":
            color = "green"
        elif node[-1] == "M":
            color = "red"
        else:
            color = "royalblue"
        ax0.plot(pos[0], pos[1], "o", markersize=50, color=color, alpha=0.5)
        # The text is passed positionally: the ``s=`` keyword is not
        # accepted by newer matplotlib releases.
        ax0.annotate(
            node,
            xy=(pos[0], pos[1]),
            ha="center",
            va="center",
            size=22,
            fontweight="bold",
        )
        xmin = min(xmin, pos[0])
        xmax = max(xmax, pos[0])
        ymin = min(ymin, pos[1])
        ymax = max(ymax, pos[1])

    delta = 0.7
    xratio = (xmax - xmin) / (ymax - ymin) * delta
    plt.xlim(xmin - delta * xratio, xmax + delta * xratio)
    plt.ylim(ymin - delta, ymax + delta)

    plt.gca().invert_yaxis()
    ax0.spines["right"].set_visible(False)
    ax0.spines["top"].set_visible(False)
    ax0.spines["left"].set_visible(False)
    ax0.spines["bottom"].set_visible(False)
    ax0.get_xaxis().set_ticks([])
    ax0.get_yaxis().set_ticks([])

    plt.show()

    # LaTeX table rows: per traversed edge, up to ``max_ct`` strongest
    # increases and decreases whose magnitude exceeds ``delta_cutoff``.
    # (The cutoff used to be stored in ``threshold``, shadowing the
    # parameter of the same name.)
    highlighted = ["RET", "PI3K-Akt signaling pathway", "ErbB signaling pathway"]
    delta_cutoff = 1.0
    max_ct = 5
    for tmp in strings:
        node_src, node_tgt = tmp[0], tmp[1]
        # Backslashes are escaped explicitly; the printed text is unchanged.
        print("\\colrule")
        print("$%s \\rightarrow %s$" % (node_src.name, node_tgt.name))
        delta_pathway = node_tgt.pathway - node_src.pathway
        delta_pathway = delta_pathway[0:len_kegg]
        # indices ordered by increasing change
        idx_sel = sorted(range(len(delta_pathway)), key=delta_pathway.__getitem__)

        list_pos, list_neg = [], []
        for ct, idx in enumerate(idx_sel[::-1]):
            if delta_pathway[idx] > delta_cutoff and ct < max_ct:
                list_pos.append([delta_pathway[idx], list_funcs[idx]])
        for ct, idx in enumerate(idx_sel):
            if delta_pathway[idx] < -delta_cutoff and ct < max_ct:
                list_neg.append([delta_pathway[idx], list_funcs[idx]])

        for idx in range(max(len(list_pos), len(list_neg))):
            if idx + 1 <= len(list_pos):
                fun = list_pos[idx][1]
                if fun in highlighted:
                    fun = "\\textbf{" + fun + "}"
                print("& $+%.2f$ & " % (list_pos[idx][0]) + fun)
            else:
                print("& & ")
            if idx + 1 <= len(list_neg):
                fun = list_neg[idx][1]
                if fun in highlighted:
                    fun = "\\textbf{" + fun + "}"
                print("& $%.2f$ & " % (list_neg[idx][0]) + fun)
            else:
                print("& & ")
            print("\\\\")

        if max(len(list_pos), len(list_neg)) == 0:
            print("& $<1.0$ & $\\emptyset$ & $<1.0$ & $\\emptyset$ \\\\")
def out():
    """Build and display a UPGMA tree for the FASTA file named in ``e1``.

    The records of the FASTA file are written to ``phylo.phy`` in PHYLIP
    form (first 100 residues each, ids numbered and padded with ``-``), the
    file is read back as an alignment, an identity distance matrix and a
    UPGMA tree are computed, the tree is round-tripped through ``apaf.xml``
    and finally drawn as a networkx graph.

    Raises ``ValueError`` when the FASTA file holds no records.
    """
    names = []
    sequences = []
    for record in SeqIO.parse("%s" % e1.get(), "fasta"):
        names.append(record.id)
        sequences.append(record.seq)
        print("%s %s" % (record.id, record.seq[0:100]))

    lengthmax = len(max(names, key=len))

    # The context manager closes the file even if writing fails.
    with open("phylo.phy", 'w') as handle:
        handle.write(" %s 100\n" % len(names))
        # zip() pairs every id with its own sequence; the previous index
        # arithmetic (``lens2[i - 1]``) paired it with the preceding
        # record's sequence, and the first id with the last sequence.
        for i, (item, seq) in enumerate(zip(names, sequences)):
            if len(item) == lengthmax:
                ids = "%s%s%s" % (i, "-", item)
                if i < 10:
                    # one extra dash for single-digit indices
                    ids += "-"
                print("1")
            else:
                # shorter ids are padded with dashes up to the longest id
                ids = "%s%s%s" % (i, "-", item)
                ids = ids + ((lengthmax - len(item)) * "-") + "-"
                print("2")
            ids = ids.replace(".", "").replace("_", "")
            line = "%s %s\n" % (ids, seq[0:100])
            print(line)
            handle.write(line)

    # Read the sequences and align
    aln = AlignIO.read('phylo.phy', 'phylip')

    # Print the alignment
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the distance matrix
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the UPGMA algorithm
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)
    Phylo.write(tree, 'apaf.xml', 'phyloxml')
    tree = Phylo.read('apaf.xml', 'phyloxml')

    net = Phylo.to_networkx(tree)
    networkx.draw_networkx(net)
    # show() takes no figure or graph argument; the graph used to be
    # passed in the position of ``block``.
    pylab.show()
def out():
    """Write the FASTA records named in ``e1`` as PHYLIP and show a tree.

    Every record's id (padded with ``-`` to the longest id, ``.`` and ``_``
    removed) is written to ``phylo.phy`` together with the first 100
    residues of its sequence.  An alignment is then read, an identity
    distance matrix and a UPGMA tree are computed, the tree stored in
    ``apaf.xml`` is drawn as a networkx graph and the window ``win`` is
    destroyed.
    """
    names = []
    sequences = []
    for record in SeqIO.parse(e1.get(), "fasta"):
        print(record.seq)
        names.append(record.id)
        sequences.append(record.seq)

    # Longest and shortest id length; both 0 when there are no records.
    if names:
        lengthmax = max(len(name) for name in names)
        lengthmin = min(len(name) for name in names)
    else:
        lengthmax = lengthmin = 0

    # Ids are prefixed with their index only when their lengths differ.
    number_ids = lengthmin < lengthmax

    # The file is closed (and flushed) before it is read back below.
    with open("phylo.phy", "w") as handle:
        # NOTE(review): the header announces 125 columns while at most 100
        # residues are written per row -- confirm the intended width.
        handle.write(" %s %s\n" % (len(names), "125"))
        # zip() pairs every id with its own sequence; previously every row
        # was written with the sequence of the last record.
        for i, (name, sequence) in enumerate(zip(names, sequences)):
            label = name + ((lengthmax - len(name)) * "-")
            label = label.replace(".", "").replace("_", "")
            if number_ids:
                to_be_write = "%s%s%s %s" % (i, "-", label, sequence[0:100])
            else:
                to_be_write = "%s %s" % (label, sequence[0:100])
            handle.write("%s\n" % to_be_write)
            print(label)

    # Read the sequences and align
    # NOTE(review): this absolute path only matches the file written above
    # when the working directory is that folder -- confirm.
    aln = AlignIO.read(
        '/home/peter/Desktop/Moduls/phylogenetic tree/phylo.phy', 'phylip')

    # Print the alignment
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the distance matrix
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the UPGMA algorithm
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)
    # NOTE(review): the tree computed above is discarded here in favour of
    # whatever 'apaf.xml' already contains; a Phylo.write(...) call may be
    # missing -- confirm.
    tree = Phylo.read('apaf.xml', 'phyloxml')

    net = Phylo.to_networkx(tree)
    networkx.draw_networkx(net)
    # show() takes no figure or graph argument; the graph used to be
    # passed in the position of ``block``.
    pylab.show()
    win.destroy()
def run_nj_naive(cm_uniq, stem, verbose=True):
    """Reconstruct a tree from a character matrix with neighbor-joining.

    The matrix is written to ``<stem>phylo.txt``, converted to a relaxed
    PHYLIP file ``<stem>infile.txt`` by the ``binarize_multistate_charmat.py``
    script, and a midpoint-rooted neighbor-joining tree is built from
    identity distances.  Tree nodes are replaced by ``Node`` objects and
    leaves whose character string occurs in the matrix are flagged as
    targets.  Both intermediate files are removed afterwards.

    Parameters
    ----------
    cm_uniq : pandas.DataFrame
        Character matrix; rows are looked up by node name through ``.loc``
        and each row's values are joined with ``"|"``.
    stem : str
        Prefix of the two intermediate files.
    verbose : bool
        Print a progress message when true.

    Returns
    -------
    Cassiopeia_Tree
        Wraps the relabelled network, method ``'neighbor-joining'``.
    """
    if verbose:
        print("Running Neighbor-Joining on " + str(cm_uniq.shape[0]) +
              " Unique Cells")

    # A set gives constant-time membership tests in the final loop.
    cm_lookup = set(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    fn = stem + "phylo.txt"
    infile = stem + "infile.txt"

    cm_uniq.to_csv(fn, sep='\t')

    script = (SCLT_PATH / 'TreeSolver' / 'binarize_multistate_charmat.py')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.  As before, the exit status is not checked.
    subprocess.Popen(
        ["python3.6", str(script), fn, infile, "--relaxed"]).wait()

    aln = AlignIO.read(infile, "phylip-relaxed")

    calculator = DistanceCalculator('identity')
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)

    tree.root_at_midpoint()

    nj_net = Phylo.to_networkx(tree)

    # Replace Bio.Phylo.Clade objects by Node objects.
    rndict = {}
    for n in nj_net:
        if n.name is None:
            rndict[n] = Node('state-node', [])
        elif n.name in cm_uniq.index:
            # Membership is tested on the row index because the row is
            # fetched with .loc; ``in cm_uniq`` would test column labels.
            rndict[n] = Node(n.name, cm_uniq.loc[n.name].values)

    nj_net = nx.relabel_nodes(nj_net, rndict)

    # A target is a leaf whose character string occurs in the matrix.
    for n in nj_net:
        n.is_target = (nj_net.out_degree(n) == 0
                       and n.char_string in cm_lookup)

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining',
                               network=state_tree,
                               name='Cassiopeia_state_tree')

    # Remove the intermediate files without spawning a shell.
    os.remove(infile)
    os.remove(fn)

    return ret_tree
mutation = int(line) edgeLabels[child].append(mutation) line = f.readline().rstrip("\n") leafStates = {} with open(sys.argv[2]) as f: for line in f: s = line.rstrip("\n").split("\t") leafStates[s[0]] = map(int, s[1:]) #print edgeLabels #print leafStates tree.rooted = True network = Phylo.to_networkx(tree) # map vertices to integers vertexIndex = {} index2Vertex = [] for edge in network.edges(): if str(edge[0]) not in vertexIndex: vertexIndex[str(edge[0])] = len(vertexIndex) index2Vertex.append(str(edge[0])) if str(edge[1]) not in vertexIndex: vertexIndex[str(edge[1])] = len(vertexIndex) index2Vertex.append(str(edge[1])) pi = [-1 for i in range(len(vertexIndex))] for edge in network.edges(): pi[vertexIndex[str(edge[1])]] = vertexIndex[str(edge[0])]
def compute_tree(options, mat, names):
    """Cluster ``names`` with UPGMA and write the tree as newick, png and dot.

    ``mat[a][b]`` supplies the distance between names ``a`` and ``b``; only
    the lower triangle (including the diagonal) is read.  Output paths are
    derived from ``tree_path(options)`` by replacing ``newick`` with ``png``
    or ``dot``.
    """
    def _shift(val):
        # tree constructor writes 0-distances as 1s for some reason,
        # so exact 0 and 1 are nudged here and restored further down
        if val == 0.:
            return 1e-10
        if val == 1.:
            return 1.1
        return val

    # oops, convert to biopython (lower triangular) matrix
    matrix = [
        [_shift(float(mat[names[i]][names[j]])) for j in range(i + 1)]
        for i in range(len(names))
    ]
    dm = _DistanceMatrix(names, matrix)

    # upgma tree
    tree = DistanceTreeConstructor().upgma(dm)
    robust_makedirs(os.path.dirname(tree_path(options)))
    Phylo.write(tree, tree_path(options), "newick")

    # png tree -- note : doesn't work in toil
    def leaf_label(x):
        # inner nodes are drawn without a label
        return "" if "Inner" in str(x) else x

    Phylo.draw_graphviz(tree, label_func=leaf_label, node_size=1000,
                        node_shape="s", font_size=10)
    pylab.savefig(tree_path(options).replace("newick", "png"))

    # graphviz: undirected networkx graph with integer node ids; the
    # original node objects move to the "label" attribute
    nxgraph = nx.Graph(Phylo.to_networkx(tree))
    nxgraph = nx.convert_node_labels_to_integers(nxgraph,
                                                 label_attribute="label")

    for node_id in nxgraph.nodes():
        attrs = nxgraph.node[node_id]
        if "Inner" not in str(attrs["label"]):
            attrs["fontsize"] = 18
            continue
        # inner nodes: empty label, practically invisible
        attrs["label"] = "\"\""
        attrs["width"] = 0.001
        attrs["height"] = 0.001

    for src, dst in nxgraph.edges():
        attrs = nxgraph.edge[src][dst]
        # in graphviz, weight means something else, so make it a label
        weight = float(attrs["weight"])
        # undo hack from above
        # NOTE(review): a weight above 1 is first set to 1 and then,
        # by the second test, to 0 -- confirm this is intended.
        if weight > 1:
            weight = 1.
        if weight <= 1e-10 or weight == 1.:
            weight = 0.
        attrs["weight"] = None
        attrs["label"] = "{0:.3g}".format(float(weight) * 100.)
        attrs["fontsize"] = 14
        attrs["len"] = draw_len(weight)

    nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))
# Generating IDs: first comma-separated field of every line after the header.
# The context manager closes the CSV file once it has been read.
with open("work/NLP/Trees/Cosine.csv", 'rt') as file:
    data = file.readlines()
ids = [row.split(',')[0] for row in data[1:]]

tree = Phylo.read("work/NLP/Plotly/small.newick", 'newick')
tree_net = Phylo.to_networkx(tree)

# Spring layout of the tree graph; keep the coordinates in node order.
pos = networkx.spring_layout(tree_net)
pos = list(pos.values())
Xn = [point[0] for point in pos]
Yn = [point[1] for point in pos]
# Draw the Newick tree stored in test.txt twice: as a networkx graph
# (test.png) and as a Bio.Phylo dendrogram (test-phylo.png).
try:
    from cStringIO import StringIO
except ImportError:  # Python 3 has no cStringIO module
    from io import StringIO
import sys
import os

# Prefer the library checkouts that sit next to this project.
sys.path.insert(1, "../../biopython")
sys.path.insert(1, "../../networkx")
sys.path.insert(1, "../../matplotlib")

import matplotlib as mpl
# Select the non-interactive backend before pyplot is imported.
mpl.use('Agg')
import matplotlib.pyplot as plt
import networkx as nx
from Bio import Phylo

# Example of the expected file content:
#treedata = "(A, (B,C), (D,E))"
with open("test.txt") as f:
    handle = StringIO(f.read().rstrip())
tree = Phylo.read(handle, "newick")
# Parenthesised single argument prints the same on Python 2 and 3.
print(tree)

net = Phylo.to_networkx(tree)
nx.draw(net)
plt.savefig("test.png")

Phylo.draw(tree)
plt.savefig("test-phylo.png")
def main():
    """Reconstruct a lineage tree from a character matrix and write newick.

    Takes in a character matrix, an algorithm flag and an output file (see
    the argparse setup below) and writes the tree in newick format to the
    output file.  Every branch also pickles the reconstructed network to
    ``<out stem>.pkl``.

    Raises:
        Exception: if no algorithm flag was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("char_fp", type=str, help="character_matrix")
    parser.add_argument("out_fp", type=str, help="output file name")
    parser.add_argument("-nj", "--neighbor-joining", action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff", type=int, default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument("--time_limit", type=int, default=1500,
                        help="Time limit for ILP convergence")
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal", "-cs", action="store_true",
                        default=False)
    # The max-likelihood branch used to test an undefined name (`alg`), so it
    # was unreachable and running without any algorithm flag raised NameError
    # instead of the explanatory Exception below.  It is now a real flag.
    parser.add_argument("--max-likelihood", "-ml", action="store_true",
                        default=False)
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--max_neighborhood_size", type=int, default=10000)
    args = parser.parse_args()

    char_fp = args.char_fp
    out_fp = args.out_fp
    verbose = args.verbose
    cutoff = args.cutoff
    time_limit = args.time_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size

    # NOTE(review): joining on "" drops every dot, so "a.b.txt" -> "ab" and a
    # name without a dot -> "".  Left as is because output file names depend
    # on it.
    stem = ''.join(char_fp.split(".")[:-1])
    out_stem = "".join(out_fp.split(".")[:-1])

    cm = pd.read_csv(char_fp, sep='\t', index_col=0)
    cm_uniq = cm.drop_duplicates(inplace=False)

    prior_probs = None
    if args.mutation_map != "":
        prior_probs = read_mutation_map(args.mutation_map)

    def build_target_nodes():
        """Return '<charstring>_<cell name>' for every unique row of cm."""
        charstrings = cm_uniq.astype(str).apply(lambda x: '|'.join(x), axis=1)
        # A real list (not a lazy map) so the result is the same on
        # Python 2 and Python 3.
        return [cs + "_" + n for cs, n in zip(charstrings, cm_uniq.index)]

    def parsimony(network):
        """Sum get_edge_length over all edges of the network."""
        return sum(get_edge_length(e[0], e[1]) for e in network.edges())

    def dump_pickle(obj):
        """Pickle obj next to the output file, closing the handle."""
        with open(out_stem + ".pkl", "wb") as handle:
            pic.dump(obj, handle)

    def write_newick(text):
        """Write the newick string to the output file."""
        with open(out_fp, "w") as f:
            f.write(text)

    def name_internal_nodes(network):
        """Give unnamed clades the names internal0, internal1, ..."""
        i = 0
        for n in network:
            if n.name is None:
                n.name = "internal" + str(i)
                i += 1

    if args.greedy:
        target_nodes = build_target_nodes()
        if verbose:
            print('Running Greedy Algorithm on ' + str(len(target_nodes)) +
                  " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes, method="greedy", prior_probabilities=prior_probs)

        # score parsimony
        print("Parsimony: " + str(parsimony(reconstructed_network_greedy)))

        newick = convert_network_to_newick_format(
            reconstructed_network_greedy)
        write_newick(newick)
        dump_pickle(reconstructed_network_greedy)

    elif args.hybrid:
        target_nodes = build_target_nodes()
        if verbose:
            print('Running Hybrid Algorithm on ' + str(len(target_nodes)) +
                  " Cells")
            print('Parameters: ILP on sets of ' + str(cutoff) + ' cells ' +
                  str(time_limit) + 's to complete optimization')

        print("running algorithm...")
        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes,
            method="hybrid",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size)

        if verbose:
            print("Scoring Parsimony...")
        score = parsimony(reconstructed_network_hybrid)
        if verbose:
            print("Parsimony: " + str(score))
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_hybrid)
        newick = convert_network_to_newick_format(
            reconstructed_network_hybrid)
        write_newick(newick)

    elif args.ilp:
        target_nodes = build_target_nodes()
        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes)) +
                  " Unique Cells")
            print("Paramters: ILP allowed " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size)

        # score parsimony
        print("Parsimony: " + str(parsimony(reconstructed_network_ilp)))

        newick = convert_network_to_newick_format(reconstructed_network_ilp)
        write_newick(newick)
        dump_pickle(reconstructed_network_ilp)

    elif args.neighbor_joining:
        cm.drop_duplicates(inplace=True)
        if verbose:
            print("Running Neighbor-Joining on " + str(cm.shape[0]) +
                  " Unique Cells")

        fn = stem + "phylo.txt"
        infile = stem + "infile.txt"
        cm.to_csv(fn, sep='\t')

        # Binarize the multistate matrix into a relaxed-phylip alignment.
        os.system(
            "python2 ~/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile + " --relaxed")
        aln = AlignIO.read(infile, "phylip-relaxed")

        calculator = DistanceCalculator('identity')
        constructor = DistanceTreeConstructor(calculator, 'nj')
        tree = constructor.build_tree(aln)
        tree.root_at_midpoint()
        nj_net = Phylo.to_networkx(tree)

        # convert labels to characters for writing to file
        name_internal_nodes(nj_net)

        # convert labels to strings, not Bio.Phylo.Clade objects
        c2strdict = dict((n, n.name) for n in nj_net.nodes())
        nj_net = nx.relabel_nodes(nj_net, c2strdict)
        nj_net = tree_collapse(nj_net)

        dump_pickle(nj_net)
        newick = convert_network_to_newick_format(nj_net)
        write_newick(newick)

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.camin_sokal:
        # The matrix is re-indexed 0..n-1 before being written out;
        # samples_to_cells maps "s<i>" back to the original cell names.
        cells = cm.index
        samples = [("s" + str(i)) for i in range(len(cells))]
        samples_to_cells = dict(zip(samples, cells))
        cm.index = list(range(len(cells)))

        if verbose:
            print("Running Camin-Sokal on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + 'infile.txt'
        fn = stem + "phylo.txt"
        weights_fn = stem + "weights.txt"
        cm.to_csv(fn, sep='\t')

        os.system(
            "python2 /home/mattjones/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile)

        # Return value is unused; the call is kept because it presumably
        # writes weights_fn -- TODO confirm.
        construct_weights(infile, weights_fn)

        outfile = stem + 'outfile.txt'
        outtree = stem + 'outtree.txt'

        # run phylip mix with camin-sokal: the interactive answers are fed
        # to the program from a temporary responses file.
        responses = "." + stem + ".temp.txt"
        with open(responses, 'w') as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("Y\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout"
        subprocess.call(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "conenseoutfile.txt"

        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout2"
        subprocess.call(cmd, shell=True)

        # The consensus newick may span several lines; join them.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(l.strip() for l in f)

        tree = newick_to_network(newick_str)
        cs_net = tree_collapse(tree)
        cs_net = nx.relabel_nodes(cs_net, samples_to_cells)

        dump_pickle(cs_net)
        newick = convert_network_to_newick_format(cs_net)
        write_newick(newick)

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.max_likelihood:
        if verbose:
            # This message used to say "Camin-Sokal" (copy/paste slip).
            print("Running Maximum Likelihood on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + 'infile.txt'
        fn = stem + "phylo.txt"
        cm.to_csv(fn, sep='\t')

        os.system(
            "python2 /home/mattjones/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile + " --relaxed")
        # FastTree writes its newick output straight to out_fp.
        os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " +
                  out_fp)

        # next(...) works on both Python 2 and 3, unlike the .next() method.
        tree = next(Phylo.parse(out_fp, "newick"))
        ml_net = Phylo.to_networkx(tree)
        name_internal_nodes(ml_net)

        c2strdict = dict((n, str(n)) for n in ml_net.nodes())
        ml_net = nx.relabel_nodes(ml_net, c2strdict)

        dump_pickle(ml_net)

        os.system("rm " + infile)
        os.system("rm " + fn)

    else:
        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal"
        )
# Show the ASCII rendering and save it.  The rendering is computed once and
# the file is closed even if the write fails.
ascii_tree = tree.ascii_art()
print(ascii_tree)
# As before, `tree_file` ends up bound to the (closed) file object rather
# than to the path.
with open(tree_file, 'w+') as tree_file:
    tree_file.write(ascii_tree)

# Neighbor-joining tree as a newick string, echoed and saved.
nws = nj(dm, result_constructor=str)
print(nws)
with open(nws_file, 'w+') as nws_file_l:
    nws_file_l.write(nws)

# Re-read a newick tree with Biopython and convert it to a networkx graph.
bio_tree = Phylo.read("work/NLP/Trees/output_data.txt", 'newick')
tree_net = Phylo.to_networkx(bio_tree)

# Lay out and draw the graph through the networkx agraph bridge.
H = networkx.nx_agraph.to_agraph(tree_net)
H.layout()
H.draw('a.ph')
# Figure 1: data log likelihood per iteration.
plt.figure()
x_upper = n_iters + constraint_add
plt.xlim([0, x_upper])
plt.xlabel("Iterations", fontsize=fontsize)
plt.ylabel("Data Log Likelihood", fontsize=fontsize)
plt.plot(likelihoods)
plt.legend(loc='best', fontsize=12)
plt.savefig('online-likelihoods.png', bbox_inches='tight')

# Figure 2: 2-d plot of a copy of the sampler's tree.
final_tree = sampler.tree.copy()
plt.figure()
plot_tree_2d(final_tree, X, pca)

# Replace each leaf's point with the matching entry of y.
for tree_node in final_tree.dfs():
    if not tree_node.is_leaf():
        continue
    tree_node.point = y[tree_node.point]

# Figure 3: the same tree drawn through Biopython / graphviz.
newick = final_tree.to_newick()
newick_handle = StringIO(newick)
tree = Phylo.read(newick_handle, 'newick')
plt.figure()
Phylo.draw_graphviz(tree, prog='neato')
plt.savefig('tree.png', bbox_inches='tight')

# Persist the tree as newick text (no trailing newline) and as a dot graph.
graph = Phylo.to_networkx(tree)
with open('tree.nwk', 'w') as fp:
    fp.write(newick)
nx.write_dot(graph, 'tree.dot')

plt.show()
def main():
    """Reconstruct a lineage tree from a character matrix and write newick.

    Takes in a character matrix, an algorithm flag and an output file (see
    the argparse setup below) and writes the tree in newick format to the
    output file.  Every branch also pickles the reconstructed tree/network
    to ``<out stem>.pkl``; the hybrid and ilp branches additionally save a
    diagnostic plot to ``<out stem>_potentialgraphsizes.pdf``.

    Raises:
        Exception: if the greedy missing-data mode is not recognized, if
            weighted ILP is requested without a mutation map, or if no
            algorithm flag was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("char_fp", type=str, help="character_matrix")
    parser.add_argument("out_fp", type=str, help="output file name")
    parser.add_argument("-nj", "--neighbor-joining", action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted", action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff", type=int, default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit", type=int, default=1500,
                        help="Time limit for ILP convergence")
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal", "-cs", action="store_true",
                        default=False)
    # The max-likelihood branch used to test an undefined name (`alg`), so it
    # was unreachable and running without any algorithm flag raised NameError
    # instead of the explanatory Exception below.  It is now a real flag.
    parser.add_argument("--max-likelihood", "-ml", action="store_true",
                        default=False)
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--max_neighborhood_size", type=int, default=10000)
    parser.add_argument("--weighted_ilp", "-w", action="store_true",
                        default=False)
    parser.add_argument("--greedy_min_allele_rep", type=float, default=1.0)
    parser.add_argument("--fuzzy_greedy", action="store_true", default=False)
    parser.add_argument("--multinomial_greedy", action="store_true",
                        default=False)
    # type=int so a value given on the command line has the same type as
    # the default (it used to arrive as a string).
    parser.add_argument("--num_neighbors", default=10, type=int)
    parser.add_argument("--num_alternative_solutions", default=100, type=int)
    parser.add_argument("--greedy_missing_data_mode", default="lookahead",
                        type=str)
    parser.add_argument("--greedy_lookahead_depth", default=3, type=int)
    args = parser.parse_args()

    char_fp = args.char_fp
    out_fp = args.out_fp
    verbose = args.verbose

    # --cutoff is read either as an LCA distance or as a cell count.
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None

    time_limit = args.time_limit
    num_threads = args.num_threads
    n_neighbors = args.num_neighbors
    num_alt_soln = args.num_alternative_solutions
    max_neighborhood_size = args.max_neighborhood_size
    missing_data_mode = args.greedy_missing_data_mode
    lookahead_depth = args.greedy_lookahead_depth
    if missing_data_mode not in ["knn", "lookahead", "avg", "modified_avg"]:
        raise Exception("Greedy missing data mode not recognized")

    # NOTE(review): joining on "" drops every dot, so "a.b.txt" -> "ab" and a
    # name without a dot -> "".  Left as is because output file names depend
    # on it.
    stem = "".join(char_fp.split(".")[:-1])
    out_stem = "".join(out_fp.split(".")[:-1])

    cm = pd.read_csv(char_fp, sep="\t", index_col=0, dtype=str)
    cm_uniq = cm.drop_duplicates(inplace=False)

    prior_probs = None
    if args.mutation_map != "":
        prior_probs = read_mutation_map(args.mutation_map)

    weighted_ilp = args.weighted_ilp
    if prior_probs is None and weighted_ilp:
        raise Exception(
            "If you'd like to use weighted ILP reconstructions, you need to provide a mutation map (i.e. prior probabilities)"
        )

    greedy_min_allele_rep = args.greedy_min_allele_rep
    fuzzy = args.fuzzy_greedy
    probabilistic = args.multinomial_greedy

    def build_target_nodes():
        """Wrap every unique row of the character matrix in a Node."""
        return list(cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1))

    def dump_pickle(obj):
        """Pickle obj next to the output file, closing the handle."""
        with open(out_stem + ".pkl", "wb") as handle:
            pic.dump(obj, handle)

    def write_newick(text):
        """Write the newick string to the output file."""
        with open(out_fp, "w") as f:
            f.write(text)

    def rooted_parsimony(net):
        """Sum mutation lengths over a DFS from the first in-degree-0 node."""
        root = [n for n in net if net.in_degree(n) == 0][0]
        return sum(e[0].get_mut_length(e[1])
                   for e in nx.dfs_edges(net, source=root))

    def plot_potential_graph_sizes(potential_graph_sizes):
        """Plot one curve per entry and save the diagnostic PDF."""
        plt.figure(figsize=(10, 10))
        for i in range(len(potential_graph_sizes)):
            # Best effort: entries that cannot be plotted are skipped.
            # `except Exception` (not a bare except) so Ctrl-C still works.
            try:
                sizes = potential_graph_sizes[i]
                x = list(sizes.keys())
                y = [sizes[k] for k in x]
                plt.plot(x, y)
            except Exception:
                continue
        plt.xlabel("LCA Distance")
        plt.ylabel("Size of Potential Graph")
        plt.savefig(out_stem + "_potentialgraphsizes.pdf")

    if args.greedy:
        target_nodes = build_target_nodes()
        if verbose:
            print("Read in " + str(cm.shape[0]) + " Cells")
            print("Running Greedy Algorithm on " + str(len(target_nodes)) +
                  " Unique States")

        reconstructed_network_greedy, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="greedy",
            prior_probabilities=prior_probs,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )
        net = reconstructed_network_greedy.get_network()

        dump_pickle(reconstructed_network_greedy)
        write_newick(reconstructed_network_greedy.get_newick())

        # score parsimony
        print("Parsimony: " + str(rooted_parsimony(net)))

    elif args.hybrid:
        target_nodes = build_target_nodes()
        if verbose:
            print("Running Hybrid Algorithm on " + str(len(target_nodes)) +
                  " Cells")
            if lca_mode:
                print(
                    "Parameters: ILP on sets of cells with a maximum LCA distance of "
                    + str(lca_cutoff) + " with " + str(time_limit) +
                    "s to complete optimization")
            else:
                print("Parameters: ILP on sets of " + str(cell_cutoff) +
                      " cells with " + str(time_limit) +
                      "s to complete optimization")

        print("running algorithm...")
        reconstructed_network_hybrid, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            maximum_alt_solutions=num_alt_soln,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )
        net = reconstructed_network_hybrid.get_network()

        if verbose:
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_hybrid)
        write_newick(reconstructed_network_hybrid.get_newick())

        ## plot out diagnostic potential graph sizes
        plot_potential_graph_sizes(potential_graph_sizes)

        # score parsimony (over all edges here, not a DFS from the root)
        score = 0
        for e in net.edges():
            score += e[0].get_mut_length(e[1])
        print("Parsimony: " + str(score))

    elif args.ilp:
        target_nodes = build_target_nodes()
        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes)) +
                  " Unique Cells")
            print("Paramters: ILP allowed " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            maximum_alt_solutions=num_alt_soln,
        )
        net = reconstructed_network_ilp.get_network()

        # score parsimony
        print("Parsimony: " + str(rooted_parsimony(net)))

        newick = reconstructed_network_ilp.get_newick()
        if verbose:
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_ilp)
        write_newick(newick)

        plot_potential_graph_sizes(potential_graph_sizes)

    elif args.neighbor_joining:
        ret_tree = run_nj_naive(cm_uniq, stem, verbose)
        dump_pickle(ret_tree)
        write_newick(ret_tree.get_newick())

    elif args.neighbor_joining_weighted:
        ret_tree = run_nj_weighted(cm_uniq, prior_probs, verbose)
        dump_pickle(ret_tree)
        write_newick(ret_tree.get_newick())

    elif args.camin_sokal:
        ret_tree = run_camin_sokal(cm_uniq, stem, verbose)
        dump_pickle(ret_tree)
        write_newick(convert_network_to_newick_format(ret_tree.get_network()))

    elif args.max_likelihood:
        if verbose:
            print("Running Maximum Likelihood on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + "infile.txt"
        fn = stem + "phylo.txt"
        cm.to_csv(fn, sep="\t")

        # Binarize the multistate matrix into a relaxed-phylip alignment.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        subprocess.call(cmd, shell=True)

        # FastTree writes its newick output straight to out_fp.
        os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " +
                  out_fp)

        # Generators have no .next() method on Python 3; use next().
        tree = next(Phylo.parse(out_fp, "newick"))
        ml_net = Phylo.to_networkx(tree)

        # Give unnamed clades the names internal0, internal1, ...
        i = 0
        for n in ml_net:
            if n.name is None:
                n.name = "internal" + str(i)
                i += 1

        # Relabel Clade objects by their string form.
        c2strdict = {n: str(n) for n in ml_net.nodes()}
        ml_net = nx.relabel_nodes(ml_net, c2strdict)

        dump_pickle(ml_net)

        os.system("rm " + infile)
        os.system("rm " + fn)

    else:
        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal"
        )
def main():
    """Reconstruct a tree from the leaves of a pickled "true" network.

    Reads the network at ``netfp``, de-duplicates its leaves by character
    string, runs the algorithm selected on the command line (see the
    argparse setup below) and pickles the result to ``--out_fp``.  Without
    ``--out_fp`` the path is the input file name with "true" replaced by a
    per-algorithm tag.

    Raises:
        Exception: if no algorithm flag was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj", "--neighbor-joining", action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted", action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff", type=int, default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit", type=int, default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal", "-cs", action="store_true",
                        default=False)
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # type=int: this was type=str with an int default, so the solver got a
    # string whenever the option was given on the command line.
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp", type=str, default=None,
                        help="optional output file")
    parser.add_argument("--seed", type=int, default=None,
                        help="Random seed for ILP solver")
    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # `cutoff` was used further down without ever being defined: NameError
    # in the hybrid branch with --verbose and always in the ilp branch.
    cutoff = args.cutoff

    # --cutoff is read either as an LCA distance or as a cell count.
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None

    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    prior_probs = None
    if args.mutation_map != "":
        # NOTE: pickle executes code on load; only use trusted mutation maps.
        with open(args.mutation_map, "rb") as handle:
            prior_probs = pic.load(handle)

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])
    # Prefix for temporary files; unlike `stem` it drops the dots.
    flat_stem = "".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)
    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    # Keep the first leaf seen for every distinct character string.
    target_nodes = get_leaves_of_tree(true_network)
    target_nodes_uniq = []
    seen_charstrings = set()
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.add(t.char_string)
            target_nodes_uniq.append(t)

    def dump_result(obj, tag):
        """Pickle obj to --out_fp, or to the input name with true -> tag."""
        path = outfp if outfp is not None else name.replace("true", tag)
        with open(path, "wb") as handle:
            pic.dump(obj, handle)

    def charstring_lookup(frame):
        """Map each row's '|'-joined character string to its index label."""
        charstrings = frame.apply(
            lambda x: "|".join([str(k) for k in x.values]), axis=1)
        return dict(zip(list(charstrings), frame.index.values))

    def binarize(fn, infile, relaxed):
        """Run the binarize_multistate_charmat.py script on fn -> infile."""
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        if relaxed:
            cmd += " --relaxed"
        subprocess.call(cmd, shell=True)

    if args.greedy:
        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq, method="greedy",
            prior_probabilities=prior_probs)
        dump_result(reconstructed_network_greedy[0], "greedy")

    elif args.hybrid:
        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )
        dump_result(reconstructed_network_hybrid[0], "hybrid")

    elif args.ilp:
        if verbose:
            # This message used to announce the Hybrid algorithm.
            print("Running ILP Algorithm on " + str(len(target_nodes_uniq)) +
                  " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        # NOTE(review): the hybrid branch passes hybrid_cell_cutoff /
        # hybrid_lca_cutoff; confirm the solver still accepts
        # hybrid_subset_cutoff.
        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )
        dump_result(reconstructed_network_ilp[0], "ilp")

    elif args.neighbor_joining:
        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = flat_stem + "infile.txt"
        fn = flat_stem + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)
        binarize(fn, infile, relaxed=True)

        aln = AlignIO.read(infile, "phylip-relaxed")
        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")
        tree = constructor.build_tree(aln)
        tree.root_at_midpoint()
        nj_net = Phylo.to_networkx(tree)

        # Replace Bio.Phylo clades with Node objects; unnamed (internal)
        # clades all get the name "state-node".
        rndict = {}
        for n in nj_net:
            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])
        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)
        cm_lookup = charstring_lookup(cm)

        nj_net = fill_in_tree(nj_net, cm)
        nj_net = tree_collapse(nj_net)

        # Flag nodes whose character string matches an input cell.
        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        dump_result(nj_net, "nj")

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.neighbor_joining_weighted:
        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids

        dm = sp.spatial.distance.squareform(dm)
        dm = DistanceMatrix(dm, ids)
        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)
        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = charstring_lookup(cm_uniq)
        for n in nj_net:
            n.is_target = n.char_string in cm_lookup

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        dump_result(nj_net, "nj_weighted")

    elif args.camin_sokal:
        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Leaves are renamed "0", "1", ... before being written out;
        # samples_to_cells maps "s<i>" back to the original names.
        samples_to_cells = {}
        indices = []
        for i, n in enumerate(target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = flat_stem + "_cs_infile.txt"
        fn = flat_stem + "_cs_phylo.txt"
        weights_fn = flat_stem + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)
        binarize(fn, infile, relaxed=False)

        # Return value is unused; the call is kept because it presumably
        # writes weights_fn -- TODO confirm.
        construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"

        # run phylip mix with camin-sokal: the interactive answers are fed
        # to the program from a temporary responses file.
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("W\n")
            FH.write("Y\n")
            FH.write(weights_fn + "\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        subprocess.call(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        subprocess.call(cmd, shell=True)

        # The consensus newick may span several lines; join them.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(l.strip() for l in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        # Restore the original cell names.
        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)
        cs_net = tree_collapse2(cs_net)

        cm_lookup = charstring_lookup(cm)
        for n in cs_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        dump_result(cs_net, "cs")

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    else:
        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )
"[Required] Location of the file containing nodes of interest for network analysis" ) # Parse options into variables (options, args) = parser.parse_args() treepath = options.treepath namepath = options.namepath if treepath is None or namepath is None: print "Invalid options" sys.exit(1) # load Tree trees = Phylo.parse(treepath, 'newick') Tree = trees.next() tree = Phylo.to_networkx(Tree) # load names f = open(namepath) lines = f.readlines() names = [] for line in lines: if line: names.append(line.strip()) # Analysis nodes = tree.nodes() leaves = [] for node in nodes: if node.name is not None:
def test_to_networkx(self):
    """Tree to Graph conversion, if networkx is available."""
    # Parse the phyloXML example, convert it, and check the node count.
    graph = Phylo.to_networkx(Phylo.read(EX_DOLLO, 'phyloxml'))
    node_count = len(graph.nodes())
    self.assertEqual(node_count, 659)
def createHashedPTreeGraph(tree,
                           maxLeafHashValue=2000000000000000000000,
                           refTree=None,
                           treeName="anonymous",
                           hashAlgorithm="pow"):
    """Build a DFS graph of ``tree`` and attach a hash to every node.

    Steps performed:

    1. Every clade gets an integer ``id`` (pre-order, starting at 0).
    2. The tree is converted with ``Phylo.to_networkx`` and turned into a
       directed DFS tree rooted at ``tree.clade``.
    3. Every leaf (a clade with no children) gets a hash: ``2**k`` for the
       k-th leaf visited (k starts at 1) when ``hashAlgorithm == "pow"``, or
       ``random.randint(1, maxLeafHashValue + 1)`` when it is ``"rand"``.
    4. Every remaining node gets the combination of its children's hashes:
       bitwise OR for ``"pow"``, XOR for ``"rand"``.

    Args:
        tree: a Bio.Phylo tree; its clades are mutated (``id`` is assigned).
        maxLeafHashValue: bound used for randomly drawn hashes.
        refTree: optional result of a previous call to this function. When
            given, ``standarizeGraphLabeling`` is applied to the new graph
            and any named leaf that also occurs (by name) in
            ``refTree["network"]`` reuses the hash stored for it in
            ``refTree["nodeToHashMapping"]``.
        treeName: label used only in log messages.
        hashAlgorithm: ``"pow"`` or ``"rand"``. Other values are not
            validated here; a leaf then only gets a hash if one is found in
            ``refTree``.

    Returns:
        dict with keys ``"network"`` (the DFS graph), ``"nodeToHashMapping"``
        (clade id -> ``{"clade", "hash"}``) and ``"hashToNodeMapping"``
        (hash -> the same record).
    """
    print("[INFO] Constructing hashed PTree graph via networkx for {} tree".
          format(treeName))
    if refTree is None:
        print("[INFO] Using purely new node ids for nodes")
    else:
        print("[INFO] Using reference tree ids for nodes")

    # To each node we assign ids that are integers starting from 0
    def assignPhyloIds(clade, nextFreeId=0):
        clade.id = nextFreeId
        nextFreeId = nextFreeId + 1
        for child in clade:
            nextFreeId = assignPhyloIds(child, nextFreeId)
        return nextFreeId

    print("[INFO] Assigning random leafs hashes in range 0 to {}".format(
        maxLeafHashValue))
    assignPhyloIds(tree.clade)
    net = Phylo.to_networkx(tree)

    # Create directed DFS traversal graph for tree
    startingNode = tree.clade
    dfsTree = nx.dfs_tree(net, source=startingNode)
    if refTree is not None:
        standarizeGraphLabeling(dfsTree, refTree["network"])

    # Assign hashes to all leafs
    leafId = 1
    nodeToHashMapping = {}
    hashToNodeMapping = {}
    for node in dfsTree.nodes():
        if len(node) > 0:
            # Clade has children: handled by recAssignHashes below.
            continue
        if hashAlgorithm == "pow":
            leafHash = 2**leafId
        elif hashAlgorithm == "rand":
            # The original stored this value in ``nodeHash``, leaving
            # ``leafHash`` unbound (or stale from the previous leaf).
            leafHash = random.randint(1, maxLeafHashValue + 1)
        leafId = leafId + 1
        if refTree is not None and node.name is not None:
            # A list comprehension replaces the Python-2-only
            # itertools.ifilter and works on both major versions.
            foundNodes = [
                refNode for refNode in refTree["network"].nodes()
                if refNode.name == node.name
            ]
            if foundNodes:
                leafHash = refTree["nodeToHashMapping"][
                    foundNodes[0].id]["hash"]
        hashObj = {"clade": node, "hash": leafHash}
        nodeToHashMapping[node.id] = hashObj
        hashToNodeMapping[leafHash] = hashObj

    print("[INFO] Generating hashes for the rest of nodes")

    # Function that recursively generates hashes for nodes from
    # hashes of the children nodes (children are processed first).
    def recAssignHashes(node):
        for child in nx.neighbors(dfsTree, node):
            recAssignHashes(child)
        if node.id in nodeToHashMapping:
            return
        nodeHash = None
        for child in nx.neighbors(dfsTree, node):
            if child.id not in nodeToHashMapping:
                continue
            childHash = nodeToHashMapping[child.id]["hash"]
            if nodeHash is None:
                nodeHash = childHash
            elif hashAlgorithm == "pow":
                nodeHash = nodeHash | childHash
            elif hashAlgorithm == "rand":
                nodeHash = nodeHash ^ childHash
        if nodeHash is None:
            # No hashed child was found: fall back to a random hash.
            nodeHash = random.randint(1, maxLeafHashValue + 1)
        hashObj = {"clade": node, "hash": nodeHash}
        nodeToHashMapping[node.id] = hashObj
        hashToNodeMapping[nodeHash] = hashObj

    recAssignHashes(startingNode)
    return {
        "network": dfsTree,
        "nodeToHashMapping": nodeToHashMapping,
        "hashToNodeMapping": hashToNodeMapping
    }
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare trees from muscle and structural alignments and plot them.

    Alignments of both kinds are fetched through ``mem.getOrSet`` (computed
    by ``setAlignments`` on failure). For every pair of alignments a tree is
    built with ``phyml.tree(..., bionj=True)``, the pairwise terminal
    distances of both trees are written into matrices indexed by the
    position of each record in the muscle alignment, and
    ``tree_similarity`` is called twice (``k = n - 1`` and ``k = 6``).
    Finally both trees are drawn as graphs into one figure that is saved as
    a PostScript file under ``figs/gpm2/``.

    Args:
        seqs, profiles, run_id: forwarded to ``setAlignments``; ``run_id``
            is also used in cache registers and output file names.
        reset: forwarded to ``mem.getOrSet`` / ``setAlignments``.

    Returns:
        None. The results are the figures written to disk and whatever
        ``tree_similarity`` produces.
    """
    # Single-argument print calls behave the same on Python 2 and 3.
    print('computing alignments...')
    print(' ...using muscle')
    malis, mrefs, mpairs = mem.getOrSet(
        setAlignments,
        **mem.rc({},
                 seqs = seqs, profiles = profiles,
                 run_id = run_id, ali_type = 'muscle',
                 reset = reset,
                 on_fail = 'compute',
                 register = 'tuali_musc_{0}'.format(run_id)))
    print(' ...using cmalign.')
    salis, srefs, spairs = mem.getOrSet(
        setAlignments,
        **mem.rc({},
                 seqs = seqs, profiles = profiles,
                 run_id = run_id, ali_type = 'struct',
                 reset = reset,
                 on_fail = 'compute',
                 register = 'tuali__struct_{0}'.format(run_id)))

    print(' ...making trees.')
    for idx, (m, s) in enumerate(zip(malis, salis)):
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Row/column index of every record, taken from the muscle alignment.
        maps = dict((elt.id, i) for i, elt in enumerate(m))
        mdists = zeros((len(maps), len(maps)))
        sdists = zeros((len(maps), len(maps)))

        # Terminals are fetched once per tree instead of once per iteration.
        mleaves = mtree.get_terminals()
        sleaves = stree.get_terminals()
        for n1 in mleaves:
            for n2 in mleaves:
                mdists[maps[n1.name], maps[n2.name]] = mtree.distance(n1, n2)
        for n1 in sleaves:
            for n2 in sleaves:
                sdists[maps[n1.name], maps[n2.name]] = stree.distance(n1, n2)

        # The original passed ``len(sdists - 1)``, which subtracts 1 from
        # every matrix entry and then takes the (unchanged) length; the
        # subtraction was evidently meant to apply to the length.
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = len(sdists) - 1)
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = 6)

        f = myplots.fignum(4, (8, 10))
        ct = mycolors.getct(len(mleaves))

        import networkx
        for t, sp, ttype in zip([mtree, stree], [211, 212],
                                ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # Layout is computed on an integer-labelled copy and mapped back
            # onto the clade nodes of G.
            Gi = networkx.convert_node_labels_to_integers(
                G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            networkx.draw(
                G, posn,
                labels = dict((n, '') for n in G.nodes()),
                # Only nodes whose name is a known record get a visible marker.
                node_size = [100 if n.name in maps else 0
                             for n in G.nodes()],
                width = 1, edge_color = 'black',
                ax = a,
                node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0, 1], xycoords = 'axes fraction', va = 'top',
                       xytext = [10, 0], textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(
                           t.total_branch_length()),
                       [1, 0], xycoords = 'axes fraction', ha = 'right',
                       xytext = [-10, 10], textcoords = 'offset pixels')

        datafile = cfg.dataPath(
            'figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(
                run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
reconstructed_network = pic.load(open(reconstructed_fp, "rb"), encoding="latin1") nodes = [n for n in reconstructed_network.nodes()] encoder = dict(zip(nodes, map(lambda x: x.split("_")[0], nodes))) reconstructed_network = nx.relabel_nodes(reconstructed_network, encoder) else: k = map(lambda x: "s" + x.split("_")[-1], target_nodes_original_network) s_to_char = dict(zip(k, target_nodes)) char_to_s = dict(zip(target_nodes, k)) reconstructed_tree = next(Phylo.parse(reconstructed_fp, "newick")) reconstructed_tree.rooted = True reconstructed_network = Phylo.to_networkx(reconstructed_tree) i = 1 for n in reconstructed_network: if n.name is None: n.name = "i" + str(i) i += 1 # convert labels to strings, not Bio.Phylo.Clade objects c2str = map(lambda x: x.name, reconstructed_network.nodes()) c2strdict = dict(zip(reconstructed_network.nodes(), c2str)) reconstructed_network = nx.relabel_nodes(reconstructed_network, c2strdict) # convert labels to characters for triplets correct analysis reconstructed_network = nx.relabel_nodes(reconstructed_network, s_to_char)
from Bio import Phylo import subprocess, networkx, re from functools import reduce from calculations.python.paths import * #Phylo incorrectly parses the gisaid_china.MCC.trees file, so we modify it, assume that EPI_ISL_\d+ is an identifier subprocess.check_output( f'sed -E "s/h\S+EPI_ISL_/EPI_ISL_/" {data_path(TREE_UNFILTERED)} | sed -E "s/\|\S+[^,]//" > {data_path(TREE_FILTERED)}', shell=True) trees = Phylo.parse(data_path(TREE_FILTERED), 'nexus') tree = trees.__next__() net = Phylo.to_networkx(tree).to_undirected() #descendants_at_level[i] keeps a dict {node: nodes that can be reached with exactly i steps} descendants_at_level = [{node: {node} for node in net.nodes}] LEVEL = 2 for i in range(LEVEL): prev_level = descendants_at_level[i] descendants_at_level.append({ node: reduce(set.union, (set(net.neighbors(n)) for n in prev_level[node])) for node in net.nodes }) #dict actually used to generate list of genomes to compare d = { node: reduce(set.union, (descendants_at_level[i][node] for i in range(LEVEL + 1))) for node in net.nodes
def make_header(tree_file='example.tree',
                alignment_file='example.nexus',
                template_file='eigen.j2',
                header_file='eigen.hpp'):
    """Render the C++ header for a tree/alignment pair and return run data.

    The alignment is read as NEXUS and the tree as rooted Newick. Tree nodes
    are renumbered so that named (tip) nodes take ids ``0 .. n-1`` and
    unnamed nodes take ids ``n .. 2n-2``, in DFS post-order. Per-tip partial
    matrices are built with a one-hot encoding of ``a, c, t, g`` (``-`` maps
    to all ones), and the Jinja2 template is rendered into ``header_file``.

    Args:
        tree_file: path of the Newick tree.
        alignment_file: path of the NEXUS alignment.
        template_file: path of the Jinja2 template for the header.
        header_file: path the rendered header is written to.

    Returns:
        dict with keys ``'S'`` (number of tips), ``'L'`` (number of sites),
        ``'map'`` (1-based ``(node, parent)`` array in DFS pre-order),
        ``'rate'`` and ``'lower_root'``.

    Raises:
        AssertionError: if the sequences do not all have the same length.
    """
    tip_data = SeqIO.to_dict(SeqIO.parse(alignment_file, "nexus"))
    NUM_SITES = len(next(iter(tip_data.values())))
    assert all(len(v) == NUM_SITES for v in tip_data.values())

    ## Tree parsing
    G = Phylo.to_networkx(Phylo.read(tree_file, 'newick', rooted=True))
    print(G.nodes())
    n = len(tip_data)
    leaves = iter(range(n))
    interior = iter(range(n, 2 * n - 1))
    node_remap = {}
    tip_remap = {}
    for c in nx.dfs_postorder_nodes(G):
        if c.name is not None:
            # Named clade: a tip. Keep its sequence under the new id so the
            # partials below line up with the renumbered nodes.
            node_id = next(leaves)
            node_remap[c] = node_id
            tip_remap[node_id] = tip_data[c.name]
        else:
            node_remap[c] = next(interior)
    G = nx.relabel_nodes(G, mapping=node_remap)

    postorder = list(nx.dfs_postorder_nodes(G))
    preorder_map = np.empty((len(G), 2), dtype=int)
    preorder_map[:, 0] = list(nx.dfs_preorder_nodes(G))
    for i in range(len(G)):
        parents = list(G.pred[preorder_map[i, 0]])
        # A node without a predecessor (the root) gets parent 0.
        preorder_map[i, 1] = parents[0] if parents else 0
    preorder_map += 1  # shift node ids and parents to 1-based

    # the i-th entry of child_parent is the parent of node i (-1 for none)
    child_parent = [None] * len(G)
    for k in G.pred:
        child_parent[k] = list(G.pred[k])[0] if G.pred[k] else -1

    ## Rate matrix
    mu = .25
    Q = np.full((4, 4), mu)
    np.fill_diagonal(Q, -3 * mu)
    pi = np.ones(4) / 4

    ## Initial partials
    encoding = dict(zip('actg', np.eye(4)))
    encoding['-'] = np.ones(4)
    print(tip_data)
    tip_partials = []
    sparse_tip_partials = []
    for i in range(n):
        v = tip_remap[i]
        tip_partials.append(
            np.transpose([encoding[vv.lower()] for vv in v]))
        sp = scipy.sparse.coo_matrix(tip_partials[-1])
        # Materialize the triples: a bare zip() is a one-shot iterator on
        # Python 3 and would be empty on any second pass in the template.
        sparse_tip_partials.append(list(zip(sp.row, sp.col, sp.data)))
    print(tip_partials)

    ## Write C++ header file
    # Read the template inside a context manager so the handle is closed.
    with open(template_file, 'rt') as template_handle:
        hpp = jinja2.Template(template_handle.read())
    with open(header_file, "wt") as out:
        s = hpp.render(child_parent=child_parent,
                       postorder=postorder,
                       Q=Q,
                       pi=pi,
                       tip_partials=tip_partials,
                       sparse_tip_partials=sparse_tip_partials,
                       num_sites=NUM_SITES)
        out.write(s)

    data = {
        'S': n,
        'L': NUM_SITES,
        'map': preorder_map,
        'rate': 1.0,
        'lower_root': 0.0
    }
    return data