def make_tree_figure(wanted_seqs, trop_dict, tree_file):
    """Draw the phylogenetic tree of ``wanted_seqs`` with tropism colours.

    Parameters
    ----------
    wanted_seqs
        Alignment forwarded unchanged to ``get_pairwise_distances``.
    trop_dict
        Mapping from leaf name to a truthy/falsy flag.  Truthy leaves are
        drawn green, falsy leaves red.  Every named node of the tree must be
        a key, otherwise ``KeyError`` is raised.
    tree_file
        Path of the Newick file that ``get_pairwise_distances`` is asked to
        write and that is read back here.

    The figure is drawn on the current matplotlib axes; nothing is returned.
    """
    # Called only so that the tree is written to ``tree_file``; the returned
    # value is not needed for the drawing.
    get_pairwise_distances(wanted_seqs, tree_file=tree_file)
    with open(tree_file) as handle:
        tree = Phylo.read(handle, 'newick')
    net = Phylo.to_networkx(tree)

    # Graph nodes are Clade objects; relabel them with strings so they can be
    # looked up in ``trop_dict``.  Unnamed (internal) clades get a synthetic
    # 'Clade-<n>' label.
    node_mapping = {}
    clade = 1
    for node in net.nodes():
        if node.name is None:
            node_mapping[node] = 'Clade-%i' % clade
            clade += 1
        else:
            node_mapping[node] = node.name
    new_net = networkx.relabel_nodes(net, node_mapping)

    # Exactly one colour per node, in node order: white for internal clades,
    # green/red for leaves depending on their tropism flag.
    colors = []
    for node in new_net.nodes():
        if node.startswith('Clade'):
            colors.append('w')
        elif trop_dict[node]:
            colors.append('g')
        else:
            colors.append('r')

    pos = networkx.graphviz_layout(new_net, 'twopi')
    networkx.draw_networkx(new_net, pos, with_labels=False, node_color=colors)
Esempio n. 2
0
def nkew():
    with open("rosalind_nkew.txt") as f:
        lines = map(lambda l: l.strip(), f.readlines())
        lines = [line for line in lines if line]

    for i in xrange(len(lines)/2):
        handle = StringIO.StringIO(lines[2*i])
        tree = Phylo.read(handle, "newick")
        names = lines[2*i+1].split()

        t =  Phylo.to_networkx(tree)
        # create weighted tree
        wt = networkx.Graph()
        for node in t.nodes():
            wt.add_node(node)
        for key,vals in t.edge.items():
            for val in vals:
                wt.add_edge(key, val, weight=vals[val]['weight'])

        #pos = networkx.spring_layout(wt)
        #networkx.draw(wt, pos)
        #networkx.draw_networkx_edge_labels(wt, pos)
        #plt.show()
        
        na = [node for node in wt.nodes() if node.name == names[0]][0]
        nb = [node for node in wt.nodes() if node.name == names[1]][0]

        print int(networkx.shortest_path_length(wt, na, nb, 'weight')),
    print ""
Esempio n. 3
0
def get_tree(tree_file, name_tree):
    """Read a Newick tree and return its edges with their lengths.

    Parameters
    ----------
    tree_file
        Path of the Newick file providing topology and numeric values.  Each
        clade's ``confidence`` is copied into its ``branch_length``.
    name_tree
        Path of a second Newick file whose terminals supply the leaf names,
        matched to the terminals of ``tree_file`` by position.

    Returns
    -------
    (edge_to_blen, edge_list)
        ``edge_to_blen`` maps ``(parent_name, child_name)`` to the edge
        ``'weight'`` reported by ``Phylo.to_networkx``.  ``edge_list`` holds
        the same keys sorted by the integer following the first character of
        the parent name (internal nodes are named ``N0``, ``N1``, ...).

    Raises
    ------
    IndexError
        If ``name_tree`` has fewer terminals than ``tree_file``.
    """
    # Close both files as soon as they are parsed.
    with open(tree_file, 'r') as tree_handle:
        tree = Phylo.read(tree_handle, "newick")
    with open(name_tree, 'r') as name_handle:
        tree_name = Phylo.read(name_handle, "newick")

    # Number the internal nodes in get_nonterminals() order.
    for index, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(index)
        clade.branch_length = clade.confidence

    # Fetch both terminal lists once instead of once per iteration.
    name_terminals = tree_name.get_terminals()
    for index, clade in enumerate(tree.get_terminals()):
        clade.branch_length = clade.confidence
        clade.name = name_terminals[index].name

    tree_phy = tree.as_phyloxml(rooted='True')
    tree_nx = Phylo.to_networkx(tree_phy)

    # data=True exposes the branch length as the 'weight' edge attribute
    edge_to_blen = {}
    for parent, child, data in tree_nx.edges(data=True):
        edge_to_blen[(parent.name, child.name)] = data['weight']

    # sorted() works on both Python 2 and 3, unlike keys().sort()
    edge_list = sorted(edge_to_blen, key=lambda edge: int(edge[0][1:]))

    return edge_to_blen, edge_list
def get_tree(newicktree):
    """Return the leaf names and the out-group branch of a Newick tree.

    Internal nodes are renamed ``N0``, ``N1``, ... in ``get_nonterminals()``
    order.  The result is ``(leaves, out_group_branch)``: the names of all
    degree-1 nodes, and the first edge that leaves ``N0`` towards a node
    whose name (minus its first character) is not purely digits.

    Raises ``AssertionError`` when the number of internal branches is not one
    less than the number of internal nodes, and ``IndexError`` when no
    out-group branch exists.
    """
    tree = Phylo.read(newicktree, "newick")
    # set node number for nonterminal nodes and specify root node
    for index, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(index)
    tree_nx = Phylo.to_networkx(tree.as_phyloxml(rooted='True'))

    # data=True exposes the branch length as the 'weight' edge attribute
    T = nx.DiGraph()
    edge_to_blen = {}
    for parent, child, data in tree_nx.edges(data=True):
        edge = (parent.name, child.name)
        T.add_edge(*edge)
        edge_to_blen[edge] = data['weight']

    # leaves are the nodes touched by exactly one edge
    leaves = set()
    for vertex, degree in T.degree().items():
        if degree == 1:
            leaves.add(vertex)

    n_edges = len(edge_to_blen)
    n_leaves = n_edges / 2 + 1
    n_internal = n_leaves - 1

    def joins_internal_to_leaf(edge):
        # parent looks like 'N<digits>', child does not end in pure digits
        head, tail = edge
        return (head[0] == 'N' and str.isdigit(head[1:])
                and not str.isdigit(tail[1:]))

    leaf_branch = [edge for edge in edge_to_blen.keys()
                   if joins_internal_to_leaf(edge)]
    out_group_candidates = [edge for edge in leaf_branch
                            if edge[0] == 'N0' and not str.isdigit(edge[1][1:])]
    out_group_branch = out_group_candidates[0]
    internal_branch = [edge for edge in edge_to_blen.keys()
                       if edge not in leaf_branch]
    # one fewer internal branch than internal nodes
    assert len(internal_branch) == n_internal - 1

    return list(leaves), out_group_branch
Esempio n. 5
0
def iterate(n_iters):
    """Run the sampler for ``n_iters`` steps, then plot and export the result.

    Relies on names defined outside this function: ``sampler``,
    ``likelihoods``, ``fontsize``, ``X``, ``pca`` and ``y``.

    Files written: 'unconstrained-likelihoods.png', 'unconstrained-tree.png',
    'unconstrained-tree.nwk' and 'unconstrained-tree.dot'.  Ends by calling
    ``plt.show()``.
    """
    # One sampling step per iteration; record the tree's marginal
    # log-likelihood after each step.
    for i in tqdm(xrange(n_iters)):
        sampler.sample()
        likelihoods.append(sampler.tree.marg_log_likelihood())

    # Figure 1: likelihood trace.
    plt.figure()
    plt.xlabel("Iterations", fontsize=fontsize)
    plt.ylabel("Data Log Likelihood", fontsize=fontsize)
    plt.plot(likelihoods)
    # NOTE(review): the plot call above passes no label, so the legend
    # presumably has no entries -- confirm it is wanted.
    plt.legend(loc='best', fontsize=12)

    plt.savefig('unconstrained-likelihoods.png', bbox_inches='tight')


    # Work on a copy so the sampler's own tree is not modified below.
    final_tree = sampler.tree.copy()

    # Figure 2: 2-D view of the tree.
    plt.figure()
    plot_tree_2d(final_tree, X, pca)

    # Replace each leaf's point with the matching entry of ``y`` before the
    # tree is serialised.
    for node in final_tree.dfs():
        if node.is_leaf():
            node.point = y[node.point]

    # Figure 3: graphviz rendering of the tree parsed back from Newick.
    plt.figure()
    newick = final_tree.to_newick()
    tree = Phylo.read(StringIO(newick), 'newick')

    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig('unconstrained-tree.png', bbox_inches='tight')
    graph = Phylo.to_networkx(tree)
    with open('unconstrained-tree.nwk', 'w') as fp:
        # Python 2 print-to-file; the trailing comma suppresses the newline
        print >>fp, newick,
    nx.write_dot(graph, 'unconstrained-tree.dot')
    plt.show()
Esempio n. 6
0
def get_character_table(t):
    """Print a 0/1 character table for the Newick string ``t``.

    For selected pairs of adjacent nodes the connecting edge is removed
    temporarily and every leaf is tagged 1 or 0 depending on which of the
    first two connected components it falls in.  One line of digits is
    printed per removed edge, with columns in sorted leaf-name order.
    Nothing is returned.

    NOTE(review): written for Python 2 and an old networkx API
    (``adjacency_matrix`` supporting ``len()``/``[i, j]`` assignment,
    ``connected_components`` returning an indexable sequence) -- confirm the
    installed versions before use.
    """
    chars = dict() 
    char_matrix = []
    
    t = Phylo.read(StringIO(t), 'newick')
    
    # one (initially empty) list of 0/1 flags per leaf name
    for c in list(t.get_terminals()):
        chars[c.name] = []
    
    net = Phylo.to_networkx(t)
    adj_matrix = networkx.adjacency_matrix(net)
    tchars = []
    
    # tchars[k] is the string form of the k-th graph node
    # presumably in the same order as the adjacency matrix rows -- verify
    for node in net.nodes(data=True):
        tchars.append(str(node[0]))
        
    # Look at every pair (i, m) with i < m whose rows both sum to 3 and which
    # are joined by an entry equal to 1.
    # presumably these are degree-3 internal nodes, assuming unit edge
    # weights -- TODO confirm
    for m in range(len(adj_matrix)): 
        if adj_matrix[m,:].sum() == 3:
            for i in range(m):
                if (i != m) and (adj_matrix[i,:].sum() == 3) \
                and (adj_matrix[i,m] == adj_matrix[m,i]) and (adj_matrix[i,m] == 1):
                    # cut the edge ...
                    adj_matrix[i,m] = 0
                    adj_matrix[m,i] = 0
                    
                    # ... and split the graph into its components; nodes of
                    # the rebuilt graph are integer matrix indices
                    net = networkx.from_numpy_matrix(adj_matrix)
                    test1 = networkx.connected_components(net)
                    
                    # First component -> 1, second component -> 0.  The bare
                    # except skips any index whose label is not a key of
                    # ``chars`` (but also hides every other error).
                    for item in test1[0]:
                        try:
                            chars[tchars[int(item)]].append(1)
                        except:
                            continue
                    for item in test1[1]:
                        try:
                            chars[tchars[int(item)]].append(0)
                        except:
                            continue
                            
                    # restore the edge before trying the next pair
                    adj_matrix[i,m] = 1
                    adj_matrix[m,i] = 1
    
    # Allocate a zero matrix: one row per recorded split (taken from the
    # length of an arbitrary leaf's list), one column per leaf.
    for i in xrange(len(chars.items()[0][1])):
        char_matrix.append([])
        for j in xrange(len(chars)):
            char_matrix[i].append(0)
    
    nn = 0
    
    # Transpose: column nn holds the flags of the nn-th leaf in sorted order.
    for _, v in sorted(chars.items()) :
        for j in range(len(v)):
            char_matrix[j][nn] = v[j]
        nn += 1
    
    # Print each row as a string of digits.
    for i in xrange(len(char_matrix)):
        str1 = ""
        
        for j in xrange(len(char_matrix[i])):
            str1 += str(int(char_matrix[i][j]))
        print str1 
Esempio n. 7
0
def nwck():
    with open("rosalind_nwck.txt") as f:
        lines = map(lambda l: l.strip(), f.readlines())
        lines = [line for line in lines if line]

    for i in xrange(len(lines)/2):
        handle = StringIO(lines[2*i])
        tree = Phylo.read(handle, "newick")
        names = lines[2*i+1].split()

        t =  Phylo.to_networkx(tree)

        na = [node for node in t.nodes() if node.name == names[0]][0]
        nb = [node for node in t.nodes() if node.name == names[1]][0]

        print len(networkx.shortest_path(t, na, nb))-1,

    print ""
Esempio n. 8
0
    def tree_from_random(list_of_scores):
        """Build a random guide tree for MGA.

        Parameters
        ----------
        list_of_scores : pairwise alignment scores, used only to obtain the
            graph names. Example for three graphs a, b, c:
            [["a", "b", 2], ["a", "c", 4], ["b", "c", 3]]

        Output
        ------
        Guide_tree object
        """
        graph_names = Guide_tree_Generator.make_graph_list(list_of_scores)
        random_matrix = Guide_tree_Generator.random_score_matrix(graph_names)
        # UPGMA clustering of the random matrix, converted to a networkx graph
        upgma_tree = DistanceTreeConstructor().upgma(random_matrix)
        return Guide_tree(Phylo.to_networkx(upgma_tree))
Esempio n. 9
0
def get_pairwise_distances(npalign, tree_file=None, seq_file=None):
    """Build a tree with MUSCLE and return a matrix of leaf-to-leaf distances.

    Parameters
    ----------
    npalign
        Alignment passed to ``fasta_write``; ``npalign.shape[0]`` sets the
        size of the returned square matrix.
    tree_file
        Path the Newick tree is written to.  ``None`` uses a temporary file.
    seq_file
        ``None`` writes the FASTA input to a temporary file; any other value
        writes it to '/tmp/tmp.fasta'.

    Returns
    -------
    numpy.ndarray or None
        ``dists[i, j]`` for leaves named ``<prefix>-<i>`` and
        ``<prefix>-<j>``.  ``None`` when MUSCLE fails or the tree cannot be
        read (``CalledProcessError``, ``ValueError`` or ``RuntimeError``).
    """
    # NOTE(review): the value of ``seq_file`` is never used as a path; a
    # non-None value only switches to the fixed '/tmp/tmp.fasta' -- confirm
    # whether ``open(seq_file, "w")`` was intended.
    if seq_file is None:
        fasta_handle = NTF(mode="w")
    else:
        fasta_handle = open("/tmp/tmp.fasta", "w")
    try:
        if tree_file is None:
            tree_handle = NTF()
        else:
            tree_handle = open(tree_file, "w")
        try:
            fasta_write(fasta_handle, npalign)

            # Make sure MUSCLE sees the complete FASTA file on disk.
            fasta_handle.flush()
            os.fsync(fasta_handle.fileno())

            # Argument list instead of a formatted string: file names that
            # contain spaces or quotes are passed through unchanged.
            cmdlist = [
                "muscle",
                "-in", fasta_handle.name,
                "-tree2", tree_handle.name,
                "-gapopen", "-2.9",
            ]

            try:
                check_output(cmdlist)
                with open(tree_handle.name) as result_handle:
                    tree = Phylo.read(result_handle, "newick")
            except (CalledProcessError, ValueError, RuntimeError):
                # MUSCLE failed or produced no readable tree
                return None
        finally:
            tree_handle.close()
    finally:
        fasta_handle.close()

    net = Phylo.to_networkx(tree)
    # dict() accepts both the dict returned by networkx 1.x and the
    # iterator of (node, paths) pairs returned by networkx 2+.
    paths = dict(networkx.all_pairs_shortest_path(net))
    terminals = tree.get_terminals()
    dists = np.zeros((npalign.shape[0], npalign.shape[0]))
    for t1, t2 in product(terminals, terminals):
        path = paths[t1][t2]
        # NOTE(review): this adds the branch length of every clade on the
        # node path, including both end points and the topmost clade --
        # confirm this is the intended distance.
        dist = sum(c.branch_length for c in path)
        # leaf names end in '-<row index>'
        i1 = int(t1.name.split("-")[1])
        i2 = int(t2.name.split("-")[1])
        dists[i1, i2] = dist

    return dists
Esempio n. 10
0
    def tree_from_newick(path):
        """Build a Guide_tree object from a user-supplied newick tree.

        Parameters
        ----------
        path : path to newick string representing the desired aligning
            sequence for MGA

        Output
        ------
        Guide_tree object, or None (after printing a message) when the tree
        is not binary
        """
        # only the first tree in the file is used
        parsed = next(Phylo.parse(path, 'newick', rooted=True))
        guide_tree = Guide_tree(Phylo.to_networkx(parsed))

        is_binary = Guide_tree_Generator.is_binary_tree(guide_tree) == True
        if is_binary:
            return guide_tree

        print(
            "The input is not a binary tree. Please enter a binary tree to get a guide tree"
        )
        return None
Esempio n. 11
0
def character_table2(tree):
    '''Return the nontrivial splits of a Bio.Phylo tree as a set of binary
    strings.

    Each string has one character per leaf (leaves sorted by name); a "1"
    marks the leaves on the same side of the split as the first leaf.'''
    leaves = sorted(tree.get_terminals(), key=lambda clade: clade.name)

    graph = Phylo.to_networkx(tree)
    # an edge is nontrivial when both of its end points are unnamed nodes
    inner_edges = [(a, b) for a, b in graph.edges()
                   if (a.name is None) and (b.name is None)]

    table = np.zeros((len(inner_edges), len(leaves)), dtype=int)
    for row, edge in enumerate(inner_edges):
        # cut the edge, see which leaves stay connected to the first leaf,
        # then put the edge back
        graph.remove_edge(*edge)
        side = nx.node_connected_component(graph, leaves[0])

        for col, leaf in enumerate(leaves):
            table[row, col] = int(leaf in side)

        graph.add_edge(*edge)

    return {''.join(str(bit) for bit in line) for line in table}
Esempio n. 12
0
def character_table2(tree):
    '''Return the character table of a Bio.Phylo tree: one binary string
    per nontrivial split, collected in a set.

    Position j of a string is "1" when the j-th leaf (in name order) lies in
    the same component as the first leaf once the split edge is removed.'''
    terminals = sorted(tree.get_terminals(), key=lambda leaf: leaf.name)
    reference = terminals[0] if terminals else None

    G = Phylo.to_networkx(tree)
    # keep only edges whose two end points are both unnamed
    nontrivials = []
    for u, v in G.edges():
        if (u.name is None) and (v.name is None):
            nontrivials.append((u, v))

    table = np.zeros((len(nontrivials), len(terminals)), dtype=int)
    for i, split_edge in enumerate(nontrivials):
        G.remove_edge(*split_edge)
        if reference is None:
            # mirrors indexing an empty leaf list
            raise IndexError('list index out of range')
        component = nx.node_connected_component(G, reference)

        table[i, :] = [int(leaf in component) for leaf in terminals]

        G.add_edge(*split_edge)

    rows = set()
    for row in table:
        rows.add(''.join([str(value) for value in row]))
    return rows
Esempio n. 13
0
def generar_arbol(especie, indice):
    """Create the dendrogram and graph images for one set of homologs.

    Parameters
    ----------
    especie : str
        Species prefix used in the input and output file names.
    indice : str or int
        Index appended to ``especie`` in the file names.

    Nothing is done when both images already exist.  Otherwise MUSCLE aligns
    './static/seq/Homologos/<especie><indice>.fasta', a neighbor-joining
    tree is built from BLOSUM62 distances, and two PNG files are written to
    './static/img/bio/': '<especie><indice>.png' (dendrogram) and
    '<especie><indice>g.png' (graph drawing).
    """
    # Normalise once so an integer index works for every path below; the
    # original converted it for the sequence paths only.
    indice = str(indice)
    tree_path = './static/img/bio/' + especie + indice + '.png'
    graph_path = './static/img/bio/' + especie + indice + 'g.png'

    if not path.exists(tree_path) or not path.exists(graph_path):
        seq_path = './static/seq/Homologos/'
        fasta_file = seq_path + especie + indice + '.fasta'
        aln_file = seq_path + especie + indice + '.aln'

        # Run MUSCLE to align the homologous sequences
        cli = MuscleCommandline(input=fasta_file, out=aln_file, clw=True)
        cli()

        with open(aln_file, "r") as aln:
            alineamiento = AlignIO.read(aln, "clustal")

        # BLOSUM62 for proteins
        calculator = DistanceCalculator('blosum62')
        dm = calculator.get_distance(alineamiento)

        constructor = DistanceTreeConstructor(calculator)
        # Neighbor joining
        nj = constructor.nj(dm)

        net = Phylo.to_networkx(nj)
        pos1 = nx.nx_pydot.pydot_layout(net, prog='dot')

        # Draw the dendrogram
        Phylo.draw(nj)
        pylab.savefig(tree_path, format='png')
        pylab.clf()

        # Draw the graph
        nx.draw(net, pos=pos1, with_labels=True)
        pylab.savefig(graph_path, format='png')
        pylab.clf()
Esempio n. 14
0
def main():
    """Score a reconstructed tree against the true network and print a row.

    Command line: ``true_net r_net alg typ [--modified]``.

    ``true_net`` is read with ``nx.read_gpickle``.  Its file name is split on
    '_' to obtain ``param`` (third field from the end) and ``run`` (last
    field without extension).  ``r_net`` is loaded as a pickle when its
    extension is 'pkl' or 'pickle', otherwise it is parsed as Newick.

    Prints one tab-separated line:
    ``param, run, triplets score, alg, typ, 0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("true_net", type=str)
    parser.add_argument("r_net", type=str)
    parser.add_argument("alg", type=str)
    parser.add_argument("typ", type=str)
    parser.add_argument("--modified", action="store_true", default=False)

    args = parser.parse_args()

    true_netfp = args.true_net
    reconstructed_fp = args.r_net
    alg = args.alg
    t = args.typ
    modified = args.modified

    name = true_netfp.split("/")[-1]
    spl = name.split("_")
    param = spl[-3]
    run = spl[-1].split(".")[0]

    name2 = reconstructed_fp.split("/")[-1]
    ending = name2.split("_")[-1].split(".")[-1]

    true_network = nx.read_gpickle(true_netfp)
    target_nodes = get_leaves_of_tree(true_network, clip_identifier=True)
    target_nodes_original_network = get_leaves_of_tree(true_network,
                                                       clip_identifier=False)

    if ending == "pkl" or ending == "pickle":
        # NOTE: unpickling executes arbitrary code; only use trusted files.
        with open(reconstructed_fp, "rb") as handle:
            reconstructed_network = pic.load(handle, encoding="latin1")

        # keep only the part of each node label before the first '_'
        encoder = {
            n: n.split("_")[0]
            for n in reconstructed_network.nodes()
        }
        reconstructed_network = nx.relabel_nodes(reconstructed_network,
                                                 encoder)

    else:
        # 's<suffix>' label -> clipped label of the matching true leaf
        k = ["s" + x.split("_")[-1] for x in target_nodes_original_network]
        s_to_char = dict(zip(k, target_nodes))

        reconstructed_tree = next(Phylo.parse(reconstructed_fp, "newick"))
        reconstructed_tree.rooted = True
        reconstructed_network = Phylo.to_networkx(reconstructed_tree)

        # give unnamed nodes a synthetic name 'i1', 'i2', ...
        i = 1
        for n in reconstructed_network:
            if n.name is None:
                n.name = "i" + str(i)
                i += 1

        # convert labels to strings, not Bio.Phylo.Clade objects
        c2strdict = {n: n.name for n in reconstructed_network.nodes()}
        reconstructed_network = nx.relabel_nodes(reconstructed_network,
                                                 c2strdict)

        # convert labels to characters for triplets correct analysis
        reconstructed_network = nx.relabel_nodes(reconstructed_network,
                                                 s_to_char)

    # Score and report for both input kinds.  Previously this ran only in
    # the Newick branch, so pickle inputs were loaded and then ignored.
    tot_tp = score_triplets(true_network,
                            reconstructed_network,
                            number_of_trials=50000,
                            modified=modified)

    print(
        str(param) + "\t" + str(run) + "\t" + str(tot_tp) + "\t" + alg +
        "\t" + t + "\t" + str(0))
Esempio n. 15
0
# Plot and export the sampler's final state.  Relies on names bound earlier
# in the script: n_iters, constraint_add, fontsize, likelihoods, sampler, X,
# pca and y.

# Figure 1: likelihood trace; the x axis spans n_iters + constraint_add.
plt.figure()
plt.xlim([0, n_iters + constraint_add])
plt.xlabel("Iterations", fontsize=fontsize)
plt.ylabel("Data Log Likelihood", fontsize=fontsize)
plt.plot(likelihoods)
# NOTE(review): the plot call above passes no label, so the legend
# presumably has no entries -- confirm it is wanted.
plt.legend(loc="best", fontsize=12)

plt.savefig("online-likelihoods.png", bbox_inches="tight")


# Work on a copy so the sampler's own tree is not modified below.
final_tree = sampler.tree.copy()

# Figure 2: 2-D view of the tree.
plt.figure()
plot_tree_2d(final_tree, X, pca)

# Replace each leaf's point with the matching entry of ``y`` before the tree
# is serialised.
for node in final_tree.dfs():
    if node.is_leaf():
        node.point = y[node.point]

newick = final_tree.to_newick()
tree = Phylo.read(StringIO(newick), "newick")

# Figure 3: graphviz rendering, saved as PNG, Newick and dot.
plt.figure()
Phylo.draw_graphviz(tree, prog="neato")
plt.savefig("tree.png", bbox_inches="tight")
graph = Phylo.to_networkx(tree)
with open("tree.nwk", "w") as fp:
    # Python 2 print-to-file; the trailing comma suppresses the newline
    print >> fp, newick,
nx.write_dot(graph, "tree.dot")
plt.show()
Esempio n. 16
0
def plot_phylo(C_raw, F, list_funcs, len_kegg, comp_p, pattern, threshold=0.05):
  """ Build the phylogeny of components, analyze the pathways and plot them.

  A neighbor-joining tree is built over the selected components, pathway
  values of the unnamed internal ("Steiner") nodes are obtained by solving a
  linear system, the tree is drawn with matplotlib, and a LaTeX-style table
  of the largest pathway changes along each traversed edge is printed.

  Parameters
  ----------
  C_raw: matrix
    gene module values of all components, each row a gene module, each column a component.
  F: matrix
    portions of each component in each sample, each row a component, each column a sample.
  list_funcs: list of str
    functions of each gene module.
  len_kegg: int
    number of KEGG cancer related pathways.
  comp_p: int
    index of most abundant component in the primary samples.
  pattern: list of int
    components that need to be considered for constructing the phylogeny.
  threshold: float
    threshold to define whether a component is primary or metastatic.

  Notes
  -----
  Node indices are read from the second character of a node name only
  (``name[1]``), so labels are presumably of the form "C<digit>..." and
  at most nine components / Steiner nodes are supported -- TODO confirm.
  """
  assert comp_p in pattern
  is_p, is_m = get_ary_pm_comp(F, threshold=threshold)
  labels = get_labels_comp(F, is_p, is_m)

  # restrict labels and component columns to the requested pattern
  labels = [labels[idx] for idx in pattern]
  C = C_raw[:, pattern]
  # build up phylogeny of components
  W = pairwise_distances(C.T)
  # for numerical stability
  W = (W+W.T)/2.0
  dm = DistanceMatrix(W, labels)
  newick_str = nj(dm, result_constructor=str)
  tree = Phylo.read(StringIO(newick_str), "newick")
  tree.ladderize() # Flip branches so deeper clades are displayed at top
  #Phylo.draw(tree)
  #Phylo.draw(tree, branch_labels=lambda c: c.branch_length)

  # initialize the graph:
  # pathway of leaves, name of steiner nodes, branch length of root
  G = Phylo.to_networkx(tree)
  idx = 1
  for node in G.nodes():
    if node.name != None:
      # named node: the digit at name[1] is a 1-based column index into C_raw
      node.pathway = C_raw[:,int(node.name[1])-1]
    else:
      # unnamed node: becomes Steiner node S1, S2, ... with a placeholder
      # pathway that is overwritten once the linear system is solved
      node.name = "S"+str(idx)
      idx += 1
      node.pathway = [0]
    if node.branch_length == None:
      node.branch_length = 0

  dim_path = C.shape[0]

  edges = G.edges(data=True)

  # number of steiner nodes
  n_s = C.shape[1] - 2
  # mat_Q and ary_c hold the left- and right-hand sides of one linear system
  # per pathway dimension (solved below with np.linalg.solve)
  mat_Q = np.zeros((n_s, n_s),dtype=float)
  ary_c = [np.zeros(n_s, dtype=float) for _ in range(dim_path)]

  for edge in edges:
    wt = edge[2]["weight"] # It's length, not weight!
    # each edge contributes the reciprocal of its length
    wt = 1.0/wt

    if (edge[0].name[0] == "S") and (edge[1].name[0] == "S"):
      # Steiner-Steiner edge: add to both diagonals, subtract off-diagonal
      nodes, nodet = edge[0], edge[1]
      ids, idt = int(nodes.name[1])-1, int(nodet.name[1])-1
      mat_Q[ids, ids] += wt
      mat_Q[ids, idt] -= wt
      mat_Q[idt, idt] += wt
      mat_Q[idt, ids] -= wt
    else:
      # Steiner-component edge, in either orientation
      nodes, nodec = None, None
      if (edge[0].name[0] == "S") and (edge[1].name[0] == "C"):
        nodes, nodec = edge[0], edge[1]
      elif (edge[0].name[0] == "C") and (edge[1].name[0] == "S"):
        nodes, nodec = edge[1], edge[0]
      else:
        # NOTE(review): after this message nodes/nodec are still None, so
        # the next line raises AttributeError
        print("error")
      ids, idc = int(nodes.name[1])-1, int(nodec.name[1])-1
      mat_Q[ids, ids] += wt

      # the known pathway of the component goes to the right-hand side
      for idx_path in range(dim_path):
        ary_c[idx_path][ids] += wt * (nodec.pathway[idx_path])
  #for node in G.nodes():
  #  print(node.name, node.pathway[0])

  # solve one system per pathway dimension
  s_pathway = []
  for idx_path in range(dim_path):
    tmp = np.linalg.solve(mat_Q, ary_c[idx_path])
    s_pathway.append(tmp)
  # num_pathways x num_steiner_nodes
  S = np.asarray(s_pathway, dtype=float)

  # store the solved pathway vector on each Steiner node
  for node in G.nodes():
    if node.name[0] == "S":
      node.pathway = S[:, int(node.name[1])-1]

  min_weight, max_weight = get_min_max_weight_edges(G)

  # the traversal starts at the node whose label index matches comp_p
  root_name = [l for l in labels if int(l[1])-1 == comp_p][0]
  # NOTE(review): if no node matches, ``root`` stays bound to the last node
  # iterated instead of failing
  for root in G.nodes():
    if root.name == root_name:
      break

  set_traverse = []
  plot_nodes = []
  cur_pos = [0, 0, root.name]
  xgrain = 1.0

  strings = []
  set_traverse, strings, plot_nodes = iter_func(
      root_name, root, set_traverse, list_funcs, G, strings, plot_nodes,
      cur_pos, xgrain, min_weight, max_weight)

  # plot_nodes entries are indexed as (x, y, name)
  node2pos = {v[2]:[v[0],v[1]] for v in plot_nodes}

  sns.set_style("white")
  fig = plt.figure(figsize=(7,6))
  ax0 = plt.subplot(1,1,1)

  # bounding box of the drawn nodes; always includes the origin
  xmin = 0
  xmax = 0
  ymin = 0
  ymax = 0

  min_linewidth=8
  max_linewidth=20

  # draw the edges; line width grows with the reciprocal of the edge length
  for edge in G.edges(data=True):
    ax0.plot([node2pos[edge[0].name][0],node2pos[edge[1].name][0]],
             [node2pos[edge[0].name][1],node2pos[edge[1].name][1]],
             "-", color="gray",
             linewidth=min_linewidth+(max_linewidth-min_linewidth)*(1.0/edge[2]["weight"]-min_weight)/max_weight,
             alpha=0.3)

  # draw the nodes; colour depends on the first / last character of the name
  for node in node2pos.keys():
    pos = node2pos[node]
    if node[0] == "S":
      color = "gray"
    elif node[-1] == "P":
      color = "green"
    elif node[-1] == "M":
      color = "red"
    else:
      color = "royalblue"

    ax0.plot(pos[0], pos[1], "o", markersize=50, color=color, alpha=0.5)#markeredgecolor="k",
    # NOTE(review): the ``s=`` keyword presumably depends on the installed
    # matplotlib version -- confirm it is still accepted
    ax0.annotate(
        s=node,
        xy=(pos[0], pos[1]),
        ha="center",
        va="center",
        size=22,
        fontweight="bold",
        )
    xmin = min(xmin, pos[0])
    xmax = max(xmax, pos[0])
    ymin = min(ymin, pos[1])
    ymax = max(ymax, pos[1])

  # pad the axes around the bounding box
  # NOTE(review): divides by zero when all nodes share the same y position
  delta = 0.7
  xratio = (xmax-xmin)/(ymax-ymin)*delta
  plt.xlim(xmin-delta*xratio, xmax+delta*xratio)
  plt.ylim(ymin-delta, ymax+delta)

#  ax0.annotate("Progression", xy=(xmin-delta*xratio, ymax), xytext=(xmin-delta*xratio,ymin),
#               ha="center",
#               arrowprops=dict(facecolor="black",alpha=0.7),
#               size=22,
#               fontweight="bold",
#               rotation=0,
#               )

  # hide the frame and ticks
  plt.gca().invert_yaxis()
  ax0.spines["right"].set_visible(False)
  ax0.spines["top"].set_visible(False)
  ax0.spines["left"].set_visible(False)
  ax0.spines["bottom"].set_visible(False)
  ax0.get_xaxis().set_ticks([])
  ax0.get_yaxis().set_ticks([])
  plt.show()
  ##fig.savefig("figures/fig8phylo3.pdf", bbox_inches="tight")

  # Print LaTeX-style table rows: for every (source, target) pair returned
  # by iter_func, list the strongest pathway increases and decreases.
  for tmp in strings:
    node_src, node_tgt = tmp[0], tmp[1]
    # "\c" is not a recognised escape sequence, so the backslash is kept
    print("\colrule")
    print("$%s \\rightarrow %s$"%(node_src.name, node_tgt.name))
    delta_pathway = node_tgt.pathway - node_src.pathway
    # only the first len_kegg entries are reported
    delta_pathway = delta_pathway[0:len_kegg]

    # indices ordered by increasing change
    idx_sel = sorted(range(len(delta_pathway)), key=delta_pathway.__getitem__)

    # rebinds the ``threshold`` parameter: from here on it is the minimum
    # absolute change that gets reported
    threshold = 1.0
    list_pos, list_neg = [], []
    max_ct = 5
    # ct is the rank in sorted order, so only the five most extreme entries
    # on each side are candidates
    for ct, idx in enumerate(idx_sel[::-1]):
      if delta_pathway[idx] > threshold and ct < max_ct:
        list_pos.append([delta_pathway[idx], list_funcs[idx] ])
    for ct, idx in enumerate(idx_sel):
      if delta_pathway[idx] < -threshold and ct < max_ct:
        list_neg.append([delta_pathway[idx], list_funcs[idx] ])

    # one table row per rank: increase columns first, then decrease columns
    for idx in range(max(len(list_pos), len(list_neg))):
      if idx+1 <= len(list_pos):
        fun = list_pos[idx][1]
        # selected pathway names are set in bold
        if fun in ["RET", "PI3K-Akt signaling pathway", "ErbB signaling pathway"]:
          fun = "\\textbf{"+fun+"}"
        print("& $+%.2f$ & "%(list_pos[idx][0])+fun)
      else:
        print("& & ")
      if idx+1 <= len(list_neg):
        fun = list_neg[idx][1]
        if fun in ["RET", "PI3K-Akt signaling pathway", "ErbB signaling pathway"]:
          fun = "\\textbf{"+fun+"}"
        print("& $%.2f$ & "%(list_neg[idx][0])+fun)
      else:
        print("& & ")
      print("\\\\")
    # nothing exceeded the threshold in either direction
    if max(len(list_pos), len(list_neg)) == 0:
      print("& $<1.0$ & $\emptyset$ & $<1.0$ & $\emptyset$ \\\\")
Esempio n. 17
0
        def out():
            """Build and display a UPGMA tree for the FASTA file named in ``e1``.

            The first 100 residues of every record are written to
            'phylo.phy' in PHYLIP format, with ids rewritten as
            ``<index>-<id>`` and padded with '-'.  An identity distance
            matrix and a UPGMA tree are computed from that file; the tree is
            saved to 'apaf.xml' (phyloXML), read back, converted to a
            networkx graph and drawn.
            """
            records = SeqIO.parse("%s" % e1.get(), "fasta")

            lens = []    # record ids
            lens2 = []   # record sequences, parallel to ``lens``
            for record in records:
                lens.append(record.id)
                lens2.append(record.seq)
                line = "%s   %s" % (record.id, record.seq[0:100])
                print(line)

            lengthmax = len(max(lens, key=len))

            # ``with`` closes the file even if a record fails to format
            with open("phylo.phy", 'w') as phylip_file:
                phylip_file.write("   %s     100\n" % len(lens))

                for i, item in enumerate(lens):
                    # Pair each id with its own sequence.  The original
                    # used ``lens2[i - 1]``, which shifted every sequence by
                    # one record (the first id got the last sequence).
                    seq = lens2[i]

                    ids = "%s%s%s" % (i, "-", item)
                    if len(item) == lengthmax:
                        # single-digit indices get one extra pad character
                        if i < 10:
                            ids = ids + "-"
                        print("1")
                    else:
                        # shorter ids are padded up to the longest id
                        add = lengthmax - len(item)
                        ids = ids + (add * "-") + "-"
                        print("2")
                    ids = ids.replace(".", "").replace("_", "")

                    line = "%s          %s\n" % (ids, seq[0:100])
                    print(line)
                    phylip_file.write(line)

            # Read the sequences and align
            aln = AlignIO.read('phylo.phy', 'phylip')

            # Print the alignment
            print(aln)

            # Calculate the distance matrix
            calculator = DistanceCalculator('identity')
            dm = calculator.get_distance(aln)

            # Print the distance Matrix
            print('\nDistance Matrix\n===================')
            print(dm)

            # Construct the phylogenetic tree using UPGMA algorithm
            constructor = DistanceTreeConstructor()
            tree = constructor.upgma(dm)

            # Round-trip through phyloXML before drawing
            Phylo.write(tree, 'apaf.xml', 'phyloxml')
            tree = Phylo.read('apaf.xml', 'phyloxml')
            net = Phylo.to_networkx(tree)
            networkx.draw_networkx(net)
            pylab.show(net)
    def out():
        """Build and display a UPGMA tree for the FASTA file named in ``e1``.

        The first 100 residues of every record are written to 'phylo.phy'
        in PHYLIP format.  Ids are padded with '-' to the longest id and
        stripped of '.' and '_'; when the ids differ in length they are also
        prefixed with ``<index>-``.  An identity distance matrix and a UPGMA
        tree are computed from that file; the tree is saved to 'apaf.xml',
        read back and drawn, and finally the ``win`` window is destroyed.
        """
        seq_len = 100  # residues written per record and declared in the header

        records = SeqIO.parse(e1.get(), "fasta")

        lens = []        # record ids
        sequences = []   # record sequences, parallel to ``lens``
        for record in records:
            print(record.seq)
            lens.append(record.id)
            sequences.append(record.seq)

        if lens:
            lengthmax = len(max(lens, key=len))
            lengthmin = len(min(lens, key=len))
        else:
            # no records: nothing is written below the header
            lengthmax = lengthmin = 0

        # ids only need an index prefix when they are not all the same length
        numbered = lengthmin < lengthmax

        # ``with`` guarantees the file is flushed and closed before it is
        # read back below.
        with open("phylo.phy", "w") as phylip_file:
            # The declared column count now matches the slice written per
            # row (the original declared 125 but wrote 100).
            phylip_file.write("  %s             %s\n" % (len(lens), seq_len))

            # Each id is written with its own sequence; the original wrote
            # the last record's sequence on every row.
            for i, (name, sequence) in enumerate(zip(lens, sequences)):
                name = name + ((lengthmax - len(name)) * "-")
                name = name.replace(".", "").replace("_", "")
                if numbered:
                    to_be_write = "%s%s%s    %s" % (
                        i, "-", name, sequence[0:seq_len])
                else:
                    to_be_write = "%s    %s" % (name, sequence[0:seq_len])
                phylip_file.write("%s\n" % to_be_write)
                print(name)

        # Read the sequences and align.  Reads the file written above rather
        # than a hard-coded absolute path on one machine.
        aln = AlignIO.read('phylo.phy', 'phylip')

        # Print the alignment
        print(aln)

        # Calculate the distance matrix
        calculator = DistanceCalculator('identity')
        dm = calculator.get_distance(aln)

        # Print the distance Matrix
        print('\nDistance Matrix\n===================')
        print(dm)

        # Construct the phylogenetic tree using UPGMA algorithm
        constructor = DistanceTreeConstructor()
        tree = constructor.upgma(dm)

        # Save the tree that was just built so the file read back below is
        # not a stale one from an earlier run.
        Phylo.write(tree, 'apaf.xml', 'phyloxml')
        tree = Phylo.read('apaf.xml', 'phyloxml')
        net = Phylo.to_networkx(tree)
        networkx.draw_networkx(net)
        pylab.show(net)

        win.destroy()
def run_nj_naive(cm_uniq, stem, verbose=True):
    """Build a neighbor-joining tree from a character matrix.

    Parameters
    ----------
    cm_uniq : pandas.DataFrame
        Character matrix with one row per unique cell.
    stem : str
        Prefix for the two temporary files ``<stem>phylo.txt`` and
        ``<stem>infile.txt``; both are removed before returning.
    verbose : bool
        Print the number of rows being processed.

    Returns
    -------
    Cassiopeia_Tree
        Wraps the relabelled networkx graph, with method
        'neighbor-joining'.
    """
    if verbose:
        print("Running Neighbor-Joining on " + str(cm_uniq.shape[0]) +
              " Unique Cells")

    # one '|'-joined string per row, used to flag target nodes below
    cm_lookup = list(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    fn = stem + "phylo.txt"
    infile = stem + "infile.txt"

    cm_uniq.to_csv(fn, sep='\t')

    # Binarize the matrix with the helper script.  An argument list (no
    # shell) keeps paths with spaces or shell metacharacters intact.
    script = (SCLT_PATH / 'TreeSolver' / 'binarize_multistate_charmat.py')
    subprocess.call(["python3.6", str(script), fn, infile, "--relaxed"])

    aln = AlignIO.read(infile, "phylip-relaxed")

    calculator = DistanceCalculator('identity')
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)

    tree.root_at_midpoint()

    nj_net = Phylo.to_networkx(tree)

    # Replace Bio.Phylo clades with Node objects.
    # NOTE(review): ``n.name in cm_uniq`` tests the DataFrame's column
    # labels while ``cm_uniq.loc[n.name]`` looks up the row index -- confirm
    # which one is intended.  Clades matching neither branch keep their
    # original label.
    rndict = {}
    for n in nj_net:
        if n.name is None:
            rndict[n] = Node('state-node', [])
        elif n.name in cm_uniq:
            rndict[n] = Node(n.name, cm_uniq.loc[n.name].values)

    nj_net = nx.relabel_nodes(nj_net, rndict)

    # a target is a leaf whose character string appears in the input matrix
    for n in nj_net:
        if nj_net.out_degree(n) == 0 and n.char_string in cm_lookup:
            n.is_target = True
        else:
            n.is_target = False

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining',
                               network=state_tree,
                               name='Cassiopeia_state_tree')

    # Best-effort cleanup, like the original ``rm`` calls: a file that is
    # already gone is not an error.
    for tmp_path in (infile, fn):
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return ret_tree
Esempio n. 20
0
                    mutation = int(line)
                    edgeLabels[child].append(mutation)
                    line = f.readline().rstrip("\n")

    leafStates = {}
    with open(sys.argv[2]) as f:
        for line in f:
            s = line.rstrip("\n").split("\t")

            leafStates[s[0]] = map(int, s[1:])

    #print edgeLabels
    #print leafStates

    tree.rooted = True
    network = Phylo.to_networkx(tree)

    # map vertices to integers
    vertexIndex = {}
    index2Vertex = []
    for edge in network.edges():
        if str(edge[0]) not in vertexIndex:
            vertexIndex[str(edge[0])] = len(vertexIndex)
            index2Vertex.append(str(edge[0]))
        if str(edge[1]) not in vertexIndex:
            vertexIndex[str(edge[1])] = len(vertexIndex)
            index2Vertex.append(str(edge[1]))

    pi = [-1 for i in range(len(vertexIndex))]
    for edge in network.edges():
        pi[vertexIndex[str(edge[1])]] = vertexIndex[str(edge[0])]
Esempio n. 21
0
def compute_tree(options, mat, names):
    """Cluster ``names`` with UPGMA and export the tree in three formats.

    ``mat`` is indexed as ``mat[a][b]`` for every pair of entries in
    ``names`` and each value is converted with ``float``.  The tree is
    written as newick to ``tree_path(options)``; a png and a graphviz dot
    file are written to the same path with "newick" replaced by "png" and
    "dot" respectively.
    """
    def _nudge(dist):
        # The tree constructor writes 0-distances as 1s for some reason, so
        # exact 0s and 1s are moved slightly off those values here and
        # mapped back when the edge labels are produced further down.
        if dist == 0.:
            return 1e-10
        if dist == 1.:
            return 1.1
        return dist

    # Biopython expects a lower-triangular matrix (row i has i + 1 entries).
    lower_triangle = [
        [_nudge(float(mat[names[r]][names[c]])) for c in xrange(r + 1)]
        for r in xrange(len(names))
    ]
    dm = _DistanceMatrix(names, lower_triangle)

    # upgma tree, saved as newick
    tree = DistanceTreeConstructor().upgma(dm)
    robust_makedirs(os.path.dirname(tree_path(options)))
    Phylo.write(tree, tree_path(options), "newick")

    # png tree -- note : doesn't work in toil
    def _leaf_label(clade):
        # Blank out the labels of internal ("Inner...") clades.
        return "" if "Inner" in str(clade) else clade

    Phylo.draw_graphviz(tree, label_func = _leaf_label, node_size=1000,
                        node_shape="s", font_size=10)
    pylab.savefig(tree_path(options).replace("newick", "png"))

    # graphviz: undirected networkx copy with integer node ids, keeping the
    # original clade in each node's "label" attribute
    nxgraph = nx.convert_node_labels_to_integers(
        nx.Graph(Phylo.to_networkx(tree)), label_attribute="label")

    for node_id in nxgraph.nodes():
        attrs = nxgraph.node[node_id]
        if "Inner" not in str(attrs["label"]):
            attrs["fontsize"] = 18
            continue
        # internal nodes: empty quoted label and a near-zero footprint
        attrs["label"] = "\"\""
        attrs["width"] = 0.001
        attrs["height"] = 0.001

    for src, dst in nxgraph.edges():
        attrs = nxgraph.edge[src][dst]
        weight = float(attrs["weight"])
        # Undo the nudge from above.
        # NOTE(review): every weight >= 1 collapses to 0 here (values above
        # 1 were first clamped to 1 and then zeroed in the original code);
        # confirm whether clamping to 1 without zeroing was intended.
        if weight >= 1. or weight <= 1e-10:
            weight = 0.
        # in graphviz, weight means something else, so make it a label
        attrs["weight"] = None
        attrs["label"] = "{0:.3g}".format(float(weight) * 100.)
        attrs["fontsize"] = 14
        attrs["len"] = draw_len(weight)

    nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))
Esempio n. 22
0
# Generating IDs: the first comma-separated field of every row after the
# header line of the cosine-distance CSV.
# The handle keeps its original name (`file`) in case later code refers to
# it; the `with` block guarantees it is closed even if reading fails.
with open("work/NLP/Trees/Cosine.csv",'rt') as file:
    data = file.readlines()
ids = [line.split(',')[0] for line in data[1:]]


tree = Phylo.read("work/NLP/Plotly/small.newick",'newick')
# tree.rooted = True
# Phylo.draw_ascii(tree)

tree_net = Phylo.to_networkx(tree)

# networkx.write_graphml(tree_net,'graph.gml')

# tree_igr = igraph.read('graph.graphml',format='graphml')

# Phylo.draw_graphviz(tree,prog='dot')

# spring_layout returns a mapping node -> position; only the positions are
# kept, in the mapping's iteration order.
pos = networkx.spring_layout(tree_net)
pos = list(pos.values())

# Split the positions into their first and second components.
Xn = [xy[0] for xy in pos]
Yn = [xy[1] for xy in pos]

# g = networkx.read_gml("graph.gml")
# g.node
Esempio n. 23
0
from cStringIO import StringIO
import sys
import os
sys.path.insert(1,"../../biopython")
sys.path.insert(1,"../../networkx")
sys.path.insert(1,"../../matplotlib")
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import networkx as nx
from Bio import Phylo

#treedata = "(A, (B,C), (D,E))"
# Read the newick text up front so the file handle is closed before parsing.
with open("test.txt") as f:
    handle = StringIO(f.read().rstrip())
tree = Phylo.read(handle, "newick")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(tree)
# Draw the tree twice: once as a plain networkx graph, once with Bio.Phylo.
net = Phylo.to_networkx(tree)
nx.draw(net)
plt.savefig("test.png")
Phylo.draw(tree)
plt.savefig("test-phylo.png")
Esempio n. 24
0
def main():
    """
    Reconstruct a lineage tree from a character matrix and write it to disk.

    Command-line arguments are parsed with argparse.  ``char_fp`` is read as
    a tab-separated table whose first column is the index, and ``out_fp``
    receives the tree in newick format.  The algorithm flags are tested in
    the order greedy, hybrid, ilp, neighbor-joining, camin-sokal,
    max-likelihood; the first one that is set selects the branch.  Every
    branch also pickles the resulting network to ``out_fp`` with its
    extension replaced by ``.pkl``.

    Raises:
        Exception: if no algorithm flag was given.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("char_fp", type=str, help="character_matrix")
    parser.add_argument("out_fp", type=str, help="output file name")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument("--time_limit",
                        type=int,
                        default=1500,
                        help="Time limit for ILP convergence")
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    # The max-likelihood branch used to test an undefined name (`alg`), which
    # raised NameError whenever no earlier flag matched; it is now a regular
    # flag like the other algorithms.
    parser.add_argument("--max-likelihood",
                        "-ml",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--max_neighborhood_size", type=int, default=10000)

    args = parser.parse_args()

    char_fp = args.char_fp
    out_fp = args.out_fp
    verbose = args.verbose

    cutoff = args.cutoff
    time_limit = args.time_limit
    num_threads = args.num_threads

    max_neighborhood_size = args.max_neighborhood_size

    # NOTE: both stems drop the last "."-separated component and also remove
    # every remaining "." from the path; kept as-is so existing output file
    # names do not change.
    stem = ''.join(char_fp.split(".")[:-1])
    out_stem = "".join(out_fp.split(".")[:-1])

    cm = pd.read_csv(char_fp, sep='\t', index_col=0)
    cm_uniq = cm.drop_duplicates(inplace=False)

    prior_probs = None
    if args.mutation_map != "":
        prior_probs = read_mutation_map(args.mutation_map)

    def dump_pickle(obj):
        # Pickle `obj` next to the newick output, closing the handle.
        with open(out_stem + ".pkl", "wb") as fh:
            pic.dump(obj, fh)

    def write_newick(text):
        # Write the newick string to the requested output file.
        with open(out_fp, "w") as fh:
            fh.write(text)

    def parsimony(network):
        # Sum of get_edge_length over every edge of the network.
        return sum(get_edge_length(e[0], e[1]) for e in network.edges())

    def build_target_nodes():
        # One "<state|state|...>_<row name>" string per unique row.  A real
        # list is returned because the value is passed to len() and to the
        # solver; a lazy map object would support neither reliably.
        states = cm_uniq.astype(str).apply(lambda x: '|'.join(x), axis=1)
        return [s + "_" + name for s, name in zip(states, cm_uniq.index)]

    def name_internal_nodes(network):
        # Give every unnamed clade a unique "internal<i>" name in place.
        i = 0
        for n in network:
            if n.name is None:
                n.name = "internal" + str(i)
                i += 1

    if args.greedy:

        target_nodes = build_target_nodes()

        if verbose:
            print('Running Greedy Algorithm on ' + str(len(target_nodes)) +
                  " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes, method="greedy", prior_probabilities=prior_probs)

        print("Parsimony: " + str(parsimony(reconstructed_network_greedy)))

        newick = convert_network_to_newick_format(reconstructed_network_greedy)
        write_newick(newick)
        dump_pickle(reconstructed_network_greedy)

    elif args.hybrid:

        target_nodes = build_target_nodes()

        if verbose:
            print('Running Hybrid Algorithm on ' + str(len(target_nodes)) +
                  " Cells")
            print('Parameters: ILP on sets of ' + str(cutoff) + ' cells ' +
                  str(time_limit) + 's to complete optimization')

        print("running algorithm...")
        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes,
            method="hybrid",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size)

        if verbose:
            print("Scoring Parsimony...")

        score = parsimony(reconstructed_network_hybrid)

        if verbose:
            print("Parsimony: " + str(score))
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_hybrid)

        newick = convert_network_to_newick_format(reconstructed_network_hybrid)
        write_newick(newick)

    elif args.ilp:

        target_nodes = build_target_nodes()

        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes)) +
                  " Unique Cells")
            print("Paramters: ILP allowed " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size)

        print("Parsimony: " + str(parsimony(reconstructed_network_ilp)))

        newick = convert_network_to_newick_format(reconstructed_network_ilp)
        write_newick(newick)
        dump_pickle(reconstructed_network_ilp)

    elif args.neighbor_joining:

        cm.drop_duplicates(inplace=True)

        if verbose:
            print("Running Neighbor-Joining on " + str(cm.shape[0]) +
                  " Unique Cells")

        fn = stem + "phylo.txt"
        infile = stem + "infile.txt"

        cm.to_csv(fn, sep='\t')

        # The binarization script is run with a separate python2 interpreter.
        os.system(
            "python2 ~/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile + " --relaxed")
        aln = AlignIO.read(infile, "phylip-relaxed")

        calculator = DistanceCalculator('identity')
        constructor = DistanceTreeConstructor(calculator, 'nj')
        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # convert labels to characters for writing to file
        name_internal_nodes(nj_net)

        # convert labels to strings, not Bio.Phylo.Clade objects
        nj_net = nx.relabel_nodes(nj_net,
                                  {n: n.name for n in nj_net.nodes()})

        nj_net = tree_collapse(nj_net)

        dump_pickle(nj_net)

        newick = convert_network_to_newick_format(nj_net)
        write_newick(newick)

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.camin_sokal:

        # PHYLIP sees the cells as s0, s1, ...; this maps them back afterwards.
        cells = cm.index
        samples = [("s" + str(i)) for i in range(len(cells))]
        samples_to_cells = dict(zip(samples, cells))

        cm.index = list(range(len(cells)))

        if verbose:
            print("Running Camin-Sokal on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + 'infile.txt'
        fn = stem + "phylo.txt"
        weights_fn = stem + "weights.txt"

        cm.to_csv(fn, sep='\t')

        os.system(
            "python2 /home/mattjones/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile)

        # Called as in the original; its return value was never used.
        construct_weights(infile, weights_fn)

        outfile = stem + 'outfile.txt'
        outtree = stem + 'outtree.txt'
        # run phylip mix with camin-sokal; the interactive prompts are
        # answered from a response file fed through stdin
        responses = "." + stem + ".temp.txt"
        with open(responses, 'w') as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("Y\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout"
        subprocess.Popen(cmd, shell=True).wait()

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "conenseoutfile.txt"

        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout2"
        subprocess.Popen(cmd, shell=True).wait()

        # The consensus tree may span several lines; join them into one string.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(l.strip() for l in f)

        tree = newick_to_network(newick_str)
        cs_net = tree_collapse(tree)

        cs_net = nx.relabel_nodes(cs_net, samples_to_cells)

        dump_pickle(cs_net)

        newick = convert_network_to_newick_format(cs_net)
        write_newick(newick)

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.max_likelihood:

        if verbose:
            print("Running Maximum Likelihood on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + 'infile.txt'
        fn = stem + "phylo.txt"

        cm.to_csv(fn, sep='\t')

        os.system(
            "python2 /home/mattjones/projects/scLineages/SingleCellLineageTracing/scripts/binarize_multistate_charmat.py "
            + fn + " " + infile + " --relaxed")

        # FastTree writes its newick tree straight into out_fp.
        os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " +
                  out_fp)

        # next() works on both Python 2 and 3, unlike the .next() method.
        tree = next(Phylo.parse(out_fp, "newick"))

        ml_net = Phylo.to_networkx(tree)

        name_internal_nodes(ml_net)

        ml_net = nx.relabel_nodes(ml_net, {n: str(n) for n in ml_net.nodes()})

        dump_pickle(ml_net)

        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal"
        )
Esempio n. 25
0
# Render the tree once so the console output and the file contents are the
# same string.
tree_art = tree.ascii_art()
print(tree_art)

# `tree_file` stays bound to the path; the handle gets its own name and is
# closed even if the write fails.
with open(tree_file, 'w+') as tree_out:
    tree_out.write(tree_art)

# Build the tree again, this time returned as a string.
nws = nj(dm, result_constructor=str)
print(nws)

with open(nws_file, 'w+') as nws_file_l:
    nws_file_l.write(nws)

# NOTE(review): this reads a hard-coded path rather than `nws_file`; confirm
# that both refer to the same newick file.
bio_tree = Phylo.read("work/NLP/Trees/output_data.txt", 'newick')

tree_net = Phylo.to_networkx(bio_tree)
# networkx.graphviz_layout = networkx.drawing.nx_agraph.pydot_layout

# networkx.draw(tree_net,pos=networkx.spring_layout(tree_net))
# networkx.draw(tree_net)
# matplotlib.plot()
# pyplot.draw()
# pyplot.show()

# graphviz_layout = nx_agraph.pydot_layout

# Convert to an AGraph, lay it out and draw it to 'a.ph'.
H = networkx.nx_agraph.to_agraph(tree_net)
H.layout()
H.draw('a.ph')
# Phylo.draw_graphviz(H,prog='dot')
# pylab.show()
Esempio n. 26
0
# Plot the recorded likelihood values on a fresh figure and save it.
plt.figure()
plt.xlim([0, n_iters + constraint_add])
plt.xlabel("Iterations", fontsize=fontsize)
plt.ylabel("Data Log Likelihood", fontsize=fontsize)
plt.plot(likelihoods)
# NOTE(review): no labelled artist is created in this fragment, so the
# legend may come out empty -- confirm whether a label was intended.
plt.legend(loc='best', fontsize=12)

plt.savefig('online-likelihoods.png', bbox_inches='tight')

# Work on a copy so the sampler's own tree is not modified below.
final_tree = sampler.tree.copy()

plt.figure()
plot_tree_2d(final_tree, X, pca)

# Replace each leaf's `point` with the entry of `y` it indexes
# (presumably mapping sample indices to labels -- verify against `y`).
for node in final_tree.dfs():
    if node.is_leaf():
        node.point = y[node.point]

# Round-trip through newick text to get a Bio.Phylo tree.
newick = final_tree.to_newick()
tree = Phylo.read(StringIO(newick), 'newick')

plt.figure()
Phylo.draw_graphviz(tree, prog='neato')
plt.savefig('tree.png', bbox_inches='tight')
graph = Phylo.to_networkx(tree)
with open('tree.nwk', 'w') as fp:
    # Python 2 print-to-file syntax; the trailing comma suppresses the
    # newline that print would otherwise append.
    print >> fp, newick,
nx.write_dot(graph, 'tree.dot')
plt.show()
Esempio n. 27
0
def main():
    """
    Reconstruct a lineage tree from a character matrix and write it to disk.

    Command-line arguments are parsed with argparse.  ``char_fp`` is read as
    a tab-separated table of strings whose first column is the index, and
    ``out_fp`` receives the tree in newick format.  The algorithm flags are
    tested in the order greedy, hybrid, ilp, neighbor-joining,
    neighbor_joining_weighted, camin-sokal, max-likelihood; the first one
    that is set selects the branch.  Every branch also pickles its result to
    ``out_fp`` with its extension replaced by ``.pkl``.

    Raises:
        Exception: if the greedy missing-data mode is not recognised, if
            ``--weighted_ilp`` is requested without a mutation map, or if no
            algorithm flag was given.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("char_fp", type=str, help="character_matrix")
    parser.add_argument("out_fp", type=str, help="output file name")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=1500,
                        help="Time limit for ILP convergence")
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    # The max-likelihood branch used to test an undefined name (`alg`), which
    # raised NameError whenever no earlier flag matched; it is now a regular
    # flag like the other algorithms.
    parser.add_argument("--max-likelihood",
                        "-ml",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--max_neighborhood_size", type=int, default=10000)
    parser.add_argument("--weighted_ilp",
                        "-w",
                        action="store_true",
                        default=False)
    parser.add_argument("--greedy_min_allele_rep", type=float, default=1.0)
    parser.add_argument("--fuzzy_greedy", action="store_true", default=False)
    parser.add_argument("--multinomial_greedy",
                        action="store_true",
                        default=False)
    # type=int so a value given on the command line matches the int default
    # instead of arriving as a string.
    parser.add_argument("--num_neighbors", default=10, type=int)
    parser.add_argument("--num_alternative_solutions", default=100, type=int)
    parser.add_argument("--greedy_missing_data_mode",
                        default="lookahead",
                        type=str)
    parser.add_argument("--greedy_lookahead_depth", default=3, type=int)

    args = parser.parse_args()

    char_fp = args.char_fp
    out_fp = args.out_fp
    verbose = args.verbose

    # --cutoff is interpreted as an LCA distance or as a cell count,
    # depending on --hybrid_lca_mode; the unused one is passed as None.
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    num_threads = args.num_threads

    n_neighbors = args.num_neighbors
    num_alt_soln = args.num_alternative_solutions

    max_neighborhood_size = args.max_neighborhood_size

    missing_data_mode = args.greedy_missing_data_mode
    lookahead_depth = args.greedy_lookahead_depth
    if missing_data_mode not in ["knn", "lookahead", "avg", "modified_avg"]:
        raise Exception("Greedy missing data mode not recognized")

    # NOTE: both stems drop the last "."-separated component and also remove
    # every remaining "." from the path; kept as-is so existing output file
    # names do not change.
    stem = "".join(char_fp.split(".")[:-1])
    out_stem = "".join(out_fp.split(".")[:-1])

    cm = pd.read_csv(char_fp, sep="\t", index_col=0, dtype=str)

    cm_uniq = cm.drop_duplicates(inplace=False)

    prior_probs = None
    if args.mutation_map != "":

        prior_probs = read_mutation_map(args.mutation_map)

    weighted_ilp = args.weighted_ilp
    if prior_probs is None and weighted_ilp:
        raise Exception(
            "If you'd like to use weighted ILP reconstructions, you need to provide a mutation map (i.e. prior probabilities)"
        )

    greedy_min_allele_rep = args.greedy_min_allele_rep
    fuzzy = args.fuzzy_greedy
    probabilistic = args.multinomial_greedy

    def dump_pickle(obj):
        # Pickle `obj` next to the newick output, closing the handle.
        with open(out_stem + ".pkl", "wb") as fh:
            pic.dump(obj, fh)

    def write_newick(text):
        # Write the newick string to the requested output file.
        with open(out_fp, "w") as fh:
            fh.write(text)

    def build_target_nodes():
        # One Node per unique row, built from the row name and its values.
        return list(cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1))

    def parsimony_from_root(net):
        # Sum of mutation lengths over the DFS edges reachable from the
        # first node that has no incoming edge.
        root = [n for n in net if net.in_degree(n) == 0][0]
        return sum(e[0].get_mut_length(e[1])
                   for e in nx.dfs_edges(net, source=root))

    def plot_potential_graph_sizes(sizes):
        # Diagnostic plot: one line per entry of `sizes`, key vs. value.
        plt.figure(figsize=(10, 10))
        for i in range(len(sizes)):
            try:
                keys = list(sizes[i].keys())
                plt.plot(keys, [sizes[i][k] for k in keys])
            except Exception as err:
                # Best effort: an entry that cannot be plotted is skipped,
                # but interrupts are no longer swallowed.
                if verbose:
                    print("Skipping potential graph sizes entry " + str(i) +
                          ": " + str(err))
                continue
        # plt.xlim(0, int(cutoff))
        plt.xlabel("LCA Distance")
        plt.ylabel("Size of Potential Graph")
        plt.savefig(out_stem + "_potentialgraphsizes.pdf")

    if args.greedy:

        target_nodes = build_target_nodes()

        if verbose:
            print("Read in " + str(cm.shape[0]) + " Cells")
            print("Running Greedy Algorithm on " + str(len(target_nodes)) +
                  " Unique States")

        reconstructed_network_greedy, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="greedy",
            prior_probabilities=prior_probs,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        net = reconstructed_network_greedy.get_network()

        dump_pickle(reconstructed_network_greedy)

        write_newick(reconstructed_network_greedy.get_newick())

        print("Parsimony: " + str(parsimony_from_root(net)))

    elif args.hybrid:

        target_nodes = build_target_nodes()

        if verbose:
            print("Running Hybrid Algorithm on " + str(len(target_nodes)) +
                  " Cells")
            if lca_mode:
                print(
                    "Parameters: ILP on sets of cells with a maximum LCA distance of "
                    + str(lca_cutoff) + " with " + str(time_limit) +
                    "s to complete optimization")
            else:
                print("Parameters: ILP on sets of " + str(cell_cutoff) +
                      " cells with " + str(time_limit) +
                      "s to complete optimization")

        print("running algorithm...")
        reconstructed_network_hybrid, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            maximum_alt_solutions=num_alt_soln,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        net = reconstructed_network_hybrid.get_network()

        if verbose:
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_hybrid)

        write_newick(reconstructed_network_hybrid.get_newick())

        ## plot out diagnostic potential graph sizes
        plot_potential_graph_sizes(potential_graph_sizes)

        # score parsimony over every edge of the network
        score = 0
        for e in net.edges():
            score += e[0].get_mut_length(e[1])

        print("Parsimony: " + str(score))

    elif args.ilp:

        target_nodes = build_target_nodes()

        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes)) +
                  " Unique Cells")
            print("Paramters: ILP allowed " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            maximum_alt_solutions=num_alt_soln,
        )

        net = reconstructed_network_ilp.get_network()

        print("Parsimony: " + str(parsimony_from_root(net)))

        newick = reconstructed_network_ilp.get_newick()

        if verbose:
            print("Writing the tree to output...")

        dump_pickle(reconstructed_network_ilp)

        write_newick(newick)

        plot_potential_graph_sizes(potential_graph_sizes)

    elif args.neighbor_joining:

        ret_tree = run_nj_naive(cm_uniq, stem, verbose)

        dump_pickle(ret_tree)

        write_newick(ret_tree.get_newick())

    elif args.neighbor_joining_weighted:

        ret_tree = run_nj_weighted(cm_uniq, prior_probs, verbose)

        dump_pickle(ret_tree)

        write_newick(ret_tree.get_newick())

    elif args.camin_sokal:

        ret_tree = run_camin_sokal(cm_uniq, stem, verbose)

        dump_pickle(ret_tree)

        # This branch converts the raw network instead of calling
        # ret_tree.get_newick().
        write_newick(convert_network_to_newick_format(ret_tree.get_network()))

    elif args.max_likelihood:

        if verbose:
            print("Running Maximum Likelihood on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + "infile.txt"
        fn = stem + "phylo.txt"

        cm.to_csv(fn, sep="\t")

        # Binarize the character matrix with the bundled helper script and
        # wait for it to finish before running FastTree.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        subprocess.Popen(cmd, shell=True).wait()

        # FastTree writes its newick tree straight into out_fp.
        os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " +
                  out_fp)

        # next() works on both Python 2 and 3, unlike the .next() method.
        tree = next(Phylo.parse(out_fp, "newick"))

        ml_net = Phylo.to_networkx(tree)

        # Give every unnamed clade a unique "internal<i>" name.
        i = 0
        for n in ml_net:
            if n.name is None:
                n.name = "internal" + str(i)
                i += 1

        # Relabel the clade objects with their string form.
        ml_net = nx.relabel_nodes(ml_net, {n: str(n) for n in ml_net.nodes()})

        dump_pickle(ml_net)

        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal"
        )
Esempio n. 28
0
def main():
    """
    Reconstruct a lineage tree from the leaves of a pickled network.

    Reads the network at ``netfp`` (unwrapping a ``Cassiopeia_Tree`` if
    needed), de-duplicates its leaves by character string, runs the single
    algorithm selected on the command line (greedy, hybrid, ilp,
    neighbor-joining, weighted neighbor-joining or camin-sokal) and pickles
    the result to ``--out_fp``.  When ``--out_fp`` is omitted the output name
    is the input file name with "true" replaced by an algorithm tag.

    Raises:
        Exception: if no algorithm flag was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # Parsed as int so a user-supplied value has the same type as the
    # integer default (it used to arrive as a string).
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # `cutoff` is referenced by the hybrid/ILP branches below; it was never
    # bound before, which raised NameError on those paths.
    cutoff = args.cutoff
    if args.hybrid_lca_mode:
        lca_cutoff = cutoff
        cell_cutoff = None
    else:
        cell_cutoff = cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size
    seed = args.seed

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    prior_probs = None
    if args.mutation_map != "":
        # NOTE(review): unpickling executes arbitrary code; only load
        # mutation maps from trusted sources.
        with open(args.mutation_map, "rb") as handle:
            prior_probs = pic.load(handle)

    name = netfp.split("/")[-1]
    name_parts = name.split(".")[:-1]
    stem = ".".join(name_parts)
    # Temporary-file prefix: same pieces as `stem` but joined without dots.
    flat_stem = "".join(name_parts)

    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    # Keep the first leaf seen for each distinct character string.
    target_nodes_uniq = []
    seen_charstrings = set()
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.add(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        _dump_pickle(net, outfp)

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        _dump_pickle(net, outfp)

    elif args.ilp:

        if verbose:
            print("Running ILP Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        # NOTE(review): this branch passes `hybrid_subset_cutoff` while the
        # hybrid branch uses `hybrid_cell_cutoff`/`hybrid_lca_cutoff`;
        # confirm the keyword against solve_lineage_instance.
        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]
        if outfp is None:
            outfp = name.replace("true", "ilp")
        _dump_pickle(net, outfp)

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = flat_stem + "infile.txt"
        fn = flat_stem + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        # Binarize the character matrix into a relaxed-phylip alignment.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        subprocess.call(cmd, shell=True)

        aln = AlignIO.read(infile, "phylip-relaxed")

        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # Swap Bio.Phylo clades for Node objects; unnamed (internal) clades
        # all receive the placeholder name "state-node".
        rndict = {}
        for n in nj_net:
            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)

        cm_lookup = _charstring_lookup(cm)

        nj_net = fill_in_tree(nj_net, cm)

        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        _dump_pickle(nj_net, outfp)

        _remove_files(infile, fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids
        dm = sp.spatial.distance.squareform(dm)

        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = _charstring_lookup(cm_uniq)

        for n in nj_net:
            n.is_target = n.char_string in cm_lookup

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        _dump_pickle(nj_net, outfp)

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Rename leaves to their position so the external tools see short
        # names; remember the original names to restore them afterwards.
        samples_to_cells = {}
        indices = []
        for i, n in enumerate(target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = flat_stem + "_cs_infile.txt"
        fn = flat_stem + "_cs_phylo.txt"
        weights_fn = flat_stem + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        subprocess.call(cmd, shell=True)

        # Called with the weights file name; the return value is not used.
        construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"
        # run phylip mix with camin-sokal; the responses file is fed to the
        # program on stdin in place of interactive input.
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as response_file:
            response_file.write(infile + "\n")
            response_file.write("F\n" + outfile + "\n")
            response_file.write("P\n")
            response_file.write("W\n")
            response_file.write("Y\n")
            response_file.write(weights_fn + "\n")
            response_file.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        subprocess.call(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        with open(responses, "w") as response_file:
            response_file.write(outtree + "\n")
            response_file.write("F\n" + consense_outfile + "\n")
            response_file.write("Y\n")
            response_file.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        subprocess.call(cmd, shell=True)

        # The consensus tree may span several lines; join them into one
        # newick string.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(line.strip() for line in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)

        cs_net = tree_collapse2(cs_net)

        cm_lookup = _charstring_lookup(cm)

        for n in cs_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        _dump_pickle(cs_net, outfp)

        _remove_files(outfile, responses, outtree, consense_outfile, infile,
                      fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file handle when done."""
    with open(path, "wb") as handle:
        pic.dump(obj, handle)


def _remove_files(*paths):
    """Delete temporary files; report, but do not fail on, any that cannot be removed."""
    for path in paths:
        try:
            os.remove(path)
        except OSError as err:
            print("Warning: could not remove " + str(path) + ": " + str(err))


def _charstring_lookup(cm):
    """Map each row of *cm*, rendered as a "|"-joined string, to its index label."""
    return dict(
        zip(
            list(
                cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                         axis=1)),
            cm.index.values,
        ))
    "[Required] Location of the file containing nodes of interest for network analysis"
)

# Parse options into variables
(options, args) = parser.parse_args()

treepath = options.treepath
namepath = options.namepath
if treepath is None or namepath is None:
    print "Invalid options"
    sys.exit(1)

# load Tree
trees = Phylo.parse(treepath, 'newick')
Tree = trees.next()
tree = Phylo.to_networkx(Tree)

# load names
f = open(namepath)
lines = f.readlines()
names = []
for line in lines:
    if line:
        names.append(line.strip())

# Analysis

nodes = tree.nodes()
leaves = []
for node in nodes:
    if node.name is not None:
Esempio n. 30
0
 def test_to_networkx(self):
     """Convert the example tree to a graph (needs networkx) and check its size."""
     graph = Phylo.to_networkx(Phylo.read(EX_DOLLO, 'phyloxml'))
     node_total = len(graph.nodes())
     self.assertEqual(node_total, 659)
Esempio n. 31
0
def createHashedPTreeGraph(tree,
                           maxLeafHashValue=2000000000000000000000,
                           refTree=None,
                           treeName="anonymous",
                           hashAlgorithm="pow"):
    """Build a DFS-ordered networkx graph of *tree* and hash every node.

    Every clade receives an integer ``id`` (pre-order, starting at 0).
    Leaves are hashed first: ``2**k`` for the k-th leaf when
    ``hashAlgorithm == "pow"``, or a random integer when it is ``"rand"``.
    If *refTree* (a dict previously returned by this function) is given, a
    leaf whose name also occurs in the reference network reuses the
    reference hash.  Each remaining node gets the combination of its
    children's hashes (bitwise OR for "pow", XOR for "rand").

    Args:
        tree: tree object accepted by ``Phylo.to_networkx``; its ``clade``
            attribute is used as the root.
        maxLeafHashValue: bound used when drawing random hashes.
        refTree: optional result of an earlier call, used to align hashes.
        treeName: label used only in log output.
        hashAlgorithm: ``"pow"`` or ``"rand"``.

    Returns:
        dict with keys ``"network"`` (the DFS tree), ``"nodeToHashMapping"``
        (node id -> {"clade", "hash"}) and ``"hashToNodeMapping"``
        (hash -> the same record).

    Raises:
        ValueError: if *hashAlgorithm* is not a supported value.
    """
    if hashAlgorithm not in ("pow", "rand"):
        raise ValueError(
            "Unknown hashAlgorithm {!r}; expected 'pow' or 'rand'".format(
                hashAlgorithm))

    print("[INFO] Constructing hashed PTree graph via networkx for {} tree".
          format(treeName))

    if refTree is None:
        print("[INFO] Using purely new node ids for nodes")
    else:
        print("[INFO] Using reference tree ids for nodes")

    # To each node we assign ids that are integers starting from 0
    def assignPhyloIds(tree, nextFreeId=0):
        tree.id = nextFreeId
        nextFreeId = nextFreeId + 1

        for child in tree:
            nextFreeId = assignPhyloIds(child, nextFreeId)
        return nextFreeId

    print("[INFO] Assigning random leafs hashes in range 0 to {}".format(
        maxLeafHashValue))
    assignPhyloIds(tree.clade)
    net = Phylo.to_networkx(tree)

    # Create directed DFS traversal graph for tree
    startingNode = tree.clade
    dfsTree = nx.dfs_tree(net, source=startingNode)
    if refTree is not None:
        standarizeGraphLabeling(dfsTree, refTree["network"])

    # Assign hashes to all leafs
    leafId = 1
    nodeToHashMapping = {}
    hashToNodeMapping = {}
    for node in dfsTree.nodes():
        # len(node) is used as the child count: only childless nodes
        # (leaves) are hashed in this pass.
        if len(node) > 0:
            continue

        if hashAlgorithm == "pow":
            leafHash = 2**leafId
        else:
            # Previously stored in `nodeHash`, leaving `leafHash` unset or
            # stale for the "rand" algorithm.
            leafHash = random.randint(1, maxLeafHashValue + 1)

        leafId = leafId + 1
        if refTree is not None and node.name is not None:
            # Reuse the reference hash for a leaf with the same name.
            foundNodes = [
                n for n in refTree["network"].nodes()
                if n.name == node.name
            ]
            if foundNodes:
                leafHash = refTree["nodeToHashMapping"][
                    foundNodes[0].id]["hash"]

        hashObj = {"clade": node, "hash": leafHash}
        nodeToHashMapping[node.id] = hashObj
        hashToNodeMapping[leafHash] = hashObj

    print("[INFO] Generating hashes for the rest of nodes")

    # Function that recursively generates hashes for nodes from
    # hashes of the children nodes
    def recAssignHashes(node, parent):

        # Children first, so their hashes exist when this node is combined.
        for child in nx.neighbors(dfsTree, node):
            recAssignHashes(child, node)

        if node.id not in nodeToHashMapping:
            nodeHash = None
            for child in nx.neighbors(dfsTree, node):
                if child.id in nodeToHashMapping:
                    childHash = nodeToHashMapping[child.id]["hash"]
                    if nodeHash is None:
                        nodeHash = childHash
                    elif hashAlgorithm == "pow":
                        nodeHash = nodeHash | childHash
                    elif hashAlgorithm == "rand":
                        nodeHash = nodeHash ^ childHash

            # No hashed child: fall back to a random hash.
            if nodeHash is None:
                nodeHash = random.randint(1, maxLeafHashValue + 1)

            hashObj = {"clade": node, "hash": nodeHash}
            nodeToHashMapping[node.id] = hashObj
            hashToNodeMapping[nodeHash] = hashObj

    recAssignHashes(startingNode, None)
    return {
        "network": dfsTree,
        "nodeToHashMapping": nodeToHashMapping,
        "hashToNodeMapping": hashToNodeMapping
    }
Esempio n. 32
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare trees built from muscle and cmalign alignments and plot them.

    For every pair of (muscle, structural) alignments a BIONJ tree is built
    for each, the pairwise terminal distances are collected into two
    matrices, ``tree_similarity`` is run on them, and both trees are drawn
    as graph embeddings into a two-panel figure saved as PostScript.

    Args:
        seqs, profiles: passed through to ``setAlignments``.
        run_id: identifier used in cache register names and output names.
        reset: passed through to the alignment cache call.
    """
    import networkx

    print('computing alignments...')
    print('  ...using muscle')
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments, 
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles, 
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute', 
                                  register = 'tuali_musc_{0}'.format(run_id))) 
    print('  ...using cmalign.')
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments, 
                     **mem.rc({},
                              seqs = seqs, profiles = profiles, 
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute', 
                              register = 'tuali__struct_{0}'.format(run_id)))
 
    print('  ...making trees.')
    
    for idx, (m, s) in enumerate(zip(malis, salis)):
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)
        
        # Row/column index of each sequence id in the distance matrices.
        maps = dict((elt.id, i) for i, elt in enumerate(m))
        mdists = zeros((len(maps),len(maps)))
        sdists = zeros((len(maps),len(maps)))
        for n1 in mtree.get_terminals():
            for n2 in mtree.get_terminals():
                mdists[maps[n1.name],maps[n2.name]] = \
                    mtree.distance(n1,n2)
        
        for n1 in stree.get_terminals():
            for n2 in stree.get_terminals():
                sdists[maps[n1.name],maps[n2.name]] = \
                    stree.distance(n1,n2)
        # k was `len(sdists - 1)`, which subtracts 1 from every matrix entry
        # and still yields len(sdists); the intended value is one less than
        # the number of rows.
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = len(sdists) - 1)
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # NOTE(review): discard_old_labels, node_labels and
            # pygraphviz_layout look like an old networkx API - confirm the
            # pinned networkx version before upgrading.
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)


            # Only nodes whose name is a known sequence id are drawn with a
            # visible size; all labels are blanked.
            networkx.draw(G, posn, labels = dict((n, '') for n in G.nodes()),
                      node_size = [100 if n.name in maps else 0 for n in G.nodes()],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] )
        

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')            

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
Esempio n. 33
0
    reconstructed_network = pic.load(open(reconstructed_fp, "rb"),
                                     encoding="latin1")

    nodes = [n for n in reconstructed_network.nodes()]
    encoder = dict(zip(nodes, map(lambda x: x.split("_")[0], nodes)))

    reconstructed_network = nx.relabel_nodes(reconstructed_network, encoder)

else:
    k = map(lambda x: "s" + x.split("_")[-1], target_nodes_original_network)
    s_to_char = dict(zip(k, target_nodes))
    char_to_s = dict(zip(target_nodes, k))

    reconstructed_tree = next(Phylo.parse(reconstructed_fp, "newick"))
    reconstructed_tree.rooted = True
    reconstructed_network = Phylo.to_networkx(reconstructed_tree)

    i = 1
    for n in reconstructed_network:
        if n.name is None:
            n.name = "i" + str(i)
            i += 1

    # convert labels to strings, not Bio.Phylo.Clade objects
    c2str = map(lambda x: x.name, reconstructed_network.nodes())
    c2strdict = dict(zip(reconstructed_network.nodes(), c2str))
    reconstructed_network = nx.relabel_nodes(reconstructed_network, c2strdict)

    # convert labels to characters for triplets correct analysis
    reconstructed_network = nx.relabel_nodes(reconstructed_network, s_to_char)
from Bio import Phylo
import subprocess, networkx, re
from functools import reduce
from calculations.python.paths import *

# Phylo incorrectly parses the gisaid_china.MCC.trees file, so we modify it,
# assuming that EPI_ISL_\d+ is an identifier.
# The command is a raw f-string so the sed patterns (\S, \|) reach the shell
# verbatim instead of relying on Python passing unknown escapes through.
subprocess.check_output(
    rf'sed -E "s/h\S+EPI_ISL_/EPI_ISL_/" {data_path(TREE_UNFILTERED)} | sed -E "s/\|\S+[^,]//" > {data_path(TREE_FILTERED)}',
    shell=True)
trees = Phylo.parse(data_path(TREE_FILTERED), 'nexus')
tree = next(trees)  # only the first tree in the file is used
net = Phylo.to_networkx(tree).to_undirected()

# descendants_at_level[i] keeps a dict
# {node: nodes that can be reached with exactly i steps}.
# The graph is undirected, so a step may also move back towards the root.
descendants_at_level = [{node: {node} for node in net.nodes}]

LEVEL = 2

for i in range(LEVEL):
    prev_level = descendants_at_level[i]
    descendants_at_level.append({
        # set().union(*...) also copes with an empty previous level, where
        # reduce() without an initial value would raise TypeError.
        node: set().union(*(net.neighbors(n) for n in prev_level[node]))
        for node in net.nodes
    })

#dict actually used to generate list of genomes to compare
d = {
    node: reduce(set.union,
                 (descendants_at_level[i][node] for i in range(LEVEL + 1)))
    for node in net.nodes
Esempio n. 35
0
 def test_to_networkx(self):
     """Convert the example tree to a graph (needs networkx) and check its size."""
     graph = Phylo.to_networkx(Phylo.read(EX_DOLLO, 'phyloxml'))
     node_total = len(graph.nodes())
     self.assertEqual(node_total, 659)
Esempio n. 36
0
def make_header():
    """Render ``eigen.hpp`` from ``eigen.j2`` for the example tree and data.

    Reads ``example.tree`` (newick) and ``example.nexus``, relabels tree
    nodes with integers (named nodes take ids from 0, unnamed nodes continue
    from the number of tips), builds a rate matrix and per-tip partials, and
    writes the rendered template to ``eigen.hpp``.

    Returns:
        dict with keys ``'S'`` (number of tips), ``'L'`` (number of sites),
        ``'map'`` (array of 1-based [node, parent] rows in DFS pre-order),
        ``'rate'`` and ``'lower_root'``.
    """
    tree_file = 'example.tree'
    tip_data = SeqIO.to_dict(SeqIO.parse("example.nexus", "nexus"))
    NUM_SITES = len(next(iter(tip_data.values())))
    assert all(len(v) == NUM_SITES for v in tip_data.values())

    ## Tree parsing
    G = Phylo.to_networkx(Phylo.read(tree_file, 'newick', rooted=True))
    # FIXME this just arbitrarily assigns the leaves to the first n nodes in some
    # way. Need to make sure it matches up with tip_data.
    print(G.nodes())
    n = len(tip_data)
    leaves = iter(range(n))
    # Only n - 1 interior ids are available; presumably the tree is binary.
    # TODO confirm, since next() raises StopIteration otherwise.
    interior = iter(range(n, 2 * n - 1))
    node_remap = {}
    tip_remap = {}
    for c in nx.dfs_postorder_nodes(G):
        if c.name is not None:
            node_id = next(leaves)
            node_remap[c] = node_id
            tip_remap[node_id] = tip_data[c.name]
        else:
            node_remap[c] = next(interior)
    G = nx.relabel_nodes(G, mapping=node_remap)
    postorder = list(nx.dfs_postorder_nodes(G))
    preorder_map = np.empty((len(G), 2), dtype=int)
    preorder_map[:, 0] = list(nx.dfs_preorder_nodes(G))
    for i in range(len(G)):
        parents = list(G.pred[preorder_map[i, 0]])
        # A node without a predecessor (the root) gets parent 0 here.
        preorder_map[i, 1] = parents[0] if parents else 0
    preorder_map += 1  # shift node and parent ids to 1-based
    child_parent = [None] * len(
        G)  # the i-th entry of child_parent is the parent of node i
    for k in G.pred:
        child_parent[k] = list(G.pred[k])[0] if G.pred[k] else -1

    ## Rate matrix
    mu = .25
    Q = np.full((4, 4), mu)
    np.fill_diagonal(Q, -3 * mu)
    pi = np.ones(4) / 4

    ## Initial partials
    encoding = dict(zip('actg', np.eye(4)))
    encoding['-'] = np.ones(4)  # a gap matches every base
    print(tip_data)
    tip_partials = []
    sparse_tip_partials = []
    for i in range(n):
        v = tip_remap[i]
        tip_partials.append(np.transpose([encoding[vv.lower()] for vv in v
                                          ]))  ## FIXME: ordering is arbitrary
        sp = scipy.sparse.coo_matrix(tip_partials[-1])
        # Materialise the triples: a bare zip object would be empty after the
        # template's first pass over it.
        sparse_tip_partials.append(list(zip(sp.row, sp.col, sp.data)))
    print(tip_partials)

    ## Write C++ header file
    with open('eigen.j2', 'rt') as template_file:
        hpp = jinja2.Template(template_file.read())
    with open("eigen.hpp", "wt") as out:
        s = hpp.render(child_parent=child_parent,
                       postorder=postorder,
                       Q=Q,
                       pi=pi,
                       tip_partials=tip_partials,
                       sparse_tip_partials=sparse_tip_partials,
                       num_sites=NUM_SITES)
        out.write(s)
    data = {
        'S': n,
        'L': NUM_SITES,
        'map': preorder_map,
        'rate': 1.0,
        'lower_root': 0.0
    }
    return data