def nwk2gml(
        treefile,
        filename='',
):
    """
    Convert a tree in Newick format to a directed network.

    Parameters
    ----------
    treefile : str or tree object
        Either a str defining the path to a file containing the tree in
        Newick format, or the tree string itself. Any value that is not a
        str is used directly as an already loaded tree object.
    filename : str (default='')
        Handed on unchanged, together with the graph, to
        ``_graph_or_file``, which decides between returning the graph and
        writing a GML file.

    Returns
    -------
    graph : networkx.DiGraph
        The result of ``_graph_or_file(graph, filename)``. Every node of the
        tree is a node of the graph carrying a boolean ``tip`` attribute,
        and every parent-child relation is an edge ``parent -> child``.
    """
    # create an empty graph
    graph = nx.DiGraph()

    # load the tree: first interpret the string as a path and fall back to
    # interpreting it as a Newick string if that fails
    if isinstance(treefile, str):
        try:
            tree = cg.LoadTree(treefile)
        except Exception:
            tree = cg.LoadTree(treestring=treefile)
    else:
        tree = treefile

    # get the node names of the tree
    nodes = tree.getNodeNames()

    # get taxa for convenience (as a set, for constant-time membership tests)
    taxa = set(tree.getTipNames())

    # iterate over the nodes and add them and the edges to the graph
    for node in nodes:
        # add the node explicitly so that it carries the tip flag
        graph.add_node(node, tip=node in taxa)

        # get the parent of the node
        parent = tree.getNodeMatchingName(node).Parent

        # add the edge only if there is a parent (the root has none)
        if parent:
            graph.add_edge(parent.Name, node)

    return _graph_or_file(graph, filename)
def matrix2tree(matrix, taxa, tree_calc="neighbor", distances=True, filename=""):
    """
    Calculate a tree from a given distance matrix.

    Parameters
    ----------
    matrix : list
        The distance matrix to be used.
    taxa : list
        A list of the taxa in the distance matrix.
    tree_calc : str (default="neighbor")
        The clustering method used to build the tree. Select between:

        * "neighbor": Neighbor-joining method (:evobib:`Saitou1987`)
        * "upgma" : UPGMA method (:evobib:`Sokal1958`)

    distances : bool (default=True)
        Passed on to the clustering function; if set to c{True}, distances
        will be included in the tree-representation.
    filename : str (default='')
        If a filename is specified, the tree is written to
        ``filename + '.nwk'`` instead of being returned.

    Returns
    -------
    tree : ~lingpy.thirdparty.cogent.tree.PhyloNode or None
        The tree object if no filename was given, otherwise c{None}.

    Raises
    ------
    ValueError
        If ``tree_calc`` is neither "neighbor" nor "upgma".
    """
    # pick the clustering routine that produces the newick string
    if tree_calc == 'neighbor':
        build_newick = cluster.neighbor
    elif tree_calc == 'upgma':
        build_newick = cluster.upgma
    else:
        raise ValueError(tree_calc)

    newick = build_newick(matrix, taxa, distances)
    tree = cg.LoadTree(treestring=newick)

    # with a filename, the tree is only written to disk
    if filename:
        util.write_text_file(filename + '.nwk', text_type(tree))
        return None

    return tree
def nwk2tree_matrix(newick):
    """
    Convert a newick string or a tree object to a tree matrix.

    Parameters
    ----------
    newick : str or tree object
        Either a tree in Newick format, or an already loaded tree object
        (recognised by having a ``root`` attribute).

    Returns
    -------
    matrix : list
        One row ``[idxA, idxB, dst, obs]`` per internal node, ordered by the
        number of tips below the node. ``idxA`` and ``idxB`` are the ids of
        the first two children, ``obs`` is the number of tips below the node
        and ``dst`` is the same number as a float.
    taxa : list
        The taxa of the tree, ordered by decreasing number of edges between
        them and the node named ``'root'``. The position of a taxon in this
        list is its id in the matrix.

    Raises
    ------
    TypeError
        If ``newick`` is neither a str nor an object with a ``root``
        attribute.

    Notes
    -----
    This is an additional function that can be used for plots with help of
    matplotlibs functions. The tree_matrix is compatible with those matrices
    that scipy's linkage functions create. Only the first two children of
    each internal node are used, so the tree is expected to be binary.
    """
    if isinstance(newick, str):
        tree = cg.LoadTree(treestring=newick)
    elif hasattr(newick, 'root'):
        tree = newick
    else:
        # previously this fell through and failed later with an unbound name
        raise TypeError(
            "newick must be a str or a tree object, got {0!r}".format(
                type(newick).__name__))

    # taxa farthest from the root come first
    taxa = sorted(
        tree.taxa,
        key=lambda x: len(tree.getConnectingEdges('root', x)),
        reverse=True)
    tax2id = dict(zip(taxa, range(len(taxa))))

    # internal nodes, smallest subtrees first, so that the children of a node
    # have always received their id before the node itself is processed
    tip_names = set(taxa)
    nodes = sorted(
        (t for t in tree.getNodeNames() if t not in tip_names),
        key=lambda x: len(tree.getNodeMatchingName(x).tips()))

    matrix = []
    # ids of internal nodes simply continue the numbering of the taxa
    next_id = len(taxa)
    for node in nodes:
        n = tree.getNodeMatchingName(node)
        names = [c.Name for c in n.Children]

        idxA = tax2id[names[0]]
        idxB = tax2id[names[1]]

        tax2id[node] = next_id
        next_id += 1

        obs = len(n.tips())
        matrix.append([idxA, idxB, float(obs), obs])

    return matrix, taxa
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will
        be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the
        lines of the input file (or to the value of the first column, if the
        first header cell is "id", "local_id" or "localid"). The lower-cased
        header is given 0 as a specific key. All meta data collected from
        ``@key: value`` lines and ``<tag> ... </tag>`` blocks is stored under
        string keys in the same dictionary.

    Raises
    ------
    Exception
        If a local id cannot be converted to an integer.
    IndexError
        If a ``<tag>`` block is never closed, or if the file contains no
        data rows at all.

    Notes
    -----
    The ``dtype`` attribute of a ``<csv>`` block is passed to ``eval``, so
    this function must only be used on files from trusted sources.
    """
    # function-scope import keeps this block independent of the file header
    from collections import deque

    # a deque allows consuming lines from the front in constant time
    lines = deque(read_text_file(infile, lines=True, normalize="NFC"))
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.popleft()
        if line.startswith(comment) or not line:
            continue

        # simple "@key: value" meta data
        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        # repeated keys are collected in a list
                        meta[key] = [meta[key]] + [value]

        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]

            # check for specific keywords (attributes of the form key="value")
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]  # strip the surrounding quote characters
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            # collect everything up to the closing tag; running out of lines
            # raises IndexError, as it did with the list-based version
            tmp = []
            while True:
                line = lines.popleft()
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]

            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)

            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # NOTE(security): eval() executes text taken verbatim
                    # from the input file; only load trusted files.
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b

            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                # fall back to the file name if no dataset was declared
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except KeyError:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # use integer ids where possible, the raw string otherwise
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # mirror the cells above the diagonal to get a symmetric
                # matrix with zeros on the diagonal
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances

            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]

        # everything else is a tab-separated data row
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception(
            "Error processing line {0}:\n".format(j) + str(data[1:][j]) +
            '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    # expose the tree with the smallest id as the default tree
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
def gls2gml(
        gls,
        graph,
        tree,
        filename='',
):
    """
    Create GML-representation of a given gain-loss-scenario (GLS).

    Parameters
    ----------
    gls : list
        A list of tuples ``(node_name, event)``, indicating the origins of
        characters along a tree. An event of 1 is treated as a gain, every
        other value as a loss. The list must not be empty.
    graph : networkx.graph
        A graph that serves as a template for the plotting of the GLS. Every
        node needs a ``label`` and a ``graphics`` attribute.
    tree : cogent.tree.PhyloNode
        A tree object, or a str containing the tree in Newick format.
    filename : str (default='')
        Handed on unchanged, together with the new graph, to
        ``_graph_or_file``.

    Returns
    -------
    The result of ``_graph_or_file(g, filename)``, where ``g`` is an
    undirected copy of the template whose nodes carry ``state`` ('O' / 'L'
    for the node at which a gain / loss originates, 'o' / 'l' for nodes
    inheriting it), ``origin`` (1 for origins, 0 otherwise) and updated
    ``graphics`` attributes.

    Notes
    -----
    The attribute dictionaries of the template graph are modified while the
    new graph is built, so ``graph`` should be treated as consumed by this
    function.
    """
    # check for tree-formatting
    if isinstance(tree, str):
        tree = cg.LoadTree(treestring=tree)

    # create a mapper for the ids and the string-names
    mapper = {}
    for node, data in graph.nodes(data=True):
        mapper[data['label']] = node

    # create a graph
    g = nx.Graph()

    # sort the gls according to the number of tips, so that events on large
    # subtrees are applied first and overwritten by events further down
    gls_srt = sorted(
        gls,
        key=lambda x: len(tree.getNodeMatchingName(x[0]).tips()),
        reverse=True)

    # set the basic event frame, depending on the state of the root
    if gls_srt[0][1] == 1 and gls_srt[0][0] == 'root':
        this_color = "#ffffff"
        state = 'O'
    else:
        this_color = "#000000"
        state = 'l'

    # let all nodes inherit these parameters
    for node, data in graph.nodes(data=True):
        data['graphics']['fill'] = this_color
        data['graphics']['type'] = 'ellipse'
        data['graphics']['w'] = 20.0
        data['graphics']['h'] = 20.0
        data['origin'] = 0
        data['state'] = state
        g.add_node(node, **data)

    # assign the root as starting point
    data = graph.nodes[mapper['root']]
    data['graphics']['type'] = 'ellipse'
    data['graphics']['w'] = 50.0
    data['graphics']['h'] = 50.0
    data['state'] = state
    g.add_node(mapper['root'], **data)

    # iterate over the nodes involved in change and assign the values to
    # their children
    for name, event in gls_srt:
        if event == 1:
            this_fill = '#ffffff'
            state = 'O'
        else:
            this_fill = '#000000'
            state = 'L'

        # get the names of the descendant nodes in the subtree
        sub_tree_nodes = tree.getNodeMatchingName(name).getNodeNames()

        # iterate over all nodes to change; they inherit the event and are
        # therefore marked with the lower-case state
        for node in sub_tree_nodes:
            data = g.nodes[mapper[node]]
            data['graphics']['fill'] = this_fill
            data['state'] = state.lower()
            g.add_node(mapper[node], **data)

        # change the size of the root of the subtree and mark it as origin
        origin = g.nodes[mapper[name]]
        origin['graphics']['h'] = 50.0
        origin['graphics']['w'] = 50.0
        origin['graphics']['fill'] = this_fill
        origin['origin'] = 1
        origin['state'] = state

    # add the edges to the tree
    for edgeA, edgeB, data in graph.edges(data=True):
        # for computers with new networkx version: drop the 'Line' entry if
        # there is one, and carry on if the edge has no such graphics data
        try:
            del data['graphics']['Line']
        except (KeyError, TypeError):
            pass
        g.add_edge(edgeA, edgeB, **data)

    return _graph_or_file(g, filename)
def radial_layout(treestring, change=lambda x: x**1.75, degree=100,
                  filename='', start=0, root='root'):
    """
    Function calculates a simple radial tree layout.

    Parameters
    ----------
    treestring : str
        Either a str defining the path to a file containing the tree in
        Newick-format, or the tree-string itself. Any value that is not a
        str is used directly as an already loaded tree object.
    change : function (default = lambda x: x**1.75)
        The function used to modify the radius in the polar projection of
        the tree.
    degree : int (default=100)
        The angle (in degrees) over which the leaves are spread.
    filename : str (default='')
        Handed on unchanged, together with the graph, to ``_graph_or_file``.
    start : int (default=0)
        The angle (in degrees) by which the whole layout is rotated.
    root : str (default='root')
        The name of the root node of the tree.

    Returns
    -------
    graph : networkx.Graph
        The result of ``_graph_or_file(graph, filename)``: a graph
        representation of the tree with coordinates specified in the
        graphics-attribute of the nodes.

    Notes
    -----
    This function creates a radial tree-layout from a given tree specified
    in Newick format. Leaves are placed on the outermost circle; every inner
    node is placed one circle further inside than its innermost child,
    halfway between the angles of its outermost children.
    """
    # function-scope import keeps this block independent of the file header
    from collections import deque

    # calculate the factor for projection from the degree
    pfactor = degree / 360

    # get starting factor
    startf = start * np.pi / 180

    # calculate the projection (should be centered)
    if degree <= 180:
        pstart = startf + (180 - degree) / 360 * np.pi
        pend = pstart + 2 * np.pi * pfactor
    else:
        pstart = startf + 0
        pend = startf + 2 * np.pi * pfactor

    # define private function for centering of nodes
    def get_center(nodes):
        # we need max and min of the theta values
        thetas = [n[0] for n in nodes]
        xA, xB = min(thetas), max(thetas)

        # the radius is simply decreased by 1
        y = min(n[1] for n in nodes) - 1

        # the theta-value lies halfway between the extremes
        x = xA + abs(xA - xB) / 2
        return x, y

    # get the tree: first interpret the string as a path and fall back to
    # interpreting it as a Newick string if that fails
    if isinstance(treestring, str):
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = cg.LoadTree(treestring=treestring)
    else:
        tree = treestring

    # get the leaves
    leaves = tree.getTipNames()

    # the radius of the tree is the longest path from the root to a leaf
    maxL = max(len(tree.getConnectingEdges(root, l)) for l in leaves)

    # get the initial coordinates
    coords = {}
    for node, x in zip(leaves, np.linspace(pstart, pend, len(leaves))):
        coords[node] = (x, maxL, 0)

    # assign leaves to queue
    queue = deque((l, 0) for l in leaves)

    # keep track of the nodes whose parent has already been placed
    visited = set()

    # start the loop
    while queue:
        # get the node
        node, dim = queue.popleft()

        # increase the dimension by 1
        dim += 1

        if node in visited:
            continue

        # get the parent and all of its children (the node and its siblings)
        parent_node = tree.getNodeMatchingName(node).Parent
        children = [child.Name for child in parent_node.Children]

        if not all(child in coords for child in children):
            # a sibling has not been placed yet, so try again later
            queue.append((node, dim))
            continue

        x, y = get_center([coords[child] for child in children])
        parent = parent_node.Name

        if parent == root:
            coords[parent] = (x, y, dim + 1)
        else:
            coords[parent] = (x, y, dim)
            queue.append((parent, dim))

        visited.update(children)

    # convert tree to graph
    graph = nwk2gml(treestring, filename=None)

    # iterate over the graph and assign the data
    for n, d in graph.nodes(data=True):
        x, y, z = coords[n]

        # change coordinates
        xN = change(y) * np.cos(x)
        yN = change(y) * np.sin(x)

        # get angle for text-rotation in degrees
        angle = x * 180 / np.pi

        # derive zorder from angle
        if angle <= 90:
            zorder = 90 - angle
        elif angle <= 270:
            zorder = angle - 90
        else:
            zorder = 90 + (360 - angle)

        # check for specific parts where the angle has to be adapted, so
        # that labels on the left half are not drawn upside down
        if 270 >= angle > 180:
            angle -= 180
            s = 'right'
        elif 180 >= angle >= 90:
            angle += 180
            s = 'right'
        else:
            s = 'left'

        # assign the data to the graph
        d['graphics'] = {
            'x': xN,
            'y': yN,
            'z': z,
            'angle': angle,
            's': s,
            'zorder': int(zorder),
        }

        # don't forget the label
        d['label'] = n

    return _graph_or_file(graph, filename)
#load long language names f = open('data/asjp/world_longnames.txt', 'r') rl = f.readlines() f.close() longnames = array([x.strip() for x in rl]) longNameToID = dict({(longnames[i], i) for i in range(0, len(longnames))}) #load long language names f = open('data/asjp/world_names.txt', 'r') rl = f.readlines() f.close() names = array([x.strip() for x in rl]) nameToID = dict({(names[i], i) for i in range(0, len(names))}) guideTree = cg.LoadTree("data/asjp/world-NWPV.nwk") #convert guideTree node names to integers as expected by Lingpy MSA for leaf in guideTree.tips(): leaf.Name = str(longNameToID[leaf.Name]) iteration = 1 numIterations = 100 while iteration <= numIterations: if iteration == 1: #mfile = open("replacement-weights.txt","r") #sounds = array(mfile.readline().strip().split("\t")) #repWeightsRaw = mfile.readlines() #mfile.close() #repWeights = array([x.strip().split('\t') for x in repWeightsRaw])
def plot_gls(gls, treestring, degree=90, fileformat='pdf', **keywords):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls : list
        The gain-loss scenario, handed on to ``gls2gml``.
    treestring : str
        The reference tree as a Newick string or as a path to a Newick file.
        If it can be loaded in neither way, the value itself is used as the
        tree object.
    degree : int (default=90)
        The angle handed on to ``radial_layout``.
    fileformat : str (default='pdf')
        The extension of the output file.
    **keywords
        Plotting options overriding the defaults: ``figsize``, ``left``,
        ``top``, ``bottom``, ``right``, ``radius``, ``textsize``,
        ``edgewidth``, ``linewidth``, ``scale_radius``, ``ylim``, ``xlim``,
        ``text``, ``gain_color``, ``loss_color``, ``gain_linestyle``,
        ``loss_linestyle``, ``ax_linewidth`` and ``filename`` (defaults to
        ``rcParams['filename']``).

    Notes
    -----
    The figure is written to ``filename + '.' + fileformat``; nothing is
    returned.
    """
    # get keyword defaults
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        radius=0.5,
        textsize=8,
        edgewidth=5,
        linewidth=2,
        scale_radius=1.2,
        ylim=1,
        xlim=1,
        text=True,
        gain_color='white',
        loss_color='black',
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        filename=rcParams['filename'])
    for k, v in defaults.items():
        keywords.setdefault(k, v)

    # set filename as variable for convenience
    filename = keywords['filename']

    # try the input as newick string first, then as path, and finally use
    # it as it is
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)
    graph = gls2gml(gls, tgraph, tree)

    # collect position and state of all nodes
    nodes = []
    for n, d in graph.nodes(data=True):
        g = d['graphics']
        nodes += [(g['x'], g['y'], d['state'])]

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # draw the edges; ``graph.nodes[...]`` replaces the ``graph.node[...]``
    # accessor, which is no longer available in current networkx versions
    for nA, nB in graph.edges():
        xA = graph.nodes[nA]['graphics']['x']
        xB = graph.nodes[nB]['graphics']['x']
        yA = graph.nodes[nA]['graphics']['y']
        yB = graph.nodes[nB]['graphics']['y']
        plt.plot(
            [xA, xB],
            [yA, yB],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
            zorder=1)

    # now, iterate over nodes: origins of an event ('O', 'L') are drawn in
    # full size with their own linestyle, all other nodes are scaled down
    for x, y, s in nodes:
        if s == 'O':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'],
                0,
                360,
                facecolor=keywords['gain_color'],
                linewidth=keywords['linewidth'],
                linestyle=keywords['gain_linestyle'])
        elif s == 'o':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['gain_color'],
                linewidth=keywords['linewidth'])
        elif s == 'L':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'],
                0,
                360,
                facecolor=keywords['loss_color'],
                linewidth=keywords['linewidth'],
                linestyle=keywords['loss_linestyle'])
        else:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['loss_color'],
                linewidth=keywords['linewidth'])
        figsp.add_artist(w)

        # if text is chosen as argument, label gains with 1 and losses with 0
        if keywords['text']:
            if s in 'Oo':
                t = '1'
                c = 'black'
            else:
                t = '0'
                c = 'white'

            plt.text(
                x,
                y,
                t,
                size=keywords['textsize'],
                color=c,
                va="center",
                ha="center",
                fontweight='bold')

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom'])

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)