def parse(data, ttable=None, treename=None):
    """
    Parse a newick string.

    *data* is any file-like object that can be coerced into shlex, or
    a string (converted to StringIO)

    *ttable* is a dictionary mapping node labels in the newick string
    to other values.  Leaf labels are looked up first as integers,
    then as strings; labels following a closing parenthesis are looked
    up as strings only.  Labels without a mapping are kept unchanged.

    *treename* is stored on every created node as ``node.treename``.

    Returns: the root node.

    Raises:
        ValueError: if a branch length is not a valid float, or the
            input ends where a branch length is expected.
        AssertionError: if the numbers of '(' and ')' differ when the
            tree description ends.
    """
    from ivy.tree import Node

    # isinstance (rather than an exact type match) also accepts
    # subclasses of the string types.
    if isinstance(data, types.StringTypes):
        data = StringIO(data)

    tokens = Tokenizer(data)

    node = None      # node currently being built / last node touched
    lp = 0           # count of '(' seen
    rp = 0           # count of ')' seen
    previous = None  # token handled in the previous iteration

    ni = 0  # node id counter (preorder) - zero-based indexing
    li = 0  # leaf index counter
    ii = 0  # internal node index counter
    pi = 0  # postorder sequence
    while 1:
        token = tokens.get_token()
        if token == ';' or token == tokens.eof:
            assert lp == rp, \
                "unbalanced parentheses in tree description: (%s, %s)" \
                % (lp, rp)
            break

        # internal node
        elif token == '(':
            lp += 1
            newnode = Node()
            newnode.ni = ni
            ni += 1
            newnode.isleaf = False
            newnode.ii = ii
            ii += 1
            newnode.treename = treename
            if node:
                # number from the previous sibling's right index, or
                # from the parent's left index for a first child
                if node.children:
                    newnode.left = node.children[-1].right + 1
                else:
                    newnode.left = node.left + 1
                node.add_child(newnode)
            else:
                # first node created: numbering starts at 1
                newnode.left = 1
            newnode.right = newnode.left + 1
            node = newnode

        elif token == ')':
            rp += 1
            # step up from the last child to the node being closed
            node = node.parent
            node.pi = pi
            pi += 1
            if node.children:
                node.right = node.children[-1].right + 1

        elif token == ',':
            # step back up to the parent so the next sibling attaches
            # to it
            node = node.parent
            if node.children:
                node.right = node.children[-1].right + 1

        # branch length
        elif token == ':':
            token = tokens.get_token()
            if token == '[':
                # a comment may precede the branch length value
                node.length_comment = tokens.parse_embedded_comment()
                token = tokens.get_token()

            if token == '':
                # The original raised a string exception here, which
                # Python >= 2.6 rejects with a TypeError; raise a real
                # exception carrying the same message instead.
                raise ValueError(
                    'unexpected end-of-file (expecting branch length)')
            try:
                brlen = float(token)
            except ValueError:
                raise ValueError("invalid literal for branch length, "
                                 "'%s'" % token)
            node.length = brlen

        # comment
        elif token == '[':
            node.comment = tokens.parse_embedded_comment()
            # slice instead of index so an empty comment ("[]") does
            # not raise IndexError
            if node.comment[:1] == '&':
                # metadata
                meta = META.findall(node.comment[1:])
                if meta:
                    node.meta = {}
                    for k, v in meta:
                        # SECURITY NOTE(review): eval() executes text
                        # taken from the input; only parse trusted
                        # data, or switch to a literal-only evaluator
                        # after checking which value forms must be
                        # supported.
                        v = eval(v.replace('{', '(').replace('}', ')'))
                        node.meta[k] = v

        # leaf node or internal node label
        else:
            if previous != ')':  # leaf node
                if ttable:
                    try:
                        ttoken = (ttable.get(int(token)) or
                                  ttable.get(token))
                    except ValueError:
                        # token is not an integer; string lookup only
                        ttoken = ttable.get(token)
                    if ttoken:
                        token = ttoken
                newnode = Node()
                newnode.ni = ni
                ni += 1
                newnode.pi = pi
                pi += 1
                # collapse whitespace runs to "_" and drop single quotes
                newnode.label = "_".join(token.split()).replace("'", "")
                newnode.isleaf = True
                newnode.li = li
                li += 1
                if node.children:
                    newnode.left = node.children[-1].right + 1
                else:
                    newnode.left = node.left + 1
                newnode.right = newnode.left + 1
                newnode.treename = treename
                node.add_child(newnode)
                node = newnode
            else:  # label
                if ttable:
                    node.label = ttable.get(token, token)
                else:
                    node.label = token

        previous = token

    node.isroot = True
    return node
def parse(data, ttable=None, treename=None):
    """
    Parse a newick string.

    *data* is any file-like object that can be coerced into shlex, or
    a string (converted to StringIO)

    *ttable* is a dictionary mapping node labels in the newick string
    to other values.  Leaf labels are looked up first as integers,
    then as strings; labels following a closing parenthesis are looked
    up as strings only.  Labels without a mapping are kept unchanged.

    *treename* is stored on every created node as ``node.treename``.

    Returns: the root node.

    Raises:
        ValueError: if a branch length is not a valid float, or the
            input ends where a branch length is expected.
        AssertionError: if the numbers of '(' and ')' differ when the
            tree description ends.
    """
    from ivy.tree import Node

    # isinstance (rather than an exact type match) also accepts
    # subclasses of the string types.
    if isinstance(data, types.StringTypes):
        data = StringIO(data)

    tokens = Tokenizer(data)

    node = None      # node currently being built / last node touched
    lp = 0           # count of '(' seen
    rp = 0           # count of ')' seen
    previous = None  # token handled in the previous iteration

    ni = 0  # node id counter (preorder) - zero-based indexing
    li = 0  # leaf index counter
    ii = 0  # internal node index counter
    pi = 0  # postorder sequence
    while 1:
        token = tokens.get_token()
        if token == ';' or token == tokens.eof:
            assert lp == rp, \
                "unbalanced parentheses in tree description: (%s, %s)" \
                % (lp, rp)
            break

        # internal node
        elif token == '(':
            lp += 1
            newnode = Node()
            newnode.ni = ni
            ni += 1
            newnode.isleaf = False
            newnode.ii = ii
            ii += 1
            newnode.treename = treename
            if node:
                # number from the previous sibling's right index, or
                # from the parent's left index for a first child
                if node.children:
                    newnode.left = node.children[-1].right + 1
                else:
                    newnode.left = node.left + 1
                node.add_child(newnode)
            else:
                # first node created: numbering starts at 1
                newnode.left = 1
            newnode.right = newnode.left + 1
            node = newnode

        elif token == ')':
            rp += 1
            # step up from the last child to the node being closed
            node = node.parent
            node.pi = pi
            pi += 1
            if node.children:
                node.right = node.children[-1].right + 1

        elif token == ',':
            # step back up to the parent so the next sibling attaches
            # to it
            node = node.parent
            if node.children:
                node.right = node.children[-1].right + 1

        # branch length
        elif token == ':':
            token = tokens.get_token()
            if token == '[':
                # a comment may precede the branch length value
                node.length_comment = tokens.parse_embedded_comment()
                token = tokens.get_token()

            if token == '':
                # The original raised a string exception here, which
                # Python >= 2.6 rejects with a TypeError; raise a real
                # exception carrying the same message instead.
                raise ValueError(
                    'unexpected end-of-file (expecting branch length)')
            try:
                brlen = float(token)
            except ValueError:
                raise ValueError("invalid literal for branch length, "
                                 "'%s'" % token)
            node.length = brlen

        # comment
        elif token == '[':
            node.comment = tokens.parse_embedded_comment()
            # slice instead of index so an empty comment ("[]") does
            # not raise IndexError
            if node.comment[:1] == '&':
                # metadata
                meta = META.findall(node.comment[1:])
                if meta:
                    node.meta = {}
                    for k, v in meta:
                        # SECURITY NOTE(review): eval() executes text
                        # taken from the input; only parse trusted
                        # data, or switch to a literal-only evaluator
                        # after checking which value forms must be
                        # supported.
                        v = eval(v.replace('{', '(').replace('}', ')'))
                        node.meta[k] = v

        # leaf node or internal node label
        else:
            if previous != ')':  # leaf node
                if ttable:
                    try:
                        ttoken = (ttable.get(int(token)) or
                                  ttable.get(token))
                    except ValueError:
                        # token is not an integer; string lookup only
                        ttoken = ttable.get(token)
                    if ttoken:
                        token = ttoken
                newnode = Node()
                newnode.ni = ni
                ni += 1
                newnode.pi = pi
                pi += 1
                # collapse whitespace runs to "_" and drop single quotes
                newnode.label = "_".join(token.split()).replace("'", "")
                newnode.isleaf = True
                newnode.li = li
                li += 1
                if node.children:
                    newnode.left = node.children[-1].right + 1
                else:
                    newnode.left = node.left + 1
                newnode.right = newnode.left + 1
                newnode.treename = treename
                node.add_child(newnode)
                node = newnode
            else:  # label
                if ttable:
                    node.label = ttable.get(token, token)
                else:
                    node.label = token

        previous = token

    node.isroot = True
    return node