def _newick_to_tree_node(fh, convert_underscores=True):
    """Parse an open filehandle of newick data into a ``TreeNode``.

    Parameters
    ----------
    fh : iterable of str (file-like)
        Open filehandle containing a single newick-formatted tree.
    convert_underscores : bool, optional
        Passed through to the tokenizer; when True (default) underscores in
        unquoted labels are treated as spaces.

    Returns
    -------
    TreeNode
        The root of the parsed tree.

    Raises
    ------
    NewickFormatError
        If a branch length is not numeric, parentheses are unbalanced,
        children are unnested, or the terminating ';' is missing.
    """
    # Stack of (node, depth) pairs. Nodes pushed at the same depth are
    # siblings; they are popped together when their closing ')' arrives.
    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label. Labels appear in the token stream immediately
        # after '(', ',', ')' or at the start, so if the *previous* token was
        # not an operator it must have been a label (or a distance, handled
        # by the next_is_distance flag).
        if last_token not in '(,):':
            if not next_is_distance:
                tree_stack[-1][0].name = last_token if last_token else None
            else:
                next_is_distance = False
        # Check for a distance: the token after ':' is the branch length of
        # the node currently on top of the stack.
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)
        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            # A sibling at the same depth as the node just completed.
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            if len(tree_stack) < 2:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.insert(0, node)
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            # A well-formed tree terminates with ';' once only the root
            # remains on the stack.
            if len(tree_stack) == 1:
                return root
            break
        last_token = token
    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
def _newick_sniffer(fh):
    """Guess whether ``fh`` contains newick-formatted data.

    Strategy: scan at most 100 tokens and reject on the first construct
    that newick forbids:

    * the file is empty
    * there is whitespace inside of a label (raised by the tokenizer)
    * ':' is followed directly by another operator
    * '(' is not immediately preceded by ',' or another '('
    * the parentheses are unbalanced when ';' is found

    If 100 tokens (or fewer, if EOF occurs earlier) pass these checks, the
    file is probably newick -- or at least we can't prove that it isn't.
    Returns ``(bool, {})`` in the standard sniffer shape.
    """
    operators = set(",;:()")
    saw_token = False
    # A virtual leading ',' makes '(' legal as the very first token.
    prev = ','
    depth = 0
    try:
        # 100 tokens ought to be enough for anybody.
        for token, _ in zip(_tokenize_newick(fh), range(100)):
            if token in operators:
                if token == '(':
                    if prev != '(' and prev != ',':
                        raise NewickFormatError()
                    depth += 1
                elif prev == ':':
                    # No operator may directly follow a ':'.
                    raise NewickFormatError()
                elif token == ')':
                    depth -= 1
                elif token == ',':
                    if depth <= 0:
                        raise NewickFormatError()
                elif token == ';':
                    if depth != 0:
                        raise NewickFormatError()
                # ':' itself needs no check beyond prev != ':' (above).
            # Labels (non-operator tokens) are always acceptable here.
            prev = token
            saw_token = True
    except NewickFormatError:
        return False, {}
    return saw_token, {}
def _tokenize_newick(fh, convert_underscores=True):
    """Yield newick tokens (labels and structure characters) from ``fh``.

    Parameters
    ----------
    fh : iterable of str (file-like)
        Open filehandle of newick data, iterated line by line.
    convert_underscores : bool, optional
        When True (default), underscores in unquoted labels are yielded as
        spaces; quoted (escaped) labels are never modified.

    Yields
    ------
    str
        Either a structure token (one of ``( ) , ; :``) or a label.

    Raises
    ------
    NewickFormatError
        If unescaped whitespace occurs inside a label.
    """
    structure_tokens = set('(),;:')
    not_escaped = True
    label_start = False
    last_non_ws_char = ''
    last_char = ''
    comment_depth = 0
    metadata_buffer = []
    # Strategy:
    # We will iterate by character.
    # Comments in newick are defined as:
    #     [This is a comment]
    # Nested comments are allowed.
    #
    # The following characters indicate structure:
    #     ( ) , ; :
    #
    # Whitespace is never allowed in a newick label, so an exception will be
    # thrown.
    #
    # We use ' to indicate a literal string. It has the highest precedence of
    # any operator.
    for line in fh:
        for character in line:
            # We will start by handling the comment case.
            # This code branch will probably never execute in practice.
            # Using a comment_depth we can handle nested comments.
            # Additionally if we are inside an escaped literal string, then
            # we don't want to consider it a comment.
            if character == "[" and not_escaped:
                # Sometimes we might not want to nest a comment, so we will
                # use our escape character. This is not explicitly mentioned
                # in any format specification, but seems like what a
                # reasonable person might do.
                if last_non_ws_char != "'" or comment_depth == 0:
                    # Once again, only advance our depth if [ has not been
                    # escaped inside our comment.
                    comment_depth += 1
            if comment_depth > 0:
                # Same as above, but in reverse
                if character == "]" and last_non_ws_char != "'":
                    comment_depth -= 1
                last_non_ws_char = character
                continue
            # We are not in a comment block if we are below here.
            # If we are inside of an escaped string literal, then ( ) , ; are
            # meaningless to the structure.
            # Otherwise, we are ready to submit our metadata token.
            if not_escaped and character in structure_tokens:
                label_start = False
                metadata = ''.join(metadata_buffer)
                # If the following condition is True, then we must have just
                # closed a literal. We know this because last_non_ws_char is
                # either None or the last non-whitespace character.
                # last_non_ws_char is None when we have just escaped an
                # escape and at the first iteration.
                if last_non_ws_char == "'" or not convert_underscores:
                    # Make no modifications.
                    yield metadata
                elif metadata:
                    # Underscores are considered to be spaces when not in an
                    # escaped literal string.
                    yield metadata.replace('_', ' ')
                # Clear our buffer for the next metadata token and yield our
                # current structure token.
                metadata_buffer = []
                yield character
            # We will now handle escaped string literals.
            # They are inconvenient because any character inside of them is
            # valid, especially whitespace.
            # We also need to allow ' to be escaped by '. e.g. '' -> '
            elif character == "'":
                not_escaped = not not_escaped
                label_start = True
                if last_non_ws_char == "'":
                    # We are escaping our escape, so it should be added to
                    # our metadata_buffer which will represent some future
                    # token.
                    metadata_buffer.append(character)
                    # We do not want a running chain of overcounts, so we
                    # need to clear the last character and continue iteration
                    # from the top. Without this, the following would happen:
                    #     ''' ' -> '' <open literal>
                    # What we want is:
                    #     ''' ' -> '<open literal> <close literal>
                    last_non_ws_char = ''
                    last_char = ''
                    continue
            elif not character.isspace() or not not_escaped:
                # Anything else (a label character, or any character at all
                # while inside a quoted literal) is buffered for the next
                # metadata token.
                if label_start and last_char.isspace() and not_escaped:
                    raise NewickFormatError("Newick files cannot have"
                                            " unescaped whitespace in their"
                                            " labels.")
                metadata_buffer.append(character)
                label_start = True
            # This is equivalent to an `else` however it prevents coverage
            # from mis-identifying the `continue` as uncalled because cpython
            # will optimize it to a jump that is slightly different from the
            # normal jump it would have done anyways.
            elif True:
                # Skip the last statement
                last_char = character
                continue
            last_char = character
            # This line is skipped in the following cases:
            #    * comment_depth > 0, i.e. we are in a comment.
            #    * We have just processed the sequence '' and we don't want
            #      the sequence ''' to result in ''.
            #    * We have encountered whitespace that is not properly
            #      escaped.
            last_non_ws_char = character