Example #1
def _newick_to_tree_node(fh, convert_underscores=True):
    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label
        if last_token not in '(,):':
            if not next_is_distance:
                tree_stack[-1][0].name = last_token if last_token else None
            else:
                next_is_distance = False
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)

        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            if len(tree_stack) < 2:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parentheses are unbalanced.")
            children = []
            # Pop all nodes at this depth; they become children of the node
            # remaining on top of the stack.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.insert(0, node)
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
Example #2
def _newick_sniffer(fh):
    # Strategy:
    #   The following conditions preclude a file from being newick:
    #       * It is an empty file.
    #       * There is whitespace inside of a label (handled by tokenizer)
    #       * : is followed by anything that is an operator
    #       * ( is not immediately preceded by , or another (
    #       * The parens are unbalanced when ; is found.
    #   If 100 tokens (or fewer, if EOF occurs earlier) pass these checks,
    #   then it is probably newick, or at least we can't prove it isn't.
    operators = set(",;:()")
    empty = True
    last_token = ','
    indent = 0
    try:
        # 100 tokens ought to be enough for anybody.
        for token, _ in zip(_tokenize_newick(fh), range(100)):
            if token not in operators:
                pass
            elif token == ',' and last_token != ':' and indent > 0:
                pass
            elif token == ':' and last_token != ':':
                pass
            elif token == ';' and last_token != ':' and indent == 0:
                pass
            elif token == ')' and last_token != ':':
                indent -= 1
            elif token == '(' and (last_token == '(' or last_token == ','):
                indent += 1
            else:
                raise NewickFormatError()

            last_token = token
            empty = False

    except NewickFormatError:
        return False, {}
    return not empty, {}
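
The sniffer returns a `(bool, dict)` pair, matching the `(matched, kwargs)` convention scikit-bio's I/O registry uses for sniffers. A minimal sketch, assuming `_newick_sniffer` and `_tokenize_newick` above are in scope:

import io

# A string that passes every structural check is reported as probable newick.
print(_newick_sniffer(io.StringIO("((a,b)x,c)root;")))          # (True, {})

# Unescaped whitespace in a label makes the tokenizer raise NewickFormatError,
# which the sniffer catches and reports as False.
print(_newick_sniffer(io.StringIO(">seq1 description\nACGT")))  # (False, {})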
Example #3
def _tokenize_newick(fh, convert_underscores=True):
    structure_tokens = set('(),;:')
    not_escaped = True
    label_start = False
    last_non_ws_char = ''
    last_char = ''
    comment_depth = 0
    metadata_buffer = []
    # Strategy:
    # We will iterate by character.
    # Comments in newick are defined as:
    # [This is a comment]
    # Nested comments are allowed.
    #
    # The following characters indicate structure:
    #      ( ) , ; :
    #
    # Whitespace is never allowed in an unquoted newick label, so an exception
    # will be raised if it is found there.
    #
    # We use ' to indicate a literal string. It has the highest precedence of
    # any operator.
    for line in fh:
        for character in line:
            # We will start by handling the comment case.
            # This code branch will probably never execute in practice.
            # Using a comment_depth we can handle nested comments.
            # Additionally if we are inside an escaped literal string, then
            # we don't want to consider it a comment.
            if character == "[" and not_escaped:
                # Sometimes we might not want to nest a comment, so we will use
                # our escape character. This is not explicitly mentioned in
                # any format specification, but seems like what a reasonable
                # person might do.
                if last_non_ws_char != "'" or comment_depth == 0:
                    # Once again, only advance our depth if [ has not been
                    # escaped inside our comment.
                    comment_depth += 1
            if comment_depth > 0:
                # Same as above, but in reverse
                if character == "]" and last_non_ws_char != "'":
                    comment_depth -= 1
                last_non_ws_char = character
                continue
            # We are not in a comment block if we are below here.

            # If we are inside of an escaped string literal, then ( ) , ; are
            # meaningless to the structure.
            # Otherwise, we are ready to submit our metadata token.
            if not_escaped and character in structure_tokens:
                label_start = False
                metadata = ''.join(metadata_buffer)
                # If the following condition is True, then we must have just
                # closed a quoted literal. We know this because
                # last_non_ws_char is either '' or the last non-whitespace
                # character; it is '' when we have just escaped an escape and
                # at the first iteration.
                if last_non_ws_char == "'" or not convert_underscores:
                    # Make no modifications.
                    yield metadata
                elif metadata:
                    # Underscores are considered to be spaces when not in an
                    # escaped literal string.
                    yield metadata.replace('_', ' ')
                # Clear our buffer for the next metadata token and yield our
                # current structure token.
                metadata_buffer = []
                yield character
            # We will now handle escaped string literals.
            # They are inconvenient because any character inside of them is
            # valid, especially whitespace.
            # We also need to allow ' to be escaped by '. e.g. '' -> '
            elif character == "'":
                not_escaped = not not_escaped
                label_start = True
                if last_non_ws_char == "'":
                    # We are escaping our escape, so it should be added to our
                    # metadata_buffer which will represent some future token.
                    metadata_buffer.append(character)
                    # We do not want a running chain of overcounts, so we need
                    # to clear the last character and continue iteration from
                    # the top. Without this, the following would happen:
                    # ''' ' -> '' <open literal>
                    # What we want is:
                    # ''' ' -> '<open literal> <close literal>
                    last_non_ws_char = ''
                    last_char = ''
                    continue

            elif not character.isspace() or not not_escaped:
                if label_start and last_char.isspace() and not_escaped:
                    raise NewickFormatError("Newick files cannot have"
                                            " unescaped whitespace in their"
                                            " labels.")
                metadata_buffer.append(character)
                label_start = True

            # This is equivalent to an `else`; however, it prevents coverage
            # tools from mis-identifying the `continue` as unexecuted, because
            # CPython would otherwise optimize it into a jump that is slightly
            # different from the normal jump it would have taken anyway.
            elif True:
                # Skip the last statement
                last_char = character
                continue

            last_char = character
            # This line is skipped in the following cases:
            #    * comment_depth > 0, i.e. we are in a comment.
            #    * We have just processed the sequence '' and we don't want
            #      the sequence ''' to result in ''.
            #    * We have encountered whitespace that is not properly escaped.
            last_non_ws_char = character
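
To make the tokenizer's behavior concrete, here is a small sketch, again assuming `_tokenize_newick` above is in scope: quoted labels keep their whitespace and underscores verbatim, unquoted underscores become spaces, and bracketed comments are dropped entirely.

import io

# Quoted label 'a b_c' is yielded unchanged, unquoted d_e becomes 'd e',
# and the [a comment] block never reaches the output.
fh = io.StringIO("('a b_c':1,d_e)f[a comment];")
print(list(_tokenize_newick(fh)))
# ['(', 'a b_c', ':', '1', ',', 'd e', ')', 'f', ';']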