Example #1
from itertools import tee, izip  # Python 2; on Python 3 use zip instead of izip

def merge_whitespace(tokens):
    """Merges consecutive WHITESPACE tokens.

    This generator function merges consecutive WHITESPACE tokens which can
    result from various mechanisms (especially the VALIGN conversion). It also
    ditches WHITESPACE tokens with no source. Note: this function relies on
    positional elements being present in the tokens, and assumes the stream
    contains either no tokens or at least two.
    """
    empty = True
    a, b = tee(tokens)
    # Advance the second copy by one element, noting in passing whether the
    # stream is empty
    for elem in b:
        empty = False
        break
    space = ''
    line = col = 1
    # Iterate pairwise over the tokens
    for last, token in izip(a, b):
        if last.type == TT.WHITESPACE:
            if token.type == last.type:
                space += token.source
            elif space:
                yield Token(TT.WHITESPACE, None, space, line, col)
        else:
            if token.type == TT.WHITESPACE:
                space, line, col = token[2:]
            yield last
    if not empty:
        if token.type == TT.WHITESPACE:
            yield Token(TT.WHITESPACE, None, space, line, col)
        else:
            yield token
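
For instance, a run of two WHITESPACE tokens collapses into one. A minimal
sketch (Python 2, matching the izip usage above), with hypothetical Token/TT
stand-ins for the module's real definitions, which merge_whitespace is assumed
to see:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    KEYWORD, WHITESPACE = range(2)  # stand-in constants

stream = [
    Token(TT.KEYWORD, 'GROUP', 'GROUP', 1, 1),
    Token(TT.WHITESPACE, None, ' ', 1, 6),
    Token(TT.WHITESPACE, None, '  ', 1, 7),
    Token(TT.KEYWORD, 'BY', 'BY', 1, 9),
]
# The two WHITESPACE tokens collapse into a single three-space token
assert [t.source for t in merge_whitespace(stream)] == ['GROUP', '   ', 'BY']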
Example #2
    def _token(self, index=None):
        """Returns the token at the specified index, or an EOF token."""
        try:
            return self._tokens[self._index if index is None else index]
        except IndexError:
            # If the requested index is beyond the end of the token stream,
            # return a "fake" EOF token to represent this
            if self._tokens:
                return Token(TT.EOF, None, '', *self._tokens[-1][3:])
            else:
                return Token(TT.EOF, None, '', 0, 0)
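
The construction Token(TT.EOF, None, '', *self._tokens[-1][3:]) copies the
last real token's line and column into the fake EOF token, so position
reporting still points somewhere sensible. A sketch with hypothetical
Token/TT stand-ins:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    NUMBER, EOF = range(2)  # stand-in constants

last = Token(TT.NUMBER, 42, '42', 3, 10)
eof = Token(TT.EOF, None, '', *last[3:])  # same construction as above
assert (eof.line, eof.column) == (3, 10)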
Example #3
def convert_indent(tokens, indent='\t'):
    """Converts INDENT tokens into WHITESPACE.

    This generator function converts INDENT tokens into WHITESPACE tokens
    containing the characters specified by the indent parameter. Note: this
    function zeros the positional elements.
    """
    for token in tokens:
        if token.type == TT.INDENT:
            yield Token(TT.WHITESPACE, None, '\n' + indent * token.value, 0, 0)
        else:
            yield Token(token.type, token.value, token.source, 0, 0)
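
For instance (a sketch with hypothetical Token/TT stand-ins that
convert_indent is assumed to see), an INDENT of level 2 with a four-space
indent string becomes a newline followed by eight spaces:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    INDENT, WHITESPACE = range(2)  # stand-in constants

out = list(convert_indent([Token(TT.INDENT, 2, '', 0, 0)], indent='    '))
assert out == [Token(TT.WHITESPACE, None, '\n' + ' ' * 8, 0, 0)]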
Example #4
def convert_valign(tokens):
    """Converts VALIGN and VAPPLY tokens into WHITESPACE.

    This generator function converts VALIGN and VAPPLY tokens into WHITESPACE
    tokens.  Multiple passes are used to convert the VALIGN tokens; each pass
    converts the first VALIGN token found on a set of lines prior to a VAPPLY
    token into a WHITESPACE token. The final result will require recalculation
    of positions if any tokens have been replaced.
    """
    indexes = []
    aligncol = alignline = 0
    more = True
    while more:
        result = []
        more = False
        for i, token in enumerate(recalc_positions(tokens)):
            line, col = token.line, token.column
            result.append(token)
            if token.type == TT.VALIGN:
                if indexes and alignline == line:
                    # If we encounter more than one VALIGN on a line, remember
                    # that we need another pass
                    more = True
                else:
                    # Remember the position of the VALIGN token in the result,
                    # adjust the alignment column if necessary, and remember
                    # the line number so we can ignore any further VALIGN
                    # tokens on this line
                    indexes.append(i)
                    aligncol = max(aligncol, col)
                    alignline = line
            elif token.type == TT.VAPPLY:
                # Convert all the remembered VALIGN tokens into WHITESPACE
                # tokens with appropriate lengths for vertical alignment
                for j in indexes:
                    line, col = result[j].line, result[j].column
                    result[j] = Token(TT.WHITESPACE, None,
                                      ' ' * (aligncol - col), 0, 0)
                # Convert the VAPPLY token into a zero-length WHITESPACE token.
                # We cannot simply remove it as that would invalidate the
                # indexes being generated for the input sequence by the
                # enumerate() call in the loop
                if indexes:
                    result[-1] = Token(TT.WHITESPACE, None, '', 0, 0)
                    indexes = []
                    aligncol = alignline = 0
        # If indexes isn't empty then we encountered VALIGNs without a
        # corresponding VAPPLY (parser bug)
        assert not indexes
        tokens = result
    return result
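
To illustrate the intended effect, here is a sketch (hypothetical Token/TT
stand-ins, with recalc_positions from Example #7 assumed in scope alongside
convert_valign) that aligns two := operators across a pair of lines:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    (IDENTIFIER, WHITESPACE, OPERATOR,
     VALIGN, VAPPLY) = range(5)  # stand-in constants

def tok(type_, source):
    return Token(type_, None, source, 0, 0)

stream = [
    tok(TT.IDENTIFIER, 'x'), tok(TT.WHITESPACE, ' '),
    tok(TT.VALIGN, ''), tok(TT.OPERATOR, ':= 1\n'),
    tok(TT.IDENTIFIER, 'foo'), tok(TT.WHITESPACE, ' '),
    tok(TT.VALIGN, ''), tok(TT.OPERATOR, ':= 2\n'),
    tok(TT.VAPPLY, ''),
]
# Each VALIGN is padded out to the rightmost VALIGN column; the VAPPLY
# becomes a zero-length WHITESPACE token
assert ''.join(t.source for t in convert_valign(stream)) == (
    'x   := 1\nfoo := 2\n')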
Example #5
    def _newline(self, index=0, allowempty=False):
        """Adds an INDENT token to the output.

        The _newline() method is called to start a new line in the output. It
        does this by appending (or inserting, depending on the index parameter)
        an INDENT token to the output list. Later, during _parse_finish, INDENT
        tokens are converted into WHITESPACE tokens at the specified
        indentation level.

        See _insert_output for an explanation of allowempty.
        """
        token = Token(TT.INDENT, self._level, '', 0, 0)
        self._insert_output(token, index, allowempty)
Example #6
def split_lines(tokens):
    """Splits tokens which contain line breaks.

    This generator function splits up any tokens that contain line breaks so
    that every line has a token beginning at column 1. Note: this function
    relies on positional elements being present in the tokens.
    """
    for token in tokens:
        (type, value, source, line, column) = token
        while '\n' in source:
            if isinstance(value, basestring) and '\n' in value:
                i = value.index('\n') + 1
                new_value, value = value[:i], value[i:]
            else:
                new_value = value
            i = source.index('\n') + 1
            new_source, source = source[:i], source[i:]
            yield Token(type, new_value, new_source, line, column)
            line += 1
            column = 1
        if source or type not in (TT.WHITESPACE, TT.COMMENT):
            yield Token(type, value, source, line, column)
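
A sketch of the splitting (Python 2, matching the basestring check above;
Token/TT are hypothetical stand-ins): a two-line comment token becomes one
token per line, each starting at column 1:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    WHITESPACE, COMMENT = range(2)  # stand-in constants

c = Token(TT.COMMENT, '-- a\n-- b\n', '-- a\n-- b\n', 1, 1)
assert [(t.source, t.line, t.column) for t in split_lines([c])] == [
    ('-- a\n', 1, 1),
    ('-- b\n', 2, 1),
]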
Example #7
def recalc_positions(tokens):
    """Recalculates token positions.

    This generator function recalculates the position of each token. It is
    intended for wrapping other functions which alter the source of tokens.
    """
    line = 1
    column = 1
    for token in tokens:
        yield Token(token.type, token.value, token.source, line, column)
        for char in token.source:
            if char == '\n':
                line += 1
                column = 1
            else:
                column += 1
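
For example (hypothetical Token/TT stand-ins again), a token whose source
ends in a newline pushes the following token to the start of the next line:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    KEYWORD, NUMBER = range(2)  # stand-in constants

toks = [Token(TT.KEYWORD, 'VALUES', 'VALUES\n', 0, 0),
        Token(TT.NUMBER, 1, '1', 0, 0)]
fixed = list(recalc_positions(toks))
assert (fixed[0].line, fixed[0].column) == (1, 1)
assert (fixed[1].line, fixed[1].column) == (2, 1)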
Example #8
def strip_whitespace(tokens):
    """Strips trailing WHITESPACE tokens from all lines of output.

    This generator function strips trailing WHITESPACE tokens at the end of a
    line from the provided sequence of tokens. The function assumes that
    WHITESPACE tokens have been merged (two will not appear consecutively).
    Positions present in the tokens are preserved.
    """
    last = None
    for token in tokens:
        if token.type == TT.WHITESPACE:
            if '\n' in token.source:
                last = Token(TT.WHITESPACE, None,
                             '\n' + token.source.split('\n', 1)[1], token.line,
                             token.column)
            else:
                last = token
        else:
            if last:
                yield last
                last = None
            yield token
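
A sketch of the stripping (hypothetical Token/TT stand-ins): the spaces
before the newline disappear, while the newline and the indentation that
follows it survive:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    WHITESPACE, NUMBER = range(2)  # stand-in constants

stream = [
    Token(TT.NUMBER, 1, '1', 1, 1),
    Token(TT.WHITESPACE, None, '   \n    ', 1, 2),
    Token(TT.NUMBER, 2, '2', 2, 5),
]
assert [t.source for t in strip_whitespace(stream)] == ['1', '\n    ', '2']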
Example #9
    def _match(self, template, prespace=None, postspace=None):
        """Attempt to match the current token against a template token.

        Matches the provided template token against the current token in the
        stream. If the match is successful the current position is moved
        forward to the next non-junk token, and the (potentially transformed)
        matched token is returned. Otherwise, None is returned and the current
        position is not moved.

        The prespace and postspace parameters affect the insertion of
        WHITESPACE tokens into the output when WHITESPACE is present in the
        reformat set property, and a match is successful. If prespace is True,
        a WHITESPACE token containing a single space is added to the output
        prior to appending the matching token. However, if prespace is False,
        no WHITESPACE token will be added, only the matching token.  If
        postspace is False, it will override the prespace setting of the next
        match (useful for suppressing space next to right-associative operators
        like unary plus/minus).

        Note that a False value in either prespace or postspace always
        overrides a True value: if a match sets postspace to False, the
        prespace value of the subsequent match is irrelevant and no space
        will be added; likewise, a False prespace in one match overrides a
        True postspace in the match before it.

        By default prespace and postspace are None. In this case, the
        _prespace_default() and _postspace_default() methods will be called to
        determine the default based on the match template. These methods should
        be overridden by descendents to deal with additional syntax introduced
        by the dialect they represent. The default implementations in this
        class suppress prespace in the case of dot, comma and close-parenthesis
        operators and postspace in the case of dot and open-parenthesis.
        """
        # Compare the current token against the template. Note that the
        # template may transform the token in order to match (see _cmp_tokens)
        token = self._cmp_tokens(self._token(), template)
        if not token:
            return None
        # If a match was found, add a leading space (if WHITESPACE is being
        # reformatted, and prespace permits it)
        if TT.WHITESPACE in self.reformat:
            if prespace is None:
                prespace = self._prespace_default(template)
            if prespace and self._output and self._output[-1].type not in (
                    TT.INDENT, TT.WHITESPACE):
                self._output.append(Token(TT.WHITESPACE, None, ' ', 0, 0))
        self._output.append(token)
        self._index += 1
        while self._token().type in (TT.COMMENT, TT.WHITESPACE):
            if (self._token().type == TT.COMMENT
                    or TT.WHITESPACE not in self.reformat):
                self._output.append(self._token())
            self._index += 1
        # If postspace is False, prevent the next _match call from adding a
        # leading space by adding an empty WHITESPACE token. The final phase of
        # the parser removes empty tokens.
        if TT.WHITESPACE in self.reformat:
            if postspace is None:
                postspace = self._postspace_default(template)
            if not postspace:
                self._output.append(Token(TT.WHITESPACE, None, '', 0, 0))
        return token
Example #10
    def _cmp_tokens(self, token, template):
        """Compares a token against a partial template.

        If the template is just a string, it will match a KEYWORD, OPERATOR, or
        IDENTIFIER token with the same value (the second element of a token).
        If the template is an integer (like the KEYWORD or IDENTIFIER
        constants) it will match a token with the same type, with the following
        exceptions:

        * IDENTIFIER will also match KEYWORD tokens (to allow keywords to be
          used as identifiers)
        * DATATYPE and REGISTER will match KEYWORD or IDENTIFIER (DATATYPE and
          REGISTER tokens should never appear in the input and this allows
          keywords like CHARACTER or identifiers like DECIMAL to be treated as
          datatypes, and things like CURRENT DATE to be treated as special
          registers)
        * STATEMENT will match TERMINATOR (STATEMENT tokens are terminators
          but specific to a top-level SQL statement or CLP command), or EOF
          (the script is assumed to end with an implicit terminator)

        If the template is a tuple, it will match a token whose elements
        equal the template's values, up to the number of elements in the
        template (a prefix match).

        The method returns the matched token (transformed if any
        transformations were necessary to make the match, e.g. KEYWORD to
        IDENTIFIER).
        """
        # List of token type transformations that are permitted to occur in
        # order to obtain a successful match (e.g. if we're expecting a
        # DATATYPE but find an IDENTIFIER, the comparison method may mutate the
        # IDENTIFIER token into a DATATYPE token and return it, indicating a
        # successful match)
        transforms = {
            TT.KEYWORD: (TT.IDENTIFIER, TT.DATATYPE, TT.REGISTER, TT.SCHEMA,
                         TT.RELATION, TT.ROUTINE),
            TT.IDENTIFIER: (TT.DATATYPE, TT.REGISTER, TT.SCHEMA, TT.RELATION,
                            TT.ROUTINE),
            TT.STRING: (TT.PASSWORD,),
            TT.TERMINATOR: (TT.STATEMENT,),
            TT.EOF: (TT.STATEMENT,),
        }
        if isinstance(template, basestring):
            if (token.type in (TT.KEYWORD, TT.OPERATOR)
                    and token.value == template):
                return token
            elif (token.type == TT.IDENTIFIER and token.value == template
                    and token.source[0] != '"'):
                # Only unquoted identifiers are matched (quoted identifiers
                # aren't used in any part of the SQL dialect)
                return token
        elif isinstance(template, int):
            if token.type == template:
                return token
            elif template in transforms.get(token.type, ()):
                return Token(template, *token[1:])
            else:
                return None
        elif isinstance(template, tuple):
            if token[:len(template)] == template:
                return token
            elif (token.value == template[1]
                    and template[0] in transforms.get(token.type, ())):
                return Token(template[0], *token[1:])
            else:
                return None
        else:
            assert False, "Invalid template token (%s) %s" % (
                type(template), template)
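
A sketch of the three template forms (hypothetical Token/TT stand-ins; since
_cmp_tokens touches no instance state, it is assumed callable as a plain
function with a dummy self):

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    (KEYWORD, OPERATOR, IDENTIFIER, DATATYPE, REGISTER, SCHEMA, RELATION,
     ROUTINE, STRING, PASSWORD, TERMINATOR, STATEMENT, EOF) = range(13)

kw = Token(TT.KEYWORD, 'CHARACTER', 'CHARACTER', 1, 1)
# String template: matched by value, no transformation
assert _cmp_tokens(None, kw, 'CHARACTER') is kw
# Integer template: KEYWORD is permitted to transform into DATATYPE
assert _cmp_tokens(None, kw, TT.DATATYPE).type == TT.DATATYPE
# Tuple template: prefix match on the token's elements
assert _cmp_tokens(None, kw, (TT.KEYWORD, 'CHARACTER')) is kw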
Example #11
    def _vapply(self, index=0):
        """Inserts a VAPPLY token into the output."""
        token = Token(TT.VAPPLY, None, '', 0, 0)
        self._insert_output(token, index, True)
Example #12
    def _valign(self, index=0):
        """Inserts a VALIGN token into the output."""
        token = Token(TT.VALIGN, None, '', 0, 0)
        self._insert_output(token, index, True)
Example #13
from decimal import Decimal  # used below; assumed imported at module level

def format_tokens(tokens, reformat=[], terminator=';', statement=';',
                  namechars=set(sql92_namechars)):
    """Changes token source to a canonical format.

    This generator function handles reformatting tokens into a canonical
    representation (e.g. unquoted identifiers folded to uppercase). The
    optional terminator parameter specifies the terminator for statements
    within a block statement, while the optional statement parameter specifies
    the top-level statement terminator. The reformat parameter specifies which
    types of token will be affected by the function. Note: this function zeros
    the positional elements.
    """
    for token in tokens:
        if token.type in reformat:
            if token.type in (TT.KEYWORD, TT.REGISTER):
                yield Token(token.type, token.value, token.value, 0, 0)
            elif token.type in (TT.IDENTIFIER, TT.DATATYPE, TT.SCHEMA,
                                TT.RELATION, TT.ROUTINE):
                yield Token(token.type, token.value,
                            format_ident(token.value, namechars=namechars),
                            0, 0)
            elif token.type == TT.NUMBER:
                # Ensure decimal values with no decimal portion keep the
                # decimal point (fix for #49)
                if (isinstance(token.value, Decimal)
                        and token.value.as_tuple()[-1] == 0):
                    yield Token(TT.NUMBER, token.value,
                                str(token.value) + '.', 0, 0)
                else:
                    yield Token(TT.NUMBER, token.value, str(token.value), 0, 0)
            elif token.type in (TT.STRING, TT.PASSWORD):
                yield Token(token.type, token.value, quote_str(token.value),
                            0, 0)
            elif token.type == TT.LABEL:
                yield Token(TT.LABEL, token.value,
                            format_ident(token.value, namechars=namechars) + ':',
                            0, 0)
            elif token.type == TT.PARAMETER:
                yield Token(TT.PARAMETER, token.value,
                            format_param(token.value, namechars=namechars),
                            0, 0)
            elif token.type == TT.COMMENT:
                # XXX Need more intelligent comment handling
                ##yield (TT.COMMENT, token[1], '/*%s*/' % (token[1]))
                yield Token(token.type, token.value, token.source, 0, 0)
            elif token.type == TT.STATEMENT:
                yield Token(TT.STATEMENT, token.value, statement, 0, 0)
            elif token.type == TT.TERMINATOR:
                yield Token(TT.TERMINATOR, token.value, terminator, 0, 0)
            else:
                yield Token(token.type, token.value, token.source, 0, 0)
        else:
            yield Token(token.type, token.value, token.source, 0, 0)
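
For instance (hypothetical Token/TT stand-ins; sql92_namechars and the
format_* helpers are assumed available from the surrounding module), a
lower-case keyword is folded to its canonical value and its position zeroed:

from collections import namedtuple

Token = namedtuple('Token', 'type value source line column')

class TT:
    KEYWORD, REGISTER = range(2)  # stand-in constants

toks = [Token(TT.KEYWORD, 'SELECT', 'select', 3, 7)]
out = list(format_tokens(toks, reformat=set([TT.KEYWORD])))
assert out == [Token(TT.KEYWORD, 'SELECT', 'SELECT', 0, 0)]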
Example #14
def excerpt(tokens):
    """Returns the combined source of up to the first ten tokens, eliding the rest."""
    if len(tokens) > 10:
        excerpt = tokens[:10] + [Token(0, None, '...', 0, 0)]
    else:
        excerpt = tokens
    return ''.join(token.source for token in excerpt)