def merge_whitespace(tokens):
    """Merges consecutive WHITESPACE tokens.

    This generator function merges consecutive WHITESPACE tokens which
    can result from various mechanisms (especially the VALIGN
    conversion). It also ditches WHITESPACE tokens with no source.
    Note: this function relies on positional elements being present in
    the tokens.
    """
    empty = True
    a, b = tee(tokens)
    # Advance the second copy by one element (this also detects whether
    # the stream is empty)
    for elem in b:
        empty = False
        break
    space = ''
    line = col = 1
    # Iterate pairwise over the tokens
    for last, token in izip(a, b):
        if last.type == TT.WHITESPACE:
            if token.type == last.type:
                space += token.source
            elif space:
                yield Token(TT.WHITESPACE, None, space, line, col)
        else:
            if token.type == TT.WHITESPACE:
                space, line, col = token[2:]
            yield last
    if not empty:
        if token.type == TT.WHITESPACE:
            yield Token(TT.WHITESPACE, None, space, line, col)
        else:
            yield token
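# A minimal usage sketch (illustrative only; assumes Token is the
# (type, value, source, line, column) namedtuple used throughout this
# module and TT holds the token-type constants). Two adjacent
# single-space tokens come out as one token whose source is both
# spaces combined:
#
#   >>> toks = [Token(TT.IDENTIFIER, 'A', 'A', 1, 1),
#   ...         Token(TT.WHITESPACE, None, ' ', 1, 2),
#   ...         Token(TT.WHITESPACE, None, ' ', 1, 3)]
#   >>> [t.source for t in merge_whitespace(toks)]
#   ['A', '  ']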
def _token(self, index=None):
    """Returns the token at the specified index, or an EOF token."""
    try:
        return self._tokens[self._index if index is None else index]
    except IndexError:
        # If the current index is beyond the end of the token stream,
        # return a "fake" EOF token to represent this
        if self._tokens:
            return Token(TT.EOF, None, '', *self._tokens[-1][3:])
        else:
            return Token(TT.EOF, None, '', 0, 0)
def convert_indent(tokens, indent='\t'):
    """Converts INDENT tokens into WHITESPACE.

    This generator function converts INDENT tokens into WHITESPACE
    tokens containing the characters specified by the indent parameter.
    Note: this function zeros the positional elements.
    """
    for token in tokens:
        if token.type == TT.INDENT:
            yield Token(TT.WHITESPACE, None, '\n' + indent * token.value, 0, 0)
        else:
            yield Token(token.type, token.value, token.source, 0, 0)
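# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants): an INDENT token's value is the indentation level, which
# becomes a newline plus that many copies of the indent string:
#
#   >>> tok = Token(TT.INDENT, 2, '', 0, 0)
#   >>> list(convert_indent([tok]))[0].source
#   '\n\t\t'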
def convert_valign(tokens):
    """Converts VALIGN and VAPPLY tokens into WHITESPACE.

    This generator function converts VALIGN and VAPPLY tokens into
    WHITESPACE tokens. Multiple passes are used to convert the VALIGN
    tokens; each pass converts the first VALIGN token found on a set of
    lines prior to a VAPPLY token into a WHITESPACE token. The final
    result will require recalculation of positions if any tokens have
    been replaced.
    """
    indexes = []
    aligncol = alignline = 0
    more = True
    while more:
        result = []
        more = False
        for i, token in enumerate(recalc_positions(tokens)):
            line, col = token.line, token.column
            result.append(token)
            if token.type == TT.VALIGN:
                if indexes and alignline == line:
                    # If we encounter more than one VALIGN on a line,
                    # remember that we need another pass
                    more = True
                else:
                    # Remember the position of the VALIGN token in the
                    # result, adjust the alignment column if necessary,
                    # and remember the line number so we can ignore any
                    # further VALIGN tokens on this line
                    indexes.append(i)
                    aligncol = max(aligncol, col)
                    alignline = line
            elif token.type == TT.VAPPLY:
                # Convert all the remembered VALIGN tokens into
                # WHITESPACE tokens with appropriate lengths for
                # vertical alignment
                for j in indexes:
                    line, col = result[j].line, result[j].column
                    result[j] = Token(TT.WHITESPACE, None,
                        ' ' * (aligncol - col), 0, 0)
                # Convert the VAPPLY token into a zero-length WHITESPACE
                # token. We cannot simply remove it as that would
                # invalidate the indexes being generated for the input
                # sequence by the enumerate() call in the loop
                if indexes:
                    result[-1] = Token(TT.WHITESPACE, None, '', 0, 0)
                    indexes = []
                    aligncol = alignline = 0
        # If indexes isn't blank, then we encountered VALIGNs without a
        # corresponding VAPPLY (parser bug)
        assert not indexes
        tokens = result
    return result
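# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants): each of two lines carries a VALIGN marker before a
# number; the trailing VAPPLY pads the shorter line so that both
# numbers land in the same column (here, column 3):
#
#   >>> toks = [Token(TT.IDENTIFIER, 'A', 'A', 0, 0),
#   ...         Token(TT.VALIGN, None, '', 0, 0),
#   ...         Token(TT.NUMBER, 1, '1', 0, 0),
#   ...         Token(TT.WHITESPACE, None, '\n', 0, 0),
#   ...         Token(TT.IDENTIFIER, 'BB', 'BB', 0, 0),
#   ...         Token(TT.VALIGN, None, '', 0, 0),
#   ...         Token(TT.NUMBER, 2, '2', 0, 0),
#   ...         Token(TT.VAPPLY, None, '', 0, 0)]
#   >>> ''.join(t.source for t in convert_valign(toks))
#   'A 1\nBB2'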
def _newline(self, index=0, allowempty=False):
    """Adds an INDENT token to the output.

    The _newline() method is called to start a new line in the output.
    It does this by appending (or inserting, depending on the index
    parameter) an INDENT token to the output list. Later, during
    _parse_finish, INDENT tokens are converted into WHITESPACE tokens
    at the specified indentation level. See _insert_output for an
    explanation of allowempty.
    """
    token = Token(TT.INDENT, self._level, '', 0, 0)
    self._insert_output(token, index, allowempty)
def split_lines(tokens):
    """Splits tokens which contain line breaks.

    This generator function splits up any tokens that contain line
    breaks so that every line has a token beginning at column 1. Note:
    this function relies on positional elements being present in the
    tokens.
    """
    for token in tokens:
        (type, value, source, line, column) = token
        while '\n' in source:
            if isinstance(value, basestring) and '\n' in value:
                i = value.index('\n') + 1
                new_value, value = value[:i], value[i:]
            else:
                new_value = value
            i = source.index('\n') + 1
            new_source, source = source[:i], source[i:]
            yield Token(type, new_value, new_source, line, column)
            line += 1
            column = 1
        if source or type not in (TT.WHITESPACE, TT.COMMENT):
            yield Token(type, value, source, line, column)
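# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants): a WHITESPACE token spanning a line break is split so
# the remainder starts a fresh line at column 1:
#
#   >>> tok = Token(TT.WHITESPACE, None, '\n  ', 1, 5)
#   >>> [(t.source, t.line, t.column) for t in split_lines([tok])]
#   [('\n', 1, 5), ('  ', 2, 1)]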
def recalc_positions(tokens):
    """Recalculates token positions.

    This generator function recalculates the position of each token. It
    is intended for wrapping other functions which alter the source of
    tokens.
    """
    line = 1
    column = 1
    for token in tokens:
        yield Token(token.type, token.value, token.source, line, column)
        for char in token.source:
            if char == '\n':
                line += 1
                column = 1
            else:
                column += 1
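# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants): positions are rebuilt purely from each token's source,
# so stale (0, 0) positions come out correct:
#
#   >>> toks = [Token(TT.KEYWORD, 'GO', 'GO', 0, 0),
#   ...         Token(TT.WHITESPACE, None, '\n', 0, 0),
#   ...         Token(TT.NUMBER, 1, '1', 0, 0)]
#   >>> [(t.line, t.column) for t in recalc_positions(toks)]
#   [(1, 1), (1, 3), (2, 1)]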
def strip_whitespace(tokens):
    """Strips trailing WHITESPACE tokens from all lines of output.

    This generator function strips trailing WHITESPACE tokens at the
    end of a line from the provided sequence of tokens. The function
    assumes that WHITESPACE tokens have been merged (two will not
    appear consecutively). Positions present in the tokens are
    preserved.
    """
    last = None
    for token in tokens:
        if token.type == TT.WHITESPACE:
            if '\n' in token.source:
                last = Token(TT.WHITESPACE, None,
                    '\n' + token.source.split('\n', 1)[1],
                    token.line, token.column)
            else:
                last = token
        else:
            if last:
                yield last
                last = None
            yield token
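# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants): the spaces before the line break are dropped, while the
# indentation after it is kept:
#
#   >>> toks = [Token(TT.IDENTIFIER, 'A', 'A', 1, 1),
#   ...         Token(TT.WHITESPACE, None, '   \n  ', 1, 2),
#   ...         Token(TT.IDENTIFIER, 'B', 'B', 2, 3)]
#   >>> [t.source for t in strip_whitespace(toks)]
#   ['A', '\n  ', 'B']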
def _match(self, template, prespace=None, postspace=None):
    """Attempt to match the current token against a template token.

    Matches the provided template token against the current token in
    the stream. If the match is successful the current position is
    moved forward to the next non-junk token, and the (potentially
    transformed) matched token is returned. Otherwise, None is returned
    and the current position is not moved.

    The prespace and postspace parameters affect the insertion of
    WHITESPACE tokens into the output when WHITESPACE is present in the
    reformat set property, and a match is successful. If prespace is
    True, a WHITESPACE token containing a single space is added to the
    output prior to appending the matching token. However, if prespace
    is False, no WHITESPACE token will be added, only the matching
    token. If postspace is False, it will override the prespace setting
    of the next match (useful for suppressing space next to
    right-associative operators like unary plus/minus).

    Note that a False value in either prespace or postspace always
    overrides a True value, i.e. if a match sets postspace to False,
    the value of prespace in the subsequent match is irrelevant; no
    space will be added. Likewise if a match sets postspace to True, a
    False prespace value in a subsequent match will override this and
    prevent space from being added.

    By default prespace and postspace are None. In this case, the
    _prespace_default() and _postspace_default() methods will be called
    to determine the default based on the match template. These methods
    should be overridden by descendants to deal with additional syntax
    introduced by the dialect they represent. The default
    implementations in this class suppress prespace in the case of dot,
    comma and close-parenthesis operators, and postspace in the case of
    dot and open-parenthesis.
    """
    # Compare the current token against the template. Note that the
    # template may transform the token in order to match (see
    # _cmp_tokens)
    token = self._cmp_tokens(self._token(), template)
    if not token:
        return None
    # If a match was found, add a leading space (if WHITESPACE is being
    # reformatted, and prespace permits it)
    if TT.WHITESPACE in self.reformat:
        if prespace is None:
            prespace = self._prespace_default(template)
        if prespace and self._output and self._output[-1].type not in (
                TT.INDENT, TT.WHITESPACE):
            self._output.append(Token(TT.WHITESPACE, None, ' ', 0, 0))
    self._output.append(token)
    self._index += 1
    # Skip (or copy through) any junk tokens following the match
    while self._token().type in (TT.COMMENT, TT.WHITESPACE):
        if (self._token().type == TT.COMMENT
                or TT.WHITESPACE not in self.reformat):
            self._output.append(self._token())
        self._index += 1
    # If postspace is False, prevent the next _match call from adding a
    # leading space by adding an empty WHITESPACE token. The final
    # phase of the parser removes empty tokens.
    if TT.WHITESPACE in self.reformat:
        if postspace is None:
            postspace = self._postspace_default(template)
        if not postspace:
            self._output.append(Token(TT.WHITESPACE, None, '', 0, 0))
    return token
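# Illustrative sketch of the prespace/postspace interplay (hypothetical
# parser state; _match is only meaningful on an instance mid-parse with
# TT.WHITESPACE in self.reformat):
#
#   self._match('(', postspace=False)  # emits '(' followed by an empty
#                                      # WHITESPACE token, which makes the
#                                      # next prespace check see WHITESPACE
#   self._match('-')                   # hence no space before the unary
#                                      # minus, whatever its prespace default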
def _cmp_tokens(self, token, template):
    """Compares a token against a partial template.

    If the template is just a string, it will match a KEYWORD,
    OPERATOR, or IDENTIFIER token with the same value (the second
    element of a token). If the template is an integer (like the
    KEYWORD or IDENTIFIER constants) it will match a token with the
    same type, with the following exceptions:

    * IDENTIFIER will also match KEYWORD tokens (to allow keywords to
      be used as identifiers)
    * DATATYPE and REGISTER will match KEYWORD or IDENTIFIER (DATATYPE
      and REGISTER tokens should never appear in the input and this
      allows keywords like CHARACTER or identifiers like DECIMAL to be
      treated as datatypes, and things like CURRENT DATE to be treated
      as special registers)
    * STATEMENT will match TERMINATOR (STATEMENT tokens are terminators
      but specific to a top-level SQL statement or CLP command), or EOF
      (the script is assumed to end with an implicit terminator)

    If the template is a tuple it will match a token with the same
    element values up to the number of elements in the template.

    The method returns the matched token (transformed if any
    transformations were necessary to make the match, e.g. KEYWORD to
    IDENTIFIER).
    """
    # Mapping of token type transformations that are permitted to occur
    # in order to obtain a successful match (e.g. if we're expecting a
    # DATATYPE but find an IDENTIFIER, the comparison method may mutate
    # the IDENTIFIER token into a DATATYPE token and return it,
    # indicating a successful match)
    transforms = {
        TT.KEYWORD: (TT.IDENTIFIER, TT.DATATYPE, TT.REGISTER,
            TT.SCHEMA, TT.RELATION, TT.ROUTINE),
        TT.IDENTIFIER: (TT.DATATYPE, TT.REGISTER, TT.SCHEMA,
            TT.RELATION, TT.ROUTINE),
        TT.STRING: (TT.PASSWORD,),
        TT.TERMINATOR: (TT.STATEMENT,),
        TT.EOF: (TT.STATEMENT,),
    }
    if isinstance(template, basestring):
        if token.type in (TT.KEYWORD, TT.OPERATOR) and token.value == template:
            return token
        elif (token.type == TT.IDENTIFIER and token.value == template
                and token.source[0] != '"'):
            # Only unquoted identifiers are matched (quoted identifiers
            # aren't used in any part of the SQL dialect)
            return token
    elif isinstance(template, int):
        if token.type == template:
            return token
        elif template in transforms.get(token.type, ()):
            return Token(template, *token[1:])
        else:
            return None
    elif isinstance(template, tuple):
        if token[:len(template)] == template:
            return token
        elif (token.value == template[1]
                and template[0] in transforms.get(token.type, ())):
            return Token(template[0], *token[1:])
        else:
            return None
    else:
        assert False, "Invalid template token (%s) %s" % (
            str(type(template)), str(template))
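# Illustrative sketch (hypothetical `parser` instance; assumes the
# module's Token namedtuple and TT constants): a KEYWORD token can
# satisfy a DATATYPE template, in which case the returned token is
# transformed to the template's type:
#
#   >>> tok = Token(TT.KEYWORD, 'CHARACTER', 'CHARACTER', 1, 1)
#   >>> parser._cmp_tokens(tok, TT.DATATYPE).type == TT.DATATYPE
#   True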
def _vapply(self, index=0):
    """Inserts a VAPPLY token into the output."""
    token = Token(TT.VAPPLY, None, '', 0, 0)
    self._insert_output(token, index, True)
def _valign(self, index=0):
    """Inserts a VALIGN token into the output."""
    token = Token(TT.VALIGN, None, '', 0, 0)
    self._insert_output(token, index, True)
def format_tokens(tokens, reformat=[], terminator=';', statement=';',
        namechars=set(sql92_namechars)):
    """Changes token source to a canonical format.

    This generator function handles reformatting tokens into a
    canonical representation (e.g. unquoted identifiers folded to
    uppercase). The optional terminator parameter specifies the
    terminator for statements within a block statement, while the
    optional statement parameter specifies the top-level statement
    terminator. The reformat parameter specifies which types of token
    will be affected by the function. Note: this function zeros the
    positional elements.
    """
    for token in tokens:
        if token.type in reformat:
            if token.type in (TT.KEYWORD, TT.REGISTER):
                yield Token(token.type, token.value, token.value, 0, 0)
            elif token.type in (TT.IDENTIFIER, TT.DATATYPE, TT.SCHEMA,
                    TT.RELATION, TT.ROUTINE):
                yield Token(token.type, token.value,
                    format_ident(token.value, namechars=namechars), 0, 0)
            elif token.type == TT.NUMBER:
                # Ensure decimal values with no decimal portion keep
                # the decimal point (fix for #49)
                if (isinstance(token.value, Decimal)
                        and token.value.as_tuple()[-1] == 0):
                    yield Token(TT.NUMBER, token.value,
                        str(token.value) + '.', 0, 0)
                else:
                    yield Token(TT.NUMBER, token.value,
                        str(token.value), 0, 0)
            elif token.type in (TT.STRING, TT.PASSWORD):
                yield Token(token.type, token.value,
                    quote_str(token.value), 0, 0)
            elif token.type == TT.LABEL:
                yield Token(TT.LABEL, token.value,
                    format_ident(token.value, namechars=namechars) + ':',
                    0, 0)
            elif token.type == TT.PARAMETER:
                yield Token(TT.PARAMETER, token.value,
                    format_param(token.value, namechars=namechars), 0, 0)
            elif token.type == TT.COMMENT:
                # XXX Need more intelligent comment handling
                ##yield (TT.COMMENT, token[1], '/*%s*/' % (token[1]))
                yield Token(token.type, token.value, token.source, 0, 0)
            elif token.type == TT.STATEMENT:
                yield Token(TT.STATEMENT, token.value, statement, 0, 0)
            elif token.type == TT.TERMINATOR:
                yield Token(TT.TERMINATOR, token.value, terminator, 0, 0)
            else:
                yield Token(token.type, token.value, token.source, 0, 0)
        else:
            yield Token(token.type, token.value, token.source, 0, 0)
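# Illustrative sketch (assumes the module's Token namedtuple and TT
# constants, and that the tokenizer stores the canonical upper-cased
# keyword in value while source holds the literal text, consistent
# with the KEYWORD branch above): the canonical value replaces the
# original source:
#
#   >>> toks = [Token(TT.KEYWORD, 'SELECT', 'select', 1, 1)]
#   >>> [t.source for t in format_tokens(toks, reformat=[TT.KEYWORD])]
#   ['SELECT']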
def excerpt(tokens):
    """Returns a truncated string rendition of a sequence of tokens."""
    if len(tokens) > 10:
        tokens = tokens[:10] + [Token(0, None, '...', 0, 0)]
    return ''.join(token.source for token in tokens)
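# Illustrative sketch (assumes the module's Token namedtuple; only the
# source element matters here, so a dummy token type of 0 is used):
#
#   >>> toks = [Token(0, None, str(i) + ' ', 0, 0) for i in range(12)]
#   >>> excerpt(toks)
#   '0 1 2 3 4 5 6 7 8 9 ...'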