def _token_literal(self, value, start, end):
     actual_value = value[1:-1]
     actual_value = actual_value.replace('\\`', '`').lstrip()
     # First, if it looks like JSON then we parse it as
     # JSON and any JSON parsing errors propagate as lexing
     # errors.
     if self._looks_like_json(actual_value):
         try:
             return loads(actual_value)
         except ValueError:
             raise LexerError(lexer_position=start,
                              lexer_value=value,
                              message="Bad token %s" % value)
     else:
         potential_value = '"%s"' % actual_value
         try:
             # There's a shortcut syntax where string literals
             # don't have to be quoted.  This is only true if the
             # string doesn't start with chars that could start a valid
             # JSON value.
             return loads(potential_value)
         except ValueError:
             raise LexerError(lexer_position=start,
                              lexer_value=value,
                              message="Bad token %s" % value)
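The helper _looks_like_json is referenced above but not shown. Below is a hedged sketch of what it presumably checks, assuming loads is json.loads and that the test is simply "could this text begin a JSON value"; the exact character set is an assumption, not the library's verbatim code.
 def _looks_like_json(self, value):
     # Hedged sketch of the helper used by _token_literal above.
     if not value:
         return True
     if value[0] in '[{"':
         # Objects, arrays and quoted strings are always treated as JSON.
         return True
     if value in ('true', 'false', 'null'):
         return True
     if value[0] in '-0123456789':
         # Possibly a number; call it JSON only if it actually parses.
         try:
             loads(value)
             return True
         except ValueError:
             return False
     return False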
 def tokenize(self, expression):
     if not expression:
         raise EmptyExpressionError()
     previous_column = 0
     for match in self.master_regex.finditer(expression):
         value = match.group()
         start = match.start()
         end = match.end()
         if match.lastgroup == 'skip':
             # Ignore whitespace.
             previous_column = end
             continue
         if start != previous_column:
             bad_value = expression[previous_column:start]
             # Try to give a good error message.
             if bad_value == '"':
                 raise LexerError(
                     lexer_position=previous_column,
                     lexer_value=value,
                     message='Starting quote is missing the ending quote',
                     expression=expression)
             raise LexerError(lexer_position=previous_column,
                              lexer_value=value,
                              message='Unknown character',
                              expression=expression)
         previous_column = end
         token_type = match.lastgroup
         handler = getattr(self, '_token_%s' % token_type.lower(), None)
         if handler is not None:
             value = handler(value, start, end)
         yield {
             'type': token_type,
             'value': value,
             'start': start,
             'end': end
         }
     # At the end of the loop make sure we've consumed all the input.
     # If we haven't then we have unidentified characters.
     if end != len(expression):
         msg = "Unknown characters at the end of the expression"
         raise LexerError(lexer_position=end,
                          lexer_value='',
                          message=msg,
                          expression=expression)
     else:
         yield {
             'type': 'eof',
             'value': '',
             'start': len(expression),
             'end': len(expression)
         }
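If these methods belong to a lexer class (the snippets look like jmespath.lexer.Lexer), a hedged usage sketch of the generator would be:
 from jmespath import lexer

 # Drain the generator; each token is a dict with type/value/start/end.
 tokens = list(lexer.Lexer().tokenize('foo.bar'))
 # Illustrative stream for 'foo.bar':
 # [{'type': 'unquoted_identifier', 'value': 'foo', 'start': 0, 'end': 3},
 #  {'type': 'dot', 'value': '.', 'start': 3, 'end': 4},
 #  {'type': 'unquoted_identifier', 'value': 'bar', 'start': 4, 'end': 7},
 #  {'type': 'eof', 'value': '', 'start': 7, 'end': 7}]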
 def _token_quoted_identifier(self, value, start, end):
     try:
         return loads(value)
     except ValueError as e:
         error_message = str(e).split(':')[0]
         raise LexerError(lexer_position=start,
                          lexer_value=value,
                          message=error_message)
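Because quoted identifiers are decoded with json.loads, JSON string escapes are honoured and malformed input surfaces as a ValueError, which the method turns into a LexerError. A minimal illustration:
 from json import loads

 assert loads('"foo bar"') == 'foo bar'  # spaces are fine when quoted
 assert loads('"with \\"escaped\\" quotes"') == 'with "escaped" quotes'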
Example #4
 def _consume_quoted_identifier(self):
     start = self._position
     lexeme = '"' + self._consume_until('"') + '"'
     try:
         token_len = self._position - start
         return {
             'type': 'quoted_identifier',
             'value': loads(lexeme),
             'start': start,
             'end': token_len
         }
     except ValueError as e:
         error_message = str(e).split(':')[0]
         raise LexerError(lexer_position=start,
                          lexer_value=lexeme,
                          message=error_message)
Example #5
 def _consume_until(self, delimiter):
     # Consume until the delimiter is reached,
     # allowing for the delimiter to be escaped with "\".
     start = self._position
     buff = ''
     self._next()
     while self._current != delimiter:
         if self._current == '\\':
             buff += '\\'
             self._next()
         if self._current is None:
             raise LexerError(lexer_position=start,
                              lexer_value=self._expression,
                              message="Unclosed %s delimiter" % delimiter)
         buff += self._current
         self._next()
     # Skip the closing delimiter.
     self._next()
     return buff
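A standalone, index-based sketch of the same scan makes the escape handling easy to test; the escaped delimiter stays in the buffer, and callers such as _consume_literal unescape it afterwards. The function name here is illustrative:
 def consume_until(text, delimiter):
     # text starts at the opening delimiter, mirroring the method above.
     pos, buff = 1, ''
     while True:
         if pos >= len(text):
             raise ValueError('Unclosed %s delimiter' % delimiter)
         if text[pos] == delimiter:
             return buff
         if text[pos] == '\\':
             # Keep the backslash and take the next character verbatim.
             buff += '\\'
             pos += 1
             if pos >= len(text):
                 raise ValueError('Unclosed %s delimiter' % delimiter)
         buff += text[pos]
         pos += 1

 assert consume_until('`a\\`b`', '`') == 'a\\`b'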
Example #6
 def _consume_literal(self):
     start = self._position
     lexeme = self._consume_until('`').replace('\\`', '`')
     try:
         # Assume it is valid JSON and attempt to parse.
         parsed_json = loads(lexeme)
     except ValueError:
         try:
             # Invalid JSON values should be converted to quoted
             # JSON strings during the JEP-12 deprecation period.
             parsed_json = loads('"%s"' % lexeme.lstrip())
             warnings.warn("deprecated string literal syntax",
                           PendingDeprecationWarning)
         except ValueError:
             raise LexerError(lexer_position=start,
                              lexer_value=self._expression[start:],
                              message="Bad token %s" % lexeme)
     token_len = self._position - start
     return {
         'type': 'literal',
         'value': parsed_json,
         'start': start,
         'end': token_len
     }
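A minimal sketch of just the JSON fallback (JEP-12): a backtick body that is not valid JSON is retried as a quoted JSON string and a PendingDeprecationWarning is emitted. The function name here is illustrative, not the lexer's API:
 import json
 import warnings

 def parse_backtick_body(lexeme):
     try:
         return json.loads(lexeme)                    # e.g. `{"a": 1}`
     except ValueError:
         warnings.warn("deprecated string literal syntax",
                       PendingDeprecationWarning)
         return json.loads('"%s"' % lexeme.lstrip())  # e.g. `foo`

 assert parse_backtick_body('{"a": 1}') == {'a': 1}
 with warnings.catch_warnings():
     warnings.simplefilter('ignore')
     assert parse_backtick_body('foo') == 'foo'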
Example #7
 def tokenize(self, expression):
     self._initialize_for_expression(expression)
     while self._current is not None:
         if self._current in self.SIMPLE_TOKENS:
             yield {
                 'type': self.SIMPLE_TOKENS[self._current],
                 'value': self._current,
                 'start': self._position,
                 'end': self._position + 1
             }
             self._next()
         elif self._current in self.START_IDENTIFIER:
             start = self._position
             buff = self._current
             while self._next() in self.VALID_IDENTIFIER:
                 buff += self._current
             yield {
                 'type': 'unquoted_identifier',
                 'value': buff,
                 'start': start,
                 'end': start + len(buff)
             }
         elif self._current in self.WHITESPACE:
             self._next()
         elif self._current == '[':
             start = self._position
             next_char = self._next()
             if next_char == ']':
                 self._next()
                 yield {
                     'type': 'flatten',
                     'value': '[]',
                     'start': start,
                     'end': start + 2
                 }
             elif next_char == '?':
                 self._next()
                 yield {
                     'type': 'filter',
                     'value': '[?',
                     'start': start,
                     'end': start + 2
                 }
             else:
                 yield {
                     'type': 'lbracket',
                     'value': '[',
                     'start': start,
                     'end': start + 1
                 }
         elif self._current == "'":
             yield self._consume_raw_string_literal()
         elif self._current == '|':
             yield self._match_or_else('|', 'or', 'pipe')
         elif self._current == '&':
             yield self._match_or_else('&', 'and', 'expref')
         elif self._current == '`':
             yield self._consume_literal()
         elif self._current in self.VALID_NUMBER:
             start = self._position
             buff = self._consume_number()
             yield {
                 'type': 'number',
                 'value': int(buff),
                 'start': start,
                 'end': start + len(buff)
             }
         elif self._current == '-':
             # Negative number.
             start = self._position
             buff = self._consume_number()
             if len(buff) > 1:
                 yield {
                     'type': 'number',
                     'value': int(buff),
                     'start': start,
                     'end': start + len(buff)
                 }
             else:
                 raise LexerError(lexer_position=start,
                                  lexer_value=buff,
                                  message="Unknown token '%s'" % buff)
         elif self._current == '"':
             yield self._consume_quoted_identifier()
         elif self._current == '<':
             yield self._match_or_else('=', 'lte', 'lt')
         elif self._current == '>':
             yield self._match_or_else('=', 'gte', 'gt')
         elif self._current == '!':
             yield self._match_or_else('=', 'ne', 'not')
         elif self._current == '=':
             if self._next() == '=':
                 yield {
                     'type': 'eq',
                     'value': '==',
                     'start': self._position - 1,
                     'end': self._position
                 }
                 self._next()
             else:
                 if self._current is None:
                     # If we're at the EOF, we never advanced
                     # the position so we don't need to rewind
                     # it back one location.
                     position = self._position
                 else:
                     position = self._position - 1
                 raise LexerError(lexer_position=position,
                                  lexer_value='=',
                                  message="Unknown token '='")
         else:
             raise LexerError(lexer_position=self._position,
                              lexer_value=self._current,
                              message="Unknown token %s" % self._current)
     yield {
         'type': 'eof',
         'value': '',
         'start': self._length,
         'end': self._length
     }
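The _match_or_else helper used for |, &, <, >, ! and = is not shown in these snippets. A hedged sketch of what it presumably does, peeking at the next character and emitting either a two-character or a one-character token:
 def _match_or_else(self, expected, match_type, else_type):
     # Hedged sketch; an assumption, not the library's verbatim code.
     start = self._position
     current = self._current
     next_char = self._next()
     if next_char != expected:
         return {'type': else_type, 'value': current,
                 'start': start, 'end': start + 1}
     self._next()
     return {'type': match_type, 'value': current + next_char,
             'start': start, 'end': start + 2}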
Example #8
 def tokenize(self, expression):
     self._initialize_for_expression(expression)
     while self._current is not None:
         if self._current in self.SIMPLE_TOKENS:
             yield {
                 'type': self.SIMPLE_TOKENS[self._current],
                 'value': self._current,
                 'start': self._position,
                 'end': self._position + 1
             }
             self._next()
         elif self._current in self.START_IDENTIFIER:
             start = self._position
             buff = self._current
             while self._next() in self.VALID_IDENTIFIER:
                 buff += self._current
             yield {
                 'type': 'unquoted_identifier',
                 'value': buff,
                 'start': start,
                 'end': start + len(buff)
             }
         elif self._current in self.WHITESPACE:
             self._next()
         elif self._current == '[':
             start = self._position
             next_char = self._next()
             if next_char == ']':
                 self._next()
                 yield {
                     'type': 'flatten',
                     'value': '[]',
                     'start': start,
                     'end': start + 2
                 }
             elif next_char == '?':
                 self._next()
                 yield {
                     'type': 'filter',
                     'value': '[?',
                     'start': start,
                     'end': start + 2
                 }
             else:
                 yield {
                     'type': 'lbracket',
                     'value': '[',
                     'start': start,
                     'end': start + 1
                 }
         elif self._current == "'":
             yield self._consume_raw_string_literal()
         elif self._current == '|':
             yield self._match_or_else('|', 'or', 'pipe')
         elif self._current == '&':
             yield self._match_or_else('&', 'and', 'expref')
         elif self._current == '`':
             yield self._consume_literal()
         elif self._current in self.START_NUMBER:
             start = self._position
             buff = self._current
             while self._next() in self.VALID_NUMBER:
                 buff += self._current
             yield {
                 'type': 'number',
                 'value': int(buff),
                 'start': start,
                 'end': start + len(buff)
             }
         elif self._current == '"':
             yield self._consume_quoted_identifier()
         elif self._current == '<':
             yield self._match_or_else('=', 'lte', 'lt')
         elif self._current == '>':
             yield self._match_or_else('=', 'gte', 'gt')
         elif self._current == '!':
             yield self._match_or_else('=', 'ne', 'not')
         elif self._current == '=':
             yield self._match_or_else('=', 'eq', 'unknown')
         else:
             raise LexerError(lexer_position=self._position,
                              lexer_value=self._current,
                              message="Unknown token %s" % self._current)
     yield {
         'type': 'eof',
         'value': '',
         'start': self._length,
         'end': self._length
     }