def parseQuery(self, query):
    """Tokenize *query*, classify its tokens and parse them into a tree.

    Byte-string queries are decoded as UTF-8 and split with the unicode
    tokenizer; if that raises UnicodeDecodeError the undecoded query is
    split with the plain tokenizer instead.

    Sets ``self._tokens``, ``self._tokentypes``, ``self._index`` and
    ``self._ignored`` as a side effect.

    Returns the tree produced by ``self._parseOrExpr()``.
    Raises ``ParseTree.ParseError`` when that tree is None.
    """
    # Lexical analysis.
    try:
        if isinstance(query, unicode):
            decoded = query
        else:
            decoded = query.decode('utf-8')
        # Keep the decoded form bound to ``query`` so the error message
        # below reports the same value the tokenizer saw.
        query = decoded
        words = _tokenizer_unicode_regex.findall(decoded)
    except UnicodeDecodeError:
        words = _tokenizer_regex.findall(query)
    self._tokens = words

    # Classify tokens: keywords map to their own type, the rest are atoms.
    kinds = []
    for word in words:
        kinds.append(_keywords.get(word.upper(), _ATOM))
    self._tokentypes = kinds

    # Terminate both parallel lists with the _EOF marker.
    words.append(_EOF)
    kinds.append(_EOF)
    self._index = 0

    # Syntactical analysis.
    self._ignored = []  # Ignored words in the query, for parseQueryEx
    tree = self._parseOrExpr()
    self._require(_EOF)
    if tree is not None:
        return tree
    raise ParseTree.ParseError("Query contains only common words: %s" %
                               repr(query))
예제 #2
0
 def _parseTerm(self):
     """Parse one term and return its tree, or None.

     If ``self._check(_LPAREN)`` succeeds, the term is whatever
     ``self._parseOrExpr()`` returns, followed by a required _RPAREN.

     Otherwise the term is a run of atoms read with ``self._parseAtom()``
     for as long as ``self._peek(_ATOM)`` holds.  Falsy atom results are
     dropped; NotNode instances are moved after all other nodes while
     keeping the relative order within each group.

     Returns None when no nodes remain after dropping falsy results, the
     single node when exactly one remains, and ``ParseTree.AndNode(nodes)``
     otherwise.
     Raises ``ParseTree.ParseError`` when every remaining node is a
     NotNode.
     """
     if self._check(_LPAREN):
         tree = self._parseOrExpr()
         self._require(_RPAREN)
         return tree

     nodes = [self._parseAtom()]
     while self._peek(_ATOM):
         nodes.append(self._parseAtom())
     # Use a list comprehension rather than filter(): on Python 3 filter()
     # returns a lazy iterator, which is always truthy and not indexable.
     nodes = [node for node in nodes if node]
     if not nodes:
         return None  # Only stopwords
     # list.sort() is stable, so sorting on the "is a NotNode" flag pushes
     # negated nodes to the end without reordering either group and
     # without ever comparing the node objects themselves.
     nodes.sort(key=lambda node: isinstance(node, ParseTree.NotNode))
     # After sorting, a NotNode in front means there is no positive node.
     if isinstance(nodes[0], ParseTree.NotNode):
         raise ParseTree.ParseError(
             "a term must have at least one positive word")
     if len(nodes) == 1:
         return nodes[0]
     return ParseTree.AndNode(nodes)
예제 #3
0
 def _require(self, tokentype):
     """Raise ParseTree.ParseError unless ``self._check(tokentype)`` holds.

     The error message names the required token type and the token at
     ``self._tokens[self._index]``.
     """
     if self._check(tokentype):
         return
     found = self._tokens[self._index]
     raise ParseTree.ParseError(
         "Token %r required, %r found" % (tokentype, found))