    def parseQuery(self, query):
        # Lexical analysis.
        try:
            if not isinstance(query, unicode):
                query = query.decode('utf-8')
            tokens = _tokenizer_unicode_regex.findall(query)
        except UnicodeDecodeError:
            tokens = _tokenizer_regex.findall(query)
        self._tokens = tokens
        # classify tokens
        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
                            for token in tokens]
        # add _EOF
        self._tokens.append(_EOF)
        self._tokentypes.append(_EOF)
        self._index = 0

        # Syntactical analysis.
        self._ignored = []  # Ignored words in the query, for parseQueryEx
        tree = self._parseOrExpr()
        self._require(_EOF)
        if tree is None:
            raise ParseTree.ParseError(
                "Query contains only common words: %s" % repr(query))
        return tree
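    # Illustrative sketch (not part of the original module): assuming the
    # _keywords table maps the boolean operators and parentheses to their
    # token types (e.g. hypothetical _AND, _OR, _NOT constants) and leaves
    # everything else as _ATOM, the lexical pass above would leave the
    # parser in a state roughly like this for the query 'foo AND NOT bar':
    #
    #   parser._tokens     == ['foo', 'AND', 'NOT', 'bar', _EOF]
    #   parser._tokentypes == [_ATOM, _AND, _NOT, _ATOM, _EOF]
    #   parser._index      == 0
    #
    # The trailing _EOF sentinel lets _parseOrExpr() and _require() detect
    # the end of input without any bounds checking.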
    def _parseTerm(self):
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
            nodes = [self._parseAtom()]
            while self._peek(_ATOM):
                nodes.append(self._parseAtom())
            nodes = filter(None, nodes)
            if not nodes:
                return None  # Only stopwords
            structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
                         for i in range(len(nodes))]
            structure.sort()
            nodes = [node for (bit, index, node) in structure]
            if isinstance(nodes[0], ParseTree.NotNode):
                raise ParseTree.ParseError(
                    "a term must have at least one positive word")
            if len(nodes) == 1:
                return nodes[0]
            tree = ParseTree.AndNode(nodes)
        return tree
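    # Illustrative sketch (not part of the original module): the
    # decorate-sort-undecorate step above moves NotNodes behind the positive
    # nodes, because the (bool, index, node) tuples sort first on the
    # isinstance(...) flag (False < True) and then on the original index;
    # the unique index keeps the order of the remaining nodes stable and
    # prevents the sort from ever comparing the node objects themselves.
    # Assuming ParseTree also provides an AtomNode class for plain words, a
    # term whose atoms parsed to
    #
    #   [AtomNode('a'), NotNode(AtomNode('b')), AtomNode('c')]
    #
    # would be reordered to
    #
    #   [AtomNode('a'), AtomNode('c'), NotNode(AtomNode('b'))]
    #
    # so the "at least one positive word" check only has to inspect
    # nodes[0]: if even the first node is a NotNode, the term is purely
    # negative.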
    def _require(self, tokentype):
        if not self._check(tokentype):
            t = self._tokens[self._index]
            msg = "Token %r required, %r found" % (tokentype, t)
            raise ParseTree.ParseError(msg)
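    # Usage sketch (illustrative, not part of the original module), assuming
    # these methods belong to the ZCTextIndex QueryParser and that the
    # standard Lexicon and Splitter from Products.ZCTextIndex are available:
    #
    #   from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
    #   from Products.ZCTextIndex.QueryParser import QueryParser
    #
    #   parser = QueryParser(Lexicon(Splitter()))
    #   tree = parser.parseQuery('foo AND NOT bar')
    #   # tree would be an AndNode combining an atom for 'foo' with a
    #   # NotNode around the atom for 'bar'
    #
    # A malformed query (for example, a missing closing parenthesis) is
    # reported as a ParseTree.ParseError raised from _require() above, and a
    # query consisting only of stop words is rejected by parseQuery().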