def _TokenizeForType(self, field_type, value, token_position=0):
  """Tokenizes value into a sequence of Tokens.

  NUMBER and GEO fields each map to a single token; text values are split
  into words, with restrict terms (field:value) and quoted phrases expanded
  into their component tokens.
  """
  if field_type == document_pb.FieldValue.NUMBER:
    return [tokens.Token(chars=value, position=token_position)]
  if field_type == document_pb.FieldValue.GEO:
    return [tokens.GeoPoint(latitude=value.lat(), longitude=value.lng(),
                            position=token_position)]

  if self._split_restricts:
    words = self._TokenizeString(value, field_type)
  else:
    words = self.SetCase(value).split()

  result = []
  pos = token_position
  for word in words:
    if self._split_restricts and ':' in word:
      # Break a field:value restrict into its components, one token each.
      parts = word.split(':')
      result.extend(tokens.Token(chars=part, position=pos + offset)
                    for offset, part in enumerate(parts))
      pos += len(parts)
    elif '"' in word:
      # split('"') yields an empty piece at each quote boundary; emit a
      # Quote token for those and a plain Token for the text between.
      parts = word.split('"')
      for offset, part in enumerate(parts):
        if part:
          result.append(tokens.Token(chars=part, position=pos + offset))
        else:
          result.append(tokens.Quote(chars='"', position=pos + offset))
      pos += len(parts)
    else:
      result.append(tokens.Token(chars=word, position=pos))
      pos += 1
  return result
def _Snippet(self, query, field, *args):
  """Create a snippet given a query and the field to query on.

  Args:
    query: A query string containing only a bare term (no operators).
    field: The field name to query on.
    *args: Unused optional arguments. These are not used on dev_appserver.

  Returns:
    A snippet for the field with the query term bolded.

  Raises:
    ExpressionEvaluationError: if this is a sort expression.
  """
  field = query_parser.GetQueryNodeText(field)
  # snippet() is only meaningful as a field expression.
  if self._is_sort_expression:
    raise ExpressionEvaluationError(
        'Failed to parse sort expression \'snippet(' +
        query_parser.GetQueryNodeText(query) + ', ' + field +
        ')\': snippet() is not supported in sort expressions')
  # Numeric fields cannot be snippeted.
  schema = self._inverted_index.GetSchema()
  if schema.IsType(field, document_pb.FieldValue.NUMBER):
    raise ExpressionEvaluationError(
        'Failed to parse field expression \'snippet(' +
        query_parser.GetQueryNodeText(query) + ', ' + field +
        ')\': snippet() argument 2 must be text')

  query_text = query_parser.GetQueryNodeText(query).strip('"')
  for term in self._tokenizer.TokenizeText(query_text):
    term_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
    for posting in self._inverted_index.GetPostingsForToken(term_token):
      # Only postings for this document that carry positions are usable.
      if posting.doc_id != self._doc_pb.id() or not posting.positions:
        continue
      field_val = self._GetFieldValue(
          search_util.GetFieldInDocument(self._doc_pb, field))
      if not field_val:
        continue
      words = self._case_preserving_tokenizer.TokenizeText(field_val)
      doc_words = [word.chars for word in words]
      return self._GenerateSnippet(
          doc_words, posting.positions[0],
          search_util.DEFAULT_MAX_SNIPPET_LENGTH)

  # No matching posting: fall back to a plain prefix of the field value.
  # (Neither loop above contains a break, so this replaces the original
  # for/else clause exactly.)
  field_val = self._GetFieldValue(
      search_util.GetFieldInDocument(self._doc_pb, field))
  if not field_val:
    return ''
  return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
def _ExtractPrefixTokens(self, token):
  """Extracts the prefixes from a term.

  Args:
    token: The token whose stripped text is expanded into prefixes.

  Returns:
    A list of Tokens, one per non-empty prefix of the term (shortest
    first), each carrying the original token's position.
  """
  term = token.chars.strip()
  # Indexing a str always yields a truthy one-character string, so the
  # original per-character `if term[i]` check was dead code: every prefix
  # term[:1] .. term[:len(term)] is emitted unconditionally.
  return [tokens.Token(chars=term[:end], position=token.position)
          for end in range(1, len(term) + 1)]
def _Snippet(self, query, field, *args):
  """Create a snippet given a query and the field to query on.

  Args:
    query: A query string containing only a bare term (no operators).
    field: The field name to query on.
    *args: Unused optional arguments. These are not used on dev_appserver.

  Returns:
    A snippet for the field with the query term bolded.
  """
  field = query_parser.GetQueryNodeText(field)
  query_text = query_parser.GetQueryNodeText(query).strip('"')
  for term in self._tokenizer.TokenizeText(query_text):
    term_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
    for posting in self._inverted_index.GetPostingsForToken(term_token):
      # Only postings for this document that carry positions are usable.
      if posting.doc_id != self._doc_pb.id() or not posting.positions:
        continue
      field_val = search_util.GetFieldValue(
          search_util.GetFieldInDocument(self._doc_pb, field))
      if not field_val:
        continue
      words = self._case_preserving_tokenizer.TokenizeText(field_val)
      doc_words = [word.chars for word in words]
      return self._GenerateSnippet(
          doc_words, posting.positions[0],
          search_util.DEFAULT_MAX_SNIPPET_LENGTH)

  # No matching posting: fall back to a plain prefix of the field value.
  # (Neither loop above contains a break, so this replaces the original
  # for/else clause exactly.)
  field_val = search_util.GetFieldValue(
      search_util.GetFieldInDocument(self._doc_pb, field))
  if not field_val:
    return None
  return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
def _Snippet(self, query, field, *args):
  """Create a snippet given a query and the field to query on.

  Args:
    query: A query string containing only a bare term (no operators).
    field: The field name to query on.
    *args: Unused optional arguments. These are not used on dev_appserver.

  Returns:
    A snippet for the field with the query term bolded, or None (implicit)
    when no posting for the term matches this document.
  """
  field = query_parser.GetQueryNodeText(field)
  terms = self._tokenizer.TokenizeText(
      query_parser.GetQueryNodeText(query).strip('"'))
  for term in terms:
    search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
    postings = self._inverted_index.GetPostingsForToken(search_token)
    for posting in postings:
      if posting.doc_id != self._doc_pb.id() or not posting.positions:
        continue
      field_val = search_util.GetFieldValue(
          search_util.GetFieldInDocument(self._doc_pb, field))
      # Guard against a missing/empty field value: the sibling _Snippet
      # implementations skip such postings, whereas this variant would
      # have passed None straight into TokenizeText and crashed.
      if not field_val:
        continue
      doc_words = [
          token.chars
          for token in self._case_preserving_tokenizer.TokenizeText(field_val)]
      position = posting.positions[0]
      return self._GenerateSnippet(
          doc_words, position, search_util.DEFAULT_MAX_SNIPPET_LENGTH)
def _DocumentCountForTerm(self, term):
  """Returns the document count for documents containing the term."""
  term_token = tokens.Token(chars=term)
  postings = self._PostingsForToken(term_token)
  return len(postings)
def _PostingsForFieldToken(self, field, value):
  """Returns postings for the value occurring in the given field."""
  # Normalize before lookup so the key matches how values were indexed.
  normalized = simple_tokenizer.NormalizeString(value)
  field_token = tokens.Token(chars=normalized, field_name=field)
  return self._PostingsForToken(field_token)
def _PostingsForFieldToken(self, field, value):
  """Returns postings for the value occurring in the given field."""
  # Lower-case before lookup so the key matches how values were indexed.
  lowered = value.lower()
  field_token = tokens.Token(chars=lowered, field_name=field)
  return self._PostingsForToken(field_token)