Example 1
  def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node."""
    field_text = field.value().string_value()
    phrase_text = query_parser.GetPhraseQueryNodeText(match)

    # Atom fields are compared against the raw phrase text as a unit.
    if field.value().type() == document_pb.FieldValue.ATOM:
      return self._MatchRawPhraseWithRawAtom(field_text, phrase_text)

    # An empty phrase can never match a tokenized text field.
    if not phrase_text:
      return False

    phrase = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(phrase_text))
    field_text = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(field_text))
    if not phrase:
      return True
    posting = None
    for post in self._PostingsForFieldToken(field.name(), phrase[0].chars):
      if post.doc_id == document.id():
        posting = post
        break
    if not posting:
      return False

    def ExtractWords(token_list):
      return (token.chars for token in token_list)

    for position in posting.positions:
      # Align the remaining phrase tokens with the field tokens that
      # follow this occurrence of the first phrase word.
      match_words = list(zip(ExtractWords(field_text[position:]),
                             ExtractWords(phrase)))
      if len(match_words) != len(phrase):
        continue

      # Every aligned pair of tokens must agree for the phrase to match
      # at this position.
      if all(doc_word == match_word for doc_word, match_word in match_words):
        return True
    return False
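
The position check at the heart of the final loop can be shown in isolation. The sketch below is a minimal, self-contained rendition of the same idea, with plain string lists standing in for the stub's Token objects and posting list; phrase_matches_at is a hypothetical name, not part of the App Engine API.

def phrase_matches_at(field_words, phrase_words, position):
  # Slice the field starting at `position` and require an exact,
  # full-length match, the same check the zip/len comparison performs.
  window = field_words[position:position + len(phrase_words)]
  return window == phrase_words

field = ['the', 'quick', 'brown', 'fox']
phrase = ['quick', 'brown']
# In _MatchPhrase the candidate positions come from the posting list for
# the first phrase token; here we simply try two positions by hand.
print(phrase_matches_at(field, phrase, 1))  # True
print(phrase_matches_at(field, phrase, 2))  # False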
Example 2
  def _MatchTextField(self, field, match, document):
    """Check if a textual field matches a query tree node."""

    if match.getType() == QueryParser.FUZZY:
      return self._MatchTextField(field, match.getChild(0), document)

    if match.getType() == QueryParser.VALUE:
      if query_parser.IsPhrase(match):
        return self._MatchPhrase(field, match, document)

      # Atom fields require the query text to equal the whole field value.
      if field.value().type() == document_pb.FieldValue.ATOM:
        return (field.value().string_value() ==
                query_parser.GetQueryNodeText(match))

      query_tokens = self._parser.TokenizeText(
          query_parser.GetQueryNodeText(match))

      # A query that tokenizes to nothing trivially matches.
      if not query_tokens:
        return True

      # A multi-token query matches only if every token matches the field:
      # it is rewritten as one TEXT node per token, an implicit conjunction.
      if len(query_tokens) > 1:
        def QueryNode(token):
          return query_parser.CreateQueryNode(
              search_util.RemoveAccentsNfkd(token.chars), QueryParser.TEXT)
        return all(self._MatchTextField(field, QueryNode(token), document)
                   for token in query_tokens)

      token_text = search_util.RemoveAccentsNfkd(query_tokens[0].chars)
      matching_docids = [
          post.doc_id for post in self._PostingsForFieldToken(
              field.name(), token_text)]
      return document.id() in matching_docids

    def ExtractGlobalEq(node):
      op = node.getType()
      if ((op == QueryParser.EQ or op == QueryParser.HAS) and
          len(node.children) >= 2):
        if node.children[0].getType() == QueryParser.GLOBAL:
          return node.children[1]
      return node

    if match.getType() == QueryParser.CONJUNCTION:
      return all(self._MatchTextField(field, ExtractGlobalEq(child), document)
                 for child in match.children)

    if match.getType() == QueryParser.DISJUNCTION:
      return any(self._MatchTextField(field, ExtractGlobalEq(child), document)
                 for child in match.children)

    if match.getType() == QueryParser.NEGATION:
      raise ExpressionTreeException('Unable to compare "' + field.name() +
                                    '" with negation')

    # Any other node type does not match a text field.
    return False
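
The dispatch over node types above is easiest to see on a toy query tree. The sketch below reproduces the all()/any() recursion with a hypothetical Node class in place of the ANTLR nodes that QueryParser actually produces.

class Node(object):
  """Hypothetical stand-in for a parsed query tree node."""

  def __init__(self, kind, children=None, text=None):
    self.kind = kind
    self.children = children or []
    self.text = text

def matches(node, field_words):
  # Mirrors _MatchTextField's dispatch: leaves test token membership,
  # interior nodes combine their children with all()/any().
  if node.kind == 'VALUE':
    return node.text in field_words
  if node.kind == 'CONJUNCTION':
    return all(matches(child, field_words) for child in node.children)
  if node.kind == 'DISJUNCTION':
    return any(matches(child, field_words) for child in node.children)
  return False

tree = Node('CONJUNCTION', [
    Node('VALUE', text='quick'),
    Node('DISJUNCTION', [Node('VALUE', text='fox'),
                         Node('VALUE', text='dog')])])
print(matches(tree, ['the', 'quick', 'brown', 'fox']))  # True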
Example 3
  def TokenizeValue(self, field_value, token_position=0):
    """Tokenizes a document_pb.FieldValue into a sequence of Tokens."""
    if field_value.type() == document_pb.FieldValue.GEO:
      return self._TokenizeForType(field_type=field_value.type(),
                                   value=field_value.geo(),
                                   token_position=token_position)
    return self._TokenizeForType(field_type=field_value.type(),
                                 value=search_util.RemoveAccentsNfkd(
                                     field_value.string_value()),
                                 token_position=token_position)
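
TokenizeValue is a thin dispatcher: GEO values keep their structured lat/lng payload, while every other type is accent-folded text. The sketch below imitates that dispatch with a plain dict in place of document_pb.FieldValue; remove_accents_nfkd approximates search_util.RemoveAccentsNfkd, and lower() stands in for the tokenizer's case handling.

import unicodedata

def remove_accents_nfkd(text):
  # NFKD-decompose, then drop the combining marks left behind.
  return ''.join(ch for ch in unicodedata.normalize('NFKD', text)
                 if not unicodedata.combining(ch))

def tokenize_value(value):
  # GEO values bypass text normalization entirely.
  if value['type'] == 'GEO':
    return [('geo', value['lat'], value['lng'])]
  return remove_accents_nfkd(value['string']).lower().split()

print(tokenize_value({'type': 'TEXT', 'string': 'Café au Lait'}))
# ['cafe', 'au', 'lait']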
Example 4
  def _TokenizeForType(self, field_type, value, token_position=0):
    """Tokenizes value into a sequence of Tokens."""
    if field_type == document_pb.FieldValue.NUMBER:
      return [tokens.Token(chars=value, position=token_position)]

    if field_type == document_pb.FieldValue.GEO:
      return [tokens.GeoPoint(latitude=value.lat(),
                              longitude=value.lng(),
                              position=token_position)]

    tokens_found = []
    token_strings = []

    if not self._split_restricts:
      token_strings = self.SetCase(
          search_util.RemoveAccentsNfkd(value)).split()
    else:
      token_strings = self._TokenizeString(value, field_type)
    for token in token_strings:
      # Strip any surrounding single quotes (assuming _SINGLE_QUOTE_RE
      # captures the unquoted core of the token in group 1).
      token = _SINGLE_QUOTE_RE.search(token).group(1)
      if ':' in token and self._split_restricts:
        for subtoken in token.split(':'):
          tokens_found.append(
              tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      elif '"' in token:
        # Empty pieces from split('"') mark where a quote character sat.
        for subtoken in token.split('"'):
          if not subtoken:
            tokens_found.append(
                tokens.Quote(chars='"', position=token_position))
          else:
            tokens_found.append(
                tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      else:
        tokens_found.append(
            tokens.Token(chars=token, position=token_position))
        token_position += 1
    return tokens_found
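
The '"' branch above is the subtle one: empty pieces produced by str.split mark where a quote character sat, and each piece, quote or word, consumes one position. Below is a hypothetical trace of just that branch, with plain tuples in place of tokens.Token and tokens.Quote; split_quoted is an illustrative name, not part of the stub.

def split_quoted(token, position):
  # Mirrors the '"' branch of _TokenizeForType.
  found = []
  for piece in token.split('"'):
    found.append(('QUOTE', '"', position) if not piece
                 else ('TOKEN', piece, position))
    position += 1
  return found, position

found, next_position = split_quoted('"hello', 0)
print(found)          # [('QUOTE', '"', 0), ('TOKEN', 'hello', 1)]
print(next_position)  # 2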
Example 5
def QueryNode(token):
  return query_parser.CreateQueryNode(
      search_util.RemoveAccentsNfkd(token.chars), QueryParser.TEXT)
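
This is the same nested helper used in Example 2: each token of a multi-word query becomes its own TEXT node, and the caller then requires every node to match. A hypothetical analogue with plain dicts in place of the parser's node objects:

def make_text_nodes(token_chars):
  # One TEXT node per normalized token, later combined with all().
  return [{'type': 'TEXT', 'text': chars} for chars in token_chars]

field_words = ['the', 'quick', 'brown', 'fox']
nodes = make_text_nodes(['quick', 'fox'])
# A multi-token query is an implicit conjunction: every node must match.
print(all(node['text'] in field_words for node in nodes))  # True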