Example No. 1
    def _FullTextIndex(cls, text):
        """Returns a set of keywords appropriate for full text indexing.

        See SearchableQuery.Search() for details.

        Args:
            text: string

        Returns:
            set of strings
        """

        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
            # TODO -- Remove embedded code blocks marked by 'pre' tags
            # and name="code"
            text = cls._PUNCTUATION_REGEX.sub(' ', text)
            words = text.lower().split()

            words = set(words)

            words -= cls._FULL_TEXT_STOP_WORDS
            for word in list(words):
                if len(word) < cls._FULL_TEXT_MIN_LENGTH:
                    words.remove(word)

        else:
            words = set()

        return words
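
The class attributes used above (_PUNCTUATION_REGEX, _FULL_TEXT_STOP_WORDS, _FULL_TEXT_MIN_LENGTH) are defined elsewhere in the class and are not part of the snippet. A minimal, self-contained sketch of the same pipeline, with stand-in values for those attributes:

import re

# Stand-ins for the class attributes the snippet references.
PUNCTUATION_REGEX = re.compile(r'[^\w\s]+')
STOP_WORDS = set(['a', 'an', 'the', 'of', 'and', 'is', 'to'])
MIN_LENGTH = 3

def full_text_keywords(text):
    """Strip punctuation, lowercase, split, then drop stop words and
    words shorter than MIN_LENGTH (mirrors Example No. 1)."""
    if not text:
        return set()
    words = set(PUNCTUATION_REGEX.sub(' ', text).lower().split())
    words -= STOP_WORDS
    return set(w for w in words if len(w) >= MIN_LENGTH)

# e.g. full_text_keywords('The quick brown fox!') == set(['quick', 'brown', 'fox'])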
Example No. 2
    def _FullTextIndex(cls, text, word_delimiter_regex=None):
        """Returns a set of keywords appropriate for full text indexing.

        See SearchableQuery.Search() for details.

        Args:
            text: string

        Returns:
            set of strings
        """

        if word_delimiter_regex is None:
            word_delimiter_regex = cls._word_delimiter_regex

        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
            text = word_delimiter_regex.sub(' ', text)
            words = text.lower().split()

            words = set(unicode(w) for w in words)

            words -= cls._FULL_TEXT_STOP_WORDS
            for word in list(words):
                if len(word) < cls._FULL_TEXT_MIN_LENGTH:
                    words.remove(word)

        else:
            words = set()

        return words
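
Example No. 2 differs from Example No. 1 in two ways: the delimiter regex is injectable (falling back to the class-level _word_delimiter_regex) and each keyword is coerced to unicode. A hedged sketch of supplying a custom delimiter; the model class and regex here are illustrative, not part of the snippet:

import re

# Hypothetical delimiter that also splits identifiers on underscores and digits.
custom_delimiter = re.compile(r'[\W_0-9]+', re.UNICODE)

# Passing it overrides the class default, so u'error_code_404' would be indexed
# as the keywords u'error' and u'code', assuming both clear the stop-word and
# minimum-length filters:
#     MyModel._FullTextIndex(u'error_code_404', custom_delimiter)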
Example No. 3
    def get_simple_search_phraseset(cls, text):
        """Returns a simple set of keywords from given text.

        Args:
            text: String.

        Returns:
            A set of keywords that aren't stop words and meet length requirement.

        >>> Searchable.get_simple_search_phraseset('I shall return.')
        set(['return'])
        """
        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
            text = PUNCTUATION_REGEX.sub(' ', text)
            words = text.lower().split()
            words = set(words)
            words -= STOP_WORDS
            for word in list(words):
                #if len(word) < SEARCH_PHRASE_MIN_LENGTH and str(word) not in string.digits:
                if not phraseIsSearchable(word):
                    words.remove(word)
        else:
            words = set()
        return words
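
phraseIsSearchable() is referenced here and in Example No. 4 but is not included in this listing. Judging from the commented-out condition it replaced, it is probably close to the sketch below; the constant's value is a guess:

import string

SEARCH_PHRASE_MIN_LENGTH = 4  # stand-in; the real constant is not shown

def phraseIsSearchable(word):
    """Mirrors the commented-out check: keep words that are long enough,
    or that pass the same string.digits membership test."""
    return len(word) >= SEARCH_PHRASE_MIN_LENGTH or str(word) in string.digits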
Example No. 4
    def get_search_phraseset(cls, text):
        """Returns set of phrases, including two and three adjacent word phrases 
           not spanning punctuation or stop words.

        Args:
            text: String with punctuation.

        Returns:
            A set of search terms that aren't stop words and meet length 
            requirement.  Set includes phrases of adjacent words that
            aren't stop words.  (Stop words are allowed in middle of three-word
            phrases like "Statue of Liberty".)

        >>> Searchable.get_search_phraseset('You look through rosy-colored glasses.')
        set(['look through rosy', 'rosy colored', 'colored', 'colored glasses', 'rosy', 'rosy colored glasses', 'glasses', 'look'])
        >>> Searchable.get_search_phraseset('I saw the Statue of Liberty.')
        set(['saw the statue', 'statue of liberty', 'liberty', 'statue'])
        >>> Searchable.get_search_phraseset('Recalling friends, past and present.')
        set(['recalling', 'recalling friends', 'friends'])
        """
        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
            text = text.lower()
            phrases = []
            two_words = []
            three_words = ['', '']
            three_words_no_stop = [False, False]
            text = text.replace('-', ' ')
            fragments = text.split()
            for frag in fragments:
                word, replaced = PUNCTUATION_REGEX.subn('', frag)
                not_end_punctuation = (replaced > 1
                                       or frag[-1] not in string.punctuation)
                if replaced and not_end_punctuation:
                    two_words = []
                    three_words = ['', '']
                three_words.append(word)  # We allow stop words in middle
                if word in STOP_WORDS:
                    two_words = []
                    three_words_no_stop.append(False)
                else:
                    two_words.append(word)
                    three_words_no_stop.append(True)
                    #if len(word) >= SEARCH_PHRASE_MIN_LENGTH or str(word) in string.digits:
                    if phraseIsSearchable(word):
                        phrases.append(word)
                    if len(two_words) == 2:
                        phrases.append(' '.join(two_words))
                        del two_words[0]
                    if len(three_words) == 3 and three_words_no_stop[0]:
                        phrases.append(' '.join(three_words))
                del three_words[0]
                del three_words_no_stop[0]
            phrases = set(phrases)
        else:
            phrases = set()
        return phrases
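
The two_words / three_words / three_words_no_stop bookkeeping implements a small sliding window over the word stream. A simplified, self-contained sketch of the same idea (single words, adjacent pairs with no stop words, and triples whose middle word may be a stop word), ignoring the punctuation-boundary handling the real method performs; the stop-word list and minimum length are stand-ins:

import re

STOP_WORDS = set(['a', 'an', 'and', 'of', 'the', 'i', 'you', 'through'])
MIN_LENGTH = 4

def simple_phrases(text):
    words = re.sub(r'[^\w\s]+', ' ', text.lower()).split()
    phrases = set()
    for i, w in enumerate(words):
        if w in STOP_WORDS:
            continue
        if len(w) >= MIN_LENGTH:
            phrases.add(w)                          # single keyword
        if i + 1 < len(words) and words[i + 1] not in STOP_WORDS:
            phrases.add(w + ' ' + words[i + 1])     # adjacent pair
        if i + 2 < len(words) and words[i + 2] not in STOP_WORDS:
            phrases.add(' '.join(words[i:i + 3]))   # triple; middle may be a stop word
    return phrases

# e.g. simple_phrases('Statue of Liberty') == set(['statue', 'liberty', 'statue of liberty'])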
Example No. 5
    def __init__(self,
                 app=None,
                 namespace=None,
                 kind=None,
                 ancestor=None,
                 filter_predicate=None,
                 order=None):
        """Constructor.

        Args:
            app: Optional app to query, derived from the environment if not
                specified.
            namespace: Optional namespace to query, derived from the environment
                if not specified.
            kind: Optional kind to query.
            ancestor: Optional ancestor to query.
            filter_predicate: Optional FilterPredicate by which to restrict the
                query.
            order: Optional Order in which to return results.

        Raises:
            datastore_errors.BadArgumentError if any argument is invalid.
        """
        if kind is not None:
            datastore_types.ValidateString(kind, 'kind',
                                           datastore_errors.BadArgumentError)
        if ancestor is not None and not isinstance(ancestor,
                                                   entity_pb.Reference):
            raise datastore_errors.BadArgumentError(
                'ancestor argument should be entity_pb.Reference (%r)' %
                (ancestor, ))

        if filter_predicate is not None and not isinstance(
                filter_predicate, FilterPredicate):
            raise datastore_errors.BadArgumentError(
                'filter_predicate should be datastore_query.FilterPredicate (%r)'
                % (filter_predicate, ))

        super(Query, self).__init__()
        if isinstance(order, CompositeOrder):
            if order.size() == 0:
                order = None
        elif isinstance(order, Order):
            order = CompositeOrder([order])
        elif order is not None:
            raise datastore_errors.BadArgumentError(
                'order should be Order (%r)' % (order, ))

        self.__app = datastore_types.ResolveAppId(app)
        self.__namespace = datastore_types.ResolveNamespace(namespace)
        self.__kind = kind
        self.__ancestor = ancestor
        self.__order = order
        self.__filter_predicate = filter_predicate
Example No. 6
    def Search(self, search_query):
        """Add a search query. This may be combined with filters.

        Note that keywords in the search query will be silently dropped if they
        are stop words or too short, i.e. if they wouldn't be indexed.

        Args:
            search_query: string

        Returns:
            SearchableQuery (this query)
        """
        datastore_types.ValidateString(search_query, 'search query')
        self._search_query = search_query
        return self
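
In the legacy App Engine SDK this method is normally reached through the SearchableModel wrapper rather than called directly. A hedged usage sketch; the model class is illustrative:

from google.appengine.ext import db, search

class Article(search.SearchableModel):
    title = db.StringProperty()
    body = db.TextProperty()

# SearchableModel builds a SearchableQuery under the hood and passes the query
# string to Search(); stop words and too-short keywords are dropped before
# matching, as the docstring above notes.
results = Article.all().search('rosy colored glasses').fetch(10)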
Example No. 7
    def Search(self,
               search_query,
               word_delimiter_regex=None,
               properties=ALL_PROPERTIES):
        """Add a search query. This may be combined with filters.

        Note that keywords in the search query will be silently dropped if they
        are stop words or too short, i.e. if they wouldn't be indexed.

        Args:
            search_query: string
            word_delimiter_regex: optional regex overriding the default keyword
                delimiter.
            properties: the properties to search; defaults to ALL_PROPERTIES.

        Returns:
            SearchableQuery (this query)
        """
        datastore_types.ValidateString(search_query, 'search query')
        self._search_query = search_query
        self._word_delimiter_regex = word_delimiter_regex
        self._properties = properties
        return self
Example No. 8
    def __init__(self, property, direction=ASCENDING):
        """Constructor.

        Args:
            property: the name of the property by which to sort.
            direction: the direction in which to sort the given property.

        Raises:
            BadArgumentError if the property name or direction is invalid.
        """
        datastore_types.ValidateString(property, 'property',
                                       datastore_errors.BadArgumentError)
        if not direction in self._DIRECTIONS:
            raise datastore_errors.BadArgumentError('unknown direction: %r' %
                                                    (direction, ))

        self.__order = datastore_pb.Query_Order()
        self.__order.set_property(property.encode('utf-8'))
        self.__order.set_direction(direction)
Example No. 9
    def _EnTextIndex(cls, alltext, word_delimiter_regex=None):
        """Indexes only the substrings of alltext matched by cls.MATCH_STR.

        The matched fragments are tokenized the same way as in _FullTextIndex:
        split on the delimiter regex, lowercased, stripped of stop words and
        short words. The resulting keywords are added to cls.words.
        """
        text = ' '.join(cls.MATCH_STR.findall(alltext))
        if word_delimiter_regex is None:
            word_delimiter_regex = cls._word_delimiter_regex

        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)

            text = word_delimiter_regex.sub(' ', text)
            words = text.lower().split()

            words = set(unicode(w) for w in words)

            words -= cls._FULL_TEXT_STOP_WORDS
            for word in list(words):
                if len(word) < cls._FULL_TEXT_MIN_LENGTH:
                    words.remove(word)

        else:
            words = set()
        cls.words.update(words)
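
cls.MATCH_STR is not part of the snippet; a plausible stand-in that keeps only ASCII/Latin words, which is what the join-of-findall pre-filter suggests:

import re

MATCH_STR = re.compile(r'[A-Za-z]+')  # assumed; the real pattern is not shown

# e.g. ' '.join(MATCH_STR.findall(u'GAE \u5168\u6587 search 123')) == u'GAE search'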
Example No. 10
    def _FullTextIndex(cls, text, word_delimiter_regex=None):
        """Returns a set of keywords appropriate for full text indexing.
    See SearchableQuery.Search() for details.
    Args:
      text: string
    Returns:
      set of strings
    """
        def detect_CJK_chars(text):
            for char in text:
                code = ord(char)
                if code >= 0x4E00 and code <= 0x9FFF:
                    return True
                elif code >= 0x3400 and code <= 0x4DFF:
                    return True
                elif code >= 0x20000 and code <= 0x2A6DF:
                    return True
                elif code >= 0xF900 and code <= 0xFAFF:
                    return True
                elif code >= 0x2F800 and code <= 0x2FA1F:
                    return True
            return False

        def generate_CJK_tokens(text):
            offset = 0
            prev_is_CJK = True
            tokens = set()
            for i, char in enumerate(text):
                #  Skip non-CJK chars
                if not detect_CJK_chars(char):
                    #  Update offset once a non-CJK token starts.
                    if prev_is_CJK:
                        offset = i
                    prev_is_CJK = False
                    continue
                else:
                    #  Flush the cached non-CJK token once a CJK token starts.
                    if not prev_is_CJK and i - offset >= cls._FULL_TEXT_MIN_LENGTH:
                        tokens.add(text[offset:i])
                    j = i + 1
                    while (j < len(text)
                           and j - i + 1 <= cls._FULL_TEXT_MAX_SUBTOKEN_LENGTH
                           and detect_CJK_chars(text[j:j + 1])):
                        tokens.add(text[i:j + 1])
                        j = j + 1
                    prev_is_CJK = True
            #  Flush the last non-CJK token.
            if not prev_is_CJK and len(
                    text) - offset >= cls._FULL_TEXT_MIN_LENGTH:
                tokens.add(text[offset:])
            return tokens

        if word_delimiter_regex is None:
            word_delimiter_regex = cls._word_delimiter_regex

        if text:
            datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
            text = word_delimiter_regex.sub(' ', text)
            words = text.lower().split()

            words = set(unicode(w) for w in words)

            words -= cls._FULL_TEXT_STOP_WORDS
            add_words = set()
            for word in list(words):
                if detect_CJK_chars(word):
                    add_words |= generate_CJK_tokens(word)
                elif len(word) < cls._FULL_TEXT_MIN_LENGTH:
                    words.remove(word)
            words |= add_words

        else:
            words = set()

        return words
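
Note that the helpers above never emit single CJK characters: for each run of CJK characters they emit every substring of length 2 up to _FULL_TEXT_MAX_SUBTOKEN_LENGTH, while non-CJK stretches keep the usual minimum-length rule. A simplified, self-contained sketch of that idea, reduced to character bigrams (the BMP ranges below are a subset of the ones checked above):

def is_cjk(char):
    code = ord(char)
    return (0x4E00 <= code <= 0x9FFF or 0x3400 <= code <= 0x4DFF or
            0xF900 <= code <= 0xFAFF)

def cjk_bigrams(word):
    """Overlapping 2-character subtokens for the CJK characters in word."""
    return set(word[i:i + 2]
               for i in range(len(word) - 1)
               if is_cjk(word[i]) and is_cjk(word[i + 1]))

# e.g. cjk_bigrams(u'\u5168\u6587\u691c\u7d22')
#      == set([u'\u5168\u6587', u'\u6587\u691c', u'\u691c\u7d22'])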