def _FullTextIndex(cls, text):
  """Returns a set of keywords appropriate for full text indexing.

  See SearchableQuery.Search() for details.

  Args:
    text: string

  Returns:
    set of strings
  """
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    # TODO -- Remove embedded code blocks marked by 'pre' tags
    # and name="code"
    text = cls._PUNCTUATION_REGEX.sub(' ', text)
    words = text.lower().split()
    words = set(words)
    words -= cls._FULL_TEXT_STOP_WORDS
    for word in list(words):
      if len(word) < cls._FULL_TEXT_MIN_LENGTH:
        words.remove(word)
  else:
    words = set()
  return words
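
# Usage sketch (hypothetical values: assumes _FULL_TEXT_MIN_LENGTH = 4 and
# that 'the' is among _FULL_TEXT_STOP_WORDS, so 'the' and 'fox' are dropped):
#
#   >>> SearchableEntity._FullTextIndex('The quick brown fox')
#   set(['quick', 'brown'])
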
def _FullTextIndex(cls, text, word_delimiter_regex=None):
  """Returns a set of keywords appropriate for full text indexing.

  See SearchableQuery.Search() for details.

  Args:
    text: string
    word_delimiter_regex: a compiled regex matching the characters that
      delimit words; defaults to cls._word_delimiter_regex.

  Returns:
    set of strings
  """
  if word_delimiter_regex is None:
    word_delimiter_regex = cls._word_delimiter_regex
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = word_delimiter_regex.sub(' ', text)
    words = text.lower().split()
    words = set(unicode(w) for w in words)
    words -= cls._FULL_TEXT_STOP_WORDS
    for word in list(words):
      if len(word) < cls._FULL_TEXT_MIN_LENGTH:
        words.remove(word)
  else:
    words = set()
  return words
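
# Sketch of the delimiter override (the regex here is hypothetical): with a
# whitespace-only delimiter, tokens like email addresses survive intact
# instead of being split on punctuation. Assumes 'bar' is shorter than
# _FULL_TEXT_MIN_LENGTH and so gets dropped.
#
#   >>> import re
#   >>> SearchableEntity._FullTextIndex(u'foo@example.com bar',
#   ...                                 word_delimiter_regex=re.compile(r'\s+'))
#   set([u'foo@example.com'])
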
def get_simple_search_phraseset(cls, text):
  """Returns a simple set of keywords from the given text.

  Args:
    text: String.

  Returns:
    A set of keywords that aren't stop words and meet the length
    requirement.

  >>> Searchable.get_simple_search_phraseset('I shall return.')
  set(['return'])
  """
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = PUNCTUATION_REGEX.sub(' ', text)
    words = text.lower().split()
    words = set(words)
    words -= STOP_WORDS
    for word in list(words):
      if not phraseIsSearchable(word):
        words.remove(word)
  else:
    words = set()
  return words
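
# phraseIsSearchable() is not shown in this listing. A minimal sketch,
# reconstructed from the inline condition it replaced
# ("len(word) >= SEARCH_PHRASE_MIN_LENGTH or str(word) in string.digits"):

def phraseIsSearchable(word):
  # A word is searchable if it meets the length requirement, or if it is a
  # substring of string.digits (single digits and ascending runs like '12').
  return len(word) >= SEARCH_PHRASE_MIN_LENGTH or str(word) in string.digits
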
def get_search_phraseset(cls, text):
  """Returns a set of phrases, including two- and three-word phrases of
  adjacent words that don't span punctuation or stop words.

  Args:
    text: String with punctuation.

  Returns:
    A set of search terms that aren't stop words and meet the length
    requirement. The set includes phrases of adjacent words that aren't
    stop words. (Stop words are allowed in the middle of three-word
    phrases like "Statue of Liberty".)

  >>> Searchable.get_search_phraseset('You look through rosy-colored glasses.')
  set(['look through rosy', 'rosy colored', 'colored', 'colored glasses', 'rosy', 'rosy colored glasses', 'glasses', 'look'])
  >>> Searchable.get_search_phraseset('I saw the Statue of Liberty.')
  set(['saw the statue', 'statue of liberty', 'liberty', 'statue'])
  >>> Searchable.get_search_phraseset('Recalling friends, past and present.')
  set(['recalling', 'recalling friends', 'friends'])
  """
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = text.lower()
    phrases = []
    two_words = []
    three_words = ['', '']
    three_words_no_stop = [False, False]
    text = text.replace('-', ' ')
    fragments = text.split()
    for frag in fragments:
      word, replaced = PUNCTUATION_REGEX.subn('', frag)
      not_end_punctuation = (replaced > 1 or
                             frag[-1] not in string.punctuation)
      if replaced and not_end_punctuation:
        # Embedded punctuation breaks phrase adjacency.
        two_words = []
        three_words = ['', '']
      three_words.append(word)
      # We allow stop words in the middle of three-word phrases.
      if word in STOP_WORDS:
        two_words = []
        three_words_no_stop.append(False)
      else:
        two_words.append(word)
        three_words_no_stop.append(True)
        if phraseIsSearchable(word):
          phrases.append(word)
        if len(two_words) == 2:
          phrases.append(' '.join(two_words))
          del two_words[0]
        if len(three_words) == 3 and three_words_no_stop[0]:
          phrases.append(' '.join(three_words))
      del three_words[0]
      del three_words_no_stop[0]
    phrases = set(phrases)
  else:
    phrases = set()
  return phrases
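
# Worked trace of the sliding windows above, for 'I saw the Statue of
# Liberty.': the two-word window resets at each stop word ('i', 'the', 'of'),
# so no two-word phrase survives; the three-word window emits 'saw the statue'
# and 'statue of liberty' because only the middle slot holds a stop word.
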
def __init__(self, app=None, namespace=None, kind=None, ancestor=None,
             filter_predicate=None, order=None):
  """Constructor.

  Args:
    app: Optional app to query, derived from the environment if not
      specified.
    namespace: Optional namespace to query, derived from the environment if
      not specified.
    kind: Optional kind to query.
    ancestor: Optional ancestor to query.
    filter_predicate: Optional FilterPredicate by which to restrict the
      query.
    order: Optional Order in which to return results.

  Raises:
    datastore_errors.BadArgumentError if any argument is invalid.
  """
  if kind is not None:
    datastore_types.ValidateString(
        kind, 'kind', datastore_errors.BadArgumentError)

  if ancestor is not None and not isinstance(ancestor, entity_pb.Reference):
    raise datastore_errors.BadArgumentError(
        'ancestor argument should be entity_pb.Reference (%r)' %
        (ancestor,))

  if filter_predicate is not None and not isinstance(filter_predicate,
                                                     FilterPredicate):
    raise datastore_errors.BadArgumentError(
        'filter_predicate should be datastore_query.FilterPredicate (%r)' %
        (filter_predicate,))

  super(Query, self).__init__()

  if isinstance(order, CompositeOrder):
    if order.size() == 0:
      order = None
  elif isinstance(order, Order):
    order = CompositeOrder([order])
  elif order is not None:
    raise datastore_errors.BadArgumentError(
        'order should be Order (%r)' % (order,))

  self.__app = datastore_types.ResolveAppId(app)
  self.__namespace = datastore_types.ResolveNamespace(namespace)
  self.__kind = kind
  self.__ancestor = ancestor
  self.__order = order
  self.__filter_predicate = filter_predicate
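
# Usage sketch (hypothetical kind and property name; assumes the standard
# datastore_query module layout):
#
#   >>> from google.appengine.datastore import datastore_query
#   >>> query = datastore_query.Query(
#   ...     kind='Person',
#   ...     order=datastore_query.PropertyOrder('age'))
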
def Search(self, search_query):
  """Add a search query. This may be combined with filters.

  Note that keywords in the search query will be silently dropped if they
  are stop words or too short, i.e. if they wouldn't be indexed.

  Args:
    search_query: string

  Returns:
    # this query
    SearchableQuery
  """
  datastore_types.ValidateString(search_query, 'search query')
  self._search_query = search_query
  return self
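
# Usage sketch ('Story' is a hypothetical kind; Search() returns the query
# itself, so it chains like the other query-builder methods):
#
#   >>> query = SearchableQuery('Story')
#   >>> query.Search('rainbow')
#   >>> for entity in query.Run():
#   ...   pass
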
def Search(self, search_query, word_delimiter_regex=None,
           properties=ALL_PROPERTIES):
  """Add a search query. This may be combined with filters.

  Note that keywords in the search query will be silently dropped if they
  are stop words or too short, i.e. if they wouldn't be indexed.

  Args:
    search_query: string
    word_delimiter_regex: a compiled regex matching the characters that
      delimit words in the search query; defaults to the class's word
      delimiter regex.
    properties: the properties to search over; defaults to ALL_PROPERTIES.

  Returns:
    # this query
    SearchableQuery
  """
  datastore_types.ValidateString(search_query, 'search query')
  self._search_query = search_query
  self._word_delimiter_regex = word_delimiter_regex
  self._properties = properties
  return self
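
# Usage sketch restricting the search to a single, hypothetical property:
#
#   >>> query = SearchableQuery('Story')
#   >>> query.Search('rainbow', properties=['title'])
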
def __init__(self, property, direction=ASCENDING):
  """Constructor.

  Args:
    property: the name of the property by which to sort.
    direction: the direction in which to sort the given property.

  Raises:
    BadArgumentError if the property name or direction is invalid.
  """
  datastore_types.ValidateString(property, 'property',
                                 datastore_errors.BadArgumentError)
  if direction not in self._DIRECTIONS:
    raise datastore_errors.BadArgumentError('unknown direction: %r' %
                                            (direction,))
  self.__order = datastore_pb.Query_Order()
  self.__order.set_property(property.encode('utf-8'))
  self.__order.set_direction(direction)
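
# Usage sketch (assumes this is a PropertyOrder-style constructor with
# ASCENDING/DESCENDING class constants, as in datastore_query; 'price' is
# a hypothetical property):
#
#   >>> order = PropertyOrder('price', PropertyOrder.DESCENDING)
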
def _EnTextIndex(cls, alltext, word_delimiter_regex=None):
  """Indexes only the substrings of alltext matched by cls.MATCH_STR.

  Tokenizes the matched text the same way as _FullTextIndex() and
  accumulates the resulting keywords into cls.words instead of returning
  them.
  """
  text = ' '.join(cls.MATCH_STR.findall(alltext))
  if word_delimiter_regex is None:
    word_delimiter_regex = cls._word_delimiter_regex
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = word_delimiter_regex.sub(' ', text)
    words = text.lower().split()
    words = set(unicode(w) for w in words)
    words -= cls._FULL_TEXT_STOP_WORDS
    for word in list(words):
      if len(word) < cls._FULL_TEXT_MIN_LENGTH:
        words.remove(word)
  else:
    words = set()
  cls.words.update(words)
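
# Usage sketch (hypothetical: MATCH_STR is assumed to match ASCII word runs,
# e.g. re.compile(r'[A-Za-z]+'), so only the English tokens of a
# mixed-language string land in cls.words; assumes _FULL_TEXT_MIN_LENGTH = 4):
#
#   >>> MyModel.words = set()
#   >>> MyModel._EnTextIndex(u'全文検索 full text search')
#   >>> sorted(MyModel.words)
#   [u'full', u'search', u'text']
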
def _FullTextIndex(cls, text, word_delimiter_regex=None):
  """Returns a set of keywords appropriate for full text indexing.

  See SearchableQuery.Search() for details.

  Args:
    text: string
    word_delimiter_regex: a compiled regex matching the characters that
      delimit words; defaults to cls._word_delimiter_regex.

  Returns:
    set of strings
  """

  def detect_CJK_chars(text):
    """Returns True if text contains any CJK ideograph."""
    for char in text:
      code = ord(char)
      if code >= 0x4E00 and code <= 0x9FFF:      # CJK Unified Ideographs
        return True
      elif code >= 0x3400 and code <= 0x4DFF:    # Extension A
        return True
      elif code >= 0x20000 and code <= 0x2A6DF:  # Extension B
        return True
      elif code >= 0xF900 and code <= 0xFAFF:    # Compatibility Ideographs
        return True
      elif code >= 0x2F800 and code <= 0x2FA1F:  # Compatibility Supplement
        return True
    return False

  def generate_CJK_tokens(text):
    offset = 0
    prev_is_CJK = True
    tokens = set()
    for i, char in enumerate(text):
      # Skip non-CJK chars.
      if not detect_CJK_chars(char):
        # Update offset once a non-CJK token starts.
        if prev_is_CJK:
          offset = i
          prev_is_CJK = False
        continue
      else:
        # Flush the cached non-CJK token once a CJK token starts.
        if not prev_is_CJK and i - offset >= cls._FULL_TEXT_MIN_LENGTH:
          tokens.add(text[offset:i])
        j = i + 1
        while (j < len(text) and
               j - i + 1 <= cls._FULL_TEXT_MAX_SUBTOKEN_LENGTH and
               detect_CJK_chars(text[j:j + 1])):
          tokens.add(text[i:j + 1])
          j = j + 1
        prev_is_CJK = True
    # Flush the last non-CJK token.
    if not prev_is_CJK and len(text) - offset >= cls._FULL_TEXT_MIN_LENGTH:
      tokens.add(text[offset:])
    return tokens

  if word_delimiter_regex is None:
    word_delimiter_regex = cls._word_delimiter_regex
  if text:
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = word_delimiter_regex.sub(' ', text)
    words = text.lower().split()
    words = set(unicode(w) for w in words)
    words -= cls._FULL_TEXT_STOP_WORDS
    add_words = set()
    for word in list(words):
      if detect_CJK_chars(word):
        add_words |= generate_CJK_tokens(word)
      elif len(word) < cls._FULL_TEXT_MIN_LENGTH:
        words.remove(word)
    words |= add_words
  else:
    words = set()
  return words
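
# Usage sketch (assumes _FULL_TEXT_MAX_SUBTOKEN_LENGTH = 2, so each CJK run
# is indexed as its overlapping bigrams; note the full run itself also stays
# in the result set):
#
#   >>> SearchableEntity._FullTextIndex(u'全文検索 search')
#   → set containing u'全文検索', u'全文', u'文検', u'検索' and u'search'
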