def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
        indexing_func: A function that takes one property value and
            returns a set of keywords or phrases.  Pass one in to allow
            more customized search phrase generation.  When omitted, the
            class's get_search_phraseset (if INDEX_MULTI_WORD is true) or
            get_simple_search_phraseset is used.

    Three model variables influence the output of this method:
        INDEX_ONLY: If None, all indexable properties are indexed.
            If a list of property names, only those properties are
            indexed.
        INDEX_MULTI_WORD: Class variable that allows multi-word search
            phrases like "statue of liberty."
        INDEX_STEMMING: Returns stemmed phrases.

    Returns:
        A list of the distinct phrases gathered from every indexed
        property (built from a set, so order is unspecified).
    """
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset

    # Build the stemmer once, and only when stemming is requested.
    stemmer = Stemmer.Stemmer('english') if self.INDEX_STEMMING else None

    phrases = set()
    for prop_name, prop_value in self.properties().iteritems():
        if self.INDEX_ONLY and prop_name not in self.INDEX_ONLY:
            continue
        values = prop_value.get_value_for_datastore(self)
        if not isinstance(values, list):
            values = [values]
        # An empty list property has nothing to index.  Without this
        # guard the values[0] type probe below raises IndexError.
        if not values:
            continue
        # Only the first element is inspected to decide whether the
        # property holds indexable text; Blob values are excluded.
        first = values[0]
        if (not isinstance(first, basestring) or
                isinstance(first, datastore_types.Blob)):
            continue
        for value in values:
            words = indexing_func(value)
            if stemmer is not None:
                words = stemmer.stemWords(words)
            phrases.update(words)
    return list(phrases)
def full_text_search(phrase, limit=10, kind=None,
                     stemming=INDEX_STEMMING,
                     multi_word_literal=INDEX_MULTI_WORD,
                     searched_phrases_out=None):
    """Queries search indices for phrases using a merge-join.

    Args:
        phrase: String.  Search phrase.
        limit: Maximum number of results to return.
        kind: String.  Returned keys/entities are restricted to this kind.
        stemming: If true, StemmedIndex is queried with stemmed terms;
            otherwise LiteralIndex is queried with the terms as typed.
        multi_word_literal: If true and the phrase has more than one
            keyword, multi-word phrases are tried before single keywords.
        searched_phrases_out: Optional list.  Every term actually used as
            a query filter is appended to it, in the order used.

    Returns:
        A list of (key, title) tuples corresponding to the indexed
        entities.  Multi-word literal matches are returned first.
        An empty list is returned when the phrase yields no usable
        search terms.

    TODO -- Should provide feedback if input search phrase has stop
    words, etc.
    """
    if searched_phrases_out is None:
        searched_phrases_out = []

    index_keys = []
    keywords = PUNCTUATION_REGEX.sub(' ', phrase).lower().split()
    if stemming:
        stemmer = Stemmer.Stemmer('english')
        klass = StemmedIndex
    else:
        klass = LiteralIndex

    if len(keywords) > 1 and multi_word_literal:
        # Try to match literal multi-word phrases first
        if len(keywords) == 2:
            search_phrases = [' '.join(keywords)]
        else:
            # Three-word windows whose first and last words are not stop
            # words; the middle word may be one ("statue of liberty").
            search_phrases = [
                ' '.join(keywords[pos:pos + 3])
                for pos in xrange(len(keywords) - 2)
                if keywords[pos] not in STOP_WORDS
                and keywords[pos + 2] not in STOP_WORDS]
        # With no phrase filters the query would match arbitrary index
        # entries, so only query when there is something to match.
        if search_phrases:
            query = klass.all(keys_only=True)
            for search_phrase in search_phrases:
                if stemming:
                    search_phrase = stemmer.stemWord(search_phrase)
                searched_phrases_out.append(search_phrase)
                query = query.filter('phrases =', search_phrase)
            if kind:
                query = query.filter('parent_kind =', kind)
            index_keys = query.fetch(limit=limit)

    if len(index_keys) < limit:
        # Top up the results with entities matching every single keyword.
        new_limit = limit - len(index_keys)
        keywords = [word for word in keywords
                    if len(word) >= SEARCH_PHRASE_MIN_LENGTH]
        if stemming:
            keywords = stemmer.stemWords(keywords)
        # Same reasoning as above: never run an unfiltered query.
        if keywords:
            query = klass.all(keys_only=True)
            for keyword in keywords:
                # Record the keyword actually searched (the original
                # appended the unrelated `phrase` variable here).
                searched_phrases_out.append(keyword)
                query = query.filter('phrases =', keyword)
            if kind:
                query = query.filter('parent_kind =', kind)
            single_word_matches = [key
                                   for key in query.fetch(limit=new_limit)
                                   if key not in index_keys]
            index_keys.extend(single_word_matches)

    return [(key.parent(), SearchIndex.get_title(key.name()))
            for key in index_keys]