def __init__(self, index_dir: str):
    """Set up the dual-language (English/Turkish) message schema and
    create or open the on-disk Whoosh index.

    :param index_dir: directory holding the index; created if missing.
    """
    ts = TurkishStemmer()

    def _ngram_analyzer(**stem_kwargs):
        # Stem first, then expand every token into 2-5 char ngrams so
        # partial words and typo'd queries still match.
        return (analysis.StemmingAnalyzer(**stem_kwargs)
                | analysis.NgramFilter(minsize=2, maxsize=5))

    self.__schema = fields.Schema(
        message=fields.TEXT(stored=True, field_boost=1.5,
                            analyzer=_ngram_analyzer()),
        meta_content=fields.TEXT(stored=True, analyzer=_ngram_analyzer()),
        message_id=fields.NUMERIC(stored=True, bits=64),
        chat_id=fields.NUMERIC(stored=True, bits=64),
        message_tr=fields.TEXT(
            stored=False, field_boost=1.5,
            analyzer=_ngram_analyzer(stemfn=ts.stem,
                                     stoplist=STOP_WORDS_TR)),
        meta_content_tr=fields.TEXT(
            stored=False,
            analyzer=_ngram_analyzer(stemfn=ts.stem,
                                     stoplist=STOP_WORDS_TR)),
    )
    # makedirs handles nested paths and avoids the isdir/mkdir race
    # the original two-step check had.
    os.makedirs(index_dir, exist_ok=True)
    # An existing directory may still lack index files (e.g. created
    # externally); decide on the actual index, not the directory, so we
    # never call open_dir on an empty folder.
    if index.exists_in(index_dir):
        self.__index = index.open_dir(index_dir)
    else:
        self.__index = index.create_in(index_dir, self.__schema)
def _search_ngrams(self, cleaned_query, tool_ngram_minsize,
                   tool_ngram_maxsize, tool_search_limit):
    """
    Break tokens into ngrams and search on those instead.

    This should make searching more resistant to typos and unfinished
    words.

    See docs at https://whoosh.readthedocs.io/en/latest/ngrams.html

    :param cleaned_query: pre-cleaned query string to tokenize.
    :param tool_ngram_minsize: minimum ngram length (coerced to int).
    :param tool_ngram_maxsize: maximum ngram length (coerced to int).
    :param tool_search_limit: max hits per qgram and max ids returned.
    :returns: tool ids sorted by aggregated BM25 score, best first.
    """
    hits_with_score = {}
    token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
        minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
    ngrams = [token.text for token in token_analyzer(cleaned_query)]
    for query in ngrams:
        # Get the tool list with respective scores for each qgram
        curr_hits = self.searcher.search(
            self.parser.parse('*' + query + '*'),
            limit=float(tool_search_limit))
        for i, curr_hit in enumerate(curr_hits):
            # Accumulate the BM25 score per tool across qgrams. Dict
            # lookup is O(1), replacing the original O(n) scan over all
            # previously seen hits for every new hit.
            tool_id = curr_hit['id']
            hits_with_score[tool_id] = (
                curr_hits.score(i) + hits_with_score.get(tool_id, 0))
    # Sort the results based on aggregated BM25 score in decreasing order
    ranked = sorted(hits_with_score.items(), key=lambda x: x[1],
                    reverse=True)
    # Return the tool ids
    return [tool_id for tool_id, _score in ranked[:int(tool_search_limit)]]
def test_shared_composition():
    # Two analyzers built from one shared tokenizer|lowercase prefix
    # must each apply only their own trailing filter.
    base = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    trigram_ana = base | analysis.NgramFilter(3)
    metaphone_ana = base | analysis.DoubleMetaphoneFilter()

    trigrams = [t.text for t in trigram_ana(u("hello"))]
    assert_equal(trigrams, ["hel", "ell", "llo"])

    phonetic = [t.text for t in metaphone_ana(u("hello"))]
    assert_equal(phonetic, ["HL"])
def test_ngrams():
    sample = u("abcdefg h ij klm")
    tokenizer = analysis.RegexTokenizer(r"\S+")

    def run(ngram_filter):
        # Join the emitted ngram texts so each case is one comparison.
        stream = (tokenizer | ngram_filter)(sample, positions=True,
                                            chars=True)
        return "/".join(token.text for token in stream)

    assert_equal(run(analysis.NgramFilter(3, 4)),
                 "abc/abcd/bcd/bcde/cde/cdef/def/defg/efg/klm")
    assert_equal(run(analysis.NgramFilter(3, 4, at="start")),
                 "abc/abcd/klm")
    assert_equal(run(analysis.NgramFilter(3, 4, at="end")),
                 "defg/efg/klm")

    # at="end" must also report correct character offsets per ngram.
    end_ana = tokenizer | analysis.NgramFilter(2, 5, at="end")
    spans = [(t.text, t.startchar, t.endchar)
             for t in end_ana(sample, chars=True)]
    assert_equal(spans, [("cdefg", 2, 7), ("defg", 3, 7), ("efg", 4, 7),
                         ("fg", 5, 7), ("ij", 10, 12), ("klm", 13, 16),
                         ("lm", 14, 16)])
def search(self, q, tool_name_boost, tool_section_boost,
           tool_description_boost, tool_label_boost, tool_stub_boost,
           tool_help_boost, tool_search_limit, tool_enable_ngram_search,
           tool_ngram_minsize, tool_ngram_maxsize):
    """
    Perform search on the in-memory index. Weight in the given boosts.

    :param q: raw query string.
    :param tool_*_boost: per-field BM25F boost factors (coerced to float).
    :param tool_search_limit: max hits per query and max ids returned.
    :param tool_enable_ngram_search: True / "True" enables qgram search.
    :param tool_ngram_minsize: minimum ngram length (coerced to int).
    :param tool_ngram_maxsize: maximum ngram length (coerced to int).
    :returns: list of matching tool ids, best first.
    """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F(field_B={
            'name_B': float(tool_name_boost),
            'section_B': float(tool_section_boost),
            'description_B': float(tool_description_boost),
            'labels_B': float(tool_label_boost),
            'stub_B': float(tool_stub_boost),
            'help_B': float(tool_help_boost),
        }))
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema)
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find('-') != -1:
        q = ' '.join(token.text for token in self.rex(to_unicode(q)))
    # Perform tool search with ngrams if set to true in the config file
    if tool_enable_ngram_search is True or tool_enable_ngram_search == "True":
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
            minsize=int(tool_ngram_minsize),
            maxsize=int(tool_ngram_maxsize))
        ngrams = [token.text for token in token_analyzer(q)]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search(
                parser.parse('*' + query + '*'),
                limit=float(tool_search_limit))
            for i, curr_hit in enumerate(curr_hits):
                # Accumulate the BM25 score per tool across qgrams. Dict
                # lookup is O(1), replacing the original O(n) scan over
                # all previously seen hits for every new hit.
                tool_id = curr_hit['id']
                hits_with_score[tool_id] = (
                    curr_hits.score(i) + hits_with_score.get(tool_id, 0))
        # Sort the results based on aggregated BM25 score in decreasing
        # order of scores
        ranked = sorted(hits_with_score.items(), key=lambda x: x[1],
                        reverse=True)
        # Return the tool ids
        return [tool_id for tool_id, _score
                in ranked[:int(tool_search_limit)]]
    else:
        # Perform the plain wildcard search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]