# Requires: import os; from whoosh import analysis, fields, index.
# TurkishStemmer and STOP_WORDS_TR are assumed to be provided by the host
# module (a Turkish snowball stemmer and a Turkish stopword list).
def __init__(self, index_dir: str):
     ts = TurkishStemmer()
     self.__schema = fields.Schema(
         message=fields.TEXT(stored=True,
                             field_boost=1.5,
                             analyzer=analysis.StemmingAnalyzer()
                             | analysis.NgramFilter(minsize=2, maxsize=5)),
         meta_content=fields.TEXT(
             stored=True,
             analyzer=analysis.StemmingAnalyzer()
             | analysis.NgramFilter(minsize=2, maxsize=5)),
         message_id=fields.NUMERIC(stored=True, bits=64),
         chat_id=fields.NUMERIC(stored=True, bits=64),
         message_tr=fields.TEXT(
             stored=False,
             field_boost=1.5,
             analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                                stoplist=STOP_WORDS_TR)
             | analysis.NgramFilter(minsize=2, maxsize=5)),
         meta_content_tr=fields.TEXT(
             stored=False,
             analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                                stoplist=STOP_WORDS_TR)
             | analysis.NgramFilter(minsize=2, maxsize=5)),
     )
     if not os.path.isdir(index_dir):
         os.makedirs(index_dir)
         self.__index = index.create_in(index_dir, self.__schema)
     elif index.exists_in(index_dir):
         self.__index = index.open_dir(index_dir)
     else:
         # The directory exists but holds no index yet; open_dir would
         # raise EmptyIndexError, so create a fresh index instead.
         self.__index = index.create_in(index_dir, self.__schema)
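For context, a minimal sketch of feeding this index; add_message is an illustrative method on the same class, not part of the original snippet:

 def add_message(self, chat_id, message_id, text):
     # Hypothetical helper: index one chat message so both the plain and
     # the Turkish-stemmed ngram fields become searchable.
     writer = self.__index.writer()
     writer.add_document(message=text, message_tr=text,
                         message_id=message_id, chat_id=chat_id)
     writer.commit()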
Example #2
 def _search_ngrams(self, cleaned_query, tool_ngram_minsize,
                    tool_ngram_maxsize, tool_search_limit):
     """
     Break tokens into ngrams and search on those instead.
     This should make searching more resistant to typos and unfinished words.
     See docs at https://whoosh.readthedocs.io/en/latest/ngrams.html
     """
     hits_with_score = {}
     token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
         minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
     ngrams = [token.text for token in token_analyzer(cleaned_query)]
     for query in ngrams:
         # Get the tool list with respective scores for each qgram
         curr_hits = self.searcher.search(
             self.parser.parse('*' + query + '*'),
             limit=float(tool_search_limit))
         for i, curr_hit in enumerate(curr_hits):
             is_present = False
             for prev_hit in hits_with_score:
                 # Check if the tool appears again for the next qgram search
                 if curr_hit['id'] == prev_hit:
                     is_present = True
                     # Add the current score with the previous one if the
                     # tool appears again for the next qgram
                     hits_with_score[prev_hit] = (
                         hits_with_score[prev_hit] + curr_hits.score(i))
             # Add the tool if not present to the collection with its score
             if not is_present:
                 hits_with_score[curr_hit['id']] = curr_hits.score(i)
     # Sort the results based on aggregated BM25 score in decreasing order of scores
     hits_with_score = sorted(hits_with_score.items(),
                              key=lambda x: x[1],
                              reverse=True)
     # Return the tool ids
     return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
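The bookkeeping above amounts to summing BM25 scores per tool id across all qgram queries; an equivalent, more compact form (illustrative, not the project's code) looks like this:

 from collections import defaultdict

 def aggregate_scores(per_qgram_hits, limit):
     # per_qgram_hits: iterable of (tool_id, bm25_score) pairs, one pair
     # per hit returned for each qgram query
     totals = defaultdict(float)
     for tool_id, score in per_qgram_hits:
         totals[tool_id] += score
     # Highest aggregated score first, truncated to the search limit
     return sorted(totals, key=totals.get, reverse=True)[:int(limit)]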
Example #3
def test_shared_composition():
    shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()

    ana1 = shared | analysis.NgramFilter(3)
    ana2 = shared | analysis.DoubleMetaphoneFilter()

    assert_equal([t.text for t in ana1(u("hello"))], ["hel", "ell", "llo"])
    assert_equal([t.text for t in ana2(u("hello"))], ["HL"])
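This works because each '|' builds a new CompositeAnalyzer instead of mutating its operands, so the shared tokenizer/filter prefix can feed any number of analyzers. A quick check (illustrative, not from the original test):

 from whoosh import analysis
 from whoosh.compat import u

 shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
 ana1 = shared | analysis.NgramFilter(3)
 # Composing above did not change the shared chain itself.
 assert [t.text for t in shared(u("Hello"))] == ["hello"]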
Example #4
def test_ngrams():
    s = u("abcdefg h ij klm")
    tk = analysis.RegexTokenizer(r"\S+")

    def dotest(f):
        ana = tk | f
        tokens = ana(s, positions=True, chars=True)
        return "/".join(t.text for t in tokens)

    f = analysis.NgramFilter(3, 4)
    assert_equal(dotest(f), "abc/abcd/bcd/bcde/cde/cdef/def/defg/efg/klm")

    f = analysis.NgramFilter(3, 4, at="start")
    assert_equal(dotest(f), "abc/abcd/klm")

    f = analysis.NgramFilter(3, 4, at="end")
    assert_equal(dotest(f), "defg/efg/klm")

    ana = tk | analysis.NgramFilter(2, 5, at="end")
    tokens = [(t.text, t.startchar, t.endchar) for t in ana(s, chars=True)]
    assert_equal(tokens, [("cdefg", 2, 7), ("defg", 3, 7), ("efg", 4, 7),
                          ("fg", 5, 7), ("ij", 10, 12), ("klm", 13, 16),
                          ("lm", 14, 16)])
Example #5
 def search( self, q, tool_name_boost, tool_section_boost,
             tool_description_boost, tool_label_boost, tool_stub_boost,
             tool_help_boost, tool_search_limit, tool_enable_ngram_search,
             tool_ngram_minsize, tool_ngram_maxsize ):
     """
     Perform search on the in-memory index. Weight in the given boosts.
     """
     # Change field boosts for searcher
     searcher = self.index.searcher(
         weighting=BM25F(
             # BM25F takes per-field B parameters as '<field>_B' keyword
             # arguments; wrapping them in a single field_B dict would
             # silently leave the boosts unused.
             name_B=float( tool_name_boost ),
             section_B=float( tool_section_boost ),
             description_B=float( tool_description_boost ),
             labels_B=float( tool_label_boost ),
             stub_B=float( tool_stub_boost ),
             help_B=float( tool_help_boost )
         )
     )
     # Set query to search name, description, section, help, and labels.
     parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
     # Hyphens have special meaning in Whoosh's query syntax and break the
     # wildcard search below, so strip them by retokenizing the query.
     if q.find( '-' ) != -1:
         q = ' '.join( token.text for token in self.rex( to_unicode( q ) ) )
     # Perform tool search with ngrams if set to true in the config file
     if ( tool_enable_ngram_search is True or tool_enable_ngram_search == "True" ):
         hits_with_score = {}
         token_analyzer = StandardAnalyzer() | analysis.NgramFilter( minsize=int( tool_ngram_minsize ), maxsize=int( tool_ngram_maxsize ) )
         ngrams = [ token.text for token in token_analyzer( q ) ]
         for query in ngrams:
             # Get the tool list with respective scores for each qgram
             curr_hits = searcher.search( parser.parse( '*' + query + '*' ), limit=float( tool_search_limit ) )
             for i, curr_hit in enumerate( curr_hits ):
                 is_present = False
                 for prev_hit in hits_with_score:
                     # Check if the tool appears again for the next qgram search
                     if curr_hit[ 'id' ] == prev_hit:
                         is_present = True
                         # Add the current score with the previous one if the
                         # tool appears again for the next qgram
                         hits_with_score[ prev_hit ] = curr_hits.score(i) + hits_with_score[ prev_hit ]
                 # Add the tool if not present to the collection with its score
                 if not is_present:
                     hits_with_score[ curr_hit[ 'id' ] ] = curr_hits.score(i)
         # Sort the results based on aggregated BM25 score in decreasing order of scores
         hits_with_score = sorted( hits_with_score.items(), key=lambda x: x[1], reverse=True )
         # Return the tool ids
         return [ item[0] for item in hits_with_score[ 0:int( tool_search_limit ) ] ]
     else:
         # Perform the search
         hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
         return [ hit[ 'id' ] for hit in hits ]
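A hedged usage sketch; toolbox_search and every boost value below are illustrative assumptions, not values taken from this snippet:

 # Hypothetical call site: all numbers here are made-up example boosts.
 tool_ids = toolbox_search.search(
     q='bwa', tool_name_boost=9, tool_section_boost=3,
     tool_description_boost=2, tool_label_boost=1, tool_stub_boost=5,
     tool_help_boost=0.5, tool_search_limit=20,
     tool_enable_ngram_search=True, tool_ngram_minsize=3,
     tool_ngram_maxsize=4)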