import csv

from evaluation_BM25 import evaluate
from whoosh.scoring import BM25F
from search import search_bm25f_or


def hyper_parameter_evaluate():
    ks = [0.25, 0.75, 1.25, 2]
    bs = [i / 10 for i in range(11)]
    with open('ndcg_bm25_abstract.csv', 'w') as file:
        csv_writer = csv.writer(file)
        for k in ks:
            # Single-field BM25
            # scoring_function = BM25F(K1=k)
            # search_function = search_single_field
            # result = evaluate(search_function, scoring_function)
            for titles_b in bs:
                for caption_and_headers_b in bs:
                    for body_b in bs:
                        # BM25F
                        scoring_function = BM25F(
                            K1=k,
                            titles_B=titles_b,
                            caption_and_headers_B=caption_and_headers_b,
                            body_B=body_b)
                        search_function = search_bm25f_or
                        result = evaluate(search_function, scoring_function)
                        # Record the B values too, so each row identifies the
                        # configuration that produced the score.
                        row = [k, titles_b, caption_and_headers_b, body_b, result]
                        csv_writer.writerow(row)
                        print(row)


# Do the evaluation.
# hyper_parameter_evaluate()
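# The sweep above writes one CSV row per configuration. The sketch below is a
# hypothetical follow-up, assuming the row layout [K1, titles_B,
# caption_and_headers_B, body_B, ndcg] written by hyper_parameter_evaluate()
# and a numeric score in the last column; it is not part of the original code.
import csv

with open('ndcg_bm25_abstract.csv') as file:
    rows = [[float(value) for value in row] for row in csv.reader(file) if row]

# Pick the configuration with the highest NDCG (last column).
best = max(rows, key=lambda row: row[-1])
print('Best (K1, titles_B, caption_and_headers_B, body_B):', best[:-1])
print('NDCG:', best[-1])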
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher. Whoosh's BM25F takes per-field B
    # values as <fieldname>_B keyword arguments, so unpack the boost mapping.
    searcher = self.index.searcher(weighting=BM25F(**{
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'labels_B': float(tool_label_boost),
        'stub_B': float(tool_stub_boost),
        'help_B': float(tool_help_boost)
    }))
    # Set query to search name, description, section, help, labels, and stub.
    parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema)
    # Hyphens are wildcards in Whoosh, so re-tokenize queries containing them.
    if q.find('-') != -1:
        q = ' '.join(token.text for token in self.rex(to_unicode(q)))
    # Perform the search
    hits = searcher.search(parser.parse('*' + q + '*'),
                           limit=float(tool_search_limit))
    return [hit['id'] for hit in hits]
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F


def searching(idx, query, limit=10):
    # Score hits with the default BM25F parameters.
    bm = BM25F()
    searcher = idx.searcher(weighting=bm)
    qp = QueryParser("text", schema=idx.schema)
    q = qp.parse(query)
    results = searcher.search(q, limit=limit)
    return results
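# A hypothetical driver for searching() above; "indexdir" is an assumed
# directory name, and the index is expected to contain the 'text' field the
# parser targets. This is a usage sketch, not part of the original snippet.
from whoosh.index import open_dir

idx = open_dir("indexdir")
results = searching(idx, "information retrieval", limit=5)
for hit in results:
    # Hit.score is the BM25F score; Hit.fields() returns the stored fields.
    print(hit.score, hit.fields())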
def search(self, queryEntered, page):
    title = list()
    plot = list()
    poster = list()
    year = list()
    director = list()
    genre = list()
    actors = list()
    tomato_score = list()
    # JY for the sake of demonstrating ranking weight, not going to affect search much visibly.
    # mw = MultiWeighting(BM25F(), tomato_score=FunctionWeighting(custom_weight))
    # plot=BM25F(B=0.75, plot_B=1.0, K1=2.0), actors=BM25F(B=0.75, actors_B=1.0, K1=1.5), director=TF_IDF()
    with self.indexer.searcher(weighting=BM25F()) as search:
        parser = MultifieldParser(['title', 'plot', 'actors', 'director', 'genre'],
                                  schema=self.indexer.schema, termclass=FuzzyTerm)
        # parser.add_plugin(plugins.FuzzyTermPlugin())
        parser.add_plugin(plugins.SequencePlugin())
        query = parser.parse(queryEntered)
        results = search.search_page(query, page, 20,
                                     sortedby='tomato_score',  # 'tomato_score', 'year'
                                     reverse=True)
        for x in results:
            title.append(x['title'])
            plot.append(x['plot'])
            poster.append(x['poster'])
            tomato_score.append(x['tomato_score'])
            year.append(x['year'])
            director.append(x['director'])
            actors.append(x['actors'])
            genre.append(x['genre'])
    return (title, plot, poster, tomato_score, year, actors, director, genre,
            results.pagecount if results.pagecount < 23 else 23)
def search(self, query, return_attribute='id'):
    # Place more weight on title and description than help when scoring.
    # Whoosh's BM25F takes per-field B values as <fieldname>_B keyword arguments.
    searcher = self.index.searcher(
        weighting=BM25F(**{'title_B': 3, 'description_B': 2, 'help_B': 1}))
    # Set query to search title, description, and help.
    parser = MultifieldParser(['title', 'description', 'help'], schema=schema)
    results = searcher.search(parser.parse(query))
    return [result[return_attribute] for result in results]
def search(self, q: str, tool_name_boost: CanConvertToFloat,
           tool_id_boost: CanConvertToFloat,
           tool_section_boost: CanConvertToFloat,
           tool_description_boost: CanConvertToFloat,
           tool_label_boost: CanConvertToFloat,
           tool_stub_boost: CanConvertToFloat,
           tool_help_boost: CanConvertToFloat,
           tool_search_limit: CanConvertToFloat,
           tool_enable_ngram_search: bool,
           tool_ngram_minsize: CanConvertToInt,
           tool_ngram_maxsize: CanConvertToInt) -> List[str]:
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher
    self.searcher = self.index.searcher(weighting=MultiWeighting(
        BM25F(),
        old_id=BM25F(old_id_B=float(tool_id_boost)),
        name=BM25F(name_B=float(tool_name_boost)),
        section=BM25F(section_B=float(tool_section_boost)),
        description=BM25F(description_B=float(tool_description_boost)),
        labels=BM25F(labels_B=float(tool_label_boost)),
        stub=BM25F(stub_B=float(tool_stub_boost)),
        help=BM25F(help_B=float(tool_help_boost))))
    # Use OrGroup to change the default operation for joining multiple terms to logical OR.
    # This means e.g. for the search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
    # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
    # However, this changes scoring: for the search 'bowtie of king arthur' a document with 'arthur arthur arthur'
    # would score higher than a document with 'bowtie arthur', which is usually unexpected for a user.
    # Hence we introduce a bonus on multi-hits via the 'factory()' method, using a scaling factor between 0 and 1.
    # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
    og = OrGroup.factory(0.9)
    self.parser = MultifieldParser(
        ['name', 'old_id', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema, group=og)
    cleaned_query = q.lower()
    if tool_enable_ngram_search is True:
        return self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                   tool_ngram_maxsize, tool_search_limit)
    else:
        cleaned_query = ' '.join(token.text for token in self.rex(cleaned_query))
        # Use the asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
        parsed_query = self.parser.parse(f"*{cleaned_query}*")
        hits = self.searcher.search(parsed_query,
                                    limit=float(tool_search_limit),
                                    sortedby='')
        return [hit['id'] for hit in hits]
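# For reference: MultiWeighting, as used above, scores each field with the
# model registered under that field's name and falls back to the first
# (default) model for every other field. A minimal sketch; the 'help' field
# name and the 0.5 value are illustrative assumptions.
from whoosh.scoring import BM25F, MultiWeighting

# Default BM25F everywhere, but a lower length-normalization B for 'help',
# so long help texts are penalized less strongly.
weighting = MultiWeighting(BM25F(), help=BM25F(help_B=0.5))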
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit,
           tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher. Whoosh's BM25F takes per-field B
    # values as <fieldname>_B keyword arguments, so unpack the boost mapping.
    searcher = self.index.searcher(weighting=BM25F(**{
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'labels_B': float(tool_label_boost),
        'stub_B': float(tool_stub_boost),
        'help_B': float(tool_help_boost)
    }))
    # Set query to search name, description, section, help, labels, and stub.
    parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema)
    # Hyphens are wildcards in Whoosh, so re-tokenize queries containing them.
    if q.find('-') != -1:
        q = ' '.join(token.text for token in self.rex(to_unicode(q)))
    # Perform tool search with ngrams if set to true in the config file
    if tool_enable_ngram_search is True or tool_enable_ngram_search == "True":
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
            minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
        ngrams = [token.text for token in token_analyzer(q)]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search(parser.parse('*' + query + '*'),
                                        limit=float(tool_search_limit))
            for i, curr_hit in enumerate(curr_hits):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit['id'] == prev_hit:
                        is_present = True
                        # Add the current score to the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                # Add the tool, if not present, to the collection with its score
                if not is_present:
                    hits_with_score[curr_hit['id']] = curr_hits.score(i)
        # Sort the results by aggregated BM25 score in decreasing order
        hits_with_score = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
        # Return the tool ids
        return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
    else:
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
def search(self, query, return_attribute='id'):
    # Change field boosts for searcher. Whoosh's BM25F takes per-field B
    # values as <fieldname>_B keyword arguments.
    searcher = self.index.searcher(weighting=BM25F(**{
        'title_B': 9,
        'section_B': 3,
        'description_B': 2,
        'help_B': 0.5
    }))
    # Set query to search title, description, section, and help.
    parser = MultifieldParser(['title', 'description', 'section', 'help'],
                              schema=schema)
    # Perform the search
    hits = searcher.search(parser.parse('*' + query + '*'), limit=20)
    return [hit[return_attribute] for hit in hits]
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit,
           tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher. Whoosh's BM25F takes per-field B
    # values as <fieldname>_B keyword arguments, so unpack the boost mapping.
    self.searcher = self.index.searcher(weighting=BM25F(**{
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'labels_B': float(tool_label_boost),
        'stub_B': float(tool_stub_boost),
        'help_B': float(tool_help_boost)
    }))
    # Use OrGroup to change the default operation for joining multiple terms to logical OR.
    # This means e.g. for the search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
    # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
    # However, this changes scoring: for the search 'bowtie of king arthur' a document with 'arthur arthur arthur'
    # would score higher than a document with 'bowtie arthur', which is usually unexpected for a user.
    # Hence we introduce a bonus on multi-hits via the 'factory()' method, using a scaling factor between 0 and 1.
    # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
    og = OrGroup.factory(0.9)
    self.parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema, group=og)
    cleaned_query = q.lower()
    # Replace hyphens, since they are wildcards in Whoosh causing false positives
    if cleaned_query.find('-') != -1:
        cleaned_query = ' '.join(
            token.text for token in self.rex(to_unicode(cleaned_query)))
    if tool_enable_ngram_search is True:
        return self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                   tool_ngram_maxsize, tool_search_limit)
    else:
        # Use the asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
        parsed_query = self.parser.parse(cleaned_query + '*')
        hits = self.searcher.search(parsed_query,
                                    limit=float(tool_search_limit),
                                    sortedby='')
        return [hit['id'] for hit in hits]
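# The OrGroup.factory(0.9) bonus described in the comments above can be seen
# in isolation. A minimal sketch; the 'text' field name is an assumption.
from whoosh.qparser import QueryParser, OrGroup

# Plain OrGroup treats 'bowtie arthur' as 'bowtie OR arthur' with no reward
# for matching both terms; the factory() variant adds a scaled bonus per
# extra matched term, so multi-term matches rank higher.
parser = QueryParser("text", schema=None, group=OrGroup.factory(0.9))
print(parser.parse("bowtie arthur"))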
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_help_boost, tool_search_limit):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher. Whoosh's BM25F takes per-field B
    # values as <fieldname>_B keyword arguments, so unpack the boost mapping.
    searcher = self.index.searcher(weighting=BM25F(**{
        'name_B': float(tool_name_boost),
        'section_B': float(tool_section_boost),
        'description_B': float(tool_description_boost),
        'help_B': float(tool_help_boost)
    }))
    # Set query to search name, description, section, and help.
    parser = MultifieldParser(['name', 'description', 'section', 'help'],
                              schema=schema)
    # Perform the search
    hits = searcher.search(parser.parse('*' + q + '*'),
                           limit=float(tool_search_limit))
    return [hit['id'] for hit in hits]
def get_searcher(index=INDEX, score_by="BM25F"):
    """
    get_searcher([index=INDEX, score_by="BM25F"])

    Gets a searcher for the index given in the ``index`` parameter.

    The ``score_by`` parameter selects the scoring function applied to each
    (query, document) pair. The available scoring functions are: TF-IDF and
    BM25F.

    .. code-block:: python

        >>> from searcher import get_searcher
        >>> from index import get_index
        >>>
        >>> idx = get_index()
        >>> searcher = get_searcher(idx, score_by="TF-IDF")
        >>>

    :param index: Document index.
    :type index: FileIndex
    :param score_by: Scoring function applied between the user query and a
        retrieved document.
    :type score_by: str
    :returns: Searcher
    """
    try:
        from whoosh.scoring import BM25F, TF_IDF
    except ImportError:
        print("An error occurred while importing the scoring functions.")
        raise

    # Convert to UPPERCASE.
    score_by = score_by.upper()

    # Choose the scoring function, falling back to BM25F.
    if score_by == "TF-IDF":
        score_function = TF_IDF()
    else:
        score_function = BM25F()

    return index.searcher(weighting=score_function)
from whoosh import index
from whoosh.qparser import QueryParser, OrGroup
from whoosh.scoring import BM25F, TF_IDF, Frequency

# OkBM25, PLN, and get_schema are assumed to be project-specific definitions.


def search_index(query, score_func_name, dirname):
    ix = index.open_dir(dirname, schema=get_schema())
    og = OrGroup.factory(0.9)
    qp = QueryParser("content", schema=get_schema(), group=og)
    # qp.add_plugin(FuzzyTermPlugin())
    # query = ' '.join([(x + '~' if len(x) > 5 else x) for x in query.split(' ')])
    q = qp.parse(query)
    # Map the requested name to a scoring function, defaulting to OkBM25.
    if score_func_name == 'bm25f':
        score_func = BM25F()
    elif score_func_name == 'pln':
        score_func = PLN()
    elif score_func_name == 'tfidf':
        score_func = TF_IDF()
    elif score_func_name == 'freq':
        score_func = Frequency()
    else:
        score_func = OkBM25()
    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    results.fragmenter.surround = 100
    return results
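# A hypothetical caller for search_index() above; 'indexdir' and the stored
# 'title'/'content' fields are assumptions about the schema behind
# get_schema(). hit.highlights() requires 'content' to be a stored field.
results = search_index("transgenic growth", "bm25f", "indexdir")
for hit in results:
    print(hit.score, hit.fields().get("title"))
    print(hit.highlights("content"))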
def scorer(self, searcher, fieldname, text, qf=1):
    # Build one scorer per base model (BM25 and TF-IDF) and combine them.
    bm25Scorer = BM25F().scorer(searcher, fieldname, text, qf)
    tfidfScorer = TF_IDF().scorer(searcher, fieldname, text, qf)
    return self.Scorer(tfidfScorer, bm25Scorer)
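# The scorer() method above belongs to a custom weighting model whose class
# definition is not shown. The sketch below is one plausible shape: the class
# name, the inner Scorer, and the 50/50 blend are assumptions, not the
# original author's code.
from whoosh.scoring import BM25F, TF_IDF, BaseScorer, WeightingModel


class CombinedWeighting(WeightingModel):
    """Hypothetical model blending TF-IDF and BM25 scores per posting."""

    class Scorer(BaseScorer):
        def __init__(self, tfidfScorer, bm25Scorer):
            self.tfidfScorer = tfidfScorer
            self.bm25Scorer = bm25Scorer

        def score(self, matcher):
            # Average the two base scores; the equal weights are arbitrary.
            return 0.5 * self.tfidfScorer.score(matcher) + \
                   0.5 * self.bm25Scorer.score(matcher)

        def supports_block_quality(self):
            # Block-quality optimizations are not defined for the blend.
            return False

    def scorer(self, searcher, fieldname, text, qf=1):
        bm25Scorer = BM25F().scorer(searcher, fieldname, text, qf)
        tfidfScorer = TF_IDF().scorer(searcher, fieldname, text, qf)
        return self.Scorer(tfidfScorer, bm25Scorer)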
elif (operation_type == "OR"): op_type = qparser.OrGroup else: op_type = qparser.AndGroup dirname = "indexdir" ix = open_dir(dirname) qp = qparser.MultifieldParser( ['content', 'path', 'title', 'head1', 'head2', 'head3', 'head4'], ix.schema, group=op_type) qp.add_plugin(qparser.PlusMinusPlugin) query = qp.parse(search_input) # print(query) if search_type == "BM25": w = BM25F(B=0.75, K1=1.5) elif search_type == "TFIDF": w = TF_IDF() else: w = BM25F( B=0.75, K1=1.5, ) with ix.searcher(weighting=w) as searcher: results = searcher.search(query, terms=True) results.fragmenter = highlight.ContextFragmenter( maxchars=50, surround=50, ) # print(list(searcher.lexicon("content"))) found_doc_num = results.scored_length()
from evaluation_BM25 import evaluate
from whoosh.scoring import BM25F
from search import search_bm25f_and, search_bm25f_or

scoring_function = BM25F(K1=1.5, titles_B=0.9, caption_and_headers_B=0.9, body_B=0.2)
search_function_and = search_bm25f_and
search_function_or = search_bm25f_or
result_and = evaluate(search_function_and, scoring_function)
result_or = evaluate(search_function_or, scoring_function)

print("Result for and:")
print(result_and)
print("Result for or:")
print(result_or)
        try:
            writer.add_document(path=link['href'], title=filename[:-4],
                                content=f_string)
        except Exception:
            writer.add_document(path=u'None', title=filename[:-4],
                                content=f_string)
writer.commit()

qp = qparser.MultifieldParser(['content', 'path', 'title'], ix.schema,
                              group=qparser.OrGroup)
query = qp.parse("transgenic growth")
# print(query)

b = BM25F(B=0.75, K1=1.5)
t = TF_IDF()
f = Frequency()
with ix.searcher(weighting=f) as searcher:
    results = searcher.search(query, terms=True)
    results.fragmenter = highlight.ContextFragmenter(maxchars=50, surround=90)
    if results:
        for hit in results:
            snip = hit.highlights('content')