def test_memory_codec():
    """Exercise the in-memory codec end to end: write five documents, then
    verify term lookup, stored fields, sortable numeric columns, and term
    vectors through a Searcher built on the codec's reader."""
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    # BUG FIX: the original `assert c_values, [...]` made the list the assert
    # *message* (two-expression assert form), so the comparison never ran.
    # Sortable terms come back in sorted order.
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    # The column reader returns values in docnum order, not sorted order.
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"
def test2():
    """Run every saved query against the on-disk index, printing each
    query's id followed by its top-10 result set."""
    queries = load_queries()
    ix = index.open_dir(index_dir)
    searcher = Searcher(ix)
    # Alternative construction with explicit scoring:
    # searcher = ix.searcher(weighting=scoring.BM25F())
    for q in queries:
        print("Processing query number", q['id'])
        hits = searcher.search(q['text'], limit=10)
        print(hits)
def test2():
    """Execute each loaded query (limit 10 hits) and print the results."""
    queries = load_queries()
    ix = index.open_dir(index_dir)
    mysearcher = Searcher(ix)
    # mysearcher = ix.searcher(weighting=scoring.BM25F())
    for current in queries:
        print("Processing query number", current['id'])
        print(mysearcher.search(current['text'], limit=10))
def test_memory_codec():
    """Exercise the in-memory codec: write five documents, then verify term
    lookup, stored fields, sortable numeric columns, term vectors, and the
    spelling word graph through a Searcher built on the codec's reader."""
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    # BUG FIX: the original `assert c_values, [...]` made the list the assert
    # *message* (two-expression assert form), so the comparison never ran.
    # Sortable terms come back in sorted order.
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    # The column reader returns values in docnum order, not sorted order.
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"

    # Field "d" was indexed with spelling=True, so it carries a word graph of
    # the *pre-stemming* token forms.
    assert reader.has_word_graph("d")
    gr = reader.word_graph("d")
    assert (" ".join(gr.flatten_strings())
            == "aching dipping echoing filling going hopping opening "
               "pulling quelling rolling selling timing using whining "
               "yelling")
def _parse_and_search(
        self,
        searcher: Searcher,
        content: str,
        value: str,
        limit: Optional[int] = None,
        terms: bool = False,
        group: Type[Union[AndGroup, OrGroup]] = OrGroup) -> Results:
    """Parse *value* against the *content* field and run it through
    *searcher*.

    The fields and wildcard plugins are stripped from the parser so the
    query text is treated as plain terms of the given field only.
    """
    qp = QueryParser(content, self._ix.schema, group=group)
    for plugin_cls in (FieldsPlugin, WildcardPlugin):
        qp.remove_plugin_class(plugin_cls)
    parsed = qp.parse(value)
    return searcher.search(parsed, terms=terms, limit=limit)
def get_index_docnum_of_article_id(article_id: str,
                                   ix_reader: MultiReader,
                                   article_id_fieldname='articleID'):
    """Look up the index docnum of *article_id*.

    Returns -1 (with a warning) when the id is absent; warns but still
    returns the first match when the id occurs more than once.
    """
    searcher = Searcher(ix_reader)
    parsed = QueryParser(article_id_fieldname, ix_reader.schema).parse(article_id)
    hits = searcher.search(parsed)
    if len(hits) == 0:
        LOGGER.warning(
            'Article ID {} was not found in the index'.format(article_id))
        return -1
    if len(hits) > 1:
        LOGGER.warning(
            'Article ID {} has multiple instances in the index'.format(
                article_id))
    return hits[0].docnum
def test_all(self):
    """Index 100 random documents, then run a Term query for every domain
    word under each available weighting class."""
    domain = [u"alfa", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot"]
    schema = Schema(text=TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)

    w = ix.writer()
    for _ in xrange(100):
        words = u" ".join(choice(domain) for _ in xrange(randint(10, 20)))
        w.add_document(text=words)
    w.commit()

    # Initializer arguments for any weighting classes that require them.
    init_args = {}

    reader = ix.reader()
    for wclass in self._weighting_classes():
        if wclass in init_args:
            weighting = wclass(*init_args[wclass])
        else:
            weighting = wclass()
        searcher = Searcher(reader, weighting)
        for word in domain:
            r = searcher.search(query.Term("text", word))
def test_missing_field_scoring(self):
    """Per-field lengths must stay correct when a later document omits a
    field, and a multifield query must still find the complete document."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    idx = storage.create_index(schema)

    writer = idx.writer()
    writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    writer.commit()
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 1)  # name

    # Second document has no 'hobbies' value at all.
    writer = idx.writer()
    writer.add_document(name=u'Jonny')
    writer.commit()
    self.assertEqual(len(idx.segments), 1)
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 2)  # name

    searcher = Searcher(idx.reader())
    parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
    result = searcher.search(parser.parse(u"baseball"))
    self.assertEqual(len(result), 1)
def test_missing_field_scoring(self):
    """Adding a document that lacks one field must not corrupt the tracked
    field lengths, and multifield search must keep working."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    idx = RamStorage().create_index(schema)

    w = idx.writer()
    w.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    w.commit()
    segment = idx.segments[0]
    self.assertEqual(segment.field_length(0), 2)  # hobbies
    self.assertEqual(segment.field_length(1), 1)  # name

    w = idx.writer()
    w.add_document(name=u'Jonny')  # no hobbies on purpose
    w.commit()
    self.assertEqual(len(idx.segments), 1)
    segment = idx.segments[0]
    self.assertEqual(segment.field_length(0), 2)  # hobbies
    self.assertEqual(segment.field_length(1), 2)  # name

    reader = idx.reader()
    searcher = Searcher(reader)
    q = qparser.MultifieldParser(['name', 'hobbies'], schema=schema).parse(
        u"baseball")
    self.assertEqual(len(searcher.search(q)), 1)
def match_search_query(searcher: Searcher, q: Query) -> bool:
    """Return True when *q* matches at least one document in *searcher*."""
    results = searcher.search(q)
    return bool(results)
class WikiSearcher:
    """Search front-end over a Wikipedia whoosh index.

    Combines a multifield (title + text) query parser, optional query
    expansion, selectable weighting/grouping, and an optional PageRank
    re-ranking of the whoosh results.
    """

    # Scoring models selectable by name in search().
    weighting = {
        'TF_IDF': scoring.TF_IDF,
        'BM25F': scoring.BM25F,
        'FREQUENCY': scoring.Frequency,
    }
    # How parsed query tokens are combined.
    group = {
        'OR': qparser.OrGroup,
        'AND': qparser.AndGroup,
    }
    base_url = 'https://en.wikipedia.org/wiki/'

    def __init__(self, index, page_ranker):
        """Build the parser and the initial searcher.

        A plain QueryParser plus a MultifieldPlugin is used (instead of a
        MultifieldParser) because the plugin's field boosts can be changed
        per-call in search(). The parser uses the analyzers defined in the
        index schema for each field, so e.g. a stemmed text field and an
        unstemmed title field are each parsed with their own analyzer:
        'fortified' -> '(text:fortifi OR title:fortified)'.

        :param index: opened whoosh index
        :param page_ranker: object exposing getRank(ids, n) for PageRank values
        """
        self.index = index
        self.page_ranker = page_ranker
        self.expand = Expander(disambiguate_fn='noun_sense')
        self.parser = qparser.QueryParser(None, index.schema)
        self.multifield_plugin = qparser.MultifieldPlugin(['text', 'title'])
        self.parser.add_plugin(self.multifield_plugin)
        # NOTE: this instance attribute holds the current weighting *name*
        # and shadows the class-level `weighting` dict, which stays
        # reachable as WikiSearcher.weighting.
        self.weighting = 'BM25F'
        self.searcher = WhooshSearcher(
            reader=self.index.reader(),
            weighting=WikiSearcher.weighting[self.weighting])

    def search(self, text, limit=10, exp=True, page_rank=True,
               text_boost=1.0, title_boost=1.0,
               weighting='BM25F', group='AND'):
        """Run a search with the given parameters.

        If page_rank is enabled, the final document score combines the
        whoosh score with the precomputed PageRank value.

        :param text: query text to be parsed
        :param limit: max number of documents returned
        :param exp: enable query expansion
        :param page_rank: enable PageRank re-ranking
        :param text_boost: boost for the text field
        :param title_boost: boost for the title field
        :param weighting: weighting method name (key of WikiSearcher.weighting)
        :param group: token combination mode ('AND' or 'OR')
        :return: dict with timing, expansion info, result count and documents
        """
        self.multifield_plugin.boosts = {
            'text': text_boost,
            'title': title_boost,
        }
        # BUG FIX: the fallback used to be the *string* 'AND', which is not
        # a valid group factory; default to qparser.AndGroup instead.
        self.parser.group = WikiSearcher.group.get(group, qparser.AndGroup)

        text, list_token_expanded = self.expand(text) if exp else (text, None)
        query = self.parser.parse(text)

        if weighting != self.weighting:
            # Rebuilding the searcher with a new weighting can be slow.
            print('Imposto il searcher con il weighting : ' + weighting +
                  ' Può impiegare tempo ...')
            # BUG FIX: the fallback used to be the *string* 'BM25F' rather
            # than a scoring class; default to scoring.BM25F instead.
            self.searcher = WhooshSearcher(
                reader=self.index.reader(),
                weighting=WikiSearcher.weighting.get(weighting, scoring.BM25F))
            self.weighting = weighting
            print('Weighting impostato correttamente')

        #print('Query : '+str(query))
        res = {}
        results = self.searcher.search(query, limit=limit)
        res['time_second'] = results.runtime
        res['expanded'] = list_token_expanded if exp else []
        res['n_res'] = results.estimated_length()

        final_score_fn, values_page_rank = self.__combinedScore(
            page_rank, results)
        if page_rank:
            results = sorted(results, key=final_score_fn, reverse=True)

        res['docs'] = [{
            'link': WikiSearcher.base_url + result['title'].replace(" ", "_"),
            'title': result['title'],
            'highlight': result.highlights("text", top=2),
            'final_score': final_score_fn(result),
            'score': result.score,
            'page_rank': values_page_rank.get(result['id_page'], -1)
        } for result in results]
        return res

    def __combinedScore(self, page_rank, results):
        """Return (scoring function, pagerank values) for *results*.

        The PageRank values previously computed by the page_ranker are
        fetched via getRank; the returned closure multiplies the whoosh
        score by the document's PageRank (1 when missing).

        :param page_rank: whether PageRank applies to the current query
        :param results: whoosh query results
        """
        values_page_rank = {}
        if page_rank:
            values_page_rank = self.page_ranker.getRank(
                [res['id_page'] for res in results], 5)

        def final_score_fn(result):
            if page_rank:
                return result.score * values_page_rank.get(
                    result['id_page'], 1)
            return result.score

        return final_score_fn, values_page_rank

    def getFieldInfo(self, field):
        """Return info about *field* (useful for debugging).

        :param field: field name to inspect
        :return: dict with the field's info
        """
        return {'length': self.searcher.field_length(field)}

    def getGeneralInfo(self):
        """Return general index info (useful for debugging).

        :return: dict with the index info
        """
        return {'doc_count': self.searcher.doc_count()}