def delete_by_query(self, q, searcher=None):
    """Deletes any documents matching a query object.

    Note that this method opens and closes a Searcher. If you are calling
    this method repeatedly (for example, deleting changed documents before
    reindexing them), you should open your own Searcher object and pass it
    in with the 'searcher' keyword argument for efficiency.

    :returns: the number of documents deleted.
    """

    if searcher is None:
        from whoosh.searching import Searcher
        s = Searcher(self)
    else:
        s = searcher

    count = 0
    try:
        for docnum in q.docs(s):
            self.delete_document(docnum)
            count += 1
        return count
    finally:
        # Only close the searcher if this method opened it
        if searcher is None:
            s.close()
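# Hedged usage sketch for delete_by_query above: it builds a tiny throwaway
# index and reuses one searcher across the call, as the docstring advises.
# The schema, directory, and documents are illustrative assumptions.
import tempfile

from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.query import Term

schema = Schema(path=ID(stored=True), content=TEXT)
ix = index.create_in(tempfile.mkdtemp(), schema)
w = ix.writer()
w.add_document(path=u"/a", content=u"alpha bravo")
w.add_document(path=u"/b", content=u"bravo charlie")
w.commit()

s = ix.searcher()
try:
    # Passing our own searcher avoids reopening one per call
    deleted = ix.delete_by_query(Term("content", u"bravo"), searcher=s)
finally:
    s.close()
print(deleted)  # expected: 2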
def test_vector_merge(self):
    a = analysis.StandardAnalyzer()
    schema = fields.Schema(
        title=fields.TEXT,
        content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
    ix = self.make_index("testindex", schema, "vector_merge")

    writer = ix.writer()
    writer.add_document(
        title=u"one",
        content=u"This is the story of the black hole story")
    writer.commit()

    writer = ix.writer()
    writer.add_document(title=u"two",
                        content=u"You can read along in your book")
    writer.commit()

    reader = ix.reader()
    searcher = Searcher(reader)

    docnum = searcher.document_number(title=u"one")
    vec = list(reader.vector(docnum, "content").items_as("frequency"))
    self.assertEqual(vec, [(u'black', 1), (u'hole', 1), (u'story', 2)])

    docnum = searcher.document_number(title=u"two")
    vec = list(searcher.vector_as("frequency", docnum, "content"))
    self.assertEqual(vec, [(u'along', 1), (u'book', 1), (u'read', 1),
                           (u'your', 1)])
def test2():
    queries = load_queries()
    ix = index.open_dir(index_dir)
    # A Searcher wraps a reader, not the index itself; use the index's
    # factory method instead of Searcher(ix)
    mysearcher = ix.searcher()
    # mysearcher = ix.searcher(weighting=scoring.BM25F())
    # search() expects a parsed Query object, not a raw string; the
    # "text" field name here is an assumption
    parser = QueryParser("text", ix.schema)
    for query in queries:
        print("Processing query number", query['id'])
        results = mysearcher.search(parser.parse(query['text']), limit=10)
        print(results)
def test_memory_codec():
    from whoosh import analysis, fields, query
    from whoosh.codec import memory
    from whoosh.compat import u
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    # Compare explicitly: a bare "assert x, y" only uses y as the message
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"

    assert reader.has_word_graph("d")
    gr = reader.word_graph("d")
    assert (" ".join(gr.flatten_strings())
            == "aching dipping echoing filling going hopping opening "
               "pulling quelling rolling selling timing using whining "
               "yelling")
def get_index_docnum_of_article_id(article_id: str,
                                   ix_reader: MultiReader,
                                   article_id_fieldname='articleID'):
    ix_searcher = Searcher(ix_reader)
    rslts = ix_searcher.search(
        QueryParser(article_id_fieldname, ix_reader.schema).parse(article_id))
    if len(rslts) == 0:
        LOGGER.warning(
            'Article ID {} was not found in the index'.format(article_id))
        return -1
    if len(rslts) > 1:
        LOGGER.warning(
            'Article ID {} has multiple instances in the index'.format(
                article_id))
    return rslts[0].docnum
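# A hedged usage sketch for get_index_docnum_of_article_id; the directory
# name and article ID are assumptions for illustration. Note that
# ix.reader() returns a MultiReader when the index has several segments,
# matching the annotation above.
from whoosh import index

ix = index.open_dir("article_index")  # hypothetical index directory
reader = ix.reader()
try:
    docnum = get_index_docnum_of_article_id("A123", reader)
    if docnum >= 0:
        print(reader.stored_fields(docnum))
finally:
    reader.close()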
def searcher(self, **kwargs): """Returns a Searcher object for this index. Keyword arguments are passed to the Searcher object's constructor. :*returns*: searching.Searcher """ from whoosh.searching import Searcher return Searcher(self, **kwargs)
def searcher(self, **kwargs): """Returns a Searcher object for this index. Keyword arguments are passed to the Searcher object's constructor. :rtype: :class:`whoosh.searching.Searcher` """ from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self, **kwargs)
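# Typical call pattern for the searcher() factory above. Searcher supports
# the context-manager protocol, so the underlying reader is released
# automatically. The in-memory index and data are illustrative.
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Term

schema = Schema(path=ID(stored=True), content=TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(path=u"/a", content=u"whoosh searches text")
w.commit()

with ix.searcher() as s:
    for hit in s.search(Term("content", u"whoosh")):
        print(hit["path"])  # expected: /a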
def _parse_and_search(
        self,
        searcher: Searcher,
        content: str,
        value: str,
        limit: Optional[int] = None,
        terms: bool = False,
        group: Type[Union[AndGroup, OrGroup]] = OrGroup) -> Results:
    parser = QueryParser(content, self._ix.schema, group=group)
    parser.remove_plugin_class(FieldsPlugin)
    parser.remove_plugin_class(WildcardPlugin)
    query = parser.parse(value)
    return searcher.search(query, terms=terms, limit=limit)
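# A standalone sketch of the same parse-then-search pattern used by
# _parse_and_search above, assuming a minimal in-memory index; the field
# name and documents are illustrative.
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import (FieldsPlugin, OrGroup, QueryParser,
                            WildcardPlugin)

schema = Schema(content=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(content=u"alpha bravo charlie")
w.commit()

parser = QueryParser("content", ix.schema, group=OrGroup)
parser.remove_plugin_class(FieldsPlugin)    # "field:value" stays plain text
parser.remove_plugin_class(WildcardPlugin)  # "*" and "?" stay plain text
q = parser.parse(u"bravo delta")            # OrGroup: bravo OR delta
with ix.searcher() as s:
    print(len(s.search(q, terms=True)))     # expected: 1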
def test_all(self):
    domain = [u"alfa", u"bravo", u"charlie", u"delta", u"echo",
              u"foxtrot"]
    schema = Schema(text=TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    w = ix.writer()
    for _ in xrange(100):
        w.add_document(text=u" ".join(choice(domain)
                                      for i in xrange(randint(10, 20))))
    w.commit()

    # provide initializer arguments for any weighting classes that
    # require them
    init_args = {}

    reader = ix.reader()
    for wclass in self._weighting_classes():
        if wclass in init_args:
            weighting = wclass(*init_args[wclass])
        else:
            weighting = wclass()
        searcher = Searcher(reader, weighting)
        for word in domain:
            r = searcher.search(query.Term("text", word))
def test_missing_field_scoring(self):
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    idx = storage.create_index(schema)

    writer = idx.writer()
    writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    writer.commit()
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 1)  # name

    writer = idx.writer()
    writer.add_document(name=u'Jonny')
    writer.commit()
    self.assertEqual(len(idx.segments), 1)
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 2)  # name

    reader = idx.reader()
    searcher = Searcher(reader)
    parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
    q = parser.parse(u"baseball")
    result = searcher.search(q)
    self.assertEqual(len(result), 1)
def __init__(self,
             file_index: index.FileIndex,
             index_docnums: list = None,
             name: str = 'DB',
             ix_reader: IndexReader = None,
             content_field='body'):
    self.name = name
    self.ix = file_index
    self._reader = (ix_reader if ix_reader is not None
                    else file_index.reader())
    # The reader is "private" only when this object opened it itself
    self._private_reader = ix_reader is None
    self._searcher = Searcher(self._reader)
    if index_docnums is not None:
        self._docnums = set(index_docnums)
    else:
        self._docnums = self._get_all_db_ids()
    self._dfs, self._tfs, self._total_terms = self._build(content_field)
    self._tfidfs = defaultdict(float)
def match_search_query(searcher: Searcher, q: Query) -> bool:
    return bool(searcher.search(q))
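# Possible use of match_search_query as an existence check; the index and
# its contents are illustrative.
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Term

schema = Schema(content=TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(content=u"alpha bravo")
w.commit()

with ix.searcher() as s:
    print(match_search_query(s, Term("content", u"alpha")))  # True
    print(match_search_query(s, Term("content", u"zulu")))   # False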
class WikiSearcher:
    weighting = {
        'TF_IDF': scoring.TF_IDF,
        'BM25F': scoring.BM25F,
        'FREQUENCY': scoring.Frequency,
    }
    group = {
        'OR': qparser.OrGroup,
        'AND': qparser.AndGroup,
    }
    base_url = 'https://en.wikipedia.org/wiki/'

    def __init__(self, index, page_ranker):
        """
        Creates the QueryParser for the query text.

        The 'MultifieldPlugin' is added to the 'QueryParser' because it
        allows the 'search' function to change the field boosts later
        (something that would not be possible with a 'MultifieldParser'
        defined directly).

        - MULTIFIELD: the search covers both the title and the text. The
          parser uses the analyzer defined in the index for the
          corresponding field. If two different analyzers are defined for
          title and text, each is used to parse the query for its own
          field. E.g. text with a stemmer and title without:
          query: 'fortified' --> parsed query: '(text:fortifi OR title:fortified)'
        - GROUP: by default tokens are joined with 'AND'. Specifying
          'OrGroup' joins them with OR. Using the FACTORY gives a higher
          score to documents where a term has a higher frequency; without
          the FACTORY there is no such effect.
        """
        self.index = index
        self.page_ranker = page_ranker
        self.expand = Expander(disambiguate_fn='noun_sense')
        self.parser = qparser.QueryParser(None, index.schema)
        self.multifield_plugin = qparser.MultifieldPlugin(['text', 'title'])
        self.parser.add_plugin(self.multifield_plugin)
        # Note: this shadows the class-level 'weighting' dict, so lookups
        # below go through the class explicitly
        self.weighting = 'BM25F'
        self.searcher = WhooshSearcher(
            reader=self.index.reader(),
            weighting=WikiSearcher.weighting[self.weighting])

    def search(self, text, limit=10, exp=True, page_rank=True,
               text_boost=1.0, title_boost=1.0, weighting='BM25F',
               group='AND'):
        """
        Runs a search with the given parameters. If pagerank is enabled,
        the final document score combines the score returned by the
        search with the pagerank value.

        First the parser and the weighting are configured with the input
        values, then query expansion is applied; finally the text is
        parsed and handed to the searcher, which retrieves the relevant
        documents.

        :param text: text that the parser will turn into a query
        :param limit: maximum number of documents returned
        :param exp: whether to enable query expansion
        :param page_rank: whether to enable pagerank
        :param text_boost: boost for the text field
        :param title_boost: boost for the title field
        :param weighting: weighting method
        :param group: how the query tokens are joined
        :returns: dict with the results
        """
        self.multifield_plugin.boosts = {
            'text': text_boost,
            'title': title_boost
        }
        # Fall back to the group *class*, not the string key
        self.parser.group = WikiSearcher.group.get(group, qparser.AndGroup)

        text, list_token_expanded = self.expand(text) if exp else (text, None)
        query = self.parser.parse(text)

        if weighting != self.weighting:
            print('Setting up the searcher with weighting ' + weighting +
                  '. This may take some time ...')
            # Fall back to the scoring *class*, not the string key
            self.searcher = WhooshSearcher(
                reader=self.index.reader(),
                weighting=WikiSearcher.weighting.get(weighting,
                                                     scoring.BM25F))
            self.weighting = weighting
            print('Weighting set successfully')

        #print('Query : '+str(query))
        res = {}
        results = self.searcher.search(query, limit=limit)
        res['time_second'] = results.runtime
        res['expanded'] = list_token_expanded if exp else []
        res['n_res'] = results.estimated_length()

        final_score_fn, values_page_rank = self.__combinedScore(
            page_rank, results)
        if page_rank:
            results = sorted(results, key=final_score_fn, reverse=True)

        res['docs'] = [{
            'link': WikiSearcher.base_url + result['title'].replace(" ", "_"),
            'title': result['title'],
            'highlight': result.highlights("text", top=2),
            'final_score': final_score_fn(result),
            'score': result.score,
            'page_rank': values_page_rank.get(result['id_page'], -1)
        } for result in results]
        return res

    def __combinedScore(self, page_rank, results):
        """
        Returns a reference to the function used to compute the final
        score combined with the pagerank value. The pagerank values
        computed earlier are fetched through the page ranker's 'getRank'.

        :param page_rank: whether pagerank is needed for the current query
        :param results: results of the query
        """
        values_page_rank = {}
        if page_rank:
            values_page_rank = self.page_ranker.getRank(
                [res['id_page'] for res in results], 5)

        def final_score_fn(result):
            if page_rank:
                return result.score * values_page_rank.get(
                    result['id_page'], 1)
            return result.score

        return final_score_fn, values_page_rank

    def getFieldInfo(self, field):
        """
        Returns information about the given field. Useful for debugging.

        :param field: field to get information about
        :returns: dict with the info for the specified field
        """
        return {'length': self.searcher.field_length(field)}

    def getGeneralInfo(self):
        """
        Returns general information about the index. Useful for debugging.

        :returns: dict with the info
        """
        return {'doc_count': self.searcher.doc_count()}
def searcher(self, **kwargs):
    from whoosh.searching import Searcher
    return Searcher(self.reader(), **kwargs)