from tqdm import tqdm
from whoosh.index import FileIndex


def incremental_index_msmarco(index: FileIndex, commit_every_n: int = 1_000_000):
    # Collect the IDs of already-indexed documents so an interrupted run can
    # resume without adding duplicates.
    indexed_docs = set()
    print('Collecting indexed document IDs...')
    with index.searcher() as searcher:
        for doc in searcher.all_stored_fields():
            indexed_docs.add(doc['doc_id'])
    # The MS MARCO document collection contains roughly 3.2M documents.
    remaining = 3_200_000 - len(indexed_docs)
    print(f'Found {len(indexed_docs)} documents, adding {remaining} missing documents...')
    writer = create_writer(index)
    i = 0
    for doc in tqdm(iter_msmarco_docs(), total=remaining, unit='docs'):
        if doc['doc_id'] not in indexed_docs:
            writer.add_document(**doc)
            i += 1
            if i % commit_every_n == 0:
                # A Whoosh writer cannot be reused after commit(): open a new one.
                writer.commit()
                writer = create_writer(index)
    writer.commit()
    print('Done!')
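# Minimal sketches of the two helpers the function above assumes exist.
# `create_writer` returns a fresh writer (a Whoosh writer is single-use) and
# `iter_msmarco_docs` streams the MS MARCO document TSV. The file name,
# buffer size, and field layout are assumptions, not part of the original code.
def create_writer(index: FileIndex):
    # A larger RAM buffer speeds up bulk indexing (assumed setting).
    return index.writer(limitmb=512)


def iter_msmarco_docs(path: str = 'msmarco-docs.tsv'):
    # Each line of the corpus TSV holds: doc_id, url, title, body.
    with open(path, encoding='utf8') as f:
        for line in f:
            doc_id, url, title, body = line.rstrip('\n').split('\t')
            yield {'doc_id': doc_id, 'url': url, 'title': title, 'body': body}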
from typing import Iterable, Tuple

from whoosh.index import FileIndex
from whoosh.qparser import QueryParser


def query(query_str: str, index: FileIndex) -> Iterable[Tuple[str, int, float]]:
    # Parse the query against the 'body' field and yield a
    # (doc_id, rank, score) triple for each hit.
    with index.searcher() as searcher:
        parsed = QueryParser('body', index.schema).parse(query_str)
        results = searcher.search(parsed)
        for result in results:
            yield result['doc_id'], result.rank, result.score
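# Hypothetical usage sketch; the index directory name and query string are
# assumptions, not part of the original code.
if __name__ == '__main__':
    from whoosh.index import open_dir

    ix = open_dir('msmarco_index')
    for doc_id, rank, score in query('what is bm25', ix):
        print(rank, doc_id, score)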
import os

from django.db import models
from django.db.models.signals import class_prepared, post_delete, post_save
from whoosh.fields import Schema
from whoosh.filedb import filestore
from whoosh.index import EmptyIndexError, FileIndex
from whoosh.qparser import QueryParser

# STORAGE_DIR (the index directory) and field_mapping (Django field class ->
# Whoosh field type) are assumed to be defined at module level. This is legacy
# Python 2 / old Django code (unicode, _meta.get_field_by_name).


class WhooshManager(models.Manager):
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop('default', None)
        self.parser = None
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        # Build the Whoosh schema from the model's fields once the model
        # class is fully prepared.
        schema_dict = {}
        for field_name in self.fields:
            field = self.model._meta.get_field_by_name(field_name)[0]
            schema_dict[field.name] = field_mapping[field.__class__]
        self.schema = Schema(**schema_dict)
        if self.index is None:
            self.index = FileIndex.create(self.storage, self.schema)
        self.searcher = self.index.searcher()
        if self.real_time:
            post_save.connect(self.post_save_callback, sender=self.model)
            post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        dct = dict((f, unicode(getattr(instance, f))) for f in self.fields)
        self.index = self.index.refresh()
        writer = self.index.writer()
        if created:
            writer.add_document(**dct)
        else:
            writer.update_document(**dct)
        writer.commit()

    def post_delete_callback(self, sender, instance, **kwargs):
        # Deleting documents from the index is not implemented.
        pass

    def query(self, q):
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        results = self.searcher.search(self.parser.parse(q))
        return self.filter(id__in=[r['id'] for r in results])
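# Hypothetical usage sketch: the model name and fields are illustrative, not
# part of the original code.
class Article(models.Model):
    title = models.CharField(max_length=200)
    body = models.TextField()

    objects = models.Manager()
    search = WhooshManager(default='body', fields=['title', 'body'])

# Article.search.query('whoosh') then returns a QuerySet of matching rows.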
from whoosh import qparser, scoring
from whoosh.filedb.filestore import FileStorage, copy_to_ram
from whoosh.index import FileIndex
from whoosh.qparser import MultifieldParser

# CandidateSelector, Corpus, FieldNames, and the Whoosh `schema` are assumed
# to come from the surrounding project.


class BM25CandidateSelector(CandidateSelector):
    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)

        self.index_path = index_path
        storage = FileStorage(self.index_path, readonly=True)
        # Copy the on-disk index into RAM for faster query-time access.
        self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations

    def fetch_candidates(self, doc_id, candidate_ids_pool):
        # Extract the most discriminative terms from the query document's
        # title and abstract to build the BM25 query.
        title_key_terms = ' '.join([
            t for t, _ in self.searcher.key_terms_from_text(
                'title', self.corpus[doc_id].title, numterms=3)
        ])
        abstract_key_terms = ' '.join([
            t for t, _ in self.searcher.key_terms_from_text(
                'abstract', self.corpus[doc_id].abstract)
        ])
        query = self.query_parser.parse(title_key_terms + ' ' + abstract_key_terms)
        # Request one extra hit because the query document itself is likely to
        # be retrieved; it is filtered out below.
        results = self.searcher.search(query, limit=self.top_k + 1,
                                       optimize=True, scored=True)

        candidate_ids_pool = set(candidate_ids_pool)
        candidate_ids = []
        candidate_scores = []
        for result in results:
            if result['id'] in candidate_ids_pool and result['id'] != doc_id:
                candidate_ids.append(result['id'])
                candidate_scores.append(result.score)

        return candidate_ids, candidate_scores
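# A minimal sketch of the index-building step BM25CandidateSelector assumes
# has already happened: a Whoosh index with title/abstract/id fields matching
# the schema used above. The helper name and the shape of `docs` are
# illustrative assumptions.
import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in


def build_bm25_index(index_path, docs):
    # `docs` is assumed to yield dicts with 'id', 'title' and 'abstract' keys.
    bm25_schema = Schema(title=TEXT, abstract=TEXT, id=ID(stored=True))
    os.makedirs(index_path, exist_ok=True)
    index = create_in(index_path, bm25_schema)
    writer = index.writer()
    for doc in docs:
        writer.add_document(id=doc['id'], title=doc['title'],
                            abstract=doc['abstract'])
    writer.commit()
    return index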
import time

from whoosh import scoring
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage, copy_to_ram
from whoosh.index import FileIndex

# JSearcher and JString are assumed to be Anserini's SimpleSearcher and
# java.lang.String exposed through pyjnius, and `args` to come from argparse.

data_type = 'pd'
valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

searcher = JSearcher(JString(args.index))
searcher.setBM25Similarity(args.k1, args.b)
print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
if args.rm3:
    searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
    print('Initializing RM3, setting fbTerms={}, fbDocs={} and '
          'originalQueryWeight={}'.format(
              args.fbTerms, args.fbDocs, args.originalQueryWeight))

schema = Schema(title=TEXT, abstract=TEXT, id=ID(stored=True))
storage = FileStorage(args.whoosh_index, readonly=True)
bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

with open(args.output, 'w') as fout:
    start_time = time.time()
    for line_number, line in enumerate(open(args.qid_queries)):
        query_id, query = line.strip().split('\t')
        # Expand the query with key terms extracted from the Whoosh index.
        query = update_query_with_key_terms(query, whoosh_searcher)
        # We return one more result because it is almost certain that we will
        # retrieve the document that originated the query.
        hits = searcher.search(JString(query.encode('utf8')), args.hits + 1)
        if line_number % 10 == 0:
            time_per_query = (time.time() - start_time) / (line_number + 1)
            print('Retrieving query {} ({:0.3f} s/query)'.format(
                line_number, time_per_query))
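# A minimal sketch of the `update_query_with_key_terms` helper assumed by the
# script above, mirroring the key-term extraction in BM25CandidateSelector;
# the field names, result depth, and term count are assumptions, not the
# original implementation.
from whoosh.qparser import MultifieldParser, OrGroup


def update_query_with_key_terms(query, whoosh_searcher, numterms=3):
    # Retrieve the top documents for the raw query, then append their most
    # important 'abstract' terms to the original query string.
    parser = MultifieldParser(['title', 'abstract'], whoosh_searcher.schema,
                              group=OrGroup)
    results = whoosh_searcher.search(parser.parse(query), limit=10)
    key_terms = [t for t, _ in results.key_terms('abstract', numterms=numterms)]
    return query + ' ' + ' '.join(key_terms)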