Code example #1
0
def incremental_index_msmacro(index: FileIndex,
                              commit_every_n: int = 1_000_000,
                              total_docs: int = 3_200_000):
    """Resume indexing the MS MARCO document corpus into *index*.

    First collects the ``doc_id`` values already stored in the index, then
    streams the corpus and adds only the missing documents, committing every
    ``commit_every_n`` additions so progress survives interruption.

    Args:
        index: Open Whoosh ``FileIndex`` to add documents to.
        commit_every_n: Number of added documents between intermediate commits.
        total_docs: Expected corpus size (default matches MS MARCO's ~3.2M
            documents); used only to size the progress bar.
    """
    indexed_docs = set()

    print('Collecting indexed document IDs...')
    with index.searcher() as searcher:
        for doc in searcher.all_stored_fields():
            indexed_docs.add(doc['doc_id'])

    remaining = total_docs - len(indexed_docs)
    print(
        f'Found {len(indexed_docs)} documents, adding {remaining} missing documents...'
    )
    writer = create_writer(index)
    added = 0
    for doc in tqdm(iter_msmarco_docs(), total=remaining, unit='docs'):
        if doc['doc_id'] not in indexed_docs:
            writer.add_document(**doc)
            added += 1
            # A Whoosh writer cannot be reused after commit(), so reopen one
            # after each checkpoint; periodic commits also bound memory use.
            if added % commit_every_n == 0:
                writer.commit()
                writer = create_writer(index)

    # Commit the final partial batch.
    writer.commit()

    print('Done!')
Code example #2
0
def query(query_str: str,
          index: FileIndex) -> Iterable[Tuple[str, int, float]]:
    """Search the index's 'body' field with *query_str*.

    Yields one ``(doc_id, rank, score)`` tuple per matching document, in
    result order.
    """
    with index.searcher() as searcher:
        parsed = QueryParser('body', index.schema).parse(query_str)
        for hit in searcher.search(parsed):
            yield hit['doc_id'], hit.rank, hit.score
Code example #3
0
File: managers.py — Project: mugwort-rc/django-whoosh
class WhooshManager(models.Manager):
    """Django model manager that mirrors rows of its model into a Whoosh
    full-text index and exposes a ``query()`` search method.

    Constructor kwargs (popped before ``Manager.__init__``):
        default: field name the query parser searches by default.
        fields: model field names to index ('id' is always appended).
        real_time: when True, keep the index in sync via post_save /
            post_delete signals.

    NOTE(review): uses the Python 2 ``unicode`` builtin and the pre-1.8
    ``_meta.get_field_by_name`` API — this is legacy Django code.
    """
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop("default",None)
        self.parser = None  # built lazily on first query()
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        # Make sure the on-disk index directory exists before opening storage.
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            # No usable index yet; it is created in class_prepared_callback
            # once the schema is known.
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        """Attach to *model* and defer schema/index setup until the model
        class is fully prepared."""
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        """Build the Whoosh schema from the model's fields, create the index
        if it does not exist yet, and connect real-time sync signals."""
        schema_dict = {}
        for field_name in self.fields:
            # get_field_by_name returns a tuple; only the field object is used.
            field = self.model._meta.get_field_by_name(field_name)[0]
            schema_dict[field.name] = field_mapping[field.__class__]
        self.schema = Schema(**schema_dict)
        if self.index is None:
            self.index = FileIndex.create(self.storage, self.schema)
        # Long-lived searcher reused by query(); not refreshed after writes.
        self.searcher = self.index.searcher()
        if self.real_time:
            post_save.connect(self.post_save_callback, sender=self.model)
            post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        """Add (on create) or update the saved instance's index document."""
        # All indexed values are coerced to text via Python 2 ``unicode``.
        dct = dict([(f, unicode(getattr(instance, f))) for f in self.fields])
        self.index = self.index.refresh()
        writer = self.index.writer()
        if created:
            writer.add_document(**dct)
        else:
            # NOTE(review): update_document requires a unique field in the
            # schema to replace the old document — confirm 'id' is unique.
            writer.update_document(**dct)
        writer.commit()

    def post_delete_callback(self, sender, instance, **kwargs):
        # NOTE(review): deletions are not propagated to the index, so deleted
        # rows can still surface in search results.
        pass

    def query(self, q):
        """Parse *q*, run it against the index, and return a queryset of the
        matching model rows (matched by indexed 'id')."""
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        results = self.searcher.search(self.parser.parse(q))
        return self.filter(id__in=[r['id'] for r in results])
Code example #4
0
class WhooshManager(models.Manager):
    """Model manager keeping a Whoosh full-text index in sync with its
    Django model and answering searches through ``query()``.

    Recognized constructor kwargs (removed before ``Manager.__init__``):
        default: the field the query parser targets when none is given.
        fields: names of model fields to index ('id' is always included).
        real_time: if True, index writes are triggered by model signals.

    NOTE(review): relies on Python 2's ``unicode`` and the old
    ``_meta.get_field_by_name`` API — legacy Django/Python code.
    """
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop("default", None)
        self.parser = None  # created on first call to query()
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        # Create the index directory on first use.
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            # Index not created yet — deferred to class_prepared_callback,
            # which knows the schema.
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        """Register with *model*; schema construction waits for the
        class_prepared signal."""
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        """Derive the Whoosh schema from model fields, create the index when
        missing, and hook up real-time signal handlers."""
        schema_dict = {}
        for field_name in self.fields:
            # Only the field object from the returned tuple is needed.
            field = self.model._meta.get_field_by_name(field_name)[0]
            schema_dict[field.name] = field_mapping[field.__class__]
        self.schema = Schema(**schema_dict)
        if self.index is None:
            self.index = FileIndex.create(self.storage, self.schema)
        # Searcher opened once and reused; it is not reopened after writes.
        self.searcher = self.index.searcher()
        if self.real_time:
            post_save.connect(self.post_save_callback, sender=self.model)
            post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        """Write the saved instance into the index (add or update)."""
        # Every indexed value is stringified with Python 2 ``unicode``.
        dct = dict([(f, unicode(getattr(instance, f))) for f in self.fields])
        self.index = self.index.refresh()
        writer = self.index.writer()
        if created:
            writer.add_document(**dct)
        else:
            # NOTE(review): update_document needs a unique schema field to
            # locate the existing document — verify 'id' is marked unique.
            writer.update_document(**dct)
        writer.commit()

    def post_delete_callback(self, sender, instance, **kwargs):
        # NOTE(review): intentionally a no-op — index entries of deleted
        # rows are never removed.
        pass

    def query(self, q):
        """Search the index with *q* and return the corresponding model rows
        as a queryset filtered on the stored 'id' values."""
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        results = self.searcher.search(self.parser.parse(q))
        return self.filter(id__in=[r['id'] for r in results])
Code example #5
0
class BM25CandidateSelector(CandidateSelector):
    """Candidate selector backed by a Whoosh BM25F index: queries the index
    with key terms extracted from a document's title and abstract."""

    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)
        self.index_path = index_path

        # Copy the read-only on-disk index into RAM for faster searching.
        disk_storage = FileStorage(self.index_path, readonly=True)
        self._bm25_index = FileIndex(copy_to_ram(disk_storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations

    def fetch_candidates(self, doc_id, candidate_ids_pool):
        """Return ``(candidate_ids, candidate_scores)`` for *doc_id*,
        restricted to ids in *candidate_ids_pool* and excluding the query
        document itself."""
        # Build the query from key terms of the document's title (top 3)
        # and abstract.
        title_terms = ' '.join(
            term for term, _ in self.searcher.key_terms_from_text(
                'title', self.corpus[doc_id].title, numterms=3))
        abstract_terms = ' '.join(
            term for term, _ in self.searcher.key_terms_from_text(
                'abstract', self.corpus[doc_id].abstract))
        parsed_query = self.query_parser.parse(title_terms + " " +
                                               abstract_terms)
        # Ask for one extra hit: the query document itself is almost
        # certainly retrieved and is filtered out below.
        hits = self.searcher.search(parsed_query,
                                    limit=self.top_k + 1,
                                    optimize=True,
                                    scored=True)

        pool = set(candidate_ids_pool)
        candidate_ids = []
        candidate_scores = []
        for hit in hits:
            hit_id = hit['id']
            if hit_id in pool and hit_id != doc_id:
                candidate_ids.append(hit_id)
                candidate_scores.append(hit.score)

        return candidate_ids, candidate_scores
Code example #6
0
      # NOTE(review): this chunk starts mid-function — the conditional that
      # selects data_type is above the visible region.
      data_type = 'pd'
      valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

    # Anserini/Pyserini (JVM-backed) BM25 searcher over the main index.
    searcher = JSearcher(JString(args.index))
    searcher.setBM25Similarity(args.k1, args.b)
    print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
    if args.rm3:
        # Optional RM3 pseudo-relevance-feedback reranker.
        searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
        print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format(args.fbTerms, args.fbDocs, args.originalQueryWeight))

    # Separate Whoosh BM25F index, used below only to expand queries with
    # key terms (update_query_with_key_terms).
    schema = Schema(title=TEXT,
                    abstract=TEXT,
                    id=ID(stored=True))
    storage = FileStorage(args.whoosh_index, readonly=True)
    bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
    whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

    with open(args.output, 'w') as fout:
      start_time = time.time()
      for line_number, line in enumerate(open(args.qid_queries)):
          query_id, query = line.strip().split('\t')
          # Rewrite the query using key terms from the Whoosh index.
          query = update_query_with_key_terms(query, whoosh_searcher)
          # We return one more result because it is almost certain that we will 
          # retrieve the document that originated the query.
          hits = searcher.search(
              JString(query.encode('utf8')), args.hits + 1)

          if line_number % 10 == 0:
              # Periodic progress log with average seconds per query so far.
              time_per_query = (time.time() - start_time) / (line_number + 1)
              print('Retrieving query {} ({:0.3f} s/query)'.format(
                  line_number, time_per_query))