Example 1
 def __init__(self, *args, **kwargs):
     self.default = kwargs.pop("default", None)
     self.parser = None
     self.fields = kwargs.pop('fields', []) + ['id']
     self.real_time = kwargs.pop('real_time', True)
     if not os.path.lexists(STORAGE_DIR):
         os.makedirs(STORAGE_DIR)
     self.storage = filestore.FileStorage(STORAGE_DIR)
     try:
         self.index = FileIndex(self.storage)
     except (IndexError, EmptyIndexError):
         self.index = None
     super(WhooshManager, self).__init__(*args, **kwargs)
Example 2
class WhooshManager(models.Manager):
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop("default", None)
        self.parser = None
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        schema_dict = {}
        for field_name in self.fields:
            field = self.model._meta.get_field_by_name(field_name)[0]
            schema_dict[field.name] = field_mapping[field.__class__]
        self.schema = Schema(**schema_dict)
        if self.index is None:
            self.index = FileIndex.create(self.storage, self.schema)
        self.searcher = self.index.searcher()
        if self.real_time:
            post_save.connect(self.post_save_callback, sender=self.model)
            post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        dct = dict([(f, unicode(getattr(instance, f))) for f in self.fields])
        self.index = self.index.refresh()
        writer = self.index.writer()
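        # Note: update_document only replaces an existing document when some
        # schema field is declared unique=True (presumably 'id' here); with no
        # unique field it appends a new document, just like add_document.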
        if created:
            writer.add_document(**dct)
        else:
            writer.update_document(**dct)
        writer.commit()

    def post_delete_callback(self, sender, instance, **kwargs):
        pass

    def query(self, q):
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        results = self.searcher.search(self.parser.parse(q))
        return self.filter(id__in=[r['id'] for r in results])
Example 3
    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)
        self.index_path = index_path

        storage = FileStorage(self.index_path, readonly=True)
        self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations
Example 5
def incremental_index_msmarco(index: FileIndex,
                              commit_every_n: int = 1_000_000):
    indexed_docs = set()

    print('Collecting indexed document IDs...')
    with index.searcher() as searcher:
        for doc in searcher.all_stored_fields():
            indexed_docs.add(doc['doc_id'])

    remaining = 3_200_000 - len(indexed_docs)
    print(
        f'Found {len(indexed_docs)} documents, adding {remaining} missing documents...'
    )
    writer = create_writer(index)
    i = 0
    for doc in tqdm(iter_msmarco_docs(), total=remaining, unit='docs'):
        if doc['doc_id'] not in indexed_docs:
            writer.add_document(**doc)
            i += 1
            if i % commit_every_n == 0:
                writer.commit()
                writer = create_writer(index)

    writer.commit()

    print('Done!')
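The commit-every-N pattern above keeps memory bounded by cycling writers by hand. A minimal alternative sketch using whoosh's BufferedWriter, which commits automatically by document count or elapsed time; it assumes the same `index` and `iter_msmarco_docs` as in Example 5:

from whoosh.writing import BufferedWriter

def incremental_index_buffered(index: FileIndex):
    # BufferedWriter commits on its own every `period` seconds or every
    # `limit` buffered documents, whichever comes first.
    writer = BufferedWriter(index, period=120, limit=10_000)
    try:
        for doc in iter_msmarco_docs():
            writer.add_document(**doc)
    finally:
        writer.close()  # flush remaining documents and release the lock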
Example 6
def incremental_index(indexdir, indexname, rowData):
    """
    Note: each incremental addition to the index creates a new segment
    file, which takes up disk space, so keep an eye on this.
    :param rowData: a single row of data, as a dict
    :param indexdir: index directory
    :param indexname: index name
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # Build the document as a plain dict and pass it as keyword arguments,
    # instead of assembling a "writer.add_document(...)" source string and
    # exec()-ing it, which was fragile and unsafe.
    doc = {}
    for key, val in rowData.items():
        if not val:
            val = ""
        elif isinstance(val, Decimal):
            val = str(val)
        else:
            val = json.dumps(val)
        doc[key] = val
    writer.add_document(**doc)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")
    # e.g. writer.add_document(content="人在塔在", ID="hik")

    writer.commit()
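The docstring's warning about segment files piling up can be addressed by merging segments once in a while. A sketch under the same assumptions (`indexdir`, `indexname`):

def compact_index(indexdir, indexname):
    # optimize() merges all segments into one; it rewrites the whole index,
    # so run it offline or on a schedule rather than after every insert.
    ix = FileIndex(FileStorage(indexdir), indexname=indexname)
    ix.optimize()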
Example 7
    def reader(self, reuse=None):
        from whoosh.index import FileIndex

        self._check_state()
        return FileIndex._reader(self.storage,
                                 self.schema,
                                 self.segments,
                                 self.generation,
                                 reuse=reuse)
Example 8
def query(query_str: str,
          index: FileIndex) -> Iterable[Tuple[str, int, float]]:
    with index.searcher() as searcher:
        query = QueryParser('body', index.schema).parse(query_str)
        results = searcher.search(query)

        for result in results:
            score = result.score
            rank = result.rank
            yield result['doc_id'], rank, score
Example 9
    def init_indexes(self):
        """Create indexes for schemas."""
        state = self.app_state

        for name, schema in self.schemas.items():
            if current_app.testing:
                storage = TestingStorage()
            else:
                index_path = (Path(state.whoosh_base) / name).absolute()
                if not index_path.exists():
                    index_path.mkdir(parents=True)
                storage = FileStorage(text_type(index_path))

            if storage.index_exists(name):
                index = FileIndex(storage, schema, name)
            else:
                index = FileIndex.create(storage, schema, name)

            state.indexes[name] = index
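For plain directory-backed indexes, whoosh also exposes convenience functions that wrap the FileStorage/FileIndex open-or-create dance shown above. A short sketch, assuming a `schema` and a directory `path`:

import os
from whoosh import index

def open_or_create(path, schema, name="MAIN"):
    # exists_in/open_dir/create_in are thin wrappers over
    # FileStorage + FileIndex, as used in init_indexes above.
    if not os.path.exists(path):
        os.makedirs(path)
    if index.exists_in(path, indexname=name):
        return index.open_dir(path, indexname=name)
    return index.create_in(path, schema, indexname=name)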
Example 10
 def class_prepared_callback(self, sender, **kwargs):
     schema_dict = {}
     for field_name in self.fields:
         field = self.model._meta.get_field_by_name(field_name)[0]
         schema_dict[field.name] = field_mapping[field.__class__]
     self.schema = Schema(**schema_dict)
     if self.index is None:
         self.index = FileIndex.create(self.storage, self.schema)
     self.searcher = self.index.searcher()
     if self.real_time:
         post_save.connect(self.post_save_callback, sender=self.model)
         post_delete.connect(self.post_delete_callback, sender=self.model)
Example 13
 def __init__(self,
              file_index: index.FileIndex,
              index_docnums: list = None,
              name: str = 'DB',
              ix_reader: IndexReader = None,
              content_field='body'):
     self.name = name
     self.ix = file_index
     self._reader = ix_reader if ix_reader is not None else file_index.reader()
     self._private_reader = ix_reader is None
     self._searcher = Searcher(self._reader)
     if index_docnums is not None:
         self._docnums = set(index_docnums)
     else:
         self._docnums = self._get_all_db_ids()
     self._dfs, self._tfs, self._total_terms = self._build(content_field)
     self._tfidfs = defaultdict(float)
Example 14
class BM25CandidateSelector(CandidateSelector):
    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)
        self.index_path = index_path

        storage = FileStorage(self.index_path, readonly=True)
        self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations

    def fetch_candidates(self, doc_id, candidate_ids_pool):

        title_key_terms = ' '.join([
            t for t, _ in self.searcher.key_terms_from_text(
                'title', self.corpus[doc_id].title, numterms=3)
        ])
        abstract_key_terms = ' '.join([
            t for t, _ in self.searcher.key_terms_from_text(
                'abstract', self.corpus[doc_id].abstract)
        ])
        # Implement BM25 index builder and return
        query = self.query_parser.parse(title_key_terms + " " +
                                        abstract_key_terms)
        results = self.searcher.search(query,
                                       limit=self.top_k + 1,
                                       optimize=True,
                                       scored=True)

        candidate_ids_pool = set(candidate_ids_pool)
        candidate_ids = []
        candidate_scores = []
        for result in results:
            if result['id'] in candidate_ids_pool and result['id'] != doc_id:
                candidate_ids.append(result['id'])
                candidate_scores.append(result.score)

        return candidate_ids, candidate_scores
Example 16
def update_index(index: FileIndex, cards: List[Card]) -> None:
    writer = index.writer()
    # We exclude tokens here because they can have the exact same name as cards.
    # We exclude emblems here to stop them showing up as
    cards = [c for c in cards if multiverse.is_playable_layout(c.layout)]
    for card in cards:
        names = card.names
        if card.name not in names:
            names.append(card.name)  # Split and aftermath cards
        for name in names:
            document = {}
            document['id'] = card.id
            document['name'] = name
            document['canonical_name'] = card.name
            document['name_tokenized'] = name
            document['name_stemmed'] = name
            document['name_normalized'] = name
            writer.update_document(**document)
    writer.commit()
Example 17
def update_index(indexdir, indexname, rowData):
    """
    Note: each incremental addition to the index creates a new segment
    file, which takes up disk space, so keep an eye on this.
    :param indexdir: index directory
    :param indexname: index name
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # Build the document as a plain dict and pass it as keyword arguments,
    # instead of exec()-ing a generated "writer.add_document(...)" string.
    doc = {}
    for key, val in rowData.items():
        # Normalise every value (including Decimal) to a string.
        doc[key] = str(val) if val else ""
    writer.add_document(**doc)

    # The relevant field has to be declared unique when the index is created;
    # otherwise repeated data is appended rather than replaced.
    # Because "path" is marked as unique, calling update_document with path="/a"
    # will delete any existing documents where the "path" field contains "/a".
    # writer.update_document(path=u"/a", content="Replacement for the first document")
    writer.commit()
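As the comment above says, update_document only acts as an upsert when the schema declares a unique field. A minimal self-contained sketch, using a hypothetical "path" field as in the whoosh documentation's example:

import os
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage
from whoosh.index import FileIndex

os.makedirs("indexdir", exist_ok=True)
schema = Schema(path=ID(stored=True, unique=True), content=TEXT)
ix = FileIndex.create(FileStorage("indexdir"), schema, indexname="demo")

writer = ix.writer()
writer.add_document(path=u"/a", content=u"First version")
writer.commit()

writer = ix.writer()
# Matches the existing document on the unique "path" field, deletes it,
# and adds the replacement, all in one call.
writer.update_document(path=u"/a", content=u"Replacement for the first document")
writer.commit()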
Example 18
    if args.valid_docs:
      data_type = 'pd'
      valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

    searcher = JSearcher(JString(args.index))
    searcher.setBM25Similarity(args.k1, args.b)
    print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
    if args.rm3:
        searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
        print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format(args.fbTerms, args.fbDocs, args.originalQueryWeight))

    schema = Schema(title=TEXT,
                    abstract=TEXT,
                    id=ID(stored=True))
    storage = FileStorage(args.whoosh_index, readonly=True)
    bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
    whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

    with open(args.output, 'w') as fout:
      start_time = time.time()
      for line_number, line in enumerate(open(args.qid_queries)):
          query_id, query = line.strip().split('\t')
          query = update_query_with_key_terms(query, whoosh_searcher)
          # We return one more result because it is almost certain that we will 
          # retrieve the document that originated the query.
          hits = searcher.search(
              JString(query.encode('utf8')), args.hits + 1)

          if line_number % 10 == 0:
              time_per_query = (time.time() - start_time) / (line_number + 1)
              print('Retrieving query {} ({:0.3f} s/query)'.format(
                  line_number, time_per_query))
Example 19
 def _create_index(self):
     storage = EncryptedFileStorage(self.index_folder, self.key)
     return FileIndex.create(storage, self._mail_schema(), indexname="mails")
Example 23
def create_writer(index: FileIndex) -> IndexWriter:
    w = index.writer(limitmb=16000, procs=1)

    return w
Example 24
    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
        if self.readonly:
            raise ReadOnlyError

        TOC.create(self, schema, indexname)
        return FileIndex(self, schema, indexname)
Example 25
 def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
     return FileIndex(self, schema=schema, indexname=indexname)
Example 26
 def _create_index(self):
     masterkey = self.soledad_querier.get_index_masterkey()
     storage = EncryptedFileStorage(self.INDEX_FOLDER, masterkey)
     return FileIndex.create(storage,
                             self._mail_schema(),
                             indexname='mails')