Example #1
import json
from decimal import Decimal

from whoosh.filedb.filestore import FileStorage
from whoosh.index import FileIndex
from whoosh.writing import AsyncWriter


def incremental_index(indexdir, indexname, rowData):
    """
    Note: every incremental add creates a new segment file on disk, so the
    index grows with each call until the segments are merged.
    :param rowData: one row of data, as a dict of field name -> value
    :param indexdir: index directory
    :param indexname: index name
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # Pass the fields as keyword arguments instead of building a source
    # string and exec()-ing it, which breaks on quotes in the values and
    # needed pymysql.escape_string() to paper over.
    fields = {}
    for key, val in rowData.items():
        if not val:
            val = ""
        elif isinstance(val, Decimal):
            val = str(val)
        else:
            val = json.dumps(val)
        fields[key] = val
    writer.add_document(**fields)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")

    writer.commit()
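This function assumes the named index already exists under indexdir. A minimal sketch of creating it first, with a hypothetical two-field schema matching the add_document call above:

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage

# Hypothetical schema; the field names must match the keys in rowData.
schema = Schema(ID=ID(stored=True, unique=True), content=TEXT(stored=True))

if not os.path.exists("indexdir"):
    os.makedirs("indexdir")
storage = FileStorage("indexdir")
ix = storage.create_index(schema, indexname="myindex")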
Example #2
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop("default", None)
        self.parser = None
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            # Open the existing index; fall back to None when the storage
            # directory is empty or holds no index yet.
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)
Example #3
    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)
        self.index_path = index_path

        storage = FileStorage(self.index_path, readonly=True)
        self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations
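A query then flows through the parser and searcher built above; a minimal sketch, assuming the base class stored top_k on self:

# Parse against TITLE and ABSTRACT (OrGroup: any term may match),
# then search the RAM-backed BM25 index.
query = self.query_parser.parse("candidate citation text")
results = self.searcher.search(query, limit=self.top_k)
for hit in results:
    print(hit.score, hit.fields())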
Example #4
    def init_indexes(self):
        """Create indexes for schemas."""
        state = self.app_state

        for name, schema in self.schemas.items():
            if current_app.testing:
                storage = TestingStorage()
            else:
                index_path = (Path(state.whoosh_base) / name).absolute()
                if not index_path.exists():
                    index_path.mkdir(parents=True)
                storage = FileStorage(text_type(index_path))

            if storage.index_exists(name):
                index = FileIndex(storage, schema, name)
            else:
                index = FileIndex.create(storage, schema, name)

            state.indexes[name] = index
Example #5
def update_index(indexdir, indexname, rowData):
    """
    Note: every incremental add creates a new segment file on disk, so the
    index grows with each call until the segments are merged.
    :param indexdir: index directory
    :param indexname: index name
    :param rowData: one row of data, as a dict of field name -> value
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # As in Example #1, pass the fields as keyword arguments rather than
    # exec()-ing a generated source string.
    fields = {}
    for key, val in rowData.items():
        # Normalize every value (Decimal included) to a string.
        fields[key] = str(val) if val else ""
    writer.add_document(**fields)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")

    # Replacement instead of appending only works if the field was declared
    # unique when the index was created; otherwise duplicate rows pile up.
    # Because "path" is marked as unique, calling update_document with path="/a"
    # will delete any existing documents where the "path" field contains "/a".
    # writer.update_document(path=u"/a", content="Replacement for the first document")
    writer.commit()
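Following the comment above, a small usage sketch of the replace-on-unique behavior, assuming the schema declared path as a unique field:

writer = AsyncWriter(ix)
# Deletes any existing document whose unique "path" field is "/a",
# then adds the new document in its place.
writer.update_document(path=u"/a", content=u"Replacement for the first document")
writer.commit()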
Example #6
    if args.valid_docs:
        data_type = 'pd'
        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

    searcher = JSearcher(JString(args.index))
    searcher.setBM25Similarity(args.k1, args.b)
    print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
    if args.rm3:
        searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
        print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format(
            args.fbTerms, args.fbDocs, args.originalQueryWeight))

    schema = Schema(title=TEXT,
                    abstract=TEXT,
                    id=ID(stored=True))
    storage = FileStorage(args.whoosh_index, readonly=True)
    bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
    whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

    with open(args.output, 'w') as fout:
        start_time = time.time()
        for line_number, line in enumerate(open(args.qid_queries)):
            query_id, query = line.strip().split('\t')
            query = update_query_with_key_terms(query, whoosh_searcher)
            # We return one more result because it is almost certain that we
            # will retrieve the document that originated the query.
            hits = searcher.search(
                JString(query.encode('utf8')), args.hits + 1)

            if line_number % 10 == 0:
                time_per_query = (time.time() - start_time) / (line_number + 1)
                print('Retrieving query {} ({:0.3f} s/query)'.format(
                    line_number, time_per_query))
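The example is truncated at this point; the loop presumably ends by writing each retrieved docid to fout. A hedged sketch of that tail, assuming an MS MARCO-style run format (qid, docid, rank per line) and the old pyserini hit API where hits[i].docid holds the document id:

            # Hypothetical continuation: write hits, skipping documents
            # filtered out by valid_docs (hence the args.hits + 1 above).
            rank = 1
            for i in range(len(hits)):
                docid = hits[i].docid
                if args.valid_docs and docid not in valid_docs:
                    continue
                fout.write('{}\t{}\t{}\n'.format(query_id, docid, rank))
                rank += 1
                if rank > args.hits:
                    break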
Example #7
    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
        return FileIndex(self, schema=schema, indexname=indexname)
Example #8
    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
        if self.readonly:
            raise ReadOnlyError

        TOC.create(self, schema, indexname)
        return FileIndex(self, schema, indexname)
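Examples #7 and #8 are the open/create pair on whoosh's FileStorage itself; caller-side they combine like this (the index directory and schema here are hypothetical):

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage

schema = Schema(path=ID(stored=True, unique=True), content=TEXT)
storage = FileStorage("indexdir")

# Open the index if it is already on disk, otherwise create it.
if storage.index_exists():
    ix = storage.open_index(schema=schema)
else:
    ix = storage.create_index(schema)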