import json
from decimal import Decimal

from whoosh.filedb.filestore import FileStorage
from whoosh.index import FileIndex
from whoosh.writing import AsyncWriter


def incremental_index(indexdir, indexname, rowData):
    """
    Note: every incremental write produces a new segment (seg) file, which
    takes up disk space, so keep an eye on this.
    :param rowData: one row of data, as a field-name -> value mapping
    :param indexdir: index directory
    :param indexname: index name
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)
    writer = AsyncWriter(ix)
    # Build the document as a dict and pass it as keyword arguments. The
    # original assembled a `writer.add_document(...)` source string and
    # exec'd it, which breaks on values containing quotes (hence the
    # pymysql.escape_string call it needed); the kwargs form needs no escaping.
    doc = {}
    for key, val in rowData.items():
        if not val:
            val = ""
        elif isinstance(val, Decimal):
            val = str(val)
        else:
            val = json.dumps(val)
        doc[key] = val
    writer.add_document(**doc)
    writer.commit()

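# A minimal usage sketch for incremental_index. The directory, index name, and
# schema below are illustrative assumptions; the index must already exist with
# fields matching the row's keys (the sample row reuses the field names from
# the original's commented-out add_document calls).
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

if not os.path.exists("indexdir"):
    os.makedirs("indexdir")
create_in("indexdir",
          Schema(ID=ID(stored=True, unique=True), content=TEXT(stored=True)),
          indexname="demo")
incremental_index("indexdir", "demo", {"ID": "abc", "content": "人在塔在"})
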
def __init__(self, *args, **kwargs):
    self.default = kwargs.pop("default", None)
    self.parser = None
    self.fields = kwargs.pop('fields', []) + ['id']
    self.real_time = kwargs.pop('real_time', True)
    if not os.path.lexists(STORAGE_DIR):
        os.makedirs(STORAGE_DIR)
    self.storage = filestore.FileStorage(STORAGE_DIR)
    try:
        self.index = FileIndex(self.storage)
    except (IndexError, EmptyIndexError):
        self.index = None
    super(WhooshManager, self).__init__(*args, **kwargs)

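# A hedged sketch (not from the original class) of recovering from the
# EmptyIndexError branch above by creating the index on first use; SCHEMA is
# an assumed module-level whoosh Schema covering self.fields.
def ensure_index(self):
    if self.index is None:
        self.index = FileIndex.create(self.storage, SCHEMA)
    return self.index
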
def __init__(self, corpus: Corpus, index_path: str, top_k,
             extend_candidate_citations: bool):
    super().__init__(top_k)
    self.index_path = index_path
    storage = FileStorage(self.index_path, readonly=True)
    # copy_to_ram loads the on-disk index into a RamStorage so searches avoid
    # disk seeks; the read-only on-disk copy stays untouched.
    self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
    self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
    self.query_parser = MultifieldParser(
        [FieldNames.TITLE, FieldNames.ABSTRACT],
        self._bm25_index.schema,
        group=qparser.OrGroup)
    self.corpus = corpus
    self.extend_candidate_citations = extend_candidate_citations

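# Usage sketch (assumed, not part of the original class): parse free text over
# the title/abstract fields and pull the top hits from the RAM-backed BM25
# searcher; self.top_k is assumed to be stored by super().__init__(top_k).
def fetch_candidates(self, query_text):
    query = self.query_parser.parse(query_text)
    # Each whoosh Hit exposes the stored fields of its matching document.
    return self.searcher.search(query, limit=self.top_k)
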
def init_indexes(self):
    """Create indexes for schemas."""
    state = self.app_state
    for name, schema in self.schemas.items():
        if current_app.testing:
            storage = TestingStorage()
        else:
            index_path = (Path(state.whoosh_base) / name).absolute()
            if not index_path.exists():
                index_path.mkdir(parents=True)
            storage = FileStorage(text_type(index_path))
        if storage.index_exists(name):
            index = FileIndex(storage, schema, name)
        else:
            index = FileIndex.create(storage, schema, name)
        state.indexes[name] = index

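# Sketch of consuming a registered index later (assumed calling code: the
# function, field name, and query text are illustrative).
from whoosh.qparser import QueryParser

def search_index(state, name, field, text):
    ix = state.indexes[name]
    with ix.searcher() as searcher:
        results = searcher.search(QueryParser(field, ix.schema).parse(text))
        # Copy the stored fields out before the searcher closes.
        return [hit.fields() for hit in results]
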
def update_index(indexdir, indexname, rowData):
    """
    Note: every incremental write produces a new segment (seg) file, which
    takes up disk space, so keep an eye on this.
    :param indexdir: index directory
    :param indexname: index name
    :param rowData: one row of data, as a field-name -> value mapping
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)
    writer = AsyncWriter(ix)
    # Same dict-based construction as incremental_index above (and the same
    # imports); the exec'd source string in the original breaks on values
    # containing quotes. Every non-empty value is stored as a plain string.
    doc = {key: str(val) if val else "" for key, val in rowData.items()}
    writer.add_document(**doc)
    # add_document always appends; to replace instead, the key field must be
    # declared unique when the index is created. Because "path" is marked as
    # unique, calling update_document with path="/a" will delete any existing
    # documents where the "path" field contains "/a":
    # writer.update_document(path=u"/a", content="Replacement for the first document")
    writer.commit()

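# Grounded variant of the update_document comment above: with "path" declared
# unique, a second write for the same path replaces the first document instead
# of appending (schema, directory, and contents are illustrative assumptions).
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

dedup_schema = Schema(path=ID(unique=True, stored=True), content=TEXT(stored=True))
ix = create_in("dedup_dir", dedup_schema)  # "dedup_dir" must already exist

writer = ix.writer()
writer.add_document(path=u"/a", content=u"First document")
writer.commit()

writer = ix.writer()
writer.update_document(path=u"/a", content=u"Replacement for the first document")
writer.commit()  # the index now holds exactly one document for path "/a"
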
if args.valid_docs:
    data_type = 'pd'
    valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

searcher = JSearcher(JString(args.index))
searcher.setBM25Similarity(args.k1, args.b)
print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
if args.rm3:
    searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
    print('Initializing RM3, setting fbTerms={}, fbDocs={} and '
          'originalQueryWeight={}'.format(
              args.fbTerms, args.fbDocs, args.originalQueryWeight))

schema = Schema(title=TEXT, abstract=TEXT, id=ID(stored=True))
storage = FileStorage(args.whoosh_index, readonly=True)
bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

with open(args.output, 'w') as fout:
    start_time = time.time()
    for line_number, line in enumerate(open(args.qid_queries)):
        query_id, query = line.strip().split('\t')
        query = update_query_with_key_terms(query, whoosh_searcher)
        # We return one more result because it is almost certain that we will
        # retrieve the document that originated the query.
        hits = searcher.search(JString(query.encode('utf8')), args.hits + 1)
        if line_number % 10 == 0:
            time_per_query = (time.time() - start_time) / (line_number + 1)
            print('Retrieving query {} ({:0.3f} s/query)'.format(
                line_number, time_per_query))

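# update_query_with_key_terms is referenced above but not defined in this
# snippet. Below is a purely hypothetical sketch using whoosh's
# Searcher.key_terms_from_text to append a few discriminative terms from the
# "abstract" field to the raw query; the real helper may differ.
def update_query_with_key_terms(query, whoosh_searcher, numterms=3):
    key_terms = whoosh_searcher.key_terms_from_text('abstract', query,
                                                    numterms=numterms)
    extra = ' '.join(term for term, _score in key_terms)
    return '{} {}'.format(query, extra) if extra else query
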
def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
    return FileIndex(self, schema=schema, indexname=indexname)

def create_index(self, schema, indexname=_DEF_INDEX_NAME):
    if self.readonly:
        raise ReadOnlyError
    TOC.create(self, schema, indexname)
    return FileIndex(self, schema, indexname)

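# Typical call sequence for the two Storage methods above; the directory name
# and schema are illustrative. create_index writes a fresh TOC, and open_index
# reopens the default index (_DEF_INDEX_NAME) by reading it back.
import os
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import FileStorage

if not os.path.exists("indexdir"):
    os.makedirs("indexdir")
storage = FileStorage("indexdir")
ix = storage.create_index(Schema(content=TEXT))
same_ix = storage.open_index()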