def __init__(self, index_store_path):
    """Open (or create) a Lucene index at *index_store_path* for writing.

    Leaves ``self.writer`` ready for add/update operations: an IndexWriter
    over an NIOFSDirectory, using the StandardAnalyzer, opened in
    CREATE_OR_APPEND mode so an existing index is extended rather than
    replaced.
    """
    directory = NIOFSDirectory(Paths.get(index_store_path))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(directory, writer_config)
def __init__(self, corpusPath, storeDir):
    """Build a fresh Lucene index over the corpus at *corpusPath*.

    The index is written to *storeDir* (created if missing) in CREATE mode,
    i.e. any previous index there is overwritten. A background Ticker thread
    prints progress while the (potentially long) commit runs.
    """
    # makedirs(..., exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing parent directories (plain os.mkdir would raise).
    os.makedirs(storeDir, exist_ok=True)
    store = NIOFSDirectory(Paths.get(storeDir))
    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # rebuild from scratch
    writer = IndexWriter(store, config)
    self.indexDocs(corpusPath, writer)
    ticker = Ticker()
    print('commit index')
    # Ticker presumably prints dots while commit() blocks — TODO confirm.
    threading.Thread(target=ticker.run).start()
    try:
        writer.commit()
    finally:
        # Always close the writer and stop the ticker thread, even when
        # commit() raises — otherwise the thread would spin forever and the
        # index lock would be leaked.
        writer.close()
        ticker.tick = False
    print('done')
help='qa data for evaluation', default='/home/xwhan/data/nq/nq-dev.txt') parser.add_argument('--topk', type=int, default=500) args = parser.parse_args() qas = [json.loads(line) for line in open(args.qa_data).readlines()][:1000] questions = [ _["question"][:-1] if _["question"].endswith("?") else _["question"] for _ in qas ] answers = [item["answer"] for item in qas] print("Loading Lucene Index ...") lucene.initVM(vmargs=['-Djava.aws.headless=true']) analyzer = StandardAnalyzer() searchDir = NIOFSDirectory(Paths.get(args.index_path)) searcher = IndexSearcher(DirectoryReader.open(searchDir)) # try tuning the hyperparameters of bm25 for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]: for b in [0.5, 0.6, 0.7, 0.8, 0.9]: print(f"Grid search.... k1: {k1}; b: {b}") searcher.setSimilarity(BM25Similarity(k1, b)) parser = QueryParser('Context', analyzer) retrieved = [] print("Searching ...") for q in tqdm(questions):