def build_jsoup(self, path, replace=False, optimize=True):
    """Build an Anserini index from a TREC-formatted collection at *path*.

    Runs Anserini's ``IndexCollection`` (via the ``J`` Java bridge) with the
    ``TrecCollection`` / ``JsoupGenerator`` pair, storing positions,
    docvectors, raw docs, and transformed docs.

    Args:
        path: Directory containing the TREC-formatted input collection.
        replace: If True and an index already exists, delete it first;
            otherwise documents are added to the existing index.
        optimize: Whether to force-merge (optimize) the index after building.

    Side effects: may remove ``self._path``; marks ``self._settings['built']``
    and persists settings on completion.
    """
    with logger.duration(f'building {self._path}'):
        if self._settings['built']:
            # An index is already present: either wipe it or append to it.
            if replace:
                logger.warn(f'removing index: {self._path}')
                shutil.rmtree(self._path)
            else:
                logger.warn(f'adding to existing index: {self._path}')

        n_threads = onir.util.safe_thread_count()

        args = J.A_IndexArgs()
        args.collectionClass = 'TrecCollection'
        args.generatorClass = 'JsoupGenerator'
        args.threads = n_threads
        args.input = path
        args.index = self._path
        args.storePositions = True
        args.storeDocvectors = True
        args.storeRawDocs = True
        args.storeTransformedDocs = True
        args.keepStopwords = self._settings['keep_stops']
        args.stemmer = self._settings['stemmer']
        args.optimize = optimize

        # Run the Java indexer on its own thread and wait for it to finish.
        worker = threading.Thread(target=J.A_IndexCollection(args).run)
        worker.start()
        worker.join()

        self._settings['built'] = True
        self._dump_settings()
def build(self, doc_iter, replace=False, optimize=True, store_term_weights=False):
    """Build an Anserini index by streaming documents through named FIFOs.

    Creates one FIFO per worker thread in a temp directory, starts Anserini's
    ``IndexCollection`` (``JsonCollection`` / ``LuceneDocumentGenerator``)
    pointed at that directory, then round-robins JSON documents from
    *doc_iter* into the FIFOs.

    Args:
        doc_iter: Iterable of documents exposing ``.did`` (doc id) and
            ``.data`` (dict of fields merged into the JSON record).
        replace: If True and an index already exists, delete it first;
            otherwise documents are added to the existing index.
        optimize: Whether to force-merge (optimize) the index after building.
        store_term_weights: Whether Anserini should store term weights.

    Side effects: may remove ``self._base_path``; marks
    ``self._settings['built']`` and persists settings on success.
    """
    with logger.duration(f'building {self._base_path}'):
        thread_count = onir.util.safe_thread_count()
        with tempfile.TemporaryDirectory() as d:
            if self._settings['built']:
                if replace:
                    logger.warn(f'removing index: {self._base_path}')
                    shutil.rmtree(self._base_path)
                else:
                    logger.warn(f'adding to existing index: {self._base_path}')

            # One FIFO per indexer thread; entries start as path strings and
            # are replaced by open file objects lazily on first write.
            fifos = []
            for t in range(thread_count):
                fifo = os.path.join(d, f'{t}.json')
                os.mkfifo(fifo)
                fifos.append(fifo)

            index_args = J.A_IndexArgs()
            index_args.collectionClass = 'JsonCollection'
            index_args.generatorClass = 'LuceneDocumentGenerator'
            index_args.threads = thread_count
            index_args.input = d
            index_args.index = self._base_path
            index_args.storePositions = True
            index_args.storeDocvectors = True
            index_args.storeTermWeights = store_term_weights
            index_args.keepStopwords = self._settings['keep_stops']
            index_args.stemmer = self._settings['stemmer']
            index_args.optimize = optimize

            indexer = J.A_IndexCollection(index_args)
            thread = threading.Thread(target=indexer.run)
            thread.start()
            time.sleep(
                1
            )  # give it some time to start up, otherwise fails due to race condition

            try:
                for i, doc in enumerate(doc_iter):
                    # Round-robin assignment; hash(i) was redundant since
                    # hash of a non-negative int is the int itself.
                    slot = i % thread_count
                    f = fifos[slot]
                    if isinstance(f, str):
                        # First write to this FIFO: open it (blocks until the
                        # indexer opens the read end) and cache the handle.
                        f = open(f, 'wt')
                        fifos[slot] = f
                    data = {'id': doc.did, 'contents': 'a'}
                    data.update(doc.data)
                    json.dump(data, f)
                    f.write('\n')
            finally:
                # Always close every FIFO (EOF tells the indexer a shard is
                # done) and join the indexer thread — otherwise an exception
                # in doc_iter would leak handles and hang the process.
                for f in fifos:
                    if not isinstance(f, str):
                        f.close()
                    else:
                        with open(f, 'wt'):
                            pass  # open and close to indicate file is done
                logger.debug('waiting to join')
                thread.join()

            # Only reached on success; failures propagate without marking
            # the index as built.
            self._settings['built'] = True
            self._dump_settings()