Example 1
 def build_jsoup(self, path, replace=False, optimize=True):
     """Build an Anserini/Lucene index over the TREC collection at *path*.

     If the index was already built, either wipe it first (``replace=True``)
     or append to it. Blocks until indexing finishes, then records the
     built state in settings.

     Args:
         path: directory containing the TrecCollection input files.
         replace: when True, delete any existing index before building.
         optimize: forwarded to the indexer's optimize flag.
     """
     with logger.duration(f'building {self._path}'):
         if self._settings['built']:
             if replace:
                 logger.warn(f'removing index: {self._path}')
                 shutil.rmtree(self._path)
             else:
                 logger.warn(f'adding to existing index: {self._path}')
         args = J.A_IndexArgs()
         # Declarative table of indexer settings applied below.
         for attr, value in [
             ('collectionClass', 'TrecCollection'),
             ('generatorClass', 'JsoupGenerator'),
             ('threads', onir.util.safe_thread_count()),
             ('input', path),
             ('index', self._path),
             ('storePositions', True),
             ('storeDocvectors', True),
             ('storeRawDocs', True),
             ('storeTransformedDocs', True),
             ('keepStopwords', self._settings['keep_stops']),
             ('stemmer', self._settings['stemmer']),
             ('optimize', optimize),
         ]:
             setattr(args, attr, value)
         # Run the (Java) indexer on its own thread and wait for completion.
         worker = threading.Thread(target=J.A_IndexCollection(args).run)
         worker.start()
         worker.join()
         self._settings['built'] = True
         self._dump_settings()
Example 2
 def build(self,
           doc_iter,
           replace=False,
           optimize=True,
           store_term_weights=False):
     """Build an Anserini/Lucene index by streaming *doc_iter* through named pipes.

     Creates one FIFO per indexer thread in a temp directory, starts the Java
     indexer pointed at that directory, then fans documents out across the
     FIFOs as JSON lines. Blocks until indexing finishes, then records the
     built state in settings.

     Args:
         doc_iter: iterable of documents; each must expose ``.did`` and a
             ``.data`` mapping that is merged into the JSON record.
         replace: when True, delete any existing index before building.
         optimize: forwarded to the indexer's optimize flag.
         store_term_weights: forwarded to the indexer's storeTermWeights flag.
     """
     with logger.duration(f'building {self._base_path}'):
         thread_count = onir.util.safe_thread_count()
         with tempfile.TemporaryDirectory() as d:
             if self._settings['built']:
                 if replace:
                     logger.warn(f'removing index: {self._base_path}')
                     shutil.rmtree(self._base_path)
                 else:
                     logger.warn(
                         f'adding to existing index: {self._base_path}')
             # One named pipe per indexer thread; the list initially holds
             # path strings, which are lazily swapped for open file objects
             # the first time a document is routed to that pipe.
             fifos = []
             for t in range(thread_count):
                 fifo = os.path.join(d, f'{t}.json')
                 os.mkfifo(fifo)
                 fifos.append(fifo)
             index_args = J.A_IndexArgs()
             index_args.collectionClass = 'JsonCollection'
             index_args.generatorClass = 'LuceneDocumentGenerator'
             index_args.threads = thread_count
             index_args.input = d
             index_args.index = self._base_path
             index_args.storePositions = True
             index_args.storeDocvectors = True
             index_args.storeTermWeights = store_term_weights
             index_args.keepStopwords = self._settings['keep_stops']
             index_args.stemmer = self._settings['stemmer']
             index_args.optimize = optimize
             indexer = J.A_IndexCollection(index_args)
             # Run the (Java) indexer on a separate thread; it reads the
             # FIFOs while this thread writes them.
             thread = threading.Thread(target=indexer.run)
             thread.start()
             time.sleep(
                 1
             )  # give it some time to start up, otherwise fails due to race condition
             # Round-robin documents across the pipes. NOTE(review):
             # hash(i) on an int equals i, so this is effectively
             # i % thread_count. The lazy open matters: opening a FIFO
             # for writing blocks until a reader attaches, so pipes are
             # only opened once the indexer is presumably running.
             for i, doc in enumerate(doc_iter):
                 f = fifos[hash(i) % thread_count]
                 if isinstance(f, str):
                     f = open(f, 'wt')
                     fifos[hash(i) % thread_count] = f
                 # 'contents' is a placeholder; doc.data may override it.
                 data = {'id': doc.did, 'contents': 'a'}
                 data.update(doc.data)
                 json.dump(data, f)
                 f.write('\n')
             # Signal EOF on every pipe: close opened ones; pipes never
             # written to are opened and immediately closed so the reader
             # still sees end-of-stream.
             for f in fifos:
                 if not isinstance(f, str):
                     f.close()
                 else:
                     with open(f, 'wt'):
                         pass  # open and close to indicate file is done
             logger.debug('waiting to join')
             thread.join()
             self._settings['built'] = True
             self._dump_settings()