def optimize_index(self, ct):
    """Run a full optimisation pass on the Whoosh index for content type `ct`.

    Reports progress at 2/3 (the optimisation phase follows the two
    indexing thirds), frees memory, then commits an empty optimising
    commit, which merges all index segments into one.

    :param ct: content type descriptor providing get_whoosh_index()
    """
    # Signal that we've reached the optimisation phase (2/3 of the job).
    self.write_state_update(ct, 1.0 / 3 * 2)
    # Free memory before opening the index; Whoosh indexing is memory-hungry.
    dputils.gc_collect()
    index = ct.get_whoosh_index()
    # Parenthesised print: same output under Python 2, valid under Python 3.
    print("\toptimize index")
    writer = index.writer()
    # An empty commit with optimize=True rewrites the index into a single
    # optimised segment.
    writer.commit(optimize=True)
def optimize_index(self, ct): self.write_state_update(ct, 1.0 / 3 * 2) dputils.gc_collect() index = ct.get_whoosh_index() print '\toptimize index' writer = index.writer() writer.commit(optimize=True)
def populate_index(self, ct, index=None):
    """Build the Whoosh index for content type `ct` from all its records.

    Phase 1 pre-computes sort rankings (first third of the progress
    range); phase 2 fetches the record set; phase 3 adds one document
    per record, committing every `commit_size` documents — recreating
    the writer each time, since a commit unlocks the index — to keep
    memory bounded on small VMs. Progress is reported through
    self.write_state_update() in the [1/3, 2/3] range during indexing.

    :param ct: content type descriptor (records, index, options)
    :param index: unused on entry; kept for backward compatibility and
        rebound internally during the commit cycle
    """
    chrono("POPULATE_INDEX:")

    # Phase 1: pre-compute sort rankings; the callback maps ranking
    # progress onto the first third of the overall range (floor 0.001).
    print("\tgenerate sort rankings")
    chrono("RANK_VALUES:")
    ct.prepare_value_rankings(
        callback=lambda progress: self.write_state_update(
            ct, max(0.001, 1.0 / 3.0 * progress)))
    chrono(":RANK_VALUES")

    # Phase 2: retrieve the full record set to index.
    chrono("INDEXING QUERY:")
    print("\tretrieve all records")
    dputils.gc_collect()
    rcs = ct.get_all_records(True)
    record_count = rcs.count()
    writer = None
    chrono(":INDEXING QUERY")

    # Phase 3: add the records to the index.
    print("\tadd records to index")
    i = 0
    commit_size = 500      # commit after this many records to cap memory
    progress_size = 200    # update the progress state this often
    chrono("INDEXING:")
    chrono("First record:")
    record_condition = ct.get_option("condition", None)
    pbar = dputils.ProgressBar(record_count)

    # Indexing can use n x 100 MB, which can be excessive for small VMs.
    # One technique is to create small, independent index segments, then
    # optimise them outside this function on a separate index.
    # NOTE: whoosh.writing.BufferedWriter is deliberately NOT used — it
    # is buggy and will crash after a few hundred docs — hence the
    # manual commit/recreate cycle below.
    for record in rcs.iterator():
        if i == 0:
            chrono(":First record")
        pbar.update(i + 1)
        if (i % commit_size) == 0:
            # We have to commit every commit_size documents, otherwise
            # memory saturates on the VM.
            if writer:
                writer.commit(merge=False)
            # Recreate writer and index after commit: commit unlocks
            # the index.
            writer = None
            index = None
            dputils.gc_collect()
            index = ct.get_whoosh_index()
            writer = index.writer()
        i += 1
        # Skip records rejected by the optional per-type condition;
        # they still advance `i` and count toward the commit cadence.
        if record_condition and not record_condition(record):
            continue
        writer.add_document(**ct.get_document_from_record(record))
        if (i % progress_size) == 0:
            # Map indexing progress onto the second third of the range.
            self.write_state_update(ct, (1 + 1.0 * i / record_count) * 1.0 / 3)

    # Final commit for the last partial batch.
    if writer:
        writer.commit(merge=False)

    pbar.complete()
    chrono(":INDEXING")
    print("\n")
    chrono(":POPULATE_INDEX")
    print("\tdone (%s records)" % record_count)
def populate_index(self, ct, index=None): chrono('POPULATE_INDEX:') # Add documents to the index print '\tgenerate sort rankings' chrono('RANK_VALUES:') ct.prepare_value_rankings( callback=lambda progress: self.write_state_update( ct, max(0.001, 1.0 / 3.0 * progress))) chrono(':RANK_VALUES') chrono('INDEXING QUERY:') print '\tretrieve all records' dputils.gc_collect() from whoosh.writing import BufferedWriter rcs = ct.get_all_records(True) record_count = rcs.count() writer = None chrono(':INDEXING QUERY') print '\tadd records to index' i = 0 commit_size = 500 progress_size = 200 # settings.DEV_SERVER = True chrono('INDEXING:') chrono('First record:') record_condition = ct.get_option('condition', None) pbar = dputils.ProgressBar(record_count) # Indexing can use n x 100 MB # Which can be excessive for small VMs # One technique is to create small, independent index segments # Then optimise them outside this fct on a separate index for record in rcs.iterator(): if i == 0: chrono(':First record') pbar.update(i + 1) if (i % commit_size) == 0: # we have to commit every x document otherwise the memory saturates on the VM # BufferedWriter is buggy and will crash after a few 100x docs if writer: writer.commit(merge=False) # we have to recreate after commit because commit unlock index writer = None index = None dputils.gc_collect() index = ct.get_whoosh_index() writer = index.writer() i += 1 if record_condition and not record_condition(record): continue writer.add_document(**ct.get_document_from_record(record)) if (i % progress_size) == 0: self.write_state_update(ct, (1 + 1.0 * i / record_count) * 1.0 / 3) if writer: writer.commit(merge=False) #rcs = None # ct.clear_value_rankings() pbar.complete() chrono(':INDEXING') print '\n' chrono(':POPULATE_INDEX') print '\tdone (%s records)' % record_count