Example no. 1
0
 def optimize_index(self, ct):
     """Merge all segments of *ct*'s Whoosh index into one (optimize pass).

     ct -- a content-type object exposing get_whoosh_index().
     Reports overall progress as 2/3 before the optimization starts;
     the merge itself is the final third of the indexing job.
     """
     self.write_state_update(ct, 1.0 / 3 * 2)
     # Free memory up front; index optimization can be RAM-hungry.
     dputils.gc_collect()
     index = ct.get_whoosh_index()
     # print() call form is valid in both Python 2 and 3
     # (was a Python 2-only print statement).
     print("\toptimize index")
     writer = index.writer()
     # optimize=True merges every index segment into a single segment.
     writer.commit(optimize=True)
Example no. 2
0
 def optimize_index(self, ct):
     """Merge all segments of *ct*'s Whoosh index into one (optimize pass).

     ct -- a content-type object exposing get_whoosh_index().
     Reports overall progress as 2/3 before the optimization starts;
     the merge itself is the final third of the indexing job.
     """
     self.write_state_update(ct, 1.0 / 3 * 2)
     # Free memory up front; index optimization can be RAM-hungry.
     dputils.gc_collect()
     index = ct.get_whoosh_index()
     # print() call form is valid in both Python 2 and 3
     # (was a Python 2-only print statement).
     print('\toptimize index')
     writer = index.writer()
     # optimize=True merges every index segment into a single segment.
     writer.commit(optimize=True)
Example no. 3
0
    def populate_index(self, ct, index=None):
        """(Re)build the Whoosh index for content type *ct*.

        ct    -- content-type object providing the records, the rankings
                 and the Whoosh index.
        index -- optional pre-opened index; note it is re-opened inside the
                 commit loop anyway, so callers may omit it.

        Progress (0..2/3 of the overall job) is pushed through
        self.write_state_update(); the remaining third is the separate
        optimize pass.  All Python 2 print statements were converted to
        the print() call form, which behaves identically on Python 2
        for a single-argument call.
        """
        chrono("POPULATE_INDEX:")

        # Phase 1: precompute the sort rankings used by the index documents.
        print("\tgenerate sort rankings")

        chrono("RANK_VALUES:")
        ct.prepare_value_rankings(
            callback=lambda progress: self.write_state_update(ct, max(0.001, 1.0 / 3.0 * progress))
        )
        chrono(":RANK_VALUES")

        # Phase 2: fetch the records to index.
        chrono("INDEXING QUERY:")
        print("\tretrieve all records")
        dputils.gc_collect()

        rcs = ct.get_all_records(True)
        record_count = rcs.count()

        # Created lazily inside the loop so each commit cycle gets a
        # fresh writer on a freshly opened index.
        writer = None

        chrono(":INDEXING QUERY")

        print("\tadd records to index")

        i = 0
        # Commit every commit_size docs to cap memory use on small VMs;
        # report progress every progress_size docs.
        commit_size = 500
        progress_size = 200

        chrono("INDEXING:")
        chrono("First record:")

        # Optional per-record filter supplied via the content type options.
        record_condition = ct.get_option("condition", None)

        pbar = dputils.ProgressBar(record_count)

        # Indexing can use n x 100 MB, which can be excessive for small VMs.
        # Technique: create small, independent index segments here
        # (merge=False), then optimise them outside this function.
        # NOTE: whoosh.writing.BufferedWriter is deliberately NOT used —
        # it crashes after a few hundred docs.
        for record in rcs.iterator():
            if i == 0:
                chrono(":First record")
            pbar.update(i + 1)

            if (i % commit_size) == 0:
                # Commit every commit_size documents, otherwise memory
                # saturates on the VM.
                if writer:
                    writer.commit(merge=False)

                # Recreate after commit: committing unlocks the index,
                # so drop all references and reopen.
                writer = None
                index = None
                dputils.gc_collect()

                index = ct.get_whoosh_index()
                writer = index.writer()

            i += 1

            if record_condition and not record_condition(record):
                continue

            writer.add_document(**ct.get_document_from_record(record))

            if (i % progress_size) == 0:
                # Map indexing progress onto the [1/3, 2/3] band of the
                # overall job (ranking was the first third).
                self.write_state_update(ct, (1 + 1.0 * i / record_count) * 1.0 / 3)

        # Final partial commit for the tail of the record set.
        if writer:
            writer.commit(merge=False)

        pbar.complete()
        chrono(":INDEXING")

        print("\n")

        chrono(":POPULATE_INDEX")

        print("\tdone (%s records)" % record_count)
Example no. 4
0
    def populate_index(self, ct, index=None):
        """(Re)build the Whoosh index for content type *ct*.

        ct    -- content-type object providing the records, the rankings
                 and the Whoosh index.
        index -- optional pre-opened index; note it is re-opened inside the
                 commit loop anyway, so callers may omit it.

        Progress (0..2/3 of the overall job) is pushed through
        self.write_state_update(); the remaining third is the separate
        optimize pass.  All Python 2 print statements were converted to
        the print() call form, which behaves identically on Python 2
        for a single-argument call.
        """
        chrono('POPULATE_INDEX:')

        # Phase 1: precompute the sort rankings used by the index documents.
        print('\tgenerate sort rankings')

        chrono('RANK_VALUES:')
        ct.prepare_value_rankings(
            callback=lambda progress: self.write_state_update(
                ct, max(0.001, 1.0 / 3.0 * progress)))
        chrono(':RANK_VALUES')

        # Phase 2: fetch the records to index.
        chrono('INDEXING QUERY:')
        print('\tretrieve all records')
        dputils.gc_collect()

        rcs = ct.get_all_records(True)
        record_count = rcs.count()

        # Created lazily inside the loop so each commit cycle gets a
        # fresh writer on a freshly opened index.
        writer = None

        chrono(':INDEXING QUERY')

        print('\tadd records to index')

        i = 0
        # Commit every commit_size docs to cap memory use on small VMs;
        # report progress every progress_size docs.
        commit_size = 500
        progress_size = 200

        chrono('INDEXING:')
        chrono('First record:')

        # Optional per-record filter supplied via the content type options.
        record_condition = ct.get_option('condition', None)

        pbar = dputils.ProgressBar(record_count)

        # Indexing can use n x 100 MB, which can be excessive for small VMs.
        # Technique: create small, independent index segments here
        # (merge=False), then optimise them outside this function.
        # NOTE: whoosh.writing.BufferedWriter is deliberately NOT used —
        # it crashes after a few hundred docs.
        for record in rcs.iterator():
            if i == 0:
                chrono(':First record')
            pbar.update(i + 1)

            if (i % commit_size) == 0:
                # Commit every commit_size documents, otherwise memory
                # saturates on the VM.
                if writer:
                    writer.commit(merge=False)

                # Recreate after commit: committing unlocks the index,
                # so drop all references and reopen.
                writer = None
                index = None
                dputils.gc_collect()

                index = ct.get_whoosh_index()
                writer = index.writer()

            i += 1

            if record_condition and not record_condition(record):
                continue

            writer.add_document(**ct.get_document_from_record(record))

            if (i % progress_size) == 0:
                # Map indexing progress onto the [1/3, 2/3] band of the
                # overall job (ranking was the first third).
                self.write_state_update(ct,
                                        (1 + 1.0 * i / record_count) * 1.0 / 3)

        # Final partial commit for the tail of the record set.
        if writer:
            writer.commit(merge=False)

        pbar.complete()
        chrono(':INDEXING')

        print('\n')

        chrono(':POPULATE_INDEX')

        print('\tdone (%s records)' % record_count)