Esempio n. 1
0
    def reindex(self, batch=1000, skip=0):
        """Find all contentish objects and (re)index them in Solr.

        Objects are collected with ``findObjects(self.context)``.  Those
        that pass ``indexable()`` and have no private ``indexObject``
        method are converted via ``SolrIndexProcessor.getData`` and queued;
        the queue is sent to Solr by ``checkPoint`` (driven by
        ``checkpointIterator`` with ``batch``) and once more at the end,
        followed by a final commit.

        :param batch: checkpoint interval handed to ``checkpointIterator``.
        :param skip: number of leading indexable objects to pass over
            without indexing them.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        # pending documents keyed by their unique key, so a later entry with
        # the same key replaces an earlier one within a batch
        updates = {}

        def flush_connection():
            return conn.flush()

        flush = notimeout(flush_connection)

        def checkPoint():
            # send everything queued so far, then flush and free ZODB cache;
            # the loop variable is deliberately not named ``boost_values`` so
            # it cannot be confused with the module-level function
            for boosts, data in updates.values():
                conn.add(boost_values=boosts, **data)
            updates.clear()
            # builtin next() works on Python 2.6+ and Python 3 alike
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not indexable(obj):
                continue
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if missing:
                log('missing data, skipping indexing of %r.\n' % obj)
                continue
            value = data.get(key, None)
            if value is not None:
                updates[value] = (boost_values(obj, data), data)
                processed += 1
                next(cpi)
        # push whatever is left in the queue before the final commit
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Esempio n. 2
0
 def setUp(self):
     """Register a connection config and prepare an active manager and
     index processor with the schema already fetched from a fake response.
     """
     provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
     self.mngr = manager = SolrConnectionManager()
     manager.setHost(active=True)
     # the next request on this connection is answered with canned data
     fakehttp(manager.getConnection(), getData('schema.xml'))
     # triggers the schema request so later calls can use the cached copy
     manager.getSchema()
     self.proc = SolrIndexProcessor(manager)
Esempio n. 3
0
    def reindex(self, batch=1000, skip=0):
        """Find all contentish objects and (re)index them in Solr.

        Objects are collected with ``findObjects(self.context)``.  Those
        that pass ``indexable()`` and have no private ``indexObject``
        method are converted via ``SolrIndexProcessor.getData`` and queued;
        the queue is sent to Solr by ``checkPoint`` (driven by
        ``checkpointIterator`` with ``batch``) and once more at the end,
        followed by a final commit.

        :param batch: checkpoint interval handed to ``checkpointIterator``.
        :param skip: number of leading indexable objects to pass over
            without indexing them.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        # pending documents keyed by their unique key, so a later entry with
        # the same key replaces an earlier one within a batch
        updates = {}

        def flush_connection():
            return conn.flush()

        flush = notimeout(flush_connection)

        def checkPoint():
            # send everything queued so far, then flush and free ZODB cache;
            # the loop variable is deliberately not named ``boost_values`` so
            # it cannot be confused with the module-level function
            for boosts, data in updates.values():
                conn.add(boost_values=boosts, **data)
            updates.clear()
            # builtin next() works on Python 2.6+ and Python 3 alike
            msg = (
                "intermediate commit (%d items processed, "
                "last batch in %s)...\n" % (processed, next(lap))
            )
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not indexable(obj):
                continue
            if getOwnIndexMethod(obj, "indexObject") is not None:
                log("skipping indexing of %r via private method.\n" % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if missing:
                log("missing data, skipping indexing of %r.\n" % obj)
                continue
            value = data.get(key, None)
            if value is not None:
                updates[value] = (boost_values(obj, data), data)
                processed += 1
                next(cpi)
        # push whatever is left in the queue before the final commit
        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)."
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Esempio n. 4
0
 def setUp(self):
     """Prepare an active manager and index processor with the schema
     already fetched from a fake response, and switch on atomic updates.
     """
     self.mngr = manager = SolrConnectionManager()
     manager.setHost(active=True)
     # the next request on this connection is answered with canned data
     fakehttp(manager.getConnection(), getData('schema.xml'))
     # triggers the schema request so later calls can use the cached copy
     manager.getSchema()
     self.proc = SolrIndexProcessor(manager)
     getConfig().atomic_updates = True
Esempio n. 5
0
    def cleanup(self, batch=1000):
        """Drop Solr entries whose Zope object cannot be loaded or whose
        stored unique key differs from the object's real UUID.

        The index is walked in pages of ``batch`` rows.  An entry whose
        object cannot be fetched is deleted.  An entry stored under a wrong
        UID is deleted as well and, if Solr holds no document under the
        correct UUID, the object is indexed again.  Each page ends with a
        commit.

        :param batch: number of rows requested per search page.
        """
        # NOTE(review): pages advance by ``batch`` even after deletions were
        # committed; if Solr's result order shifts because of that, some
        # entries might be skipped -- confirm against Solr paging behaviour.
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        log = self.mklog(use_std_log=True)
        log('cleaning up solr index...\n')
        key = manager.getSchema().uniqueKey

        def fetch_page(offset):
            return SolrResponse(conn.search(q='*:*', rows=batch, start=offset))

        start = 0
        resp = fetch_page(start)
        res = resp.results()
        log('%s items in solr catalog\n' % resp.response.numFound)
        deleted = reindexed = 0
        while len(res) > 0:
            for flare in res:
                try:
                    ob = PloneFlare(flare).getObject()
                except Exception as err:
                    # object is gone or broken: the entry is stale
                    details = (flare['path_string'], err)
                    log('Error getting object, removing: %s (%s)\n' % details)
                    conn.delete(flare[key])
                    deleted += 1
                    continue
                if not IUUIDAware.providedBy(ob):
                    location = '/'.join(ob.getPhysicalPath())
                    log('Object %s of type %s does not support uuids, '
                        'skipping.\n' % (location, ob.meta_type))
                    continue
                uuid = IUUID(ob)
                if uuid != flare[key]:
                    log('indexed under wrong UID, removing: %s\n' %
                        flare['path_string'])
                    conn.delete(flare[key])
                    deleted += 1
                    # is the object also indexed under its proper UUID?
                    proper_query = '%s:%s' % (key, uuid)
                    realob_res = SolrResponse(
                        conn.search(q=proper_query)).results()
                    if len(realob_res) == 0:
                        log('no sane entry for last object, reindexing\n')
                        data, missing = proc.getData(ob)
                        prepareData(data)
                        if missing:
                            log('  missing data, cannot index.\n')
                        else:
                            boost = boost_values(ob, data)
                            conn.add(boost_values=boost, **data)
                            reindexed += 1
            log('handled batch of %d items, commiting\n' % len(res))
            conn.commit()
            start += batch
            resp = fetch_page(start)
            res = resp.results()
        msg = 'solr cleanup finished, %s item(s) removed, ' \
              '%s item(s) reindexed\n' % (deleted, reindexed)
        log(msg)
        logger.info(msg)
Esempio n. 6
0
 def testTwoRequests(self):
     """Indexing one object sends a schema request followed by an add."""
     mngr = SolrConnectionManager(active=True)
     proc = SolrIndexProcessor(mngr)
     # canned responses, consumed in the order the requests are made
     output = fakehttp(mngr.getConnection(), getData('schema.xml'),
                       getData('add_response.txt'))
     proc.index(self.foo)
     mngr.closeConnection()
     self.assertEqual(len(output), 2)
     # ``failUnless`` is a deprecated alias that newer Pythons removed
     self.assertTrue(output.get().startswith(self.schema_request))
     self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
Esempio n. 7
0
 def testTwoRequests(self):
     """Indexing one object sends a schema request followed by an add."""
     mngr = SolrConnectionManager(active=True)
     proc = SolrIndexProcessor(mngr)
     # canned responses, consumed in the order the requests are made
     output = fakehttp(mngr.getConnection(), getData('schema.xml'),
                       getData('add_response.txt'))
     proc.index(self.foo)
     mngr.closeConnection()
     self.assertEqual(len(output), 2)
     # ``failUnless`` is a deprecated alias that newer Pythons removed
     self.assertTrue(output.get().startswith(self.schema_request))
     self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
Esempio n. 8
0
 def setUp(self):
     """Register a connection config, prepare an active manager,
     connection and processor, and capture indexer warnings in
     ``self.log``.
     """
     provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
     self.mngr = manager = SolrConnectionManager()
     manager.setHost(active=True)
     self.conn = manager.getConnection()
     self.proc = SolrIndexProcessor(manager)
     # warnings issued by the indexer end up here instead of the log
     self.log = []

     def record_warning(*args):
         self.log.extend(args)

     logger_indexer.warning = record_warning
Esempio n. 9
0
class RobustnessTests(TestCase):
    """Check that (un)indexing is skipped, with a warning, when the schema
    lacks a unique key."""

    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

    def setUp(self):
        """Create manager, connection and processor and capture indexer
        warnings in ``self.log``."""
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        self.conn = self.mngr.getConnection()
        self.proc = SolrIndexProcessor(self.mngr)
        self.log = []  # catch log messages...

        def logger(*args):
            self.log.extend(args)

        # remember the real method so tearDown can undo the monkeypatch and
        # later tests do not keep logging into this test's list
        self._orig_warning = logger_indexer.warning
        logger_indexer.warning = logger
        config = getConfig()
        config.atomic_updates = True

    def tearDown(self):
        """Restore the indexer's warning method and shut the manager down."""
        logger_indexer.warning = self._orig_warning
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testIndexingWithUniqueKeyMissing(self):
        """``index`` sends nothing and warns when no unique key exists."""
        # fake schema response
        fakehttp(self.conn, getData('simple_schema.xml'))
        # read and cache the schema
        self.mngr.getSchema()
        response = getData('add_response.txt')
        output = fakehttp(self.conn, response)  # fake add response
        foo = Foo(id='500', name='foo')
        # indexing sends data
        self.proc.index(foo)
        # nothing happened...
        self.assertEqual(len(output), 0)
        self.assertEqual(
            self.log,
            ['schema is missing unique key, skipping indexing of %r', foo])

    def testUnindexingWithUniqueKeyMissing(self):
        """``unindex`` sends nothing and warns when no unique key exists."""
        # fake schema response
        fakehttp(self.conn, getData('simple_schema.xml'))
        # read and cache the schema
        self.mngr.getSchema()
        response = getData('delete_response.txt')
        # fake delete response
        output = fakehttp(self.conn, response)
        foo = Foo(id='500', name='foo')
        # unindexing sends data
        self.proc.unindex(foo)
        # nothing happened...
        self.assertEqual(len(output), 0)
        self.assertEqual(
            self.log,
            ['schema is missing unique key, skipping unindexing of %r', foo])
Esempio n. 10
0
    def setUp(self):
        """Prepare an active manager, connection and processor, capture
        indexer warnings in ``self.log`` and switch on atomic updates.
        """
        self.mngr = manager = SolrConnectionManager()
        manager.setHost(active=True)
        self.conn = manager.getConnection()
        self.proc = SolrIndexProcessor(manager)
        # warnings issued by the indexer end up here instead of the log
        self.log = []

        def record_warning(*args):
            self.log.extend(args)

        logger_indexer.warning = record_warning
        getConfig().atomic_updates = True
Esempio n. 11
0
class RobustnessTests(TestCase):
    """Check that (un)indexing is skipped, with a warning, when the schema
    lacks a unique key."""

    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

    def setUp(self):
        """Create manager, connection and processor and capture indexer
        warnings in ``self.log``."""
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        self.conn = self.mngr.getConnection()
        self.proc = SolrIndexProcessor(self.mngr)
        self.log = []                   # catch log messages...

        def logger(*args):
            self.log.extend(args)
        # remember the real method so tearDown can undo the monkeypatch and
        # later tests do not keep logging into this test's list
        self._orig_warning = logger_indexer.warning
        logger_indexer.warning = logger
        config = getConfig()
        config.atomic_updates = True

    def tearDown(self):
        """Restore the indexer's warning method and shut the manager down."""
        logger_indexer.warning = self._orig_warning
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testIndexingWithUniqueKeyMissing(self):
        """``index`` sends nothing and warns when no unique key exists."""
        # fake schema response
        fakehttp(self.conn, getData('simple_schema.xml'))
        # read and cache the schema
        self.mngr.getSchema()
        response = getData('add_response.txt')
        output = fakehttp(self.conn, response)              # fake add response
        foo = Foo(id='500', name='foo')
        # indexing sends data
        self.proc.index(foo)
        # nothing happened...
        self.assertEqual(len(output), 0)
        self.assertEqual(self.log, [
            'schema is missing unique key, skipping indexing of %r', foo])

    def testUnindexingWithUniqueKeyMissing(self):
        """``unindex`` sends nothing and warns when no unique key exists."""
        # fake schema response
        fakehttp(self.conn, getData('simple_schema.xml'))
        # read and cache the schema
        self.mngr.getSchema()
        response = getData('delete_response.txt')
        # fake delete response
        output = fakehttp(self.conn, response)
        foo = Foo(id='500', name='foo')
        # unindexing sends data
        self.proc.unindex(foo)
        # nothing happened...
        self.assertEqual(len(output), 0)
        self.assertEqual(self.log, [
            'schema is missing unique key, skipping unindexing of %r', foo])
Esempio n. 12
0
 def testExtraRequest(self):
     """Index then unindex one object, queueing fake responses one by one.

     Basically the same as `testThreeRequests`, except that the fake
     responses are added consecutively via ``fakemore``.
     """
     mngr = SolrConnectionManager(active=True)
     proc = SolrIndexProcessor(mngr)
     conn = mngr.getConnection()
     output = fakehttp(conn, getData('schema.xml'))
     fakemore(conn, getData('add_response.txt'))
     proc.index(self.foo)
     fakemore(conn, getData('delete_response.txt'))
     proc.unindex(self.foo)
     mngr.closeConnection()
     self.assertEqual(len(output), 3)
     # ``failUnless`` is a deprecated alias that newer Pythons removed
     self.assertTrue(output.get().startswith(self.schema_request))
     self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
     self.assertEqual(output.get(), getData('delete_request.txt'))
Esempio n. 13
0
 def setUp(self):
     """Register a connection config and prepare an active manager and
     index processor with the schema already fetched from a fake response.
     """
     provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
     self.mngr = manager = SolrConnectionManager()
     manager.setHost(active=True)
     # the next request on this connection is answered with canned data
     fakehttp(manager.getConnection(), getData('schema.xml'))
     # triggers the schema request so later calls can use the cached copy
     manager.getSchema()
     self.proc = SolrIndexProcessor(manager)
Esempio n. 14
0
 def setUp(self):
     """Prepare an active manager and index processor with the schema
     already fetched from a fake response, and switch on atomic updates.
     """
     self.mngr = manager = SolrConnectionManager()
     manager.setHost(active=True)
     # the next request on this connection is answered with canned data
     fakehttp(manager.getConnection(), getData('schema.xml'))
     # triggers the schema request so later calls can use the cached copy
     manager.getSchema()
     self.proc = SolrIndexProcessor(manager)
     getConfig().atomic_updates = True
Esempio n. 15
0
    def setUp(self):
        """Prepare an active manager, connection and processor, capture
        indexer warnings in ``self.log`` and switch on atomic updates.
        """
        self.mngr = manager = SolrConnectionManager()
        manager.setHost(active=True)
        self.conn = manager.getConnection()
        self.proc = SolrIndexProcessor(manager)
        # warnings issued by the indexer end up here instead of the log
        self.log = []

        def record_warning(*args):
            self.log.extend(args)

        logger_indexer.warning = record_warning
        getConfig().atomic_updates = True
Esempio n. 16
0
    def reindex(self,
                batch=1000,
                skip=0,
                limit=0,
                ignore_portal_types=None,
                only_portal_types=None,
                idxs=None,
                ignore_exceptions=False):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them

            :param batch: checkpoint interval (presumably handed to a
                checkpoint iterator by the caller of ``checkPoint``).
            :param skip: number of objects announced as skipped in the log.
            :param limit: number of objects announced as the indexing limit
                in the log.
            :param ignore_portal_types: must not be combined with
                ``only_portal_types``.
            :param only_portal_types: must not be combined with
                ``ignore_portal_types``.
            :param idxs: optional list of indexes; a non-empty value sets
                the ``atomic`` flag.  ``None`` is treated as an empty list.
            :param ignore_exceptions: when true, errors raised while adding
                a document are logged and swallowed instead of re-raised.
            :raises ValueError: if both portal type filters are given.
        """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        # a ``None`` default avoids sharing one mutable list between calls
        if idxs is None:
            idxs = []
        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # pending documents, keyed by unique key

        def soft_commit():
            return conn.commit(soft=True)

        flush = notimeout(soft_commit)

        def checkPoint():
            # send every queued document through its own adder, then do a
            # soft commit and free ZODB cache
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    # .get() keeps a missing 'path_string' from raising a
                    # KeyError here and hiding the original error
                    logger.warn('Error %s @ %s', e, data.get('path_string'))
                    if not ignore_exceptions:
                        raise
            updates.clear()
            # builtin next() works on Python 2.6+ and Python 3 alike
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
Esempio n. 17
0
    def testLocalConnections(self):
        """Index from a second thread and check that the processor is
        shared while each thread gets its own connection object."""
        config = getConfig()
        config.atomic_updates = True
        mngr = SolrConnectionManager(active=True)
        proc = SolrIndexProcessor(mngr)
        mngr.setHost(active=True)
        schema = getData("schema.xml")
        log = []

        def runner():
            # fake schema response on solr connection - caches the schema
            fakehttp(mngr.getConnection(), getData("schema.xml"))
            mngr.getConnection().get_schema()

            fakehttp(mngr.getConnection(), schema)  # fake schema response
            # read and cache the schema
            mngr.getSchema()
            response = getData("add_response.txt")
            # fake add response
            output = fakehttp(mngr.getConnection(), response)
            # indexing sends data
            proc.index(Foo(id="500", name="python test doc"))
            mngr.closeConnection()
            log.append(str(output))
            log.append(proc)
            log.append(mngr.getConnection())

        # after the runner was set up, another thread can be created and
        # started;  its output should contain the proper indexing request,
        # whereas the main thread's connection remain idle;  the latter
        # cannot be checked directly, but the connection object would raise
        # an exception if it was used to send a request without setting up
        # a fake response beforehand...
        thread = Thread(target=runner)
        thread.start()
        thread.join()
        conn = mngr.getConnection()  # get this thread's connection
        fakehttp(conn, schema)  # fake schema response
        mngr.getSchema()  # read and cache the schema
        mngr.closeConnection()
        mngr.setHost(active=False)
        self.assertEqual(len(log), 3)
        self.assertEqual(
            sortFields(log[0].encode("utf-8")), getData("add_request.txt").rstrip(b"\n")
        )
        # ``failUnless`` is a deprecated alias that newer Pythons removed
        self.assertIsInstance(log[1], SolrIndexProcessor)
        self.assertIsInstance(log[2], SolrConnection)
        self.assertIsInstance(proc, SolrIndexProcessor)
        self.assertIsInstance(conn, SolrConnection)
        self.assertEqual(log[1], proc)  # processors should be the same...
        self.assertNotEqual(log[2], conn)  # but not the connections
Esempio n. 18
0
 def testFourRequests(self):
     """Index, unindex and commit: together with the schema lookup this
     must produce four requests, in that order."""
     mngr = SolrConnectionManager(active=True)
     proc = SolrIndexProcessor(mngr)
     # canned responses, consumed in the order the requests are made
     output = fakehttp(
         mngr.getConnection(),
         getData("schema.xml"),
         getData("add_response.txt"),
         getData("delete_response.txt"),
         getData("commit_response.txt"),
     )
     proc.index(self.foo)
     proc.unindex(self.foo)
     proc.commit()
     mngr.closeConnection()
     self.assertEqual(len(output), 4)
     # ``failUnless`` is a deprecated alias that newer Pythons removed
     self.assertTrue(output.get().decode("utf-8").startswith(self.schema_request))
     self.assertEqual(
         sortFields(output.get()), getData("add_request.txt").rstrip(b"\n")
     )
     self.assertEqual(output.get(), getData("delete_request.txt").rstrip(b"\n"))
     self.assertEqual(output.get(), getData("commit_request.txt").rstrip(b"\n"))
Esempio n. 19
0
 def testExtraRequest(self):
     """Index then unindex one object, queueing fake responses one by one.

     Basically the same as `testThreeRequests`, except that the fake
     responses are added consecutively via ``fakemore``.
     """
     mngr = SolrConnectionManager(active=True)
     proc = SolrIndexProcessor(mngr)
     conn = mngr.getConnection()
     output = fakehttp(conn, getData('schema.xml'))
     fakemore(conn, getData('add_response.txt'))
     proc.index(self.foo)
     fakemore(conn, getData('delete_response.txt'))
     proc.unindex(self.foo)
     mngr.closeConnection()
     self.assertEqual(len(output), 3)
     # ``failUnless`` is a deprecated alias that newer Pythons removed
     self.assertTrue(output.get().startswith(self.schema_request))
     self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
     self.assertEqual(output.get(), getData('delete_request.txt'))
Esempio n. 20
0
def solr_index(self, obj, attributes=None):
    """Delegate to ``SolrIndexProcessor._old_index`` while discarding any
    ``attributes`` the caller supplied, so ``None`` is always passed on.

    Works around https://github.com/collective/collective.solr/issues/189
    """
    return SolrIndexProcessor._old_index(self, obj, None)
Esempio n. 21
0
    def sync(self, batch=1000):
        """Sync the Solr index with the portal catalog.

        Records contained in the catalog but not in Solr are indexed,
        records found in Solr but not in the catalog are removed (unless
        the object can still be looked up), and records present in both
        are reindexed when the catalog's ``modified`` value differs from
        the one stored in Solr.

        :param batch: checkpoint interval handed to ``checkpointIterator``.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        # reuse the manager looked up above instead of querying it again
        key = manager.getSchema().uniqueKey
        zodb_conn = self.context._p_jar
        catalog = getToolByName(self.context, 'portal_catalog')
        getIndex = catalog._catalog.getIndex
        modified_index = getIndex('modified')
        uid_index = getIndex(key)
        log = self.mklog()
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        # avoid creating DateTime instances
        simple_unmarshallers = unmarshallers.copy()
        simple_unmarshallers['date'] = parse_date_as_datetime
        # get Solr status
        query = '+%s:[* TO *]' % key
        response = conn.search(q=query, rows=MAX_ROWS, fl='%s modified' % key)
        try:
            flares = SolrResponse(response, simple_unmarshallers)
        finally:
            # release the response even if parsing it fails
            response.close()
        solr_results = {}
        solr_uids = set()

        def _utc_convert(value):
            # fold year, month, day, hour and minute of the UTC time tuple
            # into a single integer; presumably this mirrors the encoding of
            # the catalog's ``modified`` index -- TODO confirm
            t_tup = value.utctimetuple()
            return (((
                (t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 + t_tup[3]) *
                    60 + t_tup[4])

        for flare in flares:
            uid = flare[key]
            solr_uids.add(uid)
            solr_results[uid] = _utc_convert(flare['modified'])
        # get catalog status
        cat_results = {}
        cat_uids = set()
        for uid, rid in uid_index._index.items():
            cat_uids.add(uid)
            cat_results[uid] = rid
        # differences
        index = cat_uids.difference(solr_uids)
        # from here on ``solr_uids`` only holds uids unknown to the catalog
        solr_uids.difference_update(cat_uids)
        unindex = solr_uids
        processed = 0
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            # builtin next() works on Python 2.6+ and Python 3 alike
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        # Look up objects
        uid_rid_get = cat_results.get
        rid_path_get = catalog._catalog.paths.get
        catalog_traverse = catalog.unrestrictedTraverse

        def lookup(uid,
                   rid=None,
                   uid_rid_get=uid_rid_get,
                   rid_path_get=rid_path_get,
                   catalog_traverse=catalog_traverse):
            # resolve a uid (or a known rid) to its object; returns None
            # when there is no rid or path, or traversal fails
            if rid is None:
                rid = uid_rid_get(uid)
            if not rid:
                return None
            if not isinstance(rid, int):
                # several rids for one uid: use the first one
                rid = tuple(rid)[0]
            path = rid_path_get(rid)
            if not path:
                return None
            try:
                obj = catalog_traverse(path)
            except AttributeError:
                return None
            return obj

        log('processing %d "unindex" operations next...\n' % len(unindex))
        op = notimeout(lambda uid: conn.delete(id=uid))
        for uid in unindex:
            obj = lookup(uid)
            if obj is None:
                op(uid)
                processed += 1
                next(cpi)
            else:
                log('not unindexing existing object %r.\n' % uid)
        log('processing %d "index" operations next...\n' % len(index))
        op = notimeout(lambda obj: proc.index(obj))
        for uid in index:
            obj = lookup(uid)
            if indexable(obj):
                op(obj)
                processed += 1
                next(cpi)
            else:
                log('not indexing unindexable object %r.\n' % uid)
            if obj is not None:
                # ghost the object again to keep memory usage down
                obj._p_deactivate()
        log('processing "reindex" operations next...\n')
        op = notimeout(lambda obj: proc.reindex(obj))
        cat_mod_get = modified_index._unindex.get
        solr_mod_get = solr_results.get
        # uids already handled by the two passes above
        done = unindex.union(index)
        for uid, rid in cat_results.items():
            if uid in done:
                continue
            if isinstance(rid, IITreeSet):
                rid = rid.keys()[0]
            if cat_mod_get(rid) != solr_mod_get(uid):
                obj = lookup(uid, rid=rid)
                if indexable(obj):
                    op(obj)
                    processed += 1
                    next(cpi)
                else:
                    log('not reindexing unindexable object %r.\n' % uid)
                if obj is not None:
                    obj._p_deactivate()
        conn.commit()
        log('solr index synced.\n')
        msg = 'processed %d object(s) in %s (%s cpu time).'
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Esempio n. 22
0
    def reindex(self, batch=1000, skip=0, limit=0):
        """Find all contentish objects and (re)index them in Solr.

        Objects are collected with ``findObjects(self.context)``.  Those
        accepted by ``ICheckIndexable`` are converted via
        ``SolrIndexProcessor.getData`` and queued together with an add
        handler (a named ``ISolrAddHandler`` adapter for the portal type,
        else ``DefaultAdder``).  The queue is sent to Solr by ``checkPoint``
        (driven by ``checkpointIterator`` with ``batch``) and once more at
        the end, followed by a final commit.

        :param batch: checkpoint interval handed to ``checkpointIterator``.
        :param skip: number of leading indexable objects to pass over
            without indexing them.
        :param limit: when non-zero, stop once ``skip + limit`` indexable
            objects have been counted.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        # pending documents keyed by their unique key, so a later entry with
        # the same key replaces an earlier one within a batch
        updates = {}

        def flush_connection():
            return conn.flush()

        flush = notimeout(flush_connection)

        def checkPoint():
            # send every queued document through its own adder, then flush
            # and free ZODB cache; the loop variable is deliberately not
            # named ``boost_values`` so it cannot be confused with the
            # module-level function
            for boosts, data in updates.values():
                adder = data.pop('_solr_adder')
                adder(conn, boost_values=boosts, **data)
            updates.clear()
            # builtin next() works on Python 2.6+ and Python 3 alike
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not ICheckIndexable(obj)():
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if missing:
                log('missing data, skipping indexing of %r.\n' % obj)
            else:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    # travels with the data and is popped in checkPoint
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            if limit and count >= (skip + limit):
                break
        # push whatever is left in the queue before the final commit
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Esempio n. 23
0
class QueueIndexerTests(TestCase):
    """Tests for ``prepareData`` and ``SolrIndexProcessor`` using a faked
    HTTP transport.

    Each test hands a canned response to ``fakehttp`` and then inspects the
    object it returns (``output``), whose string form is compared against
    the expected request text.
    """

    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

    def setUp(self):
        """Create an active connection manager with a cached schema and a
        processor bound to it; switch on ``atomic_updates`` in the config."""
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData("schema.xml"))  # fake schema response
        self.mngr.getSchema()  # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)
        config = getConfig()
        # NOTE(review): this flag is not reset in tearDown; presumably the
        # test layer isolates the registry between tests -- confirm.
        config.atomic_updates = True

    def tearDown(self):
        """Close the connection and deactivate the host again."""
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # the ":" in the role/user tokens is expected to become "$"
        data = {"allowedRolesAndUsers": ["user:test_user_1_", "user:portal_owner"]}
        prepareData(data)
        self.assertEqual(
            data, {"allowedRolesAndUsers": ["user$test_user_1_", "user$portal_owner"]}
        )

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {"Language": ["en", ""]}
        prepareData(data)
        self.assertEqual(data, {"Language": ["en", "any"]})
        data = {"Language": ""}
        prepareData(data)
        self.assertEqual(data, {"Language": "any"})
        # for other indices this shouldn't happen...
        data = {"Foo": ["en", ""]}
        prepareData(data)
        self.assertEqual(data, {"Foo": ["en", ""]})

    def testIndexObject(self):
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id="500", name="python test doc"))
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testIndexAccessorRaises(self):
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)

        def brokenfunc():
            raise ValueError

        # the raising ``text`` accessor must not change the request that is
        # sent: it is compared against the same fixture as testIndexObject
        self.proc.index(
            Foo(id="500", name="python test doc", text=brokenfunc)
        )  # indexing sends data
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testPartialIndexObject(self):
        foo = Foo(id="500", name="foo", price=42.0)
        # first index all attributes...
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # ``assert_`` was removed from unittest in Python 3.12
        self.assertTrue(
            str(output).find('<field name="price" update="set">42.0</field>') > 0,
            '"price" data not found',
        )
        # then only a subset...
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=["id", "name"])
        output = str(output)
        self.assertTrue(
            output.find('<field name="name" update="set">foo</field>') > 0,
            '"name" data not found',
        )
        # a partial update must not carry any data for the price attribute
        # (presumably relies on ``atomic_updates`` being enabled in setUp)
        self.assertEqual(output.find("price"), -1, '"price" data found?')
        self.assertEqual(output.find("42"), -1, '"price" data found?')

    def testDateIndexing(self):
        foo = Foo(
            id="zeidler",
            name="andi",
            cat="nerd",
            timestamp=DateTime("May 11 1972 03:45:59.999730 GMT"),
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # the expected value carries millisecond precision only
        required = (
            '<field name="timestamp" update="set">1972-05-11T03:45:59.999Z</field>'
        )
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        foo = Foo(
            id="gerken",
            name="patrick",
            cat="nerd",
            timestamp=datetime(1980, 9, 29, 14, 2, 59, 999730),
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = (
            '<field name="timestamp" update="set">1980-09-29T14:02:59.999Z</field>'
        )
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDate(self):
        foo = Foo(
            id="brand", name="jan-carel", cat="nerd", timestamp=date(1982, 8, 5)
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # a plain date is expected to be sent as midnight of that day
        required = (
            '<field name="timestamp" update="set">1982-08-05T00:00:00.000Z</field>'
        )
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # reindexing sends data
        self.proc.reindex(Foo(id="500", name="python test doc"))
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testUnindexObject(self):
        response = getData("delete_response.txt")
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # unindexing sends data
        self.proc.unindex(Foo(id="500", name="python test doc"))
        self.assertEqual(
            str(output), getData("delete_request.txt").decode("utf-8").rstrip("\n")
        )

    def testCommit(self):
        response = getData("commit_response.txt")
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # committing sends data
        self.proc.commit()
        self.assertEqual(
            str(output), getData("commit_request.txt").decode("utf-8").rstrip("\n")
        )

    def testNoIndexingWithoutAllRequiredFields(self):
        response = getData("dummy_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # an object providing only ``id`` must not produce any request
        self.proc.index(Foo(id="500"))
        self.assertEqual(str(output), "")

    def testIndexerMethods(self):
        class Bar(Foo):
            def cat(self):
                return "nerd"

            def price(self):
                raise AttributeError("price")

        foo = Bar(id="500", name="foo")
        # raising the exception should keep the attribute from being indexed
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(
            output.find('<field name="cat" update="set">nerd</field>') > 0,
            '"cat" data not found',
        )
        self.assertEqual(output.find("price"), -1, '"price" data found?')
Esempio n. 24
0
class QueueIndexerTests(TestCase):
    """Tests for ``prepareData`` and ``SolrIndexProcessor`` using a faked
    HTTP transport.

    Each test hands a canned response to ``fakehttp`` and then inspects the
    object it returns (``output``), whose string form is compared against
    the expected request text.
    """

    def setUp(self):
        """Register a connection config utility, then create an active
        connection manager with a cached schema and a processor bound to
        it."""
        provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData('schema.xml'))       # fake schema response
        self.mngr.getSchema()                       # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)

    def tearDown(self):
        """Close the connection and deactivate the host again."""
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # the ":" in the role/user tokens is expected to become "$"
        data = {'allowedRolesAndUsers': [
            'user:test_user_1_', 'user:portal_owner']}
        prepareData(data)
        self.assertEqual(
            data,
            {
                'allowedRolesAndUsers': [
                    'user$test_user_1_',
                    'user$portal_owner'
                ]
            }
        )

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {'Language': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Language': ['en', 'any']})
        data = {'Language': ''}
        prepareData(data)
        self.assertEqual(data, {'Language': 'any'})
        # for other indices this shouldn't happen...
        data = {'Foo': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Foo': ['en', '']})

    def testIndexObject(self):
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id='500', name='python test doc'))
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testIndexAccessorRaises(self):
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)

        def brokenfunc():
            raise ValueError

        # the raising ``text`` accessor must not change the request that is
        # sent: it is compared against the same fixture as testIndexObject
        self.proc.index(Foo(id='500', name='python test doc',
                            text=brokenfunc))   # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testPartialIndexObject(self):
        foo = Foo(id='500', name='foo', price=42.0)
        # first index all attributes...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # ``assert_`` is a deprecated alias (gone in Python 3.12)
        self.assertTrue(str(output).find(
            '<field name="price">42.0</field>') > 0, '"price" data not found')
        # then only a subset...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=['id', 'name'])
        output = str(output)
        self.assertTrue(
            output.find('<field name="name">foo</field>') > 0,
            '"name" data not found'
        )
        # at this point we'd normally check for a partial update:
        #   self.assertEqual(output.find('price'), -1, '"price" data found?')
        #   self.assertEqual(output.find('42'), -1, '"price" data found?')
        # however, until SOLR-139 has been implemented (re)index operations
        # always need to provide data for all attributes in the schema...
        self.assertTrue(
            output.find('<field name="price">42.0</field>') > 0,
            '"price" data not found'
        )

    def testDateIndexing(self):
        foo = Foo(id='zeidler', name='andi', cat='nerd',
                  timestamp=DateTime('May 11 1972 03:45 GMT'))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = '<field name="timestamp">1972-05-11T03:45:00.000Z</field>'
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        # plain decimal literals: a leading zero ("02") is a legacy octal
        # literal and a syntax error on Python 3
        foo = Foo(id='gerken', name='patrick', cat='nerd',
                  timestamp=datetime(1980, 9, 29, 14, 2))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = '<field name="timestamp">1980-09-29T14:02:00.000Z</field>'
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDate(self):
        foo = Foo(id='brand', name='jan-carel',
                  cat='nerd', timestamp=date(1982, 8, 5))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # a plain date is expected to be sent as midnight of that day
        required = '<field name="timestamp">1982-08-05T00:00:00.000Z</field>'
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # reindexing sends data
        self.proc.reindex(Foo(id='500', name='python test doc'))
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testUnindexObject(self):
        response = getData('delete_response.txt')
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # unindexing sends data
        self.proc.unindex(Foo(id='500', name='python test doc'))
        self.assertEqual(str(output), getData('delete_request.txt'))

    def testCommit(self):
        response = getData('commit_response.txt')
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # committing sends data
        self.proc.commit()
        self.assertEqual(str(output), getData('commit_request.txt'))

    def testNoIndexingWithoutAllRequiredFields(self):
        response = getData('dummy_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # an object providing only ``id`` must not produce any request
        self.proc.index(Foo(id='500'))
        self.assertEqual(str(output), '')

    def testIndexerMethods(self):
        class Bar(Foo):

            def cat(self):
                return 'nerd'

            def price(self):
                raise AttributeError('price')
        foo = Bar(id='500', name='foo')
        # raising the exception should keep the attribute from being indexed
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(
            output.find('<field name="cat">nerd</field>') > 0,
            '"cat" data not found'
        )
        self.assertEqual(output.find('price'), -1, '"price" data found?')
Esempio n. 25
0
class QueueIndexerTests(TestCase):
    """Tests for ``prepareData`` and ``SolrIndexProcessor`` using a faked
    HTTP transport.

    Each test hands a canned response to ``fakehttp`` and then inspects the
    object it returns (``output``), whose string form is compared against
    the expected request text.
    """

    def setUp(self):
        """Register a connection config utility, then create an active
        connection manager with a cached schema and a processor bound to
        it."""
        provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData('schema.xml'))       # fake schema response
        self.mngr.getSchema()                       # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)

    def tearDown(self):
        """Close the connection and deactivate the host again."""
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # the ":" in the role/user tokens is expected to become "$"
        data = {'allowedRolesAndUsers': ['user:test_user_1_', 'user:portal_owner']}
        prepareData(data)
        self.assertEqual(data, {'allowedRolesAndUsers': ['user$test_user_1_', 'user$portal_owner']})

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {'Language': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Language': ['en', 'any']})
        data = {'Language': ''}
        prepareData(data)
        self.assertEqual(data, {'Language': 'any'})
        # for other indices this shouldn't happen...
        data = {'Foo': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Foo': ['en', '']})

    def testIndexObject(self):
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response
        self.proc.index(Foo(id='500', name='python test doc'))   # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testIndexAccessorRaises(self):
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response

        def brokenfunc():
            raise ValueError

        # the raising ``text`` accessor must not change the request that is
        # sent: it is compared against the same fixture as testIndexObject
        self.proc.index(Foo(id='500', name='python test doc',
                            text=brokenfunc))   # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testPartialIndexObject(self):
        foo = Foo(id='500', name='foo', price=42.0)
        # first index all attributes...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # ``assert_`` is a deprecated alias (gone in Python 3.12)
        self.assertTrue(str(output).find('<field name="price">42.0</field>') > 0, '"price" data not found')
        # then only a subset...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=['id', 'name'])
        output = str(output)
        self.assertTrue(output.find('<field name="name">foo</field>') > 0, '"name" data not found')
        # at this point we'd normally check for a partial update:
        #   self.assertEqual(output.find('price'), -1, '"price" data found?')
        #   self.assertEqual(output.find('42'), -1, '"price" data found?')
        # however, until SOLR-139 has been implemented (re)index operations
        # always need to provide data for all attributes in the schema...
        self.assertTrue(output.find('<field name="price">42.0</field>') > 0, '"price" data not found')

    def testDateIndexing(self):
        foo = Foo(id='zeidler', name='andi', cat='nerd', timestamp=DateTime('May 11 1972 03:45 GMT'))
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response
        self.proc.index(foo)
        required = '<field name="timestamp">1972-05-11T03:45:00.000Z</field>'
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        # plain decimal literal: a leading zero ("02") is a legacy octal
        # literal and a syntax error on Python 3
        foo = Foo(id='gerken', name='patrick', cat='nerd', timestamp=datetime(1980, 9, 29, 14, 2))
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response
        self.proc.index(foo)
        required = '<field name="timestamp">1980-09-29T14:02:00.000Z</field>'
        self.assertTrue(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response
        self.proc.reindex(Foo(id='500', name='python test doc')) # reindexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testUnindexObject(self):
        response = getData('delete_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake response
        self.proc.unindex(Foo(id='500', name='python test doc')) # unindexing sends data
        self.assertEqual(str(output), getData('delete_request.txt'))

    def testCommit(self):
        response = getData('commit_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake response
        self.proc.commit()                                       # committing sends data
        self.assertEqual(str(output), getData('commit_request.txt'))

    def testNoIndexingWithoutAllRequiredFields(self):
        response = getData('dummy_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)   # fake add response
        # an object providing only ``id`` must not produce any request
        self.proc.index(Foo(id='500'))                           # indexing sends data
        self.assertEqual(str(output), '')

    def testIndexerMethods(self):
        class Bar(Foo):
            def cat(self):
                return 'nerd'

            def price(self):
                raise AttributeError('price')

        foo = Bar(id='500', name='foo')
        # raising the exception should keep the attribute from being indexed
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(output.find('<field name="cat">nerd</field>') > 0, '"cat" data not found')
        self.assertEqual(output.find('price'), -1, '"price" data found?')
Esempio n. 26
0
    def sync(self, batch=1000):
        """Bring the Solr index in line with the portal catalog.

        Unique keys present in the catalog but absent from Solr are indexed.
        Keys present only in Solr are deleted, unless an object can still be
        looked up for them.  Keys present on both sides are reindexed when
        the catalog's ``modified`` value differs from the one derived from
        Solr.  ``batch`` is handed to ``checkpointIterator`` together with
        the intermediate-commit callback.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
        zodb_conn = self.context._p_jar
        catalog = getToolByName(self.context, "portal_catalog")
        get_index = catalog._catalog.getIndex
        modified_index = get_index("modified")
        uid_index = get_index(key)
        log = self.mklog()
        real = timer()  # wall-clock time for the whole run
        lap = timer()  # wall-clock time between intermediate commits
        cpu = timer(clock)  # cpu time

        # --- state on the Solr side: unique key -> "modified" stamp
        response = conn.search(
            q="+%s:[* TO *]" % key, rows=MAX_ROWS, fl="%s modified" % key
        )
        # parse dates with parse_date_as_datetime instead of the default
        # unmarshaller (avoids creating DateTime instances)
        date_unmarshallers = unmarshallers.copy()
        date_unmarshallers["date"] = parse_date_as_datetime
        flares = SolrResponse(response, date_unmarshallers)
        response.close()

        def _utc_convert(value):
            # fold the UTC year..minute fields into a single integer
            year, month, day, hour, minute = value.utctimetuple()[:5]
            return (((year * 12 + month) * 31 + day) * 24 + hour) * 60 + minute

        solr_uids = set()
        solr_results = {}
        for flare in flares:
            solr_uid = flare[key]
            solr_uids.add(solr_uid)
            solr_results[solr_uid] = _utc_convert(flare["modified"])

        # --- state on the catalog side: unique key -> record id(s)
        cat_uids = set()
        cat_results = {}
        for cat_uid, cat_rid in uid_index._index.items():
            cat_uids.add(cat_uid)
            cat_results[cat_uid] = cat_rid

        # --- differences between both sides
        index = cat_uids - solr_uids  # only in the catalog
        solr_uids -= cat_uids
        unindex = solr_uids  # only in Solr
        processed = 0

        def _flush():
            return conn.flush()

        flush = notimeout(_flush)

        def checkPoint():
            msg = "intermediate commit (%d items processed, last batch in %s)...\n" % (
                processed,
                lap.next(),
            )
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)

        # bound once up front; used by lookup() below
        uid_rid_get = cat_results.get
        rid_path_get = catalog._catalog.paths.get
        catalog_traverse = catalog.unrestrictedTraverse

        def lookup(uid, rid=None):
            """Return the object for ``uid`` (or the given ``rid``), or None
            when no record id, no path or no traversable object is found."""
            if rid is None:
                rid = uid_rid_get(uid)
            if not rid:
                return None
            if not isinstance(rid, int):
                # a collection of record ids: use its first member
                rid = tuple(rid)[0]
            path = rid_path_get(rid)
            if not path:
                return None
            try:
                return catalog_traverse(path)
            except AttributeError:
                return None

        # --- pass 1: delete entries known only to Solr
        log('processing %d "unindex" operations next...\n' % len(unindex))

        def _delete(uid):
            return conn.delete(id=uid)

        op = notimeout(_delete)
        for uid in unindex:
            if lookup(uid) is not None:
                log("not unindexing existing object %r.\n" % uid)
                continue
            op(uid)
            processed += 1
            cpi.next()

        # --- pass 2: index entries known only to the catalog
        log('processing %d "index" operations next...\n' % len(index))

        def _index(obj):
            return proc.index(obj)

        op = notimeout(_index)
        for uid in index:
            obj = lookup(uid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log("not indexing unindexable object %r.\n" % uid)
            if obj is not None:
                obj._p_deactivate()

        # --- pass 3: reindex entries whose modification stamps differ
        log('processing "reindex" operations next...\n')

        def _reindex(obj):
            return proc.reindex(obj)

        op = notimeout(_reindex)
        cat_mod_get = modified_index._unindex.get
        solr_mod_get = solr_results.get
        done = unindex | index  # already handled by pass 1 or 2
        for uid, rid in cat_results.items():
            if uid in done:
                continue
            if isinstance(rid, IITreeSet):
                rid = rid.keys()[0]
            if cat_mod_get(rid) != solr_mod_get(uid):
                obj = lookup(uid, rid=rid)
                if indexable(obj):
                    op(obj)
                    processed += 1
                    cpi.next()
                else:
                    log("not reindexing unindexable object %r.\n" % uid)
                if obj is not None:
                    obj._p_deactivate()

        conn.commit()
        log("solr index synced.\n")
        msg = "processed %d object(s) in %s (%s cpu time)." % (
            processed,
            real.next(),
            cpu.next(),
        )
        log(msg)
        logger.info(msg)
Esempio n. 27
0
    def cleanup(self, batch=1000):
        """Remove entries from Solr that don't have a corresponding Zope
        object or are stored under a different UID than the real object.

        The index is walked in pages of ``batch`` entries.  An entry is
        deleted when its object cannot be fetched (error or ``None``) or
        when the object's UUID differs from the entry's unique key; in the
        latter case the object is re-added if Solr has no entry under the
        correct UUID and no required data is missing.  Objects that are not
        ``IUUIDAware`` are skipped.  Each page is committed before the next
        one is requested.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        log = self.mklog(use_std_log=True)
        log("cleaning up solr index...\n")
        key = manager.getSchema().uniqueKey

        start = 0
        resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
        res = resp.results()
        log("%s items in solr catalog\n" % resp.response.numFound)
        deleted = 0
        reindexed = 0
        while len(res) > 0:
            # entries of the current page removed from the index; needed to
            # compute the offset of the next page (see below)
            removed_in_batch = 0
            for flare in res:
                try:
                    ob = PloneFlare(flare).getObject()
                except Exception as err:
                    # deliberately broad: any failure to load the object
                    # is treated as "entry is stale"
                    log("Error getting object, removing: %s (%s)\n" %
                        (flare["path_string"], err))
                    conn.delete(flare[key])
                    deleted += 1
                    removed_in_batch += 1
                    continue
                if ob is None:
                    log("Object not found, removing: %s\n" %
                        (flare["path_string"]))
                    conn.delete(flare[key])
                    deleted += 1
                    removed_in_batch += 1
                    continue
                if not IUUIDAware.providedBy(ob):
                    no_skipping_msg = ("Object %s of type %s does not " +
                                       "support uuids, skipping.\n")
                    log(no_skipping_msg %
                        ("/".join(ob.getPhysicalPath()), ob.meta_type))
                    continue
                uuid = IUUID(ob)
                if uuid != flare[key]:
                    log("indexed under wrong UID, removing: %s\n" %
                        flare["path_string"])
                    conn.delete(flare[key])
                    deleted += 1
                    removed_in_batch += 1
                    realob_res = SolrResponse(
                        conn.search(q="%s:%s" % (key, uuid))).results()
                    if len(realob_res) == 0:
                        log("no sane entry for last object, reindexing\n")
                        data, missing = proc.getData(ob)
                        prepareData(data)
                        if not missing:
                            boost = boost_values(ob, data)
                            conn.add(boost_values=boost, **data)
                            reindexed += 1
                        else:
                            log("  missing data, cannot index.\n")
            log("handled batch of %d items, committing\n" % len(res))
            conn.commit()
            # Deleted entries no longer occupy a slot in the result set once
            # committed, so the following entries move up by that many
            # positions.  Advancing by the full page size would skip them;
            # advance only past the entries that are still in the index.
            # NOTE(review): assumes the unsorted "*:*" query returns entries
            # in a stable order between requests and that committed deletes
            # are visible to the next search -- confirm for the deployment.
            start += len(res) - removed_in_batch
            resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
            res = resp.results()
        finished_msg = ("solr cleanup finished, %s item(s) removed, " +
                        "%s item(s) reindexed\n")
        msg = finished_msg % (deleted, reindexed)
        log(msg)
        logger.info(msg)
Esempio n. 28
0
    def reindex(
        self,
        batch=1000,
        skip=0,
        limit=0,
        ignore_portal_types=None,
        only_portal_types=None,
        idxs=None,
        ignore_exceptions=False,
    ):
        """Find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them.

        :param batch: passed to ``checkpointIterator`` together with the
            intermediate-commit callback.
        :param skip: number of indexable objects to pass over before
            indexing starts.
        :param limit: stop once ``skip + limit`` indexable objects have been
            counted; ``0`` means no limit.
        :param ignore_portal_types: objects whose ``portal_type`` is in this
            collection are not indexed.
        :param only_portal_types: only objects whose ``portal_type`` is in
            this collection are indexed.
        :param idxs: names of the attributes to update.  When non-empty only
            these (plus the schema's unique key) are requested from the
            processor and objects are indexed even if data is reported
            missing.  A single name may be given as a string.  The argument
            is never modified.
        :param ignore_exceptions: when true, errors raised while adding a
            document are logged and the run continues; otherwise they
            propagate.
        :raises ValueError: if both ``ignore_portal_types`` and
            ``only_portal_types`` are given.
        """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        # Work on a private copy: the unique key is appended below and must
        # not leak into the caller's list (or into a shared default).
        if idxs is None:
            idxs = []
        elif isinstance(idxs, str):
            idxs = [idxs]
        else:
            idxs = list(idxs)
        atomic = bool(idxs)

        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        if limit:
            log("limiting indexing to %d object(s)...\n" % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(process_time)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # unique key -> (boost values, data) awaiting submission

        def flush():
            return conn.commit(soft=True)

        flush = notimeout(flush)

        def checkPoint():
            """Submit all pending updates, then soft-commit and let the ZODB
            connection garbage-collect its cache."""
            for my_boost_values, data in updates.values():
                adder = data.pop("_solr_adder")
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    # .get(): a missing "path_string" must not replace the
                    # original error with a KeyError
                    logger.warning("Error %s @ %s", e, data.get("path_string"))
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = ("intermediate commit (%d items processed, "
                   "last batch in %s)...\n" % (processed, next(lap)))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        attributes = None
        if atomic:
            log("indexing only {0} \n".format(idxs))
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            attributes = list(idxs)
            if key not in attributes:
                attributes.append(key)

        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                # objects bringing their own ``indexObject`` are left alone
                if (getOwnIndexMethod
                        and getOwnIndexMethod(obj, "indexObject") is not None):
                    log("skipping indexing of %r via private method.\n" %
                        obj)
                    continue

                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log("indexing %r\n" % obj)

                        # prefer an add handler registered for the portal
                        # type; fall back to the default adder
                        pt = data.get("portal_type", "default")
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        # carried inside ``data`` and popped in checkPoint()
                        data["_solr_adder"] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        next(cpi)
                else:
                    log("missing data, skipping indexing of %r.\n" % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)."
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Esempio n. 29
0
    def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
                only_portal_types=None):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them

            Parameters:

            batch -- passed to `checkpointIterator` together with the
                checkpoint callback; presumably the number of processed
                items between two intermediate commits (confirm against
                `checkpointIterator`).
            skip -- number of leading indexable objects to pass over
                without indexing them.
            limit -- when non-zero, the walk stops once the running count
                of indexable objects reaches `skip + limit`.
            ignore_portal_types -- objects whose `portal_type` is in this
                collection are not indexed.
            only_portal_types -- when given, only objects whose
                `portal_type` is in this collection are indexed.

            Raises ValueError when both `ignore_portal_types` and
            `only_portal_types` are given.

            Progress is reported through the logger returned by
            `self.mklog()` and the module-level `logger`.
        """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        key = manager.getSchema().uniqueKey
        # pending updates: unique key value -> (boost values, field data)
        updates = {}
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            """ send all pending updates to solr, flush the connection and
                garbage-collect the ZODB connection cache """
            for my_boost_values, data in updates.values():
                # the add handler travels inside the data dict and must be
                # removed again before the fields are sent to solr
                adder = data.pop('_solr_adder')
                adder(conn, boost_values=my_boost_values, **data)
            updates.clear()
            # the built-in `next()` works on both python 2.6+ and python 3,
            # whereas the `.next()` method only exists on python 2
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not ICheckIndexable(obj)():
                continue

            # `count` covers every indexable object, including those that
            # get filtered out by portal type further down
            count += 1
            if count <= skip:
                continue

            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue

            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue

            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    # prefer an add handler registered for the portal
                    # type and fall back to the default one
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            else:
                log('missing data, skipping indexing of %r.\n' % obj)

            if limit and count >= (skip + limit):
                break

        # push whatever is left over from the last (partial) batch
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)