Esempio n. 1
0
 def reindex(self, batch=1000, skip=0):
     """Hand every indexable object found below the context to ZeroCMS.

     Walks ``findObjects(self.context)`` and passes each object that is
     ``indexable`` to the "zerocms" index queue processor.  Objects for
     which ``getOwnIndexMethod(obj, 'indexObject')`` returns something
     other than ``None`` are skipped with a log message.

     :param batch: accepted for signature compatibility; this
         implementation does not use it (no intermediate checkpoints).
     :param skip: number of eligible objects to pass over before
         indexing starts.
     """
     indexProcessor = queryUtility(IZeroCMSIndexQueueProcessor, name="zerocms")
     zodb_conn = self.context._p_jar
     log = self.mklog()
     log('reindexing documents to ZeroCMS...\n')
     if skip:
         log('skipping indexing of %d object(s)...\n' % skip)
     real = timer()          # real time
     cpu = timer(clock)      # cpu time
     processed = 0
     count = 0
     for path, obj in findObjects(self.context):
         if not indexable(obj):
             continue
         if getOwnIndexMethod(obj, 'indexObject') is not None:
             log('skipping indexing of %r via private method.\n' % obj)
             continue
         # `count` tracks eligible objects so that `skip` only ever
         # counts objects that would otherwise have been indexed.
         count += 1
         if count <= skip:
             continue
         indexProcessor.index(obj)
         processed += 1
     # Release ZODB cache entries accumulated during the traversal.
     zodb_conn.cacheGC()
     log('All documents exported to ZeroCMS.\n')
     msg = 'processed %d items in %s (%s cpu time).'
     msg = msg % (processed, real.next(), cpu.next())
     log(msg)
     logger.info(msg)
Esempio n. 2
0
    def reindex(self, batch=1000, skip=0):
        """Rebuild the Solr index from all indexable objects below the context.

        Documents are collected in a dict keyed by the schema's unique key
        and sent to Solr at every checkpoint (every ``batch`` processed
        items) and once more at the end, followed by a commit.  The first
        ``skip`` eligible objects are passed over.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        # wall-clock total, wall-clock per batch, and cpu total
        real = timer()
        lap = timer()
        cpu = timer(clock)
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        # pending documents: unique key -> (boost values, document data)
        updates = {}
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            # send everything collected so far, then flush and free memory
            for boosts, doc in updates.values():
                conn.add(boost_values=boosts, **doc)
            updates.clear()
            template = ('intermediate commit (%d items processed, '
                        'last batch in %s)...\n')
            msg = template % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not indexable(obj):
                continue
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if missing:
                log('missing data, skipping indexing of %r.\n' % obj)
                continue
            value = data.get(key, None)
            if value is None:
                # no unique key, nothing to identify the document by
                continue
            updates[value] = (boost_values(obj, data), data)
            processed += 1
            cpi.next()
        # send the final, possibly partial batch
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        summary = 'processed %d items in %s (%s cpu time).'
        summary = summary % (processed, real.next(), cpu.next())
        log(summary)
        logger.info(summary)
Esempio n. 3
0
    def reindex(self, batch=1000, skip=0):
        """Find all indexable objects below the context and (re)index them.

        Pending documents are buffered per unique key and written to Solr
        by ``checkPoint``, which runs after every ``batch`` processed items
        and once after the traversal.  ``skip`` eligible objects are passed
        over before indexing begins.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        real = timer()  # elapsed real time
        lap = timer()  # elapsed real time since the last checkpoint
        cpu = timer(clock)  # elapsed cpu time
        processed = 0
        unique_key = manager.getSchema().uniqueKey
        pending = {}  # unique key value -> (boost values, data)

        @notimeout
        def flush():
            return conn.flush()

        def checkPoint():
            for boosts, fields in pending.values():
                conn.add(boost_values=boosts, **fields)
            pending.clear()
            msg = "intermediate commit (%d items processed, last batch in %s)...\n" % (
                processed,
                lap.next(),
            )
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        eligible = 0
        for _path, obj in findObjects(self.context):
            if not indexable(obj):
                continue
            if getOwnIndexMethod(obj, "indexObject") is not None:
                log("skipping indexing of %r via private method.\n" % obj)
                continue
            eligible += 1
            if eligible <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if missing:
                log("missing data, skipping indexing of %r.\n" % obj)
            else:
                uid = data.get(unique_key, None)
                if uid is not None:
                    pending[uid] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)." % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Esempio n. 4
0
    def index(self, obj, attributes=None):
        """Index `obj` in Solr, restricted to `attributes` when given.

        With `attributes` set to `None` all fields are indexed.  Otherwise
        only the attributes that are also part of the Solr schema are used;
        the schema's uniqueKey is always added to them because it is
        required to identify the document when doing atomic updates.  The
        attributes are passed on to self.getData() so that only the
        necessary fields are computed.
        """
        conn = self.getConnection()
        if conn is None or not indexable(obj):
            return

        schema = self.manager.getSchema()
        if schema is None:
            logger.warning(
                'unable to fetch schema, skipping indexing of %r', obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            logger.warning(
                'schema is missing unique key, skipping indexing of %r', obj)
            return

        if attributes is not None:
            # only attributes known to the schema can be sent to Solr
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
            if uniqueKey not in attributes:
                attributes.add(uniqueKey)

        data, missing = self.getData(obj, attributes=attributes)
        if not data:
            # don't index with no data...
            return
        prepareData(data)

        if data.get(uniqueKey, None) is None or missing:
            return
        config = getUtility(ISolrConnectionConfig)
        if config.commit_within:
            data['commitWithin'] = config.commit_within
        try:
            logger.debug('indexing %r (%r)', obj, data)
            conn.add(boost_values=boost_values(obj, data), **data)
        except (SolrException, error):
            logger.exception('exception during indexing %r', obj)
Esempio n. 5
0
    def index(self, obj, attributes=None):
        """Send `obj` to Solr, indexing all fields or only `attributes`.

        When `attributes` is given it is narrowed to the names present in
        the Solr schema and the schema's uniqueKey is added, since atomic
        updates need it to identify the document.  The narrowed set is
        handed to self.getData() so only those fields are collected.
        Nothing is sent when the connection, schema, unique key or data is
        unavailable, or when required data is reported missing.
        """
        connection = self.getConnection()
        if connection is None:
            return
        if not indexable(obj):
            return

        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return

        unique_key = schema.get('uniqueKey', None)
        if unique_key is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return

        wanted = attributes
        if wanted is not None:
            wanted = set(schema.keys()) & set(wanted)
            if not wanted:
                return
            # Adding to a set is a no-op when the key is already present.
            wanted.add(unique_key)

        data, missing = self.getData(obj, attributes=wanted)
        if not data:
            return          # don't index with no data...
        prepareData(data)

        has_key = data.get(unique_key, None) is not None
        if has_key and not missing:
            config = getUtility(ISolrConnectionConfig)
            commit_within = config.commit_within
            if commit_within:
                data['commitWithin'] = commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                boosts = boost_values(obj, data)
                connection.add(boost_values=boosts, **data)
            except (SolrException, error):
                logger.exception('exception during indexing %r', obj)
Esempio n. 6
0
    def sync(self, batch=1000):
        """Sync the Solr index with the portal catalog.

        Records contained in the catalog but not in Solr will be indexed,
        records not contained in the catalog will be removed from Solr, and
        records present on both sides are reindexed when their "modified"
        values differ.

        :param batch: number of processed items between intermediate
            checkpoints (handed to ``checkpointIterator``).
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        # Reuse the manager fetched above instead of a second utility lookup.
        key = manager.getSchema().uniqueKey
        zodb_conn = self.context._p_jar
        catalog = getToolByName(self.context, "portal_catalog")
        getIndex = catalog._catalog.getIndex
        modified_index = getIndex("modified")
        uid_index = getIndex(key)
        log = self.mklog()
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        # get Solr status
        query = "+%s:[* TO *]" % key
        response = conn.search(q=query, rows=MAX_ROWS, fl="%s modified" % key)
        # avoid creating DateTime instances
        simple_unmarshallers = unmarshallers.copy()
        simple_unmarshallers["date"] = parse_date_as_datetime
        try:
            flares = SolrResponse(response, simple_unmarshallers)
        finally:
            # Close the response even when parsing it fails.
            response.close()
        solr_results = {}
        solr_uids = set()

        def _utc_convert(value):
            # Fold year, month, day, hour and minute of the UTC time tuple
            # into one integer (minute resolution).
            # NOTE(review): presumably mirrors the integer format kept in the
            # catalog's "modified" index -- confirm.
            t_tup = value.utctimetuple()
            return (((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 + t_tup[3]) * 60 + t_tup[4]

        for flare in flares:
            uid = flare[key]
            solr_uids.add(uid)
            solr_results[uid] = _utc_convert(flare["modified"])
        # get catalog status
        cat_results = {}
        cat_uids = set()
        for uid, rid in uid_index._index.items():
            cat_uids.add(uid)
            cat_results[uid] = rid
        # differences
        index = cat_uids.difference(solr_uids)
        # `solr_uids` is narrowed in place to the uids unknown to the catalog
        solr_uids.difference_update(cat_uids)
        unindex = solr_uids
        processed = 0
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            msg = "intermediate commit (%d items processed, " "last batch in %s)...\n" % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        # Look up objects
        uid_rid_get = cat_results.get
        rid_path_get = catalog._catalog.paths.get
        catalog_traverse = catalog.unrestrictedTraverse

        def lookup(
            uid, rid=None, uid_rid_get=uid_rid_get, rid_path_get=rid_path_get, catalog_traverse=catalog_traverse
        ):
            # Resolve a uid (or a known rid) to the content object; returns
            # None when no rid, no path or no traversable object exists.
            if rid is None:
                rid = uid_rid_get(uid)
            if not rid:
                return None
            if not isinstance(rid, int):
                # the index may hold a collection of rids; use the first
                rid = tuple(rid)[0]
            path = rid_path_get(rid)
            if not path:
                return None
            try:
                obj = catalog_traverse(path)
            except AttributeError:
                return None
            return obj

        log('processing %d "unindex" operations next...\n' % len(unindex))
        op = notimeout(lambda uid: conn.delete(id=uid))
        for uid in unindex:
            obj = lookup(uid)
            if obj is None:
                op(uid)
                processed += 1
                cpi.next()
            else:
                log("not unindexing existing object %r.\n" % uid)
        log('processing %d "index" operations next...\n' % len(index))
        op = notimeout(lambda obj: proc.index(obj))
        for uid in index:
            obj = lookup(uid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log("not indexing unindexable object %r.\n" % uid)
            if obj is not None:
                obj._p_deactivate()
        log('processing "reindex" operations next...\n')
        op = notimeout(lambda obj: proc.reindex(obj))
        cat_mod_get = modified_index._unindex.get
        solr_mod_get = solr_results.get
        # uids already handled by the two passes above
        done = unindex.union(index)
        for uid, rid in cat_results.items():
            if uid in done:
                continue
            if isinstance(rid, IITreeSet):
                rid = rid.keys()[0]
            if cat_mod_get(rid) != solr_mod_get(uid):
                obj = lookup(uid, rid=rid)
                if indexable(obj):
                    op(obj)
                    processed += 1
                    cpi.next()
                else:
                    log("not reindexing unindexable object %r.\n" % uid)
                if obj is not None:
                    obj._p_deactivate()
        conn.commit()
        log("solr index synced.\n")
        msg = "processed %d object(s) in %s (%s cpu time)."
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Esempio n. 7
0
    def sync(self, batch=1000):
        """Sync the Solr index with the portal catalog.

        Three passes are made: uids only known to Solr are deleted there
        (unless the object can still be looked up), uids only known to the
        catalog are indexed, and uids known to both are reindexed when the
        catalog's 'modified' value differs from the one stored in Solr.

        :param batch: number of processed items between intermediate
            checkpoints (handed to ``checkpointIterator``).
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        # The manager is already at hand; no need to query the utility again.
        key = manager.getSchema().uniqueKey
        zodb_conn = self.context._p_jar
        catalog = getToolByName(self.context, 'portal_catalog')
        getIndex = catalog._catalog.getIndex
        modified_index = getIndex('modified')
        uid_index = getIndex(key)
        log = self.mklog()
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        # get Solr status
        query = '+%s:[* TO *]' % key
        response = conn.search(q=query, rows=MAX_ROWS, fl='%s modified' % key)
        # avoid creating DateTime instances
        simple_unmarshallers = unmarshallers.copy()
        simple_unmarshallers['date'] = parse_date_as_datetime
        try:
            flares = SolrResponse(response, simple_unmarshallers)
        finally:
            # Make sure the response is released even if parsing raises.
            response.close()
        solr_results = {}
        solr_uids = set()

        def _utc_convert(value):
            # Encode the UTC year/month/day/hour/minute as a single integer.
            # NOTE(review): assumed to match the representation used by the
            # catalog's 'modified' index -- confirm.
            t_tup = value.utctimetuple()
            return (((
                (t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 + t_tup[3]) *
                    60 + t_tup[4])

        for flare in flares:
            uid = flare[key]
            solr_uids.add(uid)
            solr_results[uid] = _utc_convert(flare['modified'])
        # get catalog status
        cat_results = {}
        cat_uids = set()
        for uid, rid in uid_index._index.items():
            cat_uids.add(uid)
            cat_results[uid] = rid
        # differences
        index = cat_uids.difference(solr_uids)
        # after this in-place update `solr_uids` only holds uids that the
        # catalog does not know about
        solr_uids.difference_update(cat_uids)
        unindex = solr_uids
        processed = 0
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        # Look up objects
        uid_rid_get = cat_results.get
        rid_path_get = catalog._catalog.paths.get
        catalog_traverse = catalog.unrestrictedTraverse

        def lookup(uid,
                   rid=None,
                   uid_rid_get=uid_rid_get,
                   rid_path_get=rid_path_get,
                   catalog_traverse=catalog_traverse):
            # Map a uid (or an already known rid) to its object; None is
            # returned when the rid, the path or the object cannot be found.
            if rid is None:
                rid = uid_rid_get(uid)
            if not rid:
                return None
            if not isinstance(rid, int):
                # a collection of rids was stored; take the first one
                rid = tuple(rid)[0]
            path = rid_path_get(rid)
            if not path:
                return None
            try:
                obj = catalog_traverse(path)
            except AttributeError:
                return None
            return obj

        log('processing %d "unindex" operations next...\n' % len(unindex))
        op = notimeout(lambda uid: conn.delete(id=uid))
        for uid in unindex:
            obj = lookup(uid)
            if obj is None:
                op(uid)
                processed += 1
                cpi.next()
            else:
                log('not unindexing existing object %r.\n' % uid)
        log('processing %d "index" operations next...\n' % len(index))
        op = notimeout(lambda obj: proc.index(obj))
        for uid in index:
            obj = lookup(uid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log('not indexing unindexable object %r.\n' % uid)
            if obj is not None:
                obj._p_deactivate()
        log('processing "reindex" operations next...\n')
        op = notimeout(lambda obj: proc.reindex(obj))
        cat_mod_get = modified_index._unindex.get
        solr_mod_get = solr_results.get
        # everything the previous two passes have dealt with
        done = unindex.union(index)
        for uid, rid in cat_results.items():
            if uid in done:
                continue
            if isinstance(rid, IITreeSet):
                rid = rid.keys()[0]
            if cat_mod_get(rid) != solr_mod_get(uid):
                obj = lookup(uid, rid=rid)
                if indexable(obj):
                    op(obj)
                    processed += 1
                    cpi.next()
                else:
                    log('not reindexing unindexable object %r.\n' % uid)
                if obj is not None:
                    obj._p_deactivate()
        conn.commit()
        log('solr index synced.\n')
        msg = 'processed %d object(s) in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Esempio n. 8
0
def solr_dump_catalog(app, args):
    """Dumps the catalog and metadata contents into a nested directory
    structure full of pickles containing the information in dict format.
    These can be updated by later re-runs and used to import the data via the
    `update_solr` command. You can optionally specify the id of the Plone site
    as the first command line argument.

    The dump happens in three passes: index values for every catalog index
    that is also part of the Solr schema, then the catalog metadata records,
    and finally a full site traversal that records the position of each
    object for the Gopip indexes collected during the first pass.
    """
    _enable_log()
    db = app._p_jar.db()
    from Testing import makerequest
    root = makerequest.makerequest(app)
    site = _get_site(root, args)
    data_dir = _data_dir(site.getId())
    _make_dir(data_dir)

    catalog = site.portal_catalog
    _catalog = catalog._catalog
    catalog_length = len(catalog)
    uids_get = _catalog.uids.get

    conn = _solr_connection()
    try:
        schema = conn.getSchema()
        wanted = set(schema.keys())
    finally:
        # The connection is only needed for the schema; release it even
        # when fetching or reading the schema fails.
        conn.close()
    # We need the data from path
    wanted.add('path')

    logger.info('Process %s catalog items', catalog_length)

    from collective.indexing.indexer import getOwnIndexMethod
    from collective.solr.indexer import indexable
    from collective.solr.utils import findObjects
    from plone.app.folder.nogopip import GopipIndex
    from Products.PluginIndexes.DateIndex.DateIndex import DateIndex
    from Products.PluginIndexes.DateRangeIndex import DateRangeIndex
    from Products.ZCTextIndex import WidCode
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex

    indexes = catalog.indexes()
    indexes.sort()

    gopip_indexes = set()
    for indexname in indexes:
        if indexname in _catalog.schema:
            # There's no need to get metadata from the indexes
            continue
        if indexname not in wanted:
            # skip indexes not present in the Solr schema
            continue
        logger.info('Dumping index: %s', indexname)
        index = _catalog.getIndex(indexname)
        if isinstance(index, DateRangeIndex.DateRangeIndex):
            # Solr cannot deal with range indexes directly
            continue
        if isinstance(index, ZCTextIndex):
            # Turn the stored word ids back into a space separated text.
            get_word = index.getLexicon().get_word
            wid_decode = WidCode.decode
            batch = 0
            for i, (uid, value) in enumerate(index.index._docwords.items()):
                batch = _log_batch(db, batch, i)
                words = ' '.join([get_word(w) for w in wid_decode(value)])
                _dump(data_dir, uid, {indexname: words})
        elif isinstance(index, GopipIndex):
            # happens last as it needs a full site traversal
            gopip_indexes.add(indexname)
            continue
        elif not hasattr(index, '_unindex'):
            logger.warning(
                "Unsupported index '%s' without an _unindex.", indexname)
        else:
            date_index = isinstance(index, DateIndex)
            batch = 0
            for i, (uid, value) in enumerate(index._unindex.iteritems()):
                batch = _log_batch(db, batch, i)
                value = _convert_value(indexname, value, date_index)
                if value:
                    _dump(data_dir, uid, {indexname: value})

    # dump metadata
    logger.info('Dumping metadata records')
    batch = 0
    for i, uid in enumerate(_catalog.paths.iterkeys()):
        batch = _log_batch(db, batch, i)
        values = {}
        for k, v in _catalog.getMetadataForRID(uid).iteritems():
            definition = schema.get(k)
            if not definition:
                # metadata column without a matching Solr field
                continue
            class_ = definition['class_']
            date_index = class_ == 'solr.TrieDateField'
            value = _convert_value(k, v, date_index)
            if value is not None:
                values[k] = value
            elif class_ == 'solr.TextField':
                # text fields get an empty string instead of being left out
                values[k] = ''
        _dump(data_dir, uid, values)

    # deal with GopipIndexes
    batch = 0
    logger.info('Traversing site to dump Gopip index information')
    for i, (path, obj) in enumerate(findObjects(site)):
        batch = _log_batch(db, batch, i)
        if not indexable(obj):
            continue
        if getOwnIndexMethod(obj, 'indexObject') is not None:
            continue
        parent = aq_parent(obj)
        uid = uids_get('/'.join(obj.getPhysicalPath()), None)
        if uid is None:
            continue
        # Objects whose parent cannot report a position are dumped with 0.
        if hasattr(aq_base(parent), 'getObjectPosition'):
            pos = parent.getObjectPosition(path.split('/')[-1])
        else:
            pos = 0
        # every Gopip index receives the same position value
        _dump(data_dir, uid, dict.fromkeys(gopip_indexes, pos))
        if not getattr(aq_base(obj), 'isPrincipiaFolderish', False):
            # Remove non-folders from the cache immediately as we no longer
            # need them
            obj._p_deactivate()
Esempio n. 9
0
    def reindex(self, batch=1000, skip=0, idxs=None):
        """Find all indexable objects below the context and (re)index them.

        :param batch: number of processed items between intermediate
            checkpoints (handed to ``checkpointIterator``).
        :param skip: number of eligible objects to pass over before
            indexing starts.
        :param idxs: optional iterable of index names.  When non-empty only
            these fields (plus the schema's unique key) are requested from
            the processor and objects are sent even if data is reported
            missing (atomic update mode).  The passed-in sequence is not
            modified.
        """
        atomic = bool(idxs)
        manager = queryUtility(ISolrConnectionManager)
        proc = FtwSolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # unique key -> (boost values, data)
        flush = notimeout(lambda: conn.flush())

        # The attribute list is the same for every object, so build it once.
        # It is a copy so that the caller's `idxs` is left untouched.
        attributes = None
        if atomic:
            attributes = list(idxs)
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            if key not in attributes:
                attributes.append(key)

        def checkPoint():
            for boosts, data in updates.values():
                # Only update specified fields by using atomic updates
                conn.add(boost_values=boosts, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if not indexable(obj):
                continue
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue

            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)

            if missing and not atomic:
                log('missing data, skipping indexing of %r.\n' % obj)
                continue
            value = data.get(key, None)
            if value is not None:
                updates[value] = (boost_values(obj, data), data)
                processed += 1
                cpi.next()
        # send whatever is left from the last, partial batch
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)