Example #1
 def unindex(self, obj):
     conn = self.getConnection()
     if conn is not None:
         schema = self.manager.getSchema()
         if schema is None:
             msg = "unable to fetch schema, skipping unindexing of %r"
             logger.warning(msg, obj)
             return
         uniqueKey = schema.get("uniqueKey", None)
         if uniqueKey is None:
             msg = "schema is missing unique key, skipping unindexing of %r"
             logger.warning(msg, obj)
             return
         data, missing = self.getData(obj, attributes=[uniqueKey])
         prepareData(data)
          if uniqueKey not in data:
             msg = "Can not unindex: no unique key for object %r"
             logger.info(msg, obj)
             return
         data_key = data[uniqueKey]
         if data_key is None:
             msg = "Can not unindex: `None` unique key for object %r"
             logger.info(msg, obj)
             return
         try:
             logger.debug("unindexing %r (%r)", obj, data)
             conn.delete(id=data_key)
         except (SolrException, error):
             logger.exception("exception during unindexing %r", obj)
Example #2
    def _reindex(self, obj):
        manager, proc, conn = self._get_man_proc_conn()
        pwo = proc.wrapObject(obj)
        schema = self._getSchema()
        data = {}
        for name, field in schema.items():
            try:
                value = getattr(pwo, name)
                if callable(value):
                    value = value()
            except AttributeError:
                continue
            handler = handlers.get(field.class_, None)
            if handler is not None:
                try:
                    value = handler(value)
                except AttributeError:
                    continue
            elif isinstance(value, (list, tuple)) and not field.multiValued:
                separator = getattr(field, 'separator', ' ')
                value = separator.join(value)
            data[name] = value

        prepareData(data)
        conn.add(**data)
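The `handlers` mapping consulted above dispatches on the Solr field class before falling back to joining multi-values. A minimal sketch of such a registry, assuming hypothetical converter names and a Zope `DateTime` value (neither is taken from the example itself):

    # hypothetical handler registry keyed by Solr field class; each
    # handler converts a Python value into a Solr-friendly representation
    def handle_date(value):
        # assumption: value is a Zope DateTime; render it as ISO 8601
        return value.HTML4()

    def handle_text(value):
        # assumption: coerce arbitrary values to unicode text
        return unicode(value)

    handlers = {
        'solr.DateField': handle_date,
        'solr.TextField': handle_text,
    }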
Example #3
 def index(self, obj, attributes=None):
     conn = self.getConnection()
     if conn is not None and indexable(obj):
         # unfortunately with current versions of solr we need to provide
         # data for _all_ fields during an <add> -- partial updates aren't
         # supported (see https://issues.apache.org/jira/browse/SOLR-139)
         # however, the reindexing can be skipped if none of the given
         # attributes match existing solr indexes...
         schema = self.manager.getSchema()
         if schema is None:
             msg = 'unable to fetch schema, skipping indexing of %r'
             logger.warning(msg, obj)
             return
         uniqueKey = schema.get('uniqueKey', None)
         if uniqueKey is None:
             msg = 'schema is missing unique key, skipping indexing of %r'
             logger.warning(msg, obj)
             return
         if attributes is not None:
             attributes = set(schema.keys()).intersection(attributes)
             if not attributes:
                 return
         data, missing = self.getData(obj)
         if not data:
             return          # don't index with no data...
         prepareData(data)
         if data.get(uniqueKey, None) is not None and not missing:
             config = getUtility(ISolrConnectionConfig)
             if config.commit_within:
                 data['commitWithin'] = config.commit_within
             try:
                 logger.debug('indexing %r (%r)', obj, data)
                 conn.add(boost_values=boost_values(obj, data), **data)
             except (SolrException, error):
                 logger.exception('exception during indexing %r', obj)
Example #4
    def index(self, obj, attributes=None):
        """Index the specified attributes for obj using atomic updates, or all
        of them if `attributes` is `None`.
        Also make sure the `uniqueKey` is part of the attributes, and pass the
        attributes to the self.getData() call to avoid causing Plone to index
        all fields instead of just the necessary ones.
        """
        conn = self.getConnection()
        if conn is not None and ICheckIndexable(obj)():
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping indexing of %r'
                logger.warning(msg, obj)
                return

            if attributes is not None:

                if 'path' in attributes:
                    attributes = list(attributes)
                    attributes.extend(['path_string', 'path_parents',
                                       'path_depth'])

                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return

                if uniqueKey not in attributes:
                    # The uniqueKey is required in order to identify the
                    # document when doing atomic updates.
                    attributes.add(uniqueKey)

            data, missing = self.getData(obj, attributes=attributes)
            if not data:
                return          # don't index with no data...
            prepareData(data)
            if data.get(uniqueKey, None) is not None and not missing:
                registry = getUtility(IRegistry)
                config_commit_within = registry['collective.solr.commit_within']   # noqa
                if config_commit_within:
                    data['commitWithin'] = config_commit_within
                try:
                    logger.debug('indexing %r (%r)', obj, data)
                    pt = data.get('portal_type', 'default')
                    logger.debug(
                        'indexing %r with %r adder (%r)', obj, pt, data
                    )

                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)

                    if adder is None:
                        adder = DefaultAdder(obj)
                    adder(conn, boost_values=boost_values(obj, data), **data)
                except (SolrConnectionException, error):
                    logger.exception('exception during indexing %r', obj)
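The commit-within setting here comes from plone.registry rather than the older `ISolrConnectionConfig` utility seen in other examples. In isolation the lookup is plain dictionary-style registry access (a sketch, assuming collective.solr's registry records are installed):

    from plone.registry.interfaces import IRegistry
    from zope.component import getUtility

    # read a collective.solr setting from the plone.registry
    registry = getUtility(IRegistry)
    commit_within = registry['collective.solr.commit_within']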
Example #5
    def reindex(self, batch=1000, skip=0):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # mapping to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for boost_values, data in updates.values():
                conn.add(boost_values=boost_values, **data)
            updates.clear()
            msg = "intermediate commit (%d items processed, " "last batch in %s)...\n" % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if indexable(obj):
                if getOwnIndexMethod(obj, "indexObject") is not None:
                    log("skipping indexing of %r via private method.\n" % obj)
                    continue
                count += 1
                if count <= skip:
                    continue
                data, missing = proc.getData(obj)
                prepareData(data)
                if not missing:
                    value = data.get(key, None)
                    if value is not None:
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log("missing data, skipping indexing of %r.\n" % obj)
        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)."
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
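The `timer` and `checkpointIterator` helpers come from collective.solr's utilities; reconstructed from how they are called here (a generator `.next()` per item, plus a final manual `checkPoint()`), they behave roughly like this sketch rather than being quoted verbatim:

    from time import time

    def timer(func=time):
        # yields the time elapsed since creation resp. the previous call
        def elapsed(last):
            while True:
                now = func()
                yield '%.3fs' % (now - last)
                last = now
        return elapsed(func())  # capture the start time eagerly

    def checkpointIterator(function, interval=100):
        # calls `function` after every `interval` items, driven by next()
        count = 0
        while True:
            count += 1
            if count % interval == 0:
                function()
            yield None

With `cpu = timer(clock)` the same generator measures CPU time simply by swapping in a different clock function.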
Example #6
    def cleanup(self, batch=1000):
        """ remove entries from solr that don't have a corresponding Zope
            object or have a different UID than the real object"""
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        log = self.mklog(use_std_log=True)
        log('cleaning up solr index...\n')
        key = manager.getSchema().uniqueKey

        start = 0
        resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
        res = resp.results()
        log('%s items in solr catalog\n' % resp.response.numFound)
        deleted = 0
        reindexed = 0
        while len(res) > 0:
            for flare in res:
                try:
                    ob = PloneFlare(flare).getObject()
                except Exception as err:
                    log('Error getting object, removing: %s (%s)\n' % (
                        flare['path_string'], err))
                    conn.delete(flare[key])
                    deleted += 1
                    continue
                if not IUUIDAware.providedBy(ob):
                    log('Object %s of type %s does not support uuids, skipping.\n' %
                        ('/'.join(ob.getPhysicalPath()), ob.meta_type))
                    continue
                uuid = IUUID(ob)
                if uuid != flare[key]:
                    log('indexed under wrong UID, removing: %s\n' %
                        flare['path_string'])
                    conn.delete(flare[key])
                    deleted += 1
                    realob_res = SolrResponse(conn.search(q='%s:%s' %
                                              (key, uuid))).results()
                    if len(realob_res) == 0:
                        log('no sane entry for last object, reindexing\n')
                        data, missing = proc.getData(ob)
                        prepareData(data)
                        if not missing:
                            boost = boost_values(ob, data)
                            conn.add(boost_values=boost, **data)
                            reindexed += 1
                        else:
                            log('  missing data, cannot index.\n')
            log('handled batch of %d items, committing\n' % len(res))
            conn.commit()
            start += batch
            resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
            res = resp.results()
        msg = 'solr cleanup finished, %s item(s) removed, %s item(s) reindexed\n' % (deleted, reindexed)
        log(msg)
        logger.info(msg)
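One caveat follows from the loop structure above: entries are deleted while the cursor advances via `start += batch`, so the result window shifts underneath the pagination and individual flares can escape a single pass. A pragmatic, hypothetical workaround is to rerun the cleanup until it reports no removals:

    # rerun until the cleanup converges; deleting while paging by `start`
    # can skip entries whose offsets shifted after a delete
    maintenance = site.restrictedTraverse('@@solr-maintenance')  # hypothetical lookup
    maintenance.cleanup(batch=1000)
    maintenance.cleanup(batch=1000)  # a second pass catches skipped entries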
Example #7
def solrSearchResults(request=None, **keywords):
    """ perform a query using solr after translating the passed in
        parameters with portal catalog semantics """
    search = queryUtility(ISearch)
    config = queryUtility(ISolrConnectionConfig)
    if request is None:
        # try to get a request instance, so that flares can be adapted to
        # ploneflares and urls can be converted into absolute ones etc;
        # however, in this case any arguments from the request are ignored
        request = getattr(getSite(), 'REQUEST', None)
        args = deepcopy(keywords)
    elif IHTTPRequest.providedBy(request):
        args = deepcopy(request.form)  # ignore headers and other stuff
        args.update(keywords)       # keywords take precedence
    else:
        assert isinstance(request, dict), request
        args = deepcopy(request)
        args.update(keywords)       # keywords take precedence
        # if request is a dict, we need the real request in order to
        # be able to adapt to plone flares
        request = getattr(getSite(), 'REQUEST', args)
    if 'path' in args and 'navtree' in args['path']:
        raise FallBackException     # we can't handle navtree queries yet
    use_solr = args.get('use_solr', False)  # A special key to force Solr
    if not use_solr and config.required:
        required = set(config.required).intersection(args)
        if required:
            for key in required:
                if not args[key]:
                    raise FallBackException
        else:
            raise FallBackException
    schema = search.getManager().getSchema() or {}
    params = cleanupQueryParameters(extractQueryParameters(args), schema)
    languageFilter(args)
    prepareData(args)
    mangleQuery(args, config, schema)
    query = search.buildQuery(**args)
    if query != {}:
        optimizeQueryParameters(query, params)
        __traceback_info__ = (query, params, args)
        response = search(query, **params)
    else:
        return SolrResponse()
    def wrap(flare):
        """ wrap a flare object with a helper class """
        adapter = queryMultiAdapter((flare, request), IFlare)
        return adapter is not None and adapter or flare
    results = response.results()
    for idx, flare in enumerate(results):
        flare = wrap(flare)
        for missing in set(schema.stored).difference(flare):
            flare[missing] = MV
        results[idx] = wrap(flare)
    padResults(results, **params)           # pad the batch
    return response
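Callers treat `solrSearchResults` like a portal_catalog query; `FallBackException` signals that Solr should not (or cannot) handle the request. A hedged usage sketch, where `portal` stands in for the site root and the index values are illustrative:

    try:
        results = solrSearchResults(SearchableText='plone',
                                    portal_type='Document')
    except FallBackException:
        # let the regular catalog answer queries solr won't handle
        results = portal.portal_catalog(SearchableText='plone',
                                        portal_type='Document')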
Example #8
 def testPrepareData(self):
     data = {'allowedRolesAndUsers': [
         'user:test_user_1_', 'user:portal_owner']}
     prepareData(data)
     self.assertEqual(
         data,
         {
             'allowedRolesAndUsers': [
                 'user$test_user_1_',
                 'user$portal_owner'
             ]
         }
     )
Example #9
    def index(self, obj, attributes=None):
        """Index the specified attributes for obj using atomic updates, or all
        of them if `attributes` is `None`.

        Changes to the original method include making sure the uniqueKey is
        part of the attributes, and passing the attributes to the
        self.getData() call to avoid causing Plone to index all fields instead
        of just the necessary ones.
        """
        conn = self.getConnection()
        if conn is not None and indexable(obj):
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping indexing of %r'
                logger.warning(msg, obj)
                return

            if attributes is not None:
                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return
                if uniqueKey not in attributes:
                    # The uniqueKey is required in order to identify the
                    # document when doing atomic updates.
                    attributes.add(uniqueKey)

            data, missing = self.getData(obj, attributes=attributes)
            if not data:
                return          # don't index with no data...
            prepareData(data)

            if data.get(uniqueKey, None) is not None and not missing:
                config = getUtility(ISolrConnectionConfig)
                if config.commit_within:
                    data['commitWithin'] = config.commit_within
                try:
                    logger.debug('indexing %r (%r)', obj, data)
                    conn.add(boost_values=boost_values(obj, data), **data)
                except (SolrException, error):
                    logger.exception('exception during indexing %r', obj)
Example #10
def search_solr(query, request=None, lang_query=True, **params):
    search = queryUtility(ISearch)

    if lang_query:
        dummy = {}
        languageFilter(dummy)
        prepareData(dummy)  # this replaces '' with 'any'
        langquery = 'Language:(%s)' % ' OR '.join(dummy['Language'])
        query = '(%s) AND %s' % (query, langquery)

    response = search(query, **params)
    if request is None:
        request = getSite().REQUEST
    response.request = request
    results = response.results()
    for idx, flare in enumerate(results):
        results[idx] = PloneFlare(flare, request=request)
    padResults(results, **params)           # pad the batch
    return response
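Unlike `solrSearchResults`, this helper takes a raw Solr query string, so callers can use native Solr syntax directly. A usage sketch (field names and parameters are assumptions, passed through to the search and to `padResults`):

    # raw Solr query syntax plus paging parameters
    response = search_solr('portal_type:Document AND review_state:published',
                           rows=20, start=0)
    for flare in response.results():
        print flare['path_string']  # each result is wrapped as a PloneFlare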
Example #11
    def index(self, obj, attributes=None):
        conn = self.getConnection()
        if conn is not None and ICheckIndexable(obj)():
            # unfortunately with current versions of solr we need to provide
            # data for _all_ fields during an <add> -- partial updates aren't
            # supported (see https://issues.apache.org/jira/browse/SOLR-139)
            # however, the reindexing can be skipped if none of the given
            # attributes match existing solr indexes...
            schema = self.manager.getSchema()
            if schema is None:
                msg = "unable to fetch schema, skipping indexing of %r"
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get("uniqueKey", None)
            if uniqueKey is None:
                msg = "schema is missing unique key, skipping indexing of %r"
                logger.warning(msg, obj)
                return
            if attributes is not None:
                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return
            data, missing = self.getData(obj)
            if not data:
                return  # don't index with no data...
            prepareData(data)
            if data.get(uniqueKey, None) is not None and not missing:
                config = getUtility(ISolrConnectionConfig)
                if config.commit_within:
                    data["commitWithin"] = config.commit_within
                try:
                    logger.debug("indexing %r (%r)", obj, data)
                    pt = data.get("portal_type", "default")
                    logger.debug("indexing %r with %r adder (%r)", obj, pt, data)

                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)

                    if adder is None:
                        adder = DefaultAdder(obj)
                    adder(conn, boost_values=boost_values(obj, data), **data)
                except (SolrException, error):
                    logger.exception("exception during indexing %r", obj)
Example #12
    def unindex(self, obj):
        conn = self.getConnection()
        if conn is not None:
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping unindexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping unindexing of %r'
                logger.warning(msg, obj)
                return

            # remove the PathWrapper, otherwise IndexableObjectWrapper fails
            # to get the UID indexer (for dexterity objects) and the parent 
            # UID is acquired
            if hasattr(obj, 'context'):
                obj = obj.context

            data, missing = self.getData(obj, attributes=[uniqueKey])
            prepareData(data)
            if uniqueKey not in data:
                msg = 'Can not unindex: no unique key for object %r'
                logger.info(msg, obj)
                return
            data_key = data[uniqueKey]
            if data_key is None:
                msg = 'Can not unindex: `None` unique key for object %r'
                logger.info(msg, obj)
                return
            try:
                logger.debug('unindexing %r (%r)', obj, data)
                conn.delete(id=data_key)
            except (SolrException, error):
                logger.exception('exception during unindexing %r', obj)
Example #13
 def testLanguageParameterHandling(self):
     # empty strings are replaced...
     data = {'Language': ['en', '']}
     prepareData(data)
     self.assertEqual(data, {'Language': ['en', 'any']})
     data = {'Language': ''}
     prepareData(data)
     self.assertEqual(data, {'Language': 'any'})
     # for other indices this shouldn't happen...
     data = {'Foo': ['en', '']}
     prepareData(data)
     self.assertEqual(data, {'Foo': ['en', '']})
Example #14
 def testRemoveControlCharacters(self):
     data = {'SearchableText': 'foo\n\tbar\a\f\r'}
     prepareData(data)
     self.assertEqual(data, {'SearchableText': 'foo\n\tbar  \r'})
Example #15
 def testUnicodeSearchableText(self):
     data = {"SearchableText": u"f\xf8\xf8 bar"}
     prepareData(data)
     self.assertEqual(data, {"SearchableText": u"f\xf8\xf8 bar"})
 def testPrepareData(self):
     data = {"allowedRolesAndUsers": ["user:test_user_1_", "user:portal_owner"]}
     prepareData(data)
     self.assertEqual(
         data, {"allowedRolesAndUsers": ["user$test_user_1_", "user$portal_owner"]}
     )
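Taken together, these tests pin down `prepareData`'s observable behaviour: ':' becomes '$' in `allowedRolesAndUsers`, empty `Language` values turn into 'any', and control characters other than tab, newline and carriage return are blanked out of `SearchableText`. A minimal sketch consistent with the tests above (not the canonical collective.solr implementation):

    import re

    # control characters except \t, \n and \r are replaced with a space
    CONTROL = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

    def prepareData(data):
        # normalize catalog-style values in place for solr
        allowed = data.get('allowedRolesAndUsers')
        if allowed is not None:
            data['allowedRolesAndUsers'] = [
                value.replace(':', '$') for value in allowed]
        language = data.get('Language')
        if language == '':
            data['Language'] = 'any'
        elif isinstance(language, (list, tuple)):
            data['Language'] = [value or 'any' for value in language]
        searchable = data.get('SearchableText')
        if searchable is not None:
            data['SearchableText'] = CONTROL.sub(' ', searchable)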
Example #17
    def buildQueryAndParameters(self, default=None, **args):
        """ helper to build a querystring for simple use-cases """
        schema = self.getManager().getSchema() or {}

        params = subtractQueryParameters(args)
        params = cleanupQueryParameters(params, schema)
        config = self.getConfig()

        prepareData(args)
        mangleQuery(args, config, schema)

        logger.debug('building query for "%r", %r', default, args)
        schema = self.getManager().getSchema() or {}
        defaultSearchField = getattr(schema, 'defaultSearchField', None)
        args[None] = default
        query = {}

        for name, value in sorted(args.items()):
            field = schema.get(name or defaultSearchField, None)
            if field is None or not field.indexed:
                logger.info(
                    'dropping unknown search attribute "%s" (%r) '
                    'for query: %r', name, value, args
                )
                continue
            if isinstance(value, bool):
                value = str(value).lower()
            elif not value:     # solr doesn't like empty fields (+foo:"")
                if not name:
                    continue
                logger.info(
                    'empty search term from "%s:%s", aborting buildQuery' % (
                        name,
                        value
                    )
                )
                return {}, params
            elif field.class_ == 'solr.BoolField':
                if not isinstance(value, (tuple, list)):
                    value = [value]
                falses = '0', 'False', MV
                true = lambda v: bool(v) and v not in falses
                value = set(map(true, value))
                if not len(value) == 1:
                    assert len(value) == 2      # just to make sure
                    continue                    # skip when "true or false"
                value = str(value.pop()).lower()
            elif isinstance(value, (tuple, list)):
                # list items should be treated as literals, but
                # nevertheless only get quoted when necessary
                value = '(%s)' % ' OR '.join(map(quote_iterable_item, value))
            elif isinstance(value, set):        # sets are taken literally
                if len(value) == 1:
                    query[name] = ''.join(value)
                else:
                    query[name] = '(%s)' % ' OR '.join(value)
                continue
            elif isinstance(value, basestring):
                if field.class_ == 'solr.TextField':
                    if isWildCard(value):
                        value = prepare_wildcard(value)
                    value = quote(value, textfield=True)
                    # if we have an intra-word hyphen, we need quotes
                    if '\\-' in value or '\\+' in value:
                        if value[0] != '"':
                            value = '"%s"' % value
                else:
                    value = quote(value)
                if not value:   # don't search for empty strings, even quoted
                    continue
            else:
                logger.info(
                    'skipping unsupported value "%r" (%s)', value, name
                )
                continue
            if name is None:
                if value and value[0] not in '+-':
                    value = '+%s' % value
            else:
                value = '+%s:%s' % (name, value)
            query[name] = value
        logger.debug('built query "%s"', query)

        if query:
            optimizeQueryParameters(query, params)
        return query, params
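To make the control flow concrete: with a schema whose indexed `SearchableText` is a `solr.TextField`, a single plain term passes through quoting unchanged and picks up a mandatory `+` prefix. A sketch of the expected outcome, with `search` standing in for the `ISearch` utility:

    query, params = search.buildQueryAndParameters(SearchableText='foo')
    # query is roughly {'SearchableText': '+SearchableText:foo'}; each
    # surviving argument becomes a mandatory clause, while params carries
    # the extracted query parameters (sort, rows, facets and the like)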
Example #18
 def testRemoveControlCharacters(self):
     data = {"SearchableText": "foo\n\tbar\a\f\r"}
     prepareData(data)
     self.assertEqual(data, {"SearchableText": "foo\n\tbar  \r"})
Example #19
 def testUnicodeSearchableText(self):
     data = {'SearchableText': u'f\xf8\xf8 bar'}
     prepareData(data)
     self.assertEqual(data, {'SearchableText': 'f\xc3\xb8\xc3\xb8 bar'})
Example #20
    def reindex(self, batch=1000, skip=0, limit=0):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # mapping to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                adder(conn, boost_values=boost_values, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                count += 1
                if count <= skip:
                    continue
                data, missing = proc.getData(obj)
                prepareData(data)
                if not missing:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)
                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Example #21
    def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
                only_portal_types=None):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # mapping to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                adder(conn, boost_values=my_boost_values, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                data, missing = proc.getData(obj)
                prepareData(data)
                if not missing:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)
                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Example #22
    def reindex(
        self,
        batch=1000,
        skip=0,
        limit=0,
        ignore_portal_types=None,
        only_portal_types=None,
        idxs=[],
        ignore_exceptions=False,
    ):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        if limit:
            log("limiting indexing to %d object(s)...\n" % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # mapping to hold data to be updated

        def flush():
            return conn.commit(soft=True)

        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop("_solr_adder")
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    logger.warning("Error %s @ %s", e, data["path_string"])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = ("intermediate commit (%d items processed, "
                   "last batch in %s)...\n" % (processed, next(lap)))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        if atomic:
            log("indexing only {0} \n".format(idxs))

        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                if getOwnIndexMethod:
                    if getOwnIndexMethod(obj, "indexObject") is not None:
                        log("skipping indexing of %r via private method.\n" %
                            obj)
                        continue

                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)
                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log("indexing %r\n" % obj)

                        pt = data.get("portal_type", "default")
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data["_solr_adder"] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        next(cpi)
                else:
                    log("missing data, skipping indexing of %r.\n" % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)."
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
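The per-portal-type `adder` looked up here is a named `ISolrAddHandler` adapter, which lets individual content types customize how their data reaches Solr. The fallback behaves approximately like this bare-bones sketch (the real `DefaultAdder` in collective.solr may differ in detail):

    class DefaultAdder(object):
        # hand the prepared data straight to the solr connection

        def __init__(self, context):
            self.context = context

        def __call__(self, conn, boost_values=None, **data):
            conn.add(boost_values=boost_values, **data)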
Example #23
 def testUnicodeSearchableText(self):
     data = {'SearchableText': u'f\xf8\xf8 bar'}
     prepareData(data)
     self.assertEqual(data, {'SearchableText': 'f\xc3\xb8\xc3\xb8 bar'})
Example #24
 def testRemoveControlCharacters(self):
     data = {'SearchableText': 'foo\n\tbar\a\f\r'}
     prepareData(data)
     self.assertEqual(data, {'SearchableText': 'foo\n\tbar  \r'})
Example #25
    def index(self, obj, attributes=None):
        """Index the specified attributes for obj using atomic updates, or all
        of them if `attributes` is `None`.
        Also make sure the `uniqueKey` is part of the attributes, and pass the
        attributes to the self.getData() call to avoid causing Plone to index
        all fields instead of just the necessary ones.
        """
        conn = self.getConnection()
        if conn is not None and ICheckIndexable(obj)():
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping indexing of %r'
                logger.warning(msg, obj)
                return

            if attributes is not None:

                if 'path' in attributes:
                    attributes = list(attributes)
                    attributes.extend(
                        ['path_string', 'path_parents', 'path_depth'])

                if attributes:
                    attributes = set(schema.keys()).intersection(attributes)
                    if not attributes:
                        return
                else:
                    attributes = schema.keys()

                if uniqueKey not in attributes:
                    # The uniqueKey is required in order to identify the
                    # document when doing atomic updates.
                    attributes.add(uniqueKey)

            data, missing = self.getData(obj, attributes=attributes)
            if not data:
                return  # don't index with no data...
            prepareData(data)
            if data.get(uniqueKey, None) is not None and not missing:
                registry = getUtility(IRegistry)
                config_commit_within = registry[
                    'collective.solr.commit_within']  # noqa
                if config_commit_within:
                    data['commitWithin'] = config_commit_within
                try:
                    logger.debug('indexing %r (%r)', obj, data)
                    pt = data.get('portal_type', 'default')
                    logger.debug('indexing %r with %r adder (%r)', obj, pt,
                                 data)

                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)

                    if adder is None:
                        adder = DefaultAdder(obj)
                    adder(conn, boost_values=boost_values(obj, data), **data)
                except (SolrConnectionException, error):
                    logger.exception('exception during indexing %r', obj)
Example #26
    def buildQueryAndParameters(self, default=None, **args):
        """ helper to build a querystring for simple use-cases """
        schema = self.getManager().getSchema() or {}

        params = subtractQueryParameters(args)
        params = cleanupQueryParameters(params, schema)
        config = self.getConfig()

        languageFilter(args)
        prepareData(args)
        mangleQuery(args, config, schema)

        logger.debug('building query for "%r", %r', default, args)
        schema = self.getManager().getSchema() or {}
        defaultSearchField = getattr(schema, 'defaultSearchField', None)
        args[None] = default
        query = {}

        for name, value in sorted(args.items()):
            field = schema.get(name or defaultSearchField, None)
            if field is None or not field.indexed:
                logger.info(
                    'dropping unknown search attribute "%s" (%r) '
                    'for query: %r', name, value, args)
                continue
            if isinstance(value, bool):
                value = str(value).lower()
            elif not value:  # solr doesn't like empty fields (+foo:"")
                if not name:
                    continue
                logger.info(
                    'empty search term from "%s:%s", aborting buildQuery' %
                    (name, value))
                return {}, params
            elif field.class_ == 'solr.BoolField':
                if not isinstance(value, (tuple, list)):
                    value = [value]
                falses = '0', 'False', MV
                true = lambda v: bool(v) and v not in falses
                value = set(map(true, value))
                if not len(value) == 1:
                    assert len(value) == 2  # just to make sure
                    continue  # skip when "true or false"
                value = str(value.pop()).lower()
            elif isinstance(value, (tuple, list)):
                # list items should be treated as literals, but
                # nevertheless only get quoted when necessary
                value = '(%s)' % ' OR '.join(map(quote_iterable_item, value))
            elif isinstance(value, set):  # sets are taken literally
                if len(value) == 1:
                    query[name] = ''.join(value)
                else:
                    query[name] = '(%s)' % ' OR '.join(value)
                continue
            elif isinstance(value, basestring):
                if field.class_ == 'solr.TextField':
                    if isWildCard(value):
                        value = prepare_wildcard(value)
                    value = quote(value, textfield=True)
                    # if we have an intra-word hyphen, we need quotes
                    if '\\-' in value or '\\+' in value:
                        if value[0] != '"':
                            value = '"%s"' % value
                else:
                    value = quote(value)
                if not value:  # don't search for empty strings, even quoted
                    continue
            else:
                logger.info('skipping unsupported value "%r" (%s)', value,
                            name)
                continue
            if name is None:
                if value and value[0] not in '+-':
                    value = '+%s' % value
            else:
                value = '+%s:%s' % (name, value)
            query[name] = value
        logger.debug('built query "%s"', query)

        if query:
            optimizeQueryParameters(query, params)
        return query, params
Example #27
    def cleanup(self, batch=1000):
        """ remove entries from solr that don't have a corresponding Zope
            object or have a different UID than the real object"""
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        log = self.mklog(use_std_log=True)
        log('cleaning up solr index...\n')
        key = manager.getSchema().uniqueKey

        start = 0
        resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
        res = resp.results()
        log('%s items in solr catalog\n' % resp.response.numFound)
        deleted = 0
        reindexed = 0
        while len(res) > 0:
            for flare in res:
                try:
                    ob = PloneFlare(flare).getObject()
                except Exception as err:
                    log('Error getting object, removing: %s (%s)\n' % (
                        flare['path_string'], err))
                    conn.delete(flare[key])
                    deleted += 1
                    continue
                if ob is None:
                    log('Object not found, removing: %s\n' % (
                        flare['path_string']))
                    conn.delete(flare[key])
                    deleted += 1
                    continue
                if not IUUIDAware.providedBy(ob):
                    no_skipping_msg = 'Object %s of type %s does not ' + \
                        'support uuids, skipping.\n'
                    log(
                        no_skipping_msg %
                        ('/'.join(ob.getPhysicalPath()), ob.meta_type)
                    )
                    continue
                uuid = IUUID(ob)
                if uuid != flare[key]:
                    log('indexed under wrong UID, removing: %s\n' %
                        flare['path_string'])
                    conn.delete(flare[key])
                    deleted += 1
                    realob_res = SolrResponse(conn.search(q='%s:%s' %
                                              (key, uuid))).results()
                    if len(realob_res) == 0:
                        log('no sane entry for last object, reindexing\n')
                        data, missing = proc.getData(ob)
                        prepareData(data)
                        if not missing:
                            boost = boost_values(ob, data)
                            conn.add(boost_values=boost, **data)
                            reindexed += 1
                        else:
                            log('  missing data, cannot index.\n')
            log('handled batch of %d items, committing\n' % len(res))
            conn.commit()
            start += batch
            resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
            res = resp.results()
        finished_msg = 'solr cleanup finished, %s item(s) removed, ' + \
            '%s item(s) reindexed\n'
        msg = finished_msg % (deleted, reindexed)
        log(msg)
        logger.info(msg)
Example #28
    def reindex(self, batch=1000, skip=0, idxs=[]):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """
        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = FtwSolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # mapping to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for boost_values, data in updates.values():
                # Only update specified fields by using atomic updates
                conn.add(boost_values=boost_values, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if indexable(obj):
                if getOwnIndexMethod(obj, 'indexObject') is not None:
                    log('skipping indexing of %r via private method.\n' % obj)
                    continue
                count += 1
                if count <= skip:
                    continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)

                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Example #29
class SolrMaintenanceView(BrowserView):
    """ helper view for indexing all portal content in Solr """
    implements(ISolrMaintenanceView)

    def mklog(self, use_std_log=False):
        """ helper to prepend a time stamp to the output """
        write = self.request.RESPONSE.write

        def log(msg, timestamp=True):
            if timestamp:
                msg = strftime('%Y/%m/%d-%H:%M:%S ') + msg
            write(msg)
            if use_std_log:
                logger.info(msg)
        return log

    def optimize(self):
        """ optimize solr indexes """
        manager = queryUtility(ISolrConnectionManager)
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.commit(optimize=True)
        return 'solr indexes optimized.'

    def clear(self):
        """ clear all data from solr, i.e. delete all indexed objects """
        manager = queryUtility(ISolrConnectionManager)
        uniqueKey = manager.getSchema().uniqueKey
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.deleteByQuery('%s:[* TO *]' % uniqueKey)
        conn.commit()
        return 'solr index cleared.'

    def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
                only_portal_types=None, idxs=[], ignore_exceptions=False):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # mapping to hold data to be updated
        flush = lambda: conn.commit(soft=True)
        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    logger.warning('Error %s @ %s', e, data['path_string'])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        if atomic:
            log('indexing only {0} \n'.format(idxs))

        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():

                if getOwnIndexMethod(obj, 'indexObject') is not None:
                    log('skipping indexing of %r via private method.\n' % obj)
                    continue

                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)
                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)

                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
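Since this is a browser view, the maintenance operations are usually triggered over HTTP against the Plone site root; the URLs below follow the customary collective.solr view name and are meant as an illustration, not a guaranteed interface:

    # clear the index, then rebuild it in batches of 500, skipping nothing
    # http://localhost:8080/plone/@@solr-maintenance/clear
    # http://localhost:8080/plone/@@solr-maintenance/reindex?batch=500&skip=0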
Example #30
    def reindex(self, batch=1000, skip=0, idxs=[]):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """
        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = FtwSolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # mapping to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for boost_values, data in updates.values():
                # Only update specified fields by using atomic updates
                conn.add(boost_values=boost_values, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if indexable(obj):
                if getOwnIndexMethod(obj, 'indexObject') is not None:
                    log('skipping indexing of %r via private method.\n' % obj)
                    continue
                count += 1
                if count <= skip:
                    continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)

                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
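Across all of these variants the indexing core is the same four-step pattern: extract the data, normalize it with `prepareData`, check the unique key, and send the document. Stripped of logging, batching and error handling, it reduces to roughly the following (import paths follow collective.solr's layout; treat this as a schematic sketch, not library code):

    from zope.component import queryUtility
    from collective.solr.interfaces import ISolrConnectionManager
    from collective.solr.indexer import SolrIndexProcessor, boost_values
    from collective.solr.utils import prepareData

    def index_one(obj):
        # schematic summary of the indexing pattern shared by the examples
        manager = queryUtility(ISolrConnectionManager)
        conn = manager.getConnection()
        key = manager.getSchema().uniqueKey      # identifies documents in solr
        proc = SolrIndexProcessor(manager)
        data, missing = proc.getData(obj)        # 1. extract field values
        prepareData(data)                        # 2. normalize them for solr
        if not missing and data.get(key) is not None:
            conn.add(boost_values=boost_values(obj, data), **data)  # 3. send
            conn.commit()                        # 4. make the change visible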