def unindex(self, obj):
    conn = self.getConnection()
    if conn is not None:
        schema = self.manager.getSchema()
        if schema is None:
            msg = "unable to fetch schema, skipping unindexing of %r"
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get("uniqueKey", None)
        if uniqueKey is None:
            msg = "schema is missing unique key, skipping unindexing of %r"
            logger.warning(msg, obj)
            return
        data, missing = self.getData(obj, attributes=[uniqueKey])
        prepareData(data)
        if uniqueKey not in data:
            msg = "Cannot unindex: no unique key for object %r"
            logger.info(msg, obj)
            return
        data_key = data[uniqueKey]
        if data_key is None:
            msg = "Cannot unindex: `None` unique key for object %r"
            logger.info(msg, obj)
            return
        try:
            logger.debug("unindexing %r (%r)", obj, data)
            conn.delete(id=data_key)
        except (SolrException, error):
            logger.exception("exception during unindexing %r", obj)
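# Hedged usage sketch for unindex() above: the processor is assumed to be
# the collective.solr SolrIndexProcessor and `obj` any previously indexed
# content object. Only the uniqueKey is fetched, since deleting by id is
# all Solr needs.
from zope.component import queryUtility
from collective.solr.interfaces import ISolrConnectionManager
from collective.solr.indexer import SolrIndexProcessor

processor = SolrIndexProcessor(queryUtility(ISolrConnectionManager))
processor.unindex(obj)  # `obj` is an assumption: some indexed content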
def _reindex(self, obj):
    manager, proc, conn = self._get_man_proc_conn()
    pwo = proc.wrapObject(obj)
    schema = self._getSchema()
    data = {}
    for name, field in schema.items():
        try:
            value = getattr(pwo, name)
            if callable(value):
                value = value()
        except AttributeError:
            continue
        handler = handlers.get(field.class_, None)
        if handler is not None:
            try:
                value = handler(value)
            except AttributeError:
                continue
        elif isinstance(value, (list, tuple)) and not field.multiValued:
            separator = getattr(field, 'separator', ' ')
            value = separator.join(value)
        data[name] = value
    prepareData(data)
    conn.add(**data)
def index(self, obj, attributes=None):
    conn = self.getConnection()
    if conn is not None and indexable(obj):
        # unfortunately with current versions of solr we need to provide
        # data for _all_ fields during an <add> -- partial updates aren't
        # supported (see https://issues.apache.org/jira/browse/SOLR-139)
        # however, the reindexing can be skipped if none of the given
        # attributes match existing solr indexes...
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        if attributes is not None:
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
        data, missing = self.getData(obj)
        if not data:
            return  # don't index with no data...
        prepareData(data)
        if data.get(uniqueKey, None) is not None and not missing:
            config = getUtility(ISolrConnectionConfig)
            if config.commit_within:
                data['commitWithin'] = config.commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                conn.add(boost_values=boost_values(obj, data), **data)
            except (SolrException, error):
                logger.exception('exception during indexing %r', obj)
def index(self, obj, attributes=None):
    """Index the specified attributes for obj using atomic updates,
    or all of them if `attributes` is `None`.

    Also make sure the `uniqueKey` is part of attributes, and pass
    the attributes to the self.getData() call to avoid causing Plone
    to index all fields instead of just the necessary ones.
    """
    conn = self.getConnection()
    if conn is not None and ICheckIndexable(obj)():
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        if attributes is not None:
            if 'path' in attributes:
                attributes = list(attributes)
                attributes.extend(
                    ['path_string', 'path_parents', 'path_depth'])
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
            if uniqueKey not in attributes:
                # The uniqueKey is required in order to identify the
                # document when doing atomic updates.
                attributes.add(uniqueKey)
        data, missing = self.getData(obj, attributes=attributes)
        if not data:
            return  # don't index with no data...
        prepareData(data)
        if data.get(uniqueKey, None) is not None and not missing:
            registry = getUtility(IRegistry)
            config_commit_within = registry['collective.solr.commit_within']
            if config_commit_within:
                data['commitWithin'] = config_commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                pt = data.get('portal_type', 'default')
                logger.debug(
                    'indexing %r with %r adder (%r)', obj, pt, data
                )
                adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                if adder is None:
                    adder = DefaultAdder(obj)
                adder(conn, boost_values=boost_values(obj, data), **data)
            except (SolrConnectionException, error):
                logger.exception('exception during indexing %r', obj)
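# A minimal, hedged sketch of driving the atomic-update index() above from
# a debug session. `portal` and the attribute names are assumptions; in
# collective.solr the processor is normally invoked by the indexing queue
# rather than called by hand.
from zope.component import queryUtility
from collective.solr.interfaces import ISolrConnectionManager
from collective.solr.indexer import SolrIndexProcessor

processor = SolrIndexProcessor(queryUtility(ISolrConnectionManager))
obj = portal['front-page']  # any indexable content object (assumption)
# partial (atomic) update; the uniqueKey is added to the attributes
# automatically so Solr can identify the document:
processor.index(obj, attributes=['Title', 'SearchableText'])
processor.index(obj)  # attributes=None reindexes all schema fields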
def reindex(self, batch=1000, skip=0):
    """ find all contentish objects (meaning all objects derived from
        one of the catalog mixin classes) and (re)index them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        for boost_values, data in updates.values():
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = ("intermediate commit (%d items processed, "
               "last batch in %s)...\n" % (processed, lap.next()))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, "indexObject") is not None:
                log("skipping indexing of %r via private method.\n" % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log("missing data, skipping indexing of %r.\n" % obj)
    checkPoint()
    conn.commit()
    log("solr index rebuilt.\n")
    msg = "processed %d items in %s (%s cpu time)."
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
def cleanup(self, batch=1000):
    """ remove entries from solr that don't have a corresponding Zope
        object or have a different UID than the real object """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    log = self.mklog(use_std_log=True)
    log('cleaning up solr index...\n')
    key = manager.getSchema().uniqueKey
    start = 0
    resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
    res = resp.results()
    log('%s items in solr catalog\n' % resp.response.numFound)
    deleted = 0
    reindexed = 0
    while len(res) > 0:
        for flare in res:
            try:
                ob = PloneFlare(flare).getObject()
            except Exception as err:
                log('Error getting object, removing: %s (%s)\n' % (
                    flare['path_string'], err))
                conn.delete(flare[key])
                deleted += 1
                continue
            if not IUUIDAware.providedBy(ob):
                log('Object %s of type %s does not support uuids, '
                    'skipping.\n' % (
                        '/'.join(ob.getPhysicalPath()), ob.meta_type))
                continue
            uuid = IUUID(ob)
            if uuid != flare[key]:
                log('indexed under wrong UID, removing: %s\n' %
                    flare['path_string'])
                conn.delete(flare[key])
                deleted += 1
                realob_res = SolrResponse(conn.search(
                    q='%s:%s' % (key, uuid))).results()
                if len(realob_res) == 0:
                    log('no sane entry for last object, reindexing\n')
                    data, missing = proc.getData(ob)
                    prepareData(data)
                    if not missing:
                        boost = boost_values(ob, data)
                        conn.add(boost_values=boost, **data)
                        reindexed += 1
                    else:
                        log(' missing data, cannot index.\n')
        log('handled batch of %d items, committing\n' % len(res))
        conn.commit()
        start += batch
        resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
        res = resp.results()
    msg = ('solr cleanup finished, %s item(s) removed, '
           '%s item(s) reindexed\n' % (deleted, reindexed))
    log(msg)
    logger.info(msg)
def solrSearchResults(request=None, **keywords):
    """ perform a query using solr after translating the passed in
        parameters with portal catalog semantics """
    search = queryUtility(ISearch)
    config = queryUtility(ISolrConnectionConfig)
    if request is None:
        # try to get a request instance, so that flares can be adapted to
        # ploneflares and urls can be converted into absolute ones etc;
        # however, in this case any arguments from the request are ignored
        request = getattr(getSite(), 'REQUEST', None)
        args = deepcopy(keywords)
    elif IHTTPRequest.providedBy(request):
        args = deepcopy(request.form)  # ignore headers and other stuff
        args.update(keywords)  # keywords take precedence
    else:
        assert isinstance(request, dict), request
        args = deepcopy(request)
        args.update(keywords)  # keywords take precedence
        # if request is a dict, we need the real request in order to
        # be able to adapt to plone flares
        request = getattr(getSite(), 'REQUEST', args)
    if 'path' in args and 'navtree' in args['path']:
        raise FallBackException  # we can't handle navtree queries yet
    use_solr = args.get('use_solr', False)  # a special key to force Solr
    if not use_solr and config.required:
        required = set(config.required).intersection(args)
        if required:
            for key in required:
                if not args[key]:
                    raise FallBackException
        else:
            raise FallBackException
    schema = search.getManager().getSchema() or {}
    params = cleanupQueryParameters(extractQueryParameters(args), schema)
    languageFilter(args)
    prepareData(args)
    mangleQuery(args, config, schema)
    query = search.buildQuery(**args)
    if query != {}:
        optimizeQueryParameters(query, params)
        __traceback_info__ = (query, params, args)
        response = search(query, **params)
    else:
        return SolrResponse()

    def wrap(flare):
        """ wrap a flare object with a helper class """
        adapter = queryMultiAdapter((flare, request), IFlare)
        return adapter is not None and adapter or flare

    results = response.results()
    for idx, flare in enumerate(results):
        flare = wrap(flare)
        for missing in set(schema.stored).difference(flare):
            flare[missing] = MV
        results[idx] = wrap(flare)
    padResults(results, **params)  # pad the batch
    return response
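# Usage sketch for solrSearchResults() (illustrative, not from the original
# module): index names and values are assumptions. The call raises
# FallBackException when the query cannot be handled by Solr, in which case
# callers are expected to fall back to the portal catalog.
from collective.solr.dispatcher import solrSearchResults, FallBackException

try:
    results = solrSearchResults(SearchableText='plone*',
                                portal_type='Document', rows=10)
    for flare in results:
        print(flare['path_string'])  # flares behave like catalog brains
except FallBackException:
    pass  # query the portal catalog instead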
def testPrepareData(self):
    data = {'allowedRolesAndUsers': [
        'user:test_user_1_', 'user:portal_owner']}
    prepareData(data)
    self.assertEqual(
        data,
        {
            'allowedRolesAndUsers': [
                'user$test_user_1_',
                'user$portal_owner'
            ]
        }
    )
def index(self, obj, attributes=None):
    """Index the specified attributes for obj using atomic updates,
    or all of them if `attributes` is `None`.

    Changes to the original method include making sure the uniqueKey
    is part of the attributes, and passing the attributes to the
    self.getData() call to avoid causing Plone to index all fields
    instead of just the necessary ones.
    """
    conn = self.getConnection()
    if conn is not None and indexable(obj):
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        if attributes is not None:
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
            if uniqueKey not in attributes:
                # The uniqueKey is required in order to identify the
                # document when doing atomic updates.
                attributes.add(uniqueKey)
        data, missing = self.getData(obj, attributes=attributes)
        if not data:
            return  # don't index with no data...
        prepareData(data)
        if data.get(uniqueKey, None) is not None and not missing:
            config = getUtility(ISolrConnectionConfig)
            if config.commit_within:
                data['commitWithin'] = config.commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                conn.add(boost_values=boost_values(obj, data), **data)
            except (SolrException, error):
                logger.exception('exception during indexing %r', obj)
def search_solr(query, request=None, lang_query=True, **params):
    search = queryUtility(ISearch)
    if lang_query:
        dummy = {}
        languageFilter(dummy)
        prepareData(dummy)  # this replaces '' with 'any'
        langquery = 'Language:(%s)' % ' OR '.join(dummy['Language'])
        query = '(%s) AND %s' % (query, langquery)
    response = search(query, **params)
    if request is None:
        request = getSite().REQUEST
    response.request = request
    results = response.results()
    for idx, flare in enumerate(results):
        results[idx] = PloneFlare(flare, request=request)
    padResults(results, **params)  # pad the batch
    return response
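# Usage sketch for search_solr() (assumptions: the helper is importable
# from the module above and the fields exist in the Solr schema). Unlike
# solrSearchResults(), it takes raw Solr query syntax; lang_query=True
# (the default) appends the language filter built via languageFilter()
# and prepareData() as shown above.
response = search_solr('Title:foo* AND portal_type:Document', rows=20)
for flare in response.results():
    print(flare['path_string'])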
def index(self, obj, attributes=None):
    conn = self.getConnection()
    if conn is not None and ICheckIndexable(obj)():
        # unfortunately with current versions of solr we need to provide
        # data for _all_ fields during an <add> -- partial updates aren't
        # supported (see https://issues.apache.org/jira/browse/SOLR-139)
        # however, the reindexing can be skipped if none of the given
        # attributes match existing solr indexes...
        schema = self.manager.getSchema()
        if schema is None:
            msg = "unable to fetch schema, skipping indexing of %r"
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get("uniqueKey", None)
        if uniqueKey is None:
            msg = "schema is missing unique key, skipping indexing of %r"
            logger.warning(msg, obj)
            return
        if attributes is not None:
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
        data, missing = self.getData(obj)
        if not data:
            return  # don't index with no data...
        prepareData(data)
        if data.get(uniqueKey, None) is not None and not missing:
            config = getUtility(ISolrConnectionConfig)
            if config.commit_within:
                data["commitWithin"] = config.commit_within
            try:
                logger.debug("indexing %r (%r)", obj, data)
                pt = data.get("portal_type", "default")
                logger.debug("indexing %r with %r adder (%r)", obj, pt, data)
                adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                if adder is None:
                    adder = DefaultAdder(obj)
                adder(conn, boost_values=boost_values(obj, data), **data)
            except (SolrException, error):
                logger.exception("exception during indexing %r", obj)
def unindex(self, obj):
    conn = self.getConnection()
    if conn is not None:
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping unindexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping unindexing of %r'
            logger.warning(msg, obj)
            return
        # remove the PathWrapper, otherwise IndexableObjectWrapper fails
        # to get the UID indexer (for dexterity objects) and the parent
        # UID is acquired
        if hasattr(obj, 'context'):
            obj = obj.context
        data, missing = self.getData(obj, attributes=[uniqueKey])
        prepareData(data)
        if uniqueKey not in data:
            msg = 'Cannot unindex: no unique key for object %r'
            logger.info(msg, obj)
            return
        data_key = data[uniqueKey]
        if data_key is None:
            msg = 'Cannot unindex: `None` unique key for object %r'
            logger.info(msg, obj)
            return
        try:
            logger.debug('unindexing %r (%r)', obj, data)
            conn.delete(id=data_key)
        except (SolrException, error):
            logger.exception('exception during unindexing %r', obj)
def testLanguageParameterHandling(self):
    # empty strings are replaced...
    data = {'Language': ['en', '']}
    prepareData(data)
    self.assertEqual(data, {'Language': ['en', 'any']})
    data = {'Language': ''}
    prepareData(data)
    self.assertEqual(data, {'Language': 'any'})
    # for other indices this shouldn't happen...
    data = {'Foo': ['en', '']}
    prepareData(data)
    self.assertEqual(data, {'Foo': ['en', '']})
def testRemoveControlCharacters(self):
    data = {'SearchableText': 'foo\n\tbar\a\f\r'}
    prepareData(data)
    self.assertEqual(data, {'SearchableText': 'foo\n\tbar \r'})
def testUnicodeSearchableText(self):
    data = {"SearchableText": u"f\xf8\xf8 bar"}
    prepareData(data)
    self.assertEqual(data, {"SearchableText": u"f\xf8\xf8 bar"})
def testPrepareData(self):
    data = {"allowedRolesAndUsers": ["user:test_user_1_",
                                     "user:portal_owner"]}
    prepareData(data)
    self.assertEqual(
        data,
        {"allowedRolesAndUsers": ["user$test_user_1_", "user$portal_owner"]}
    )
def buildQueryAndParameters(self, default=None, **args):
    """ helper to build a querystring for simple use-cases """
    schema = self.getManager().getSchema() or {}
    params = subtractQueryParameters(args)
    params = cleanupQueryParameters(params, schema)
    config = self.getConfig()
    prepareData(args)
    mangleQuery(args, config, schema)
    logger.debug('building query for "%r", %r', default, args)
    defaultSearchField = getattr(schema, 'defaultSearchField', None)
    args[None] = default
    query = {}
    for name, value in sorted(args.items()):
        field = schema.get(name or defaultSearchField, None)
        if field is None or not field.indexed:
            logger.info(
                'dropping unknown search attribute "%s" '
                '(%r) for query: %r', name, value, args
            )
            continue
        if isinstance(value, bool):
            value = str(value).lower()
        elif not value:  # solr doesn't like empty fields (+foo:"")
            if not name:
                continue
            logger.info(
                'empty search term form "%s:%s", aborting buildQuery' % (
                    name, value
                )
            )
            return {}, params
        elif field.class_ == 'solr.BoolField':
            if not isinstance(value, (tuple, list)):
                value = [value]
            falses = '0', 'False', MV
            true = lambda v: bool(v) and v not in falses
            value = set(map(true, value))
            if len(value) != 1:
                assert len(value) == 2  # just to make sure
                continue  # skip when "true or false"
            value = str(value.pop()).lower()
        elif isinstance(value, (tuple, list)):
            # list items should be treated as literals, but
            # nevertheless only get quoted when necessary
            value = '(%s)' % ' OR '.join(map(quote_iterable_item, value))
        elif isinstance(value, set):  # sets are taken literally
            if len(value) == 1:
                query[name] = ''.join(value)
            else:
                query[name] = '(%s)' % ' OR '.join(value)
            continue
        elif isinstance(value, basestring):
            if field.class_ == 'solr.TextField':
                if isWildCard(value):
                    value = prepare_wildcard(value)
                value = quote(value, textfield=True)
                # if we have an intra-word hyphen, we need quotes
                if '\\-' in value or '\\+' in value:
                    if value[0] != '"':
                        value = '"%s"' % value
            else:
                value = quote(value)
            if not value:  # don't search for empty strings, even quoted
                continue
        else:
            logger.info(
                'skipping unsupported value "%r" (%s)', value, name
            )
            continue
        if name is None:
            if value and value[0] not in '+-':
                value = '+%s' % value
        else:
            value = '+%s:%s' % (name, value)
        query[name] = value
    logger.debug('built query "%s"', query)
    if query:
        optimizeQueryParameters(query, params)
    return query, params
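# Illustrative call of buildQueryAndParameters() above, assuming an
# ISearch utility is registered; the argument names are examples only.
from zope.component import queryUtility
from collective.solr.interfaces import ISearch

search = queryUtility(ISearch)
query, params = search.buildQueryAndParameters(
    default='foo bar', portal_type='Document', rows=10)
# `rows` is subtracted into `params`; the remaining arguments become
# "+field:value" clauses in `query` (exact quoting depends on the schema
# and the quote() helper).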
def testRemoveControlCharacters(self):
    data = {"SearchableText": "foo\n\tbar\a\f\r"}
    prepareData(data)
    self.assertEqual(data, {"SearchableText": "foo\n\tbar \r"})
def testUnicodeSearchableText(self):
    data = {'SearchableText': u'f\xf8\xf8 bar'}
    prepareData(data)
    self.assertEqual(data, {'SearchableText': 'f\xc3\xb8\xc3\xb8 bar'})
def reindex(self, batch=1000, skip=0, limit=0):
    """ find all contentish objects (meaning all objects derived from
        one of the catalog mixin classes) and (re)index them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        for boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            adder(conn, boost_values=boost_values, **data)
        updates.clear()
        msg = ('intermediate commit (%d items processed, '
               'last batch in %s)...\n' % (processed, lap.next()))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
def reindex(self, batch=1000, skip=0, limit=0,
            ignore_portal_types=None, only_portal_types=None):
    """ find all contentish objects (meaning all objects derived from
        one of the catalog mixin classes) and (re)index them """
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            adder(conn, boost_values=my_boost_values, **data)
        updates.clear()
        msg = ('intermediate commit (%d items processed, '
               'last batch in %s)...\n' % (processed, lap.next()))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
def reindex(
    self,
    batch=1000,
    skip=0,
    limit=0,
    ignore_portal_types=None,
    only_portal_types=None,
    idxs=[],
    ignore_exceptions=False,
):
    """ find all contentish objects (meaning all objects derived from
        one of the catalog mixin classes) and (re)index them """
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)
    if limit:
        log("limiting indexing to %d object(s)...\n" % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated

    def flush():
        return conn.commit(soft=True)

    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop("_solr_adder")
            try:
                adder(conn, boost_values=my_boost_values, **data)
            except Exception as e:
                logger.warning("Error %s @ %s", e, data["path_string"])
                if not ignore_exceptions:
                    raise
        updates.clear()
        msg = ("intermediate commit (%d items processed, "
               "last batch in %s)...\n" % (processed, next(lap)))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    if atomic:
        log("indexing only {0} \n".format(idxs))
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            if getOwnIndexMethod:
                if getOwnIndexMethod(obj, "indexObject") is not None:
                    log("skipping indexing of %r via private method.\n"
                        % obj)
                    continue
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            attributes = None
            if atomic:
                # copy so the shared default list is never mutated
                attributes = list(idxs)
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            if attributes and key not in attributes:
                attributes.append(key)
            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)
            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    log("indexing %r\n" % obj)
                    pt = data.get("portal_type", "default")
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data["_solr_adder"] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            else:
                log("missing data, skipping indexing of %r.\n" % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log("solr index rebuilt.\n")
    msg = "processed %d items in %s (%s cpu time)."
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
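# Hedged sketch of calling this reindex() directly, assuming it is bound
# to the solr maintenance browser view (see the class further below) and
# that `portal` is the Plone site root. Passing idxs enables atomic
# updates for just those fields (plus the uniqueKey, which is appended
# automatically above).
view = portal.restrictedTraverse('@@solr-maintenance')
view.reindex(batch=500, only_portal_types=['Document'],
             idxs=['Title', 'SearchableText'])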
def index(self, obj, attributes=None):
    """Index the specified attributes for obj using atomic updates,
    or all of them if `attributes` is `None`.

    Also make sure the `uniqueKey` is part of attributes, and pass
    the attributes to the self.getData() call to avoid causing Plone
    to index all fields instead of just the necessary ones.
    """
    conn = self.getConnection()
    if conn is not None and ICheckIndexable(obj)():
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        if attributes is not None:
            if 'path' in attributes:
                attributes = list(attributes)
                attributes.extend(
                    ['path_string', 'path_parents', 'path_depth'])
            if attributes:
                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return
            else:
                attributes = set(schema.keys())
            if uniqueKey not in attributes:
                # The uniqueKey is required in order to identify the
                # document when doing atomic updates.
                attributes.add(uniqueKey)
        data, missing = self.getData(obj, attributes=attributes)
        if not data:
            return  # don't index with no data...
        prepareData(data)
        if data.get(uniqueKey, None) is not None and not missing:
            registry = getUtility(IRegistry)
            config_commit_within = registry['collective.solr.commit_within']
            if config_commit_within:
                data['commitWithin'] = config_commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                pt = data.get('portal_type', 'default')
                logger.debug('indexing %r with %r adder (%r)', obj, pt, data)
                adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                if adder is None:
                    adder = DefaultAdder(obj)
                adder(conn, boost_values=boost_values(obj, data), **data)
            except (SolrConnectionException, error):
                logger.exception('exception during indexing %r', obj)
def buildQueryAndParameters(self, default=None, **args):
    """ helper to build a querystring for simple use-cases """
    schema = self.getManager().getSchema() or {}
    params = subtractQueryParameters(args)
    params = cleanupQueryParameters(params, schema)
    config = self.getConfig()
    languageFilter(args)
    prepareData(args)
    mangleQuery(args, config, schema)
    logger.debug('building query for "%r", %r', default, args)
    defaultSearchField = getattr(schema, 'defaultSearchField', None)
    args[None] = default
    query = {}
    for name, value in sorted(args.items()):
        field = schema.get(name or defaultSearchField, None)
        if field is None or not field.indexed:
            logger.info(
                'dropping unknown search attribute "%s" '
                '(%r) for query: %r', name, value, args)
            continue
        if isinstance(value, bool):
            value = str(value).lower()
        elif not value:  # solr doesn't like empty fields (+foo:"")
            if not name:
                continue
            logger.info(
                'empty search term form "%s:%s", aborting buildQuery'
                % (name, value))
            return {}, params
        elif field.class_ == 'solr.BoolField':
            if not isinstance(value, (tuple, list)):
                value = [value]
            falses = '0', 'False', MV
            true = lambda v: bool(v) and v not in falses
            value = set(map(true, value))
            if len(value) != 1:
                assert len(value) == 2  # just to make sure
                continue  # skip when "true or false"
            value = str(value.pop()).lower()
        elif isinstance(value, (tuple, list)):
            # list items should be treated as literals, but
            # nevertheless only get quoted when necessary
            value = '(%s)' % ' OR '.join(map(quote_iterable_item, value))
        elif isinstance(value, set):  # sets are taken literally
            if len(value) == 1:
                query[name] = ''.join(value)
            else:
                query[name] = '(%s)' % ' OR '.join(value)
            continue
        elif isinstance(value, basestring):
            if field.class_ == 'solr.TextField':
                if isWildCard(value):
                    value = prepare_wildcard(value)
                value = quote(value, textfield=True)
                # if we have an intra-word hyphen, we need quotes
                if '\\-' in value or '\\+' in value:
                    if value[0] != '"':
                        value = '"%s"' % value
            else:
                value = quote(value)
            if not value:  # don't search for empty strings, even quoted
                continue
        else:
            logger.info('skipping unsupported value "%r" (%s)',
                        value, name)
            continue
        if name is None:
            if value and value[0] not in '+-':
                value = '+%s' % value
        else:
            value = '+%s:%s' % (name, value)
        query[name] = value
    logger.debug('built query "%s"', query)
    if query:
        optimizeQueryParameters(query, params)
    return query, params
def cleanup(self, batch=1000):
    """ remove entries from solr that don't have a corresponding Zope
        object or have a different UID than the real object """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    log = self.mklog(use_std_log=True)
    log('cleaning up solr index...\n')
    key = manager.getSchema().uniqueKey
    start = 0
    resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
    res = resp.results()
    log('%s items in solr catalog\n' % resp.response.numFound)
    deleted = 0
    reindexed = 0
    while len(res) > 0:
        for flare in res:
            try:
                ob = PloneFlare(flare).getObject()
            except Exception as err:
                log('Error getting object, removing: %s (%s)\n' % (
                    flare['path_string'], err))
                conn.delete(flare[key])
                deleted += 1
                continue
            if ob is None:
                log('Object not found, removing: %s\n' % (
                    flare['path_string']))
                conn.delete(flare[key])
                deleted += 1
                continue
            if not IUUIDAware.providedBy(ob):
                log('Object %s of type %s does not support uuids, '
                    'skipping.\n' % (
                        '/'.join(ob.getPhysicalPath()), ob.meta_type))
                continue
            uuid = IUUID(ob)
            if uuid != flare[key]:
                log('indexed under wrong UID, removing: %s\n' %
                    flare['path_string'])
                conn.delete(flare[key])
                deleted += 1
                realob_res = SolrResponse(conn.search(
                    q='%s:%s' % (key, uuid))).results()
                if len(realob_res) == 0:
                    log('no sane entry for last object, reindexing\n')
                    data, missing = proc.getData(ob)
                    prepareData(data)
                    if not missing:
                        boost = boost_values(ob, data)
                        conn.add(boost_values=boost, **data)
                        reindexed += 1
                    else:
                        log(' missing data, cannot index.\n')
        log('handled batch of %d items, committing\n' % len(res))
        conn.commit()
        start += batch
        resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
        res = resp.results()
    msg = ('solr cleanup finished, %s item(s) removed, '
           '%s item(s) reindexed\n' % (deleted, reindexed))
    log(msg)
    logger.info(msg)
def reindex(self, batch=1000, skip=0, idxs=[]):
    """ find all contentish objects (meaning all objects derived from
        one of the catalog mixin classes) and (re)index them """
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = FtwSolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        for boost_values, data in updates.values():
            # Only update specified fields by using atomic updates
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = ('intermediate commit (%d items processed, '
               'last batch in %s)...\n' % (processed, lap.next()))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            attributes = None
            if atomic:
                # copy so the shared default list is never mutated
                attributes = list(idxs)
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            if attributes and key not in attributes:
                attributes.append(key)
            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)
            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
class SolrMaintenanceView(BrowserView):
    """ helper view for indexing all portal content in Solr """
    implements(ISolrMaintenanceView)

    def mklog(self, use_std_log=False):
        """ helper to prepend a time stamp to the output """
        write = self.request.RESPONSE.write

        def log(msg, timestamp=True):
            if timestamp:
                msg = strftime('%Y/%m/%d-%H:%M:%S ') + msg
            write(msg)
            if use_std_log:
                logger.info(msg)
        return log

    def optimize(self):
        """ optimize solr indexes """
        manager = queryUtility(ISolrConnectionManager)
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.commit(optimize=True)
        return 'solr indexes optimized.'

    def clear(self):
        """ clear all data from solr, i.e. delete all indexed objects """
        manager = queryUtility(ISolrConnectionManager)
        uniqueKey = manager.getSchema().uniqueKey
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.deleteByQuery('%s:[* TO *]' % uniqueKey)
        conn.commit()
        return 'solr index cleared.'

    def reindex(self, batch=1000, skip=0, limit=0,
                ignore_portal_types=None, only_portal_types=None,
                idxs=[], ignore_exceptions=False):
        """ find all contentish objects (meaning all objects derived from
            one of the catalog mixin classes) and (re)index them """
        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")
        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # dict to hold data to be updated
        flush = notimeout(lambda: conn.commit(soft=True))

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    logger.warning('Error %s @ %s', e, data['path_string'])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = ('intermediate commit (%d items processed, '
                   'last batch in %s)...\n' % (processed, lap.next()))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        if atomic:
            log('indexing only {0} \n'.format(idxs))
        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                if getOwnIndexMethod(obj, 'indexObject') is not None:
                    log('skipping indexing of %r via private method.\n'
                        % obj)
                    continue
                count += 1
                if count <= skip:
                    continue
                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue
                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue
                attributes = None
                if atomic:
                    # copy so the shared default list is never mutated
                    attributes = list(idxs)
                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)
                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)
                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)
                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break
        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
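# The maintenance view above is registered as @@solr-maintenance in
# collective.solr, so the individual steps can also be triggered over
# HTTP (URL and credentials below are illustrative):
#
#   curl -u admin:admin http://localhost:8080/plone/@@solr-maintenance/clear
#   curl -u admin:admin http://localhost:8080/plone/@@solr-maintenance/reindex
#   curl -u admin:admin http://localhost:8080/plone/@@solr-maintenance/cleanup
#   curl -u admin:admin http://localhost:8080/plone/@@solr-maintenance/optimize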