def reindex(self, batch=1000, skip=0):
    """Find all contentish objects and (re)index them in ZeroCMS.

    "Contentish" means all objects derived from one of the catalog mixin
    classes.  Every indexable object found below ``self.context`` is
    handed to the ``zerocms`` index queue processor; objects that bring
    their own ``indexObject`` method are logged and skipped.

    :param batch: number of processed objects after which the ZODB
        connection cache is garbage-collected; a falsy value disables
        the intermediate collections (a final one always runs).
    :param skip: number of leading indexable objects to pass over
        without indexing them.
    """
    index_processor = queryUtility(
        IZeroCMSIndexQueueProcessor, name="zerocms")
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing documents to ZeroCMS...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    cpu = timer(clock)      # cpu time
    processed = 0
    count = 0
    for _path, obj in findObjects(self.context):
        if not indexable(obj):
            continue
        if getOwnIndexMethod(obj, 'indexObject') is not None:
            log('skipping indexing of %r via private method.\n' % obj)
            continue
        # `count` tracks every candidate, including the skipped ones, so
        # that `skip` refers to a stable position in the traversal.
        count += 1
        if count <= skip:
            continue
        index_processor.index(obj)
        processed += 1
        # Keep memory bounded on large sites by trimming the ZODB cache
        # once per batch instead of ignoring the `batch` argument.
        if batch and processed % batch == 0:
            zodb_conn.cacheGC()
    zodb_conn.cacheGC()
    log('All documents exported to ZeroCMS.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
def reindex(self, batch=1000, skip=0):
    """Walk the site and push every indexable object to Solr.

    All contentish objects (i.e. objects derived from one of the catalog
    mixin classes) below ``self.context`` are visited.  Their index data
    is queued and sent to Solr in chunks of ``batch`` items; the first
    ``skip`` candidates are passed over.
    """
    solr_manager = queryUtility(ISolrConnectionManager)
    processor = SolrIndexProcessor(solr_manager)
    solr = solr_manager.getConnection()
    jar = self.context._p_jar
    log = self.mklog()

    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)

    wall_clock = timer()        # total elapsed time
    lap_clock = timer()         # elapsed time since the last checkpoint
    cpu_clock = timer(clock)    # consumed cpu time
    processed = 0
    unique_key = solr_manager.getSchema().uniqueKey
    pending = {}                # unique key -> (boost values, data)

    @notimeout
    def flush():
        return solr.flush()

    def checkPoint():
        # send everything queued so far, then report and free memory
        for boosts, fields in pending.values():
            solr.add(boost_values=boosts, **fields)
        pending.clear()
        msg = ('intermediate commit (%d items processed, '
               'last batch in %s)...\n') % (processed, lap_clock.next())
        log(msg)
        logger.info(msg)
        flush()
        jar.cacheGC()

    checkpoints = checkpointIterator(checkPoint, batch)
    seen = 0
    for _path, obj in findObjects(self.context):
        if not indexable(obj):
            continue
        if getOwnIndexMethod(obj, 'indexObject') is not None:
            log('skipping indexing of %r via private method.\n' % obj)
            continue
        seen += 1
        if seen <= skip:
            continue
        fields, missing = processor.getData(obj)
        prepareData(fields)
        if missing:
            log('missing data, skipping indexing of %r.\n' % obj)
            continue
        uid = fields.get(unique_key)
        if uid is None:
            continue
        pending[uid] = (boost_values(obj, fields), fields)
        processed += 1
        checkpoints.next()

    checkPoint()
    solr.commit()
    log('solr index rebuilt.\n')
    summary = 'processed %d items in %s (%s cpu time).' % (
        processed, wall_clock.next(), cpu_clock.next())
    log(summary)
    logger.info(summary)
def reindex(self, batch=1000, skip=0):
    """Walk the site and push every indexable object to Solr.

    All contentish objects (i.e. objects derived from one of the catalog
    mixin classes) below ``self.context`` are visited.  Their index data
    is queued and sent to Solr in chunks of ``batch`` items; the first
    ``skip`` candidates are passed over.
    """
    solr_manager = queryUtility(ISolrConnectionManager)
    processor = SolrIndexProcessor(solr_manager)
    solr = solr_manager.getConnection()
    jar = self.context._p_jar
    log = self.mklog()

    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)

    wall_clock = timer()  # total elapsed time
    lap_clock = timer()  # elapsed time since the last checkpoint
    cpu_clock = timer(clock)  # consumed cpu time
    processed = 0
    unique_key = solr_manager.getSchema().uniqueKey
    pending = {}  # unique key -> (boost values, data)

    @notimeout
    def flush():
        return solr.flush()

    def checkPoint():
        # send everything queued so far, then report and free memory
        for boosts, fields in pending.values():
            solr.add(boost_values=boosts, **fields)
        pending.clear()
        template = (
            "intermediate commit (%d items processed, "
            "last batch in %s)...\n"
        )
        msg = template % (processed, lap_clock.next())
        log(msg)
        logger.info(msg)
        flush()
        jar.cacheGC()

    checkpoints = checkpointIterator(checkPoint, batch)
    seen = 0
    for _path, obj in findObjects(self.context):
        if not indexable(obj):
            continue
        if getOwnIndexMethod(obj, "indexObject") is not None:
            log("skipping indexing of %r via private method.\n" % obj)
            continue
        seen += 1
        if seen <= skip:
            continue
        fields, missing = processor.getData(obj)
        prepareData(fields)
        if missing:
            log("missing data, skipping indexing of %r.\n" % obj)
            continue
        uid = fields.get(unique_key)
        if uid is None:
            continue
        pending[uid] = (boost_values(obj, fields), fields)
        processed += 1
        checkpoints.next()

    checkPoint()
    solr.commit()
    log("solr index rebuilt.\n")
    summary = "processed %d items in %s (%s cpu time)." % (
        processed,
        wall_clock.next(),
        cpu_clock.next(),
    )
    log(summary)
    logger.info(summary)
def testGetOwnIndexMethod(self):
    """Check which objects `getOwnIndexMethod` reports private methods for.

    Regular content (folder, news, event) must yield a false value for
    all three indexing methods, a criterion a true value for all three,
    and the sample ``Foo`` class a true value for ``indexObject`` only.
    """
    from collective.indexing.indexer import getOwnIndexMethod
    method_names = ('indexObject', 'reindexObject', 'unindexObject')
    self.setRoles(['Manager'])
    # a regular content object uses the standard methods...
    container = self.folder
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(container, name), name)
    news = self.portal.news
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(news, name), name)
    event = container[container.invokeFactory('Event', id='event')]
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(event, name), name)
    # while a criterion has private methods...
    container.invokeFactory('Collection', id='coll')
    crit = ATPortalTypeCriterion('crit')
    for name in method_names:
        self.assertTrue(getOwnIndexMethod(crit, name), name)
    # our sample class only has a private `indexObject`...
    from collective.indexing.tests.content import Foo
    foo = Foo('foo')
    self.assertTrue(getOwnIndexMethod(foo, 'indexObject'))
    self.assertFalse(getOwnIndexMethod(foo, 'reindexObject'))
    self.assertFalse(getOwnIndexMethod(foo, 'unindexObject'))
def reindex(
    self,
    batch=1000,
    skip=0,
    limit=0,
    ignore_portal_types=None,
    only_portal_types=None,
    idxs=[],
    ignore_exceptions=False,
):
    """Find all contentish objects and (re)index them in Solr.

    "Contentish" means all objects derived from one of the catalog mixin
    classes.  Index data is queued and sent to Solr every ``batch``
    items, followed by a soft commit; a regular commit is issued at the
    end.

    :param batch: number of queued items between intermediate commits.
    :param skip: number of leading indexable objects to pass over.
    :param limit: stop once ``skip + limit`` candidates were counted;
        ``0`` means no limit.
    :param ignore_portal_types: portal types that must not be indexed.
    :param only_portal_types: if given, only these portal types are
        indexed.
    :param idxs: names of the indexes to update; a non-empty value
        switches to atomic updates.  The default list is never mutated
        and neither is a list passed in by the caller.
    :param ignore_exceptions: log errors raised while adding a document
        and carry on instead of re-raising them.
    :raises ValueError: if both ``ignore_portal_types`` and
        ``only_portal_types`` are given.
    """
    if ignore_portal_types and only_portal_types:
        raise ValueError(
            "It is not possible to combine "
            "ignore_portal_types with only_portal_types"
        )

    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)
    if limit:
        log("limiting indexing to %d object(s)...\n" % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(process_time)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # maps unique key -> (boost values, data) to be sent

    @notimeout
    def flush():
        return conn.commit(soft=True)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop("_solr_adder")
            try:
                adder(conn, boost_values=my_boost_values, **data)
            except Exception as e:
                logger.warning("Error %s @ %s", e, data["path_string"])
                if not ignore_exceptions:
                    raise
        updates.clear()
        msg = (
            "intermediate commit (%d items processed, "
            "last batch in %s)...\n" % (processed, next(lap))
        )
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0

    if atomic:
        log("indexing only {0} \n".format(idxs))

    # The attribute list is the same for every object, so build it once.
    # For atomic updates to work the uniqueKey must be present in *every*
    # update operation; it is added on a fresh list so the caller's
    # `idxs` is left untouched.
    attributes = None
    if atomic:
        attributes = idxs
        if attributes and key not in attributes:
            attributes = attributes + [key]

    for path, obj in findObjects(self.context):
        if not ICheckIndexable(obj)():
            continue

        # `getOwnIndexMethod` may be unavailable (falsy), in which case
        # the private-method check is skipped altogether.
        if getOwnIndexMethod:
            if getOwnIndexMethod(obj, "indexObject") is not None:
                log("skipping indexing of %r via private method.\n" % obj)
                continue

        count += 1
        if count <= skip:
            continue

        if ignore_portal_types:
            if obj.portal_type in ignore_portal_types:
                continue

        if only_portal_types:
            if obj.portal_type not in only_portal_types:
                continue

        data, missing = proc.getData(obj, attributes=attributes)
        prepareData(data)

        if not missing or atomic:
            value = data.get(key, None)
            if value is not None:
                log("indexing %r\n" % obj)
                pt = data.get("portal_type", "default")
                adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                if adder is None:
                    adder = DefaultAdder(obj)
                # the adder travels with the data until the checkpoint
                data["_solr_adder"] = adder
                updates[value] = (boost_values(obj, data), data)
                processed += 1
                next(cpi)
        else:
            log("missing data, skipping indexing of %r.\n" % obj)
        if limit and count >= (skip + limit):
            break

    checkPoint()
    conn.commit()
    log("solr index rebuilt.\n")
    msg = "processed %d items in %s (%s cpu time)."
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
class SolrMaintenanceView(BrowserView):
    """ helper view for indexing all portal content in Solr """
    implements(ISolrMaintenanceView)

    def mklog(self, use_std_log=False):
        """ helper to prepend a time stamp to the output

        Returns a ``log(msg, timestamp=True)`` callable that writes to
        the HTTP response and, if ``use_std_log`` is set, also to the
        module logger.
        """
        write = self.request.RESPONSE.write

        def log(msg, timestamp=True):
            if timestamp:
                msg = strftime('%Y/%m/%d-%H:%M:%S ') + msg
            write(msg)
            if use_std_log:
                logger.info(msg)
        return log

    def optimize(self):
        """ optimize solr indexes """
        manager = queryUtility(ISolrConnectionManager)
        conn = manager.getConnection()
        # the timeout is removed before issuing the optimizing commit
        conn.setTimeout(None)
        conn.commit(optimize=True)
        return 'solr indexes optimized.'

    def clear(self):
        """ clear all data from solr, i.e. delete all indexed objects """
        manager = queryUtility(ISolrConnectionManager)
        uniqueKey = manager.getSchema().uniqueKey
        conn = manager.getConnection()
        conn.setTimeout(None)
        # an open range on the unique key matches every document
        conn.deleteByQuery('%s:[* TO *]' % uniqueKey)
        conn.commit()
        return 'solr index cleared.'

    def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
                only_portal_types=None, idxs=[], ignore_exceptions=False):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them

        :param batch: number of queued items between intermediate commits.
        :param skip: number of leading indexable objects to pass over.
        :param limit: stop once ``skip + limit`` candidates were counted;
            ``0`` means no limit.
        :param ignore_portal_types: portal types that must not be indexed.
        :param only_portal_types: if given, only these types are indexed.
        :param idxs: names of the indexes to update; a non-empty value
            switches to atomic updates.  Neither the default list nor a
            list passed in by the caller is mutated.
        :param ignore_exceptions: log errors raised while adding a
            document and carry on instead of re-raising them.
        :raises ValueError: if both ``ignore_portal_types`` and
            ``only_portal_types`` are given.
        """
        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # unique key -> (boost values, data)

        @notimeout
        def flush():
            return conn.commit(soft=True)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    logger.warning('Error %s @ %s', e, data['path_string'])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, next(lap))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        if atomic:
            log('indexing only {0} \n'.format(idxs))

        # The attribute list is the same for every object, so build it
        # once.  For atomic updates to work the uniqueKey must be present
        # in *every* update operation; it is added on a fresh list so the
        # caller's `idxs` is left untouched.
        attributes = None
        if atomic:
            attributes = idxs
            if attributes and key not in attributes:
                attributes = attributes + [key]

        for path, obj in findObjects(self.context):
            if not ICheckIndexable(obj)():
                continue

            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue

            count += 1
            if count <= skip:
                continue

            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue

            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue

            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)

            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    # the adder travels with the data until the checkpoint
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break

        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
def testGetOwnIndexMethod(self):
    """Check which objects `getOwnIndexMethod` reports private methods for.

    Regular content (folder, news, event) must yield a false value for
    all three indexing methods, a topic criterion a true value for all
    three, and the sample ``Foo`` class a true value for ``indexObject``
    only.
    """
    from collective.indexing.indexer import getOwnIndexMethod
    method_names = ('indexObject', 'reindexObject', 'unindexObject')
    self.setRoles(['Manager'])
    # a regular content object uses the standard methods...
    container = self.folder
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(container, name), name)
    news = self.portal.news
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(news, name), name)
    event = container[container.invokeFactory('Event', id='event')]
    for name in method_names:
        self.assertFalse(getOwnIndexMethod(event, name), name)
    # while a criterion has private methods...
    container.invokeFactory('Topic', id='coll')
    crit = container.coll.addCriterion('Type', 'ATPortalTypeCriterion')
    for name in method_names:
        self.assertTrue(getOwnIndexMethod(crit, name), name)
    # our sample class only has a private `indexObject`...
    from collective.indexing.tests.content import Foo
    foo = Foo('foo')
    self.assertTrue(getOwnIndexMethod(foo, 'indexObject'))
    self.assertFalse(getOwnIndexMethod(foo, 'reindexObject'))
    self.assertFalse(getOwnIndexMethod(foo, 'unindexObject'))
def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
            only_portal_types=None, idxs=[], no_log=False,
            index_fulltext=False):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them

    :param batch: number of queued items between intermediate commits.
    :param skip: number of leading indexable objects to pass over.
    :param limit: stop once ``skip + limit`` candidates were counted;
        ``0`` means no limit.
    :param ignore_portal_types: portal types that must not be indexed.
    :param only_portal_types: if given, only these types are indexed.
    :param idxs: names of the indexes to update; only honoured when
        ``index_fulltext`` is true, otherwise it is replaced by all
        schema fields except ``SearchableText``.  A list passed in by
        the caller is not mutated.
    :param no_log: passed (negated) to ``mklog`` as
        ``write_to_response``.
    :param index_fulltext: also index ``SearchableText``.
    :raises ValueError: if both ``ignore_portal_types`` and
        ``only_portal_types`` are given.
    """
    self.disable_csrf()

    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")

    conn = IConnection(getUtility(IConnectionConfig))

    if not index_fulltext:
        logger.info("NOT indexing SearchableText. "
                    "Pass index_fulltext=True to do otherwise.")
        # Filtering (instead of list.remove) keeps this working for a
        # schema that does not define SearchableText at all.
        idxs = [field['name'] for field in conn.schema['fields']
                if field['name'] != u'SearchableText']
    else:
        logger.warning("Indexing SearchableText. This may take a while.")

    atomic = idxs != []
    zodb_conn = self.context._p_jar
    CI = ContentIndexer()
    log = self.mklog(write_to_response=not no_log)
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    uniqueKey = conn.schema['uniqueKey']
    updates = {}            # unique key -> data to be sent

    flush = notimeout(lambda: conn.commit(softCommit=True))

    def checkPoint():
        for data in updates.values():
            # NOTE(review): every queued entry gets a '_solr_adder' in
            # the loop below, so the set() fallback looks unreachable
            # (and could not hold a dict anyway) -- confirm and drop.
            adder = data.pop('_solr_adder', set([]))
            adder.add(data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, next(lap))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0

    if atomic:
        log('indexing only {0} \n'.format(idxs))

    # The attribute list is the same for every object, so build it once.
    # For atomic updates to work the uniqueKey must be present in *every*
    # update operation; it is added on a fresh list so a caller-supplied
    # `idxs` is left untouched.
    attributes = None
    if atomic:
        attributes = idxs
        if attributes and uniqueKey not in attributes:
            attributes = attributes + [uniqueKey]

    for path, obj in findObjects(self.context):
        try:
            if not ICheckIndexable(obj)():
                continue

            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue

            count += 1
            if count <= skip:
                continue

            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue

            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue

            data = CI._get_data(obj, attributes=attributes)
            missing = False  # Do we have that in scorched?

            if not missing or atomic:
                value = data.get(uniqueKey, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryMultiAdapter((obj, conn), name=pt)
                    if adder is None:
                        adder = ContentAdder(obj, conn)
                    data['_solr_adder'] = adder
                    # Do not boost, c.solr only feature
                    updates[value] = data
                    processed += 1
                    next(cpi)
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
        except Exception:
            # Best effort: a single broken object must not abort the
            # whole run, but KeyboardInterrupt/SystemExit still do.
            logger.exception(
                'Failed reindexing: %r (%s)', obj, path,
            )

    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
def solr_dump_catalog(app, args):
    """Dumps the catalog and metadata contents into a nested directory
    structure full of pickles containing the information in dict format.
    These can be updated by later re-runs and used to import the data via
    the `update_solr` command. You can optionally specify the id of the
    Plone site as the first command line argument.
    """
    _enable_log()
    db = app._p_jar.db()
    from Testing import makerequest
    root = makerequest.makerequest(app)
    site = _get_site(root, args)
    data_dir = _data_dir(site.getId())
    _make_dir(data_dir)

    catalog = site.portal_catalog
    _catalog = catalog._catalog
    total = len(catalog)
    uids_get = _catalog.uids.get

    # Only indexes known to the Solr schema are dumped.
    conn = _solr_connection()
    schema = conn.getSchema()
    wanted = set(schema.keys())
    # We need the data from path
    wanted.add('path')
    conn.close()

    logger.info('Process %s catalog items' % total)

    from collective.indexing.indexer import getOwnIndexMethod
    from collective.solr.indexer import indexable
    from collective.solr.utils import findObjects
    from plone.app.folder.nogopip import GopipIndex
    from Products.PluginIndexes.DateIndex.DateIndex import DateIndex
    from Products.PluginIndexes.DateRangeIndex import DateRangeIndex
    from Products.ZCTextIndex import WidCode
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex

    gopip_indexes = set()
    for indexname in sorted(catalog.indexes()):
        # Metadata columns are dumped separately below, and indexes not
        # present in the Solr schema are of no use.
        if indexname in _catalog.schema or indexname not in wanted:
            continue
        logger.info('Dumping index: %s' % indexname)
        index = _catalog.getIndex(indexname)
        if isinstance(index, DateRangeIndex.DateRangeIndex):
            # Solr cannot deal with range indexes directly
            continue
        if isinstance(index, ZCTextIndex):
            get_word = index.getLexicon().get_word
            wid_decode = WidCode.decode
            batch = 0
            docwords = index.index._docwords.items()
            for i, (uid, value) in enumerate(docwords):
                batch = _log_batch(db, batch, i)
                words = ' '.join(get_word(w) for w in wid_decode(value))
                _dump(data_dir, uid, {indexname: words})
        elif isinstance(index, GopipIndex):
            # happens last as it needs a full site traversal
            gopip_indexes.add(indexname)
        elif not hasattr(index, '_unindex'):
            logger.warn("Unsupported index '%s' without an _unindex." %
                        indexname)
        else:
            date_index = isinstance(index, DateIndex)
            batch = 0
            for i, (uid, value) in enumerate(index._unindex.iteritems()):
                batch = _log_batch(db, batch, i)
                value = _convert_value(indexname, value, date_index)
                if value:
                    _dump(data_dir, uid, {indexname: value})

    # dump metadata
    logger.info('Dumping metadata records')
    batch = 0
    for i, uid in enumerate(_catalog.paths.iterkeys()):
        batch = _log_batch(db, batch, i)
        values = {}
        for k, v in _catalog.getMetadataForRID(uid).iteritems():
            definition = schema.get(k)
            if not definition:
                continue
            class_ = definition['class_']
            value = _convert_value(k, v, class_ == 'solr.TrieDateField')
            if value is not None:
                values[k] = value
            elif class_ == 'solr.TextField':
                # text fields get an empty string rather than no value
                values[k] = ''
        _dump(data_dir, uid, values)

    # deal with GopipIndexes
    batch = 0
    logger.info('Traversing site to dump Gopip index information')
    for i, (path, obj) in enumerate(findObjects(site)):
        batch = _log_batch(db, batch, i)
        if (not indexable(obj)
                or getOwnIndexMethod(obj, 'indexObject') is not None):
            continue
        parent = aq_parent(obj)
        uid = uids_get('/'.join(obj.getPhysicalPath()), None)
        if uid is None:
            continue
        # Parents that cannot report a position yield position 0.
        if hasattr(aq_base(parent), 'getObjectPosition'):
            position = parent.getObjectPosition(path.rsplit('/', 1)[-1])
        else:
            position = 0
        _dump(data_dir, uid, dict.fromkeys(gopip_indexes, position))
        if not getattr(aq_base(obj), 'isPrincipiaFolderish', False):
            # Remove non-folders from the cache immediately as we no longer
            # need them
            obj._p_deactivate()
def reindex(self, batch=1000, skip=0, idxs=[]):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them

    :param batch: number of queued items between intermediate commits.
    :param skip: number of leading indexable objects to pass over.
    :param idxs: names of the indexes to update; a non-empty value
        switches to atomic updates.  Neither the default list nor a list
        passed in by the caller is mutated.
    """
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = FtwSolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # unique key -> (boost values, data)

    @notimeout
    def flush():
        return conn.flush()

    def checkPoint():
        for boosts, data in updates.values():
            # Only update specified fields by using atomic updates
            conn.add(boost_values=boosts, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, next(lap))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0

    # The attribute list is the same for every object, so build it once.
    # For atomic updates to work the uniqueKey must be present in *every*
    # update operation; it is added on a fresh list so the caller's
    # `idxs` is left untouched.
    attributes = None
    if atomic:
        attributes = idxs
        if attributes and key not in attributes:
            attributes = attributes + [key]

    for path, obj in findObjects(self.context):
        if not indexable(obj):
            continue
        if getOwnIndexMethod(obj, 'indexObject') is not None:
            log('skipping indexing of %r via private method.\n' % obj)
            continue
        count += 1
        if count <= skip:
            continue

        data, missing = proc.getData(obj, attributes=attributes)
        prepareData(data)

        if not missing or atomic:
            value = data.get(key, None)
            if value is not None:
                updates[value] = (boost_values(obj, data), data)
                processed += 1
                next(cpi)
        else:
            log('missing data, skipping indexing of %r.\n' % obj)

    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)