def reindex(self, batch=1000, skip=0):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them """
    requestFactory = queryUtility(IRequestFactory)
    indexProcessor = queryUtility(IZeroCMSIndexQueueProcessor, name="zerocms")
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing documents to ZeroCMS...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    updates = {}            # list to hold data to be updated
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            indexProcessor.index(obj)
            processed += 1
            zodb_conn.cacheGC()
    log('All documents exported to ZeroCMS.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
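# The helpers used throughout these maintenance methods -- timer(),
# checkpointIterator() and notimeout() -- come from collective.solr.utils and
# are not shown in this excerpt. The sketch below is an illustrative,
# simplified equivalent (an assumption about their behaviour, not the
# library's actual code): timer() yields the time elapsed since the previous
# .next() call (or since creation), and checkpointIterator() invokes a
# callback on every `interval`-th iteration. notimeout() appears to wrap a
# callable so that network timeouts do not interrupt it.
from time import time


def timer(func=time):
    """Return a generator whose .next() yields the elapsed time, formatted
    as seconds, since the previous .next() call (or since timer() was called).
    """
    def gen(last=func()):  # default evaluated now, i.e. when timer() is called
        while True:
            elapsed = func() - last
            last = func()
            yield '%.3fs' % elapsed
    return gen()


def checkpointIterator(function, interval=100):
    """Return a generator that calls `function` on every `interval`-th
    .next() invocation; used above to trigger intermediate commits."""
    counter = 0
    while True:
        counter += 1
        if counter % interval == 0:
            function()
        yield None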
def reindex(self, batch=1000, skip=0):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # list to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for boost_values, data in updates.values():
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
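# Usage sketch (illustrative, not from the source): assuming this reindex()
# lives on collective.solr's maintenance browser view, an interrupted full
# reindex can be resumed via the `skip` parameter. The traversal name
# '@@solr-maintenance' is an assumption about how the view is registered.
def resume_reindex(site, already_done, batch=500):
    maintenance = site.unrestrictedTraverse('@@solr-maintenance')
    # commit to Solr every `batch` items, skipping objects already indexed
    maintenance.reindex(batch=batch, skip=already_done)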
def index(self, obj, attributes=None):
    """Index the specified attributes for obj using atomic updates, or all
    of them if `attributes` is `None`.

    Changes to the original method include making sure the uniqueKey is
    part of the attributes, and passing the attributes to the
    self.getData() call to avoid causing Plone to index all fields instead
    of just the necessary ones.
    """
    conn = self.getConnection()
    if conn is not None and indexable(obj):
        schema = self.manager.getSchema()
        if schema is None:
            msg = 'unable to fetch schema, skipping indexing of %r'
            logger.warning(msg, obj)
            return
        uniqueKey = schema.get('uniqueKey', None)
        if uniqueKey is None:
            msg = 'schema is missing unique key, skipping indexing of %r'
            logger.warning(msg, obj)
            return

        if attributes is not None:
            attributes = set(schema.keys()).intersection(attributes)
            if not attributes:
                return
            if uniqueKey not in attributes:
                # The uniqueKey is required in order to identify the
                # document when doing atomic updates.
                attributes.add(uniqueKey)

        data, missing = self.getData(obj, attributes=attributes)
        if not data:
            return  # don't index with no data...
        prepareData(data)

        if data.get(uniqueKey, None) is not None and not missing:
            config = getUtility(ISolrConnectionConfig)
            if config.commit_within:
                data['commitWithin'] = config.commit_within
            try:
                logger.debug('indexing %r (%r)', obj, data)
                conn.add(boost_values=boost_values(obj, data), **data)
            except (SolrException, error):
                logger.exception('exception during indexing %r', obj)
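# Usage sketch (illustrative, not from the source): a partial reindex through
# this processor only sends the named schema fields to Solr; the unique key is
# added automatically so the atomic update can address the existing document.
# How you obtain the `processor` instance depends on your setup.
def update_title_only(processor, obj):
    # Only 'Title' (plus the schema's uniqueKey) ends up in the update;
    # all other stored fields of the Solr document stay untouched.
    processor.index(obj, attributes=['Title'])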
def sync(self, batch=1000):
    """Sync the Solr index with the portal catalog. Records contained in
    the catalog but not in Solr will be indexed and records not contained
    in the catalog will be removed.
    """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
    zodb_conn = self.context._p_jar
    catalog = getToolByName(self.context, "portal_catalog")
    getIndex = catalog._catalog.getIndex
    modified_index = getIndex("modified")
    uid_index = getIndex(key)
    log = self.mklog()
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time

    # get Solr status
    query = "+%s:[* TO *]" % key
    response = conn.search(q=query, rows=MAX_ROWS, fl="%s modified" % key)
    # avoid creating DateTime instances
    simple_unmarshallers = unmarshallers.copy()
    simple_unmarshallers["date"] = parse_date_as_datetime
    flares = SolrResponse(response, simple_unmarshallers)
    response.close()
    solr_results = {}
    solr_uids = set()

    def _utc_convert(value):
        t_tup = value.utctimetuple()
        return (((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24
                + t_tup[3]) * 60 + t_tup[4]

    for flare in flares:
        uid = flare[key]
        solr_uids.add(uid)
        solr_results[uid] = _utc_convert(flare["modified"])

    # get catalog status
    cat_results = {}
    cat_uids = set()
    for uid, rid in uid_index._index.items():
        cat_uids.add(uid)
        cat_results[uid] = rid

    # differences
    index = cat_uids.difference(solr_uids)
    solr_uids.difference_update(cat_uids)
    unindex = solr_uids

    processed = 0
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        msg = ("intermediate commit (%d items processed, "
               "last batch in %s)...\n" % (processed, lap.next()))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)

    # Look up objects
    uid_rid_get = cat_results.get
    rid_path_get = catalog._catalog.paths.get
    catalog_traverse = catalog.unrestrictedTraverse

    def lookup(uid, rid=None, uid_rid_get=uid_rid_get,
               rid_path_get=rid_path_get, catalog_traverse=catalog_traverse):
        if rid is None:
            rid = uid_rid_get(uid)
        if not rid:
            return None
        if not isinstance(rid, int):
            rid = tuple(rid)[0]
        path = rid_path_get(rid)
        if not path:
            return None
        try:
            obj = catalog_traverse(path)
        except AttributeError:
            return None
        return obj

    log('processing %d "unindex" operations next...\n' % len(unindex))
    op = notimeout(lambda uid: conn.delete(id=uid))
    for uid in unindex:
        obj = lookup(uid)
        if obj is None:
            op(uid)
            processed += 1
            cpi.next()
        else:
            log("not unindexing existing object %r.\n" % uid)

    log('processing %d "index" operations next...\n' % len(index))
    op = notimeout(lambda obj: proc.index(obj))
    for uid in index:
        obj = lookup(uid)
        if indexable(obj):
            op(obj)
            processed += 1
            cpi.next()
        else:
            log("not indexing unindexable object %r.\n" % uid)
        if obj is not None:
            obj._p_deactivate()

    log('processing "reindex" operations next...\n')
    op = notimeout(lambda obj: proc.reindex(obj))
    cat_mod_get = modified_index._unindex.get
    solr_mod_get = solr_results.get
    done = unindex.union(index)
    for uid, rid in cat_results.items():
        if uid in done:
            continue
        if isinstance(rid, IITreeSet):
            rid = rid.keys()[0]
        if cat_mod_get(rid) != solr_mod_get(uid):
            obj = lookup(uid, rid=rid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log("not reindexing unindexable object %r.\n" % uid)
            if obj is not None:
                obj._p_deactivate()

    conn.commit()
    log("solr index synced.\n")
    msg = "processed %d object(s) in %s (%s cpu time)."
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
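# Worked example for _utc_convert() above: the conversion collapses a
# modification timestamp to minute resolution (seconds are dropped), which
# appears to match the integer encoding the catalog's "modified" DateIndex
# stores internally, so cat_mod_get(rid) and solr_mod_get(uid) can be
# compared directly without building DateTime objects.
from datetime import datetime

def _utc_convert(value):
    t_tup = value.utctimetuple()
    return (((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24
            + t_tup[3]) * 60 + t_tup[4]

same_minute = (_utc_convert(datetime(2013, 5, 7, 12, 30, 5)) ==
               _utc_convert(datetime(2013, 5, 7, 12, 30, 59)))   # True
next_minute = (_utc_convert(datetime(2013, 5, 7, 12, 30, 0)) !=
               _utc_convert(datetime(2013, 5, 7, 12, 31, 0)))    # True
assert same_minute and next_minute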
def solr_dump_catalog(app, args):
    """Dumps the catalog and metadata contents into a nested directory
    structure full of pickles containing the information in dict format.
    These can be updated by later re-runs and used to import the data via
    the `update_solr` command. You can optionally specify the id of the
    Plone site as the first command line argument.
    """
    _enable_log()
    db = app._p_jar.db()
    from Testing import makerequest
    root = makerequest.makerequest(app)
    site = _get_site(root, args)
    data_dir = _data_dir(site.getId())
    _make_dir(data_dir)

    catalog = site.portal_catalog
    _catalog = catalog._catalog
    catalog_length = len(catalog)
    uids_get = _catalog.uids.get

    conn = _solr_connection()
    schema = conn.getSchema()
    wanted = set(schema.keys())
    # We need the data from path
    wanted.add('path')
    conn.close()
    logger.info('Process %s catalog items' % catalog_length)

    from collective.indexing.indexer import getOwnIndexMethod
    from collective.solr.indexer import indexable
    from collective.solr.utils import findObjects
    from plone.app.folder.nogopip import GopipIndex
    from Products.PluginIndexes.DateIndex.DateIndex import DateIndex
    from Products.PluginIndexes.DateRangeIndex import DateRangeIndex
    from Products.ZCTextIndex import WidCode
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex

    indexes = catalog.indexes()
    indexes.sort()
    gopip_indexes = set()
    for indexname in indexes:
        if indexname in _catalog.schema:
            # There's no need to get metadata from the indexes
            continue
        if indexname not in wanted:
            # skip indexes not present in the Solr schema
            continue
        logger.info('Dumping index: %s' % indexname)
        index = _catalog.getIndex(indexname)
        if isinstance(index, DateRangeIndex.DateRangeIndex):
            # Solr cannot deal with range indexes directly
            continue
        if isinstance(index, ZCTextIndex):
            get_word = index.getLexicon().get_word
            wid_decode = WidCode.decode
            batch = 0
            for i, (uid, value) in enumerate(index.index._docwords.items()):
                batch = _log_batch(db, batch, i)
                words = ' '.join([get_word(w) for w in wid_decode(value)])
                _dump(data_dir, uid, {indexname: words})
        elif isinstance(index, GopipIndex):
            # happens last as it needs a full site traversal
            gopip_indexes.add(indexname)
            continue
        elif not hasattr(index, '_unindex'):
            logger.warn("Unsupported index '%s' without an _unindex."
                        % indexname)
        else:
            date_index = isinstance(index, DateIndex)
            batch = 0
            for i, (uid, value) in enumerate(index._unindex.iteritems()):
                batch = _log_batch(db, batch, i)
                value = _convert_value(indexname, value, date_index)
                if value:
                    _dump(data_dir, uid, {indexname: value})

    # dump metadata
    logger.info('Dumping metadata records')
    batch = 0
    for i, uid in enumerate(_catalog.paths.iterkeys()):
        batch = _log_batch(db, batch, i)
        values = {}
        for k, v in _catalog.getMetadataForRID(uid).iteritems():
            definition = schema.get(k)
            if not definition:
                continue
            class_ = definition['class_']
            date_index = class_ == 'solr.TrieDateField'
            value = _convert_value(k, v, date_index)
            if value is not None:
                values[k] = value
            elif class_ == 'solr.TextField':
                values[k] = ''
        _dump(data_dir, uid, values)

    # deal with GopipIndexes
    batch = 0
    logger.info('Traversing site to dump Gopip index information')
    for i, (path, obj) in enumerate(findObjects(site)):
        batch = _log_batch(db, batch, i)
        if not indexable(obj):
            continue
        elif getOwnIndexMethod(obj, 'indexObject') is not None:
            continue
        parent = aq_parent(obj)
        uid = uids_get('/'.join(obj.getPhysicalPath()), None)
        if uid is None:
            continue
        if hasattr(aq_base(parent), 'getObjectPosition'):
            pos = parent.getObjectPosition(path.split('/')[-1])
            data = {}
            for name in gopip_indexes:
                data[name] = pos
            _dump(data_dir, uid, data)
        else:
            data = {}
            for name in gopip_indexes:
                data[name] = 0
            _dump(data_dir, uid, data)
        if not getattr(aq_base(obj), 'isPrincipiaFolderish', False):
            # Remove non-folders from the cache immediately as we no longer
            # need them
            obj._p_deactivate()
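# The `_dump` helper used above is not shown in this excerpt. The sketch
# below is a hypothetical stand-in consistent with the docstring (a nested
# directory of per-record pickles whose dicts are merged on later re-runs);
# the real helper may differ in layout and naming.
import cPickle as pickle
import os


def _dump(data_dir, rid, data):
    # fan records out into subdirectories to keep directory sizes manageable
    sub_dir = os.path.join(data_dir, str(rid)[-2:])
    if not os.path.isdir(sub_dir):
        os.makedirs(sub_dir)
    file_name = os.path.join(sub_dir, '%s.pickle' % rid)
    existing = {}
    if os.path.exists(file_name):
        with open(file_name, 'rb') as fp:
            existing = pickle.load(fp)
    existing.update(data)  # later re-runs update previously dumped values
    with open(file_name, 'wb') as fp:
        pickle.dump(existing, fp, pickle.HIGHEST_PROTOCOL)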
def reindex(self, batch=1000, skip=0, idxs=[]):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them """
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = FtwSolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # list to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for boost_values, data in updates.values():
            # Only update specified fields by using atomic updates
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue

            attributes = None
            if atomic:
                attributes = idxs
                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)

            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)
            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
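# Usage sketch (illustrative, not from the source): with `idxs` given, only
# those fields are sent to Solr as atomic updates. Note that the method
# appends the unique key to the very list passed in (`attributes = idxs`
# aliases it), so pass a fresh list on each call. `maintenance` is the
# maintenance view from the earlier sketch.
def rebuild_selected_fields(maintenance):
    maintenance.reindex(idxs=['modified', 'Title'])   # atomic, partial update
    maintenance.reindex()                             # full rebuild, all fields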