def sync(self, batch=1000):
    """Sync the Solr index with the portal catalog.

    Records contained in the catalog but not in Solr will be indexed
    and records not contained in the catalog will be removed.

    :param batch: number of processed items between intermediate
        commits / ZODB cache garbage collections (default 1000).

    NOTE(review): this code is Python 2 era — it calls ``.next()`` on
    generators (``timer()``, ``checkpointIterator``) and uses
    ``time.clock`` style timing; confirm before running under Python 3.
    """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    # uniqueKey is the Solr schema field that maps to catalog UIDs
    key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
    zodb_conn = self.context._p_jar
    catalog = getToolByName(self.context, "portal_catalog")
    getIndex = catalog._catalog.getIndex
    modified_index = getIndex("modified")
    uid_index = getIndex(key)
    log = self.mklog()
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    # get Solr status: fetch uid + modified for every document in Solr
    query = "+%s:[* TO *]" % key
    response = conn.search(q=query, rows=MAX_ROWS, fl="%s modified" % key)
    # avoid creating DateTime instances
    simple_unmarshallers = unmarshallers.copy()
    simple_unmarshallers["date"] = parse_date_as_datetime
    flares = SolrResponse(response, simple_unmarshallers)
    response.close()
    solr_results = {}
    solr_uids = set()

    def _utc_convert(value):
        # Collapse a datetime into a single comparable integer at
        # minute resolution (seconds are deliberately ignored).
        # Presumably this mirrors the encoding used internally by the
        # catalog's "modified" DateIndex, so the two sides compare
        # equal when unchanged — TODO confirm against DateIndex.
        t_tup = value.utctimetuple()
        return (((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 + t_tup[3]) * 60 + t_tup[4]

    for flare in flares:
        uid = flare[key]
        solr_uids.add(uid)
        solr_results[uid] = _utc_convert(flare["modified"])
    # get catalog status: map every catalog UID to its record id
    cat_results = {}
    cat_uids = set()
    for uid, rid in uid_index._index.items():
        cat_uids.add(uid)
        cat_results[uid] = rid
    # differences:
    #   index   = in catalog but missing from Solr -> must be indexed
    #   unindex = in Solr but gone from catalog    -> must be removed
    index = cat_uids.difference(solr_uids)
    solr_uids.difference_update(cat_uids)
    unindex = solr_uids
    processed = 0
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        # Periodic flush + ZODB cache GC to bound memory usage during
        # long runs; reads `processed` from the enclosing scope.
        msg = "intermediate commit (%d items processed, " "last batch in %s)...\n" % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    # Look up objects (bound as defaults below for faster LOAD_FAST
    # access inside the hot loops)
    uid_rid_get = cat_results.get
    rid_path_get = catalog._catalog.paths.get
    catalog_traverse = catalog.unrestrictedTraverse

    def lookup(uid, rid=None, uid_rid_get=uid_rid_get, rid_path_get=rid_path_get, catalog_traverse=catalog_traverse):
        # Resolve a catalog UID (or known rid) to the actual content
        # object; returns None when the object cannot be reached.
        if rid is None:
            rid = uid_rid_get(uid)
        if not rid:
            return None
        if not isinstance(rid, int):
            # rid may be a set-like (e.g. a treeset); take its first entry
            rid = tuple(rid)[0]
        path = rid_path_get(rid)
        if not path:
            return None
        try:
            obj = catalog_traverse(path)
        except AttributeError:
            return None
        return obj

    log('processing %d "unindex" operations next...\n' % len(unindex))
    op = notimeout(lambda uid: conn.delete(id=uid))
    for uid in unindex:
        obj = lookup(uid)
        if obj is None:
            # object really is gone -> remove its Solr document
            op(uid)
            processed += 1
            cpi.next()
        else:
            log("not unindexing existing object %r.\n" % uid)
    log('processing %d "index" operations next...\n' % len(index))
    op = notimeout(lambda obj: proc.index(obj))
    for uid in index:
        obj = lookup(uid)
        if indexable(obj):
            op(obj)
            processed += 1
            cpi.next()
        else:
            log("not indexing unindexable object %r.\n" % uid)
        if obj is not None:
            # release the ghosted object from the ZODB cache
            obj._p_deactivate()
    log('processing "reindex" operations next...\n')
    op = notimeout(lambda obj: proc.reindex(obj))
    cat_mod_get = modified_index._unindex.get
    solr_mod_get = solr_results.get
    done = unindex.union(index)
    # anything present on both sides is reindexed only when the
    # modification values differ
    for uid, rid in cat_results.items():
        if uid in done:
            continue
        if isinstance(rid, IITreeSet):
            rid = rid.keys()[0]
        if cat_mod_get(rid) != solr_mod_get(uid):
            obj = lookup(uid, rid=rid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log("not reindexing unindexable object %r.\n" % uid)
            if obj is not None:
                obj._p_deactivate()
    conn.commit()
    log("solr index synced.\n")
    msg = "processed %d object(s) in %s (%s cpu time)."
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
def sync(self, batch=1000):
    """Sync the Solr index with the portal catalog.

    Records contained in the catalog but not in Solr will be indexed
    and records not contained in the catalog will be removed.

    :param batch: number of processed items between intermediate
        commits / ZODB cache garbage collections (default 1000).

    NOTE(review): Python 2 era code (``.next()`` on generators,
    ``clock``-based cpu timer) — verify before porting to Python 3.
    """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    # uniqueKey: the Solr schema field holding the catalog UID
    key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
    zodb_conn = self.context._p_jar
    catalog = getToolByName(self.context, 'portal_catalog')
    getIndex = catalog._catalog.getIndex
    modified_index = getIndex('modified')
    uid_index = getIndex(key)
    log = self.mklog()
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    # get Solr status: uid + modified for every indexed document
    query = '+%s:[* TO *]' % key
    response = conn.search(q=query, rows=MAX_ROWS, fl='%s modified' % key)
    # avoid creating DateTime instances
    simple_unmarshallers = unmarshallers.copy()
    simple_unmarshallers['date'] = parse_date_as_datetime
    flares = SolrResponse(response, simple_unmarshallers)
    response.close()
    solr_results = {}
    solr_uids = set()

    def _utc_convert(value):
        # Encode a datetime as one comparable integer at minute
        # resolution (seconds dropped). Presumably matches the value
        # encoding of the catalog's "modified" DateIndex so unchanged
        # objects compare equal — TODO confirm.
        t_tup = value.utctimetuple()
        return ((((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 + t_tup[3]) * 60 + t_tup[4])

    for flare in flares:
        uid = flare[key]
        solr_uids.add(uid)
        solr_results[uid] = _utc_convert(flare['modified'])
    # get catalog status: UID -> record id for every catalog entry
    cat_results = {}
    cat_uids = set()
    for uid, rid in uid_index._index.items():
        cat_uids.add(uid)
        cat_results[uid] = rid
    # differences:
    #   index   = catalog-only UIDs -> need indexing
    #   unindex = Solr-only UIDs    -> need removal
    index = cat_uids.difference(solr_uids)
    solr_uids.difference_update(cat_uids)
    unindex = solr_uids
    processed = 0
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        # periodic flush + ZODB cache GC; reads `processed` from the
        # enclosing scope
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    # Look up objects (bound as argument defaults for fast access in
    # the loops below)
    uid_rid_get = cat_results.get
    rid_path_get = catalog._catalog.paths.get
    catalog_traverse = catalog.unrestrictedTraverse

    def lookup(uid, rid=None, uid_rid_get=uid_rid_get, rid_path_get=rid_path_get, catalog_traverse=catalog_traverse):
        # Resolve a UID (or known rid) to its content object, or None.
        if rid is None:
            rid = uid_rid_get(uid)
        if not rid:
            return None
        if not isinstance(rid, int):
            # rid may be a set-like container; use its first entry
            rid = tuple(rid)[0]
        path = rid_path_get(rid)
        if not path:
            return None
        try:
            obj = catalog_traverse(path)
        except AttributeError:
            return None
        return obj

    log('processing %d "unindex" operations next...\n' % len(unindex))
    op = notimeout(lambda uid: conn.delete(id=uid))
    for uid in unindex:
        obj = lookup(uid)
        if obj is None:
            # really gone from the site -> drop the Solr document
            op(uid)
            processed += 1
            cpi.next()
        else:
            log('not unindexing existing object %r.\n' % uid)
    log('processing %d "index" operations next...\n' % len(index))
    op = notimeout(lambda obj: proc.index(obj))
    for uid in index:
        obj = lookup(uid)
        if indexable(obj):
            op(obj)
            processed += 1
            cpi.next()
        else:
            log('not indexing unindexable object %r.\n' % uid)
        if obj is not None:
            # ghost the object again to keep memory bounded
            obj._p_deactivate()
    log('processing "reindex" operations next...\n')
    op = notimeout(lambda obj: proc.reindex(obj))
    cat_mod_get = modified_index._unindex.get
    solr_mod_get = solr_results.get
    done = unindex.union(index)
    # objects on both sides: reindex only when modification values differ
    for uid, rid in cat_results.items():
        if uid in done:
            continue
        if isinstance(rid, IITreeSet):
            rid = rid.keys()[0]
        if cat_mod_get(rid) != solr_mod_get(uid):
            obj = lookup(uid, rid=rid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log('not reindexing unindexable object %r.\n' % uid)
            if obj is not None:
                obj._p_deactivate()
    conn.commit()
    log('solr index synced.\n')
    msg = 'processed %d object(s) in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
class QueueIndexerTests(TestCase):
    """Tests for SolrIndexProcessor using a faked Solr HTTP connection.

    Each test primes the connection with a canned response via
    ``fakehttp`` and then asserts on the XML request the processor
    produced (captured in ``output``).
    """

    def setUp(self):
        # register a connection config and set up a manager whose HTTP
        # layer is faked; the schema is read once and cached
        provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData('schema.xml'))  # fake schema response
        self.mngr.getSchema()  # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)

    def tearDown(self):
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # prepareData escapes ':' in security tokens as '$'
        data = {'allowedRolesAndUsers': ['user:test_user_1_', 'user:portal_owner']}
        prepareData(data)
        self.assertEqual(data, {'allowedRolesAndUsers': ['user$test_user_1_', 'user$portal_owner']})

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {'Language': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Language': ['en', 'any']})
        data = {'Language': ''}
        prepareData(data)
        self.assertEqual(data, {'Language': 'any'})
        # for other indices this shouldn't happen...
        data = {'Foo': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Foo': ['en', '']})

    def testIndexObject(self):
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response
        self.proc.index(Foo(id='500', name='python test doc'))  # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testIndexAccessorRaises(self):
        # an attribute accessor that raises must not break indexing
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response

        def brokenfunc():
            raise ValueError
        self.proc.index(Foo(id='500', name='python test doc', text=brokenfunc))  # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testPartialIndexObject(self):
        foo = Foo(id='500', name='foo', price=42.0)
        # first index all attributes...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.assert_(str(output).find('<field name="price">42.0</field>') > 0, '"price" data not found') if False else None  # NOTE(review): placeholder never executes
        self.proc.index(foo)
        self.assert_(str(output).find('<field name="price">42.0</field>') > 0, '"price" data not found')
        # then only a subset...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=['id', 'name'])
        output = str(output)
        self.assert_(output.find('<field name="name">foo</field>') > 0, '"name" data not found')
        # at this point we'd normally check for a partial update:
        # self.assertEqual(output.find('price'), -1, '"price" data found?')
        # self.assertEqual(output.find('42'), -1, '"price" data found?')
        # however, until SOLR-139 has been implemented (re)index operations
        # always need to provide data for all attributes in the schema...
        self.assert_(output.find('<field name="price">42.0</field>') > 0, '"price" data not found')

    def testDateIndexing(self):
        foo = Foo(id='zeidler', name='andi', cat='nerd', timestamp=DateTime('May 11 1972 03:45 GMT'))
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response
        self.proc.index(foo)
        required = '<field name="timestamp">1972-05-11T03:45:00.000Z</field>'
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        # NOTE(review): `02` is a Python 2 octal literal (== 2); it is a
        # syntax error under Python 3 — later versions use 0o2.
        foo = Foo(id='gerken', name='patrick', cat='nerd', timestamp=datetime(1980, 9, 29, 14, 02))
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response
        self.proc.index(foo)
        required = '<field name="timestamp">1980-09-29T14:02:00.000Z</field>'
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response
        self.proc.reindex(Foo(id='500', name='python test doc'))  # reindexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testUnindexObject(self):
        response = getData('delete_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake response
        self.proc.unindex(Foo(id='500', name='python test doc'))  # unindexing sends data
        self.assertEqual(str(output), getData('delete_request.txt'))

    def testCommit(self):
        response = getData('commit_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake response
        self.proc.commit()  # committing sends data
        self.assertEqual(str(output), getData('commit_request.txt'))

    def testNoIndexingWithoutAllRequiredFields(self):
        # a document missing required schema fields sends nothing
        response = getData('dummy_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)  # fake add response
        self.proc.index(Foo(id='500'))  # indexing sends data
        self.assertEqual(str(output), '')

    def testIndexerMethods(self):
        class Bar(Foo):
            def cat(self):
                return 'nerd'

            def price(self):
                raise AttributeError('price')
        foo = Bar(id='500', name='foo')
        # raising the exception should keep the attribute from being indexed
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(output.find('<field name="cat">nerd</field>') > 0, '"cat" data not found')
        self.assertEqual(output.find('price'), -1, '"price" data found?')
class QueueIndexerTests(TestCase):
    """Tests for SolrIndexProcessor against a faked Solr connection.

    ``fakehttp`` primes the connection with a canned response and
    captures the request the processor writes; assertions inspect the
    captured XML in ``output``.
    """

    def setUp(self):
        # register config + manager with a faked HTTP layer; the Solr
        # schema is fetched once and cached
        provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData('schema.xml'))  # fake schema response
        self.mngr.getSchema()  # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)

    def tearDown(self):
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # prepareData escapes ':' in security tokens as '$'
        data = {'allowedRolesAndUsers': [
            'user:test_user_1_', 'user:portal_owner']}
        prepareData(data)
        self.assertEqual(
            data,
            {
                'allowedRolesAndUsers': [
                    'user$test_user_1_',
                    'user$portal_owner'
                ]
            }
        )

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {'Language': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Language': ['en', 'any']})
        data = {'Language': ''}
        prepareData(data)
        self.assertEqual(data, {'Language': 'any'})
        # for other indices this shouldn't happen...
        data = {'Foo': ['en', '']}
        prepareData(data)
        self.assertEqual(data, {'Foo': ['en', '']})

    def testIndexObject(self):
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id='500', name='python test doc'))
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testIndexAccessorRaises(self):
        # an accessor that raises must not break the overall indexing
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)

        def brokenfunc():
            raise ValueError
        self.proc.index(Foo(id='500', name='python test doc',
                            text=brokenfunc))  # indexing sends data
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testPartialIndexObject(self):
        foo = Foo(id='500', name='foo', price=42.0)
        # first index all attributes...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        self.assert_(str(output).find(
            '<field name="price">42.0</field>') > 0,
            '"price" data not found')
        # then only a subset...
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=['id', 'name'])
        output = str(output)
        self.assert_(
            output.find('<field name="name">foo</field>') > 0,
            '"name" data not found'
        )
        # at this point we'd normally check for a partial update:
        # self.assertEqual(output.find('price'), -1, '"price" data found?')
        # self.assertEqual(output.find('42'), -1, '"price" data found?')
        # however, until SOLR-139 has been implemented (re)index operations
        # always need to provide data for all attributes in the schema...
        self.assert_(
            output.find('<field name="price">42.0</field>') > 0,
            '"price" data not found'
        )

    def testDateIndexing(self):
        foo = Foo(id='zeidler', name='andi', cat='nerd',
                  timestamp=DateTime('May 11 1972 03:45 GMT'))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = '<field name="timestamp">1972-05-11T03:45:00.000Z</field>'
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        # NOTE(review): `02` is a Python 2 octal literal (== 2);
        # invalid syntax under Python 3 (later versions use 0o2)
        foo = Foo(id='gerken', name='patrick', cat='nerd',
                  timestamp=datetime(1980, 9, 29, 14, 02))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = '<field name="timestamp">1980-09-29T14:02:00.000Z</field>'
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDate(self):
        # NOTE(review): `05` is a Python 2 octal literal (== 5);
        # invalid syntax under Python 3 (later versions use 0o5)
        foo = Foo(id='brand', name='jan-carel', cat='nerd',
                  timestamp=date(1982, 8, 05))
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = '<field name="timestamp">1982-08-05T00:00:00.000Z</field>'
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData('add_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # reindexing sends data
        self.proc.reindex(Foo(id='500', name='python test doc'))
        self.assertEqual(sortFields(str(output)), getData('add_request.txt'))

    def testUnindexObject(self):
        response = getData('delete_response.txt')
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # unindexing sends data
        self.proc.unindex(Foo(id='500', name='python test doc'))
        self.assertEqual(str(output), getData('delete_request.txt'))

    def testCommit(self):
        response = getData('commit_response.txt')
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # committing sends data
        self.proc.commit()
        self.assertEqual(str(output), getData('commit_request.txt'))

    def testNoIndexingWithoutAllRequiredFields(self):
        # a document missing required schema fields sends nothing
        response = getData('dummy_response.txt')
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id='500'))
        self.assertEqual(str(output), '')

    def testIndexerMethods(self):
        class Bar(Foo):
            def cat(self):
                return 'nerd'

            def price(self):
                raise AttributeError('price')
        foo = Bar(id='500', name='foo')
        # raising the exception should keep the attribute from being indexed
        response = getData('add_response.txt')
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(
            output.find('<field name="cat">nerd</field>') > 0,
            '"cat" data not found'
        )
        self.assertEqual(output.find('price'), -1, '"price" data found?')
class QueueIndexerTests(TestCase):
    """Tests for SolrIndexProcessor with atomic updates enabled.

    Uses a mock-registry layer and a faked Solr HTTP connection;
    ``fakehttp`` captures the request XML in ``output`` for assertion.
    Fields are expected to carry ``update="set"`` (Solr atomic update).
    """

    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

    def setUp(self):
        # manager with a faked HTTP layer; schema read once and cached;
        # atomic updates switched on for all tests in this class
        self.mngr = SolrConnectionManager()
        self.mngr.setHost(active=True)
        conn = self.mngr.getConnection()
        fakehttp(conn, getData("schema.xml"))  # fake schema response
        self.mngr.getSchema()  # read and cache the schema
        self.proc = SolrIndexProcessor(self.mngr)
        config = getConfig()
        config.atomic_updates = True

    def tearDown(self):
        self.mngr.closeConnection()
        self.mngr.setHost(active=False)

    def testPrepareData(self):
        # prepareData escapes ':' in security tokens as '$'
        data = {"allowedRolesAndUsers": ["user:test_user_1_", "user:portal_owner"]}
        prepareData(data)
        self.assertEqual(
            data, {"allowedRolesAndUsers": ["user$test_user_1_", "user$portal_owner"]}
        )

    def testLanguageParameterHandling(self):
        # empty strings are replaced...
        data = {"Language": ["en", ""]}
        prepareData(data)
        self.assertEqual(data, {"Language": ["en", "any"]})
        data = {"Language": ""}
        prepareData(data)
        self.assertEqual(data, {"Language": "any"})
        # for other indices this shouldn't happen...
        data = {"Foo": ["en", ""]}
        prepareData(data)
        self.assertEqual(data, {"Foo": ["en", ""]})

    def testIndexObject(self):
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id="500", name="python test doc"))
        # Python 3: sortFields works on bytes; the fixture file has a
        # trailing newline that the captured request does not
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testIndexAccessorRaises(self):
        # an accessor that raises must not break the overall indexing
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)

        def brokenfunc():
            raise ValueError

        self.proc.index(
            Foo(id="500", name="python test doc", text=brokenfunc)
        )  # indexing sends data
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testPartialIndexObject(self):
        foo = Foo(id="500", name="foo", price=42.0)
        # first index all attributes...
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        self.assert_(
            str(output).find('<field name="price" update="set">42.0</field>') > 0,
            '"price" data not found',
        )
        # then only a subset...
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo, attributes=["id", "name"])
        output = str(output)
        self.assert_(
            output.find('<field name="name" update="set">foo</field>') > 0,
            '"name" data not found',
        )
        # at this point we'd normally check for a partial update:
        # (with atomic updates the omitted fields really are omitted)
        self.assertEqual(output.find("price"), -1, '"price" data found?')
        self.assertEqual(output.find("42"), -1, '"price" data found?')

    def testDateIndexing(self):
        foo = Foo(
            id="zeidler",
            name="andi",
            cat="nerd",
            timestamp=DateTime("May 11 1972 03:45:59.999730 GMT"),
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        # microseconds are truncated to millisecond precision
        required = (
            '<field name="timestamp" update="set">' "1972-05-11T03:45:59.999Z</field>"
        )
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDateTime(self):
        # 0o2 == 2 (explicit octal kept from the py2 version)
        foo = Foo(
            id="gerken",
            name="patrick",
            cat="nerd",
            timestamp=datetime(1980, 9, 29, 14, 0o2, 59, 999730),
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = (
            '<field name="timestamp" update="set">' "1980-09-29T14:02:59.999Z</field>"
        )
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testDateIndexingWithPythonDate(self):
        # 0o5 == 5 (explicit octal kept from the py2 version)
        foo = Foo(
            id="brand", name="jan-carel", cat="nerd", timestamp=date(1982, 8, 0o5)
        )
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        required = (
            '<field name="timestamp" update="set">' "1982-08-05T00:00:00.000Z</field>"
        )
        self.assert_(str(output).find(required) > 0, '"date" data not found')

    def testReindexObject(self):
        response = getData("add_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # reindexing sends data
        self.proc.reindex(Foo(id="500", name="python test doc"))
        self.assertEqual(
            sortFields(str(output).encode("utf-8")),
            getData("add_request.txt").rstrip(b"\n"),
        )

    def testUnindexObject(self):
        response = getData("delete_response.txt")
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # unindexing sends data
        self.proc.unindex(Foo(id="500", name="python test doc"))
        self.assertEqual(
            str(output), getData("delete_request.txt").decode("utf-8").rstrip("\n")
        )

    def testCommit(self):
        response = getData("commit_response.txt")
        # fake response
        output = fakehttp(self.mngr.getConnection(), response)
        # committing sends data
        self.proc.commit()
        self.assertEqual(
            str(output), getData("commit_request.txt").decode("utf-8").rstrip("\n")
        )

    def testNoIndexingWithoutAllRequiredFields(self):
        # a document missing required schema fields sends nothing
        response = getData("dummy_response.txt")
        # fake add response
        output = fakehttp(self.mngr.getConnection(), response)
        # indexing sends data
        self.proc.index(Foo(id="500"))
        self.assertEqual(str(output), "")

    def testIndexerMethods(self):
        class Bar(Foo):
            def cat(self):
                return "nerd"

            def price(self):
                raise AttributeError("price")

        foo = Bar(id="500", name="foo")
        # raising the exception should keep the attribute from being indexed
        response = getData("add_response.txt")
        output = fakehttp(self.mngr.getConnection(), response)
        self.proc.index(foo)
        output = str(output)
        self.assertTrue(
            output.find('<field name="cat" update="set">nerd</field>') > 0,
            '"cat" data not found',
        )
        self.assertEqual(output.find("price"), -1, '"price" data found?')