def test_redis_queue_purging(self):
    '''
    Check that purging the harvest queues leaves unrelated Redis keys alone.

    The test is skipped unless ``ckan.harvest.mq.type`` is ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()

    unrelated_key = 'ckanext-harvest:some-random-key'
    conn = queue.get_connection()
    try:
        # Sentinel key which the purge must leave in place.
        conn.set(unrelated_key, 'foobar')

        # Queue two fake gather jobs followed by two fake fetch jobs.
        gather_pub = queue.get_gather_publisher()
        for _ in range(2):
            gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_pub = queue.get_fetch_publisher()
        for _ in range(2):
            fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        baseline_size = conn.dbsize()

        # Consume one message from each queue; the assertion below expects
        # this to add keys to the database.
        gather_con = queue.get_gather_consumer()
        next(gather_con.consume(queue.get_gather_queue_name()))
        fetch_con = queue.get_fetch_consumer()
        next(fetch_con.consume(queue.get_fetch_queue_name()))
        assert conn.dbsize() > baseline_size

        queue.purge_queues()

        # The sentinel survives, the extra keys are gone and both queue
        # lists are empty.
        assert conn.get(unrelated_key) == 'foobar'
        assert conn.dbsize() == baseline_size
        for get_routing_key in (queue.get_gather_routing_key,
                                queue.get_fetch_routing_key):
            assert conn.llen(get_routing_key()) == 0
    finally:
        conn.delete(unrelated_key)
def test_redis_queue_purging(self):
    '''
    Check that purging the harvest queues leaves unrelated Redis keys alone.

    Raises ``SkipTest`` unless ``ckan.harvest.mq.type`` is ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        raise SkipTest()

    unrelated_key = 'ckanext-harvest:some-random-key'
    conn = queue.get_connection()
    try:
        # Sentinel key which the purge must leave in place.
        conn.set(unrelated_key, 'foobar')

        # Queue two fake gather jobs followed by two fake fetch jobs.
        gather_pub = queue.get_gather_publisher()
        for _ in range(2):
            gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_pub = queue.get_fetch_publisher()
        for _ in range(2):
            fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        baseline_size = conn.dbsize()

        # Consume one message from each queue; the check below expects
        # this to add keys to the database.
        gather_con = queue.get_gather_consumer()
        next(gather_con.consume(queue.get_gather_queue_name()))
        fetch_con = queue.get_fetch_consumer()
        next(fetch_con.consume(queue.get_fetch_queue_name()))
        ok_(conn.dbsize() > baseline_size)

        queue.purge_queues()

        # The sentinel survives, the extra keys are gone and both queue
        # lists are empty.
        assert_equal(conn.get(unrelated_key), 'foobar')
        assert_equal(conn.dbsize(), baseline_size)
        for get_routing_key in (queue.get_gather_routing_key,
                                queue.get_fetch_routing_key):
            assert_equal(conn.llen(get_routing_key()), 0)
    finally:
        conn.delete(unrelated_key)
def test_01_basic_harvester(self):
    '''
    Run two harvest jobs against a single 'test' source and check the stats.

    The first job is expected to add three datasets; the second one is
    expected to update two of them and delete one.
    '''
    # Make sure the queues/exchanges exist and start out empty.
    # NOTE(review): the queues purged here are named 'ckan.harvest.test.*'
    # while messages are read below from 'ckan.harvest.gather' and
    # 'ckan.harvest.fetch' -- confirm that this difference is intended.
    gather_consumer = queue.get_consumer('ckan.harvest.test.gather',
                                         queue.get_gather_routing_key())
    fetch_consumer = queue.get_consumer('ckan.harvest.test.fetch',
                                        queue.get_fetch_routing_key())
    gather_consumer.queue_purge(queue='ckan.harvest.test.gather')
    fetch_consumer.queue_purge(queue='ckan.harvest.test.fetch')

    site_user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    context = {
        'model': model,
        'session': model.Session,
        'user': site_user['name'],
        'api_version': 3,
        'ignore_auth': True,
    }

    source = logic.get_action('harvest_source_create')(context, {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    })
    assert source['source_type'] == 'test', source
    assert source['url'] == 'basic_test', source
    source_id = source['id']

    # ---------------------------- First run ----------------------------
    job = logic.get_action('harvest_job_create')(
        context, {'source_id': source_id, 'run': True})
    job_id = job['id']
    assert job['source_id'] == source_id, job
    assert job['status'] == u'Running'
    shown_job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert shown_job['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = gather_consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(gather_consumer, *reply)

    harvest_objects = model.Session.query(HarvestObject).all()
    assert len(harvest_objects) == 3
    for harvest_object in harvest_objects:
        assert harvest_object.state == 'WAITING'
    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # One fetch per harvest object.
    for _ in range(3):
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(fetch_consumer, *reply)

    dataset_count = model.Session.query(model.Package).filter(
        model.Package.type == 'dataset').count()
    assert dataset_count == 3

    current_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert_equal(len(current_objects), 3)
    for harvest_object in current_objects:
        assert_equal(harvest_object.state, 'COMPLETE')
        assert_equal(harvest_object.report_status, 'added')

    # Fire the run again to check that the job is set to Finished.
    logic.get_action('harvest_jobs_run')(context, {'source_id': source_id})
    job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert_equal(job['status'], u'Finished')
    first_run_stats = {'added': 3, 'updated': 0, 'not modified': 0,
                       'errored': 0, 'deleted': 0}
    assert_equal(job['stats'], first_run_stats)

    source_status = logic.get_action('harvest_source_show')(
        context, {'id': source_id})['status']
    assert_equal(source_status['last_job']['stats'], first_run_stats)
    assert_equal(source_status['total_datasets'], 3)
    assert_equal(source_status['job_count'], 1)

    # ---------------------------- Second run ---------------------------
    job = logic.get_action('harvest_job_create')(
        context, {'source_id': source_id, 'run': True})
    job_id = job['id']
    shown_job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert shown_job['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = gather_consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(gather_consumer, *reply)

    harvest_objects = model.Session.query(HarvestObject).all()
    assert len(harvest_objects) == 6

    for _ in range(3):
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(fetch_consumer, *reply)

    dataset_count = model.Session.query(model.Package).filter(
        model.Package.type == 'dataset').count()
    assert_equal(dataset_count, 3)

    # Harvest object totals per report status, across both runs.
    for report_status, expected in (('added', 3), ('updated', 2), ('deleted', 1)):
        matching = model.Session.query(HarvestObject).filter_by(
            report_status=report_status).all()
        assert_equal(len(matching), expected)

    # Run again so that the job gets marked as finished.
    logic.get_action('harvest_jobs_run')(context, {'source_id': source_id})
    job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    second_run_stats = {'added': 0, 'updated': 2, 'not modified': 0,
                        'errored': 0, 'deleted': 1}
    assert_equal(job['stats'], second_run_stats)

    context['detailed'] = True
    source_status = logic.get_action('harvest_source_show')(
        context, {'id': source_id})['status']
    assert_equal(source_status['last_job']['stats'], second_run_stats)
    assert_equal(source_status['total_datasets'], 2)
    assert_equal(source_status['job_count'], 2)
def test_01_basic_harvester(self):
    '''
    Run two harvest jobs against a single 'test' source and check the stats.

    The first job is expected to add three datasets; the second one is
    expected to update two of them and delete one.
    '''
    # Make sure the queues/exchanges exist and start out empty.
    # NOTE(review): the queues purged here are named 'ckan.harvest.test.*'
    # while messages are read below from 'ckan.harvest.gather' and
    # 'ckan.harvest.fetch' -- confirm that this difference is intended.
    gather_consumer = queue.get_consumer('ckan.harvest.test.gather',
                                         queue.get_gather_routing_key())
    fetch_consumer = queue.get_consumer('ckan.harvest.test.fetch',
                                        queue.get_fetch_routing_key())
    gather_consumer.queue_purge(queue='ckan.harvest.test.gather')
    fetch_consumer.queue_purge(queue='ckan.harvest.test.fetch')

    site_user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    context = {
        'model': model,
        'session': model.Session,
        'user': site_user['name'],
        'api_version': 3,
        'ignore_auth': True,
    }

    source = logic.get_action('harvest_source_create')(context, {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    })
    assert source['source_type'] == 'test', source
    assert source['url'] == 'basic_test', source
    source_id = source['id']

    # ---------------------------- First run ----------------------------
    job = logic.get_action('harvest_job_create')(
        context, {'source_id': source_id, 'run': True})
    job_id = job['id']
    assert job['source_id'] == source_id, job
    assert job['status'] == u'Running'
    shown_job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert shown_job['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = gather_consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(gather_consumer, *reply)

    harvest_objects = model.Session.query(HarvestObject).all()
    assert len(harvest_objects) == 3
    for harvest_object in harvest_objects:
        assert harvest_object.state == 'WAITING'
    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # One fetch per harvest object.
    for _ in range(3):
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(fetch_consumer, *reply)

    dataset_count = model.Session.query(model.Package).filter(
        model.Package.type == 'dataset').count()
    assert dataset_count == 3

    current_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert_equal(len(current_objects), 3)
    for harvest_object in current_objects:
        assert_equal(harvest_object.state, 'COMPLETE')
        assert_equal(harvest_object.report_status, 'added')

    # Fire the run again to check that the job is set to Finished.
    logic.get_action('harvest_jobs_run')(context, {'source_id': source_id})
    job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert_equal(job['status'], u'Finished')
    first_run_stats = {'added': 3, 'updated': 0, 'not modified': 0,
                       'errors': 0, 'deleted': 0}
    assert_equal(job['stats'], first_run_stats)

    source_status = logic.get_action('harvest_source_show')(
        context, {'id': source_id})['status']
    assert_equal(source_status['last_job']['stats'], first_run_stats)
    assert_equal(source_status['total_datasets'], 3)
    assert_equal(source_status['job_count'], 1)

    # ---------------------------- Second run ---------------------------
    job = logic.get_action('harvest_job_create')(
        context, {'source_id': source_id, 'run': True})
    job_id = job['id']
    shown_job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert shown_job['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = gather_consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(gather_consumer, *reply)

    harvest_objects = model.Session.query(HarvestObject).all()
    assert len(harvest_objects) == 6

    for _ in range(3):
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(fetch_consumer, *reply)

    dataset_count = model.Session.query(model.Package).filter(
        model.Package.type == 'dataset').count()
    assert_equal(dataset_count, 3)

    # Harvest object totals per report status, across both runs.
    for report_status, expected in (('added', 3), ('updated', 2), ('deleted', 1)):
        matching = model.Session.query(HarvestObject).filter_by(
            report_status=report_status).all()
        assert_equal(len(matching), expected)

    # Run again so that the job gets marked as finished.
    logic.get_action('harvest_jobs_run')(context, {'source_id': source_id})
    job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    second_run_stats = {'added': 0, 'updated': 2, 'not modified': 0,
                        'errors': 0, 'deleted': 1}
    assert_equal(job['stats'], second_run_stats)

    context['detailed'] = True
    source_status = logic.get_action('harvest_source_show')(
        context, {'id': source_id})['status']
    assert_equal(source_status['last_job']['stats'], second_run_stats)
    assert_equal(source_status['total_datasets'], 2)
    assert_equal(source_status['job_count'], 2)
def test_resubmit_objects(self):
    '''
    Test that only the harvest objects which are missing from the redis
    fetch queue get re-submitted.

    The test is skipped unless ``ckan.harvest.mq.type`` is ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()

    # Start from an empty redis db so that no old elements interfere.
    conn = queue.get_connection()
    fetch_key = queue.get_fetch_routing_key()
    conn.flushdb()
    try:
        # Make sure the queues/exchanges exist and start out empty.
        gather_consumer = queue.get_gather_consumer()
        fetch_consumer = queue.get_fetch_consumer()
        gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
        fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

        site_user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {})
        context = {
            'model': model,
            'session': model.Session,
            'user': site_user['name'],
            'api_version': 3,
            'ignore_auth': True,
        }

        # The returned source and job id are not needed by this test.
        _harvest_source, _job_id = \
            self._create_harvest_job_and_finish_gather_stage(gather_consumer, context)

        assert conn.llen(fetch_key) == 3

        # Run the fetch stage only once, i.e. for the first harvest object.
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(fetch_consumer, *reply)

        dataset_count = model.Session.query(model.Package).filter(
            model.Package.type == 'dataset').count()
        assert dataset_count == 1

        harvest_objects = model.Session.query(HarvestObject).order_by(
            HarvestObject.state.asc()).all()
        assert len(harvest_objects) == 3
        done, pending_a, pending_b = harvest_objects
        assert done.state == 'COMPLETE'
        assert done.report_status == 'added'
        assert done.current is True
        assert pending_a.state == 'WAITING'
        assert pending_a.current is False
        assert pending_b.state == 'WAITING'
        assert pending_b.current is False

        assert len(conn.keys(fetch_key + ':*')) == 0
        assert conn.llen(fetch_key) == 2

        # Take one object out of redis; it should be re-sent to the fetch
        # queue by the resubmit below.
        reply = fetch_consumer.basic_get(queue='ckan.harvest.fetch')
        queued_items = conn.lrange(fetch_key, 0, 10)
        assert len(queued_items) == 1
        removed_object_id = reply[2]
        assert queued_items[0] != removed_object_id

        queue.resubmit_objects()

        assert conn.llen(fetch_key) == 2
        queued_items = conn.lrange(fetch_key, 0, 10)
        assert removed_object_id in queued_items
        assert conn.dbsize() == 1
    finally:
        conn.flushdb()