def test_redis_queue_purging(self):
    '''
    Check that purging the harvest queues leaves unrelated Redis keys alone.

    Skipped unless ``ckan.harvest.mq.type`` is configured as ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()

    sentinel_key = 'ckanext-harvest:some-random-key'
    redis = queue.get_connection()
    try:
        # A key that is not a queue entry and must still exist after purging
        redis.set(sentinel_key, 'foobar')

        # Publish two fake gather jobs followed by two fake fetch objects
        gather_publisher = queue.get_gather_publisher()
        for _ in range(2):
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        for _ in range(2):
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        keys_before_consume = redis.dbsize()

        # Consume one message from each queue; the test expects this to
        # increase the number of keys in the database
        gather_consumer = queue.get_gather_consumer()
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        fetch_consumer = queue.get_fetch_consumer()
        next(fetch_consumer.consume(queue.get_fetch_queue_name()))
        assert redis.dbsize() > keys_before_consume

        queue.purge_queues()

        # The sentinel survives, the key count is back to the earlier value
        # and both routing-key lists are empty
        assert redis.get(sentinel_key) == 'foobar'
        assert redis.dbsize() == keys_before_consume
        assert redis.llen(queue.get_gather_routing_key()) == 0
        assert redis.llen(queue.get_fetch_routing_key()) == 0
    finally:
        redis.delete(sentinel_key)
def purge_distributed_queues(gather_queue_name, fetch_queue_name):
    '''
    Purge the two given persistent queues.

    Only the AMQP backend is handled here ('ampq' is accepted as an
    alternative spelling of 'amqp'); any other configured backend raises.

    @param gather_queue_name name of the gather queue
    @param fetch_queue_name name of the fetch queue
    @raise Exception if the configured queue type is not AMQP
    '''
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)
    connection = get_connection()
    if backend not in ('amqp', 'ampq'):
        raise Exception('not a valid queue type %s' % backend)

    channel = connection.channel()
    # Gather queue first, then fetch queue
    for queue_name in (gather_queue_name, fetch_queue_name):
        channel.queue_purge(queue=queue_name)
def get_publisher(exchange_name, routing_key):
    '''
    Build a Publisher that sends to the given exchange.

    A channel is opened on the shared connection and the exchange is
    declared as durable before the Publisher is created.

    @param exchange_name name of the exchange to send messages to
    @param routing_key message routing key
    @return a Publisher wrapping the connection and the new channel
    @raise Exception if the configured queue type is not AMQP
    '''
    connection = get_connection()
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)
    # 'ampq' is accepted as an alternative spelling of 'amqp'
    if backend not in ('amqp', 'ampq'):
        raise Exception('not a valid queue type %s' % backend)

    channel = connection.channel()
    channel.exchange_declare(exchange=exchange_name, durable=True)
    return Publisher(
        connection,
        channel,
        exchange_name,
        routing_key=routing_key,
    )
def get_consumer(exchange_name, queue_name, routing_key):
    '''
    Open a channel with the exchange and queue declared and bound together.

    Both the exchange and the queue are declared as durable, then the queue
    is bound to the exchange under the given routing key.

    @param exchange_name name of the exchange to send messages to
    @param queue_name name of the queue to receive messages from
    @param routing_key message routing key
    @return the channel on which the exchange and queue were set up
    @raise Exception if the configured queue type is not AMQP
    '''
    connection = get_connection()
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)
    # 'ampq' is accepted as an alternative spelling of 'amqp'
    if backend not in ('amqp', 'ampq'):
        raise Exception('not a valid queue type %s' % backend)

    channel = connection.channel()
    channel.exchange_declare(exchange=exchange_name, durable=True)
    channel.queue_declare(queue=queue_name, durable=True)
    channel.queue_bind(
        queue=queue_name,
        exchange=exchange_name,
        routing_key=routing_key,
    )
    return channel
def test_redis_corrupt(self, mock_log_error):
    '''
    Check that a corrupt fetch message is logged and skipped, and that the
    following valid message is still delivered.

    Skipped unless ``ckan.harvest.mq.type`` is configured as ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()

    marker_key = 'ckanext-harvest:some-random-key-2'
    redis = queue.get_connection()
    try:
        redis.set(marker_key, 'foobar')

        # make sure queues/exchanges are created first and are empty
        gather_consumer = queue.get_gather_consumer()
        fetch_consumer = queue.get_fetch_consumer()
        gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
        fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

        # Publish one fake gather job, then two fetch messages: the first
        # carries a None harvest_object_id, the second a valid id
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})

        fetch_publisher = queue.get_fetch_publisher()
        fetch_publisher.send({'harvest_object_id': None})
        h_obj_id = str(uuid.uuid4())
        fetch_publisher.send({'harvest_object_id': h_obj_id})

        # Consume one message from each queue
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        _, _, payload = next(
            fetch_consumer.consume(queue.get_fetch_queue_name()))

        # The first message handed out by the fetch consumer is expected to
        # be the valid one
        decoded = json.loads(payload)
        assert decoded['harvest_object_id'] == h_obj_id

        # Exactly one error is expected to have been logged
        assert mock_log_error.call_count == 1
        log_args, _ = mock_log_error.call_args_list[0]
        if six.PY2:
            assert "cannot concatenate 'str' and 'NoneType' objects" in log_args[1]
        else:
            assert "must be str, not NoneType" in str(log_args[1])
    finally:
        redis.delete(marker_key)
def test_redis_queue_purging(self):
    '''
    Check that purging the harvest queues leaves unrelated Redis keys alone.

    Skipped (via SkipTest) unless ``ckan.harvest.mq.type`` is ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        raise SkipTest()

    sentinel_key = 'ckanext-harvest:some-random-key'
    redis = queue.get_connection()
    try:
        # A key that is not a queue entry and must still exist after purging
        redis.set(sentinel_key, 'foobar')

        # Publish two fake gather jobs followed by two fake fetch objects
        gather_publisher = queue.get_gather_publisher()
        for _ in range(2):
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        for _ in range(2):
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        keys_before_consume = redis.dbsize()

        # Consume one message from each queue; the test expects this to
        # increase the number of keys in the database
        gather_consumer = queue.get_gather_consumer()
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        fetch_consumer = queue.get_fetch_consumer()
        next(fetch_consumer.consume(queue.get_fetch_queue_name()))
        assert redis.dbsize() > keys_before_consume

        queue.purge_queues()

        # The sentinel survives, the key count is back to the earlier value
        # and both routing-key lists are empty
        assert redis.get(sentinel_key) == 'foobar'
        assert redis.dbsize() == keys_before_consume
        assert redis.llen(queue.get_gather_routing_key()) == 0
        assert redis.llen(queue.get_fetch_routing_key()) == 0
    finally:
        redis.delete(sentinel_key)
def test_01_basic_harvester(self):
    '''
    Run two harvests of the same source end to end and check the harvest
    objects, dataset counts and job statistics after each run.
    '''
    if config.get('ckan.harvest.mq.type') == 'redis':
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        redis.flushdb()

    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    site_user = toolkit.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )
    context = {
        'model': model,
        'session': model.Session,
        'user': site_user['name'],
        'api_version': 3,
        'ignore_auth': True,
    }

    harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(
        consumer, context)
    source_id = harvest_source['id']

    # ---- First run -------------------------------------------------------

    # do three times as three harvest objects
    for _ in range(3):
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

    dataset_count = (
        model.Session.query(model.Package)
        .filter(model.Package.type == 'dataset')
        .count()
    )
    assert dataset_count == 3

    current_objects = (
        model.Session.query(HarvestObject).filter_by(current=True).all()
    )
    assert len(current_objects) == 3
    for harvest_object in current_objects:
        assert harvest_object.state == 'COMPLETE'
        assert harvest_object.report_status == 'added'

    # fire run again to check if job is set to Finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': source_id}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    first_run_stats = {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0,
    }
    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == first_run_stats

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': source_id}
    )
    source_status = harvest_source_dict['status']
    assert source_status['last_job']['stats'] == first_run_stats
    assert source_status['total_datasets'] == 3
    assert source_status['job_count'] == 1

    # ---- Second run ------------------------------------------------------

    harvest_job = toolkit.get_action('harvest_job_create')(
        context, {'source_id': source_id, 'run': True}
    )
    job_id = harvest_job['id']

    running_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )
    assert running_job['status'] == u'Running'

    # pop one item off the gather queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    every_object = model.Session.query(HarvestObject).all()
    assert len(every_object) == 6

    for _ in range(3):
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

    dataset_count = (
        model.Session.query(model.Package)
        .filter(model.Package.type == 'dataset')
        .count()
    )
    assert dataset_count == 3

    # Expected number of harvest objects per report status after both runs
    expected_by_status = (
        ('added', 3),
        ('updated', 2),
        ('deleted', 1),
    )
    for report_status, expected_total in expected_by_status:
        matching = (
            model.Session.query(HarvestObject)
            .filter_by(report_status=report_status)
            .all()
        )
        assert len(matching) == expected_total

    # run to make sure job is marked as finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': source_id}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    second_run_stats = {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1,
    }
    assert harvest_job['stats'] == second_run_stats

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': source_id}
    )
    source_status = harvest_source_dict['status']
    assert source_status['last_job']['stats'] == second_run_stats
    assert source_status['total_datasets'] == 2
    assert source_status['job_count'] == 2
def test_resubmit_objects(self):
    '''
    Check that only harvest objects which are no longer present in the
    redis fetch queue get re-submitted.

    Skipped unless ``ckan.harvest.mq.type`` is configured as ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()

    # make sure that there are no old elements in the redis db
    redis = queue.get_connection()
    fetch_routing_key = queue.get_fetch_routing_key()
    redis.flushdb()
    try:
        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        site_user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )
        context = {
            'model': model,
            'session': model.Session,
            'user': site_user['name'],
            'api_version': 3,
            'ignore_auth': True,
        }

        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(
            consumer, context)

        # The test expects three entries in the fetch queue at this point
        assert redis.llen(fetch_routing_key) == 3

        # do only one time for the first harvest object
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        dataset_count = (
            model.Session.query(model.Package)
            .filter(model.Package.type == 'dataset')
            .count()
        )
        assert dataset_count == 1

        # Ordered by state ascending, so 'COMPLETE' sorts before 'WAITING'
        objects_by_state = (
            model.Session.query(HarvestObject)
            .order_by(HarvestObject.state.asc())
            .all()
        )
        assert len(objects_by_state) == 3

        completed = objects_by_state[0]
        assert completed.state == 'COMPLETE'
        assert completed.report_status == 'added'
        assert completed.current is True

        for pending in objects_by_state[1:]:
            assert pending.state == 'WAITING'
            assert pending.current is False

        assert len(redis.keys(fetch_routing_key + ':*')) == 0
        assert redis.llen(fetch_routing_key) == 2

        # Remove one object from redis that should be re-sent to the fetch queue
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queued_items = redis.lrange(fetch_routing_key, 0, 10)
        assert len(queued_items) == 1
        harvest_object_id = reply[2]
        assert queued_items[0] != harvest_object_id

        queue.resubmit_objects()

        # The removed object is back, and the fetch list is the only key left
        assert redis.llen(fetch_routing_key) == 2
        queued_items = redis.lrange(fetch_routing_key, 0, 10)
        assert harvest_object_id in queued_items
        assert redis.dbsize() == 1
    finally:
        redis.flushdb()