def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = { 'model': model, 'session': model.Session, 'ignore_auth': True } self.admin_user = get_action('get_site_user')(context, {}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() logging.getLogger('ckan.cli').info( 'Now going to wait on the gather queue...') consumer.wait() elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() logging.getLogger('ckan.cli').info( 'Now going to wait on the fetch queue...') consumer.wait() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'job-run': self.job_run() else: print 'Command %s not recognized' % cmd
def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    conn = queue.get_connection()
    try:
        # A key purge_queues() must leave untouched.
        conn.set('ckanext-harvest:some-random-key', 'foobar')

        # Push two fake jobs on the gather queue and two fake
        # objects on the fetch queue.
        gather_pub = queue.get_gather_publisher()
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_pub = queue.get_fetch_publisher()
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        baseline = conn.dbsize()

        # Consuming one message from each queue adds bookkeeping keys.
        gather_con = queue.get_gather_consumer()
        next(gather_con.consume(queue.get_gather_queue_name()))
        fetch_con = queue.get_fetch_consumer()
        next(fetch_con.consume(queue.get_fetch_queue_name()))
        assert conn.dbsize() > baseline

        queue.purge_queues()

        # The unrelated key survived, the queue keys are gone.
        assert conn.get('ckanext-harvest:some-random-key') == 'foobar'
        assert conn.dbsize() == baseline
        assert conn.llen(queue.get_gather_routing_key()) == 0
        assert conn.llen(queue.get_fetch_routing_key()) == 0
    finally:
        conn.delete('ckanext-harvest:some-random-key')
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {'model':model,'session':model.Session,'ignore_auth':True} self.admin_user = get_action('get_site_user')(context,{}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer, gather_callback logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() for method, header, body in consumer.consume(queue='ckan.harvest.gather'): gather_callback(consumer, method, header, body) elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer, fetch_callback consumer = get_fetch_consumer() for method, header, body in consumer.consume(queue='ckan.harvest.fetch'): fetch_callback(consumer, method, header, body) elif cmd == 'purge_queues': from ckanext.harvest.queue import purge_queues purge_queues() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'reindex': self.reindex() else: print 'Command %s not recognized' % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {'model':model,'session':model.Session,'ignore_auth':True} self.admin_user = get_action('get_site_user')(context,{}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() logging.getLogger('ckan.cli').info('Now going to wait on the gather queue...') consumer.wait() elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() logging.getLogger('ckan.cli').info('Now going to wait on the fetch queue...') consumer.wait() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'job-run': self.job_run() else: print 'Command %s not recognized' % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {"model": model, "session": model.Session, "ignore_auth": True} self.admin_user = get_action("get_site_user")(context, {}) print "" if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == "source": self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == "sources": self.list_harvest_sources() elif cmd == "job": self.create_harvest_job() elif cmd == "jobs": self.list_harvest_jobs() elif cmd == "run": self.run_harvester() elif cmd == "gather_consumer": import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger("amqplib").setLevel(logging.INFO) consumer = get_gather_consumer() consumer.wait() elif cmd == "fetch_consumer": import logging logging.getLogger("amqplib").setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() consumer.wait() elif cmd == "initdb": self.initdb() elif cmd == "import": self.initdb() self.import_stage() elif cmd == "job-all": self.create_harvest_job_all() elif cmd == "harvesters-info": harvesters_info = get_action("harvesters_info_show")() pprint(harvesters_info) else: print "Command %s not recognized" % cmd
def fetch_consumer():
    """Run a blocking consumer loop over the harvest fetch queue,
    handing each incoming message to ``fetch_callback``."""
    import logging

    # amqplib logs every frame at DEBUG; keep it quiet.
    logging.getLogger("amqplib").setLevel(logging.INFO)

    from ckanext.harvest.queue import (
        get_fetch_consumer,
        fetch_callback,
        get_fetch_queue_name,
    )

    channel = get_fetch_consumer()
    # Each yielded frame is a (method, header, body) triple.
    for frame in channel.consume(queue=get_fetch_queue_name()):
        fetch_callback(channel, *frame)
def test_redis_corrupt(self, mock_log_error):
    '''
    Test that corrupt Redis doesn't stop harvest process and still
    processes other jobs.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    redis = queue.get_connection()
    try:
        redis.set('ckanext-harvest:some-random-key-2', 'foobar')
        # make sure queues/exchanges are created first and are empty
        gather_consumer = queue.get_gather_consumer()
        fetch_consumer = queue.get_fetch_consumer()
        gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
        fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())
        # Create some fake jobs and objects with no harvest_job_id
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        # The None id is the "corrupt" message that must be skipped.
        fetch_publisher.send({'harvest_object_id': None})
        h_obj_id = str(uuid.uuid4())
        fetch_publisher.send({'harvest_object_id': h_obj_id})
        # Create some fake objects
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        # The first message yielded must already be the valid one:
        # the corrupt message was consumed and logged, not returned.
        _, _, body = next(
            fetch_consumer.consume(queue.get_fetch_queue_name()))
        json_obj = json.loads(body)
        assert json_obj['harvest_object_id'] == h_obj_id
        # Exactly one error was logged for the corrupt message.
        assert mock_log_error.call_count == 1
        args, _ = mock_log_error.call_args_list[0]
        # The TypeError message differs between Python 2 and 3.
        if six.PY2:
            assert "cannot concatenate 'str' and 'NoneType' objects" in args[
                1]
        else:
            assert "must be str, not NoneType" in str(args[1])
    finally:
        redis.delete('ckanext-harvest:some-random-key-2')
def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    # nose-style variant: uses SkipTest / ok_ / assert_equal.
    if config.get('ckan.harvest.mq.type') != 'redis':
        raise SkipTest()
    redis = queue.get_connection()
    try:
        # A key that purge_queues() must leave untouched.
        redis.set('ckanext-harvest:some-random-key', 'foobar')
        # Create some fake jobs
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        num_keys = redis.dbsize()
        # Create some fake objects
        gather_consumer = queue.get_gather_consumer()
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        fetch_consumer = queue.get_fetch_consumer()
        next(fetch_consumer.consume(queue.get_fetch_queue_name()))
        # Consuming created extra bookkeeping keys...
        ok_(redis.dbsize() > num_keys)
        queue.purge_queues()
        # ...which purge_queues() removed, sparing the unrelated key.
        assert_equal(redis.get('ckanext-harvest:some-random-key'),
                     'foobar')
        assert_equal(redis.dbsize(), num_keys)
        assert_equal(redis.llen(queue.get_gather_routing_key()), 0)
        assert_equal(redis.llen(queue.get_fetch_routing_key()), 0)
    finally:
        redis.delete('ckanext-harvest:some-random-key')
def test_resubmit_objects(self):
    '''
    Test that only those harvest objects are re-submitted which were
    not present in the redis fetch queue.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    # make sure that there are no old elements in the redis db
    redis = queue.get_connection()
    fetch_routing_key = queue.get_fetch_routing_key()
    redis.flushdb()
    try:
        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())
        user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']
        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}
        # Gather stage produces 3 harvest objects on the fetch queue.
        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)
        assert redis.llen(fetch_routing_key) == 3
        # do only one time for the first harvest object
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        # One dataset was created by the processed object.
        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 1
        # Sorted by state: COMPLETE first, then the two WAITING ones.
        all_objects = model.Session.query(HarvestObject).order_by(HarvestObject.state.asc()).all()
        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[0].current is True
        assert all_objects[1].state == 'WAITING'
        assert all_objects[1].current is False
        assert all_objects[2].state == 'WAITING'
        assert all_objects[2].current is False
        # No per-object bookkeeping keys remain; two items still queued.
        assert len(redis.keys(fetch_routing_key + ':*')) == 0
        assert redis.llen(fetch_routing_key) == 2
        # Remove one object from redis that should be re-sent to the fetch queue
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
        assert len(fetch_queue_items) == 1
        # basic_get returns (method, header, body); body is the object id here.
        harvest_object_id = reply[2]
        assert fetch_queue_items[0] != harvest_object_id
        queue.resubmit_objects()
        # The popped-but-unprocessed object was put back on the queue.
        assert redis.llen(fetch_routing_key) == 2
        fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
        assert harvest_object_id in fetch_queue_items
        assert redis.dbsize() == 1
    finally:
        redis.flushdb()
def command(self):
    """Entry point for the harvester CLI (Python 2 / paster style).

    Loads the CKAN config, acts as the sysadmin site user, and dispatches
    on the first positional argument in ``self.args``.
    """
    self._load_config()
    # We'll need a sysadmin user to perform most of the actions
    # We will use the sysadmin site user (named as the site_id)
    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True
    }
    self.admin_user = get_action('get_site_user')(context, {})
    print ''
    if len(self.args) == 0:
        self.parser.print_usage()
        sys.exit(1)
    cmd = self.args[0]
    if cmd == 'source':
        # With extra arguments 'source' creates; with just an id it shows.
        if len(self.args) > 2:
            self.create_harvest_source()
        else:
            self.show_harvest_source()
    elif cmd == 'rmsource':
        self.remove_harvest_source()
    elif cmd == 'clearsource':
        self.clear_harvest_source()
    elif cmd == 'sources':
        self.list_harvest_sources()
    elif cmd == 'job':
        self.create_harvest_job()
    elif cmd == 'jobs':
        self.list_harvest_jobs()
    elif cmd == 'job_abort':
        self.job_abort()
    elif cmd == 'run':
        self.run_harvester()
    elif cmd == 'run_test':
        self.run_test_harvest()
    elif cmd == 'gather_consumer':
        # Long-running process: feed every gather message to gather_callback.
        import logging
        from ckanext.harvest.queue import (get_gather_consumer,
                                           gather_callback,
                                           get_gather_queue_name)
        # amqplib logs every frame at DEBUG; keep it quiet.
        logging.getLogger('amqplib').setLevel(logging.INFO)
        consumer = get_gather_consumer()
        for method, header, body in consumer.consume(
                queue=get_gather_queue_name()):
            gather_callback(consumer, method, header, body)
    elif cmd == 'fetch_consumer':
        # Long-running process: feed every fetch message to fetch_callback.
        import logging
        logging.getLogger('amqplib').setLevel(logging.INFO)
        from ckanext.harvest.queue import (get_fetch_consumer,
                                           fetch_callback,
                                           get_fetch_queue_name)
        consumer = get_fetch_consumer()
        for method, header, body in consumer.consume(
                queue=get_fetch_queue_name()):
            fetch_callback(consumer, method, header, body)
    elif cmd == 'purge_queues':
        from ckanext.harvest.queue import purge_queues
        purge_queues()
    elif cmd == 'initdb':
        self.initdb()
    elif cmd == 'import':
        # Import needs the harvest tables to exist, so create them first.
        self.initdb()
        self.import_stage()
    elif cmd == 'job-all':
        self.create_harvest_job_all()
    elif cmd == 'harvesters-info':
        harvesters_info = get_action('harvesters_info_show')()
        pprint(harvesters_info)
    elif cmd == 'reindex':
        self.reindex()
    elif cmd == 'clean_harvest_log':
        self.clean_harvest_log()
    else:
        print 'Command %s not recognized' % cmd
def test_raise_child_error_and_retry(self):
    """If a harvest job for a child fails because its parent does not
    exist yet, ensure the job will be retried.

    This test emulates the case where we harvest children first
    (e.g. if we have several active queues). Just for CKAN 2.8 env."""
    # start harvest process with gather to create harvest objects
    url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
    self.run_gather(url=url)
    assert_equal(len(self.harvest_objects), 3)

    # create a publisher to send this objects to the fetch queue
    publisher = queue.get_fetch_publisher()
    for ho in self.harvest_objects:
        ho = harvest_model.HarvestObject.get(ho.id)  # refresh
        ho_data = json.loads(ho.content)
        assert_equal(ho.state, 'WAITING')
        log.info('HO: {}\n\tCurrent: {}'.format(ho_data['identifier'], ho.current))
        assert_equal(ho.retry_times, 0)
        publisher.send({'harvest_object_id': ho.id})
        log.info('Harvest object sent to the fetch queue {} as {}'.format(
            ho_data['identifier'], ho.id))
    publisher.close()

    # run fetch for elements in the wrong order (first a child, the a parent)

    class FakeMethod(object):
        ''' This is to act like the method returned by AMQP'''
        def __init__(self, message):
            # fetch_callback only needs the delivery tag from the method frame
            self.delivery_tag = message

    # get the fetch
    consumer_fetch = queue.get_fetch_consumer()
    qname = queue.get_fetch_queue_name()

    # first a child and assert to get an error
    r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id})
    r0 = FakeMethod(r2)
    with assert_raises(ParentNotHarvestedException):
        queue.fetch_callback(consumer_fetch, r0, None, r2)
    # The failed child was marked for retry and errored.
    assert_equal(self.harvest_objects[1].retry_times, 1)
    assert_equal(self.harvest_objects[1].state, "ERROR")

    # run the parent later, like in a different queue
    r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id})
    r0 = FakeMethod(r2)
    queue.fetch_callback(consumer_fetch, r0, None, r2)
    assert_equal(self.harvest_objects[0].retry_times, 1)
    assert_equal(self.harvest_objects[0].state, "COMPLETE")

    # Check status on harvest objects
    # We expect one child with error, parent ok and second child still waiting
    for ho in self.harvest_objects:
        ho = harvest_model.HarvestObject.get(ho.id)  # refresh
        ho_data = json.loads(ho.content)
        idf = ho_data['identifier']
        log.info(
            '\nHO2: {}\n\tState: {}\n\tCurrent: {}\n\tGathered {}'.format(
                idf, ho.state, ho.current, ho.gathered))
        if idf == 'OPM-ERround-0001':
            # the parent
            assert_equal(ho.state, 'COMPLETE')
        elif idf == 'OPM-ERround-0001-AWOL':
            # the errored child
            assert_equal(ho.state, 'ERROR')
            ho_awol_id = ho.id
        elif idf == 'OPM-ERround-0001-Retire':
            # the untouched child
            assert_equal(ho.state, 'WAITING')
            ho_retire_id = ho.id
        else:
            raise Exception('Unexpected identifier: "{}"'.format(idf))

    # resubmit jobs and objects as harvest_jobs_run does
    # we expect the errored harvest object is in this queue
    queue.resubmit_jobs()
    queue.resubmit_objects()

    # iterate over the fetch consumer queue again and check pending harvest objects
    harvest_objects = []
    while True:
        method, header, body = consumer_fetch.basic_get(queue=qname)
        if body is None:
            # queue drained
            break
        body_data = json.loads(body)
        ho_id = body_data.get('harvest_object_id', None)
        log.info('Adding ho_id {}'.format(ho_id))
        if ho_id is not None:
            ho = harvest_model.HarvestObject.get(ho_id)
            if ho is not None:
                harvest_objects.append(ho)
                content = json.loads(ho.content)
                log.info('Harvest object found {}: {} '.format(
                    content['identifier'], ho.state))
            else:
                log.info('Harvest object not found {}'.format(ho_id))

    ho_ids = [ho.id for ho in harvest_objects]

    # Now, we expect the waiting child and the errored one to be in the fetch queue
    log.info('Searching wainting object "Retire ID"')
    assert_in(ho_retire_id, ho_ids)
    log.info('Searching errored object "Awol ID"')
    assert_in(ho_awol_id, ho_ids)
def command(self):
    """Entry point for the harvester CLI (Python 2 / paster style).

    Loads the CKAN config, acts as the sysadmin site user, and dispatches
    on the first positional argument in ``self.args``.
    """
    self._load_config()
    # We'll need a sysadmin user to perform most of the actions
    # We will use the sysadmin site user (named as the site_id)
    context = {"model": model, "session": model.Session, "ignore_auth": True}
    self.admin_user = get_action("get_site_user")(context, {})
    print ""
    if len(self.args) == 0:
        self.parser.print_usage()
        sys.exit(1)
    cmd = self.args[0]
    if cmd == "source":
        # With extra arguments 'source' creates; with just an id it shows.
        if len(self.args) > 2:
            self.create_harvest_source()
        else:
            self.show_harvest_source()
    elif cmd == "rmsource":
        self.remove_harvest_source()
    elif cmd == "clearsource":
        self.clear_harvest_source()
    elif cmd == "clearsource_history":
        self.clear_harvest_source_history()
    elif cmd == "sources":
        self.list_harvest_sources()
    elif cmd == "job":
        self.create_harvest_job()
    elif cmd == "jobs":
        self.list_harvest_jobs()
    elif cmd == "job_abort":
        self.job_abort()
    elif cmd == "run":
        self.run_harvester()
    elif cmd == "run_test":
        self.run_test_harvest()
    elif cmd == "gather_consumer":
        # Long-running process: feed every gather message to gather_callback.
        import logging
        from ckanext.harvest.queue import get_gather_consumer, gather_callback, get_gather_queue_name
        # amqplib logs every frame at DEBUG; keep it quiet.
        logging.getLogger("amqplib").setLevel(logging.INFO)
        consumer = get_gather_consumer()
        for method, header, body in consumer.consume(queue=get_gather_queue_name()):
            gather_callback(consumer, method, header, body)
    elif cmd == "fetch_consumer":
        # Long-running process: feed every fetch message to fetch_callback.
        import logging
        logging.getLogger("amqplib").setLevel(logging.INFO)
        from ckanext.harvest.queue import get_fetch_consumer, fetch_callback, get_fetch_queue_name
        consumer = get_fetch_consumer()
        for method, header, body in consumer.consume(queue=get_fetch_queue_name()):
            fetch_callback(consumer, method, header, body)
    elif cmd == "purge_queues":
        self.purge_queues()
    elif cmd == "initdb":
        self.initdb()
    elif cmd == "import":
        # Import needs the harvest tables to exist, so create them first.
        self.initdb()
        self.import_stage()
    elif cmd == "job-all":
        self.create_harvest_job_all()
    elif cmd == "harvesters-info":
        harvesters_info = get_action("harvesters_info_show")()
        pprint(harvesters_info)
    elif cmd == "reindex":
        self.reindex()
    elif cmd == "clean_harvest_log":
        self.clean_harvest_log()
    else:
        print "Command %s not recognized" % cmd
def setup_class(cls): h.reset_db() cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # Minimal remote RDF file with pagination (1) # Use slashes for paginated URLs because HTTPretty won't distinguish # query strings cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf' cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> 
<hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file with pagination (2) cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2' cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3"> <dct:title>Example dataset 3</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4"> <dct:title>Example dataset 4</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # RDF with minimal distribution cls.rdf_content_with_distribution_uri = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> <dcat:distribution> <dcat:Distribution rdf:about="https://data.some.org/catalog/datasets/1/resource/1"> <dct:title>Example resource 1</dct:title> <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL> </dcat:Distribution> </dcat:distribution> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_content_with_distribution = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> <dcat:distribution> <dcat:Distribution> <dct:title>Example resource 1</dct:title> <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL> </dcat:Distribution> </dcat:distribution> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def setup_class(cls): cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def setup_class(cls): h.reset_db() cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # Minimal remote RDF file with pagination (1) # Use slashes for paginated URLs because HTTPretty won't distinguish # query strings cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf' cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> 
<hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file with pagination (2) cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2' cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3"> <dct:title>Example dataset 3</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4"> <dct:title>Example dataset 4</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . 
<https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def test_01_basic_harvester(self):
    """End-to-end run of the test harvester through two full job cycles.

    First run: gather stage produces 3 objects, fetch/import adds 3
    datasets.  Second run: the same source reports 3 added objects in
    total history plus 2 updated and 1 deleted, leaving 2 live datasets.
    Exercises gather/fetch queue callbacks directly instead of running
    the consumers.
    """
    if config.get('ckan.harvest.mq.type') == 'redis':
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        redis.flushdb()

    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    # Act as the site user so all harvest actions are authorized.
    user = toolkit.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )['name']
    context = {'model': model, 'session': model.Session, 'user': user,
               'api_version': 3, 'ignore_auth': True}

    # Helper creates the source + job and runs the gather stage,
    # leaving fetch messages on the queue.
    harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # All three fetched objects should now be imported as datasets.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

    assert len(all_objects) == 3
    assert all_objects[0].state == 'COMPLETE'
    assert all_objects[0].report_status == 'added'
    assert all_objects[1].state == 'COMPLETE'
    assert all_objects[1].report_status == 'added'
    assert all_objects[2].state == 'COMPLETE'
    assert all_objects[2].report_status == 'added'

    # fire run again to check if job is set to Finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == {'added': 3, 'updated': 0,
                                    'not modified': 0, 'errored': 0,
                                    'deleted': 0}

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0,
        'deleted': 0}
    assert harvest_source_dict['status']['total_datasets'] == 3
    assert harvest_source_dict['status']['job_count'] == 1

    # Second run
    harvest_job = toolkit.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )
    job_id = harvest_job['id']
    assert toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # 3 objects from the first run plus 3 new ones from this gather.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    # Aggregate report statuses across both runs.
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
    assert len(all_objects) == 3
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
    assert len(all_objects) == 2
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
    assert len(all_objects) == 1

    # run to make sure job is marked as finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    assert harvest_job['stats'] == {'added': 0, 'updated': 2,
                                    'not modified': 0, 'errored': 0,
                                    'deleted': 1}

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0,
        'deleted': 1}
    assert harvest_source_dict['status']['total_datasets'] == 2
    assert harvest_source_dict['status']['job_count'] == 2
def setup_class(cls):
    """Class-level fixture: start from a clean database and attach
    gather/fetch queue consumers shared by all tests in the class."""
    # Wipe anything left over from previously executed test classes.
    h.reset_db()
    # Bind both harvest queue consumers once for the whole class.
    gather, fetch = queue.get_gather_consumer(), queue.get_fetch_consumer()
    cls.gather_consumer = gather
    cls.fetch_consumer = fetch
def setup_class(cls): cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def test_01_basic_harvester(self):
    """End-to-end run of the 'test' harvester through two job cycles
    (nose-style variant using assert_equal).

    First run adds 3 datasets; the second run updates 2 and deletes 1,
    leaving 2 live datasets for the source.  Drives the gather and
    fetch queue callbacks by hand rather than running consumers.
    """
    ### make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    # Act as the site user so all harvest actions are authorized.
    user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )['name']
    context = {'model': model, 'session': model.Session, 'user': user,
               'api_version': 3, 'ignore_auth': True}

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }

    harvest_source = logic.get_action('harvest_source_create')(
        context, source_dict
    )

    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    ## pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Gather stage creates one WAITING object per remote dataset.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    ## do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # All three fetched objects should now be imported as datasets.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type=='dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()
    assert_equal(len(all_objects), 3)
    assert_equal(all_objects[0].state, 'COMPLETE')
    assert_equal(all_objects[0].report_status, 'added')
    assert_equal(all_objects[1].state, 'COMPLETE')
    assert_equal(all_objects[1].report_status, 'added')
    assert_equal(all_objects[2].state, 'COMPLETE')
    assert_equal(all_objects[2].report_status, 'added')

    ## fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(
        context, {'source_id':harvest_source['id']}
    )

    harvest_job = logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )
    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(harvest_job['stats'], {'added': 3, 'updated': 0,
                                        'not modified': 0, 'errored': 0,
                                        'deleted': 0})

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )
    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 {'added': 3, 'updated': 0, 'not modified': 0,
                  'errored': 0, 'deleted': 0})
    assert_equal(harvest_source_dict['status']['total_datasets'], 3)
    assert_equal(harvest_source_dict['status']['job_count'], 1)

    ########### Second run ########################
    harvest_job = logic.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )
    job_id = harvest_job['id']
    assert logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    ## pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # 3 objects from the first run plus 3 new ones from this gather.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type=='dataset') \
        .count()
    assert_equal(count, 3)

    # Aggregate report statuses across both runs.
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
    assert_equal(len(all_objects), 3)
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
    assert_equal(len(all_objects), 2)
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
    assert_equal(len(all_objects), 1)

    # run to make sure job is marked as finished
    logic.get_action('harvest_jobs_run')(
        context, {'source_id':harvest_source['id']}
    )
    harvest_job = logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )
    assert_equal(harvest_job['stats'], {'added': 0, 'updated': 2,
                                        'not modified': 0, 'errored': 0,
                                        'deleted': 1})

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )
    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 {'added': 0, 'updated': 2, 'not modified': 0,
                  'errored': 0, 'deleted': 1})
    assert_equal(harvest_source_dict['status']['total_datasets'], 2)
    assert_equal(harvest_source_dict['status']['job_count'], 2)
def test_01_basic_harvester(self):
    """End-to-end run of the 'test' harvester through two job cycles.

    First run adds 3 datasets; the second run updates 2 and deletes 1,
    leaving 2 live datasets for the source.  Drives the gather and
    fetch queue callbacks by hand rather than running consumers.
    """
    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    # Act as the site user so all harvest actions are authorized.
    user = logic.get_action('get_site_user')({
        'model': model,
        'ignore_auth': True
    }, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True
    }

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }

    harvest_source = logic.get_action('harvest_source_create')(context,
                                                               source_dict)

    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Gather stage creates one WAITING object per remote dataset.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # All three fetched objects should now be imported as datasets.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'COMPLETE'
    assert all_objects[0].report_status == 'added'
    assert all_objects[1].state == 'COMPLETE'
    assert all_objects[1].report_status == 'added'
    assert all_objects[2].state == 'COMPLETE'
    assert all_objects[2].report_status == 'added'

    # fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })
    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0
    }

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0
    }
    assert harvest_source_dict['status']['total_datasets'] == 3
    assert harvest_source_dict['status']['job_count'] == 1

    # Second run
    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })
    job_id = harvest_job['id']
    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # 3 objects from the first run plus 3 new ones from this gather.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    # Aggregate report statuses across both runs.
    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='added').all()
    assert len(all_objects) == 3
    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='updated').all()
    assert len(all_objects) == 2
    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='deleted').all()
    assert len(all_objects) == 1

    # run to make sure job is marked as finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })
    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })
    assert harvest_job['stats'] == {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1
    }

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1
    }
    assert harvest_source_dict['status']['total_datasets'] == 2
    assert harvest_source_dict['status']['job_count'] == 2
def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
        self):
    """If a job is marked Finished while objects are still queued, the
    fetch callback must error those objects out instead of importing
    them.

    Simulates a job abort/timeout by flipping the job status between
    the gather and fetch stages, then checks that no new datasets were
    created and all 3 objects ended up in state ERROR.
    """
    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    # Act as the site user so all harvest actions are authorized.
    user = logic.get_action('get_site_user')({
        'model': model,
        'ignore_auth': True
    }, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True
    }

    source_dict = {
        'title': 'Test Job Finished',
        'name': 'test-job-finished',
        'url': 'basic_test_1',
        'source_type': 'test-nose',
    }

    harvest_source = logic.get_action('harvest_source_create')(context,
                                                               source_dict)

    assert harvest_source['source_type'] == 'test-nose', harvest_source
    assert harvest_source['url'] == 'basic_test_1', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Gather stage creates one WAITING object per remote dataset.
    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    # artificially set the job to finished to simulate a job abort or timeout
    job_obj = HarvestJob.get(harvest_job['id'])
    job_obj.status = 'Finished'
    job_obj.save()

    # Baseline dataset count; the fetch stage must not add to it.
    original_dataset_count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # Objects belonging to the aborted job must all be errored out.
    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'ERROR'
    assert all_objects[1].state == 'ERROR'
    assert all_objects[2].state == 'ERROR'

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == original_dataset_count

    # fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })

    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(
        harvest_job['stats'], {
            'added': 0,
            'updated': 0,
            'not modified': 0,
            'errored': 3,
            'deleted': 0
        })

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert_equal(
        harvest_source_dict['status']['last_job']['stats'], {
            'added': 0,
            'updated': 0,
            'not modified': 0,
            'errored': 3,
            'deleted': 0
        })
    assert_equal(harvest_source_dict['status']['total_datasets'], 0)
    assert_equal(harvest_source_dict['status']['job_count'], 1)