def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    conn = queue.get_connection()
    try:
        # Plant an unrelated key that purging must leave untouched.
        conn.set('ckanext-harvest:some-random-key', 'foobar')

        # Publish two fake jobs on the gather queue and two fake
        # objects on the fetch queue.
        gather_pub = queue.get_gather_publisher()
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_pub = queue.get_fetch_publisher()
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        baseline = conn.dbsize()

        # Consuming creates additional bookkeeping keys in Redis.
        next(queue.get_gather_consumer().consume(
            queue.get_gather_queue_name()))
        next(queue.get_fetch_consumer().consume(
            queue.get_fetch_queue_name()))
        assert conn.dbsize() > baseline

        queue.purge_queues()

        # Only the harvest queue keys were removed; everything else,
        # including our sentinel key, must survive.
        assert conn.get('ckanext-harvest:some-random-key') == 'foobar'
        assert conn.dbsize() == baseline
        assert conn.llen(queue.get_gather_routing_key()) == 0
        assert conn.llen(queue.get_fetch_routing_key()) == 0
    finally:
        conn.delete('ckanext-harvest:some-random-key')
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = { 'model': model, 'session': model.Session, 'ignore_auth': True } self.admin_user = get_action('get_site_user')(context, {}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() logging.getLogger('ckan.cli').info( 'Now going to wait on the gather queue...') consumer.wait() elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() logging.getLogger('ckan.cli').info( 'Now going to wait on the fetch queue...') consumer.wait() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'job-run': self.job_run() else: print 'Command %s not recognized' % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {'model':model,'session':model.Session,'ignore_auth':True} self.admin_user = get_action('get_site_user')(context,{}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer, gather_callback logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() for method, header, body in consumer.consume(queue='ckan.harvest.gather'): gather_callback(consumer, method, header, body) elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer, fetch_callback consumer = get_fetch_consumer() for method, header, body in consumer.consume(queue='ckan.harvest.fetch'): fetch_callback(consumer, method, header, body) elif cmd == 'purge_queues': from ckanext.harvest.queue import purge_queues purge_queues() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'reindex': self.reindex() else: print 'Command %s not recognized' % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {'model':model,'session':model.Session,'ignore_auth':True} self.admin_user = get_action('get_site_user')(context,{}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'run': self.run_harvester() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() logging.getLogger('ckan.cli').info('Now going to wait on the gather queue...') consumer.wait() elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() logging.getLogger('ckan.cli').info('Now going to wait on the fetch queue...') consumer.wait() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'job-run': self.job_run() else: print 'Command %s not recognized' % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {"model": model, "session": model.Session, "ignore_auth": True} self.admin_user = get_action("get_site_user")(context, {}) print "" if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == "source": self.create_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == "sources": self.list_harvest_sources() elif cmd == "job": self.create_harvest_job() elif cmd == "jobs": self.list_harvest_jobs() elif cmd == "run": self.run_harvester() elif cmd == "gather_consumer": import logging from ckanext.harvest.queue import get_gather_consumer logging.getLogger("amqplib").setLevel(logging.INFO) consumer = get_gather_consumer() consumer.wait() elif cmd == "fetch_consumer": import logging logging.getLogger("amqplib").setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer consumer = get_fetch_consumer() consumer.wait() elif cmd == "initdb": self.initdb() elif cmd == "import": self.initdb() self.import_stage() elif cmd == "job-all": self.create_harvest_job_all() elif cmd == "harvesters-info": harvesters_info = get_action("harvesters_info_show")() pprint(harvesters_info) else: print "Command %s not recognized" % cmd
def gather_consumer():
    """Consume the gather queue forever, running the gather callback
    on every delivered message. Blocks until the process is killed."""
    import logging
    from ckanext.harvest.queue import get_gather_consumer
    from ckanext.harvest.queue import gather_callback
    from ckanext.harvest.queue import get_gather_queue_name

    # amqplib is very chatty below INFO level
    logging.getLogger("amqplib").setLevel(logging.INFO)

    channel = get_gather_consumer()
    # Each delivery is a (method, header, body) triple
    for delivery in channel.consume(queue=get_gather_queue_name()):
        gather_callback(channel, *delivery)
def test_redis_corrupt(self, mock_log_error):
    '''
    Test that corrupt Redis doesn't stop harvest process and still
    processes other jobs.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    conn = queue.get_connection()
    try:
        conn.set('ckanext-harvest:some-random-key-2', 'foobar')

        # Make sure queues/exchanges are created first and are empty.
        gather = queue.get_gather_consumer()
        fetch = queue.get_fetch_consumer()
        gather.queue_purge(queue=queue.get_gather_queue_name())
        fetch.queue_purge(queue=queue.get_fetch_queue_name())

        # One valid gather job, then a corrupt fetch message (no
        # harvest object id) followed by a valid one.
        queue.get_gather_publisher().send(
            {'harvest_job_id': str(uuid.uuid4())})
        publisher = queue.get_fetch_publisher()
        publisher.send({'harvest_object_id': None})
        valid_id = str(uuid.uuid4())
        publisher.send({'harvest_object_id': valid_id})

        # The corrupt message must be skipped and the valid one
        # delivered next.
        next(gather.consume(queue.get_gather_queue_name()))
        _, _, payload = next(fetch.consume(queue.get_fetch_queue_name()))
        assert json.loads(payload)['harvest_object_id'] == valid_id

        # The failure was logged exactly once; the message text differs
        # between Python 2 and 3.
        assert mock_log_error.call_count == 1
        args, _ = mock_log_error.call_args_list[0]
        if six.PY2:
            assert "cannot concatenate 'str' and 'NoneType' objects" in args[1]
        else:
            assert "must be str, not NoneType" in str(args[1])
    finally:
        conn.delete('ckanext-harvest:some-random-key-2')
def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        raise SkipTest()
    conn = queue.get_connection()
    try:
        # Plant an unrelated key that purging must leave untouched.
        conn.set('ckanext-harvest:some-random-key', 'foobar')

        # Publish two fake jobs on the gather queue and two fake
        # objects on the fetch queue.
        gather_pub = queue.get_gather_publisher()
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        gather_pub.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_pub = queue.get_fetch_publisher()
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_pub.send({'harvest_object_id': str(uuid.uuid4())})
        baseline = conn.dbsize()

        # Consuming creates additional bookkeeping keys in Redis.
        next(queue.get_gather_consumer().consume(
            queue.get_gather_queue_name()))
        next(queue.get_fetch_consumer().consume(
            queue.get_fetch_queue_name()))
        ok_(conn.dbsize() > baseline)

        queue.purge_queues()

        # Only the harvest queue keys were removed; everything else,
        # including our sentinel key, must survive.
        assert_equal(conn.get('ckanext-harvest:some-random-key'), 'foobar')
        assert_equal(conn.dbsize(), baseline)
        assert_equal(conn.llen(queue.get_gather_routing_key()), 0)
        assert_equal(conn.llen(queue.get_fetch_routing_key()), 0)
    finally:
        conn.delete('ckanext-harvest:some-random-key')
def setup_class(cls): h.reset_db() cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # Minimal remote RDF file with pagination (1) # Use slashes for paginated URLs because HTTPretty won't distinguish # query strings cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf' cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> 
<hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file with pagination (2) cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2' cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3"> <dct:title>Example dataset 3</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4"> <dct:title>Example dataset 4</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # RDF with minimal distribution cls.rdf_content_with_distribution_uri = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> <dcat:distribution> <dcat:Distribution rdf:about="https://data.some.org/catalog/datasets/1/resource/1"> <dct:title>Example resource 1</dct:title> <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL> </dcat:Distribution> </dcat:distribution> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_content_with_distribution = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> <dcat:distribution> <dcat:Distribution> <dct:title>Example resource 1</dct:title> <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL> </dcat:Distribution> </dcat:distribution> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def setup_class(cls): cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def setup_class(cls): h.reset_db() cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' # Minimal remote RDF file with pagination (1) # Use slashes for paginated URLs because HTTPretty won't distinguish # query strings cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf' cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> 
<hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file with pagination (2) cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2' cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3"> <dct:title>Example dataset 3</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4"> <dct:title>Example dataset 4</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems> <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage> <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage> <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage> </hydra:PagedCollection> </rdf:RDF> ''' # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" 
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . 
<https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def test_01_basic_harvester(self):
    """End-to-end smoke test of the gather/fetch pipeline with the
    'test' harvester.

    First run: creates a source and a job, drains the gather queue
    (producing 3 WAITING objects) and the fetch queue (3 fetches),
    then checks 3 datasets were added and the job stats/status.
    Second run: repeats and checks the 'test' harvester reports
    2 updated and 1 deleted.
    """
    ### make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    # Act as the sysadmin site user so all actions are authorized
    user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )['name']

    context = {'model': model, 'session': model.Session,
               'user': user, 'api_version': 3, 'ignore_auth': True}

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }

    harvest_source = logic.get_action('harvest_source_create')(
        context, source_dict
    )

    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    # 'run': True queues the job immediately
    harvest_job = logic.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    ## pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    all_objects = model.Session.query(HarvestObject).all()

    # Gathering produced three objects, all waiting to be fetched
    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    ## do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # Each fetched object became a dataset
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

    assert_equal(len(all_objects), 3)
    assert_equal(all_objects[0].state, 'COMPLETE')
    assert_equal(all_objects[0].report_status, 'added')
    assert_equal(all_objects[1].state, 'COMPLETE')
    assert_equal(all_objects[1].report_status, 'added')
    assert_equal(all_objects[2].state, 'COMPLETE')
    assert_equal(all_objects[2].report_status, 'added')

    ## fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(harvest_job['stats'],
                 {'added': 3, 'updated': 0, 'not modified': 0,
                  'errored': 0, 'deleted': 0})

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 {'added': 3, 'updated': 0, 'not modified': 0,
                  'errored': 0, 'deleted': 0})
    assert_equal(harvest_source_dict['status']['total_datasets'], 3)
    assert_equal(harvest_source_dict['status']['job_count'], 1)

    ########### Second run ########################
    harvest_job = logic.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )

    job_id = harvest_job['id']
    assert logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    ## pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Three new objects on top of the first run's three
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # Dataset count is unchanged: updates/deletes, no new adds
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert_equal(count, 3)

    all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
    assert_equal(len(all_objects), 3)
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
    assert_equal(len(all_objects), 2)
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
    assert_equal(len(all_objects), 1)

    # run to make sure job is marked as finshed
    logic.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = logic.get_action('harvest_job_show')(
        context, {'id': job_id}
    )
    assert_equal(harvest_job['stats'],
                 {'added': 0, 'updated': 2, 'not modified': 0,
                  'errored': 0, 'deleted': 1})

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 {'added': 0, 'updated': 2, 'not modified': 0,
                  'errored': 0, 'deleted': 1})
    assert_equal(harvest_source_dict['status']['total_datasets'], 2)
    assert_equal(harvest_source_dict['status']['job_count'], 2)
def setup_class(cls): cls.gather_consumer = queue.get_gather_consumer() cls.fetch_consumer = queue.get_fetch_consumer() # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . 
<https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "förskola", "Garduña" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "San Sebastián", "Ελλάδα" . ''' cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" ; dcat:keyword "Utbildning, kontaktuppgifter" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" ; dcat:keyword "Trees, forest, shrub" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def setup_class(cls):
    """Reset the database and attach shared queue consumers to the class."""
    # Start from a clean database so tests are independent of prior runs.
    h.reset_db()
    # One gather and one fetch consumer, reused by every test in the class.
    gather = queue.get_gather_consumer()
    fetch = queue.get_fetch_consumer()
    cls.gather_consumer = gather
    cls.fetch_consumer = fetch
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = {"model": model, "session": model.Session, "ignore_auth": True} self.admin_user = get_action("get_site_user")(context, {}) print "" if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == "source": if len(self.args) > 2: self.create_harvest_source() else: self.show_harvest_source() elif cmd == "rmsource": self.remove_harvest_source() elif cmd == "clearsource": self.clear_harvest_source() elif cmd == "clearsource_history": self.clear_harvest_source_history() elif cmd == "sources": self.list_harvest_sources() elif cmd == "job": self.create_harvest_job() elif cmd == "jobs": self.list_harvest_jobs() elif cmd == "job_abort": self.job_abort() elif cmd == "run": self.run_harvester() elif cmd == "run_test": self.run_test_harvest() elif cmd == "gather_consumer": import logging from ckanext.harvest.queue import get_gather_consumer, gather_callback, get_gather_queue_name logging.getLogger("amqplib").setLevel(logging.INFO) consumer = get_gather_consumer() for method, header, body in consumer.consume(queue=get_gather_queue_name()): gather_callback(consumer, method, header, body) elif cmd == "fetch_consumer": import logging logging.getLogger("amqplib").setLevel(logging.INFO) from ckanext.harvest.queue import get_fetch_consumer, fetch_callback, get_fetch_queue_name consumer = get_fetch_consumer() for method, header, body in consumer.consume(queue=get_fetch_queue_name()): fetch_callback(consumer, method, header, body) elif cmd == "purge_queues": self.purge_queues() elif cmd == "initdb": self.initdb() elif cmd == "import": self.initdb() self.import_stage() elif cmd == "job-all": self.create_harvest_job_all() elif cmd == "harvesters-info": harvesters_info = get_action("harvesters_info_show")() pprint(harvesters_info) elif cmd == "reindex": self.reindex() elif cmd == 
"clean_harvest_log": self.clean_harvest_log() else: print "Command %s not recognized" % cmd
def command(self): self._load_config() # We'll need a sysadmin user to perform most of the actions # We will use the sysadmin site user (named as the site_id) context = { 'model': model, 'session': model.Session, 'ignore_auth': True } self.admin_user = get_action('get_site_user')(context, {}) print '' if len(self.args) == 0: self.parser.print_usage() sys.exit(1) cmd = self.args[0] if cmd == 'source': if len(self.args) > 2: self.create_harvest_source() else: self.show_harvest_source() elif cmd == 'rmsource': self.remove_harvest_source() elif cmd == 'clearsource': self.clear_harvest_source() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': self.create_harvest_job() elif cmd == 'jobs': self.list_harvest_jobs() elif cmd == 'job_abort': self.job_abort() elif cmd == 'run': self.run_harvester() elif cmd == 'run_test': self.run_test_harvest() elif cmd == 'gather_consumer': import logging from ckanext.harvest.queue import (get_gather_consumer, gather_callback, get_gather_queue_name) logging.getLogger('amqplib').setLevel(logging.INFO) consumer = get_gather_consumer() for method, header, body in consumer.consume( queue=get_gather_queue_name()): gather_callback(consumer, method, header, body) elif cmd == 'fetch_consumer': import logging logging.getLogger('amqplib').setLevel(logging.INFO) from ckanext.harvest.queue import (get_fetch_consumer, fetch_callback, get_fetch_queue_name) consumer = get_fetch_consumer() for method, header, body in consumer.consume( queue=get_fetch_queue_name()): fetch_callback(consumer, method, header, body) elif cmd == 'purge_queues': from ckanext.harvest.queue import purge_queues purge_queues() elif cmd == 'initdb': self.initdb() elif cmd == 'import': self.initdb() self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() elif cmd == 'harvesters-info': harvesters_info = get_action('harvesters_info_show')() pprint(harvesters_info) elif cmd == 'reindex': self.reindex() elif cmd == 'clean_harvest_log': 
self.clean_harvest_log() else: print 'Command %s not recognized' % cmd
def test_resubmit_objects(self):
    '''
    Test that ``resubmit_objects()`` re-sends only those harvest objects
    that are no longer present in the Redis fetch queue.
    '''
    # This scenario only applies to the Redis message-queue backend.
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    # make sure that there are no old elements in the redis db
    redis = queue.get_connection()
    fetch_routing_key = queue.get_fetch_routing_key()
    redis.flushdb()
    try:
        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

        # The gather stage queued three harvest objects for fetching.
        assert redis.llen(fetch_routing_key) == 3

        # do only one time for the first harvest object
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        # Only the first object has been imported as a dataset so far.
        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 1

        # Sorted by state: COMPLETE first, then the two still WAITING.
        all_objects = model.Session.query(HarvestObject).order_by(HarvestObject.state.asc()).all()
        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[0].current is True
        assert all_objects[1].state == 'WAITING'
        assert all_objects[1].current is False
        assert all_objects[2].state == 'WAITING'
        assert all_objects[2].current is False

        # No per-object keys left behind, and two objects still queued.
        assert len(redis.keys(fetch_routing_key + ':*')) == 0
        assert redis.llen(fetch_routing_key) == 2

        # Remove one object from redis that should be re-sent to the fetch queue
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
        assert len(fetch_queue_items) == 1
        # reply is (method, header, body); the body is the harvest object id.
        harvest_object_id = reply[2]
        assert fetch_queue_items[0] != harvest_object_id

        queue.resubmit_objects()

        # The popped-but-unprocessed object was put back on the queue.
        assert redis.llen(fetch_routing_key) == 2
        fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
        assert harvest_object_id in fetch_queue_items
        assert redis.dbsize() == 1
    finally:
        redis.flushdb()
def test_01_basic_harvester(self):
    """Run a full harvest twice through the gather and fetch queues and
    check object states, package counts and per-job statistics."""
    if config.get('ckan.harvest.mq.type') == 'redis':
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        redis.flushdb()

    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    user = toolkit.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )['name']

    context = {'model': model, 'session': model.Session,
               'user': user, 'api_version': 3, 'ignore_auth': True}

    harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # All three objects were imported as datasets.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'COMPLETE'
    assert all_objects[0].report_status == 'added'
    assert all_objects[1].state == 'COMPLETE'
    assert all_objects[1].report_status == 'added'
    assert all_objects[2].state == 'COMPLETE'
    assert all_objects[2].report_status == 'added'

    # fire run again to check if job is set to Finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == {'added': 3, 'updated': 0, 'not modified': 0,
                                    'errored': 0, 'deleted': 0}

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}
    assert harvest_source_dict['status']['total_datasets'] == 3
    assert harvest_source_dict['status']['job_count'] == 1

    # Second run
    harvest_job = toolkit.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id'], 'run': True}
    )

    job_id = harvest_job['id']
    assert toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Three new objects on top of the three from the first run.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    # Second run reports: 3 added (first run), 2 updated, 1 deleted.
    all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
    assert len(all_objects) == 3

    all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
    assert len(all_objects) == 2

    all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
    assert len(all_objects) == 1

    # run to make sure job is marked as finished
    toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']}
    )

    harvest_job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id}
    )

    assert harvest_job['stats'] == {'added': 0, 'updated': 2, 'not modified': 0,
                                    'errored': 0, 'deleted': 1}

    harvest_source_dict = toolkit.get_action('harvest_source_show')(
        context, {'id': harvest_source['id']}
    )

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}
    # One dataset was deleted in the second run, so only 2 remain current.
    assert harvest_source_dict['status']['total_datasets'] == 2
    assert harvest_source_dict['status']['job_count'] == 2
def test_01_basic_harvester(self):
    """Create a test harvest source, run two full harvests through the
    gather/fetch queues, and verify object states and job statistics."""
    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    user = logic.get_action('get_site_user')({
        'model': model,
        'ignore_auth': True
    }, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True
    }

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }

    harvest_source = logic.get_action('harvest_source_create')(context, source_dict)

    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })

    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'
    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # The gather stage created three objects, all waiting to be fetched.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # All three objects were imported as datasets.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert len(all_objects) == 3
    assert all_objects[0].state == 'COMPLETE'
    assert all_objects[0].report_status == 'added'
    assert all_objects[1].state == 'COMPLETE'
    assert all_objects[1].report_status == 'added'
    assert all_objects[2].state == 'COMPLETE'
    assert all_objects[2].report_status == 'added'

    # fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })

    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0
    }

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0
    }
    assert harvest_source_dict['status']['total_datasets'] == 3
    assert harvest_source_dict['status']['job_count'] == 1

    # Second run
    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })

    job_id = harvest_job['id']
    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Three new objects on top of the three from the first run.
    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    # Second run reports: 3 added (from run one), 2 updated, 1 deleted.
    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='added').all()
    assert len(all_objects) == 3

    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='updated').all()
    assert len(all_objects) == 2

    all_objects = model.Session.query(HarvestObject).filter_by(
        report_status='deleted').all()
    assert len(all_objects) == 1

    # run to make sure job is marked as finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })

    assert harvest_job['stats'] == {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1
    }

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1
    }
    # One dataset was deleted in the second run, so only 2 remain current.
    assert harvest_source_dict['status']['total_datasets'] == 2
    assert harvest_source_dict['status']['job_count'] == 2
def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
        self):
    """If a job is flagged 'Finished' while its objects are still queued,
    the fetch consumer must mark those objects as errored instead of
    importing them as datasets."""
    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    user = logic.get_action('get_site_user')({
        'model': model,
        'ignore_auth': True
    }, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True
    }

    source_dict = {
        'title': 'Test Job Finished',
        'name': 'test-job-finished',
        'url': 'basic_test_1',
        'source_type': 'test-nose',
    }

    harvest_source = logic.get_action('harvest_source_create')(context, source_dict)

    assert harvest_source['source_type'] == 'test-nose', harvest_source
    assert harvest_source['url'] == 'basic_test_1', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })

    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'
    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop on item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()

    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    # artificially set the job to finished to simulate a job abort or timeout
    job_obj = HarvestJob.get(harvest_job['id'])
    job_obj.status = 'Finished'
    job_obj.save()

    original_dataset_count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    # Objects belonging to a finished job end up ERROR, not COMPLETE.
    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()

    assert len(all_objects) == 3
    assert all_objects[0].state == 'ERROR'
    assert all_objects[1].state == 'ERROR'
    assert all_objects[2].state == 'ERROR'

    # No new datasets were created after the job was flagged Finished.
    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == original_dataset_count

    # fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })

    # NOTE: converted from nose's assert_equal to plain asserts for
    # consistency with the other tests in this file.
    assert harvest_job['status'] == u'Finished'
    assert harvest_job['stats'] == {
        'added': 0,
        'updated': 0,
        'not modified': 0,
        'errored': 3,
        'deleted': 0
    }

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert harvest_source_dict['status']['last_job']['stats'] == {
        'added': 0,
        'updated': 0,
        'not modified': 0,
        'errored': 3,
        'deleted': 0
    }
    assert harvest_source_dict['status']['total_datasets'] == 0
    assert harvest_source_dict['status']['job_count'] == 1