def test_01_basic_harvester(self):
    """Create a harvest source and job, start the job and run one gather step.

    The job must report status ``New`` right after creation and ``Running``
    once ``harvest_jobs_run`` has been called. One message is then taken
    from the gather queue and handed to ``queue.gather_callback``.

    NOTE(review): the closing ``assert 1 == 2`` always fails, so this test
    cannot pass as written -- presumably a work-in-progress marker; confirm
    before relying on this test.
    """
    gather_queue_name = 'ckan.harvest.gather'
    fetch_queue_name = 'ckan.harvest.fetch'

    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer(gather_queue_name, 'harvest_job_id')
    consumer_fetch = queue.get_consumer(fetch_queue_name, 'harvest_object_id')
    consumer.queue_purge(queue=gather_queue_name)
    consumer_fetch.queue_purge(queue=fetch_queue_name)

    # All actions below run as the site user.
    site_user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    user = site_user['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True,
    }

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'http://www.lidata.eu/oaiprovider/',
        'source_type': 'test',
    }
    harvest_source = logic.get_action('harvest_source_create')(
        context, source_dict)
    source_ref = {'source_id': harvest_source['id']}

    harvest_job = logic.get_action('harvest_job_create')(context, source_ref)
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'New'

    logic.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']})

    shown_job = logic.get_action('harvest_job_show')(context, {'id': job_id})
    assert shown_job['status'] == u'Running'

    # Take one message off the gather queue and run the gather callback.
    reply = consumer.basic_get(queue=gather_queue_name)
    queue.gather_callback(consumer, *reply)

    # The fetch step is not exercised yet:
    # reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    # queue.fetch_callback(consumer_fetch, *reply)

    assert 1 == 2
def test_01_basic_harvester(self):
    """Exercise source creation, job creation, job start and one gather step.

    Checks that a freshly created job has status ``New`` and that it has
    status ``Running`` after ``harvest_jobs_run``. A single gather message
    is then processed through ``queue.gather_callback``.

    NOTE(review): the test ends with ``assert 1 == 2`` and therefore always
    fails -- looks like an unfinished test; confirm intent.
    """
    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer('ckan.harvest.gather', 'harvest_job_id')
    consumer_fetch = queue.get_consumer(
        'ckan.harvest.fetch', 'harvest_object_id')
    consumer.queue_purge(queue='ckan.harvest.gather')
    consumer_fetch.queue_purge(queue='ckan.harvest.fetch')

    user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})['name']

    context = dict(
        model=model,
        session=model.Session,
        user=user,
        api_version=3,
        ignore_auth=True,
    )

    def call_action(action_name, data_dict):
        # Every action in this test shares the same context dict.
        return logic.get_action(action_name)(context, data_dict)

    harvest_source = call_action('harvest_source_create', {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'http://www.lidata.eu/oaiprovider/',
        'source_type': 'test',
    })

    harvest_job = call_action(
        'harvest_job_create', {'source_id': harvest_source['id']})
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'New'

    call_action('harvest_jobs_run', {'source_id': harvest_source['id']})

    running_job = call_action('harvest_job_show', {'id': job_id})
    assert running_job['status'] == u'Running'

    # Pop one item off the gather queue and run the callback on it.
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    # Fetch stage intentionally left disabled for now:
    # reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    # queue.fetch_callback(consumer_fetch, *reply)

    assert 1 == 2
def setup_class(cls): cls.gather_consumer = queue.get_consumer('ckan.harvest.gather.test', 'harvest_job_id') cls.fetch_consumer = queue.get_consumer('ckan.harvest.fetch.test', 'harvest_object_id') cls.mock_url = 'http://some.dcat.file.rdf' # Minimal remote RDF file cls.remote_file = '''<?xml version="1.0" encoding="utf-8" ?>
def setup_class(cls): cls.gather_consumer = queue.get_consumer('ckan.harvest.gather.test', 'harvest_job_id') cls.fetch_consumer = queue.get_consumer('ckan.harvest.fetch.test', 'harvest_object_id') # Minimal remote RDF file cls.rdf_mock_url = 'http://some.dcat.file.rdf' cls.rdf_content_type = 'application/rdf+xml' cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2"> <dct:title>Example dataset 2</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog rdf:about="https://data.some.org/catalog"> <dcat:dataset> <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1"> <dct:title>Example dataset 1</dct:title> </dcat:Dataset> </dcat:dataset> </dcat:Catalog> </rdf:RDF> ''' cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <dcat:Catalog </rdf:RDF> ''' #Minimal remote turtle file cls.ttl_mock_url = 'http://some.dcat.file.ttl' cls.ttl_content_type = 'text/turtle' cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . 
<https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . <https://data.some.org/catalog/datasets/2> a dcat:Dataset ; dc:title "Example dataset 2" . ''' cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> . @prefix dc: <http://purl.org/dc/terms/> . <https://data.some.org/catalog> a dcat:Catalog ; dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> . <https://data.some.org/catalog/datasets/1> a dcat:Dataset ; dc:title "Example dataset 1" . ''' cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
def test_01_basic_harvester(self):
    """Run the ``test`` harvester twice and check objects, datasets and stats.

    First run: a job created with ``run=True`` must be ``Running``; the
    gather step must yield three ``WAITING`` harvest objects and one harvest
    object extra; after three fetch steps there must be three datasets and
    three current objects that are ``COMPLETE`` / ``added``; a further
    ``harvest_jobs_run`` must leave the job ``Finished`` with 3 added.

    Second run: six harvest objects in total after gathering; after three
    fetch steps the report statuses must total 3 added, 2 updated and
    1 deleted, and the source status must show 2 datasets and 2 jobs.

    NOTE(review): the queues purged are ``ckan.harvest.test.*`` while the
    ``basic_get`` calls name ``ckan.harvest.gather`` / ``ckan.harvest.fetch``
    -- confirm whether the consumer ignores the ``queue`` argument.
    """
    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer(
        'ckan.harvest.test.gather', queue.get_gather_routing_key())
    consumer_fetch = queue.get_consumer(
        'ckan.harvest.test.fetch', queue.get_fetch_routing_key())
    consumer.queue_purge(queue='ckan.harvest.test.gather')
    consumer_fetch.queue_purge(queue='ckan.harvest.test.fetch')

    user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True,
    }

    def call_action(action_name, data_dict):
        # All actions share (and may observe later changes to) ``context``.
        return logic.get_action(action_name)(context, data_dict)

    def gather_once():
        # Pop one item off the gather queue and run the gather callback.
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

    def fetch_once():
        # Pop one item off the fetch queue and run the fetch callback.
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

    def dataset_count():
        return model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }
    harvest_source = call_action('harvest_source_create', source_dict)
    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    # ---------------- First run ----------------
    harvest_job = call_action(
        'harvest_job_create',
        {'source_id': harvest_source['id'], 'run': True})
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    shown_job = call_action('harvest_job_show', {'id': job_id})
    assert shown_job['status'] == u'Running'

    gather_once()

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    for harvest_object in all_objects:
        assert harvest_object.state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # Three harvest objects were gathered, so fetch three times.
    for _ in range(3):
        fetch_once()

    count = dataset_count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject) \
        .filter_by(current=True).all()
    assert_equal(len(all_objects), 3)
    for harvest_object in all_objects:
        assert_equal(harvest_object.state, 'COMPLETE')
        assert_equal(harvest_object.report_status, 'added')

    # Fire run again to check if the job is set to Finished.
    call_action('harvest_jobs_run', {'source_id': harvest_source['id']})

    first_run_stats = {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errored': 0,
        'deleted': 0,
    }

    harvest_job = call_action('harvest_job_show', {'id': job_id})
    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(harvest_job['stats'], first_run_stats)

    harvest_source_dict = call_action(
        'harvest_source_show', {'id': harvest_source['id']})
    source_status = harvest_source_dict['status']
    assert_equal(source_status['last_job']['stats'], first_run_stats)
    assert_equal(source_status['total_datasets'], 3)
    assert_equal(source_status['job_count'], 1)

    # ---------------- Second run ----------------
    harvest_job = call_action(
        'harvest_job_create',
        {'source_id': harvest_source['id'], 'run': True})
    job_id = harvest_job['id']

    shown_job = call_action('harvest_job_show', {'id': job_id})
    assert shown_job['status'] == u'Running'

    gather_once()

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    for _ in range(3):
        fetch_once()

    count = dataset_count()
    assert_equal(count, 3)

    expected_by_report_status = (('added', 3), ('updated', 2), ('deleted', 1))
    for report_status, expected_total in expected_by_report_status:
        all_objects = model.Session.query(HarvestObject) \
            .filter_by(report_status=report_status).all()
        assert_equal(len(all_objects), expected_total)

    # Run to make sure the job is marked as finished.
    call_action('harvest_jobs_run', {'source_id': harvest_source['id']})

    second_run_stats = {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errored': 0,
        'deleted': 1,
    }

    harvest_job = call_action('harvest_job_show', {'id': job_id})
    assert_equal(harvest_job['stats'], second_run_stats)

    context['detailed'] = True
    harvest_source_dict = call_action(
        'harvest_source_show', {'id': harvest_source['id']})
    source_status = harvest_source_dict['status']
    assert_equal(source_status['last_job']['stats'], second_run_stats)
    assert_equal(source_status['total_datasets'], 2)
    assert_equal(source_status['job_count'], 2)
def test_01_basic_harvester(self):
    """Harvest the ``test`` source in two consecutive jobs and verify results.

    Run one expects three ``WAITING`` harvest objects (plus one object
    extra) after gathering, three datasets and three current
    ``COMPLETE`` / ``added`` objects after fetching, and a ``Finished`` job
    whose stats report 3 added.

    Run two expects six harvest objects after gathering, report statuses
    totalling 3 added / 2 updated / 1 deleted after fetching, and a source
    status showing 2 datasets and 2 jobs.

    NOTE(review): consumers are created and purged for
    ``ckan.harvest.test.*`` queues but ``basic_get`` is called with
    ``ckan.harvest.gather`` / ``ckan.harvest.fetch`` -- verify this mismatch
    is intentional.
    """
    purged_gather_queue = 'ckan.harvest.test.gather'
    purged_fetch_queue = 'ckan.harvest.test.fetch'

    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer(
        purged_gather_queue, queue.get_gather_routing_key())
    consumer_fetch = queue.get_consumer(
        purged_fetch_queue, queue.get_fetch_routing_key())
    consumer.queue_purge(queue=purged_gather_queue)
    consumer_fetch.queue_purge(queue=purged_fetch_queue)

    site_user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    user = site_user['name']

    context = dict(
        model=model,
        session=model.Session,
        user=user,
        api_version=3,
        ignore_auth=True,
    )

    def act(name, payload):
        # Shared entry point so every action sees the same ``context``.
        return logic.get_action(name)(context, payload)

    def run_gather_step():
        # Pop one item off the gather queue and run the callback.
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

    def run_fetch_steps(times):
        # One fetch step per gathered harvest object.
        for _ in range(times):
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

    def count_datasets():
        return model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()

    harvest_source = act('harvest_source_create', {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    })
    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    # ======== First run ========
    harvest_job = act('harvest_job_create', {
        'source_id': harvest_source['id'],
        'run': True,
    })
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert act('harvest_job_show', {'id': job_id})['status'] == u'Running'

    run_gather_step()

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    for gathered in all_objects:
        assert gathered.state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # Do it three times, as there are three harvest objects.
    run_fetch_steps(3)

    count = count_datasets()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert_equal(len(all_objects), 3)
    for fetched in all_objects:
        assert_equal(fetched.state, 'COMPLETE')
        assert_equal(fetched.report_status, 'added')

    # Fire run again to check if the job is set to Finished.
    act('harvest_jobs_run', {'source_id': harvest_source['id']})

    stats_after_first_run = {
        'added': 3,
        'updated': 0,
        'not modified': 0,
        'errors': 0,
        'deleted': 0,
    }

    harvest_job = act('harvest_job_show', {'id': job_id})
    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(harvest_job['stats'], stats_after_first_run)

    harvest_source_dict = act(
        'harvest_source_show', {'id': harvest_source['id']})
    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 stats_after_first_run)
    assert_equal(harvest_source_dict['status']['total_datasets'], 3)
    assert_equal(harvest_source_dict['status']['job_count'], 1)

    # ======== Second run ========
    harvest_job = act('harvest_job_create', {
        'source_id': harvest_source['id'],
        'run': True,
    })
    job_id = harvest_job['id']

    assert act('harvest_job_show', {'id': job_id})['status'] == u'Running'

    run_gather_step()

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 6

    run_fetch_steps(3)

    count = count_datasets()
    assert_equal(count, 3)

    for status_name, expected_len in (
            ('added', 3), ('updated', 2), ('deleted', 1)):
        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status=status_name).all()
        assert_equal(len(all_objects), expected_len)

    # Run to make sure the job is marked as finished.
    act('harvest_jobs_run', {'source_id': harvest_source['id']})

    stats_after_second_run = {
        'added': 0,
        'updated': 2,
        'not modified': 0,
        'errors': 0,
        'deleted': 1,
    }

    harvest_job = act('harvest_job_show', {'id': job_id})
    assert_equal(harvest_job['stats'], stats_after_second_run)

    context['detailed'] = True
    harvest_source_dict = act(
        'harvest_source_show', {'id': harvest_source['id']})
    assert_equal(harvest_source_dict['status']['last_job']['stats'],
                 stats_after_second_run)
    assert_equal(harvest_source_dict['status']['total_datasets'], 2)
    assert_equal(harvest_source_dict['status']['job_count'], 2)
def test_01_basic_harvester(self):
    """Run the ``test`` harvester once and check harvest objects and datasets.

    Steps checked:

    * a new job has status ``New`` and, after ``harvest_jobs_run``,
      ``Running``;
    * one gather step yields three ``WAITING`` harvest objects and one
      harvest object extra;
    * three fetch steps yield three datasets and three current harvest
      objects that are ``COMPLETE`` with report status ``added``;
    * a further ``harvest_jobs_run`` either succeeds or raises an error
      whose message contains ``There are no new harvesting jobs``.
    """
    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer('ckan.harvest.gather', 'harvest_job_id')
    consumer_fetch = queue.get_consumer(
        'ckan.harvest.fetch', 'harvest_object_id')
    consumer.queue_purge(queue='ckan.harvest.gather')
    consumer_fetch.queue_purge(queue='ckan.harvest.fetch')

    user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {}
    )['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
    }

    source_dict = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    }

    harvest_source = logic.get_action('harvest_source_create')(
        context, source_dict)
    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {'source_id': harvest_source['id']})
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'New'

    logic.get_action('harvest_jobs_run')(
        context, {'source_id': harvest_source['id']})

    assert logic.get_action('harvest_job_show')(
        context, {'id': job_id})['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    for harvest_object in all_objects:
        assert harvest_object.state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # Do it three times, as there are three harvest objects.
    for _ in range(3):
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject) \
        .filter_by(current=True).all()
    assert len(all_objects) == 3
    for harvest_object in all_objects:
        assert harvest_object.state == 'COMPLETE'
        assert harvest_object.report_status == 'added'

    # Fire run again to check if the job is set to Finished. The action may
    # raise when nothing is left to run; only that specific message is
    # tolerated. ``except ... as`` is used because the comma form is a
    # SyntaxError on Python 3 (the ``as`` form works on 2.6+ as well).
    try:
        logic.get_action('harvest_jobs_run')(
            context, {'source_id': harvest_source['id']})
    except Exception as e:
        assert 'There are no new harvesting jobs' in str(e)
def test_01_basic_harvester(self):
    """Drive one complete harvest of the ``test`` source and verify the result.

    The job must be ``New`` on creation and ``Running`` after
    ``harvest_jobs_run``. Gathering must create three ``WAITING`` harvest
    objects and one harvest object extra. After three fetch steps there
    must be three datasets and three current objects that are ``COMPLETE``
    and reported as ``added``. A final ``harvest_jobs_run`` may raise, but
    only with a message containing ``There are no new harvesting jobs``.
    """
    gather_queue_name = 'ckan.harvest.gather'
    fetch_queue_name = 'ckan.harvest.fetch'

    # Make sure queues/exchanges are created first and are empty.
    consumer = queue.get_consumer(gather_queue_name, 'harvest_job_id')
    consumer_fetch = queue.get_consumer(fetch_queue_name, 'harvest_object_id')
    consumer.queue_purge(queue=gather_queue_name)
    consumer_fetch.queue_purge(queue=fetch_queue_name)

    site_user = logic.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    user = site_user['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
    }

    def act(name, payload):
        # Every action in this test is invoked with the same context.
        return logic.get_action(name)(context, payload)

    harvest_source = act('harvest_source_create', {
        'title': 'Test Source',
        'name': 'test-source',
        'url': 'basic_test',
        'source_type': 'test',
    })
    assert harvest_source['source_type'] == 'test', harvest_source
    assert harvest_source['url'] == 'basic_test', harvest_source

    harvest_job = act('harvest_job_create',
                      {'source_id': harvest_source['id']})
    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'New'

    act('harvest_jobs_run', {'source_id': harvest_source['id']})

    assert act('harvest_job_show', {'id': job_id})['status'] == u'Running'

    # Pop one item off the gather queue and run the callback.
    reply = consumer.basic_get(queue=gather_queue_name)
    queue.gather_callback(consumer, *reply)

    all_objects = model.Session.query(HarvestObject).all()
    assert len(all_objects) == 3
    for gathered in all_objects:
        assert gathered.state == 'WAITING'

    assert len(model.Session.query(HarvestObject).all()) == 3
    assert len(model.Session.query(HarvestObjectExtra).all()) == 1

    # Do it three times, as there are three harvest objects.
    for _ in range(3):
        reply = consumer_fetch.basic_get(queue=fetch_queue_name)
        queue.fetch_callback(consumer_fetch, *reply)

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == 3

    all_objects = model.Session.query(HarvestObject).filter_by(
        current=True).all()
    assert len(all_objects) == 3
    for fetched in all_objects:
        assert fetched.state == 'COMPLETE'
        assert fetched.report_status == 'added'

    # Fire run again to check if the job is set to Finished. An error is
    # tolerated only if it says there is nothing new to run. The original
    # ``except Exception, e`` form is Python 2-only syntax; ``as`` is valid
    # on Python 2.6+ and Python 3.
    try:
        act('harvest_jobs_run', {'source_id': harvest_source['id']})
    except Exception as e:
        assert 'There are no new harvesting jobs' in str(e)