def setup_class(cls):
    """Create two harvest objects — the second one carrying an
    ``original_document`` extra — and keep their ids on the class."""
    try:
        from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                           HarvestSource, HarvestObjectExtra)
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    cls.content1 = '<xml>Content 1</xml>'
    first_object = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    second_object = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content2)
    original_extra = HarvestObjectExtra(
        key='original_document',
        value=cls.original_content2,
        object=second_object)

    for record in (first_object, second_object, original_extra):
        Session.add(record)
    Session.commit()

    cls.object_id_1 = first_object.id
    cls.object_id_2 = second_object.id
def harvest_job_create(context, data_dict):
    """Create a new harvest job for the source in ``data_dict['source_id']``.

    Refuses when the source is missing, inactive, or already has an
    unrun/running job. Returns the new job dictized.
    """
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # The source must exist ...
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # ... and be active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Do not pile up jobs: bail out when one is already unrun or running
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warn('There is already an unrun job %r for this source %s',
                 pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)
    return harvest_job_dictize(new_job, context)
def test_harvester(self):
    """Run gather, fetch and import for the inventory harvester and check
    the resulting dataset lands in the expected organisation."""
    harvester = InventoryHarvester()
    job = HarvestJob(source=self.source)

    # Gather all of the datasets from the XML content and make sure
    # we have created some harvest objects
    gathered_ids = harvester.gather_stage(
        job, test_content=self._get_file_content('inventory.xml'))
    self.assertEqual(len(gathered_ids), 79)

    # We only want one for testing
    first_object = HarvestObject.get(gathered_ids[0])

    # Run the fetch stage
    self.assertTrue(harvester.fetch_stage(first_object))

    # Make sure we can create a dataset by running the import stage
    harvester.import_stage(first_object)
    self.assertIsNotNone(first_object.package_id)

    # Get the newly created package and make sure it is in the correct
    # organisation
    action_context = {
        'ignore_auth': True,
        'user': self.sysadmin['name'],
    }
    pkg = toolkit.get_action('package_show')(
        action_context, {'id': first_object.package_id})
    self.assertEqual(pkg['organization']['id'], self.publisher['id'])
def test_gather(self):
    """The CMDI gather stage runs against a fake client without errors."""
    # Swap the real client for the fake one before gathering
    self.harvester.client = _FakeClient()

    harvest_source = HarvestSource(url="http://localhost/test_cmdi",
                                   type="cmdi")
    harvest_source.save()
    harvest_job = HarvestJob(source=harvest_source)
    harvest_job.save()

    self.harvester.gather_stage(harvest_job)
def harvest_job_create(context, data_dict):
    """Create a harvest job for ``data_dict['source_id']``.

    Raises ``HarvestError`` when the source is inactive or already has an
    unrun job; ``NotFound`` when it does not exist. Returns the new job
    dictized.
    """
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # The source must exist ...
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # ... and be active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Only one unrun ('New') job per source at a time
    unrun_jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(unrun_jobs):
        log.warn('There is already an unrun job %r for this source %s',
                 unrun_jobs, source_id)
        raise HarvestError('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)
    return harvest_job_dictize(new_job, context)
def run_job_synchronously(self):
    """Run a whole harvest job (gather + fetch/import) in-process, without
    the queue workers, for the source id given as the second CLI argument.

    Marks the job 'Done' on success, 'Error' otherwise, and reindexes the
    harvest source dataset afterwards. (Python 2 code: ``print`` statement,
    ``unicode``.)
    """
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)

    # Find the plugin whose declared name matches this source's type
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source.type:
            break
    else:
        # for/else: no plugin matched the source type
        print "No harvester found to handle the job."
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        # Fetch and import every gathered object synchronously
        for obj_id in harvest_object_ids:
            obj = HarvestObject.get(obj_id)
            obj.retry_times += 1
            obj.save()
            fetch_and_import_stages(harvester, obj)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show,
        # not the class object.
        package_index.index_package(
            get_action('package_show')({
                'validate': False,
                'ignore_auth': True
            }, {
                'id': source.id
            }))
    finally:
        # The success path above already saved status "Done"; anything
        # else (exception raised) is flagged as an error here.
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
            job.save()
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and, unless told otherwise,
    runs it by putting it on the gather queue.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # The source must exist ...
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # ... be active ...
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # ... and have no unrun or currently running job already
    pending_job = _check_for_existing_jobs(context, source_id)
    if pending_job:
        log.warn('There is already an unrun job %r for this source %s',
                 pending_job, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': new_job.id})

    return harvest_job_dictize(new_job, context)
def setup_class(cls):
    """Create the annakarenina test package plus a harvest job/object
    pointing at it, and record the object id in the package extras."""
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    new_job = HarvestJob()
    new_job.save()
    model.repo.commit_and_remove()

    # Re-fetch the job after the commit closed the session
    persisted_job = model.Session.query(HarvestJob).first()
    harvest_object = HarvestObject(
        package=model.Package.by_name(u'annakarenina'),
        harvest_job=persisted_job,
        guid='test-guid',
        content='<xml>test content</xml>')
    harvest_object.save()

    # Save a reference to the harvest object in the package
    model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = harvest_object.id
    pkg.save()
    model.repo.commit_and_remove()
def setup_class(cls):
    """Create the annakarenina test package together with a harvest
    source, job and object, and reference the object from the package."""
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    harvest_source = HarvestSource(url=u'http://test-source.org',
                                   type='test')
    harvest_source.save()
    harvest_job = HarvestJob(source=harvest_source)
    harvest_job.save()

    harvest_object = HarvestObject(
        package=model.Package.by_name(u'annakarenina'),
        job=harvest_job,
        guid=u'test-guid',
        content=u'<xml>test content</xml>')
    harvest_object.save()

    # Save a reference to the harvest object in the package
    model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = harvest_object.id
    pkg.save()
    model.repo.commit_and_remove()
def test_import(self):
    """Import two CMDI records under one harvest job and verify the mapped
    package fields; then delete the package through a harvest object with
    ``report_status`` 'deleted'. (Python 2 / nose-style test.)"""
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First record: check the basic field mapping
    harvest_object = self._run_import("cmdi_1.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    # Any harvest errors are joined into the assertion message
    self.assertEquals(
        len(harvest_object.errors), 0, u"\n".join(
            unicode(error.message)
            for error in (harvest_object.errors or [])))

    package = get_action('package_show')({
        'user': '******'
    }, {
        'id': package_id
    })

    self.assertEquals(package.get('name', None),
                      utils.pid_to_name(package.get('id', None)))
    self.assertEquals(utils.get_primary_pid(package),
                      u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('notes', None),
                      u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {
        u'id': u'http://islrn.org/resources/248-895-085-557-0',
        u'provider': provider,
        u'type': u'relation',
        u'relation': u'generalRelation'
    }

    # The relation PID must NOT end up in the package's pids
    self.assertTrue(expected_pid not in package.get('pids'))

    model.Session.flush()

    # Second record: temporal coverage and license differ
    harvest_object = self._run_import("cmdi_2.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    self.assertEquals(
        len(harvest_object.errors), 0, u"\n".join(
            unicode(error.message)
            for error in (harvest_object.errors or [])))

    package = get_action('package_show')({
        'user': '******'
    }, {
        'id': package_id
    })

    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package: a content-less object with report_status 'deleted'
    # drives the import stage to remove the dataset
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def test_api(self):
    """Check redirects from the legacy API URLs and the
    ``/harvest/object/*`` endpoints (content, original document, HTML
    transform) for two harvest objects. (Python 2 / nose-style test.)"""
    try:
        from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                           HarvestSource, HarvestObjectExtra)
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    # First object: content only
    content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=content1)

    # Second object: content plus the original harvested document
    content2 = '<xml>Content 2</xml>'
    original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=content2)
    hoe = HarvestObjectExtra(
        key='original_document',
        value=original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    object_id_1 = ho1.id
    object_id_2 = ho2.id

    app = self._get_test_app()

    # Test redirects for old URLs
    url = '/api/2/rest/harvestobject/{0}/xml'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 301)
    assert ('/harvest/object/{0}'.format(object_id_1)
            in r.headers['Location'])

    url = '/api/2/rest/harvestobject/{0}/html'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 301)
    assert ('/harvest/object/{0}/html'.format(object_id_1)
            in r.headers['Location'])

    # Access object content
    url = '/harvest/object/{0}'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'application/xml; charset=utf-8')
    assert_equals(
        r.body,
        '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>')

    # Access original content in object extra (if present)
    url = '/harvest/object/{0}/original'.format(object_id_1)
    r = app.get(url, status=404)
    assert_equals(r.status_int, 404)

    url = '/harvest/object/{0}/original'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'],
                  'application/xml; charset=utf-8')
    assert_equals(
        r.body,
        '<?xml version="1.0" encoding="UTF-8"?>\n' +
        '<xml>Original Content 2</xml>')

    # Access HTML transformation
    url = '/harvest/object/{0}/html'.format(object_id_1)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'], 'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body

    # The first object has no original document, so no HTML for it
    url = '/harvest/object/{0}/html/original'.format(object_id_1)
    r = app.get(url, status=404)
    assert_equals(r.status_int, 404)

    url = '/harvest/object/{0}/html'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'], 'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body

    url = '/harvest/object/{0}/html/original'.format(object_id_2)
    r = app.get(url)
    assert_equals(r.status_int, 200)
    assert_equals(r.headers['Content-Type'], 'text/html; charset=utf-8')
    assert 'GEMINI record about' in r.body
def test_provenance(self):
    """Exercise DCATHarvester's metadata-provenance helpers: building the
    provenance record for a single harvest, and appending a new record to
    a package's existing ``metadata_provenance`` extra."""
    # NOTE(review): shadows the ``id`` builtin and is never used below
    # (the guid is repeated literally) — candidate for removal.
    id = "http://data.norge.no/node/512"

    # Harvested DCAT record, as fetched from data.norge.no
    source_json = {
        "title": "NOBIL - ladestasjoner for elbiler",
        "description": [
            {
                "language": "nb",
                "value": "<p>NOBIL er etablert for formidling av informasjon om ladestasjoner for elbiler. Den har et detaljert innhold om ladestasjonene og formidler sanntidsdata. Alt er fritt tilgjengelig via et API.</p><p>Et ladepunkt er en reservert parkeringsplass med lademulighet for ladbare kjoeretoey. Paa et ladepunkt kan det vaere mer enn en kontakt, men bare plass til et kjoeretoey av gangen. En ladestasjon er et sted for det er ett eller flere ladepunkt.</p>"
            }
        ],
        "landingPage": "http://nobil.no",
        "issued": "2013-10-08",
        "modified": "2016-10-07T11:34",
        "language": [
            "NOR"
        ],
        "publisher": {
            "name": "Enova SF",
            "mbox": "*****@*****.**"
        },
        "keyword": [
            "Energi",
            "Miljoe",
            "Transport"
        ],
        "distribution": [
            {
                "title": "JSON",
                "description": [
                    {
                        "language": "nb",
                        "value": "Nettside som med informasjon om API-et"
                    }
                ],
                "format": "JSON",
                "downloadURL": "null",
                "accessURL": "http://nobil.no",
                "webserviceURL": "null",
                "license": "http://creativecommons.org/licenses/by/3.0/no/"
            }
        ]
    }

    job = HarvestJob(url="http://data.norge.no/api/dcat/data.json",
                     title="TestHarvest")
    obj = HarvestObject(guid="http://data.norge.no/node/512",
                        content=source_json, source=job)

    # Expected provenance entry for the very first harvest of this guid.
    # NOTE(review): the "activity_ocurred" / "harvest_sorce_url" spellings
    # mirror the keys emitted by the harvester under test.
    initial = {"activity_ocurred": "2016-10-21T12:25:09.091774",
               "harvest_source_title": "TestHarvest",
               "activity": "initial_harvest",
               "harvest_sorce_url": "http://data.norge.no/api/dcat/data.json",
               "excluded_resources": [],
               "harvested_guid": "http://data.norge.no/node/512"}

    # Testing get_metadata:provenance_for_just_this_harvest
    dcatharvest = DCATHarvester()
    provenance = dcatharvest.get_metadata_provenance_for_just_this_harvest(
        obj, "initial_harvest", [])
    # The timestamp field is generated at run time, so only the stable
    # fields are compared
    self.assertEqual(provenance["harvest_source_title"],
                     initial["harvest_source_title"])
    self.assertEqual(provenance["activity"], initial["activity"])
    self.assertEqual(provenance["harvest_sorce_url"],
                     initial["harvest_sorce_url"])
    self.assertEqual(provenance["harvested_guid"],
                     initial["harvested_guid"])
    self.assertEqual(provenance["excluded_resources"],
                     initial["excluded_resources"])

    # Testing append_provenance_data
    val = [initial]
    val_string = json.dumps(val)
    package_dict = {
        "extras": [
            {"key": "metadata_provenance", "value": val_string}
        ]
    }
    update = {"activity_ocurred": "2016-10-21T12:25:09.091774",
              "harvest_source_title": "TestHarvest",
              "activity": "update",
              "harvest_sorce_url": "http://data.norge.no/api/dcat/data.json",
              "excluded_resources": [],
              "harvested_guid": "http://data.norge.no/node/512"}
    appended = [initial, update]
    package_correct = {
        "extras": [
            {"key": "metadata_provenance", "value": json.dumps(appended)}
        ]
    }
    test_package_result = dcatharvest.append_provenance_data(
        package_dict, obj, "update", [])
    test_provenance = json.loads(test_package_result["extras"][0]["value"])
    correct_provenance = json.loads(package_correct["extras"][0]["value"])
    # Only the entry count is compared because timestamps differ per run
    self.assertEqual(len(correct_provenance), len(test_provenance))
def test_api(self, app):
    """Check the ``/harvest/object/*`` endpoints: object content is served
    as XML, the original harvested document is served when stored as an
    ``original_document`` extra, and 404 is returned otherwise."""
    try:
        from ckanext.harvest.model import (
            HarvestObject,
            HarvestJob,
            HarvestSource,
            HarvestObjectExtra,
        )
    except ImportError:
        # Fix: pytest.skip() raises the Skipped outcome itself; the old
        # ``raise pytest.skip(...)`` relied on that and would otherwise
        # attempt to raise the function's None return value.
        pytest.skip("The harvester extension is needed for these tests")

    # First object: content only
    content1 = "<xml>Content 1</xml>"
    ho1 = HarvestObject(
        guid="test-ho-1",
        job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
        content=content1,
    )

    # Second object: content plus the original harvested document
    content2 = "<xml>Content 2</xml>"
    original_content2 = "<xml>Original Content 2</xml>"
    ho2 = HarvestObject(
        guid="test-ho-2",
        job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
        content=content2,
    )
    hoe = HarvestObjectExtra(
        key="original_document", value=original_content2, object=ho2
    )

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    object_id_1 = ho1.id
    object_id_2 = ho2.id

    # Access object content
    url = "/harvest/object/{0}".format(object_id_1)
    r = app.get(url, status=200)
    assert r.headers["Content-Type"] == "application/xml; charset=utf-8"
    assert (
        r.body
        == '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>'
    )

    # Access original content in object extra (if present)
    url = "/harvest/object/{0}/original".format(object_id_1)
    r = app.get(url, status=404)

    url = "/harvest/object/{0}/original".format(object_id_2)
    r = app.get(url, status=200)
    assert r.headers["Content-Type"] == "application/xml; charset=utf-8"
    assert (
        r.body
        == '<?xml version="1.0" encoding="UTF-8"?>\n'
        + "<xml>Original Content 2</xml>"
    )
if not source_pkg: log.error('Harvest source %s does not exist', source_name) return source_id = source_pkg.id source = HarvestSource.get(source_id) if not source: log.error('Harvest source %s does not exist', source_id) return # Check if the source is active if not source.active: log.warn('Harvest job cannot be created for inactive source %s', source_id) raise Exception('Can not create jobs on inactive sources') job = HarvestJob() job.source = source job.save() context['harvest_job'] = job print str(datetime.datetime.now()) + ' Start to import doi datasets.' print 'Datasets found on remote doi server: ' + str( len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.' ids_to_add = collected_ids - existing_ids print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.' for num, doi_id in enumerate(ids_to_add): context.pop('package', None) context.pop('group', None) try: new_package = self.get_doi_package(url_dataset + doi_id)
def harvest_jobs_run(context, data_dict):
    """Flag completed 'Running' jobs as 'Finished', then push every 'New'
    job (optionally limited to ``data_dict['source_id']``) onto the gather
    queue.

    Raises when there are no new jobs to run. Returns the list of job
    dicts sent to the queue.

    NOTE(review): the original file was whitespace-mangled; the placement
    of the datajson/reindex section inside the ``objects.count() == 0``
    branch is reconstructed from the data flow — confirm against upstream.
    """
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                # A job is done once none of its objects is still pending
                # (i.e. every object is COMPLETE or ERROR)
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id==job['id']) \
                    .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                 (HarvestObject.state!=u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Use the last import time as the job's finish time
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id==job['id']) \
                        .filter(HarvestObject.import_finished!=None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like:
                    # 'parents_run' schedules a follow-up 'children_run'
                    # job; any other truthy value clears the flag.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the
                    # latest status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs