def test_last_error_free_returns_correct_job(self):
    '''Test that, after a successful job A, last_error_free() returns A.'''
    source, job = self._create_source_and_job()
    object_ids = gather_stage(FisbrokerPlugin(), job)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job.status = u'Finished'
    job.save()

    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    _assert_equal(last_error_free_job, job)

    # the import_since date should be the time the job finished:
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    import_since_expected = (job.gather_started +
                             timedelta(hours=FisbrokerPlugin().get_timedelta()))
    _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    _assert_equal(constraint.literal,
                  PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
    _assert_equal(constraint.propertyname,
                  PropertyIsGreaterThanOrEqualTo('modified', import_since).propertyname)
def run_harvest_job(job, harvester):
    # In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue'
    # which would do 2 things to 'run' the job:
    # 1. change the job status to Running
    job.status = 'Running'
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid
        results_by_guid[guid] = {'obj_id': obj_id}
        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]['state'] = harvest_object.state
        results_by_guid[guid]['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            results_by_guid[guid]['dataset'] = \
                toolkit.get_action('package_show')(
                    {'ignore_auth': True},
                    dict(id=harvest_object.package_id))
        results_by_guid[guid]['errors'] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})

    return results_by_guid
def run_harvest_job(job, harvester):
    # When 'paster harvest run' is called by the regular cron it does 2 things:
    # 1. change the job status to Running
    job.status = "Running"
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid
        results_by_guid[guid] = {"obj_id": obj_id}
        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]["state"] = harvest_object.state
        results_by_guid[guid]["report_status"] = harvest_object.report_status
        if harvest_object.state == "COMPLETE" and harvest_object.package_id:
            results_by_guid[guid]["dataset"] = toolkit.get_action("package_show")(
                {}, dict(id=harvest_object.package_id)
            )
        results_by_guid[guid]["errors"] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    try:
        toolkit.get_action("harvest_jobs_run")({"ignore_auth": True}, {})
    except NoNewHarvestJobError:
        # This is expected
        pass

    return results_by_guid
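# A minimal sketch of how the run_harvest_job() helper above might be driven
# from a test. The fixture helpers (_create_test_source, _create_test_job) and
# MockHarvester are assumptions for illustration, not part of the original code.
def test_datasets_harvested():
    source = _create_test_source()    # hypothetical fixture helper
    job = _create_test_job(source)    # hypothetical fixture helper
    results = run_harvest_job(job, MockHarvester())
    for guid, result in results.items():
        # every gathered object should have imported cleanly
        assert result['state'] == 'COMPLETE', result['errors']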
def run_job_synchronously(self):
    import datetime
    from ckan import model
    from ckan.logic import get_action  # needed for the package_show call below
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)

    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source.type:
            break
    else:
        print "No harvester found to handle the job."
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for obj_id in harvest_object_ids:
            obj = HarvestObject.get(obj_id)
            obj.retry_times += 1
            obj.save()
            fetch_and_import_stages(harvester, obj)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show,
        # not the class object.
        package_index.index_package(
            get_action('package_show')(
                {'validate': False, 'ignore_auth': True},
                {'id': source.id}))
    finally:
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def run_harvest_job(job, harvester):
    # In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue'
    # which would do 2 things to 'run' the job:
    # 1. change the job status to Running
    job.status = 'Running'
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)
    if not isinstance(obj_ids, list):
        # gather had nothing to do or errored. Carry on to ensure the job is
        # closed properly
        obj_ids = []

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid

        # force reimport of datasets
        if hasattr(job, 'force_import'):
            if guid in job.force_import:
                harvest_object.force_import = True
            else:
                log.info('Skipping: %s', guid)
                continue

        results_by_guid[guid] = {'obj_id': obj_id}
        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]['state'] = harvest_object.state
        results_by_guid[guid]['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            results_by_guid[guid]['dataset'] = \
                toolkit.get_action('package_show')(
                    {'ignore_auth': True},
                    dict(id=harvest_object.package_id))
        results_by_guid[guid]['errors'] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})

    return results_by_guid
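# A minimal sketch (an assumption, not part of the original code) of driving
# the force_import branch above: run_harvest_job() only checks for a plain
# 'force_import' attribute on the job via hasattr(), so a caller can attach a
# list of GUIDs before running the job to reimport just those records.
job = harvest_model.HarvestJob(source=source)       # 'source' assumed to exist
job.save()
job.force_import = ['guid-of-dataset-to-reimport']  # hypothetical GUID
results = run_harvest_job(job, harvester)           # 'harvester' assumed to exist
# all other GUIDs are skipped, so only the forced one shows up in the results
assert list(results.keys()) == ['guid-of-dataset-to-reimport']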
def test_last_error_free_does_not_return_reimport_job(self):
    '''Test that reimport jobs are ignored for determining
       the last error-free job.'''

    # do a successful job
    source, job_a = self._create_source_and_job()
    object_ids = gather_stage(FisbrokerPlugin(), job_a)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_a.status = u'Finished'
    job_a.save()

    LOG.debug("successful job done ...")

    # do an unsuccessful job
    # This harvest job should fail, because the mock FIS-Broker will look for
    # a different file on the second harvest run, will not find it and return
    # a "no_record_found" error.
    job_b = self._create_job(source.id)
    object_ids = gather_stage(FisbrokerPlugin(), job_b)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_b.status = u'Finished'
    job_b.save()

    LOG.debug("unsuccessful job done ...")

    # reset the mock server's counter
    reset_mock_server(1)

    # do a reimport job
    package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483"
    self._get_test_app().get(
        url="/api/harvest/reimport?id={}".format(package_id),
        headers={'Accept': 'application/json'},
        extra_environ={'REMOTE_USER': self.context['user'].encode('ascii')}
    )

    LOG.debug("reimport job done ...")

    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    # job_a should be the last error free job:
    _assert_equal(last_error_free_job.id, job_a.id)
def test_last_error_free_does_not_return_unsuccessful_job(self):
    '''Test that, after a successful job A, followed by an unsuccessful
       job B, last_error_free() returns A.'''
    source, job_a = self._create_source_and_job()
    object_ids = gather_stage(FisbrokerPlugin(), job_a)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_a.status = u'Finished'
    job_a.save()

    # This harvest job should fail, because the mock FIS-Broker will look for
    # a different file on the second harvest run, will not find it and return
    # a "no_record_found" error.
    job_b = self._create_job(source.id)
    object_ids = gather_stage(FisbrokerPlugin(), job_b)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_b.status = u'Finished'
    job_b.save()

    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    # job_a should be the last error free job:
    _assert_equal(last_error_free_job, job_a)

    # the import_since date should be the time job_a finished:
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    import_since_expected = (job_a.gather_started +
                             timedelta(hours=FisbrokerPlugin().get_timedelta()))
    _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    _assert_equal(constraint.literal,
                  PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
    _assert_equal(constraint.propertyname,
                  PropertyIsGreaterThanOrEqualTo('modified', import_since).propertyname)
class WAFCollectionHarvester(GeoDataGovWAFHarvester):

    def info(self):
        return {
            'name': 'waf-collection',
            'title': 'Web Accessible Folder (WAF) Homogeneous Collection',
            'description': 'A Web Accessible Folder (WAF) displaying a list '
                           'of spatial metadata documents with a collection record'
        }

    def extra_schema(self):
        extra_schema = super(WAFCollectionHarvester, self).extra_schema()
        extra_schema['collection_metadata_url'] = [not_empty, unicode]
        return extra_schema

    def get_package_dict(self, iso_values, harvest_object):
        package_dict = super(WAFCollectionHarvester, self).get_package_dict(
            iso_values, harvest_object)
        if not package_dict:
            return None

        collection_package_id = self._get_object_extra(
            harvest_object, 'collection_package_id')
        if collection_package_id:
            package_dict['extras'].append(
                dict(key='collection_package_id', value=collection_package_id))

        collection_metadata = self._get_object_extra(
            harvest_object, 'collection_metadata')
        if collection_metadata:
            package_dict['extras'].append(
                dict(key='collection_metadata', value=collection_metadata))

        status = self._get_object_extra(harvest_object, 'status')
        if status == 'change':
            self.force_import = True
        else:
            self.force_import = False

        return package_dict

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        collection_metadata_url = self.source_config.get('collection_metadata_url')
        if not collection_metadata_url:
            self._save_gather_error('collection url does not exist', harvest_job)
            return None

        try:
            response = requests.get(source_url, timeout=60)
            content = response.content
        except Exception, e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        guid = hashlib.md5(
            collection_metadata_url.encode('utf8', 'ignore')).hexdigest()

        existing_harvest_object = model.Session.\
            query(HarvestObject.guid, HarvestObject.package_id, HOExtra.value).\
            join(HOExtra, HarvestObject.extras).\
            filter(HOExtra.key == 'collection_metadata').\
            filter(HOExtra.value == 'true').\
            filter(HarvestObject.current == True).\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
            first()

        if existing_harvest_object:
            status = 'change'
            guid = existing_harvest_object.guid
            package_id = existing_harvest_object.package_id
        else:
            status, package_id = 'new', None

        obj = HarvestObject(
            job=harvest_job,
            extras=[
                HOExtra(key='collection_metadata', value='true'),
                HOExtra(key='waf_location', value=collection_metadata_url),
                HOExtra(key='status', value=status)
            ],
            guid=guid,
            status=status,
            package_id=package_id
        )

        queue.fetch_and_import_stages(self, obj)
        if obj.state == 'ERROR':
            self._save_gather_error(
                'Collection object failed to harvest, not harvesting',
                harvest_job)
            return None

        return GeoDataGovWAFHarvester.gather_stage(
            self, harvest_job, collection_package_id=obj.package_id)
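# For reference, a minimal sketch of the _get_object_extra() helper called
# repeatedly above. The real implementation is inherited from the spatial
# harvester base class; this version simply scans the harvest object's
# extras for the first matching key.
def _get_object_extra(self, harvest_object, key):
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None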