def test_import_since_regular_value_returned_unchanged(self):
    '''Any `import_since` value other than the special keywords
    ('big_bang', 'last_changed') must be passed through unchanged.'''
    # FisbrokerPlugin is a singleton, so configuring one reference
    # configures them all — TODO confirm against plugin base class.
    plugin = FisbrokerPlugin()
    plugin.source_config = {'import_since': "2020-03-01"}
    _assert_equal(plugin.get_import_since_date(None), "2020-03-01")
def test_timeout_config_returned_as_int(self):
    '''get_timeout() must coerce a string-valued `timeout` config to int.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {'timeout': '100'}
    _assert_equal(plugin.get_timeout(), 100)
def test_timedelta_config_returned_as_int(self):
    '''get_timedelta() must coerce a string-valued `timedelta` config to int.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {'timedelta': '1'}
    _assert_equal(plugin.get_timedelta(), 1)
def test_import_since_big_bang_means_none(self):
    '''The special `import_since` value 'big_bang' means "harvest
    everything", so get_import_since_date() must return None.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {'import_since': "big_bang"}
    _assert_equal(plugin.get_import_since_date(None), None)
def test_timedelta_must_be_int(self):
    '''validate_config() accepts an integer `timedelta` and raises
    ValueError for a value that cannot be parsed as one.'''
    valid_config = '{ "timedelta": 2 }'
    assert FisbrokerPlugin().validate_config(valid_config)
    # a non-numeric timedelta must be rejected:
    invalid_config = '{ "timedelta": "two" }'
    with assert_raises(ValueError):
        assert FisbrokerPlugin().validate_config(invalid_config)
def test_timeout_must_be_int(self):
    '''validate_config() accepts an integer `timeout` and raises
    ValueError for a value that cannot be parsed as one.'''
    valid_config = '{ "timeout": 30 }'
    assert FisbrokerPlugin().validate_config(valid_config)
    # a non-numeric timeout must be rejected:
    invalid_config = '{ "timeout": "hurtz" }'
    with assert_raises(ValueError):
        assert FisbrokerPlugin().validate_config(invalid_config)
def test_import_since_must_be_valid_iso(self):
    '''validate_config() accepts an ISO8601 `import_since` date and
    raises ValueError for a malformed one.'''
    valid_config = '{ "import_since": "2019-01-01" }'
    assert FisbrokerPlugin().validate_config(valid_config)
    # dots instead of dashes are not valid ISO8601:
    invalid_config = '{ "import_since": "2019.01.01" }'
    with assert_raises(ValueError):
        assert FisbrokerPlugin().validate_config(invalid_config)
def test_import_since_date_is_none_if_no_jobs(self):
    '''With `import_since` set to `last_error_free` but no job having
    run successfully (or at all), get_import_since_date() must
    return None.'''
    source, job = self._create_source_and_job()
    plugin = FisbrokerPlugin()
    plugin.source_config['import_since'] = "last_error_free"
    _assert_equal(plugin.get_import_since_date(job), None)
def test_skip_on_missing_release_date(self):
    '''get_package_dict() must return 'skip' for a service resource
    that carries no release date.'''
    data_dict = self._csw_resource_data_dict('wfs-no-release-date.xml')
    result = FisbrokerPlugin().get_package_dict(self.context, data_dict)
    _assert_equal(result, 'skip')
def test_last_error_free_does_not_return_reimport_job(self): '''Test that reimport jobs are ignored for determining the last error-free job.''' # do a successful job source, job_a = self._create_source_and_job() object_ids = gather_stage(FisbrokerPlugin(), job_a) for object_id in object_ids: harvest_object = HarvestObject.get(object_id) fetch_and_import_stages(FisbrokerPlugin(), harvest_object) job_a.status = u'Finished' job_a.save() LOG.debug("successful job done ...") # do an unsuccessful job # This harvest job should fail, because the mock FIS-broker will look for a different # file on the second harvest run, will not find it and return a "no_record_found" # error. job_b = self._create_job(source.id) object_ids = gather_stage(FisbrokerPlugin(), job_b) for object_id in object_ids: harvest_object = HarvestObject.get(object_id) fetch_and_import_stages(FisbrokerPlugin(), harvest_object) job_b.status = u'Finished' job_b.save() LOG.debug("unsuccessful job done ...") # reset the mock server's counter reset_mock_server(1) # do a reimport job package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483" self._get_test_app().get( url="/api/harvest/reimport?id={}".format(package_id), headers={'Accept': 'application/json'}, extra_environ={'REMOTE_USER': self.context['user'].encode('ascii')} ) LOG.debug("reimport job done ...") new_job = self._create_job(source.id) last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job) # job_a should be the last error free job: _assert_equal(last_error_free_job.id, job_a.id)
def test_last_error_free_returns_correct_job(self):
    '''Test that, after a successful job A, last_error_free() returns A.

    Also checks that the `last_error_free` import_since date and the
    derived CSW query constraint are based on that job's start time.'''
    # run one complete, successful harvest job
    source, job = self._create_source_and_job()
    object_ids = gather_stage(FisbrokerPlugin(), job)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job.status = u'Finished'
    job.save()
    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    _assert_equal(last_error_free_job, job)
    # the import_since date should be the time job_a finished
    # (gather_started plus the configured timedelta offset):
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    import_since_expected = (job.gather_started +
                             timedelta(hours=FisbrokerPlugin().get_timedelta()))
    _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))
    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo(
        'modified', import_since).literal)
    _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
        'modified', import_since).propertyname)
def _run_job_for_single_document(self, harvest_job, object_id):
    '''Run the import stage for a single record with a known object_id,
    bypassing gather_stage() and fetch_stage(), and return the finished
    HarvestObject.'''
    harvester = FisbrokerPlugin()
    # fetch the record content directly from the source URL;
    # _get_content() returns XML
    source_url = harvest_job.source.url
    record_xml = harvester._get_content(source_url)
    # build the harvest object by hand, marked as a new record
    obj = HarvestObject(
        guid=object_id,
        job=harvest_job,
        content=record_xml,
        extras=[HarvestObjectExtra(key='status', value='new')]
    )
    obj.save()
    assert obj, obj.content
    harvester.import_stage(obj)
    Session.refresh(obj)
    harvest_job.status = u'Finished'
    harvest_job.save()
    return obj
def test_last_error_free_does_not_return_unsuccessful_job(self):
    '''Test that, after a successful job A, followed by an unsuccessful
    job B, last_error_free() returns A.'''
    # run a successful job A
    source, job_a = self._create_source_and_job()
    object_ids = gather_stage(FisbrokerPlugin(), job_a)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_a.status = u'Finished'
    job_a.save()
    # This harvest job should fail, because the mock FIS-broker will look for a different
    # file on the second harvest run, will not find it and return a "no_record_found"
    # error.
    job_b = self._create_job(source.id)
    object_ids = gather_stage(FisbrokerPlugin(), job_b)
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
    job_b.status = u'Finished'
    job_b.save()
    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    # job_a should be the last error free job:
    _assert_equal(last_error_free_job, job_a)
    # the import_since date should be the time job_a finished
    # (gather_started plus the configured timedelta offset):
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    import_since_expected = (job_a.gather_started +
                             timedelta(hours=FisbrokerPlugin().get_timedelta()))
    _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))
    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
    _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
        'modified', import_since).propertyname)
def command(self):
    '''Implementation of the paster command.

    Dispatches on the first positional argument to one of the
    sub-commands: `list_sources`, `list_datasets`, `reimport_dataset`
    or `last_successful_job`. Unknown commands print an error.'''
    self._load_config()
    if not self.args:
        # no sub-command given: show usage and exit with an error code
        self.parser.print_usage()
        sys.exit(1)
    cmd = self.args[0]
    if cmd == 'list_sources':
        LOG.debug("listing all instances of FisbrokerPlugin ...")
        sources = self.list_sources()
        self.print_harvest_sources(sources)
    elif cmd == 'list_datasets':
        LOG.debug("listing datasets harvested by FisbrokerPlugin ...")
        sources = [source.get('id') for source in self.list_sources()]
        if len(self.args) >= 2:
            # an optional second argument restricts listing to one source id
            sources = [unicode(self.args[1])]
        for source in sources:
            start = time.time()
            packages = self.list_packages(source)
            self.print_datasets(packages)
            LOG.debug("there were %i results ...", len(packages))
            end = time.time()
            LOG.debug("This took %f seconds", end - start)
    elif cmd == 'reimport_dataset':
        LOG.debug("reimporting datasets ...")
        package_ids = []
        if self.options.dataset_id:
            LOG.debug("reimporting a single dataset ...")
            package_ids = [unicode(self.options.dataset_id)]
        else:
            # no dataset id: reimport everything from one source, or from all
            sources = []
            if self.options.source_id:
                LOG.debug(
                    "reimporting all dataset from a single source: %s ...", self.options.source_id)
                sources = [unicode(self.options.source_id)]
            else:
                LOG.debug("reimporting all dataset from all sources ...")
                sources = [
                    source.get('id') for source in self.list_sources()
                ]
            for source in sources:
                package_ids += [
                    package['name'] for package in self.list_packages(source)
                ]
        start = time.time()
        self.reimport_dataset(package_ids)
        end = time.time()
        LOG.debug("This took %f seconds", end - start)
    elif cmd == 'last_successful_job':
        # minimal stand-in for a HarvestJob; presumably
        # last_error_free_job() only reads .source and .id — TODO confirm
        class MockHarvestJob:
            pass
        sources = []
        if self.options.source_id:
            LOG.debug(
                "finding last successful job from a single source: %s ...", self.options.source_id)
            sources = [unicode(self.options.source_id)]
        else:
            LOG.debug("finding last successful job from all sources ...")
            sources = [source.get('id') for source in self.list_sources()]
        for source in sources:
            harvest_job = MockHarvestJob()
            harvest_job.source = HarvestSource.get(source)
            harvest_job.id = 'fakeid'
            last_successful_job = FisbrokerPlugin.last_error_free_job(
                harvest_job)
            LOG.debug(last_successful_job)
    else:
        print 'Command %s not recognized' % cmd
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
    harvest source.

    Validates every package first (must exist, must have been harvested
    by the FIS-Broker harvester, must have a FIS-Broker guid), then runs
    a fresh harvest job that re-imports each record fetched via CSW.
    Returns the list of reimported CSW records; raises a specific
    error class for each failure mode.'''
    ckan_fb_mapping = {}
    # first, do checks that can be done without connection to FIS-Broker
    for package_id in package_ids:
        package = Package.get(package_id)
        if not package:
            raise PackageIdDoesNotExistError(package_id)
        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)
        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if not harvester_type == HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)
        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)
        ckan_fb_mapping[package.id] = fb_guid
    # get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)
    # Create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.gather_started = datetime.datetime.utcnow()
    assert harvest_job
    # instantiate the CSW connector (on the reasonable assumption that harvester_url is
    # the same for all package_ids)
    package_id = None
    reimported_packages = []
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # query connector to get resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])
            # show resource document
            record = csw.records.get(fb_guid, None)
            if record:
                # wrap the fetched record in a harvest object marked as a
                # 'reimport' so downstream stages can tell it apart
                obj = HarvestObject(guid=fb_guid,
                                    job=harvest_job,
                                    content=record.xml,
                                    package_id=package_id,
                                    extras=[
                                        HarvestObjectExtra(key='status',value='change'),
                                        HarvestObjectExtra(key='type',value='reimport'),
                                    ])
                obj.save()
                assert obj, obj.content
                harvester = FisbrokerPlugin()
                # force_import bypasses the harvester's change detection
                harvester.force_import = True
                harvester.import_stage(obj)
                rejection_reason = self._dataset_rejected(obj)
                if rejection_reason:
                    raise FBImportError(package_id, rejection_reason)
                harvester.force_import = False
                Session.refresh(obj)
                reimported_packages.append(record)
            else:
                raise NotFoundInFisbrokerError(package_id, fb_guid)
    except RequestException as error:
        # network-level failure talking to FIS-Broker
        raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))
    # successfully finish harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()
    return reimported_packages
def test_skip_on_dataset_resource(self):
    '''get_package_dict() must return 'skip' for a dataset CSW resource
    (only service resources are harvested).'''
    data_dict = self._csw_resource_data_dict('dataset-open-data.xml')
    result = FisbrokerPlugin().get_package_dict(self.context, data_dict)
    _assert_equal(result, 'skip')
def test_skip_on_missing_email(self):
    '''get_package_dict() must return 'skip' for a service resource whose
    responsible-party information has no email address.'''
    data_dict = self._csw_resource_data_dict('wfs-no-email.xml')
    result = FisbrokerPlugin().get_package_dict(self.context, data_dict)
    _assert_equal(result, 'skip')
def test_skip_on_missing_license_info(self):
    '''get_package_dict() must return 'skip' for a service resource
    without parseable license information.'''
    data_dict = self._csw_resource_data_dict('wfs-no-license.xml')
    result = FisbrokerPlugin().get_package_dict(self.context, data_dict)
    _assert_equal(result, 'skip')
def test_empty_config(self):
    '''An empty config (None or {}) must be returned unchanged by
    validate_config().'''
    plugin = FisbrokerPlugin()
    _assert_equal(plugin.validate_config(None), None)
    _assert_equal(plugin.validate_config({}), {})
def test_undefined_import_since_is_none(self):
    '''Without an `import_since` setting, get_import_since_date()
    must return None.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {}
    _assert_equal(plugin.get_import_since_date(None), None)
def test_undefined_time_delta_gives_default(self):
    '''Without a `timedelta` setting, get_timedelta() must fall back
    to TIMEDELTA_DEFAULT.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {}
    _assert_equal(plugin.get_timedelta(), TIMEDELTA_DEFAULT)
def test_undefined_timeout_gives_default(self):
    '''Without a `timeout` setting, get_timeout() must fall back
    to TIMEOUT_DEFAULT.'''
    plugin = FisbrokerPlugin()
    plugin.source_config = {}
    _assert_equal(plugin.get_timeout(), TIMEOUT_DEFAULT)
def test_skip_on_closed_data_resource(self):
    '''get_package_dict() must return 'skip' for a closed-data CSW
    resource (only open data is harvested).'''
    data_dict = self._csw_resource_data_dict('wfs-closed-data.xml')
    result = FisbrokerPlugin().get_package_dict(self.context, data_dict)
    _assert_equal(result, 'skip')