    def test_import_since_regular_value_returned_unchanged(self):
        '''Test that any value other than 'big_bang' or 'last_error_free' for
           `import_since` is returned unchanged.'''

        FisbrokerPlugin().source_config = {'import_since': "2020-03-01"}
        import_since = FisbrokerPlugin().get_import_since_date(None)
        _assert_equal(import_since, "2020-03-01")
    def test_timeout_config_returned_as_int(self):
        '''Test that get_timeout() always returns an int if the `timeout`
           config is set.'''

        FisbrokerPlugin().source_config = { 'timeout': '100' }
        timeout = FisbrokerPlugin().get_timeout()
        _assert_equal(timeout, 100)
    def test_timedelta_config_returned_as_int(self):
        '''Test that get_timedelta() always returns an int if the `timedelta`
           config is set.'''

        FisbrokerPlugin().source_config = { 'timedelta': '1' }
        timedelta = FisbrokerPlugin().get_timedelta()
        _assert_equal(timedelta, 1)
    def test_import_since_big_bang_means_none(self):
        '''Test that 'big_bang' for the `import_since` config means that
           get_import_since_date() returns None.'''

        FisbrokerPlugin().source_config = { 'import_since': "big_bang" }
        import_since = FisbrokerPlugin().get_import_since_date(None)
        _assert_equal(import_since, None)
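    # Hedged sketch (an assumption for illustration, not the plugin's actual
    # source) of the config accessors exercised by the tests above:
    # get_timeout()/get_timedelta() coerce the configured value to int and
    # fall back to TIMEOUT_DEFAULT/TIMEDELTA_DEFAULT, while
    # get_import_since_date() maps 'big_bang' (and a missing setting) to None,
    # resolves 'last_error_free' via the previous jobs, and returns any other
    # value unchanged.
    def get_timeout(self):
        return int(self.source_config.get('timeout', TIMEOUT_DEFAULT))

    def get_timedelta(self):
        return int(self.source_config.get('timedelta', TIMEDELTA_DEFAULT))

    def get_import_since_date(self, harvest_job):
        import_since = self.source_config.get('import_since')
        if not import_since or import_since == 'big_bang':
            return None
        if import_since == 'last_error_free':
            job = self.last_error_free_job(harvest_job)
            if not job:
                return None
            shifted = job.gather_started + timedelta(hours=self.get_timedelta())
            return shifted.strftime("%Y-%m-%dT%H:%M:%S%z")
        return import_since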
    def test_timedelta_must_be_int(self):
        '''Test that the `timedelta` config must be an int.'''
        config = '{ "timedelta": 2 }'
        assert FisbrokerPlugin().validate_config(config)
        # invalid timedelta:
        config = '{ "timedelta": "two" }'
        with assert_raises(ValueError):
            assert FisbrokerPlugin().validate_config(config)
    def test_timeout_must_be_int(self):
        '''Test that the `timeout` config must be an int.'''
        config = '{ "timeout": 30 }'
        assert FisbrokerPlugin().validate_config(config)
        # invalid timeout:
        config = '{ "timeout": "hurtz" }'
        with assert_raises(ValueError):
            assert FisbrokerPlugin().validate_config(config)
    def test_import_since_must_be_valid_iso(self):
        '''Test that the `import_since` config must be a valid ISO8601 date.'''
        config = '{ "import_since": "2019-01-01" }'
        assert FisbrokerPlugin().validate_config(config)
        # invalid date:
        config = '{ "import_since": "2019.01.01" }'
        with assert_raises(ValueError):
            assert FisbrokerPlugin().validate_config(config)
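    # Hedged sketch (an assumption, not the plugin's actual source) of the
    # validate_config() behaviour the three tests above rely on: the config
    # arrives as a JSON string, `timeout` and `timedelta` must be ints, and
    # `import_since` must be 'big_bang', 'last_error_free' or a valid
    # ISO 8601 date; anything else raises ValueError. Assumes `import json`
    # and `import datetime` at module level.
    def validate_config(self, config):
        if not config:
            return config
        config_obj = json.loads(config)
        for key in ('timeout', 'timedelta'):
            if key in config_obj and not isinstance(config_obj[key], int):
                raise ValueError('%s must be an integer' % key)
        import_since = config_obj.get('import_since')
        if import_since and import_since not in ('big_bang', 'last_error_free'):
            try:
                datetime.datetime.strptime(import_since, '%Y-%m-%d')
            except ValueError:
                raise ValueError('import_since must be a valid ISO8601 date')
        return config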
    def test_import_since_date_is_none_if_no_jobs(self):
        '''Test that, if the `import_since` setting is `last_error_free`, but
        no jobs have run successfully (or at all), get_import_since_date()
        returns None.'''

        source, job = self._create_source_and_job()
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(job)
        _assert_equal(import_since, None)
    def test_skip_on_missing_release_date(self):
        '''Test if get_package_dict() returns 'skip' for a service resource
           without a release date.'''

        data_dict = self._csw_resource_data_dict('wfs-no-release-date.xml')
        _assert_equal(FisbrokerPlugin().get_package_dict(self.context, data_dict), 'skip')
    def test_last_error_free_does_not_return_reimport_job(self):
        '''Test that reimport jobs are ignored for determining
           the last error-free job.'''

        # do a successful job
        source, job_a = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job_a)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_a.status = u'Finished'
        job_a.save()

        LOG.debug("successful job done ...")

        # do an unsuccessful job
        # This harvest job should fail, because the mock FIS-broker will look for a different
        # file on the second harvest run, will not find it and return a "no_record_found"
        # error.
        job_b = self._create_job(source.id)
        object_ids = gather_stage(FisbrokerPlugin(), job_b)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_b.status = u'Finished'
        job_b.save()

        LOG.debug("unsuccessful job done ...")

        # reset the mock server's counter
        reset_mock_server(1)

        # do a reimport job
        package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483"
        self._get_test_app().get(
            url="/api/harvest/reimport?id={}".format(package_id),
            headers={'Accept': 'application/json'},
            extra_environ={'REMOTE_USER': self.context['user'].encode('ascii')}
        )

        LOG.debug("reimport job done ...")

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        # job_a should be the last error free job:
        _assert_equal(last_error_free_job.id, job_a.id)
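    # Hedged sketch (an assumption, not the plugin's actual source) of
    # last_error_free_job(): walk this source's finished jobs newest-first,
    # skip jobs with gather or object errors, and also skip reimport jobs,
    # which can be recognised by the 'type' = 'reimport' extra on their
    # harvest objects (see reimport_batch() further down).
    @classmethod
    def last_error_free_job(cls, harvest_job):
        jobs = Session.query(HarvestJob) \
                      .filter(HarvestJob.source == harvest_job.source) \
                      .filter(HarvestJob.status == u'Finished') \
                      .filter(HarvestJob.id != harvest_job.id) \
                      .order_by(HarvestJob.gather_started.desc())
        for job in jobs:
            if job.gather_errors:
                continue
            objects = job.objects
            if any(obj.errors for obj in objects):
                continue
            if any(extra.key == 'type' and extra.value == 'reimport'
                   for obj in objects for extra in obj.extras):
                continue
            return job
        return None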
    def test_last_error_free_returns_correct_job(self):
        '''Test that, after a successful job A, last_error_free() returns A.'''

        source, job = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job.status = u'Finished'
        job.save()

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        _assert_equal(last_error_free_job, job)

        # the import_since date should be the job's gather_started time, shifted by the configured timedelta:
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        import_since_expected = (job.gather_started +
                                 timedelta(hours=FisbrokerPlugin().get_timedelta()))
        _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the query constraints should reflect the import_since date:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).literal)
        _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).propertyname)
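    # Hedged sketch (an assumption, not the plugin's actual source) of
    # get_constraints(), matching the assertions above: if an import_since
    # date can be determined, emit a single OGC filter on the CSW 'modified'
    # property; otherwise harvest without constraints.
    def get_constraints(self, harvest_job):
        date = self.get_import_since_date(harvest_job)
        if not date:
            return []
        return [PropertyIsGreaterThanOrEqualTo('modified', date)]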
    def _run_job_for_single_document(self, harvest_job, object_id):

        harvester = FisbrokerPlugin()

        # we circumvent gather_stage() and fetch_stage() and just load the
        # content with a known object_id and create the harvest object:
        url = harvest_job.source.url
        # _get_content() returns XML
        content = harvester._get_content(url)
        obj = HarvestObject(guid=object_id,
                            job=harvest_job,
                            content=content,
                            extras=[HarvestObjectExtra(key='status',value='new')])
        obj.save()

        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)

        harvest_job.status = u'Finished'
        harvest_job.save()

        return obj
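    # Hypothetical usage of the helper above; the GUID literal and the final
    # assertion are illustrative only and not part of the original suite:
    def test_import_single_document_example(self):
        source, job = self._create_source_and_job()
        # 'some-csw-record-guid' is a placeholder for a GUID served by the mock FIS-Broker
        obj = self._run_job_for_single_document(job, 'some-csw-record-guid')
        assert obj.package_id  # the import stage should have created or linked a package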
    def test_last_error_free_does_not_return_unsuccessful_job(self):
        '''Test that, after a successful job A, followed by an unsuccessful
           job B, last_error_free() returns A.'''

        source, job_a = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job_a)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_a.status = u'Finished'
        job_a.save()

        # This harvest job should fail, because the mock FIS-broker will look for a different
        # file on the second harvest run, will not find it and return a "no_record_found"
        # error.
        job_b = self._create_job(source.id)
        object_ids = gather_stage(FisbrokerPlugin(), job_b)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_b.status = u'Finished'
        job_b.save()

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        # job_a should be the last error free job:
        _assert_equal(last_error_free_job, job_a)

        # the import_since date should be job_a's gather_started time, shifted by the configured timedelta:
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        import_since_expected = (job_a.gather_started +
                                 timedelta(hours=FisbrokerPlugin().get_timedelta()))
        _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the query constraints should reflect the import_since date:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
        _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).propertyname)
    def command(self):
        '''Implementation of the paster command
        '''

        self._load_config()

        if not self.args:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]

        if cmd == 'list_sources':
            LOG.debug("listing all instances of FisbrokerPlugin ...")
            sources = self.list_sources()
            self.print_harvest_sources(sources)
        elif cmd == 'list_datasets':
            LOG.debug("listing datasets harvested by FisbrokerPlugin ...")
            sources = [source.get('id') for source in self.list_sources()]
            if len(self.args) >= 2:
                sources = [unicode(self.args[1])]
            for source in sources:
                start = time.time()
                packages = self.list_packages(source)
                self.print_datasets(packages)
                LOG.debug("there were %i results ...", len(packages))
                end = time.time()
                LOG.debug("This took %f seconds", end - start)
        elif cmd == 'reimport_dataset':
            LOG.debug("reimporting datasets ...")
            package_ids = []
            if self.options.dataset_id:
                LOG.debug("reimporting a single dataset ...")
                package_ids = [unicode(self.options.dataset_id)]
            else:
                sources = []
                if self.options.source_id:
                    LOG.debug(
                        "reimporting all dataset from a single source: %s ...",
                        self.options.source_id)
                    sources = [unicode(self.options.source_id)]
                else:
                    LOG.debug("reimporting all dataset from all sources ...")
                    sources = [
                        source.get('id') for source in self.list_sources()
                    ]
                for source in sources:
                    package_ids += [
                        package['name']
                        for package in self.list_packages(source)
                    ]
            start = time.time()
            self.reimport_dataset(package_ids)
            end = time.time()
            LOG.debug("This took %f seconds", end - start)
        elif cmd == 'last_successful_job':

            class MockHarvestJob:
                pass

            sources = []
            if self.options.source_id:
                LOG.debug(
                    "finding last successful job from a single source: %s ...",
                    self.options.source_id)
                sources = [unicode(self.options.source_id)]
            else:
                LOG.debug("finding last successful job from all sources ...")
                sources = [source.get('id') for source in self.list_sources()]
            for source in sources:
                harvest_job = MockHarvestJob()
                harvest_job.source = HarvestSource.get(source)
                harvest_job.id = 'fakeid'
                last_successful_job = FisbrokerPlugin.last_error_free_job(
                    harvest_job)
                LOG.debug(last_successful_job)
        else:
            print 'Command %s not recognized' % cmd
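    # Hypothetical command-line invocations (the actual sub-command name and
    # option flags depend on how this class is registered under
    # [paste.paster_command] in setup.py and on its option parser):
    #
    #   paster --plugin=ckanext-fisbroker fisbroker list_sources -c production.ini
    #   paster --plugin=ckanext-fisbroker fisbroker list_datasets <source-id> -c production.ini
    #   paster --plugin=ckanext-fisbroker fisbroker reimport_dataset --dataset-id <id> -c production.ini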
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.'''

        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if harvester_type != HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        assert harvest_job
        harvest_job.gather_started = datetime.datetime.utcnow()

        # instantiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status',value='change'),
                                            HarvestObjectExtra(key='type',value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages
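    # Hypothetical call site for reimport_batch() (the context dict and the
    # owning object name are illustrative only, not from the original code):
    #
    #   context = {'model': model, 'session': model.Session, 'user': u'admin'}
    #   reimported = controller.reimport_batch([u'name-of-harvested-dataset'], context)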
    def test_skip_on_dataset_resource(self):
        '''Test if get_package_dict() returns 'skip' for a dataset
           CSW resource (as opposed to a service resource).'''

        data_dict = self._csw_resource_data_dict('dataset-open-data.xml')
        _assert_equal(FisbrokerPlugin().get_package_dict(self.context, data_dict), 'skip')
    def test_skip_on_missing_email(self):
        '''Test if get_package_dict() returns 'skip' for a service resource
           without an email in the responsible party information.'''

        data_dict = self._csw_resource_data_dict('wfs-no-email.xml')
        _assert_equal(FisbrokerPlugin().get_package_dict(self.context, data_dict), 'skip')
    def test_skip_on_missing_license_info(self):
        '''Test if get_package_dict() returns 'skip' for a service resource
           without parseable license information.'''

        data_dict = self._csw_resource_data_dict('wfs-no-license.xml')
        _assert_equal(FisbrokerPlugin().get_package_dict(self.context, data_dict), 'skip')
    def test_empty_config(self):
        '''Test that an empty config is returned unchanged.'''
        _assert_equal(FisbrokerPlugin().validate_config(None), None)
        _assert_equal(FisbrokerPlugin().validate_config({}), {})
    def test_undefined_import_since_is_none(self):
        '''Test that an undefined `import_since` config returns None.'''

        FisbrokerPlugin().source_config = {}
        import_since = FisbrokerPlugin().get_import_since_date(None)
        _assert_equal(import_since, None)
    def test_undefined_time_delta_gives_default(self):
        '''Test that an undefined `timedelta` config returns the default.'''

        FisbrokerPlugin().source_config = {}
        timedelta = FisbrokerPlugin().get_timedelta()
        _assert_equal(timedelta, TIMEDELTA_DEFAULT)
    def test_undefined_timeout_gives_default(self):
        '''Test that an undefined `timeout` config returns the default.'''

        FisbrokerPlugin().source_config = {}
        timeout = FisbrokerPlugin().get_timeout()
        _assert_equal(timeout, TIMEOUT_DEFAULT)
    def test_skip_on_closed_data_resource(self):
        '''Test if get_package_dict() returns 'skip' for a closed data
           CSW resource.'''

        data_dict = self._csw_resource_data_dict('wfs-closed-data.xml')
        _assert_equal(FisbrokerPlugin().get_package_dict(self.context, data_dict), 'skip')
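    # Hedged sketch (an assumption, not the plugin's actual source) of the
    # early exits in get_package_dict() that the various 'skip' tests in this
    # file exercise. The predicate helpers (is_dataset_resource, is_closed_data,
    # responsible_party_email, parseable_license, release_date) are
    # illustrative names only:
    def get_package_dict(self, context, data_dict):
        iso_values = data_dict['iso_values']
        if is_dataset_resource(iso_values):        # dataset resource, not a service
            return 'skip'
        if is_closed_data(iso_values):             # no open-data marker
            return 'skip'
        if not responsible_party_email(iso_values):
            return 'skip'
        if not parseable_license(iso_values):
            return 'skip'
        if not release_date(iso_values):
            return 'skip'
        package_dict = super(FisbrokerPlugin, self).get_package_dict(context, data_dict)
        # ... FIS-Broker specific mapping of package_dict would continue here ...
        return package_dict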