    def test_last_error_free_returns_correct_job(self):
        '''Test that, after a successful job A, last_error_free_job() returns A.'''

        source, job = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job.status = u'Finished'
        job.save()

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        _assert_equal(last_error_free_job, job)

        # the import_since date should be the time the job started gathering,
        # offset by the configured timedelta:
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        import_since_expected = (job.gather_started +
                                 timedelta(hours=FisbrokerPlugin().get_timedelta()))
        _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the query constraints should reflect the import_since date:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).literal)
        _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).propertyname)
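
The assertions above compare owslib fes filter objects. As context, here is a minimal sketch of how such a "modified >= import_since" constraint is typically passed to a CSW endpoint via owslib's getrecords2(); the endpoint URL and date literal are placeholders, not values from the test suite.

from owslib.csw import CatalogueServiceWeb
from owslib.fes import PropertyIsGreaterThanOrEqualTo

# Hypothetical CSW endpoint; FIS-Broker exposes a CSW interface, but this
# URL is a placeholder.
csw = CatalogueServiceWeb('https://example.org/csw')
constraint = PropertyIsGreaterThanOrEqualTo('modified', '2020-01-01T00:00:00+0000')
# Ask the catalogue only for records modified on or after the given date.
csw.getrecords2(constraints=[constraint], maxrecords=100)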
Example 2
def run_harvest_job(job, harvester):
    # In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue'
    # which would do 2 things to 'run' the job:
    # 1. change the job status to Running
    job.status = 'Running'
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid
        results_by_guid[guid] = {'obj_id': obj_id}

        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]['state'] = harvest_object.state
        results_by_guid[guid]['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            results_by_guid[guid]['dataset'] = \
                toolkit.get_action('package_show')(
                    {'ignore_auth': True},
                    dict(id=harvest_object.package_id))
        results_by_guid[guid]['errors'] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})

    return results_by_guid
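
A minimal usage sketch for run_harvest_job(); the factory helpers and the harvester class are assumptions (ckanext-harvest ships similar test factories), not part of the code above.

from ckanext.harvest.tests import factories as harvest_factories  # assumed module path

source = harvest_factories.HarvestSourceObj(url='http://example.org/csw')  # hypothetical source
job = harvest_factories.HarvestJobObj(source=source)
results = run_harvest_job(job, FisbrokerPlugin())  # any IHarvester implementation works
for guid, result in results.items():
    assert result['state'] == 'COMPLETE', result['errors']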
Example 3
def run_harvest_job(job, harvester):
    # When 'paster harvest run' is called by the regular cron it does 2 things:
    # 1. change the job status to Running
    job.status = "Running"
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid
        results_by_guid[guid] = {"obj_id": obj_id}

        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]["state"] = harvest_object.state
        results_by_guid[guid]["report_status"] = harvest_object.report_status
        if harvest_object.state == "COMPLETE" and harvest_object.package_id:
            results_by_guid[guid]["dataset"] = toolkit.get_action("package_show")(
                {}, dict(id=harvest_object.package_id)
            )
        results_by_guid[guid]["errors"] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    try:
        toolkit.get_action("harvest_jobs_run")({"ignore_auth": True}, {})
    except NoNewHarvestJobError:
        # This is expected
        pass

    return results_by_guid
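
This variant differs from the previous one in two ways: it calls package_show with an empty context rather than {'ignore_auth': True}, so dataset visibility follows normal authorization, and it wraps 'harvest_jobs_run' in a try/except, because the action raises when there are no new jobs left to send to the gather queue, which is exactly the situation after the single job has been run synchronously. The import below is an assumption about where the exception lives (newer ckanext-harvest releases define it in ckanext.harvest.logic):

from ckanext.harvest.logic import NoNewHarvestJobError  # assumed location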
Example 4
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
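
The harvester lookup above relies on Python's for/else: the else branch runs only when the loop finishes without hitting break, i.e. when no registered plugin matches the source type. The same lookup as a standalone helper (a sketch; the function name is made up):

def find_harvester(source_type):
    # Return the first registered IHarvester whose name matches, or None.
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source_type:
            return harvester
    return None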
Example 6
def run_harvest_job(job, harvester):
    # In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue'
    # which would do 2 things to 'run' the job:
    # 1. change the job status to Running
    job.status = 'Running'
    job.save()
    # 2. put the job on the gather queue which is consumed by
    # queue.gather_callback, which determines the harvester and then calls
    # gather_stage. We simply call the gather_stage.
    obj_ids = queue.gather_stage(harvester, job)
    if not isinstance(obj_ids, list):
        # gather had nothing to do or errored. Carry on to ensure the job is
        # closed properly
        obj_ids = []

    # The object ids are put onto the fetch queue, consumed by
    # queue.fetch_callback which calls queue.fetch_and_import_stages
    results_by_guid = {}
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid

        # force reimport of datasets
        if hasattr(job, 'force_import'):
            if guid in job.force_import:
                harvest_object.force_import = True
            else:
                log.info('Skipping: %s', guid)
                continue

        results_by_guid[guid] = {'obj_id': obj_id}

        queue.fetch_and_import_stages(harvester, harvest_object)
        results_by_guid[guid]['state'] = harvest_object.state
        results_by_guid[guid]['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            results_by_guid[guid]['dataset'] = \
                toolkit.get_action('package_show')(
                    {'ignore_auth': True},
                    dict(id=harvest_object.package_id))
        results_by_guid[guid]['errors'] = harvest_object.errors

    # Do 'harvest_jobs_run' to change the job status to 'finished'
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})

    return results_by_guid
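
A sketch of how a caller might drive the force_import branch above; the guid is hypothetical, and the attribute is set ad hoc on the job object, which is why run_harvest_job() probes it with hasattr():

job.force_import = ['some-dataset-guid']  # hypothetical guid list
results = run_harvest_job(job, harvester)
# Gathered objects whose guid is not listed are skipped entirely, so only
# the forced guids can appear in the results.
assert set(results) <= set(job.force_import)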
Example 7
    def test_last_error_free_does_not_return_reimport_job(self):
        '''Test that reimport jobs are ignored for determining
           the last error-free job.'''

        # do a successful job
        source, job_a = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job_a)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_a.status = u'Finished'
        job_a.save()

        LOG.debug("successful job done ...")

        # do an unsuccessful job
        # This harvest job should fail, because the mock FIS-broker will look for a different
        # file on the second harvest run, will not find it and return a "no_record_found"
        # error.
        job_b = self._create_job(source.id)
        object_ids = gather_stage(FisbrokerPlugin(), job_b)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_b.status = u'Finished'
        job_b.save()

        LOG.debug("unsuccessful job done ...")

        # reset the mock server's counter
        reset_mock_server(1)

        # do a reimport job
        package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483"
        self._get_test_app().get(
            url="/api/harvest/reimport?id={}".format(package_id),
            headers={'Accept': 'application/json'},
            extra_environ={'REMOTE_USER': self.context['user'].encode('ascii')}
        )

        LOG.debug("reimport job done ...")

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        # job_a should be the last error free job:
        _assert_equal(last_error_free_job.id, job_a.id)
Example 8
    def test_last_error_free_does_not_return_unsuccessful_job(self):
        '''Test that, after a successful job A, followed by an unsuccessful
           job B, last_error_free_job() returns A.'''

        source, job_a = self._create_source_and_job()
        object_ids = gather_stage(FisbrokerPlugin(), job_a)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_a.status = u'Finished'
        job_a.save()

        # This harvest job should fail, because the mock FIS-broker will look for a different
        # file on the second harvest run, will not find it and return a "no_record_found"
        # error.
        job_b = self._create_job(source.id)
        object_ids = gather_stage(FisbrokerPlugin(), job_b)
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        job_b.status = u'Finished'
        job_b.save()

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        # job_a should be the last error free job:
        _assert_equal(last_error_free_job, job_a)

        # the import_since date should be the time job_a started gathering,
        # offset by the configured timedelta:
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        import_since_expected = (job_a.gather_started +
                                 timedelta(hours=FisbrokerPlugin().get_timedelta()))
        _assert_equal(import_since, import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the query constraints should reflect the import_since date:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        _assert_equal(constraint.literal, PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
        _assert_equal(constraint.propertyname, PropertyIsGreaterThanOrEqualTo(
            'modified', import_since).propertyname)
Example 9
class WAFCollectionHarvester(GeoDataGovWAFHarvester):
    def info(self):
        return {
            'name': 'waf-collection',
            'title': 'Web Accessible Folder (WAF) Homogeneous Collection',
            'description': 'A Web Accessible Folder (WAF) displaying a list of '
                           'spatial metadata documents with a collection record'
        }

    def extra_schema(self):
        extra_schema = super(WAFCollectionHarvester, self).extra_schema()
        extra_schema['collection_metadata_url'] = [not_empty, unicode]
        return extra_schema

    def get_package_dict(self, iso_values, harvest_object):

        package_dict = super(WAFCollectionHarvester,
                             self).get_package_dict(iso_values, harvest_object)
        if not package_dict:
            return None

        collection_package_id = self._get_object_extra(
            harvest_object, 'collection_package_id')
        if collection_package_id:
            package_dict['extras'].append(
                dict(key='collection_package_id', value=collection_package_id))

        collection_metadata = self._get_object_extra(harvest_object,
                                                     'collection_metadata')
        if collection_metadata:
            package_dict['extras'].append(
                dict(key='collection_metadata', value=collection_metadata))
            status = self._get_object_extra(harvest_object, 'status')
            if status == 'change':
                self.force_import = True
            else:
                self.force_import = False

        return package_dict

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        collection_metadata_url = self.source_config.get(
            'collection_metadata_url')

        if not collection_metadata_url:
            self._save_gather_error('collection_metadata_url is not set',
                                    harvest_job)
            return None

        try:
            response = requests.get(source_url, timeout=60)
            content = response.content
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        guid = hashlib.md5(collection_metadata_url.encode(
            'utf8', 'ignore')).hexdigest()

        existing_harvest_object = model.Session.\
            query(HarvestObject.guid, HarvestObject.package_id, HOExtra.value).\
            join(HOExtra, HarvestObject.extras).\
            filter(HOExtra.key=='collection_metadata').\
            filter(HOExtra.value=='true').\
            filter(HarvestObject.current==True).\
            filter(HarvestObject.harvest_source_id==harvest_job.source.id).first()

        if existing_harvest_object:
            status = 'change'
            guid = existing_harvest_object.guid
            package_id = existing_harvest_object.package_id
        else:
            status, package_id = 'new', None

        obj = HarvestObject(job=harvest_job,
                            extras=[
                                HOExtra(key='collection_metadata',
                                        value='true'),
                                HOExtra(key='waf_location',
                                        value=collection_metadata_url),
                                HOExtra(key='status', value=status)
                            ],
                            guid=guid,
                            status=status,
                            package_id=package_id)
        queue.fetch_and_import_stages(self, obj)
        if obj.state == 'ERROR':
            self._save_gather_error(
                'Collection object failed to harvest, not harvesting',
                harvest_job)
            return None

        return GeoDataGovWAFHarvester.gather_stage(
            self, harvest_job, collection_package_id=obj.package_id)
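
For reference, a sketch of what _get_object_extra() is assumed to do here (ckanext-spatial defines a similar helper on its harvester base class): return the value of the first HOExtra with the given key, or None.

def _get_object_extra(self, harvest_object, key):
    # Look up a single HOExtra value by key on a harvest object.
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None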