Esempio n. 1
0
    def setup_class(cls):
        """Create two harvest objects (the second carrying an
        'original_document' extra) and keep their ids on the class."""
        try:
            from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                               HarvestSource,
                                               HarvestObjectExtra)
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        def _make_object(guid, content):
            # Each object gets its own throwaway job/source pair.
            return HarvestObject(
                guid=guid,
                job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
                content=content)

        cls.content1 = '<xml>Content 1</xml>'
        first = _make_object('test-ho-1', cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        second = _make_object('test-ho-2', cls.content2)

        extra = HarvestObjectExtra(key='original_document',
                                   value=cls.original_content2,
                                   object=second)

        for entity in (first, second, extra):
            Session.add(entity)
        Session.commit()

        cls.object_id_1 = first.id
        cls.object_id_2 = second.id
Esempio n. 2
0
def harvest_job_create(context, data_dict):
    '''Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the newly created job, dictized
    :raises NotFound: if the source does not exist
    :raises Exception: if the source is inactive
    :raises HarvestJobExists: if there already is an unrun or running job
        for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # log.warn is a deprecated alias of log.warning in stdlib logging
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Esempio n. 3
0
    def test_harvester(self):
        """Run the gather/fetch/import cycle of the inventory harvester
        and check the created dataset lands in the right organisation."""
        harvester = InventoryHarvester()
        job = HarvestJob(source=self.source)

        # Gather: the XML fixture should yield 79 harvest object ids.
        gathered = harvester.gather_stage(
            job, test_content=self._get_file_content('inventory.xml'))
        self.assertEqual(len(gathered), 79)

        # One object is enough for the remaining stages.
        harvest_obj = HarvestObject.get(gathered[0])

        # Fetch stage must report success.
        self.assertTrue(harvester.fetch_stage(harvest_obj))

        # Import stage should create a dataset and record its id.
        harvester.import_stage(harvest_obj)
        self.assertIsNotNone(harvest_obj.package_id)

        # The new dataset must belong to the expected publisher org.
        pkg = toolkit.get_action('package_show')(
            {'ignore_auth': True, 'user': self.sysadmin['name']},
            {'id': harvest_obj.package_id},
        )
        self.assertEqual(pkg['organization']['id'], self.publisher['id'])
Esempio n. 4
0
 def test_gather(self):
     """Run the CMDI gather stage against a stubbed client."""
     harvest_source = HarvestSource(url="http://localhost/test_cmdi",
                                    type="cmdi")
     harvest_source.save()
     harvest_job = HarvestJob(source=harvest_source)
     harvest_job.save()
     # Swap in a fake client so no network access happens.
     self.harvester.client = _FakeClient()
     self.harvester.gather_stage(harvest_job)
Esempio n. 5
0
def harvest_job_create(context, data_dict):
    '''Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the newly created job, dictized
    :raises NotFound: if the source does not exist
    :raises HarvestError: if the source is inactive or already has an
        unrun job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # log.warn is a deprecated alias of log.warning in stdlib logging
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source.
    # NOTE: data_dict is deliberately rebuilt to query the job list.
    data_dict = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Esempio n. 6
0
    def run_job_synchronously(self):
        """Create a job for the harvest source named by the first CLI
        argument and run its gather/fetch/import stages in-process
        (bypassing the queue), then reindex the source dataset.

        NOTE(review): Python 2 code (``print`` statement, ``unicode``).
        """
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        # CLI args arrive as bytes under Python 2; coerce to unicode.
        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)

        # Pick the plugin whose declared name matches the source type;
        # the for/else fires only when no plugin matched.
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        # Create and start the job by hand instead of going through the
        # gather queue.
        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            # Run fetch+import for every gathered object synchronously.
            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            # Always stamp a finish time; unless the happy path already
            # marked the job Done, flag it as an Error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
Esempio n. 7
0
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the newly created job, dictized
    :raises toolkit.ObjectNotFound: if the source does not exist
    :raises HarvestSourceInactiveError: if the source is inactive
    :raises HarvestJobExists: if there already is an unrun job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # log.warn is a deprecated alias of log.warning in stdlib logging
        log.warning('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                     source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(context, {
            'id': job.id
        })

    return harvest_job_dictize(job, context)
Esempio n. 8
0
    def setup_class(cls):
        """Create the standard test data plus a harvest object attached
        to the 'annakarenina' package, recording the object id in the
        package extras."""
        CreateTestData.create()
        harvest_setup()

        job = HarvestJob()
        job.save()
        model.repo.commit_and_remove()

        # Re-fetch the job from a fresh session before linking to it.
        job = model.Session.query(HarvestJob).first()
        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           harvest_job=job,
                           guid='test-guid',
                           content='<xml>test content</xml>')
        ho.save()

        # Record the harvest object id on the package itself.
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
Esempio n. 9
0
    def setup_class(cls):
        """Create the standard test data, a harvest source/job pair and a
        harvest object attached to the 'annakarenina' package."""
        CreateTestData.create()
        harvest_setup()

        source = HarvestSource(url=u'http://test-source.org', type='test')
        source.save()
        job = HarvestJob(source=source)
        job.save()

        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           job=job,
                           guid=u'test-guid',
                           content=u'<xml>test content</xml>')
        ho.save()

        # Record the harvest object id on the package itself.
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
Esempio n. 10
0
    def test_import(self):
        """Import two CMDI fixtures and check the resulting package
        fields, then delete the package via a 'deleted' harvest object.

        NOTE(review): Python 2 code (``unicode``, ``assertEquals``).
        """
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        # First fixture: the harvester stores the dataset id under
        # content['unified']['id'].
        harvest_object = self._run_import("cmdi_1.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        # Any harvest errors are joined into the assertion message.
        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        # Field-by-field checks against the cmdi_1.xml fixture.
        self.assertEquals(package.get('name', None),
                          utils.pid_to_name(package.get('id', None)))
        self.assertEquals(utils.get_primary_pid(package),
                          u'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('notes', None),
                          u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {
            u'id': u'http://islrn.org/resources/248-895-085-557-0',
            u'provider': provider,
            u'type': u'relation',
            u'relation': u'generalRelation'
        }

        # Relation PIDs must not leak into the package's pid list.
        self.assertTrue(expected_pid not in package.get('pids'))

        model.Session.flush()

        # Second fixture: temporal coverage and license differ.
        harvest_object = self._run_import("cmdi_2.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        # A content-less object with report_status 'deleted' instructs the
        # import stage to remove the package.
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Esempio n. 11
0
    def test_api(self):
        """Exercise the harvest object content endpoints: legacy API
        redirects, raw XML content, 'original_document' extras and the
        HTML transformations."""
        try:
            from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                               HarvestSource,
                                               HarvestObjectExtra)
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        def _new_object(guid, content):
            # Each object gets its own throwaway job/source pair.
            return HarvestObject(
                guid=guid,
                job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
                content=content)

        content1 = '<xml>Content 1</xml>'
        ho1 = _new_object('test-ho-1', content1)

        content2 = '<xml>Content 2</xml>'
        original_content2 = '<xml>Original Content 2</xml>'
        ho2 = _new_object('test-ho-2', content2)

        hoe = HarvestObjectExtra(
            key='original_document',
            value=original_content2,
            object=ho2)

        for entity in (ho1, ho2, hoe):
            Session.add(entity)
        Session.commit()

        object_id_1 = ho1.id
        object_id_2 = ho2.id

        app = self._get_test_app()

        # Old API URLs must permanently redirect to the new ones.
        r = app.get('/api/2/rest/harvestobject/{0}/xml'.format(object_id_1))
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}'.format(object_id_1)
                in r.headers['Location'])

        r = app.get('/api/2/rest/harvestobject/{0}/html'.format(object_id_1))
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}/html'.format(object_id_1)
                in r.headers['Location'])

        # Raw object content is served as XML with a prolog prepended.
        r = app.get('/harvest/object/{0}'.format(object_id_1))
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>')

        # Object 1 has no original document, so /original is a 404.
        r = app.get('/harvest/object/{0}/original'.format(object_id_1),
                    status=404)
        assert_equals(r.status_int, 404)

        # Object 2 carries an 'original_document' extra.
        r = app.get('/harvest/object/{0}/original'.format(object_id_2))
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            + '<xml>Original Content 2</xml>')

        # HTML transformation of the harvested document.
        r = app.get('/harvest/object/{0}/html'.format(object_id_1))
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        # No original document on object 1 -> no original HTML either.
        r = app.get('/harvest/object/{0}/html/original'.format(object_id_1),
                    status=404)
        assert_equals(r.status_int, 404)

        r = app.get('/harvest/object/{0}/html'.format(object_id_2))
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        r = app.get('/harvest/object/{0}/html/original'.format(object_id_2))
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body
Esempio n. 12
0
    def test_provenance(self):
        """Check DCATHarvester's metadata-provenance helpers: building a
        provenance record for a single harvest, and appending a record to
        an existing provenance list stored in a package extra.

        Fixes: removed an unused local ``id`` that shadowed the builtin,
        and normalized the tab indentation to the file's 4-space style.
        """
        source_json = {
            "title": "NOBIL - ladestasjoner for elbiler",
            "description": [
                {
                    "language": "nb",
                    "value": "<p>NOBIL er etablert for formidling av informasjon om ladestasjoner for elbiler. Den har et detaljert innhold om ladestasjonene og formidler sanntidsdata. Alt er fritt tilgjengelig via et API.</p><p>Et ladepunkt er en reservert parkeringsplass med lademulighet for ladbare kjoeretoey. Paa et ladepunkt kan det vaere mer enn en kontakt, men bare plass til et kjoeretoey av gangen. En ladestasjon er et sted for det er ett eller flere ladepunkt.</p>"
                }
            ],
            "landingPage": "http://nobil.no",
            "issued": "2013-10-08",
            "modified": "2016-10-07T11:34",
            "language": [
                "NOR"
            ],
            "publisher": {
                "name": "Enova SF",
                "mbox": "*****@*****.**"
            },
            "keyword": [
                "Energi",
                "Miljoe",
                "Transport"
            ],
            "distribution": [
                {
                    "title": "JSON",
                    "description": [
                        {
                            "language": "nb",
                            "value": "Nettside som med informasjon om API-et"
                        }
                    ],
                    "format": "JSON",
                    "downloadURL": "null",
                    "accessURL": "http://nobil.no",
                    "webserviceURL": "null",
                    "license": "http://creativecommons.org/licenses/by/3.0/no/"
                }
            ]
        }
        job = HarvestJob(url="http://data.norge.no/api/dcat/data.json",
                         title="TestHarvest")
        obj = HarvestObject(guid="http://data.norge.no/node/512",
                            content=source_json, source=job)
        # NOTE: 'activity_ocurred' / 'harvest_sorce_url' typos are the
        # actual keys produced by the harvester -- keep them as-is.
        initial = {"activity_ocurred": "2016-10-21T12:25:09.091774", "harvest_source_title": "TestHarvest", "activity": "initial_harvest", "harvest_sorce_url": "http://data.norge.no/api/dcat/data.json", "excluded_resources": [], "harvested_guid": "http://data.norge.no/node/512"}

        # get_metadata_provenance_for_just_this_harvest should echo the
        # source/job details for the given activity.
        dcatharvest = DCATHarvester()
        provenance = dcatharvest.get_metadata_provenance_for_just_this_harvest(
            obj, "initial_harvest", [])

        self.assertEqual(provenance["harvest_source_title"],
                         initial["harvest_source_title"])
        self.assertEqual(provenance["activity"], initial["activity"])
        self.assertEqual(provenance["harvest_sorce_url"],
                         initial["harvest_sorce_url"])
        self.assertEqual(provenance["harvested_guid"],
                         initial["harvested_guid"])
        self.assertEqual(provenance["excluded_resources"],
                         initial["excluded_resources"])

        # append_provenance_data should add a new record to the existing
        # 'metadata_provenance' extra.
        val = [initial]
        val_string = json.dumps(val)
        package_dict = {
            "extras": [
                {"key": "metadata_provenance", "value": val_string}
            ]
        }

        update = {"activity_ocurred": "2016-10-21T12:25:09.091774", "harvest_source_title": "TestHarvest", "activity": "update", "harvest_sorce_url": "http://data.norge.no/api/dcat/data.json", "excluded_resources": [], "harvested_guid": "http://data.norge.no/node/512"}
        appended = [initial, update]
        package_correct = {
            "extras": [
                {"key": "metadata_provenance", "value": json.dumps(appended)}
            ]
        }

        test_package_result = dcatharvest.append_provenance_data(
            package_dict, obj, "update", [])
        test_provenance = json.loads(test_package_result["extras"][0]["value"])

        correct_provenance = json.loads(package_correct["extras"][0]["value"])

        self.assertEqual(len(correct_provenance), len(test_provenance))
Esempio n. 13
0
    def test_api(self, app):
        """Check the harvest object content endpoints: raw XML for the
        object, plus the 'original_document' extra (404 when absent)."""
        try:
            from ckanext.harvest.model import (
                HarvestObject,
                HarvestJob,
                HarvestSource,
                HarvestObjectExtra,
            )
        except ImportError:
            # pytest.skip() raises Skipped itself; the previous
            # ``raise pytest.skip(...)`` only worked because the call
            # raised before the ``raise`` statement executed.
            pytest.skip("The harvester extension is needed for these tests")

        content1 = "<xml>Content 1</xml>"
        ho1 = HarvestObject(
            guid="test-ho-1",
            job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
            content=content1,
        )

        content2 = "<xml>Content 2</xml>"
        original_content2 = "<xml>Original Content 2</xml>"
        ho2 = HarvestObject(
            guid="test-ho-2",
            job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
            content=content2,
        )

        hoe = HarvestObjectExtra(
            key="original_document", value=original_content2, object=ho2
        )

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        object_id_1 = ho1.id
        object_id_2 = ho2.id

        # Raw object content is served as XML with a prolog prepended.
        url = "/harvest/object/{0}".format(object_id_1)
        r = app.get(url, status=200)
        assert(
            r.headers["Content-Type"] == "application/xml; charset=utf-8"
        )
        assert(
            r.body ==
            '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>'
        )

        # Object 1 has no 'original_document' extra -> 404.
        url = "/harvest/object/{0}/original".format(object_id_1)
        r = app.get(url, status=404)

        # Object 2 carries the extra, so its original is served.
        url = "/harvest/object/{0}/original".format(object_id_2)
        r = app.get(url, status=200)
        assert(
            r.headers["Content-Type"] == "application/xml; charset=utf-8"
        )
        assert(
            r.body ==
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            + "<xml>Original Content 2</xml>"
        )
Esempio n. 14
0
        if not source_pkg:
            log.error('Harvest source %s does not exist', source_name)
            return
        source_id = source_pkg.id
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            return

        # Check if the source is active
        if not source.active:
            log.warn('Harvest job cannot be created for inactive source %s',
                     source_id)
            raise Exception('Can not create jobs on inactive sources')

        job = HarvestJob()
        job.source = source
        job.save()
        context['harvest_job'] = job

        print str(datetime.datetime.now()) + ' Start to import doi datasets.'
        print 'Datasets found on remote doi server: ' + str(
            len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.'

        ids_to_add = collected_ids - existing_ids
        print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.'
        for num, doi_id in enumerate(ids_to_add):
            context.pop('package', None)
            context.pop('group', None)
            try:
                new_package = self.get_doi_package(url_dataset + doi_id)
Esempio n. 15
0
def harvest_jobs_run(context, data_dict):
    """Flag finished harvest jobs, then send pending ('New') jobs for
    active sources to the gather queue.

    :param source_id: optional id of a single source to restrict to;
        when absent, scheduled jobs are also created first

    :returns: the list of job dicts that were sent to the queue
    :raises Exception: when there are no new jobs to run
    """
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    # Without a specific source, also materialize any scheduled jobs.
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                # Objects still pending (neither COMPLETE nor ERROR).
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                # No pending objects means the job is done.
                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Use the last imported object's timestamp as the
                    # job's finish time, when available.
                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    # 'parents_run' -> queue a follow-up children run;
                    # any other truthy value -> reset the flag.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        # Only jobs of active sources are started.
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs