def _refresh_harvest_objects(self, harvest_object, package_id):
     """
     Perform harvester housekeeping:
         - Flag the other objects of the source as not current
         - Set a reference to the package in the harvest object
         - Flag it as current
         - And save the changes
     """
     # Flag the other objects of this source as not current.
     # A bulk UPDATE is issued straight against the harvest_object table
     # (bypassing the ORM) so that every object linked to the package is
     # demoted in a single statement.
     from ckanext.harvest.model import harvest_object_table
     u = update(harvest_object_table) \
         .where(harvest_object_table.c.package_id == bindparam('pkg_id')) \
         .values(current=False)
     Session.execute(u, params={'pkg_id': package_id})
     Session.commit()
     # Refresh current object from session, otherwise the
     # import paster command fails
     # (Copied from the Gemini harvester--not sure if necessary)
     # NOTE(review): Session.remove() discards the scoped session, so the
     # object must be re-added to the new session before refresh() can
     # reload its state from the database.
     Session.remove()
     Session.add(harvest_object)
     Session.refresh(harvest_object)
     # Set reference to package in the HarvestObject and flag it as
     # the current one
     if not harvest_object.package_id:
         harvest_object.package_id = package_id
     harvest_object.current = True
     harvest_object.save()
Esempio n. 2
0
    def _run_job_for_single_document(
        self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False
    ):
        """Run gather/fetch/import for a job covering a single document.

        Returns the resulting HarvestObject after the import stage.
        """
        harvester = GeminiDocHarvester()

        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        # BUG FIX: the original used ``assert cond, expr`` where the second
        # expression is only the failure *message* and was never evaluated;
        # each condition is now asserted separately.
        assert object_ids
        assert len(object_ids) == 1
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        assert harvester.fetch_stage(object_ids)

        obj = HarvestObject.get(object_ids[0])
        assert obj
        assert obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u"Finished"
        job.save()

        return obj
Esempio n. 3
0
    def test_harvest_update_records(self):
        """A harvested package is only updated when the import is forced."""

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        # BUG FIX: ``assert cond, expr`` made the second expression a mere
        # failure message that was never checked; split into real asserts.
        assert second_package_dict
        assert first_package_dict["id"] == second_package_dict["id"]
        assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Create and run a third job, forcing the importing to simulate an update in the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict["id"] == third_package_dict["id"]
        assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
        assert third_obj.package
        assert third_obj.package_id == first_package_dict["id"]
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
Esempio n. 4
0
    def _run_job_for_single_document(self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False):
        """Run gather/fetch/import for a job covering a single document.

        Returns the resulting HarvestObject after the import stage.
        """
        harvester = GeminiDocHarvester()

        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        # BUG FIX: ``assert cond, expr`` treats the second expression as the
        # failure message, so it was never evaluated; assert each condition.
        assert object_ids
        assert len(object_ids) == 1
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        assert harvester.fetch_stage(object_ids)

        obj = HarvestObject.get(object_ids[0])
        assert obj
        assert obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u'Finished'
        job.save()

        return obj
Esempio n. 5
0
    def test_harvest_import_command(self):
        """Running the import action re-imports the current object's package."""

        # Create source
        source_fixture = {
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action('package_show_rest')(
            self.context, {
                'id': first_obj.package_id
            })

        # Package was created
        assert before_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action('harvest_objects_import')(
            self.context, {
                'source_id': source.id
            })
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action('package_show_rest')(
            self.context, {
                'id': first_obj.package_id
            })

        # Package was updated, and the current object remains the same
        # BUG FIX: the second expression of ``assert cond, expr`` is only a
        # failure message and was never checked; assert both conditions.
        assert after_package_dict
        assert before_package_dict['id'] == after_package_dict['id']
        assert after_package_dict['metadata_modified'] > before_package_dict[
            'metadata_modified']
        assert not third_obj.current
        assert not second_obj.current
        assert first_obj.current

        source_dict = get_action('harvest_source_show')(self.context, {
            'id': source.id
        })
        assert len(source_dict['status']['packages']) == 1
Esempio n. 6
0
    def _run_job_for_single_document(self, harvest_job, object_id):
        """Create and import a HarvestObject for a single known document.

        Bypasses gather/fetch by downloading the source URL directly and
        building the object by hand. Returns the imported HarvestObject.
        """
        harvester = FisbrokerPlugin()

        # we circumvent gather_stage() and fetch_stage() and just load the
        # content with a known object_id and create the harvest object:
        url = harvest_job.source.url
        # _get_content() returns XML
        content = harvester._get_content(url)
        obj = HarvestObject(guid=object_id,
                            job=harvest_job,
                            content=content,
                            extras=[HarvestObjectExtra(key='status', value='new')])
        obj.save()

        # BUG FIX: ``assert obj, obj.content`` used obj.content as the assert
        # message, never checking it; assert both conditions explicitly.
        assert obj
        assert obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)

        harvest_job.status = u'Finished'
        harvest_job.save()

        return obj
Esempio n. 7
0
    def test_harvest_different_sources_same_document(self):
        """The same document from a second source does not update the package
        unless the import is forced."""

        # Create source1
        source1_fixture = {"url": u"http://127.0.0.1:8999/single/source1/same_dataset.xml", "type": u"gemini-single"}

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict["state"] == u"active"
        assert first_obj.current

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {"url": u"http://127.0.0.1:8999/single/source2/same_dataset.xml", "type": u"gemini-single"}

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        # BUG FIX: ``assert cond, expr`` made the second expression a failure
        # message that was never evaluated; assert every condition.
        assert second_package_dict
        assert first_package_dict["id"] == second_package_dict["id"]
        assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict["id"] == third_package_dict["id"]
        assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
        assert third_obj.package
        assert third_obj.package_id == first_package_dict["id"]
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
Esempio n. 8
0
    def test_harvest_import_command(self):
        """Running the import action re-imports the current object's package."""

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert before_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action('harvest_objects_import')(self.context, {'source_id': source.id})
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was updated, and the current object remains the same
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert after_package_dict
        assert before_package_dict['id'] == after_package_dict['id']
        assert after_package_dict['metadata_modified'] > before_package_dict['metadata_modified']
        assert not third_obj.current
        assert not second_obj.current
        assert first_obj.current

        source_dict = get_action('harvest_source_show')(self.context, {'id': source.id})
        assert source_dict['status']['total_datasets'] == 1
Esempio n. 9
0
    def test_harvest_import_command(self):
        """Running the import action re-imports the current object's package."""

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert before_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action("harvest_objects_import")(self.context, {"source_id": source.id})
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action("package_show_rest")(self.context, {"id": imported_objects[0]["package_id"]})

        # Package was updated, and the current object remains the same
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert after_package_dict
        assert before_package_dict["id"] == after_package_dict["id"]
        assert after_package_dict["metadata_modified"] > before_package_dict["metadata_modified"]
        assert not third_obj.current
        assert not second_obj.current
        assert first_obj.current

        source_dict = get_action("harvest_source_show")(self.context, {"id": source.id})
        assert len(source_dict["status"]["packages"]) == 1
Esempio n. 10
0
    def test_harvest_deleted_record(self):
        """A deleted package stays deleted until a newer document reactivates it."""

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict["state"] == u"active"
        assert first_obj.current

        # Delete package
        first_package_dict["state"] = u"deleted"
        self.context.update({"id": first_package_dict["id"]})
        updated_package_dict = get_action("package_update_rest")(self.context, first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u"Finished"
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        # BUG FIX: ``assert cond, expr`` used the second expression as a
        # message that was never checked; assert each condition separately.
        assert second_package_dict
        assert updated_package_dict["id"] == second_package_dict["id"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u"http://127.0.0.1:8999/single/service1_newer.xml"
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict["id"] == second_package_dict["id"]
        assert third_obj.package
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current

        assert "NEWER" in third_package_dict["title"]
        assert third_package_dict["state"] == u"active"
Esempio n. 11
0
    def test_harvest_different_sources_same_document(self):
        """The same document from a second source does not update the package
        unless the import is forced."""

        # Create source1
        source1_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {
            'title': 'Test Source 2',
            'name': 'test-source-2',
            'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
Esempio n. 12
0
    def test_harvest_deleted_record(self):
        """A deleted package stays deleted until a newer document reactivates it."""

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current

        # Delete package
        first_package_dict['state'] = u'deleted'
        self.context.update({'id': first_package_dict['id']})
        updated_package_dict = get_action('package_update')(self.context, first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u'Finished'
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: ``assert cond, expr`` used the second expression as a
        # message that was never checked; assert each condition separately.
        assert second_package_dict
        assert updated_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict['id'] == second_package_dict['id']
        assert third_obj.package
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current

        assert 'NEWER' in third_package_dict['title']
        assert third_package_dict['state'] == u'active'
Esempio n. 13
0
    def test_harvest_update_records(self):
        """A harvested package is only updated when the import is forced."""

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Create and run a third job, forcing the importing to simulate an update in the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show')(self.context, {'id': third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
Esempio n. 14
0
    def test_harvest_different_sources_same_document(self):
        """The same document from a second source does not update the package
        unless the import is forced."""

        # Create source1
        # NOTE: fixture dicts previously mixed tab and space indentation;
        # normalized to spaces.
        source1_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {
            'title': 'Test Source 2',
            'name': 'test-source-2',
            'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
Esempio n. 15
0
    def test_harvest_deleted_record(self):
        """A deleted package stays deleted until a newer document reactivates it."""

        # Create source
        # NOTE: fixture dict previously mixed tab and space indentation;
        # normalized to spaces.
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current

        # Delete package
        first_package_dict['state'] = u'deleted'
        self.context.update({'id': first_package_dict['id']})
        updated_package_dict = get_action('package_update_rest')(self.context, first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u'Finished'
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: ``assert cond, expr`` used the second expression as a
        # message that was never checked; assert each condition separately.
        assert second_package_dict
        assert updated_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict['id'] == second_package_dict['id']
        assert third_obj.package
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current

        assert 'NEWER' in third_package_dict['title']
        assert third_package_dict['state'] == u'active'
Esempio n. 16
0
    def test_harvest_update_records(self):
        """A harvested package is only updated when the import is forced."""

        # Create source
        # NOTE: fixture dict previously mixed tab and space indentation;
        # normalized to spaces.
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

        # Package was not updated
        # BUG FIX: split ``assert cond, expr`` (expr was only a message and
        # never evaluated) into separate assertions.
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
        assert not second_obj.package
        assert not second_obj.package_id
        assert not second_obj.current
        assert first_obj.current

        # Create and run a third job, forcing the importing to simulate an update in the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show_rest')(self.context, {'id': third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current
        assert not second_obj.current
        assert not first_obj.current
# --- Esempio n. 17 ---
    def write_package_from_gemini_string(self, content):
        '''Create or update a Package based on some content (a GEMINI XML
        document) that has come from a URL.

        Returns the package_dict of the result.
        If there is an error, it returns None or raises Exception.

        Side effects: saves the metadata date on ``self.obj``, flags all
        other harvest objects for the same package as not current, and
        marks ``self.obj`` as the current one.
        '''
        log = logging.getLogger(__name__ + '.import')
        package = None
        gemini_document = GeminiDocument(content)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        # Save the metadata reference date in the Harvest Object.
        # Accept a plain date first, then a full ISO timestamp.
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%d')
        except ValueError:
            try:
                metadata_modified_date = datetime.strptime(
                    gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
            except (ValueError, TypeError):
                # Narrowed from a bare `except:` so that KeyboardInterrupt/
                # SystemExit are not swallowed; strptime raises ValueError
                # for bad formats and TypeError for non-string input.
                raise Exception('Could not extract reference date for GUID %s (%s)' \
                        % (gemini_guid,gemini_values['metadata-date']))

        self.obj.metadata_modified_date = metadata_modified_date
        self.obj.save()

        # There should be at most one 'current' harvest object per GUID.
        last_harvested_object = Session.query(HarvestObject) \
                            .filter(HarvestObject.guid==gemini_guid) \
                            .filter(HarvestObject.current==True) \
                            .all()

        if len(last_harvested_object) == 1:
            last_harvested_object = last_harvested_object[0]
        elif len(last_harvested_object) > 1:
            raise Exception(
                'Application Error: more than one current record for GUID %s' %
                gemini_guid)

        reactivate_package = False
        if last_harvested_object:
            # We've previously harvested this (i.e. it's an update)

            # Use metadata modified date instead of content to determine if the package
            # needs to be updated
            if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date and
                    last_harvested_object.source.active is False):

                if self.force_import:
                    log.info('Import forced for object %s with GUID %s' %
                             (self.obj.id, gemini_guid))
                else:
                    log.info(
                        'Package for object with GUID %s needs to be created or updated'
                        % gemini_guid)

                package = last_harvested_object.package

                # If the package has a deleted state, we will only update it and reactivate it if the
                # new document has a more recent modified date
                if package.state == u'deleted':
                    if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                        log.info(
                            'Package for object with GUID %s will be re-activated'
                            % gemini_guid)
                        reactivate_package = True
                    else:
                        log.info(
                            'Remote record with GUID %s is not more recent than a deleted package, skipping... '
                            % gemini_guid)
                        return None

            else:
                # Same metadata date: either the content silently changed
                # (an error) or nothing changed at all (skip).
                if last_harvested_object.content != self.obj.content and \
                 last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                    diff_generator = difflib.unified_diff(
                        last_harvested_object.content.split('\n'),
                        self.obj.content.split('\n'))
                    diff = '\n'.join([line for line in diff_generator])
                    raise Exception(
                        'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                        % (gemini_guid, diff))
                else:
                    # The content hasn't changed, no need to update the package
                    log.info('Document with GUID %s unchanged, skipping...' %
                             (gemini_guid))
                return None
        else:
            log.info(
                'No package with GEMINI guid %s found, let\'s create one' %
                gemini_guid)

        extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

        # Just add some of the metadata as extras, not the whole lot
        for name in [
                # Essentials
                'spatial-reference-system',
                'guid',
                # Usefuls
                'dataset-reference-date',
                'metadata-language',  # Language
                'metadata-date',  # Released
                'coupled-resource',
                'contact-email',
                'frequency-of-update',
                'spatial-data-service-type',
        ]:
            extras[name] = gemini_values[name]

        if len(gemini_values.get('progress', [])):
            extras['progress'] = gemini_values['progress'][0]
        else:
            extras['progress'] = ''

        extras['resource-type'] = gemini_values['resource-type'][0]

        # Use-constraints can contain values which are:
        #  * free text
        #  * licence URL
        # Store all values in extra['licence'] and if there is a
        # URL in there, store that in extra['licence-url']
        extras['licence'] = gemini_values.get('use-constraints', '')
        if len(extras['licence']):
            licence_url_extracted = self._extract_first_licence_url(
                extras['licence'])
            if licence_url_extracted:
                extras['licence_url'] = licence_url_extracted

        extras['access_constraints'] = gemini_values.get(
            'limitations-on-public-access', '')
        if 'temporal-extent-begin' in gemini_values:
            #gemini_values['temporal-extent-begin'].sort()
            extras['temporal_coverage-from'] = gemini_values[
                'temporal-extent-begin']
        if 'temporal-extent-end' in gemini_values:
            #gemini_values['temporal-extent-end'].sort()
            extras['temporal_coverage-to'] = gemini_values[
                'temporal-extent-end']

        # Save responsible organization roles
        provider, responsible_parties = self._process_responsible_organisation(
            gemini_values['responsible-organisation'])
        extras['provider'] = provider
        extras['responsible-party'] = '; '.join(responsible_parties)

        if len(gemini_values['bbox']) > 0:
            extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
            extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
            extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
            extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

            # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
            extent_string = self.extent_template.substitute(
                xmin=extras['bbox-east-long'],
                ymin=extras['bbox-south-lat'],
                xmax=extras['bbox-west-long'],
                ymax=extras['bbox-north-lat'])

            extras['spatial'] = extent_string.strip()

        tags = []
        for tag in gemini_values['tags']:
            # CKAN tag names are limited to 50 characters.
            tag = tag[:50] if len(tag) > 50 else tag
            tags.append({'name': tag})

        package_dict = {
            'title': gemini_values['title'],
            'notes': gemini_values['abstract'],
            'tags': tags,
            'resources': []
        }

        if self.obj.source.publisher_id:
            package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

        if reactivate_package:
            package_dict['state'] = u'active'

        # New package, or title changed: generate a fresh unique name,
        # falling back to a GUID-based name.
        if package is None or package.title != gemini_values['title']:
            name = self.gen_new_name(gemini_values['title'])
            if not name:
                name = self.gen_new_name(six.text_type(gemini_guid))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
                )
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        resource_locators = gemini_values.get('resource-locator', [])

        if len(resource_locators):
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '')
                if url:
                    resource_format = ''
                    resource = {}
                    if extras['resource-type'] == 'service':
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now(
                            ).isoformat()
                            resource_format = 'WMS'
                    resource.update({
                        'url':
                        url,
                        'name':
                        resource_locator.get('name', ''),
                        'description':
                        resource_locator.get('description')
                        if resource_locator.get('description') else
                        'Resource locator',
                        'format':
                        resource_format or None,
                        'resource_locator_protocol':
                        resource_locator.get('protocol', ''),
                        'resource_locator_function':
                        resource_locator.get('function', '')
                    })
                    package_dict['resources'].append(resource)

            # Guess the best view service to use in WMS preview
            verified_view_resources = [
                r for r in package_dict['resources']
                if 'verified' in r and r['format'] == 'WMS'
            ]
            if len(verified_view_resources):
                verified_view_resources[0][
                    'ckan_recommended_wms_preview'] = True
            else:
                view_resources = [
                    r for r in package_dict['resources']
                    if r['format'] == 'WMS'
                ]
                if len(view_resources):
                    view_resources[0]['ckan_recommended_wms_preview'] = True

        # Extras must be strings/numbers; everything else is JSON-encoded.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, six.string_types + (Number, )):
                extras_as_dict.append({'key': key, 'value': value})
            else:
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})

        package_dict['extras'] = extras_as_dict

        # Was `package == None`; identity comparison is correct for None.
        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_dict)
            log.info('Created new package ID %s with GEMINI guid %s',
                     package['id'], gemini_guid)
        else:
            package = self._create_package_from_data(package_dict,
                                                     package=package)
            log.info(
                'Updated existing package ID %s with existing GEMINI guid %s',
                package['id'], gemini_guid)

        # Flag the other objects of this source as not current anymore
        from ckanext.harvest.model import harvest_object_table
        u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
        Session.execute(u, params={'b_package_id': package['id']})
        Session.commit()

        # Refresh current object from session, otherwise the
        # import paster command fails
        Session.remove()
        Session.add(self.obj)
        Session.refresh(self.obj)

        # Set reference to package in the HarvestObject and flag it as
        # the current one
        if not self.obj.package_id:
            self.obj.package_id = package['id']

        self.obj.current = True
        self.obj.save()

        return package
# --- Esempio n. 18 ---
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.

           Validates every package first (exists, was harvested by the
           FIS-Broker harvester, has a FIS-Broker GUID), then fetches each
           record over CSW and re-runs the import stage with force_import.

           Returns the list of successfully reimported CSW records.
           Raises a specific error subclass for each failure mode.
        '''

        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if harvester_type != HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        # Assert BEFORE using the object (the original asserted after the
        # attribute assignment, so a missing job surfaced as AttributeError
        # instead of AssertionError).
        assert harvest_job
        harvest_job.gather_started = datetime.datetime.utcnow()

        # instantiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        # NOTE(review): if `package_ids` is empty, `harvester_url` was never
        # bound above and the CSW call raises NameError -- callers appear to
        # always pass at least one id. TODO confirm.
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status',value='change'),
                                            HarvestObjectExtra(key='type',value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages