def _refresh_harvest_objects(self, harvest_object, package_id): """ Perform harvester housekeeping: - Flag the other objects of the source as not current - Set a refernce to the package in the harvest object - Flag it as current - And save the changes """ # Flag the other objects of this source as not current from ckanext.harvest.model import harvest_object_table u = update(harvest_object_table) \ .where(harvest_object_table.c.package_id == bindparam('pkg_id')) \ .values(current=False) Session.execute(u, params={'pkg_id': package_id}) Session.commit() # Refresh current object from session, otherwise the # import paster command fails # (Copied from the Gemini harvester--not sure if necessary) Session.remove() Session.add(harvest_object) Session.refresh(harvest_object) # Set reference to package in the HarvestObject and flag it as # the current one if not harvest_object.package_id: harvest_object.package_id = package_id harvest_object.current = True harvest_object.save()
def _run_job_for_single_document(
        self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False
):
    """Run gather/fetch/import for a job expected to yield exactly one
    harvest object, mark the job finished, and return the object.

    :param job: HarvestJob to run through all stages
    :param force_import: force the import stage even for unchanged documents
    :param expect_gather_errors: when True, assert gather errors occurred
    :param expect_obj_errors: when True, assert object errors occurred
    :returns: the refreshed HarvestObject produced by the job
    """
    harvester = GeminiDocHarvester()
    harvester.force_import = force_import
    object_ids = harvester.gather_stage(job)
    # FIX: the original ``assert object_ids, len(object_ids) == 1`` used the
    # length check as the assert *message*, so it was never evaluated.
    assert object_ids
    assert len(object_ids) == 1
    if expect_gather_errors:
        assert len(job.gather_errors) > 0
    else:
        assert len(job.gather_errors) == 0
    assert harvester.fetch_stage(object_ids) == True
    obj = HarvestObject.get(object_ids[0])
    # FIX: same assert-with-message bug -- check both conditions.
    assert obj
    assert obj.content
    harvester.import_stage(obj)
    Session.refresh(obj)
    if expect_obj_errors:
        assert len(obj.errors) > 0
    else:
        assert len(obj.errors) == 0
    job.status = u"Finished"
    job.save()
    return obj
def test_harvest_update_records(self):
    """A re-harvest of an unchanged document must not update the package;
    a forced re-import must update it and rotate the 'current' flag.

    FIX: the original ``assert a, b == c`` statements treated the
    comparison as the assert *message* (never evaluated); they are now
    split into separate real assertions.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict["id"] == second_package_dict["id"]
    assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": third_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict["id"] == third_package_dict["id"]
    assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
    assert third_obj.package
    assert third_obj.package_id == first_package_dict["id"]
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def _run_job_for_single_document(self,job,force_import=False,expect_gather_errors=False,expect_obj_errors=False):
    """Run gather/fetch/import for a job expected to yield exactly one
    harvest object, mark the job finished, and return the object.

    :param job: HarvestJob to run through all stages
    :param force_import: force the import stage even for unchanged documents
    :param expect_gather_errors: when True, assert gather errors occurred
    :param expect_obj_errors: when True, assert object errors occurred
    :returns: the refreshed HarvestObject produced by the job
    """
    harvester = GeminiDocHarvester()
    harvester.force_import = force_import
    object_ids = harvester.gather_stage(job)
    # FIX: ``assert object_ids, len(object_ids) == 1`` used the length
    # check as the assert *message*, so it was never evaluated.
    assert object_ids
    assert len(object_ids) == 1
    if expect_gather_errors:
        assert len(job.gather_errors) > 0
    else:
        assert len(job.gather_errors) == 0
    assert harvester.fetch_stage(object_ids) == True
    obj = HarvestObject.get(object_ids[0])
    # FIX: same assert-with-message bug -- check both conditions.
    assert obj
    assert obj.content
    harvester.import_stage(obj)
    Session.refresh(obj)
    if expect_obj_errors:
        assert len(obj.errors) > 0
    else:
        assert len(obj.errors) == 0
    job.status = u'Finished'
    job.save()
    return obj
def test_harvest_import_command(self):
    """Running the manual import action re-imports the document; the
    package is updated but the first object stays 'current'.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action('package_show_rest')(
        self.context, {'id': first_obj.package_id})
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action('harvest_objects_import')(
        self.context, {'source_id': source.id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action('package_show_rest')(
        self.context, {'id': first_obj.package_id})
    # Package was updated, and the current object remains the same
    assert after_package_dict
    assert before_package_dict['id'] == after_package_dict['id']
    assert after_package_dict['metadata_modified'] > before_package_dict['metadata_modified']
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action('harvest_source_show')(self.context, {'id': source.id})
    assert len(source_dict['status']['packages']) == 1
def _run_job_for_single_document(self, harvest_job, object_id):
    """Build a HarvestObject directly from the source URL (bypassing the
    gather and fetch stages), run the import stage, and return the
    refreshed object with the job marked finished.

    :param harvest_job: HarvestJob whose source URL supplies the content
    :param object_id: GUID to assign to the created HarvestObject
    :returns: the refreshed HarvestObject
    """
    harvester = FisbrokerPlugin()
    # we circumvent gather_stage() and fetch_stage() and just load the
    # content with a known object_id and create the harvest object:
    url = harvest_job.source.url
    # _get_content() returns XML
    content = harvester._get_content(url)
    obj = HarvestObject(guid=object_id,
                        job=harvest_job,
                        content=content,
                        extras=[HarvestObjectExtra(key='status',value='new')])
    obj.save()
    # FIX: ``assert obj, obj.content`` used obj.content as the assert
    # *message*, so the content was never actually checked.
    assert obj
    assert obj.content
    harvester.import_stage(obj)
    Session.refresh(obj)
    harvest_job.status = u'Finished'
    harvest_job.save()
    return obj
def test_harvest_different_sources_same_document(self):
    """The same unchanged document harvested from a second source must not
    update the package; a forced re-import from source2 must update it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source1
    source1_fixture = {"url": u"http://127.0.0.1:8999/single/source1/same_dataset.xml", "type": u"gemini-single"}
    source1, first_job = self._create_source_and_job(source1_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict["state"] == u"active"
    assert first_obj.current == True
    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {"url": u"http://127.0.0.1:8999/single/source2/same_dataset.xml", "type": u"gemini-single"}
    source2, second_job = self._create_source_and_job(source2_fixture)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict["id"] == second_package_dict["id"]
    assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict["id"] == third_package_dict["id"]
    assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
    assert third_obj.package
    assert third_obj.package_id == first_package_dict["id"]
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_import_command(self):
    """Running the manual import action re-imports the document; the
    package is updated but the first object stays 'current'.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action('harvest_objects_import')(self.context,{'source_id':source.id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was updated, and the current object remains the same
    assert after_package_dict
    assert before_package_dict['id'] == after_package_dict['id']
    assert after_package_dict['metadata_modified'] > before_package_dict['metadata_modified']
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action('harvest_source_show')(self.context,{'id':source.id})
    assert source_dict['status']['total_datasets'] == 1
def test_harvest_import_command(self):
    """Running the manual import action re-imports the document; the
    package is updated but the first object stays 'current'.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action("harvest_objects_import")(self.context, {"source_id": source.id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action("package_show_rest")(self.context, {"id": imported_objects[0]["package_id"]})
    # Package was updated, and the current object remains the same
    assert after_package_dict
    assert before_package_dict["id"] == after_package_dict["id"]
    assert after_package_dict["metadata_modified"] > before_package_dict["metadata_modified"]
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action("harvest_source_show")(self.context, {"id": source.id})
    assert len(source_dict["status"]["packages"]) == 1
def test_harvest_deleted_record(self):
    """A deleted package is not touched by an unchanged re-harvest, but a
    newer document reactivates and updates it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict["state"] == u"active"
    assert first_obj.current == True
    # Delete package
    first_package_dict["state"] = u"deleted"
    self.context.update({"id": first_package_dict["id"]})
    updated_package_dict = get_action("package_update_rest")(self.context, first_package_dict)
    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u"Finished"
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert updated_package_dict["id"] == second_package_dict["id"]
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u"http://127.0.0.1:8999/single/service1_newer.xml"
    source.save()
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    # Package was updated
    assert third_package_dict
    assert third_package_dict["id"] == second_package_dict["id"]
    assert third_obj.package
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
    assert "NEWER" in third_package_dict["title"]
    assert third_package_dict["state"] == u"active"
def test_harvest_different_sources_same_document(self):
    """The same unchanged document harvested from a second source must not
    update the package; a forced re-import from source2 must update it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source1, first_job = self._create_source_and_job(source1_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source2, second_job = self._create_source_and_job(source2_fixture)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_deleted_record(self):
    """A deleted package is not touched by an unchanged re-harvest, but a
    newer document reactivates and updates it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Delete package
    first_package_dict['state'] = u'deleted'
    self.context.update({'id':first_package_dict['id']})
    updated_package_dict = get_action('package_update')(self.context,first_package_dict)
    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u'Finished'
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert updated_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
    source.save()
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    # Package was updated
    assert third_package_dict
    assert third_package_dict['id'] == second_package_dict['id']
    assert third_obj.package
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
    assert 'NEWER' in third_package_dict['title']
    assert third_package_dict['state'] == u'active'
def test_harvest_update_records(self):
    """A re-harvest of an unchanged document must not update the package;
    a forced re-import must update it and rotate the 'current' flag.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show')(self.context,{'id':third_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_different_sources_same_document(self):
    """The same unchanged document harvested from a second source must not
    update the package; a forced re-import from source2 must update it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source1, first_job = self._create_source_and_job(source1_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source2, second_job = self._create_source_and_job(source2_fixture)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_deleted_record(self):
    """A deleted package is not touched by an unchanged re-harvest, but a
    newer document reactivates and updates it.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Delete package
    first_package_dict['state'] = u'deleted'
    self.context.update({'id':first_package_dict['id']})
    updated_package_dict = get_action('package_update_rest')(self.context,first_package_dict)
    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u'Finished'
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert updated_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
    source.save()
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    # Package was updated
    assert third_package_dict
    assert third_package_dict['id'] == second_package_dict['id']
    assert third_obj.package
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
    assert 'NEWER' in third_package_dict['title']
    assert third_package_dict['state'] == u'active'
def test_harvest_update_records(self):
    """A re-harvest of an unchanged document must not update the package;
    a forced re-import must update it and rotate the 'current' flag.

    FIX: split ``assert a, b == c`` statements whose comparison was
    silently treated as the assert message and never evaluated.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show_rest')(self.context,{'id':third_obj.package_id})
    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def write_package_from_gemini_string(self, content):
    '''Create or update a Package based on some content that has
    come from a URL.

    Returns the package_dict of the result.
    If there is an error, it returns None or raises Exception.
    '''
    log = logging.getLogger(__name__ + '.import')
    package = None
    gemini_document = GeminiDocument(content)
    gemini_values = gemini_document.read_values()
    gemini_guid = gemini_values['guid']

    # Save the metadata reference date in the Harvest Object.
    # Two accepted date formats: plain date, then datetime.
    try:
        metadata_modified_date = datetime.strptime(
            gemini_values['metadata-date'], '%Y-%m-%d')
    except ValueError:
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
        # NOTE(review): bare ``except:`` -- should be ``except ValueError:``
        # (strptime raises ValueError); as written it also swallows e.g.
        # KeyboardInterrupt.
        except:
            raise Exception('Could not extract reference date for GUID %s (%s)' \
                % (gemini_guid,gemini_values['metadata-date']))

    self.obj.metadata_modified_date = metadata_modified_date
    self.obj.save()

    # Find the object currently flagged as 'current' for this GUID;
    # there must be at most one.
    last_harvested_object = Session.query(HarvestObject) \
        .filter(HarvestObject.guid==gemini_guid) \
        .filter(HarvestObject.current==True) \
        .all()

    if len(last_harvested_object) == 1:
        last_harvested_object = last_harvested_object[0]
    elif len(last_harvested_object) > 1:
        raise Exception(
            'Application Error: more than one current record for GUID %s' % gemini_guid)

    reactivate_package = False
    if last_harvested_object:
        # We've previously harvested this (i.e. it's an update)

        # Use metadata modified date instead of content to determine if the package
        # needs to be updated
        if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date
                    and last_harvested_object.source.active is False):
            if self.force_import:
                log.info('Import forced for object %s with GUID %s' % (self.obj.id, gemini_guid))
            else:
                log.info(
                    'Package for object with GUID %s needs to be created or updated' % gemini_guid)

            package = last_harvested_object.package

            # If the package has a deleted state, we will only update it and reactivate it if the
            # new document has a more recent modified date
            if package.state == u'deleted':
                if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                    log.info(
                        'Package for object with GUID %s will be re-activated' % gemini_guid)
                    reactivate_package = True
                else:
                    log.info(
                        'Remote record with GUID %s is not more recent than a deleted package, skipping... ' % gemini_guid)
                    return None
        else:
            # Same date but different content is treated as an error: the
            # publisher changed the document without bumping the date.
            if last_harvested_object.content != self.obj.content and \
                    last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                diff_generator = difflib.unified_diff(
                    last_harvested_object.content.split('\n'),
                    self.obj.content.split('\n'))
                diff = '\n'.join([line for line in diff_generator])
                raise Exception(
                    'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff))
            else:
                # The content hasn't changed, no need to update the package
                log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
            return None
    else:
        log.info(
            'No package with GEMINI guid %s found, let\'s create one' % gemini_guid)

    extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

    # Just add some of the metadata as extras, not the whole lot
    for name in [
            # Essentials
            'spatial-reference-system',
            'guid',
            # Usefuls
            'dataset-reference-date',
            'metadata-language',  # Language
            'metadata-date',  # Released
            'coupled-resource',
            'contact-email',
            'frequency-of-update',
            'spatial-data-service-type',
    ]:
        extras[name] = gemini_values[name]

    if len(gemini_values.get('progress', [])):
        extras['progress'] = gemini_values['progress'][0]
    else:
        extras['progress'] = ''

    extras['resource-type'] = gemini_values['resource-type'][0]

    # Use-constraints can contain values which are:
    #  * free text
    #  * licence URL
    # Store all values in extra['licence'] and if there is a
    # URL in there, store that in extra['licence-url']
    extras['licence'] = gemini_values.get('use-constraints', '')
    if len(extras['licence']):
        licence_url_extracted = self._extract_first_licence_url(
            extras['licence'])
        if licence_url_extracted:
            extras['licence_url'] = licence_url_extracted

    extras['access_constraints'] = gemini_values.get(
        'limitations-on-public-access', '')
    if 'temporal-extent-begin' in gemini_values:
        #gemini_values['temporal-extent-begin'].sort()
        extras['temporal_coverage-from'] = gemini_values[
            'temporal-extent-begin']
    if 'temporal-extent-end' in gemini_values:
        #gemini_values['temporal-extent-end'].sort()
        extras['temporal_coverage-to'] = gemini_values[
            'temporal-extent-end']

    # Save responsible organization roles
    provider, responsible_parties = self._process_responsible_organisation(
        gemini_values['responsible-organisation'])
    extras['provider'] = provider
    extras['responsible-party'] = '; '.join(responsible_parties)

    # Only the first bbox is used for the package extent.
    if len(gemini_values['bbox']) > 0:
        extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
        extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
        extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
        extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

        # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
        # NOTE(review): xmin is fed the *east* longitude and xmax the
        # *west* longitude, which looks swapped -- confirm against the
        # extent template's expectations before changing.
        extent_string = self.extent_template.substitute(
            xmin=extras['bbox-east-long'],
            ymin=extras['bbox-south-lat'],
            xmax=extras['bbox-west-long'],
            ymax=extras['bbox-north-lat'])
        extras['spatial'] = extent_string.strip()

    tags = []
    for tag in gemini_values['tags']:
        # CKAN tag names are truncated to 50 characters
        tag = tag[:50] if len(tag) > 50 else tag
        tags.append({'name': tag})

    package_dict = {
        'title': gemini_values['title'],
        'notes': gemini_values['abstract'],
        'tags': tags,
        'resources': []
    }

    if self.obj.source.publisher_id:
        package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

    if reactivate_package:
        package_dict['state'] = u'active'

    # Generate a fresh name for a new package or a changed title;
    # otherwise keep the existing package name.
    if package is None or package.title != gemini_values['title']:
        name = self.gen_new_name(gemini_values['title'])
        if not name:
            name = self.gen_new_name(six.text_type(gemini_guid))
        if not name:
            raise Exception(
                'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
            )
        package_dict['name'] = name
    else:
        package_dict['name'] = package.name

    resource_locators = gemini_values.get('resource-locator', [])
    if len(resource_locators):
        for resource_locator in resource_locators:
            url = resource_locator.get('url', '')
            if url:
                resource_format = ''
                resource = {}
                if extras['resource-type'] == 'service':
                    # Check if the service is a view service
                    test_url = url.split('?')[0] if '?' in url else url
                    if self._is_wms(test_url):
                        resource['verified'] = True
                        resource['verified_date'] = datetime.now(
                        ).isoformat()
                        resource_format = 'WMS'
                resource.update({
                    'url': url,
                    'name': resource_locator.get('name', ''),
                    'description': resource_locator.get('description') if resource_locator.get('description') else 'Resource locator',
                    'format': resource_format or None,
                    'resource_locator_protocol': resource_locator.get('protocol', ''),
                    'resource_locator_function': resource_locator.get('function', '')
                })
                package_dict['resources'].append(resource)

        # Guess the best view service to use in WMS preview
        verified_view_resources = [
            r for r in package_dict['resources']
            if 'verified' in r and r['format'] == 'WMS'
        ]
        if len(verified_view_resources):
            verified_view_resources[0][
                'ckan_recommended_wms_preview'] = True
        else:
            view_resources = [
                r for r in package_dict['resources'] if r['format'] == 'WMS'
            ]
            if len(view_resources):
                view_resources[0]['ckan_recommended_wms_preview'] = True

    # Non-scalar extras are JSON-encoded so CKAN can store them.
    extras_as_dict = []
    for key, value in extras.items():
        if isinstance(value, six.string_types + (Number, )):
            extras_as_dict.append({'key': key, 'value': value})
        else:
            extras_as_dict.append({'key': key, 'value': json.dumps(value)})

    package_dict['extras'] = extras_as_dict

    # NOTE(review): ``package == None`` should be ``package is None``.
    if package == None:
        # Create new package from data.
        package = self._create_package_from_data(package_dict)
        log.info('Created new package ID %s with GEMINI guid %s',
                 package['id'], gemini_guid)
    else:
        package = self._create_package_from_data(package_dict, package=package)
        log.info(
            'Updated existing package ID %s with existing GEMINI guid %s',
            package['id'], gemini_guid)

    # Flag the other objects of this source as not current anymore
    from ckanext.harvest.model import harvest_object_table
    u = update(harvest_object_table) \
        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
        .values(current=False)
    Session.execute(u, params={'b_package_id': package['id']})
    Session.commit()

    # Refresh current object from session, otherwise the
    # import paster command fails
    Session.remove()
    Session.add(self.obj)
    Session.refresh(self.obj)

    # Set reference to package in the HarvestObject and flag it as
    # the current one
    if not self.obj.package_id:
        self.obj.package_id = package['id']
    self.obj.current = True
    self.obj.save()

    return package
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
    harvest source (FIS-Broker).

    For every package id, verifies that the package exists, was harvested,
    and was harvested by the FIS-Broker harvester; then creates a single
    harvest job and re-imports each record fetched from the CSW endpoint.

    :param package_ids: iterable of CKAN package ids to reimport
    :param context: CKAN action context passed to `harvest_job_create`
    :returns: list of successfully reimported CSW records
    :raises PackageIdDoesNotExistError, PackageNotHarvestedError,
        PackageNotHarvestedInFisbrokerError, NoFisbrokerIdError: when a
        package fails the pre-flight checks
    :raises NoFBHarvesterDefined: when no FIS-Broker harvest source exists
    :raises NotFoundInFisbrokerError, FBImportError, NoConnectionError:
        for per-record failures during the reimport itself
    '''

    ckan_fb_mapping = {}

    # First, do all checks that need no connection to FIS-Broker, so we
    # fail fast before creating a harvest job or opening a CSW session.
    for package_id in package_ids:
        package = Package.get(package_id)
        if not package:
            raise PackageIdDoesNotExistError(package_id)
        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)
        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if not harvester_type == HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)
        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)
        ckan_fb_mapping[package.id] = fb_guid

    # Get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)

    # Create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    # Check the job exists BEFORE touching it. (The original assert came
    # after the first attribute write, so it could never fire — the
    # attribute access on None would raise first.)
    assert harvest_job
    harvest_job.gather_started = datetime.datetime.utcnow()

    # Instantiate the CSW connector (on the reasonable assumption that
    # harvester_url is the same for all package_ids).
    package_id = None
    reimported_packages = []
    # One harvester instance is enough for all records (hoisted out of the
    # loop; previously a fresh FisbrokerPlugin was built per record).
    harvester = FisbrokerPlugin()
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # Query the connector to get the resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])
            record = csw.records.get(fb_guid, None)
            if not record:
                raise NotFoundInFisbrokerError(package_id, fb_guid)
            obj = HarvestObject(guid=fb_guid,
                                job=harvest_job,
                                content=record.xml,
                                package_id=package_id,
                                extras=[
                                    HarvestObjectExtra(key='status', value='change'),
                                    HarvestObjectExtra(key='type', value='reimport'),
                                ])
            obj.save()
            assert obj, obj.content
            harvester.force_import = True
            try:
                harvester.import_stage(obj)
            finally:
                # Always reset the flag, even when the import raises or
                # the record is subsequently rejected.
                harvester.force_import = False
            rejection_reason = self._dataset_rejected(obj)
            if rejection_reason:
                raise FBImportError(package_id, rejection_reason)
            Session.refresh(obj)
            reimported_packages.append(record)
    except RequestException as error:
        # Any network-level failure talking to the CSW endpoint is
        # surfaced as a single NoConnectionError for the current package.
        raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))

    # Successfully finish the harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()

    return reimported_packages