def test_zharvester_import(self, mocked=True):
    """Exercise the OAI-PMH harvester import stage.

    A configured run must create the 'homer' package with the expected
    tag/group counts and a GetRecord source URL; a second run without
    config must fail and leave gather/object errors behind.
    """
    harvest_object, harv = self._create_harvester()
    self.assert_(harv.info()['name'] == 'OAI-PMH')
    real_content = json.loads(harvest_object.content)
    self.assert_(real_content)
    self.assert_(harv.import_stage(harvest_object))
    the_package = Session.query(Package).filter(Package.title == u"homer")
    print the_package
    the_package = the_package[0]
    self.assert_(the_package)
    self.assert_(len(the_package.get_tags()) == 4)
    self.assert_(len(the_package.get_groups()) == 3)
    # The package URL must point back at the OAI-PMH GetRecord request
    # for this package's identifier.
    self.assert_(the_package.url == "http://helda.helsinki.fi/oai/request?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" % the_package.id)
    # Test with empty request
    Session.remove()
    CreateTestData.delete()
    Session.query(Package).delete()
    harvest_object, harv = self._create_harvester(config=False)
    real_content = json.loads(harvest_object.content)
    # Without config the import stage must report failure...
    self.assert_(harv.import_stage(harvest_object) == False)
    # ...and record the expected number of gather and object errors.
    errs = Session.query(HarvestGatherError).all()
    self.assert_(len(errs) == 2)
    errs = Session.query(HarvestObjectError).all()
    self.assert_(len(errs) == 3)
def _refresh_harvest_objects(self, harvest_object, package_id): """ Perform harvester housekeeping: - Flag the other objects of the source as not current - Set a refernce to the package in the harvest object - Flag it as current - And save the changes """ # Flag the other objects of this source as not current from ckanext.harvest.model import harvest_object_table u = update(harvest_object_table) \ .where(harvest_object_table.c.package_id == bindparam('pkg_id')) \ .values(current=False) Session.execute(u, params={'pkg_id': package_id}) Session.commit() # Refresh current object from session, otherwise the # import paster command fails # (Copied from the Gemini harvester--not sure if necessary) Session.remove() Session.add(harvest_object) Session.refresh(harvest_object) # Set reference to package in the HarvestObject and flag it as # the current one if not harvest_object.package_id: harvest_object.package_id = package_id harvest_object.current = True harvest_object.save()
def test_harvest_import_command(self):
    """Run 'harvest_objects_import' manually and verify object currency.

    Harvests one document, runs two more jobs plus the manual import;
    the package must be updated while the first harvest object remains
    the current one.
    """
    # Create source
    source_fixture = {
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action('package_show_rest')(
        self.context, {
            'id': first_obj.package_id
        })
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action('harvest_objects_import')(
        self.context, {
            'source_id': source.id
        })
    # Re-attach and refresh so we read the DB state written by the jobs.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action('package_show_rest')(
        self.context, {
            'id': first_obj.package_id
        })
    # Package was updated, and the current object remains the same.
    # (Was "assert after_package_dict, <comparison>", which made the
    # comparison the assert *message* and never actually checked it.)
    assert after_package_dict
    assert before_package_dict['id'] == after_package_dict['id']
    assert after_package_dict['metadata_modified'] > before_package_dict[
        'metadata_modified']
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action('harvest_source_show')(self.context, {
        'id': source.id
    })
    assert len(source_dict['status']['packages']) == 1
def test_harvest_update_records(self):
    """Re-harvest the same document and check update semantics.

    An unchanged second harvest must not touch the package; a third run
    with force_import=True must update it and take over as current.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    # Re-attach and refresh the objects to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    # NOTE(review): in the asserts below the expression after the comma
    # is only the assert *message*, so those comparisons are never
    # actually checked — confirm intent and split into separate asserts.
    assert second_package_dict, first_package_dict["id"] == second_package_dict["id"]
    assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": third_obj.package_id})
    # Package was updated
    assert third_package_dict, first_package_dict["id"] == third_package_dict["id"]
    assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
    assert third_obj.package, third_obj.package_id == first_package_dict["id"]
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def setup_class(cls):
    """Reset the scoped session and load the standard test fixtures."""
    # Drop any session state left behind by earlier test classes.
    Session.remove()
    # TODO: Should also remove test data
    CreateTestData.create()
def setup_class(self):
    """Reset the scoped session, load fixtures and cache the sysadmin."""
    # Drop any session state left behind by earlier test classes.
    Session.remove()
    # TODO: Should also remove test data
    CreateTestData.create()
    self.sysadmin = model.User.get('testsysadmin')
def setup_class(cls):
    """Reset the session, load fixtures and fetch the sitemap once."""
    Session.remove()
    CreateTestData.create()
    sitemap_url = url_for(
        controller="ckanext.sitemap.controller:SitemapController",
        action="view")
    # Cache the response and a file-like view of its body for the tests.
    cls.cont = cls.app.get(sitemap_url)
    cls.content_file = StringIO(cls.cont.body)
def test_harvest_different_sources_same_document(self):
    """Harvest one document from two sources; only one package results.

    The unchanged copy from source2 must not update the package; a
    forced re-import from source2 must update it and become current.
    """
    # Create source1
    source1_fixture = {"url": u"http://127.0.0.1:8999/single/source1/same_dataset.xml", "type": u"gemini-single"}
    source1, first_job = self._create_source_and_job(source1_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict["state"] == u"active"
    assert first_obj.current == True
    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {"url": u"http://127.0.0.1:8999/single/source2/same_dataset.xml", "type": u"gemini-single"}
    source2, second_job = self._create_source_and_job(source2_fixture)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, first_package_dict["id"] == second_package_dict["id"]
    assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)
    # Re-attach and refresh so we read the values the jobs wrote to the DB.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was updated
    assert third_package_dict, first_package_dict["id"] == third_package_dict["id"]
    assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
    assert third_obj.package, third_obj.package_id == first_package_dict["id"]
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def setup_class(cls):
    """Reset the session, load fixtures and fetch the sitemap response."""
    Session.remove()
    CreateTestData.create()
    view_url = url_for(
        controller="ckanext.sitemap.controller:SitemapController",
        action='view')
    # Cache the response and a file-like view of its body for the tests.
    cls.cont = cls.app.get(view_url)
    cls.content_file = StringIO(cls.cont.body)
def test_harvest_import_command(self):
    """Run 'harvest_objects_import' manually after extra jobs.

    The package must be updated while the first harvest object stays
    current; the source status must report exactly one dataset.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action('harvest_objects_import')(self.context,{'source_id':source.id})
    # Re-attach and refresh so we read the DB state written by the jobs.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was updated, and the current object remains the same.
    # (Was "assert after_package_dict, <comparison>", which made the
    # comparison the assert *message* and never actually checked it.)
    assert after_package_dict
    assert before_package_dict['id'] == after_package_dict['id']
    assert after_package_dict['metadata_modified'] > before_package_dict['metadata_modified']
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action('harvest_source_show')(self.context,{'id':source.id})
    assert source_dict['status']['total_datasets'] == 1
def setup_class(cls):
    """Enable the sitemap plugin, build a test app and fetch the sitemap."""
    # Start from a clean scoped session.
    Session.remove()
    # Remember the original config so it can be restored after the tests.
    cls.original_config = config.copy()
    enabled = set(config['ckan.plugins'].strip().split())
    enabled.add('sitemap')
    config['ckan.plugins'] = ' '.join(enabled)
    cls.app = _get_test_app()
    CreateTestData.create()
    sitemap_url = url_for(
        controller="ckanext.sitemap.controller:SitemapController",
        action='view')
    # Cache the response and a file-like view of its body for the tests.
    cls.cont = cls.app.get(sitemap_url)
    cls.content_file = StringIO(cls.cont.body)
def test_harvest_import_command(self):
    """Run 'harvest_objects_import' manually and verify the result."""
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    before_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    # Run the import command manually
    imported_objects = get_action("harvest_objects_import")(self.context, {"source_id": source.id})
    # Re-attach and refresh so we read the DB state written by the jobs.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    after_package_dict = get_action("package_show_rest")(self.context, {"id": imported_objects[0]["package_id"]})
    # Package was updated, and the current object remains the same
    # NOTE(review): the comma makes the comparison an assert *message*,
    # so the id equality below is never actually checked — confirm and
    # split into separate asserts.
    assert after_package_dict, before_package_dict["id"] == after_package_dict["id"]
    assert after_package_dict["metadata_modified"] > before_package_dict["metadata_modified"]
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True
    source_dict = get_action("harvest_source_show")(self.context, {"id": source.id})
    assert len(source_dict["status"]["packages"]) == 1
def authenticate(self, environ, identity):
    """Resolve a repoze.who OpenID identity to a CKAN user name.

    Looks the user up by OpenID and creates one on first login, using
    the OpenID nickname as the user name when it is valid and free and
    falling back to the raw OpenID identifier otherwise.

    Returns the user name, or None when the identity dict carries no
    OpenID user id.
    """
    if 'repoze.who.plugins.openid.userid' in identity:
        openid = identity.get('repoze.who.plugins.openid.userid')
        user = User.by_openid(openid)
        if user is None:
            # TODO: Implement a mask to ask for an alternative user
            # name instead of just using the OpenID identifier.
            name = identity.get('repoze.who.plugins.openid.nickname')
            if not User.check_name_valid(name):
                name = openid
            if not User.check_name_available(name):
                name = openid
            user = User(openid=openid, name=name,
                        fullname=identity.get('repoze.who.plugins.openid.fullname'),
                        email=identity.get('repoze.who.plugins.openid.email'))
            Session.add(user)
            Session.commit()
            # Read the name BEFORE Session.remove(): commit() expires
            # instance attributes, and touching them on a detached
            # instance after the scoped session is removed raises
            # DetachedInstanceError. (Previously the name was read
            # after the remove.)
            user_name = user.name
            Session.remove()
            return user_name
        return user.name
    return None
def _get_or_create_user(self, env):
    """Return the CKAN user matching the Shibboleth attributes in *env*.

    Looks the user up by the configured mail attribute (stored in the
    ``openid`` column) and creates a new user when none exists. Returns
    None when the environ lacks the mail or name attribute.
    """
    # WSGI Variables
    # Shib-Application-ID 'default'
    # Shib-Authentication-Instant '2012-08-13T12:04:22.492Z'
    # Shib-Authentication-Method 'urn:oasis:names:tc:SAML:2.0:ac:classes:PasswordProtectedTransport'
    # Shib-AuthnContext-Class 'urn:oasis:names:tc:SAML:2.0:ac:classes:PasswordProtectedTransport'
    # Shib-Identity-Provider 'https://idp.example.com/idp/shibboleth'
    # Shib-Session-ID '_7ec5a681e6dbae627c1cefcc7cb4d56a'
    # Shib-Session-Index '39dafd8477850f5e0b968e3561570197f2109948c1d374a7a2b4c9a7adbf8628'
    # cn 'My Other Self'
    # givenName 'My Other Self'
    # mail '*****@*****.**'
    email = env.get(self.mail, None)
    fullname = env.get(self.name, None)
    if not email or not fullname:
        log.debug("Environ does not contain mail or cn attributes, user not loaded.")
        return None
    user = meta.Session.query(User).autoflush(False).filter_by(openid=email).first()
    if user is None:
        log.debug("User does not exists, creating new one.")
        import re
        # Derive a CKAN-safe user name from the e-mail address.
        username = re.sub("[.@]", "_", email)
        user = User(name=username, fullname=fullname, email=email, openid=email)
        Session.add(user)
        Session.commit()
        # NOTE(review): the instance is returned after Session.remove()
        # detaches it; commit() expires its attributes, so attribute
        # access by the caller may raise DetachedInstanceError — confirm.
        Session.remove()
        log.debug("Created new user %s" % fullname)
    return user
def preauthenticate(self, environ, identity):
    """Turn an OAuth identity into a CKAN one inside *identity*.

    Parses the token pair from identity['userdata'], fetches the user
    profile from self.user_url, creates or updates the matching CKAN
    user, syncs its auth groups, and stores the user name under
    'repoze.who.userid'. Returns the identity dict, or None when the
    token data is missing.
    """
    # turn the oauth identity into a CKAN one; set it in our identity
    import oauth2 as oauth
    try:
        access_token = dict(urlparse.parse_qsl(identity['userdata']))
        oauth_token = access_token['oauth_token']
        oauth_token_secret = access_token['oauth_token_secret']
    except KeyError:
        return None
    access_token = oauth.Token(oauth_token, oauth_token_secret)
    client = oauth.Client(self.consumer, access_token)
    resp, content = client.request(self.user_url, "GET")
    data = json.loads(content)
    user_id = data['id']
    logging.info("Preauth: Got oauth user data for user %s" % user_id)
    user = User.by_openid(user_id)
    if user is None:
        user = User(openid=user_id, name=data['id'], fullname=data['name'], email=data['mail'])
        Session.add(user)
    else:
        user.fullname = data['name'] # if the name is updated
    Session.commit()
    # NOTE(review): `user` is still used below (group sync, .name) after
    # Session.remove() detaches it, and commit() expires attributes —
    # this may raise DetachedInstanceError; confirm.
    Session.remove()
    logging.info("Preauth: Created new/updated user %s" % user_id)
    # deal with groups
    user_groups = data['groups']
    _sync_auth_groups(user, user_groups)
    name = user.name.encode("utf8")
    logging.info("Preauth: Returning user identifier %s" % name)
    identity['repoze.who.userid'] = name
    return identity
def test_harvest_update_records(self):
    """Re-harvest the same gemini2.1 document and check update semantics.

    An unchanged second harvest must not touch the package; a third run
    with force_import=True must update it and take over as current.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    # Re-attach and refresh to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, first_package_dict['id'] == second_package_dict['id']
    assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show_rest')(self.context,{'id':third_obj.package_id})
    # Package was updated
    assert third_package_dict, first_package_dict['id'] == third_package_dict['id']
    assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
    assert third_obj.package, third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def teardown_class(cls):
    """Dispose of the scoped session once the test class has finished."""
    Session.remove()
def test_harvest_deleted_record(self):
    """Deleted packages stay deleted on unchanged harvests and are
    reactivated when a newer document arrives."""
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict["state"] == u"active"
    assert first_obj.current == True
    # Delete package
    first_package_dict["state"] = u"deleted"
    self.context.update({"id": first_package_dict["id"]})
    updated_package_dict = get_action("package_update_rest")(self.context, first_package_dict)
    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u"Finished"
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, updated_package_dict["id"] == second_package_dict["id"]
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u"http://127.0.0.1:8999/single/service1_newer.xml"
    source.save()
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})
    # Re-attach and refresh to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    # Package was updated
    assert third_package_dict, third_package_dict["id"] == second_package_dict["id"]
    assert third_obj.package, third_obj.package
    assert third_obj.current == True, second_obj.current == False
    assert first_obj.current == False
    assert "NEWER" in third_package_dict["title"]
    assert third_package_dict["state"] == u"active"
def test_harvest_deleted_record(self):
    """Deleted packages stay deleted on unchanged harvests and are
    reactivated when a newer gemini2.1 document arrives."""
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Delete package
    first_package_dict['state'] = u'deleted'
    self.context.update({'id':first_package_dict['id']})
    updated_package_dict = get_action('package_update')(self.context,first_package_dict)
    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u'Finished'
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, updated_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
    source.save()
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Re-attach and refresh to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    # Package was updated
    assert third_package_dict, third_package_dict['id'] == second_package_dict['id']
    assert third_obj.package, third_obj.package
    assert third_obj.current == True, second_obj.current == False
    assert first_obj.current == False
    assert 'NEWER' in third_package_dict['title']
    assert third_package_dict['state'] == u'active'
def test_harvest_different_sources_same_document(self):
    """Harvest one gemini2.1 document from two sources; one package.

    The unchanged copy from source2 must not update the package; a
    forced re-import from source2 must update it and become current.
    """
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source1, first_job = self._create_source_and_job(source1_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True
    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }
    source2, second_job = self._create_source_and_job(source2_fixture)
    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    # Re-attach and refresh to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was updated
    assert third_package_dict, first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package, third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_update_records(self):
    """Re-harvest the same gemini2.1 document via 'package_show'.

    An unchanged second harvest must not touch the package; a third run
    with force_import=True must update it and take over as current.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }
    source, first_job = self._create_source_and_job(source_fixture)
    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package
    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    # Re-attach and refresh to read the persisted DB state.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})
    # Package was not updated
    # NOTE(review): in these asserts the expression after the comma is
    # only the assert *message*; those comparisons are never checked.
    assert second_package_dict, first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package, not second_obj.package_id
    assert second_obj.current == False, first_obj.current == True
    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job,force_import=True)
    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)
    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)
    third_package_dict = get_action('package_show')(self.context,{'id':third_obj.package_id})
    # Package was updated
    assert third_package_dict, first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package, third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def setup_class(cls):
    """
    Remove any initial sessions.
    """
    Session.remove()
    # Package fixtures: 'homer' carries tags, 'lisa' carries an extra,
    # and the many 'marge*' names exercise name-search edge cases.
    package_dicts = [{'name':u'abraham', 'title':u'Abraham'},
                     {'name':u'homer', 'title':u'Homer', 'tags':['foo', 'bar', 'baz']},
                     {'name':u'homer_derived', 'title':u'Homer Derived'},
                     {'name':u'beer', 'title':u'Beer'},
                     {'name':u'bart', 'title':u'Bart'},
                     {'name':u'lisa', 'title':u'Lisa', 'extras': {'fezina':'foo'}},
                     {'name':u'marge', 'title':u'Marge'},
                     {'name':u'marge1', 'title':u'Marge'},
                     {'name':u'marge11', 'title':u'Marge'},
                     {'name':u'marge121', 'title':u'Marge'},
                     {'name':u'marge311', 'title':u'Marge'},
                     {'name':u'marge24', 'title':u'Marge'},
                     {'name':u'marget1', 'title':u'Marge'},
                     {'name':u'marge31', 'title':u'Marge'},
                     {'name':u'marge1121', 'title':u'Marge'},
                     {'name':u'marge1t', 'title':u'Marge'},
                     {'name':u'marge1b', 'title':u'Marge'},
                     {'name':u'marge1a', 'title':u'Marge'},
                     ]
    CreateTestData.create_arbitrary(package_dicts)
    # The variable is reused as the list of package *names* that go into
    # the first group below.
    package_dicts = [u'abraham', u'homer', u'homer_derived', u'beer',
                     u'bart', u'lisa', u'marge', u'marge1', u'marge11',
                     u'marge121', u'marge311', u'marge24', u'marget1',
                     u'marge31', u'marge1121', u'marge1t', u'marge1b',
                     u'marge1a',
                     ]
    # Only 'roger' contains packages; the rest pad out group listings.
    group_dicts = [{'name':'roger', 'title':'roger', 'description':'', 'packages': package_dicts},
                   {'name':'roger1', 'title':'roger', 'description':''},
                   {'name':'roger2', 'title':'roger', 'description':''},
                   {'name':'roger3', 'title':'roger', 'description':''},
                   {'name':'roger4', 'title':'roger', 'description':''},
                   {'name':'roger5', 'title':'roger', 'description':''},
                   {'name':'roger6', 'title':'roger', 'description':''},
                   {'name':'roger7', 'title':'roger', 'description':''},
                   {'name':'roger8', 'title':'roger', 'description':''},
                   {'name':'roger9', 'title':'roger', 'description':''},
                   {'name':'roger0', 'title':'roger', 'description':''},
                   {'name':'roger11', 'title':'roger', 'description':''},
                   {'name':'roger12', 'title':'roger', 'description':''},
                   {'name':'roger13', 'title':'roger', 'description':''},
                   {'name':'roger14', 'title':'roger', 'description':''}]
    CreateTestData.create_groups(group_dicts)
    setup()
    cls._first = True
    cls._second = False
def write_package_from_gemini_string(self, content): '''Create or update a Package based on some content that has come from a URL. Returns the package_dict of the result. If there is an error, it returns None or raises Exception. ''' log = logging.getLogger(__name__ + '.import') package = None gemini_document = GeminiDocument(content) gemini_values = gemini_document.read_values() gemini_guid = gemini_values['guid'] # Save the metadata reference date in the Harvest Object try: metadata_modified_date = datetime.strptime( gemini_values['metadata-date'], '%Y-%m-%d') except ValueError: try: metadata_modified_date = datetime.strptime( gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S') except: raise Exception('Could not extract reference date for GUID %s (%s)' \ % (gemini_guid,gemini_values['metadata-date'])) self.obj.metadata_modified_date = metadata_modified_date self.obj.save() last_harvested_object = Session.query(HarvestObject) \ .filter(HarvestObject.guid==gemini_guid) \ .filter(HarvestObject.current==True) \ .all() if len(last_harvested_object) == 1: last_harvested_object = last_harvested_object[0] elif len(last_harvested_object) > 1: raise Exception( 'Application Error: more than one current record for GUID %s' % gemini_guid) reactivate_package = False if last_harvested_object: # We've previously harvested this (i.e. 
it's an update) # Use metadata modified date instead of content to determine if the package # needs to be updated if last_harvested_object.metadata_modified_date is None \ or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \ or self.force_import \ or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date and last_harvested_object.source.active is False): if self.force_import: log.info('Import forced for object %s with GUID %s' % (self.obj.id, gemini_guid)) else: log.info( 'Package for object with GUID %s needs to be created or updated' % gemini_guid) package = last_harvested_object.package # If the package has a deleted state, we will only update it and reactivate it if the # new document has a more recent modified date if package.state == u'deleted': if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date: log.info( 'Package for object with GUID %s will be re-activated' % gemini_guid) reactivate_package = True else: log.info( 'Remote record with GUID %s is not more recent than a deleted package, skipping... ' % gemini_guid) return None else: if last_harvested_object.content != self.obj.content and \ last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date: diff_generator = difflib.unified_diff( last_harvested_object.content.split('\n'), self.obj.content.split('\n')) diff = '\n'.join([line for line in diff_generator]) raise Exception( 'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff)) else: # The content hasn't changed, no need to update the package log.info('Document with GUID %s unchanged, skipping...' 
% (gemini_guid)) return None else: log.info( 'No package with GEMINI guid %s found, let\'s create one' % gemini_guid) extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id} # Just add some of the metadata as extras, not the whole lot for name in [ # Essentials 'spatial-reference-system', 'guid', # Usefuls 'dataset-reference-date', 'metadata-language', # Language 'metadata-date', # Released 'coupled-resource', 'contact-email', 'frequency-of-update', 'spatial-data-service-type', ]: extras[name] = gemini_values[name] if len(gemini_values.get('progress', [])): extras['progress'] = gemini_values['progress'][0] else: extras['progress'] = '' extras['resource-type'] = gemini_values['resource-type'][0] # Use-constraints can contain values which are: # * free text # * licence URL # Store all values in extra['licence'] and if there is a # URL in there, store that in extra['licence-url'] extras['licence'] = gemini_values.get('use-constraints', '') if len(extras['licence']): licence_url_extracted = self._extract_first_licence_url( extras['licence']) if licence_url_extracted: extras['licence_url'] = licence_url_extracted extras['access_constraints'] = gemini_values.get( 'limitations-on-public-access', '') if 'temporal-extent-begin' in gemini_values: #gemini_values['temporal-extent-begin'].sort() extras['temporal_coverage-from'] = gemini_values[ 'temporal-extent-begin'] if 'temporal-extent-end' in gemini_values: #gemini_values['temporal-extent-end'].sort() extras['temporal_coverage-to'] = gemini_values[ 'temporal-extent-end'] # Save responsible organization roles provider, responsible_parties = self._process_responsible_organisation( gemini_values['responsible-organisation']) extras['provider'] = provider extras['responsible-party'] = '; '.join(responsible_parties) if len(gemini_values['bbox']) > 0: extras['bbox-east-long'] = gemini_values['bbox'][0]['east'] extras['bbox-north-lat'] = gemini_values['bbox'][0]['north'] extras['bbox-south-lat'] = 
gemini_values['bbox'][0]['south'] extras['bbox-west-long'] = gemini_values['bbox'][0]['west'] # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry extent_string = self.extent_template.substitute( xmin=extras['bbox-east-long'], ymin=extras['bbox-south-lat'], xmax=extras['bbox-west-long'], ymax=extras['bbox-north-lat']) extras['spatial'] = extent_string.strip() tags = [] for tag in gemini_values['tags']: tag = tag[:50] if len(tag) > 50 else tag tags.append({'name': tag}) package_dict = { 'title': gemini_values['title'], 'notes': gemini_values['abstract'], 'tags': tags, 'resources': [] } if self.obj.source.publisher_id: package_dict['groups'] = [{'id': self.obj.source.publisher_id}] if reactivate_package: package_dict['state'] = u'active' if package is None or package.title != gemini_values['title']: name = self.gen_new_name(gemini_values['title']) if not name: name = self.gen_new_name(six.text_type(gemini_guid)) if not name: raise Exception( 'Could not generate a unique name from the title or the GUID. Please choose a more unique title.' ) package_dict['name'] = name else: package_dict['name'] = package.name resource_locators = gemini_values.get('resource-locator', []) if len(resource_locators): for resource_locator in resource_locators: url = resource_locator.get('url', '') if url: resource_format = '' resource = {} if extras['resource-type'] == 'service': # Check if the service is a view service test_url = url.split('?')[0] if '?' 
in url else url if self._is_wms(test_url): resource['verified'] = True resource['verified_date'] = datetime.now( ).isoformat() resource_format = 'WMS' resource.update({ 'url': url, 'name': resource_locator.get('name', ''), 'description': resource_locator.get('description') if resource_locator.get('description') else 'Resource locator', 'format': resource_format or None, 'resource_locator_protocol': resource_locator.get('protocol', ''), 'resource_locator_function': resource_locator.get('function', '') }) package_dict['resources'].append(resource) # Guess the best view service to use in WMS preview verified_view_resources = [ r for r in package_dict['resources'] if 'verified' in r and r['format'] == 'WMS' ] if len(verified_view_resources): verified_view_resources[0][ 'ckan_recommended_wms_preview'] = True else: view_resources = [ r for r in package_dict['resources'] if r['format'] == 'WMS' ] if len(view_resources): view_resources[0]['ckan_recommended_wms_preview'] = True extras_as_dict = [] for key, value in extras.items(): if isinstance(value, six.string_types + (Number, )): extras_as_dict.append({'key': key, 'value': value}) else: extras_as_dict.append({'key': key, 'value': json.dumps(value)}) package_dict['extras'] = extras_as_dict if package == None: # Create new package from data. 
package = self._create_package_from_data(package_dict) log.info('Created new package ID %s with GEMINI guid %s', package['id'], gemini_guid) else: package = self._create_package_from_data(package_dict, package=package) log.info( 'Updated existing package ID %s with existing GEMINI guid %s', package['id'], gemini_guid) # Flag the other objects of this source as not current anymore from ckanext.harvest.model import harvest_object_table u = update(harvest_object_table) \ .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \ .values(current=False) Session.execute(u, params={'b_package_id': package['id']}) Session.commit() # Refresh current object from session, otherwise the # import paster command fails Session.remove() Session.add(self.obj) Session.refresh(self.obj) # Set reference to package in the HarvestObject and flag it as # the current one if not self.obj.package_id: self.obj.package_id = package['id'] self.obj.current = True self.obj.save() return package
def teardown_class(cls):
    """
    Class-level teardown hook (run once after all tests in the class).

    Order matters here: the test fixtures are deleted first (which uses
    the active session), then the scoped session itself is discarded so
    the next test class starts with a clean session.
    """
    CreateTestData.delete()
    Session.remove()
def test_harvest_deleted_record(self):
    """
    A package deleted locally must stay deleted when the remote record is
    unchanged, and be updated and reactivated when the remote record has a
    newer metadata-modified date.

    Fix: the original used ``assert expr1, expr2`` in several places, which
    makes ``expr2`` the failure *message* rather than an assertion, so those
    comparisons were never actually checked.  Each has been split into
    separate, real assertions.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source, first_job = self._create_source_and_job(source_fixture)

    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Delete package
    first_package_dict['state'] = u'deleted'
    self.context.update({'id': first_package_dict['id']})
    updated_package_dict = get_action('package_update_rest')(self.context, first_package_dict)

    # Create and run a second job, the date has not changed, so the package
    # should not be updated and remain deleted
    first_job.status = u'Finished'
    first_job.save()
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)

    second_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Package was not updated
    assert second_package_dict
    assert updated_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True

    # Harvest an updated document, with a more recent modified date, package
    # should be updated and reactivated
    source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
    source.save()

    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)
    third_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Refresh objects from the DB, otherwise they hold stale state from
    # before the third job flipped the 'current' flags
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    # Package was updated and reactivated
    assert third_package_dict
    assert third_package_dict['id'] == second_package_dict['id']
    assert third_obj.package
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
    assert 'NEWER' in third_package_dict['title']
    assert third_package_dict['state'] == u'active'
def test_harvest_different_sources_same_document(self):
    """
    The same remote document harvested from two different sources must not
    update the package the second time (unchanged content), but a forced
    re-import from the second source must update it.

    Fix: the original used ``assert expr1, expr2`` in several places, which
    makes ``expr2`` the failure *message* rather than an assertion, so those
    comparisons were never actually checked.  Each has been split into
    separate, real assertions.
    """
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source1, first_job = self._create_source_and_job(source1_fixture)

    first_obj = self._run_job_for_single_document(first_job)
    first_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source2, second_job = self._create_source_and_job(source2_fixture)

    second_obj = self._run_job_for_single_document(second_job)
    second_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Package was not updated
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True

    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)

    # Refresh objects from the DB, otherwise they hold stale state from
    # before the forced import flipped the 'current' flags
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    third_package_dict = get_action('package_show_rest')(self.context, {'id': first_obj.package_id})

    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def teardown_class(cls):
    """
    Class-level teardown hook (run once after all tests in the class):
    discard the scoped SQLAlchemy session so the next test class starts
    clean.

    Fix: the parameter was named ``self`` although the test framework
    invokes this hook on the class; renamed to ``cls`` for correctness and
    for consistency with the other ``teardown_class(cls)`` in this file
    (the hook is called positionally, so this is backward compatible).
    """
    Session.remove()