def test_zharvester_import(self, mocked=True):
        harvest_object, harv = self._create_harvester()
        self.assert_(harv.info()['name'] == 'OAI-PMH')
        real_content = json.loads(harvest_object.content)
        self.assert_(real_content)
        self.assert_(harv.import_stage(harvest_object))

        the_package = Session.query(Package).filter(Package.title == u"homer")
        print(the_package)
        the_package = the_package[0]
        self.assert_(the_package)
        self.assert_(len(the_package.get_tags()) == 4)
        self.assert_(len(the_package.get_groups()) == 3)
        self.assert_(the_package.url == "http://helda.helsinki.fi/oai/request?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" % the_package.id)
        # Test with empty request
        Session.remove()
        CreateTestData.delete()
        Session.query(Package).delete()
        harvest_object, harv = self._create_harvester(config=False)
        real_content = json.loads(harvest_object.content)
        self.assert_(harv.import_stage(harvest_object) == False)
        errs = Session.query(HarvestGatherError).all()
        self.assert_(len(errs) == 2)
        errs = Session.query(HarvestObjectError).all()
        self.assert_(len(errs) == 3)
 def _refresh_harvest_objects(self, harvest_object, package_id):
     """
     Perform harvester housekeeping:
         - Flag the other objects of the source as not current
          - Set a reference to the package in the harvest object
         - Flag it as current
         - And save the changes
     """
     # Flag the other objects of this source as not current
     from ckanext.harvest.model import harvest_object_table
     u = update(harvest_object_table) \
         .where(harvest_object_table.c.package_id == bindparam('pkg_id')) \
         .values(current=False)
     Session.execute(u, params={'pkg_id': package_id})
     Session.commit()
     # Refresh current object from session, otherwise the
     # import paster command fails
     # (Copied from the Gemini harvester--not sure if necessary)
     Session.remove()
     Session.add(harvest_object)
     Session.refresh(harvest_object)
     # Set reference to package in the HarvestObject and flag it as
     # the current one
     if not harvest_object.package_id:
         harvest_object.package_id = package_id
     harvest_object.current = True
     harvest_object.save()
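For context, a minimal sketch of how a harvester's import stage might call the housekeeping helper above; import_stage here and the _create_or_update_package helper are hypothetical names used only for illustration.

 def import_stage(self, harvest_object):
     # Hypothetical caller (not part of the snippet above): build or
     # update the CKAN package first, then let _refresh_harvest_objects
     # mark older objects as not current and link this object to it.
     package_dict = self._create_or_update_package(harvest_object)  # assumed helper
     if not package_dict:
         return False
     self._refresh_harvest_objects(harvest_object, package_dict['id'])
     return True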
Example #3
    def test_harvest_import_command(self):

        # Create source
        source_fixture = {
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action('package_show_rest')(
            self.context, {
                'id': first_obj.package_id
            })

        # Package was created
        assert before_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action('harvest_objects_import')(
            self.context, {
                'source_id': source.id
            })
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action('package_show_rest')(
            self.context, {
                'id': first_obj.package_id
            })

        # Package was updated, and the current object remains the same
        assert after_package_dict
        assert before_package_dict['id'] == after_package_dict['id']
        assert after_package_dict['metadata_modified'] > before_package_dict[
            'metadata_modified']
        assert third_obj.current == False
        assert second_obj.current == False
        assert first_obj.current == True

        source_dict = get_action('harvest_source_show')(self.context, {
            'id': source.id
        })
        assert len(source_dict['status']['packages']) == 1
    def test_harvest_update_records(self):

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict["id"] == second_package_dict["id"]
        assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Create and run a third job, forcing the import to simulate an update to the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict["id"] == third_package_dict["id"]
        assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
        assert third_obj.package
        assert third_obj.package_id == first_package_dict["id"]
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #5
 def setup_class(cls):
     """
     Remove any initial sessions.
     """
     Session.remove()
     # TODO: Should also remove test data
     CreateTestData.create()
Example #6
 def setup_class(self):
     """
     Remove any initial sessions.
     """
     Session.remove()
     # TODO: Should also remove test data
     CreateTestData.create()
     self.sysadmin = model.User.get('testsysadmin')
 def setup_class(cls):
     """
     Remove any initial sessions.
     """
     Session.remove()
     CreateTestData.create()
     url = url_for(controller="ckanext.sitemap.controller:SitemapController", action="view")
     cls.cont = cls.app.get(url)
     cls.content_file = StringIO(cls.cont.body)
    def test_harvest_different_sources_same_document(self):

        # Create source1
        source1_fixture = {"url": u"http://127.0.0.1:8999/single/source1/same_dataset.xml", "type": u"gemini-single"}

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict["state"] == u"active"
        assert first_obj.current == True

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {"url": u"http://127.0.0.1:8999/single/source2/same_dataset.xml", "type": u"gemini-single"}

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict["id"] == second_package_dict["id"]
        assert first_package_dict["metadata_modified"] == second_package_dict["metadata_modified"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job, force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict["id"] == third_package_dict["id"]
        assert third_package_dict["metadata_modified"] > second_package_dict["metadata_modified"]
        assert third_obj.package
        assert third_obj.package_id == first_package_dict["id"]
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #9
 def setup_class(cls):
     """
     Remove any initial sessions.
     """
     Session.remove()
     CreateTestData.create()
     url = url_for(
         controller="ckanext.sitemap.controller:SitemapController",
         action='view')
     cls.cont = cls.app.get(url)
     cls.content_file = StringIO(cls.cont.body)
Example #10
    def test_harvest_import_command(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert before_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action('harvest_objects_import')(self.context,{'source_id':source.id})
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was updated, and the current object remains the same
        assert after_package_dict
        assert before_package_dict['id'] == after_package_dict['id']
        assert after_package_dict['metadata_modified'] > before_package_dict['metadata_modified']
        assert third_obj.current == False
        assert second_obj.current == False
        assert first_obj.current == True


        source_dict = get_action('harvest_source_show')(self.context,{'id':source.id})
        assert source_dict['status']['total_datasets'] == 1
Example #11
 def setup_class(cls):
     """
     Remove any initial sessions.
     """
     Session.remove()
     cls.original_config = config.copy()
     plugins = set(config['ckan.plugins'].strip().split())
     plugins.add('sitemap')
     config['ckan.plugins'] = ' '.join(plugins)
     cls.app = _get_test_app()
     CreateTestData.create()
     url = url_for(
         controller="ckanext.sitemap.controller:SitemapController",
         action='view')
     cls.cont = cls.app.get(url)
     cls.content_file = StringIO(cls.cont.body)
Example #12
    def test_harvest_import_command(self):

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        before_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert before_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run two more jobs, the package should not be updated
        second_job = self._create_job(source.id)
        second_obj = self._run_job_for_single_document(second_job)
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job)

        # Run the import command manually
        imported_objects = get_action("harvest_objects_import")(self.context, {"source_id": source.id})
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        after_package_dict = get_action("package_show_rest")(self.context, {"id": imported_objects[0]["package_id"]})

        # Package was updated, and the current object remains the same
        assert after_package_dict
        assert before_package_dict["id"] == after_package_dict["id"]
        assert after_package_dict["metadata_modified"] > before_package_dict["metadata_modified"]
        assert third_obj.current == False
        assert second_obj.current == False
        assert first_obj.current == True

        source_dict = get_action("harvest_source_show")(self.context, {"id": source.id})
        assert len(source_dict["status"]["packages"]) == 1
Example #13
 def authenticate(self, environ, identity):
     if 'repoze.who.plugins.openid.userid' in identity:
         openid = identity.get('repoze.who.plugins.openid.userid')
         user = User.by_openid(openid)
         if user is None:
              # TODO: Implement a form to ask for an alternative user
              # name instead of just using the OpenID identifier.
             name = identity.get('repoze.who.plugins.openid.nickname')
             if not User.check_name_valid(name):
                 name = openid
             if not User.check_name_available(name):
                 name = openid
             user = User(openid=openid, name=name,
                     fullname=identity.get('repoze.who.plugins.openid.fullname'),
                     email=identity.get('repoze.who.plugins.openid.email'))
             Session.add(user)
             Session.commit()
             Session.remove()
         return user.name
     return None
Example #14
    def _get_or_create_user(self, env):
        # WSGI Variables
        # Shib-Application-ID            'default'
        # Shib-Authentication-Instant    '2012-08-13T12:04:22.492Z'
        # Shib-Authentication-Method     'urn:oasis:names:tc:SAML:2.0:ac:classes:PasswordProtectedTransport'
        # Shib-AuthnContext-Class        'urn:oasis:names:tc:SAML:2.0:ac:classes:PasswordProtectedTransport'
        # Shib-Identity-Provider         'https://idp.example.com/idp/shibboleth'
        # Shib-Session-ID                '_7ec5a681e6dbae627c1cefcc7cb4d56a'
        # Shib-Session-Index             '39dafd8477850f5e0b968e3561570197f2109948c1d374a7a2b4c9a7adbf8628'
        # cn                             'My Other Self'
        # givenName                      'My Other Self'
        # mail                           '*****@*****.**'

        email = env.get(self.mail, None)
        fullname = env.get(self.name, None)

        if not email or not fullname:
            log.debug("Environ does not contain mail or cn attributes, user not loaded.")
            return None

        user = meta.Session.query(User).autoflush(False).filter_by(openid=email).first()

        if user is None:
            log.debug("User does not exist, creating a new one.")

            import re

            username = re.sub("[.@]", "_", email)

            user = User(name=username, fullname=fullname, email=email, openid=email)

            Session.add(user)
            Session.commit()
            Session.remove()

            log.debug("Created new user %s" % fullname)

        return user
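A hypothetical call of the helper above, assuming the plugin instance was configured with self.mail = 'mail' and self.name = 'cn' so the lookups match the WSGI variables documented in the comment block; the plugin object and the environ values below are illustrative only.

# Illustrative only: a fabricated Shibboleth-style environ passed to an
# assumed, already-configured plugin instance; returns an existing or
# newly created User, or None when mail/cn are missing.
fake_environ = {
    'mail': 'jane.doe@example.com',
    'cn': 'Jane Doe',
    'Shib-Identity-Provider': 'https://idp.example.com/idp/shibboleth',
}
user = plugin._get_or_create_user(fake_environ)
if user is not None:
    print(user.name)   # e.g. 'jane_doe_example_com'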
Example #15
    def preauthenticate(self, environ, identity):
        # turn the oauth identity into a CKAN one; set it in our identity
        import oauth2 as oauth
        try:
            access_token = dict(urlparse.parse_qsl(identity['userdata']))
            oauth_token = access_token['oauth_token']
            oauth_token_secret = access_token['oauth_token_secret']
        except KeyError:
            return None
        access_token = oauth.Token(oauth_token,
                                   oauth_token_secret)
        client = oauth.Client(self.consumer, access_token)
        resp, content = client.request(self.user_url, "GET")
        data = json.loads(content)
        user_id = data['id']
        logging.info("Preauth: Got oauth user data for user %s" % user_id)
        user = User.by_openid(user_id)
        if user is None:
            user = User(openid=user_id,
                        name=data['id'],
                        fullname=data['name'],
                        email=data['mail'])
            Session.add(user)
        else:
            user.fullname = data['name'] # if the name is updated
        Session.commit()
        Session.remove()
        logging.info("Preauth: Created new/updated user %s" % user_id)

        # deal with groups
        user_groups = data['groups']
        _sync_auth_groups(user, user_groups)
        name = user.name.encode("utf8")
        logging.info("Preauth: Returning user identifier %s" % name)
        identity['repoze.who.userid'] = name 
        return identity
Example #16
    def test_harvest_update_records(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Create and run a third job, forcing the import to simulate an update to the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job,force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show_rest')(self.context,{'id':third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #17
 def teardown_class(cls):
     """
     Tear down, remove the session.
     """
     Session.remove()
Example #18
    def test_harvest_deleted_record(self):

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict["state"] == u"active"
        assert first_obj.current == True

        # Delete package
        first_package_dict["state"] = u"deleted"
        self.context.update({"id": first_package_dict["id"]})
        updated_package_dict = get_action("package_update_rest")(self.context, first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u"Finished"
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert updated_package_dict["id"] == second_package_dict["id"]
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u"http://127.0.0.1:8999/single/service1_newer.xml"
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action("package_show_rest")(self.context, {"id": first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict["id"] == second_package_dict["id"]
        assert third_obj.package
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False

        assert "NEWER" in third_package_dict["title"]
        assert third_package_dict["state"] == u"active"
Example #19
    def test_harvest_deleted_record(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current == True

        # Delete package
        first_package_dict['state'] = u'deleted'
        self.context.update({'id':first_package_dict['id']})
        updated_package_dict = get_action('package_update')(self.context,first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u'Finished'
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert updated_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True


        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict['id'] == second_package_dict['id']
        assert third_obj.package
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False

        assert 'NEWER' in third_package_dict['title']
        assert third_package_dict['state'] == u'active'
Example #20
    def test_harvest_different_sources_same_document(self):

        # Create source1
        source1_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current == True

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {
            'title': 'Test Source 2',
            'name': 'test-source-2',
            'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job,force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #21
    def test_harvest_update_records(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_obj.current == True
        assert first_obj.package

        # Create and run a second job, the package should not be updated
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)

        second_package_dict = get_action('package_show')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True

        # Create and run a third job, forcing the import to simulate an update to the package
        third_job = self._create_job(source.id)
        third_obj = self._run_job_for_single_document(third_job,force_import=True)

        # For some reason first_obj does not get updated after the import_stage,
        # and we have to force a refresh to get the actual DB values.
        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show')(self.context,{'id':third_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #22
 def setup_class(cls):
     """
     Remove any initial sessions.
     """
     Session.remove()
     package_dicts = [{'name':u'abraham', 'title':u'Abraham'},
             {'name':u'homer', 'title':u'Homer', 'tags':['foo', 'bar',
                                                         'baz']},
             {'name':u'homer_derived', 'title':u'Homer Derived'},
             {'name':u'beer', 'title':u'Beer'},
             {'name':u'bart', 'title':u'Bart'},
             {'name':u'lisa', 'title':u'Lisa', 'extras': {'fezina':'foo'}},
             {'name':u'marge', 'title':u'Marge'},
             {'name':u'marge1', 'title':u'Marge'},
             {'name':u'marge11', 'title':u'Marge'},
             {'name':u'marge121', 'title':u'Marge'},
             {'name':u'marge311', 'title':u'Marge'},
             {'name':u'marge24', 'title':u'Marge'},
             {'name':u'marget1', 'title':u'Marge'},
             {'name':u'marge31', 'title':u'Marge'},
             {'name':u'marge1121', 'title':u'Marge'},
             {'name':u'marge1t', 'title':u'Marge'},
             {'name':u'marge1b', 'title':u'Marge'},
             {'name':u'marge1a', 'title':u'Marge'},
             ]
     CreateTestData.create_arbitrary(package_dicts)
     package_dicts = [u'abraham',
             u'homer',
             u'homer_derived',
             u'beer',
             u'bart',
             u'lisa',
             u'marge',
             u'marge1',
             u'marge11',
             u'marge121',
             u'marge311',
             u'marge24',
             u'marget1',
             u'marge31',
             u'marge1121',
             u'marge1t',
             u'marge1b',
             u'marge1a',
             ]
     group_dicts = [{'name':'roger', 'title':'roger', 'description':'',
                     'packages': package_dicts},
                    {'name':'roger1', 'title':'roger', 'description':''},
                    {'name':'roger2', 'title':'roger', 'description':''},
                    {'name':'roger3', 'title':'roger', 'description':''},
                    {'name':'roger4', 'title':'roger', 'description':''},
                    {'name':'roger5', 'title':'roger', 'description':''},
                    {'name':'roger6', 'title':'roger', 'description':''},
                    {'name':'roger7', 'title':'roger', 'description':''},
                    {'name':'roger8', 'title':'roger', 'description':''},
                    {'name':'roger9', 'title':'roger', 'description':''},
                    {'name':'roger0', 'title':'roger', 'description':''},
                    {'name':'roger11', 'title':'roger', 'description':''},
                    {'name':'roger12', 'title':'roger', 'description':''},
                    {'name':'roger13', 'title':'roger', 'description':''},
                    {'name':'roger14', 'title':'roger', 'description':''}]
     CreateTestData.create_groups(group_dicts)
     setup()
     cls._first = True
     cls._second = False
Example #23
    def write_package_from_gemini_string(self, content):
        '''Create or update a Package based on some content that has
        come from a URL.

        Returns the package_dict of the result.
        If there is an error, it returns None or raises Exception.
        '''
        log = logging.getLogger(__name__ + '.import')
        package = None
        gemini_document = GeminiDocument(content)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        # Save the metadata reference date in the Harvest Object
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%d')
        except ValueError:
            try:
                metadata_modified_date = datetime.strptime(
                    gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
            except:
                raise Exception('Could not extract reference date for GUID %s (%s)' \
                        % (gemini_guid,gemini_values['metadata-date']))

        self.obj.metadata_modified_date = metadata_modified_date
        self.obj.save()

        last_harvested_object = Session.query(HarvestObject) \
                            .filter(HarvestObject.guid==gemini_guid) \
                            .filter(HarvestObject.current==True) \
                            .all()

        if len(last_harvested_object) == 1:
            last_harvested_object = last_harvested_object[0]
        elif len(last_harvested_object) > 1:
            raise Exception(
                'Application Error: more than one current record for GUID %s' %
                gemini_guid)

        reactivate_package = False
        if last_harvested_object:
            # We've previously harvested this (i.e. it's an update)

            # Use metadata modified date instead of content to determine if the package
            # needs to be updated
            if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date and
                    last_harvested_object.source.active is False):

                if self.force_import:
                    log.info('Import forced for object %s with GUID %s' %
                             (self.obj.id, gemini_guid))
                else:
                    log.info(
                        'Package for object with GUID %s needs to be created or updated'
                        % gemini_guid)

                package = last_harvested_object.package

                # If the package has a deleted state, we will only update it and reactivate it if the
                # new document has a more recent modified date
                if package.state == u'deleted':
                    if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                        log.info(
                            'Package for object with GUID %s will be re-activated'
                            % gemini_guid)
                        reactivate_package = True
                    else:
                        log.info(
                            'Remote record with GUID %s is not more recent than a deleted package, skipping... '
                            % gemini_guid)
                        return None

            else:
                if last_harvested_object.content != self.obj.content and \
                 last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                    diff_generator = difflib.unified_diff(
                        last_harvested_object.content.split('\n'),
                        self.obj.content.split('\n'))
                    diff = '\n'.join([line for line in diff_generator])
                    raise Exception(
                        'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                        % (gemini_guid, diff))
                else:
                    # The content hasn't changed, no need to update the package
                    log.info('Document with GUID %s unchanged, skipping...' %
                             (gemini_guid))
                return None
        else:
            log.info(
                'No package with GEMINI guid %s found, let\'s create one' %
                gemini_guid)

        extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

        # Just add some of the metadata as extras, not the whole lot
        for name in [
                # Essentials
                'spatial-reference-system',
                'guid',
                # Usefuls
                'dataset-reference-date',
                'metadata-language',  # Language
                'metadata-date',  # Released
                'coupled-resource',
                'contact-email',
                'frequency-of-update',
                'spatial-data-service-type',
        ]:
            extras[name] = gemini_values[name]

        if len(gemini_values.get('progress', [])):
            extras['progress'] = gemini_values['progress'][0]
        else:
            extras['progress'] = ''

        extras['resource-type'] = gemini_values['resource-type'][0]

        # Use-constraints can contain values which are:
        #  * free text
        #  * licence URL
        # Store all values in extra['licence'] and if there is a
        # URL in there, store that in extra['licence-url']
        extras['licence'] = gemini_values.get('use-constraints', '')
        if len(extras['licence']):
            licence_url_extracted = self._extract_first_licence_url(
                extras['licence'])
            if licence_url_extracted:
                extras['licence_url'] = licence_url_extracted

        extras['access_constraints'] = gemini_values.get(
            'limitations-on-public-access', '')
        if 'temporal-extent-begin' in gemini_values:
            #gemini_values['temporal-extent-begin'].sort()
            extras['temporal_coverage-from'] = gemini_values[
                'temporal-extent-begin']
        if 'temporal-extent-end' in gemini_values:
            #gemini_values['temporal-extent-end'].sort()
            extras['temporal_coverage-to'] = gemini_values[
                'temporal-extent-end']

        # Save responsible organization roles
        provider, responsible_parties = self._process_responsible_organisation(
            gemini_values['responsible-organisation'])
        extras['provider'] = provider
        extras['responsible-party'] = '; '.join(responsible_parties)

        if len(gemini_values['bbox']) > 0:
            extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
            extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
            extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
            extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

            # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
            extent_string = self.extent_template.substitute(
                xmin=extras['bbox-east-long'],
                ymin=extras['bbox-south-lat'],
                xmax=extras['bbox-west-long'],
                ymax=extras['bbox-north-lat'])

            extras['spatial'] = extent_string.strip()

        tags = []
        for tag in gemini_values['tags']:
            tag = tag[:50] if len(tag) > 50 else tag
            tags.append({'name': tag})

        package_dict = {
            'title': gemini_values['title'],
            'notes': gemini_values['abstract'],
            'tags': tags,
            'resources': []
        }

        if self.obj.source.publisher_id:
            package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

        if reactivate_package:
            package_dict['state'] = u'active'

        if package is None or package.title != gemini_values['title']:
            name = self.gen_new_name(gemini_values['title'])
            if not name:
                name = self.gen_new_name(six.text_type(gemini_guid))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
                )
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        resource_locators = gemini_values.get('resource-locator', [])

        if len(resource_locators):
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '')
                if url:
                    resource_format = ''
                    resource = {}
                    if extras['resource-type'] == 'service':
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now(
                            ).isoformat()
                            resource_format = 'WMS'
                    resource.update({
                        'url': url,
                        'name': resource_locator.get('name', ''),
                        'description': resource_locator.get('description') or 'Resource locator',
                        'format': resource_format or None,
                        'resource_locator_protocol': resource_locator.get('protocol', ''),
                        'resource_locator_function': resource_locator.get('function', '')
                    })
                    package_dict['resources'].append(resource)

            # Guess the best view service to use in WMS preview
            verified_view_resources = [
                r for r in package_dict['resources']
                if 'verified' in r and r['format'] == 'WMS'
            ]
            if len(verified_view_resources):
                verified_view_resources[0][
                    'ckan_recommended_wms_preview'] = True
            else:
                view_resources = [
                    r for r in package_dict['resources']
                    if r['format'] == 'WMS'
                ]
                if len(view_resources):
                    view_resources[0]['ckan_recommended_wms_preview'] = True

        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, six.string_types + (Number, )):
                extras_as_dict.append({'key': key, 'value': value})
            else:
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})

        package_dict['extras'] = extras_as_dict

        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_dict)
            log.info('Created new package ID %s with GEMINI guid %s',
                     package['id'], gemini_guid)
        else:
            package = self._create_package_from_data(package_dict,
                                                     package=package)
            log.info(
                'Updated existing package ID %s with existing GEMINI guid %s',
                package['id'], gemini_guid)

        # Flag the other objects of this source as not current anymore
        from ckanext.harvest.model import harvest_object_table
        u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
        Session.execute(u, params={'b_package_id': package['id']})
        Session.commit()

        # Refresh current object from session, otherwise the
        # import paster command fails
        Session.remove()
        Session.add(self.obj)
        Session.refresh(self.obj)

        # Set reference to package in the HarvestObject and flag it as
        # the current one
        if not self.obj.package_id:
            self.obj.package_id = package['id']

        self.obj.current = True
        self.obj.save()

        return package
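The _extract_first_licence_url helper used above is not included in this snippet; a minimal sketch of what it could look like (an assumption for illustration, not the actual implementation) follows.

    def _extract_first_licence_url(self, licences):
        '''Sketch only (assumed, not the real helper): return the first
        licence value that parses as an http(s) URL, otherwise None.'''
        from six.moves.urllib.parse import urlparse
        for licence in licences:
            parts = urlparse(licence)
            if parts.scheme in ('http', 'https') and parts.netloc:
                return licence
        return None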
Example #24
 def teardown_class(cls):
     """
     Tear down, remove the session.
     """
     CreateTestData.delete()
     Session.remove()
Example #25
 def teardown_class(cls):
     """
     Tear down, remove the session.
     """
     Session.remove()
Example #26
    def test_harvest_deleted_record(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, first_job = self._create_source_and_job(source_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current == True

        # Delete package
        first_package_dict['state'] = u'deleted'
        self.context.update({'id':first_package_dict['id']})
        updated_package_dict = get_action('package_update_rest')(self.context,first_package_dict)

        # Create and run a second job, the date has not changed, so the package should not be updated
        # and remain deleted
        first_job.status = u'Finished'
        first_job.save()
        second_job = self._create_job(source.id)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert updated_package_dict['id'] == second_package_dict['id']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True


        # Harvest an updated document, with a more recent modified date, package should be
        # updated and reactivated
        source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
        source.save()

        third_job = self._create_job(source.id)

        third_obj = self._run_job_for_single_document(third_job)

        third_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        # Package was updated
        assert third_package_dict
        assert third_package_dict['id'] == second_package_dict['id']
        assert third_obj.package
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False

        assert 'NEWER' in third_package_dict['title']
        assert third_package_dict['state'] == u'active'
Example #27
    def test_harvest_different_sources_same_document(self):

        # Create source1
        source1_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source1, first_job = self._create_source_and_job(source1_fixture)

        first_obj = self._run_job_for_single_document(first_job)

        first_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was created
        assert first_package_dict
        assert first_package_dict['state'] == u'active'
        assert first_obj.current == True

        # Harvest the same document, unchanged, from another source, the package
        # is not updated.
        # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
        # we are no longer throwing an exception when this happens)
        source2_fixture = {
            'title': 'Test Source 2',
            'name': 'test-source-2',
            'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
            'source_type': u'gemini-single'
        }

        source2, second_job = self._create_source_and_job(source2_fixture)

        second_obj = self._run_job_for_single_document(second_job)

        second_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was not updated
        assert second_package_dict
        assert first_package_dict['id'] == second_package_dict['id']
        assert first_package_dict['metadata_modified'] == second_package_dict['metadata_modified']
        assert not second_obj.package
        assert not second_obj.package_id
        assert second_obj.current == False
        assert first_obj.current == True


        # Inactivate source1 and reharvest from source2, package should be updated
        third_job = self._create_job(source2.id)
        third_obj = self._run_job_for_single_document(third_job,force_import=True)

        Session.remove()
        Session.add(first_obj)
        Session.add(second_obj)
        Session.add(third_obj)

        Session.refresh(first_obj)
        Session.refresh(second_obj)
        Session.refresh(third_obj)

        third_package_dict = get_action('package_show_rest')(self.context,{'id':first_obj.package_id})

        # Package was updated
        assert third_package_dict
        assert first_package_dict['id'] == third_package_dict['id']
        assert third_package_dict['metadata_modified'] > second_package_dict['metadata_modified']
        assert third_obj.package
        assert third_obj.package_id == first_package_dict['id']
        assert third_obj.current == True
        assert second_obj.current == False
        assert first_obj.current == False
Example #28
 def teardown_class(cls):
     """
     Tear down, remove the session.
     """
     CreateTestData.delete()
     Session.remove()
 def teardown_class(self):
     Session.remove()