Ejemplo n.º 1
0
    def setup_class(cls):
        """Persist two harvest objects and remember their payloads and ids.

        The second object additionally gets an ``original_document`` extra.
        The XML strings and the ids of the committed objects are stored on
        the class. Raises SkipTest when ckanext.harvest cannot be imported.
        """
        try:
            from ckanext.harvest.model import (
                HarvestObject,
                HarvestJob,
                HarvestSource,
                HarvestObjectExtra,
            )
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        # First object: content only, no extras.
        cls.content1 = '<xml>Content 1</xml>'
        first_source = HarvestSource(url='http://', type='xx')
        first_job = HarvestJob(source=first_source)
        first_object = HarvestObject(guid='test-ho-1',
                                     job=first_job,
                                     content=cls.content1)

        # Second object: content plus an 'original_document' extra.
        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        second_source = HarvestSource(url='http://', type='xx')
        second_job = HarvestJob(source=second_source)
        second_object = HarvestObject(guid='test-ho-2',
                                      job=second_job,
                                      content=cls.content2)
        original_extra = HarvestObjectExtra(key='original_document',
                                            value=cls.original_content2,
                                            object=second_object)

        for record in (first_object, second_object, original_extra):
            Session.add(record)
        Session.commit()

        # Ids are read after the commit.
        cls.object_id_1 = first_object.id
        cls.object_id_2 = second_object.id
Ejemplo n.º 2
0
 def test_gather(self):
     # Run the harvester's gather stage for a new job on a 'cmdi' source,
     # with the harvester's client replaced by a _FakeClient instance.
     harvest_source = HarvestSource(
         url="http://localhost/test_cmdi",
         type="cmdi",
     )
     harvest_source.save()

     harvest_job = HarvestJob(source=harvest_source)
     harvest_job.save()

     self.harvester.client = _FakeClient()
     self.harvester.gather_stage(harvest_job)
Ejemplo n.º 3
0
def harvest_source_create(context, data_dict):
    '''
    Validate ``data_dict`` and save a new HarvestSource built from it.

    :param context: dict that must provide ``'session'``; an optional
        ``'schema'`` entry replaces ``default_harvest_source_schema()``.
        It is also passed to ``check_access`` and
        ``harvest_source_dictize``.
    :param data_dict: the raw source fields. After validation ``'url'``
        and ``'type'`` are required; ``'active'``, ``'title'``,
        ``'description'``, ``'user_id'``, ``'publisher_id'`` and
        ``'config'`` are copied when present and not None.

    :returns: the value of ``harvest_source_dictize(source, context)``
    :raises ValidationError: if ``validate`` reports errors (the session
        is rolled back before raising)
    '''

    log.info('Creating harvest source: %r', data_dict)
    check_access('harvest_source_create', context, data_dict)

    session = context['session']
    schema = context.get('schema') or default_harvest_source_schema()

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source does not validate: %r', errors)
        raise ValidationError(errors, _error_summary(errors))

    source = HarvestSource()
    source.url = data['url'].strip()
    source.type = data['type']

    optional_fields = (
        'active', 'title', 'description', 'user_id', 'publisher_id', 'config'
    )
    for field in optional_fields:
        if field in data and data[field] is not None:
            setattr(source, field, data[field])

    # The loop above skips None values; when the caller supplied 'active'
    # at all, the validated value is copied unconditionally.
    if 'active' in data_dict:
        source.active = data['active']

    source.save()
    log.info('Harvest source created: %s', source.id)

    return harvest_source_dictize(source, context)
Ejemplo n.º 4
0
    def test_auth_publisher_profile_different_publisher(self):

        # A harvest source that belongs to publisher 1
        source = HarvestSource(
            url=u'http://test-source.com',
            type='ckan',
            publisher_id=self.publisher1.id,
        )
        Session.add(source)
        Session.commit()

        # All requests are made as the publisher 2 user
        extra_environ = {
            'REMOTE_USER': self.publisher2_user.name.encode('utf8')
        }

        # List (publishers can see the sources list)
        listing = self.app.get('/harvest', extra_environ=extra_environ)
        assert 'Harvesting Sources' in listing

        # Create
        new_form = self.app.get('/harvest/new', extra_environ=extra_environ)
        assert 'New harvest source' in new_form
        assert 'publisher_id' in new_form

        # This publisher must not be allowed to manage sources that belong
        # to another publisher: read, edit and refresh all expect a 401.
        forbidden_templates = (
            '/harvest/%s',
            '/harvest/edit/%s',
            '/harvest/refresh/%s',
        )
        for template in forbidden_templates:
            self.app.get(template % source.id,
                         status=401,
                         extra_environ=extra_environ)
Ejemplo n.º 5
0
    def _test_auth_not_allowed(self, user_name=None, source=None, status=401):
        """Request every harvest page and expect ``status`` for each one.

        :param user_name: if given, sent UTF-8 encoded as REMOTE_USER;
            otherwise the requests carry an empty extra environ.
        :param source: harvest source whose id is used in the read, edit
            and refresh URLs; a new one is created and committed when falsy.
        :param status: HTTP status passed to every ``self.app.get`` call.
        """

        if not source:
            # No source supplied: create and commit one to build URLs from
            source = HarvestSource(url=u'http://test-source.com', type='ckan')
            Session.add(source)
            Session.commit()

        extra_environ = (
            {'REMOTE_USER': user_name.encode('utf8')} if user_name else {}
        )

        # List, then Create
        for path in ('/harvest', '/harvest/new'):
            self.app.get(path, status=status, extra_environ=extra_environ)

        # Read, Edit, Refresh
        id_templates = (
            '/harvest/%s',
            '/harvest/edit/%s',
            '/harvest/refresh/%s',
        )
        for template in id_templates:
            self.app.get(template % source.id,
                         status=status,
                         extra_environ=extra_environ)
Ejemplo n.º 6
0
 def test_form_bound_to_new_object(self):
     # Bind the harvest source fieldset to an unsaved HarvestSource and
     # check that the field names and values appear in the rendered text.
     unsaved_source = HarvestSource(
         url=u'http://localhost/',
         description=u'My source',
         type=u'Gemini',
     )
     fieldset = form.get_harvest_source_fieldset()
     bound_fieldset = fieldset.bind(unsaved_source)
     rendered = bound_fieldset.render()

     expected_fragments = (
         'url',
         'http://localhost/',
         'description',
         'My source',
     )
     for fragment in expected_fragments:
         assert fragment in rendered
Ejemplo n.º 7
0
 def test_form_bound_to_existing_object(self):
     # Same check as for a new object, but the HarvestSource is added and
     # committed, and the session removed, before the fieldset is bound.
     saved_source = HarvestSource(
         url=u'http://localhost/',
         description=u'My source',
         type=u'Gemini',
     )
     model.Session.add(saved_source)
     model.Session.commit()
     model.Session.remove()

     fieldset = form.get_harvest_source_fieldset()
     bound_fieldset = fieldset.bind(saved_source)
     rendered = bound_fieldset.render()

     expected_fragments = (
         'url',
         'http://localhost/',
         'description',
         'My source',
     )
     for fragment in expected_fragments:
         assert fragment in rendered
Ejemplo n.º 8
0
def _create_harvest_source_object(context, data_dict):
    '''
        Build a HarvestSource from the data dict of a harvest_source
        dataset and add it without committing.

        Validation and authorization are expected to have happened
        already, so this is not meant to be called directly to create
        harvest sources. The new source takes its id from the dataset.

        :param data_dict: A standard package data_dict

        :returns: The created HarvestSource object
        :rtype: HarvestSource object
    '''

    log.info('Creating harvest source: %r', data_dict)

    harvest_source = HarvestSource()
    harvest_source.id = data_dict['id']
    harvest_source.url = data_dict['url'].strip()

    # Read from 'source_type' to avoid clashing with the dataset type
    harvest_source.type = data_dict['source_type']

    optional_fields = (
        'active', 'title', 'description', 'user_id', 'publisher_id', 'config',
        'frequency',
    )
    for field in optional_fields:
        value = data_dict.get(field)
        if value is not None:
            setattr(harvest_source, field, value)

    # Always set last: active unless the dataset state is 'deleted'
    harvest_source.active = data_dict.get('state', None) != 'deleted'

    # No commit here; package_create is left to do it
    harvest_source.add()
    log.info('Harvest source created: %s', harvest_source.id)

    return harvest_source
Ejemplo n.º 9
0
    def setup_class(cls):
        """Create test data plus a harvest object linked to 'annakarenina'.

        A source, a job and a harvest object are saved, and the harvest
        object's id is stored in the package extras under
        ``harvest_object_id``.
        """
        CreateTestData.create()
        harvest_setup()

        harvest_source = HarvestSource(url=u'http://test-source.org',
                                       type='test')
        harvest_source.save()

        harvest_job = HarvestJob(source=harvest_source)
        harvest_job.save()

        harvest_object = HarvestObject(
            package=model.Package.by_name(u'annakarenina'),
            job=harvest_job,
            guid=u'test-guid',
            content=u'<xml>test content</xml>',
        )
        harvest_object.save()

        # Open a new revision, then point the package at the harvest object
        model.repo.new_revision()
        package = model.Package.by_name(u'annakarenina')
        package.extras['harvest_object_id'] = harvest_object.id
        package.save()

        model.repo.commit_and_remove()
Ejemplo n.º 10
0
    def test_import(self):
        # Imports two CMDI fixtures through self._run_import, checks fields of
        # the resulting packages, then runs the import stage on a harvest
        # object marked "deleted" and checks the package state.
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        # --- First fixture ---
        harvest_object = self._run_import("cmdi_1.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        # No import errors expected; any error messages become the failure
        # message of the assertion.
        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package.get('name', None),
                          utils.pid_to_name(package.get('id', None)))
        self.assertEquals(utils.get_primary_pid(package),
                          u'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('notes', None),
                          u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        # This relation pid must NOT be among the package's pids.
        provider = config['ckan.site_url']
        expected_pid = {
            u'id': u'http://islrn.org/resources/248-895-085-557-0',
            u'provider': provider,
            u'type': u'relation',
            u'relation': u'generalRelation'
        }

        self.assertTrue(expected_pid not in package.get('pids'))

        model.Session.flush()

        # --- Second fixture, same job ---
        harvest_object = self._run_import("cmdi_2.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package: build a content-less harvest object that points at
        # the package from the second import and has report_status "deleted".
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        # After the import stage the package is expected to be in state
        # 'deleted'.
        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Ejemplo n.º 11
0
    def test_api(self):
        # Exercises the harvest object endpoints: redirects from the old API
        # URLs, raw XML content, the original document stored as an object
        # extra, and the HTML views. Skipped when ckanext.harvest is missing.
        try:
            from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                               HarvestSource,
                                               HarvestObjectExtra)
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        # Object 1: content only.
        content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=content1)

        # Object 2: content plus an 'original_document' extra (hoe below).
        content2 = '<xml>Content 2</xml>'
        original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=content2)

        hoe = HarvestObjectExtra(
            key='original_document',
            value=original_content2,
            object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        # Ids are read after the commit.
        object_id_1 = ho1.id
        object_id_2 = ho2.id

        app = self._get_test_app()

        # Test redirects for old URLs
        url = '/api/2/rest/harvestobject/{0}/xml'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}'.format(object_id_1)
                in r.headers['Location'])

        url = '/api/2/rest/harvestobject/{0}/html'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}/html'.format(object_id_1)
                in r.headers['Location'])

        # Access object content
        url = '/harvest/object/{0}'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>')

        # Access original content in object extra (if present)
        # Object 1 was created without an 'original_document' extra: 404.
        url = '/harvest/object/{0}/original'.format(object_id_1)
        r = app.get(url, status=404)
        assert_equals(r.status_int, 404)

        # Object 2 has the extra, so its value is served as XML.
        url = '/harvest/object/{0}/original'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            + '<xml>Original Content 2</xml>')

        # Access HTML transformation
        url = '/harvest/object/{0}/html'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        # HTML view of the original document: 404 expected for object 1.
        url = '/harvest/object/{0}/html/original'.format(object_id_1)
        r = app.get(url, status=404)
        assert_equals(r.status_int, 404)

        url = '/harvest/object/{0}/html'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        # HTML view of the original document: 200 expected for object 2.
        url = '/harvest/object/{0}/html/original'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body
Ejemplo n.º 12
0
    def test_api(self, app):
        # Checks the raw-XML and original-document endpoints for two harvest
        # objects, only the second of which has an 'original_document'
        # extra. Skipped when ckanext.harvest cannot be imported.
        try:
            from ckanext.harvest.model import (
                HarvestObject,
                HarvestJob,
                HarvestSource,
                HarvestObjectExtra,
            )
        except ImportError:
            raise pytest.skip(
                "The harvester extension is needed for these tests")

        xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
        xml_content_type = "application/xml; charset=utf-8"

        # Object without any extra
        plain_content = "<xml>Content 1</xml>"
        plain_job = HarvestJob(source=HarvestSource(url="http://", type="xx"))
        plain_object = HarvestObject(
            guid="test-ho-1", job=plain_job, content=plain_content
        )

        # Object that also carries an 'original_document' extra
        current_content = "<xml>Content 2</xml>"
        original_content = "<xml>Original Content 2</xml>"
        extra_job = HarvestJob(source=HarvestSource(url="http://", type="xx"))
        object_with_extra = HarvestObject(
            guid="test-ho-2", job=extra_job, content=current_content
        )
        original_extra = HarvestObjectExtra(
            key="original_document",
            value=original_content,
            object=object_with_extra,
        )

        for record in (plain_object, object_with_extra, original_extra):
            Session.add(record)
        Session.commit()

        plain_id = plain_object.id
        extra_id = object_with_extra.id

        # Access object content
        response = app.get("/harvest/object/{0}".format(plain_id), status=200)
        assert response.headers["Content-Type"] == xml_content_type
        assert response.body == xml_declaration + "<xml>Content 1</xml>"

        # Access original content in object extra (if present)
        app.get("/harvest/object/{0}/original".format(plain_id), status=404)

        response = app.get(
            "/harvest/object/{0}/original".format(extra_id), status=200
        )
        assert response.headers["Content-Type"] == xml_content_type
        assert (
            response.body == xml_declaration + "<xml>Original Content 2</xml>"
        )
Ejemplo n.º 13
0
 def harvest_sources(self):
     """Create and save a harvest source of type 'DDI'.

     The source URL is the fsd.uta.fi DDI records URI list.
     """
     records_url = (
         'http://www.fsd.uta.fi/fi/aineistot/luettelo/'
         'fsd-ddi-records-uris-fi.txt'
     )
     ddi_source = HarvestSource(url=records_url, type='DDI')
     ddi_source.save()