def test_gather(self):
     """Run the gather stage for a ``cmdi`` source using ``_FakeClient``.

     The test makes no explicit assertions; it passes as long as saving the
     source and job and calling ``gather_stage`` do not raise.
     """
     cmdi_source = HarvestSource(type="cmdi", url="http://localhost/test_cmdi")
     cmdi_source.save()

     gather_job = HarvestJob(source=cmdi_source)
     gather_job.save()

     # Swap in _FakeClient before the harvester gathers.
     self.harvester.client = _FakeClient()
     self.harvester.gather_stage(gather_job)
Beispiel #2
0
    def setup_class(cls):
        """Store two harvest objects and remember their contents and ids.

        The tests are skipped when ``ckanext.harvest.model`` cannot be
        imported.  The second object additionally gets an
        ``original_document`` extra.  ``content1``, ``content2``,
        ``original_content2``, ``object_id_1`` and ``object_id_2`` are set on
        the class.
        """
        try:
            from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        def _placeholder_job():
            # Each harvest object gets its own job on its own dummy source.
            return HarvestJob(source=HarvestSource(url='http://', type='xx'))

        cls.content1 = '<xml>Content 1</xml>'
        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'

        first_object = HarvestObject(
            guid='test-ho-1',
            job=_placeholder_job(),
            content=cls.content1)
        second_object = HarvestObject(
            guid='test-ho-2',
            job=_placeholder_job(),
            content=cls.content2)
        original_extra = HarvestObjectExtra(key='original_document',
                                            value=cls.original_content2,
                                            object=second_object)

        for record in (first_object, second_object, original_extra):
            Session.add(record)
        Session.commit()

        # The ids are read after the commit.
        cls.object_id_1 = first_object.id
        cls.object_id_2 = second_object.id
Beispiel #3
0
 def test_gather(self):
     """Gather-stage smoke test: save a source and job, then gather.

     ``_FakeClient`` is installed as the harvester's client first.  Nothing
     is asserted; an exception from any step fails the test.
     """
     harvest_source = HarvestSource(
         url="http://localhost/test_cmdi",
         type="cmdi",
     )
     harvest_source.save()

     harvest_job = HarvestJob(source=harvest_source)
     harvest_job.save()

     harvester = self.harvester
     harvester.client = _FakeClient()
     harvester.gather_stage(harvest_job)
Beispiel #4
0
def harvest_source_create(context, data_dict):
    '''
    Validates ``data_dict`` and saves a new ``HarvestSource`` built from it.

    :param context: must contain ``session``; an optional ``schema`` entry
        replaces ``default_harvest_source_schema()``
    :type context: dictionary
    :param data_dict: the source fields; after validation ``url`` and
        ``type`` are required, the remaining fields are optional
    :type data_dict: dictionary

    :returns: the new source as returned by ``harvest_source_dictize``
    :raises ValidationError: if validation reports errors (the session is
        rolled back first)
    '''
    log.info('Creating harvest source: %r', data_dict)
    check_access('harvest_source_create', context, data_dict)

    session = context['session']
    schema = context.get('schema') or default_harvest_source_schema()

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source does not validate: %r', errors)
        raise ValidationError(errors, _error_summary(errors))

    source = HarvestSource()
    source.url = data['url'].strip()
    source.type = data['type']

    # Optional fields are copied only when validation produced a value.
    optional_fields = (
        'active', 'title', 'description', 'user_id', 'publisher_id', 'config'
    )
    for field in optional_fields:
        value = data.get(field)
        if value is not None:
            setattr(source, field, value)

    # An explicitly submitted 'active' is applied even when it is None.
    if 'active' in data_dict:
        source.active = data['active']

    source.save()
    log.info('Harvest source created: %s', source.id)

    return harvest_source_dictize(source, context)
    def test_import(self):
        """Import two CMDI fixtures, check the packages, then delete one.

        ``cmdi_1.xml`` and ``cmdi_2.xml`` are imported through
        ``self._run_import`` for the same job.  Afterwards a harvest object
        with ``report_status`` ``"deleted"`` is sent through ``import_stage``
        and the second package is expected to end up in state ``deleted``.
        """
        def _error_report(imported):
            # Joined error messages, shown when the "no errors" check fails.
            return u"\n".join(unicode(error.message) for error in (imported.errors or []))

        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        # --- first fixture -------------------------------------------------
        harvest_object = self._run_import("cmdi_1.xml", job)

        self.assertEqual(len(harvest_object.errors), 0, _error_report(harvest_object))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})

        self.assertEqual(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEqual(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
        self.assertEqual(package.get('notes', None), u'{"eng": "Test description"}')
        self.assertEqual(package.get('version', None), '2012-09-07')
        self.assertEqual(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEqual(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                        u'provider': provider,
                        u'type': u'metadata'}

        self.assertTrue(expected_pid in package.get('pids'))

        model.Session.flush()

        # --- second fixture ------------------------------------------------
        harvest_object = self._run_import("cmdi_2.xml", job)

        self.assertEqual(len(harvest_object.errors), 0, _error_report(harvest_object))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})

        self.assertEqual(package['temporal_coverage_begin'], '1880')
        self.assertEqual(package['temporal_coverage_end'], '1939')
        self.assertEqual(package.get('license_id', None), 'other')

        # --- delete the second package -------------------------------------
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEqual(model.Package.get(package['id']).state, 'deleted')
Beispiel #6
0
def harvest_source_create(context, data_dict):
    '''
    Creates a harvest source from a validated copy of ``data_dict``.

    :param context: needs a ``session`` entry; ``schema`` is optional and
        defaults to ``default_harvest_source_schema()``
    :type context: dictionary
    :param data_dict: fields of the new source (``url`` and ``type`` are
        read unconditionally from the validated data)
    :type data_dict: dictionary

    :returns: the saved source, dictized with ``harvest_source_dictize``
    :raises ValidationError: when ``validate`` returns errors; the session
        is rolled back before raising
    '''
    log.info('Creating harvest source: %r', data_dict)
    check_access('harvest_source_create', context, data_dict)

    session = context['session']
    schema = context.get('schema') or default_harvest_source_schema()

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        # warning() replaces the deprecated Logger.warn alias.
        log.warning('Harvest source does not validate: %r', errors)
        raise ValidationError(errors, _error_summary(errors))

    source = HarvestSource()
    source.url = data['url'].strip()
    source.type = data['type']

    # Copy over whichever optional attributes came back non-None.
    for name in ('active', 'title', 'description', 'user_id',
                 'publisher_id', 'config'):
        if data.get(name) is not None:
            setattr(source, name, data[name])

    # 'active' given by the caller wins, whatever its validated value is.
    if 'active' in data_dict:
        source.active = data['active']

    source.save()
    log.info('Harvest source created: %s', source.id)

    return harvest_source_dictize(source, context)
Beispiel #7
0
def harvest_job_list(context, data_dict):
    '''
    Auth function: may the context user list the jobs of a harvest source?

    :param context: ``user`` is read from it (may be absent)
    :type context: dictionary
    :param data_dict: ``source_id`` selects the harvest source
    :type data_dict: dictionary

    :returns: ``{'success': True}`` when allowed, otherwise a dictionary
        with ``'success': False`` and a ``'msg'`` explaining why
    :raises ObjectNotFound: if no harvest source matches ``source_id``
    '''
    user = context.get('user')

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # This message has no placeholder, so it must not be %-formatted:
        # doing so raised TypeError instead of returning the denial.
        return {
            'success': False,
            'msg': _('Only sysadmins can list all harvest jobs')
        }

    source = HarvestSource.get(source_id)
    if not source:
        raise p.toolkit.ObjectNotFound

    # Check the user is admin/editor for the publisher - i.e. has
    # update_dataset permission
    may_update = ckan.new_authz.has_user_permission_for_group_or_org(
        source.publisher_id, user, 'update_dataset')
    if not may_update:
        return {
            'success': False,
            'msg': _('User %s not authorized to list jobs from source %s') %
            (str(user), source.id)
        }

    return {'success': True}
Beispiel #8
0
    def test_auth_publisher_profile_different_publisher(self):
        """A user of publisher 2 can list and create, but not manage,
        a harvest source that belongs to publisher 1."""

        # Create a source for publisher 1
        foreign_source = HarvestSource(url=u'http://test-source.com',
                                       type='ckan',
                                       publisher_id=self.publisher1.id)
        Session.add(foreign_source)
        Session.commit()

        extra_environ = {
            'REMOTE_USER': self.publisher2_user.name.encode('utf8')
        }

        # List (publishers can see the sources list)
        listing = self.app.get('/harvest', extra_environ=extra_environ)
        assert 'Harvesting Sources' in listing

        # Create
        new_form = self.app.get('/harvest/new', extra_environ=extra_environ)
        assert 'New harvest source' in new_form
        assert 'publisher_id' in new_form

        # Check that this publisher is not allowed to manage sources from
        # other publishers: read, edit and refresh must all answer 401.
        forbidden_templates = (
            '/harvest/%s',
            '/harvest/edit/%s',
            '/harvest/refresh/%s',
        )
        for template in forbidden_templates:
            self.app.get(template % foreign_source.id,
                         status=401,
                         extra_environ=extra_environ)
Beispiel #9
0
def harvest_job_create(context, data_dict):
    '''
    Creates and saves a new harvest job for an existing, active source.

    :param source_id: id of the harvest source the job is created for
    :type source_id: string

    :returns: the new job as returned by ``harvest_job_dictize``
    :raises NotFound: if ``HarvestSource.get`` finds no source
    :raises Exception: if the source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
        for this source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is used throughout.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    existing = _check_for_existing_jobs(context, source_id)
    if existing:
        log.warning('There is already an unrun job %r for this source %s',
                    existing, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Beispiel #10
0
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the search index every document matching this harvest
    source's id and the configured ``ckan.site_id``.  Only the index
    connection is used here; nothing is returned.

    :param id: the id of the harvest source whose documents are removed
    :type id: string

    :raises NotFound: if ``HarvestSource.get`` finds no source for ``id``
    :raises SearchIndexError: if the delete query or the commit fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    requested_id = data_dict.get('id')

    source = HarvestSource.get(requested_id)
    if not source:
        log.error('Harvest source %s does not exist', requested_id)
        raise NotFound('Harvest source %s does not exist' % requested_id)

    # From here on use the id stored on the source object itself.
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Commit only when 'ckan.search.solr_commit' is truthy (default 'true').
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # comma form.
        log.exception(e)
        raise SearchIndexError(e)
Beispiel #11
0
def harvest_source_id_exists(value, context):
    '''
    Validator: passes ``value`` through when ``HarvestSource.get(value)``
    finds something, otherwise raises ``Invalid``.  ``context`` is unused.
    '''
    if HarvestSource.get(value):
        return value

    raise Invalid('Harvest Source with id %r does not exist.' % str(value))
Beispiel #12
0
def harvest_job_create(context, data_dict):
    '''
    Creates and saves a new harvest job for an existing, active source
    that has no job in status ``New`` yet.

    :param source_id: id of the harvest source the job is created for
    :type source_id: string

    :returns: the new job as returned by ``harvest_job_dictize``
    :raises NotFound: if ``HarvestSource.get`` finds no source
    :raises HarvestError: if the source is inactive, or if
        ``harvest_job_list`` already returns a ``New`` job for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s', source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source.  A separate
    # dict is used so the caller's data_dict is not rebound here.
    unrun_filter = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, unrun_filter)
    if exists:
        log.warning('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
def harvest_job_create(context, data_dict):
    '''
    Saves a new harvest job for the source named by ``source_id``.

    The source must exist and be active, and ``_check_for_existing_jobs``
    must not report a job for it.

    :param source_id: id of the harvest source the job is created for
    :type source_id: string

    :returns: the new job as returned by ``harvest_job_dictize``
    :raises NotFound: if the source does not exist
    :raises Exception: if the source is not active
    :raises HarvestJobExists: if a job is already reported for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # warning() replaces the deprecated Logger.warn alias.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s', source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warning('There is already an unrun job %r for this source %s', pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Beispiel #14
0
    def after_show(self, context, data_dict):
        '''
        Adds harvest information to ``data_dict`` and returns it.

        A dataset whose ``type`` equals ``DATASET_TYPE_NAME`` gets a
        ``status`` entry from ``harvest_source_show_status``.  Any other
        dataset gets three harvest extras, but only when a current
        ``HarvestObject`` exists for it and ``context['validate']`` is falsy.
        '''
        is_harvest_source = ('type' in data_dict
                             and data_dict['type'] == DATASET_TYPE_NAME)

        if is_harvest_source:
            # This is a harvest source dataset, add extra info from the
            # HarvestSource object
            source = HarvestSource.get(data_dict['id'])
            if source:
                data_dict['status'] = harvest_logic.action.get.harvest_source_show_status(
                    context, {'id': source.id})
            else:
                log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
            return data_dict

        # This is a normal dataset, check if it was harvested and if so, add
        # info about the HarvestObject and HarvestSource
        current_object = (
            model.Session.query(HarvestObject)
            .filter(HarvestObject.package_id == data_dict['id'])
            # Query-expression comparison, so "== True" has to stay.
            .filter(HarvestObject.current == True)
            .first()
        )

        # validate is false is passed only on indexing.
        if current_object and not context.get('validate', True):
            harvest_extras = [
                ('harvest_object_id', current_object.id),
                ('harvest_source_id', current_object.source.id),
                ('harvest_source_title', current_object.source.title),
            ]
            for extra_key, extra_value in harvest_extras:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Beispiel #15
0
def harvest_source_index_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself.  This is useful to clean history of long running
    harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))

    solr_commit = toolkit.asbool(config.get('ckan.search.solr_commit', 'true'))
    if toolkit.check_ckan_version(max_version='2.5.99'):
        # conn is solrpy
        try:
            conn.delete_query(query)
            if solr_commit:
                conn.commit()
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)
        finally:
Beispiel #16
0
def harvest_job_list(context, data_dict):
    '''
    Auth function: may the context user list harvest jobs?

    Sysadmins are always allowed.  Other users must be logged in, belong to
    at least one ``publisher`` group, pass a ``source_id``, and that source's
    ``publisher_id`` must be one of their publisher groups.

    :param context: ``user`` is read from it
    :type context: dictionary
    :param data_dict: ``source_id`` selects the harvest source
    :type data_dict: dictionary

    :returns: ``{'success': True}`` when allowed, otherwise a dictionary
        with ``'success': False`` and a ``'msg'``
    :raises NotFound: if no harvest source matches ``source_id``
    '''
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {'success': False, 'msg': _('Only logged users are authorized to see their sources')}

    user_obj = User.get(user)

    if Authorizer().is_sysadmin(user):
        return {'success': True}

    # Checks for non sysadmin users
    publisher_groups = user_obj.get_groups(u'publisher') if user_obj else []
    if not publisher_groups:
        return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)}

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # No placeholder in this message: %-formatting it raised TypeError.
        return {'success': False, 'msg': _('Only sysadmins can list all harvest jobs')}

    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    if source.publisher_id not in [g.id for g in publisher_groups]:
        return {'success': False, 'msg': _('User %s not authorized to list jobs from source %s') % (str(user), source.id)}

    return {'success': True}
Beispiel #17
0
    def _test_auth_not_allowed(self, user_name=None, source=None, status=401):
        """Request every harvest page and expect ``status`` for each.

        A new ``ckan`` harvest source is added and committed when ``source``
        is not given.  With ``user_name`` the requests carry it (UTF-8
        encoded) as ``REMOTE_USER``; otherwise the environ is empty.
        """
        if not source:
            # Create harvest source
            source = HarvestSource(url=u'http://test-source.com', type='ckan')
            Session.add(source)
            Session.commit()

        extra_environ = {}
        if user_name:
            extra_environ = {'REMOTE_USER': user_name.encode('utf8')}

        # List, Create
        for path in ('/harvest', '/harvest/new'):
            self.app.get(path,
                         status=status,
                         extra_environ=extra_environ)

        # Read, Edit, Refresh
        for template in ('/harvest/%s',
                         '/harvest/edit/%s',
                         '/harvest/refresh/%s'):
            self.app.get(template % source.id,
                         status=status,
                         extra_environ=extra_environ)
Beispiel #18
0
def harvest_job_create(context, data_dict):
    '''
    Auth function: may the context user create a job for a harvest source?

    Anonymous users are refused and sysadmins are always allowed.  Any other
    user is allowed only when the source's ``publisher_id`` is one of the
    user's ``organization`` groups.

    :param context: ``user`` is read from it
    :type context: dictionary
    :param data_dict: must contain ``source_id``
    :type data_dict: dictionary

    :returns: ``{'success': True}`` when allowed, otherwise a dictionary
        with ``'success': False`` and a ``'msg'``
    :raises NotFound: if no harvest source matches ``source_id``
    '''
    user = context.get('user')

    # Read before the login check, so a missing key still raises KeyError.
    source_id = data_dict['source_id']

    if not user:
        return {
            'success': False,
            'msg': _('Non-logged in users are not authorized to create harvest jobs')
        }

    if ckan.new_authz.is_sysadmin(user):
        return {'success': True}

    user_obj = User.get(user)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    if not user_obj or source.publisher_id not in [
            g.id for g in user_obj.get_groups(u'organization')
    ]:
        return {
            'success': False,
            'msg': _('User %s not authorized to create a job for source %s') %
            (str(user), source.id)
        }

    return {'success': True}
def harvest_source_id_exists(value, context):
    '''
    Validator for harvest source ids.

    Returns ``value`` unchanged when ``HarvestSource.get(value)`` is truthy
    and raises ``Invalid`` otherwise.  ``context`` is accepted but not read.
    '''
    source_found = bool(HarvestSource.get(value))
    if source_found:
        return value

    message = 'Harvest Source with id %r does not exist.' % str(value)
    raise Invalid(message)
Beispiel #20
0
    def after_show(self, context, data_dict):
        '''
        Decorates a dataset dict with harvest details and returns it.

        Datasets that are not of type ``DATASET_TYPE_NAME`` get harvest
        extras when they have a current ``HarvestObject`` and the context's
        ``validate`` flag is falsy.  Datasets of that type get a ``status``
        entry, provided a matching ``HarvestSource`` is found.
        '''
        if 'type' not in data_dict or data_dict['type'] != DATASET_TYPE_NAME:
            # This is a normal dataset, check if it was harvested and if so, add
            # info about the HarvestObject and HarvestSource
            lookup = model.Session.query(HarvestObject)
            lookup = lookup.filter(HarvestObject.package_id == data_dict['id'])
            # Query-expression comparison, so "== True" has to stay.
            lookup = lookup.filter(HarvestObject.current == True)
            harvested = lookup.first()

            # validate is false is passed only on indexing.
            if harvested and not context.get('validate', True):
                extras = [
                    ('harvest_object_id', harvested.id),
                    ('harvest_source_id', harvested.source.id),
                    ('harvest_source_title', harvested.source.title),
                ]
                for name, content in extras:
                    _add_extra(data_dict, name, content)

            return data_dict

        # This is a harvest source dataset, add extra info from the
        # HarvestSource object
        source = HarvestSource.get(data_dict['id'])
        if not source:
            log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
            return data_dict

        status_action = harvest_logic.action.get.harvest_source_show_status
        data_dict['status'] = status_action(context, {'id': source.id})

        return data_dict
Beispiel #21
0
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    The source's documents are first removed from the search index, then a
    single SQL batch deletes the related rows, and finally the source's own
    package is re-indexed.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dictionary holding the ``id`` of the cleared source
    :raises NotFound: if ``HarvestSource.get`` finds no source for ``id``

    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # From here on use the id stored on the source object; this is also the
    # value interpolated into the SQL below.
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    # (the caller's original data_dict is passed on unchanged)
    harvest_source_index_clear(context, data_dict)

    # One multi-statement batch that carries its own begin/commit:
    #  1. mark the packages referenced by the source's harvest objects as
    #     'to_delete';
    #  2. delete the source's harvest object errors, extras, objects,
    #     gather errors and jobs;
    #  3. delete the rows that hang off the marked packages (roles,
    #     revisions, tags, resources, extras, members, resource groups);
    #  4. delete the marked packages themselves.
    # Statement order matters: the subselects on harvest_object and on
    # package state must still see their rows when they run.
    sql = '''begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;'''.format(
        harvest_source_id=harvest_source_id)

    model = context['model']

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    # NOTE: this mutates the caller's context (validate / ignore_auth).
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(context, {
        'id': harvest_source_id
    })

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
Beispiel #22
0
def harvest_job_create(context, data_dict):
    '''
    Saves a new harvest job for the source named by ``source_id``.

    The source must exist and be active, and ``harvest_job_list`` must not
    return any job in status ``New`` for it.

    :param source_id: id of the harvest source the job is created for
    :type source_id: string

    :returns: the new job as returned by ``harvest_job_dictize``
    :raises NotFound: if the source does not exist
    :raises HarvestError: if the source is inactive or already has an
        unrun job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # warning() replaces the deprecated Logger.warn alias.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source.  The lookup
    # gets its own dict instead of overwriting the data_dict parameter.
    new_jobs_query = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, new_jobs_query)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Beispiel #23
0
def harvest_source_show(context, data_dict):
    '''
    Returns the dictized form of a harvest source

    The source is looked up with ``HarvestSource.get`` and stored in
    ``context['source']`` (also when nothing was found).  Unless the caller
    already set it, ``context['include_status']`` is switched on before the
    source is dictized.

    :param id: value used to look the harvest source up
    :type id: string
    :param attr: optional, passed through to ``HarvestSource.get`` as
        ``attr``
    :type attr: string

    :returns: harvest source metadata
    :rtype: dictionary
    :raises NotFound: if no harvest source was found
    '''
    check_access('harvest_source_show', context, data_dict)

    source_ref = data_dict.get('id')
    lookup_attr = data_dict.get('attr')

    source = HarvestSource.get(source_ref, attr=lookup_attr)
    context['source'] = source

    if not source:
        raise NotFound

    context.setdefault('include_status', True)

    return harvest_source_dictize(source, context)
Beispiel #24
0
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    Steps: remove the source's documents from the search index, run one SQL
    batch that deletes the related rows, then re-index the source's own
    package.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dictionary holding the ``id`` of the cleared source
    :raises NotFound: if ``HarvestSource.get`` finds no source for ``id``

    """
    check_access("harvest_source_clear", context, data_dict)

    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    # Switch to the id stored on the source object; it is the value that
    # gets interpolated into the SQL below.
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    # (the caller's original data_dict is passed on unchanged)
    harvest_source_index_clear(context, data_dict)

    # A single batch with its own begin/commit.  It marks the packages of
    # the source's harvest objects as 'to_delete', removes the source's
    # harvest objects (with errors and extras), gather errors and jobs, then
    # removes everything attached to the marked packages and finally the
    # packages themselves.  The statements depend on this order: later
    # subselects rely on rows and states set up by earlier ones.
    sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;""".format(
        harvest_source_id=harvest_source_id
    )

    model = context["model"]

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    # NOTE: the caller's context is modified in place here.
    context.update({"validate": False, "ignore_auth": True})
    package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {"id": harvest_source_id}
Beispiel #25
0
 def test_form_bound_to_new_object(self):
     """The rendered fieldset shows the field names and values of an
     unsaved ``HarvestSource`` it is bound to."""
     unsaved_source = HarvestSource(
         url=u'http://localhost/',
         description=u'My source',
         type=u'Gemini',
     )
     fieldset = form.get_harvest_source_fieldset().bind(unsaved_source)
     rendered = fieldset.render()

     expected_fragments = ('url', 'http://localhost/', 'description', 'My source')
     for fragment in expected_fragments:
         assert fragment in rendered
Beispiel #26
0
 def test_form_validate_new_object_and_sync(self):
     """Submitting valid form data creates a source that can be fetched
     by its url afterwards."""
     source_url = u'http://localhost/'

     # No source with this url may exist beforehand.
     assert not HarvestSource.get(source_url, None, 'url')

     submitted = {
         'HarvestSource--url': source_url,
         'HarvestSource--type': u'Gemini',
         'HarvestSource--description': u'My source',
     }
     fieldset = form.get_harvest_source_fieldset()
     fieldset = fieldset.bind(HarvestSource, data=submitted, session=model.Session)

     # Test bound_fields.validate().
     fieldset.validate()
     assert not fieldset.errors

     # Test bound_fields.sync().
     fieldset.sync()
     model.Session.commit()

     saved_source = HarvestSource.get(source_url, None, 'url')
     assert saved_source.id
Beispiel #27
0
def _update_harvest_source_object(context, data_dict):
    '''
        Copies the values of a harvest_source dataset dict onto the
        existing HarvestSource object with the same id.

        Validation and authorization checks should have been done by now,
        so this function is not to be used directly to update harvest
        sources. The object is added to the session but not committed.

        :param data_dict: A standard package data_dict

        :returns: The updated HarvestSource object
        :rtype: HarvestSource object
        :raises logic.NotFound: if there is no HarvestSource with that id
    '''

    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    # Only keys that are present with a non-None value are copied over
    copied_fields = ('url', 'title', 'description', 'user_id',
                     'publisher_id', 'frequency', 'time')
    for name in copied_fields:
        if data_dict.get(name) is None:
            continue
        if name == 'url':
            # The stripped URL is written back into data_dict as well
            data_dict[name] = data_dict[name].strip()
        setattr(source, name, data_dict[name])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict['state'] == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    if source.active:
        return source

    # Abort any pending jobs
    pending_jobs = HarvestJob.filter(source=source, status=u'New')
    log.info(
        'Harvest source %s not active, so aborting %i outstanding jobs',
        source_id, pending_jobs.count())
    if pending_jobs:
        for pending in pending_jobs:
            pending.status = u'Aborted'
            pending.add()

    return source
    def setup(self):
        """Create the user, harvest source, job and harvest object each test uses.

        The results are stored on ``self`` as ``harvestUser``,
        ``oHarvestSource``, ``oHarvestJob``, ``oHarvestObject`` and
        ``context``.
        """
        print("")
        print("TestUM:setup() before each test method")

        # Add sysadmin user
        self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds',
        }
        user_context = {'model': model, 'session': model.Session, 'user': u'harvest'}

        # Under the publisher auth profile the source needs a publisher id
        uses_publisher_profile = config.get('ckan.harvest.auth.profile') == u'publisher'
        if uses_publisher_profile and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        created_source = get_action('harvest_source_create')(user_context, source_fixture)
        self.oHarvestSource = HarvestSource.get(created_source['id'])

        created_job = get_action('harvest_job_create')(
            user_context, {'source_id': self.oHarvestSource.id})
        self.oHarvestJob = HarvestJob.get(created_job['id'])

        # The harvest object is created with authorization checks disabled
        object_context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }
        object_fixture = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {'a key': 'a value'},
        }
        created_object = toolkit.get_action('harvest_object_create')(object_context, object_fixture)
        self.oHarvestObject = HarvestObject.get(created_object['id'])

        update_schema = default_update_package_schema()
        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': update_schema,
            'api_version': '2',
        }
Beispiel #29
0
    def after_show(self, context, data_dict):
        """Add harvest information to a dataset dict and return it.

        Harvest source datasets get a ``status`` entry produced by the
        ``harvest_source_show_status`` action. Any other dataset has the
        harvest extras removed and, when ``context['validate']`` is False
        and a current HarvestObject exists for it, added again from that
        object.
        """
        is_harvest_source = (
            'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME)

        if is_harvest_source:
            # This is a harvest source dataset, add extra info from the
            # HarvestSource object
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(
                    data_dict['id']))
                return data_dict

            st_action_name = 'harvest_source_show_status'
            try:
                status_action = p.toolkit.get_action(st_action_name)
            except KeyError:
                # Retry the lookup once after clearing the actions cache
                logic.clear_actions_cache()
                status_action = p.toolkit.get_action(st_action_name)

            data_dict['status'] = status_action(context, {'id': source.id})
            return data_dict

        # This is a normal dataset, check if it was harvested and if so, add
        # info about the HarvestObject and HarvestSource
        object_query = model.Session.query(HarvestObject)
        object_query = object_query.filter(
            HarvestObject.package_id == data_dict['id'])
        # '== True' is intentional: it is a query filter expression, not a
        # Python truth test
        object_query = object_query.filter(HarvestObject.current == True)
        harvest_object = object_query.first()

        # If the harvest extras are there, remove them. This can happen eg
        # when calling package_update or resource_update, which call
        # package_show
        harvest_keys = (
            'harvest_object_id',
            'harvest_source_id',
            'harvest_source_title',
        )
        if data_dict.get('extras'):
            # Slice assignment keeps the same list object in data_dict
            data_dict['extras'][:] = [
                extra for extra in data_dict['extras']
                if extra['key'] not in harvest_keys
            ]

        # We only want to add these extras at index time so they are part
        # of the cached data_dict used to display, search results etc. We
        # don't want them added when editing the dataset, otherwise we get
        # duplicated key errors.
        # The only way to detect indexing right now is checking that
        # validate is set to False.
        if harvest_object and not context.get('validate', True):
            linked_source = harvest_object.source
            harvest_extras = (
                ('harvest_object_id', harvest_object.id),
                ('harvest_source_id', linked_source.id),
                ('harvest_source_title', linked_source.title),
            )
            for extra_key, extra_value in harvest_extras:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Beispiel #30
0
    def run_job_synchronously(self):
        """Run a harvest job for one source inside this process.

        The source id is read from ``self.args[1]``. A job is created with
        status "Running", the matching harvester's gather stage is run and
        every gathered object is passed to ``fetch_and_import_stages``. On
        success the job is marked "Done" and the source dataset is
        reindexed. Whatever happens after the job was created, it is saved
        with a finish time and a status of "Done" or "Error".

        Nothing is run (a message is printed instead) when the source does
        not exist or no harvester handles its type.
        """
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        if not source:
            # Without this guard the harvester lookup below fails with an
            # AttributeError on ``source.type``.
            print("Harvest source %s does not exist." % source_id)
            return

        # Pick the first harvester whose name matches the source type
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3
            print("No harvester found to handle the job.")
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            # Runs on success and on failure; any job that did not reach
            # "Done" is recorded as an error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
Beispiel #31
0
def get_source_object(context, data_dict=None):
    '''
        Returns the harvest source for the given context and data dict.

        A source stored under ``context['source']`` is returned as is;
        otherwise the source is looked up by ``data_dict['id']``.

        :param context: dict that may already hold the source
        :param data_dict: optional dict holding the ``id`` of the source

        :returns: The harvest source
        :raises NotFound: if no source is found for the id
    '''
    if 'source' in context:
        return context['source']

    # A None default replaces the shared mutable ``{}`` default
    source_id = (data_dict or {}).get('id')
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    return source
Beispiel #32
0
def get_source_object(context, data_dict=None):
    '''
        Returns the harvest source for the given context and data dict.

        A source stored under ``context['source']`` is returned as is;
        otherwise the source is looked up by ``data_dict['id']``.

        :param context: dict that may already hold the source
        :param data_dict: optional dict holding the ``id`` of the source

        :returns: The harvest source
        :raises NotFound: if no source is found for the id
    '''
    if 'source' in context:
        return context['source']

    # A None default replaces the shared mutable ``{}`` default
    source_id = (data_dict or {}).get('id')
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    return source
def _update_harvest_source_object(context, data_dict):
    '''
        Copies the values of a harvest_source dataset dict onto the
        existing HarvestSource object with the same id.

        Validation and authorization checks should have been done by now,
        so this function is not to be used directly to update harvest
        sources. The object is added to the session but not committed.

        :param data_dict: A standard package data_dict

        :returns: The updated HarvestSource object
        :rtype: HarvestSource object
        :raises logic.NotFound: if there is no HarvestSource with that id
    '''

    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    # Only keys that are present with a non-None value are copied over
    copied_fields = ('url', 'title', 'description', 'user_id',
                     'publisher_id', 'frequency')
    for name in copied_fields:
        if data_dict.get(name) is None:
            continue
        if name == 'url':
            # The stripped URL is written back into data_dict as well
            data_dict[name] = data_dict[name].strip()
        setattr(source, name, data_dict[name])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict['state'] == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    if source.active:
        return source

    # Abort any pending jobs
    pending_jobs = HarvestJob.filter(source=source, status=u'New')
    log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, pending_jobs.count())
    if pending_jobs:
        for pending in pending_jobs:
            pending.status = u'Aborted'
            pending.add()

    return source
Beispiel #34
0
    def _create_source(self, source_fixture=FISBROKER_HARVESTER_CONFIG):
        """Create a harvest source via ``harvest_source_create`` and return its HarvestSource object."""
        create_source = get_action('harvest_source_create')
        action_context = {'model': model, 'session': Session, 'user': u'harvest'}

        created = create_source(action_context, source_fixture)
        harvest_source = HarvestSource.get(created['id'])
        # The source must be retrievable by the id the action returned
        assert harvest_source

        return harvest_source
Beispiel #35
0
 def test_form_bound_to_existing_object(self):
     """A fieldset bound to a committed source renders its url and description."""
     stored = HarvestSource(url=u'http://localhost/', description=u'My source', type=u'Gemini')
     model.Session.add(stored)
     model.Session.commit()
     # The session is removed before the form is bound to the source
     model.Session.remove()

     rendered = form.get_harvest_source_fieldset().bind(stored).render()
     # Both the field names and the values must show up in the output.
     for fragment in ('url', 'http://localhost/', 'description', 'My source'):
         assert fragment in rendered
Beispiel #36
0
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source and returns it dictized.

    The data dict is validated against ``context['schema']`` (or the
    default harvest source schema). If the source ends up inactive, its
    jobs with status "New" are marked "Aborted".

    :raises NotFound: if there is no source with the given id
    :raises ValidationError: if the data dict does not validate
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    # Only validated keys with a non-None value are copied over
    for name in ('url', 'title', 'type', 'description', 'user_id', 'publisher_id'):
        if data.get(name) is None:
            continue
        if name == 'url':
            data[name] = data[name].strip()
        setattr(source, name, data[name])

    # These two are only touched when the caller sent them, but the value
    # used is the validated one
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, pending_jobs.count())
        if pending_jobs:
            for pending in pending_jobs:
                pending.status = u'Aborted'
                pending.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
Beispiel #37
0
def harvest_source_show(context, data_dict):
    '''
    Returns the dictized harvest source identified by ``data_dict['id']``.

    The optional ``data_dict['attr']`` is passed on to ``HarvestSource.get``.

    :raises NotFound: if no source is found
    '''
    p.toolkit.check_access('harvest_source_show', context, data_dict)

    source = HarvestSource.get(data_dict.get('id'),
                               attr=data_dict.get('attr', None))
    if source:
        return harvest_source_dictize(source, context)

    raise NotFound
Beispiel #38
0
def harvest_source_show(context, data_dict):
    '''
    Returns the dictized harvest source identified by ``data_dict['id']``.

    The optional ``data_dict['attr']`` is passed on to ``HarvestSource.get``.

    :raises NotFound: if no source is found
    '''
    check_access('harvest_source_show', context, data_dict)

    source = HarvestSource.get(data_dict.get('id'),
                               attr=data_dict.get('attr', None))
    if source:
        return harvest_source_dictize(source, context)

    raise NotFound
Beispiel #39
0
 def harvest_sources(self):
     """Save one DDI and one OAI-PMH harvest source."""
     source_specs = (
         ('http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt', 'DDI'),
         ('http://helda.helsinki.fi/oai/request', 'OAI-PMH'),
     )
     # Each source is saved before the next one is created
     for source_url, source_type in source_specs:
         HarvestSource(url=source_url, type=source_type).save()
    def after_show(self, context, data_dict):
        """Add harvest information to a dataset dict and return it.

        Harvest source datasets get a ``status`` entry produced by the
        ``harvest_source_show_status`` action. Any other dataset has the
        harvest extras removed and, when ``context['validate']`` is False
        and a current HarvestObject exists for it, added again from that
        object.
        """
        is_harvest_source = (
            'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME)

        if is_harvest_source:
            # This is a harvest source dataset, add extra info from the
            # HarvestSource object
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
                return data_dict

            st_action_name = 'harvest_source_show_status'
            try:
                status_action = p.toolkit.get_action(st_action_name)
            except KeyError:
                # Retry the lookup once after clearing the actions cache
                logic.clear_actions_cache()
                status_action = p.toolkit.get_action(st_action_name)

            data_dict['status'] = status_action(context, {'id': source.id})
            return data_dict

        # This is a normal dataset, check if it was harvested and if so, add
        # info about the HarvestObject and HarvestSource
        object_query = model.Session.query(HarvestObject)
        object_query = object_query.filter(
            HarvestObject.package_id == data_dict['id'])
        # '== True' is intentional: it is a query filter expression, not a
        # Python truth test
        object_query = object_query.filter(HarvestObject.current == True)
        harvest_object = object_query.first()

        # If the harvest extras are there, remove them. This can happen eg
        # when calling package_update or resource_update, which call
        # package_show
        harvest_keys = ('harvest_object_id', 'harvest_source_id', 'harvest_source_title',)
        if data_dict.get('extras'):
            # Slice assignment keeps the same list object in data_dict
            data_dict['extras'][:] = [
                extra for extra in data_dict['extras']
                if extra['key'] not in harvest_keys
            ]

        # We only want to add these extras at index time so they are part
        # of the cached data_dict used to display, search results etc. We
        # don't want them added when editing the dataset, otherwise we get
        # duplicated key errors.
        # The only way to detect indexing right now is checking that
        # validate is set to False.
        if harvest_object and not context.get('validate', True):
            linked_source = harvest_object.source
            harvest_extras = (
                ('harvest_object_id', harvest_object.id),
                ('harvest_source_id', linked_source.id),
                ('harvest_source_title', linked_source.title),
            )
            for extra_key, extra_value in harvest_extras:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Beispiel #41
0
    def _create_source_and_job(self, source_fixture):
        """Create a harvest source from the fixture plus a job for it.

        Returns the ``(source, job)`` pair. Under the publisher auth
        profile a missing ``publisher_id`` is filled into the fixture.
        """
        action_context = {
            "model": model,
            "session": Session,
            "user": u"harvest",
        }

        uses_publisher_profile = config.get("ckan.harvest.auth.profile") == u"publisher"
        if uses_publisher_profile and "publisher_id" not in source_fixture:
            source_fixture["publisher_id"] = self.publisher.id

        created = get_action("harvest_source_create")(action_context, source_fixture)
        harvest_source = HarvestSource.get(created["id"])
        assert harvest_source

        return harvest_source, self._create_job(harvest_source.id)
Beispiel #42
0
def _create_harvest_source_object(context, data_dict):
    '''
        Builds a new HarvestSource object from the data dict of a
        harvest_source dataset and adds it to the session.

        Validation and authorization checks should have been done by now,
        so this function is not to be used directly to create harvest
        sources. The new harvest source gets the same id as the dataset.

        :param data_dict: A standard package data_dict

        :returns: The created HarvestSource object
        :rtype: HarvestSource object
    '''

    log.info('Creating harvest source: %r', data_dict)

    source = HarvestSource()

    # Required keys: a missing one raises KeyError
    source.id = data_dict['id']
    source.url = data_dict['url'].strip()

    # Avoids clashes with the dataset type
    source.type = data_dict['source_type']

    # Optional keys are copied only when present and not None
    optional_fields = ('active', 'title', 'description', 'user_id',
                       'publisher_id', 'config', 'frequency')
    for name in optional_fields:
        value = data_dict.get(name)
        if value is not None:
            setattr(source, name, value)

    # The dataset state has the last word on 'active', overriding any
    # value copied in the loop above
    source.active = data_dict.get('state', None) != 'deleted'

    # Don't commit yet, let package_create do it
    source.add()
    log.info('Harvest source created: %s', source.id)

    return source
Beispiel #43
0
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source and returns it dictized.

    The data dict is validated against ``context['schema']`` (or the
    default harvest source schema). If the source ends up inactive, its
    jobs with status "New" are marked "Aborted".

    :raises NotFound: if there is no source with the given id
    :raises ValidationError: if the data dict does not validate
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    # Only validated keys with a non-None value are copied over
    for name in ('url', 'title', 'type', 'description', 'user_id', 'publisher_id'):
        if data.get(name) is None:
            continue
        if name == 'url':
            data[name] = data[name].strip()
        setattr(source, name, data[name])

    # These two are only touched when the caller sent them, but the value
    # used is the validated one
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, pending_jobs.count())
        if pending_jobs:
            for pending in pending_jobs:
                pending.status = u'Aborted'
                pending.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
Beispiel #44
0
def harvest_objects_import(context, data_dict):
    '''
        Reimports the current harvest objects
        It performs the import stage with the last fetched objects, optionally
        belonging to a certain source.
        Please note that no objects will be fetched from the remote server.
        It will only affect the last fetched objects already present in the
        database.

        :param source_id: optional id of the source whose objects should be
            reimported; all current objects are reimported when missing

        :returns: the reimported harvest objects, dictized
        :raises NotFound: if ``source_id`` is given but no such source exists
        :raises Exception: if the given source is not active
    '''
    log.info('Harvest objects import: %r', data_dict)
    check_access('harvest_objects_import', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('source_id', None)

    id_query = session.query(HarvestObject.id)
    if source_id:
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            raise NotFound('Harvest source %s does not exist' % source_id)

        if not source.active:
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning('Harvest source %s is not active.', source_id)
            raise Exception('This harvest source is not active')

        id_query = id_query.join(HarvestSource).join(Package) \
            .filter(HarvestObject.source == source)
    else:
        id_query = id_query.join(Package)

    # Only current objects of active packages are reimported.
    # '== True' is intentional: it is a query filter expression.
    last_objects_ids = id_query \
        .filter(HarvestObject.current == True) \
        .filter(Package.state == u'active') \
        .all()

    last_objects = []
    for obj_id in last_objects_ids:
        obj = session.query(HarvestObject).get(obj_id)
        # The first harvester whose name matches the source type does the import
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == obj.source.type:
                if hasattr(harvester, 'force_import'):
                    harvester.force_import = True
                harvester.import_stage(obj)
                break
        last_objects.append(harvest_object_dictize(obj, context))
    log.info('Harvest objects imported: %r', last_objects)
    return last_objects
Beispiel #45
0
    def setup_class(cls):
        """Create test data with a harvest object linked to the 'annakarenina' package."""
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()

        test_source = HarvestSource(url=u'http://test-source.org', type='test')
        test_source.save()

        test_job = HarvestJob(source=test_source)
        test_job.save()

        harvest_object = HarvestObject(
            package=model.Package.by_name(u'annakarenina'),
            job=test_job,
            guid=u'test-guid',
            content=u'<xml>test content</xml>',
        )
        harvest_object.save()

        # Save a reference to the harvest object in the package
        model.repo.new_revision()
        package = model.Package.by_name(u'annakarenina')
        package.extras['harvest_object_id'] = harvest_object.id
        package.save()

        model.repo.commit_and_remove()
Beispiel #46
0
    def _create_source_and_job(self, source_fixture):
        """Create a harvest source from the fixture plus a job for it.

        Returns the ``(source, job)`` pair. A missing ``publisher_id`` is
        filled into the fixture from ``self.publisher``.
        """
        action_context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
        }

        if 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher['id']

        create_source = get_action('harvest_source_create')
        created = create_source(action_context, source_fixture)
        harvest_source = HarvestSource.get(created['id'])
        assert harvest_source

        return harvest_source, self._create_job(harvest_source.id)
Beispiel #47
0
    def setup_class(cls):
        """Create test data with a harvest object linked to the 'annakarenina' package."""
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()

        test_source = HarvestSource(url=u'http://test-source.org', type='test')
        test_source.save()

        test_job = HarvestJob(source=test_source)
        test_job.save()

        harvest_object = HarvestObject(
            package=model.Package.by_name(u'annakarenina'),
            job=test_job,
            guid=u'test-guid',
            content=u'<xml>test content</xml>',
        )
        harvest_object.save()

        # Save a reference to the harvest object in the package
        model.repo.new_revision()
        package = model.Package.by_name(u'annakarenina')
        package.extras['harvest_object_id'] = harvest_object.id
        package.save()

        model.repo.commit_and_remove()
Beispiel #48
0
    def run_job_synchronously(self):
        """Run a harvest job for one source inside this process.

        The source id is read from ``self.args[1]``. A job is created with
        status "Running", the matching harvester's gather stage is run and
        every gathered object is passed to ``fetch_and_import_stages``. On
        success the job is marked "Done" and the source dataset is
        reindexed. Whatever happens after the job was created, it is saved
        with a finish time and a status of "Done" or "Error".

        Nothing is run (a message is printed instead) when the source does
        not exist or no harvester handles its type.
        """
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        if not source:
            # Without this guard the harvester lookup below fails with an
            # AttributeError on ``source.type``.
            print("Harvest source %s does not exist." % source_id)
            return

        # Pick the first harvester whose name matches the source type
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3
            print("No harvester found to handle the job.")
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(get_action('package_show')({'validate': False, 'ignore_auth': True}, {'id': source.id}))
        finally:
            # Runs on success and on failure; any job that did not reach
            # "Done" is recorded as an error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the created job, dictized
    :raises toolkit.ObjectNotFound: if the source does not exist
    :raises HarvestSourceInactiveError: if the source is not active
    :raises HarvestJobExists: if the source already has an unrun job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is used throughout
        log.warning('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                     source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(context, {
            'id': job.id
        })

    return harvest_job_dictize(job, context)
    def _create_source_and_job(self, source_fixture):
        """Create a harvest source from the fixture plus a job for it.

        Returns the ``(source, job)`` pair. A missing ``publisher_id`` is
        filled into the fixture from ``self.publisher``.
        """
        action_context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
        }

        if 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher['id']

        create_source = get_action('harvest_source_create')
        created = create_source(action_context, source_fixture)
        harvest_source = HarvestSource.get(created['id'])
        assert harvest_source

        return harvest_source, self._create_job(harvest_source.id)
def harvest_object_list(context, data_dict):
    '''
    Decides whether the user in the context may list harvest objects.

    Sysadmins are always allowed. Any other user must be logged in, belong
    to at least one publisher group, ask for a specific ``source_id`` and
    belong to the publisher that source is assigned to.

    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises NotFound: if the requested source does not exist
    '''
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {
            'success': False,
            'msg': _('Only logged users are authorized to see their sources')
        }

    user_obj = User.get(user)

    # Checks for non sysadmin users
    if not Authorizer().is_sysadmin(user):
        # Fetched once and reused for both publisher checks below
        publisher_groups = user_obj.get_groups(u'publisher') if user_obj else []
        if len(publisher_groups) == 0:
            return {
                'success': False,
                'msg':
                _('User %s must belong to a publisher to list harvest objects')
                % str(user)
            }

        source_id = data_dict.get('source_id', False)
        if not source_id:
            # This message has no placeholder, so it must not be
            # %-formatted: doing so raised a TypeError instead of returning
            # the denial.
            return {
                'success': False,
                'msg': _('Only sysadmins can list all harvest objects')
            }

        source = HarvestSource.get(source_id)
        if not source:
            raise NotFound

        if source.publisher_id not in [g.id for g in publisher_groups]:
            return {
                'success': False,
                'msg':
                _('User %s not authorized to list objects from source %s') %
                (str(user), source.id)
            }

    return {'success': True}
    def _create_source_and_job(self, source_fixture):
        """Create a harvest source from the fixture plus a job for it.

        Returns the ``(source, job)`` pair. Under the publisher auth
        profile a missing ``publisher_id`` is filled into the fixture.
        """
        action_context = {'model': model, 'session': Session, 'user': u'harvest'}

        uses_publisher_profile = config.get('ckan.harvest.auth.profile') == u'publisher'
        if uses_publisher_profile and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        create_source = get_action('harvest_source_create')
        created = create_source(action_context, source_fixture)
        harvest_source = HarvestSource.get(created['id'])
        assert harvest_source

        return harvest_source, self._create_job(harvest_source.id)
Beispiel #53
0
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the created job, dictized
    :raises toolkit.ObjectNotFound: if the source does not exist
    :raises HarvestSourceInactiveError: if the source is not active
    :raises HarvestJobExists: if the source already has an unrun job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is used throughout
        log.warning('Harvest source %s does not exist', source_id)
        # The CKAN plugins toolkit exposes the not-found error as
        # ObjectNotFound; it has no NotFound attribute.
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
def _create_harvest_source_object(context, data_dict):
    '''
    Creates an actual HarvestSource object with the data dict
    of the harvest_source dataset. All validation and authorization
    checks should be done by now, so this function is not to be used
    directly to create harvest sources. The created harvest source will
    have the same id as the dataset.

    :param context: not used by this function
    :param data_dict: A standard package data_dict; must contain the
        ``id``, ``url`` and ``source_type`` keys

    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''

    log.info('Creating harvest source: %r', data_dict)

    source = HarvestSource()

    source.id = data_dict['id']
    source.url = data_dict['url'].strip()

    # Avoids clashes with the dataset type
    source.type = data_dict['source_type']

    # Optional fields are only copied when present and not None, so the
    # attributes of the new object are otherwise left untouched.
    optional_fields = ('active', 'title', 'description', 'user_id',
                       'publisher_id', 'config', 'frequency')
    for field in optional_fields:
        value = data_dict.get(field)
        if value is not None:
            setattr(source, field, value)

    # The dataset state overrides any 'active' value copied above
    source.active = data_dict.get('state') != 'deleted'

    # Don't commit yet, let package_create do it
    source.add()
    log.info('Harvest source created: %s', source.id)

    return source
Beispiel #55
0
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source with the validated values of
    ``data_dict`` and aborts its pending jobs if it ends up inactive.

    :param context: must contain ``session``; may contain ``schema`` to
        override ``default_harvest_source_schema()``
    :param data_dict: new values for the source; ``id`` identifies the
        source to update

    :returns: the updated source, as produced by ``harvest_source_dictize``
    :raises NotFound: if no harvest source has the given id
    :raises ValidationError: if ``data_dict`` does not validate against
        the schema (the session is rolled back first)
    '''

    check_access('harvest_source_update', context, data_dict)

    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    # Only overwrite attributes for which a non-None value was validated
    fields = ('url', 'title', 'type', 'description', 'user_id',
              'publisher_id')
    for field in fields:
        if field in data and data[field] is not None:
            setattr(source, field, data[field])

    # 'active' and 'config' are applied whenever the caller supplied the key,
    # so they can also be set to falsy values.
    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    return harvest_source_dictize(source, context)
    def _create_source_and_job(self):
        '''
        Create a ``csw`` harvest source through the ``harvest_source_create``
        action and then a job for it via ``self._create_job``.

        :returns: ``(source, job)`` tuple
        '''
        action_context = {'model': model,
                          'session': model.Session,
                          'user': u'harvest'}
        fixture = {
            'url': u'http://csw/GetCapabilities',
            'type': u'csw',
        }

        # Under the publisher auth profile the source needs a publisher id
        publisher_profile = (
            config.get('ckan.harvest.auth.profile') == u'publisher')
        if publisher_profile and 'publisher_id' not in fixture:
            fixture['publisher_id'] = self.publisher.id

        create_source = get_action('harvest_source_create')
        created = create_source(action_context, fixture)

        # Fetch the model object back by id to make sure it was stored
        harvest_source = HarvestSource.get(created['id'])
        assert harvest_source

        return harvest_source, self._create_job(harvest_source.id)
Beispiel #57
0
    def _create_source_and_job(self):
        '''
        Register a ``csw`` harvest source as the ``harvest`` user and create
        a job for it.

        :returns: tuple of the HarvestSource fetched back by id and the
            result of ``self._create_job``
        '''
        action_context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
        }
        fixture = {'url': u'http://csw/GetCapabilities', 'type': u'csw'}

        uses_publisher_auth = (
            config.get('ckan.harvest.auth.profile') == u'publisher'
        )
        if uses_publisher_auth and 'publisher_id' not in fixture:
            fixture['publisher_id'] = self.publisher.id

        created = get_action('harvest_source_create')(action_context, fixture)
        harvest_source = HarvestSource.get(created['id'])
        assert harvest_source

        new_job = self._create_job(harvest_source.id)
        return harvest_source, new_job
Beispiel #58
0
def get_harvest_source_config(harvester_id):
    '''
    Return the JSON config of a harvest source as a dict.

    For the keys ``default_groups``, ``private_datasets`` and
    ``validator_profiles`` a non-empty list value is replaced by its first
    element.

    :param harvester_id: id passed to ``HarvestSource.get``

    :returns: the parsed config, or an empty dict when the source cannot be
        loaded, its config cannot be parsed, or the config is not a JSON
        object
    :rtype: dict
    '''
    keys_lookfor = (
        'default_groups',
        'private_datasets',
        'validator_profiles',
    )
    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Best effort: a missing source or a missing/invalid config means
        # "no config". Unlike a bare except, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return {}

    # Anything other than a JSON object cannot be looked up by key
    if not isinstance(source_config, dict):
        return {}

    # convert single string element list to string
    for key in keys_lookfor:
        value = source_config.get(key)
        # Empty lists are left as they are; there is no element to pick
        if isinstance(value, list) and value:
            source_config[key] = value[0]
    return source_config
Beispiel #59
0
def get_harvest_source_config(harvester_id):
    '''
    Return the JSON config of a harvest source as a dict.

    For the keys ``default_groups``, ``private_datasets`` and
    ``validator_profiles`` a non-empty list value is replaced by its first
    element.

    :param harvester_id: id passed to ``HarvestSource.get``

    :returns: the parsed config, or an empty dict when the source cannot be
        loaded, its config cannot be parsed, or the config is not a JSON
        object
    :rtype: dict
    '''
    keys_lookfor = (
        'default_groups',
        'private_datasets',
        'validator_profiles',
    )
    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Best effort: a missing source or a missing/invalid config means
        # "no config". Unlike a bare except, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return {}

    # Anything other than a JSON object cannot be looked up by key
    if not isinstance(source_config, dict):
        return {}

    # convert single string element list to string
    for key in keys_lookfor:
        value = source_config.get(key)
        # Empty lists are left as they are; there is no element to pick
        if isinstance(value, list) and value:
            source_config[key] = value[0]
    return source_config