Ejemplo n.º 1
0
def harvest_source_index_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself.  This is useful to clean history of long running
    harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))

    solr_commit = toolkit.asbool(config.get('ckan.search.solr_commit', 'true'))
    if toolkit.check_ckan_version(max_version='2.5.99'):
        # conn is solrpy
        try:
            conn.delete_query(query)
            if solr_commit:
                conn.commit()
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)
        finally:
Ejemplo n.º 2
0
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the search index every document that is tagged with the
    given harvest source and with this site's ``ckan.site_id``.

    Only the search index is touched here; no database rows are removed.

    :param id: the id of the harvest source whose documents are removed
    :type id: string

    :raises NotFound: if ``HarvestSource.get`` finds no matching source
    :raises SearchIndexError: if the delete query or the commit fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # From here on always use the id stored on the source object.
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # The commit is optional and controlled by configuration.
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # old "except X, e" form.
        log.exception(e)
        raise SearchIndexError(e)
Ejemplo n.º 3
0
Archivo: get.py Proyecto: tbalaz/test
def harvest_source_show(context,data_dict):
    '''
    Returns the metadata of a harvest source

    The source is looked up with ``HarvestSource.get`` and serialised with
    ``harvest_source_dictize``. The source object is stored in
    ``context['source']`` and ``context['include_status']`` defaults to
    ``True`` when the caller did not set it.

    :param id: the id or name of the harvest source
    :type id: string
    :param attr: optional value forwarded to ``HarvestSource.get`` as ``attr``
    :type attr: string

    :returns: harvest source metadata
    :rtype: dictionary

    :raises NotFound: if no matching harvest source exists
    '''
    check_access('harvest_source_show',context,data_dict)

    source_id = data_dict.get('id')
    attr = data_dict.get('attr',None)

    source = HarvestSource.get(source_id,attr=attr)
    if not source:
        raise NotFound

    # Only publish the source in the context once we know it exists, so a
    # failed lookup does not leave ``context['source'] = None`` behind.
    context['source'] = source
    context.setdefault('include_status', True)

    return harvest_source_dictize(source,context)
Ejemplo n.º 4
0
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    The search index is cleared first, then the database rows are removed
    with one raw SQL script, and finally the harvest source dataset itself
    is re-indexed.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dictionary holding the id of the cleared source
    :rtype: dictionary

    :raises NotFound: if ``HarvestSource.get`` finds no matching source
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # From here on use the id stored on the source object.
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    # The script runs inside its own begin/commit. It first flags every
    # package referenced by this source's harvest objects as 'to_delete',
    # then removes the harvest objects/jobs (and their errors/extras), and
    # finally removes the rows that belong to the flagged packages before
    # deleting the packages themselves. Statement order matters.
    # NOTE(review): the id is interpolated into the SQL text with
    # str.format; it comes from source.id rather than directly from the
    # caller, but bound parameters would be safer - confirm before changing.
    sql = '''begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;'''.format(
        harvest_source_id=harvest_source_id)

    model = context['model']

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    # (the context is mutated in place: validation off, auth ignored).
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(context, {
        'id': harvest_source_id
    })

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
Ejemplo n.º 5
0
def harvest_job_create(context, data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: the id of the harvest source to create a job for
    :type source_id: string

    :returns: the new job as produced by ``harvest_job_dictize``

    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive or already has a job
        with status ``New``
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. A separate
    # dict is used so the caller's data_dict name is not rebound.
    unrun_jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if unrun_jobs:
        log.warning('There is already an unrun job %r for this source %s',
                    unrun_jobs, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Ejemplo n.º 6
0
    def after_show(self, context, data_dict):
        '''
        Decorates a dataset dict after it has been shown.

        Harvest source datasets get a ``status`` entry; any other dataset
        gets harvest extras, but only when ``validate`` is False in the
        context and a current harvest object exists for it.

        :returns: the (possibly modified) ``data_dict``
        '''
        is_source_dataset = ('type' in data_dict and
                             data_dict['type'] == DATASET_TYPE_NAME)

        if is_source_dataset:
            # Harvest source dataset: attach the status taken from the
            # HarvestSource object.
            source = HarvestSource.get(data_dict['id'])
            if source:
                data_dict['status'] = harvest_logic.action.get.harvest_source_show_status(context, {'id': source.id})
            else:
                log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
            return data_dict

        # Normal dataset: find out whether it was harvested.
        current_object = (
            model.Session.query(HarvestObject)
            .filter(HarvestObject.package_id == data_dict['id'])
            .filter(HarvestObject.current == True)
            .first()
        )

        # validate is only passed as False while indexing.
        if current_object and not context.get('validate', True):
            harvest_extras = (
                ('harvest_object_id', current_object.id),
                ('harvest_source_id', current_object.source.id),
                ('harvest_source_title', current_object.source.title),
            )
            for extra_key, extra_value in harvest_extras:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Ejemplo n.º 7
0
def harvest_job_create(context, data_dict):
    '''
    Auth check for creating a harvest job.

    Anonymous users are refused, sysadmins are always allowed, and any
    other user must belong to the organization that owns the source.

    :returns: a dict with ``success`` and, on refusal, ``msg``
    :raises NotFound: if the harvest source does not exist
    '''
    # Lookup kept as-is: it raises KeyError when the context has no model.
    model = context['model']
    user = context.get('user')
    source_id = data_dict['source_id']

    if not user:
        denied_msg = _(
            'Non-logged in users are not authorized to create harvest jobs')
        return {'success': False, 'msg': denied_msg}

    if ckan.new_authz.is_sysadmin(user):
        return {'success': True}

    user_obj = User.get(user)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    if user_obj:
        organization_ids = [
            group.id for group in user_obj.get_groups(u'organization')]
        if source.publisher_id in organization_ids:
            return {'success': True}

    denied_msg = _('User %s not authorized to create a job for source %s') % (
        str(user), source.id)
    return {'success': False, 'msg': denied_msg}
Ejemplo n.º 8
0
def harvest_job_list(context,data_dict):
    '''
    Auth check for listing harvest jobs.

    Sysadmins may list everything. Other users must be logged in, belong
    to at least one publisher, pass a ``source_id`` and belong to the
    publisher that owns that source.

    :returns: a dict with ``success`` and, on refusal, ``msg``
    :raises NotFound: if the given harvest source does not exist
    '''
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {'success': False, 'msg': _('Only logged users are authorized to see their sources')}

    user_obj = User.get(user)

    # Checks for non sysadmin users
    if not Authorizer().is_sysadmin(user):
        # Query the user's publishers once and reuse the ids below.
        publisher_ids = [g.id for g in user_obj.get_groups(u'publisher')] if user_obj else []
        if not publisher_ids:
            return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)}

        source_id = data_dict.get('source_id',False)
        if not source_id:
            # The message has no placeholder, so it must not be %-formatted
            # (doing so raises TypeError).
            return {'success': False, 'msg': _('Only sysadmins can list all harvest jobs')}

        source = HarvestSource.get(source_id)
        if not source:
            raise NotFound

        if source.publisher_id not in publisher_ids:
            return {'success': False, 'msg': _('User %s not authorized to list jobs from source %s') % (str(user),source.id)}

    return {'success': True}
Ejemplo n.º 9
0
def harvest_job_list(context, data_dict):
    '''
    Auth check for listing the harvest jobs of a source.

    A ``source_id`` is required, and the user needs the ``update_dataset``
    permission on the group/organization that owns the source.

    :returns: a dict with ``success`` and, on refusal, ``msg``
    :raises ObjectNotFound: if the given harvest source does not exist
    '''
    user = context.get('user')

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # The message has no placeholder, so it must not be %-formatted
        # (doing so raises TypeError).
        return {
            'success': False,
            'msg': _('Only sysadmins can list all harvest jobs')
        }

    source = HarvestSource.get(source_id)
    if not source:
        raise p.toolkit.ObjectNotFound

    # Check the user is admin/editor for the publisher - i.e. has
    # update_dataset permission
    can_update = ckan.new_authz.has_user_permission_for_group_or_org(
        source.publisher_id, user, 'update_dataset')
    if not can_update:
        return {
            'success': False,
            'msg': _('User %s not authorized to list jobs from source %s') %
            (str(user), source.id)
        }

    return {'success': True}
Ejemplo n.º 10
0
def harvest_job_create(context, data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: the id of the harvest source to create a job for
    :type source_id: string

    :returns: the new job as produced by ``harvest_job_dictize``

    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the source is inactive
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Ejemplo n.º 11
0
def harvest_job_create(context,data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: the id of the harvest source to create a job for
    :type source_id: string

    :returns: the new job as produced by ``harvest_job_dictize``

    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive or already has a job
        with status ``New``
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s', source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. A separate
    # dict is used so the caller's data_dict name is not rebound.
    job_filter = {
        'source_id': source_id,
        'status': u'New',
    }
    unrun_jobs = harvest_job_list(context, job_filter)
    if unrun_jobs:
        log.warning('There is already an unrun job %r for this source %s', unrun_jobs, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
Ejemplo n.º 12
0
def harvest_source_id_exists(value, context):
    '''
    Validator: passes ``value`` through when ``HarvestSource.get`` finds a
    source for it, otherwise raises ``Invalid``.
    '''
    if HarvestSource.get(value):
        return value

    message = 'Harvest Source with id %r does not exist.' % str(value)
    raise Invalid(message)
Ejemplo n.º 13
0
def harvest_source_id_exists(value, context):
    '''
    Validator that checks a harvest source can be found for ``value``.

    :returns: ``value`` unchanged when the source exists
    :raises Invalid: when ``HarvestSource.get`` returns nothing
    '''
    found = HarvestSource.get(value)
    if found:
        return value

    raise Invalid('Harvest Source with id %r does not exist.' % str(value))
Ejemplo n.º 14
0
def harvest_job_create(context,data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: the id of the harvest source to create a job for
    :type source_id: string

    :returns: the new job as produced by ``harvest_job_dictize``

    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the source is inactive
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s', source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
Ejemplo n.º 15
0
    def after_show(self, context, data_dict):
        '''
        Adds harvest information to a dataset dict after it is shown.

        A harvest source dataset receives a ``status`` key. Any other
        dataset receives the harvest extras when it has a current harvest
        object and the context carries ``validate`` set to False.

        :returns: the (possibly modified) ``data_dict``
        '''
        if 'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME:
            # Harvest source dataset: enrich it from the HarvestSource object.
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
            else:
                show_status = harvest_logic.action.get.harvest_source_show_status
                data_dict['status'] = show_status(context, {'id': source.id})
            return data_dict

        # Normal dataset: look for the current harvest object, if any.
        object_query = model.Session.query(HarvestObject)
        object_query = object_query.filter(HarvestObject.package_id == data_dict['id'])
        object_query = object_query.filter(HarvestObject.current == True)
        harvested = object_query.first()

        # validate is only passed as False while indexing.
        if harvested and not context.get('validate', True):
            extras_to_add = [
                ('harvest_object_id', harvested.id),
                ('harvest_source_id', harvested.source.id),
                ('harvest_source_title', harvested.source.title),
            ]
            for name, content in extras_to_add:
                _add_extra(data_dict, name, content)

        return data_dict
Ejemplo n.º 16
0
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    The search index is cleared first, then the database rows are removed
    with one raw SQL script, and finally the harvest source dataset itself
    is re-indexed.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dictionary holding the id of the cleared source
    :rtype: dictionary

    :raises NotFound: if ``HarvestSource.get`` finds no matching source
    """
    check_access("harvest_source_clear", context, data_dict)

    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    # From here on use the id stored on the source object.
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    # The script runs inside its own begin/commit. It first flags every
    # package referenced by this source's harvest objects as 'to_delete',
    # then removes the harvest objects/jobs (and their errors/extras), and
    # finally removes the rows that belong to the flagged packages before
    # deleting the packages themselves. Statement order matters.
    # NOTE(review): the id is interpolated into the SQL text with
    # str.format; it comes from source.id rather than directly from the
    # caller, but bound parameters would be safer - confirm before changing.
    sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;""".format(
        harvest_source_id=harvest_source_id
    )

    model = context["model"]

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    # (the context is mutated in place: validation off, auth ignored).
    context.update({"validate": False, "ignore_auth": True})
    package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {"id": harvest_source_id}
Ejemplo n.º 17
0
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict
    of the harvest_source dataset. All validation and authorization
    checks should be used by now, so this function is not to be used
    directly to update harvest sources.

    The ``url`` value in ``data_dict`` is stripped in place. When the
    source ends up inactive, its jobs with status ``New`` are marked as
    ``Aborted``. Nothing is committed here.

    :param data_dict: A standard package data_dict

    :returns: The updated HarvestSource object
    :rtype: HarvestSource object

    :raises logic.NotFound: if the harvest source does not exist
    '''

    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = [
        'url', 'title', 'description', 'user_id', 'publisher_id', 'frequency',
        'time'
    ]
    for field in fields:
        # Missing keys and explicit None values leave the attribute alone.
        if data_dict.get(field) is None:
            continue
        if field == 'url':
            data_dict[field] = data_dict[field].strip()
        setattr(source, field, data_dict[field])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        # Iterating an empty result is a no-op, so no extra guard is needed.
        for job in jobs:
            job.status = u'Aborted'
            job.add()

    return source
Ejemplo n.º 18
0
 def test_form_validate_new_object_and_sync(self):
     # Binds form data for a new source, validates it, syncs it to the
     # session and checks that the source can then be found by its url.
     source_url = u'http://localhost/'
     assert not HarvestSource.get(source_url, None, 'url')

     form_data = {
         'HarvestSource--url': source_url,
         'HarvestSource--type': u'Gemini',
         'HarvestSource--description': u'My source',
     }
     fieldset = form.get_harvest_source_fieldset()
     fieldset = fieldset.bind(HarvestSource, data=form_data,
                              session=model.Session)

     # Test bound_fields.validate().
     fieldset.validate()
     assert not fieldset.errors

     # Test bound_fields.sync().
     fieldset.sync()
     model.Session.commit()

     saved_source = HarvestSource.get(source_url, None, 'url')
     assert saved_source.id
Ejemplo n.º 19
0
    def setup(self):
        # Builds the fixtures used by each test: a sysadmin user, a harvest
        # source, a job for that source, a harvest object for that job and
        # the context used by the tests.
        print ("")
        print ("TestUM:setup() before each test method")

        # Add sysadmin user
        sysadmin = model.User(name=u'harvest', password=u'test', sysadmin=True)
        self.harvestUser = sysadmin
        model.Session.add(sysadmin)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds',
        }

        user_context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
        }

        # The publisher auth profile needs the source to name a publisher.
        uses_publisher_profile = (
            config.get('ckan.harvest.auth.profile') == u'publisher')
        if uses_publisher_profile and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        created_source = get_action('harvest_source_create')(
            user_context, source_fixture)
        self.oHarvestSource = HarvestSource.get(created_source['id'])

        created_job = get_action('harvest_job_create')(
            user_context, {'source_id': self.oHarvestSource.id})
        self.oHarvestJob = HarvestJob.get(created_job['id'])

        no_auth_context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }
        object_fixture = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {'a key': 'a value'},
        }
        created_object = toolkit.get_action('harvest_object_create')(
            no_auth_context, object_fixture)
        self.oHarvestObject = HarvestObject.get(created_object['id'])

        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': default_update_package_schema(),
            'api_version': '2',
        }
Ejemplo n.º 20
0
    def after_show(self, context, data_dict):
        '''
        Adds harvest information to a dataset dict after it is shown.

        A harvest source dataset receives a ``status`` key. For any other
        dataset the harvest extras are first stripped from ``extras`` and
        then re-added only when a current harvest object exists and the
        context carries ``validate`` set to False.

        :returns: the (possibly modified) ``data_dict``
        '''
        harvest_extra_keys = (
            'harvest_object_id',
            'harvest_source_id',
            'harvest_source_title',
        )

        if 'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME:
            # Harvest source dataset: enrich it from the HarvestSource object.
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(
                    data_dict['id']))
                return data_dict

            action_name = 'harvest_source_show_status'
            try:
                show_status = p.toolkit.get_action(action_name)
            except KeyError:
                # Retry the lookup once after clearing the actions cache.
                logic.clear_actions_cache()
                show_status = p.toolkit.get_action(action_name)

            data_dict['status'] = show_status(context, {'id': source.id})
            return data_dict

        # Normal dataset: look for the current harvest object, if any.
        current_object = (
            model.Session.query(HarvestObject)
            .filter(HarvestObject.package_id == data_dict['id'])
            .filter(HarvestObject.current == True)
            .first()
        )

        # If the harvest extras are there, remove them. This can happen eg
        # when calling package_update or resource_update, which call
        # package_show. The list is rewritten in place.
        if data_dict.get('extras'):
            kept_extras = [
                extra for extra in data_dict.get('extras', [])
                if extra['key'] not in harvest_extra_keys
            ]
            data_dict['extras'][:] = kept_extras

        # The extras are only wanted at index time, so they end up in the
        # cached data_dict used for display and search results; adding
        # them while editing would produce duplicated key errors. Indexing
        # is detected by validate being set to False.
        if current_object and not context.get('validate', True):
            extras_to_add = (
                ('harvest_object_id', current_object.id),
                ('harvest_source_id', current_object.source.id),
                ('harvest_source_title', current_object.source.title),
            )
            for extra_key, extra_value in extras_to_add:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Ejemplo n.º 21
0
    def run_job_synchronously(self):
        '''
        Runs a complete harvest job for one source in the current process.

        The source id is read from ``self.args[1]``. The harvester plugin
        whose ``info()['name']`` matches the source type runs the gather
        stage, then every gathered object goes through
        ``fetch_and_import_stages``. On success the job is marked "Done"
        and the harvest source dataset is re-indexed; otherwise the job
        ends with status "Error". Nothing is returned.
        '''
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        if not source:
            # Without this guard the lookup of source.type below would fail
            # with an AttributeError.
            print("No harvest source found with id %s." % source_id)
            return

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            # Parenthesised single-argument print works on Python 2 and 3.
            print("No harvester found to handle the job.")
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            # Always stamp the end time; anything that did not reach "Done"
            # is recorded as an error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
Ejemplo n.º 22
0
def get_source_object(context, data_dict=None):
    '''
    Returns the harvest source to run auth checks against.

    A source already stored in ``context['source']`` is returned as-is;
    otherwise it is looked up with ``HarvestSource.get`` using the ``id``
    key of ``data_dict``.

    :param context: the action context, optionally carrying ``source``
    :param data_dict: optional dict with the ``id`` of the source

    :returns: the harvest source object
    :raises NotFound: if the lookup finds no source
    '''
    if 'source' in context:
        return context['source']

    # None replaces the former shared mutable default argument.
    source_id = (data_dict or {}).get('id', None)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    return source
Ejemplo n.º 23
0
def get_source_object(context, data_dict=None):
    '''
    Returns the harvest source for the current request.

    When the context already holds a ``source`` entry that value is
    returned unchanged. Otherwise the source is fetched with
    ``HarvestSource.get`` from the ``id`` found in ``data_dict``.

    :param context: the action context, optionally carrying ``source``
    :param data_dict: optional dict with the ``id`` of the source

    :returns: the harvest source object
    :raises NotFound: if the lookup finds no source
    '''
    if 'source' not in context:
        # A fresh dict per call replaces the shared mutable default.
        lookup = data_dict if data_dict is not None else {}
        source = HarvestSource.get(lookup.get('id',None))
        if not source:
            raise NotFound
        return source

    return context['source']
Ejemplo n.º 24
0
def _update_harvest_source_object(context, data_dict):
    '''
        Copies the values of a harvest_source dataset dict onto the matching
        HarvestSource object. Validation and authorization checks are
        expected to have run already, so this function is not meant to be
        called directly to update harvest sources.

        The source (and any aborted job) is added to the session with
        ``add()``; committing is left to the caller.

        :param data_dict: A standard package data_dict

        :returns: The updated HarvestSource object
        :rtype: HarvestSource object

        :raises logic.NotFound: if no harvest source is found for the id
    '''
    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    for field in ('url', 'title', 'description', 'user_id',
                  'publisher_id', 'frequency'):
        if field not in data_dict or data_dict[field] is None:
            continue
        if field == 'url':
            # The stripped URL is also written back to data_dict.
            data_dict[field] = data_dict[field].strip()
        setattr(source, field, data_dict[field])

    # 'source_type' is used to avoid clashing with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # The state is only touched when explicitly present in the dict
    if 'state' in data_dict:
        source.active = data_dict['state'] == 'active'

    # No commit here, package_create is expected to do it
    source.add()

    if not source.active:
        # Abort any pending jobs
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, pending_jobs.count())
        if pending_jobs:
            for pending in pending_jobs:
                pending.status = u'Aborted'
                pending.add()

    return source
Ejemplo n.º 25
0
    def _create_source(self, source_fixture=FISBROKER_HARVESTER_CONFIG):
        '''
        Creates a harvest source from ``source_fixture`` through the
        ``harvest_source_create`` action and returns the HarvestSource
        object loaded for the id found in the action result.
        '''
        context = dict(model=model, session=Session, user=u'harvest')
        create_source = get_action('harvest_source_create')
        created = create_source(context, source_fixture)

        source = HarvestSource.get(created['id'])
        assert source
        return source
Ejemplo n.º 26
0
def harvest_source_show(context,data_dict):
    '''
    Returns the dictized harvest source matching ``id``.

    :param id: value used to look up the harvest source
    :param attr: optional, handed to ``HarvestSource.get`` as ``attr``

    :raises NotFound: if no harvest source is found
    '''
    p.toolkit.check_access('harvest_source_show', context, data_dict)

    source = HarvestSource.get(data_dict.get('id'),
                               attr=data_dict.get('attr'))
    if source:
        return harvest_source_dictize(source, context)

    raise NotFound
Ejemplo n.º 27
0
def harvest_source_show(context, data_dict):
    '''
    Looks up a harvest source and returns it in dictized form.

    :param id: value used to look up the harvest source
    :param attr: optional, forwarded to ``HarvestSource.get`` as ``attr``

    :raises NotFound: if the lookup returns nothing
    '''
    check_access('harvest_source_show', context, data_dict)

    source_ref = data_dict.get('id')
    lookup_attr = data_dict.get('attr')
    source = HarvestSource.get(source_ref, attr=lookup_attr)

    if not source:
        raise NotFound
    return harvest_source_dictize(source, context)
Ejemplo n.º 28
0
def harvest_source_update(context, data_dict):
    '''
    Validates ``data_dict`` and applies the validated values to an existing
    harvest source.

    The schema comes from ``context['schema']`` when set, otherwise from
    ``default_harvest_source_schema()``. If the source is inactive after the
    update, its jobs with status ``New`` are marked as ``Aborted``.

    :param id: the id of the harvest source to update

    :returns: the dictized harvest source
    :raises NotFound: if the harvest source does not exist
    :raises ValidationError: if validation reports errors (the session is
        rolled back first)
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    for name in ('url', 'title', 'type', 'description', 'user_id',
                 'publisher_id'):
        if name not in data or data[name] is None:
            continue
        if name == 'url':
            data[name] = data[name].strip()
        setattr(source, name, data[name])

    # Presence is checked on the raw input, the value is taken from the
    # validated data.
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    if not source.active:
        # Abort any pending jobs
        new_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, new_jobs.count())
        if new_jobs:
            for pending in new_jobs:
                pending.status = u'Aborted'
                pending.save()

    # The gather/fetch stages run in a different process and need the
    # latest source info, so try to get sqlalchemy to write to the db
    # right away (it is not certain that this achieves it).
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
Ejemplo n.º 29
0
    def after_show(self, context, data_dict):
        '''
        Adds harvest related information to ``data_dict`` and returns it.

        For a dataset of type ``DATASET_TYPE_NAME`` the result of the
        ``harvest_source_show_status`` action is stored under ``status``.
        For any other dataset the harvest extras are removed and, when
        ``context['validate']`` is explicitly false and a current harvest
        object exists for the package, added again from that object.
        '''
        if 'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME:
            # Harvest source dataset: attach extra info taken from the
            # HarvestSource object
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
                return data_dict

            st_action_name = 'harvest_source_show_status'
            try:
                status_action = p.toolkit.get_action(st_action_name)
            except KeyError:
                # Retry once after clearing the actions cache
                logic.clear_actions_cache()
                status_action = p.toolkit.get_action(st_action_name)

            data_dict['status'] = status_action(context, {'id': source.id})
            return data_dict

        # Normal dataset: find out whether it was harvested and, if so, add
        # info about its HarvestObject and HarvestSource
        current_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.package_id == data_dict['id']).filter(
            HarvestObject.current == True)
        harvest_object = current_objects.first()

        # Harvest extras may already be present, eg when package_update or
        # resource_update end up calling package_show. Drop them in place.
        harvest_keys = ('harvest_object_id', 'harvest_source_id', 'harvest_source_title',)
        if data_dict.get('extras'):
            data_dict['extras'][:] = [
                extra for extra in data_dict.get('extras', [])
                if extra['key'] not in harvest_keys]

        # The extras should only be added at index time, so they end up in
        # the cached data_dict used for display, search results etc. Adding
        # them while editing the dataset leads to duplicated key errors.
        # Right now indexing can only be detected by validate being False.
        if harvest_object and not context.get('validate', True):
            new_extras = [
                ('harvest_object_id', harvest_object.id),
                ('harvest_source_id', harvest_object.source.id),
                ('harvest_source_title', harvest_object.source.title),
            ]
            for extra_key, extra_value in new_extras:
                _add_extra(data_dict, extra_key, extra_value)

        return data_dict
Ejemplo n.º 30
0
    def _create_source_and_job(self, source_fixture):
        '''
        Creates a harvest source from ``source_fixture`` plus a job for it
        and returns both as ``(source, job)``.

        With the ``publisher`` auth profile configured, a fixture without
        ``publisher_id`` gets the id of ``self.publisher``.
        '''
        context = dict(model=model, session=Session, user=u"harvest")

        if config.get("ckan.harvest.auth.profile") == u"publisher":
            if "publisher_id" not in source_fixture:
                source_fixture["publisher_id"] = self.publisher.id

        create_source = get_action("harvest_source_create")
        created = create_source(context, source_fixture)
        source = HarvestSource.get(created["id"])
        assert source

        return source, self._create_job(source.id)
Ejemplo n.º 31
0
def harvest_source_update(context,data_dict):
    '''
    Updates an existing harvest source with the validated contents of
    ``data_dict`` and returns the source in dictized form.

    Jobs with status ``New`` are set to ``Aborted`` when the source is not
    active after the update.

    :param id: the id of the harvest source to update

    :raises NotFound: if the harvest source does not exist
    :raises ValidationError: if the data does not validate; the session is
        rolled back before raising
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    plain_fields = ('url', 'title', 'type', 'description', 'user_id',
                    'publisher_id')
    for key in plain_fields:
        if key in data and data[key] is not None:
            if key == 'url':
                data[key] = data[key].strip()
            setattr(source, key, data[key])

    # Keys are looked for in the raw input, values read from validated data
    for key in ('active', 'config'):
        if key in data_dict:
            setattr(source, key, data[key])

    source.save()

    # Abort any pending jobs
    if not source.active:
        outstanding = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, outstanding.count())
        if outstanding:
            for job in outstanding:
                job.status = u'Aborted'
                job.save()

    # Try to make sqlalchemy write to the db immediately: gather/fetch run
    # in a different process and need the latest source info. It is not
    # certain that this works.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
Ejemplo n.º 32
0
def harvest_objects_import(context,data_dict):
    '''
        Runs the import stage again for the current harvest objects.

        Only objects flagged as current and joined to an active package are
        selected, optionally limited to a single harvest source. Nothing is
        fetched from the remote server, only objects already stored in the
        database are processed.

        :param source_id: optional id of the harvest source to restrict to

        :returns: list with the dictized form of every selected object
        :raises NotFound: if ``source_id`` is given but no source is found
        :raises Exception: if the given harvest source is not active
    '''
    log.info('Harvest objects import: %r', data_dict)
    check_access('harvest_objects_import', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('source_id')

    if not source_id:
        current_ids = (session.query(HarvestObject.id)
                       .join(Package)
                       .filter(HarvestObject.current == True)
                       .filter(Package.state == u'active')
                       .all())
    else:
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            raise NotFound('Harvest source %s does not exist' % source_id)

        if not source.active:
            log.warn('Harvest source %s is not active.', source_id)
            raise Exception('This harvest source is not active')

        current_ids = (session.query(HarvestObject.id)
                       .join(HarvestSource).join(Package)
                       .filter(HarvestObject.source == source)
                       .filter(HarvestObject.current == True)
                       .filter(Package.state == u'active')
                       .all())

    imported = []
    for object_id in current_ids:
        harvest_object = session.query(HarvestObject).get(object_id)
        # Only the first harvester whose name matches the source type is used
        for plugin in PluginImplementations(IHarvester):
            if plugin.info()['name'] != harvest_object.source.type:
                continue
            if hasattr(plugin, 'force_import'):
                plugin.force_import = True
            plugin.import_stage(harvest_object)
            break
        imported.append(harvest_object_dictize(harvest_object, context))

    log.info('Harvest objects imported: %r', imported)
    return imported
Ejemplo n.º 33
0
    def _create_source_and_job(self, source_fixture):
        '''
        Creates a harvest source and a job for it, returned as
        ``(source, job)``. A fixture without ``publisher_id`` receives
        ``self.publisher['id']``.
        '''
        if 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher['id']

        context = dict(model=model, session=Session, user=u'harvest')
        created = get_action('harvest_source_create')(context, source_fixture)

        source = HarvestSource.get(created['id'])
        assert source

        return source, self._create_job(source.id)
Ejemplo n.º 34
0
def harvest_job_create(context, data_dict):
    '''
    Creates a new Harvest Job for a Harvest Source and, unless ``run`` is
    false, sends it to the gather queue.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized harvest job
    :raises toolkit.ObjectNotFound: if the harvest source does not exist
    :raises HarvestSourceInactiveError: if the harvest source is inactive
    :raises HarvestJobExists: if an existing job is reported for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    send_to_queue = data_dict.get('run', True)

    source = HarvestSource.get(source_id)

    # The source has to exist ...
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # ... and be active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Refuse to add a job when an unrun or currently running one is
    # already there for this source
    existing_job = _check_for_existing_jobs(context, source_id)
    if existing_job:
        log.warn('There is already an unrun job %r for this source %s',
                 existing_job, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if send_to_queue:
        enqueue = toolkit.get_action('harvest_send_job_to_gather_queue')
        enqueue(context, {'id': job.id})

    return harvest_job_dictize(job, context)
Ejemplo n.º 35
0
    def run_job_synchronously(self):
        '''
        Runs a whole harvest job for one source in the current process by
        calling the harvester's gather stage and ``fetch_and_import_stages``
        directly.

        The source id is read from ``self.args[1]``. Nothing is run (a
        message is printed instead) when the source does not exist or no
        harvester plugin matches its type. The job always gets a finish
        time and ends up with status ``Done`` or ``Error``.
        '''
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        if not source:
            # Without this guard an unknown id fails below with an
            # AttributeError on ``source.type``.
            print("Harvest source %s does not exist" % source_id)
            return

        # for/else: the else branch only runs when no harvester matched
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print("No harvester found to handle the job.")
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show,
            # not the class object.
            source_dict = get_action('package_show')(
                {'validate': False, 'ignore_auth': True},
                {'id': source.id})
            package_index.index_package(source_dict)
        finally:
            # Runs on success and on failure: anything that did not reach
            # "Done" is recorded as an error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
    def _create_source_and_job(self, source_fixture):
        '''
        Returns a ``(source, job)`` pair: a harvest source created from
        ``source_fixture`` and a job created for it. ``publisher_id``
        defaults to ``self.publisher['id']`` when missing from the fixture.
        '''
        context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
        }

        if 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher['id']

        create_source = get_action('harvest_source_create')
        source_dict = create_source(context, source_fixture)

        source = HarvestSource.get(source_dict['id'])
        assert source

        job = self._create_job(source.id)
        return source, job
Ejemplo n.º 37
0
def harvest_object_list(context, data_dict):
    '''
    Decides whether the current user may list harvest objects.

    Sysadmins are always allowed. Any other user must be logged in, belong
    to at least one publisher group, pass a ``source_id`` and be a member
    of the publisher that the source belongs to.

    :param source_id: id of the harvest source whose objects are listed

    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises NotFound: if ``source_id`` does not match a harvest source
    '''
    model = context['model']
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {
            'success': False,
            'msg': _('Only logged users are authorized to see their sources')
        }

    user_obj = User.get(user)

    # Checks for non sysadmin users
    if not Authorizer().is_sysadmin(user):
        if not user_obj or len(user_obj.get_groups(u'publisher')) == 0:
            return {
                'success': False,
                'msg': _('User %s must belong to a publisher to list '
                         'harvest objects') % str(user)
            }

        source_id = data_dict.get('source_id', False)
        if not source_id:
            # This message has no placeholder, so it must not be
            # %-formatted: doing so raises TypeError instead of returning
            # the denial.
            return {
                'success': False,
                'msg': _('Only sysadmins can list all harvest objects')
            }

        source = HarvestSource.get(source_id)
        if not source:
            raise NotFound

        publisher_ids = [g.id for g in user_obj.get_groups(u'publisher')]
        if source.publisher_id not in publisher_ids:
            return {
                'success': False,
                'msg': _('User %s not authorized to list objects from '
                         'source %s') % (str(user), source.id)
            }

    return {'success': True}
    def _create_source_and_job(self, source_fixture):
        '''
        Creates a harvest source from ``source_fixture`` and a job for it.

        Under the ``publisher`` auth profile a fixture lacking
        ``publisher_id`` is given ``self.publisher.id``.

        :returns: tuple ``(source, job)``
        '''
        context = dict(model=model, session=Session, user=u'harvest')

        if config.get('ckan.harvest.auth.profile') == u'publisher':
            if 'publisher_id' not in source_fixture:
                source_fixture['publisher_id'] = self.publisher.id

        created = get_action('harvest_source_create')(context, source_fixture)
        source = HarvestSource.get(created['id'])
        assert source

        return source, self._create_job(source.id)
Ejemplo n.º 39
0
def harvest_job_create(context, data_dict):
    '''
    Adds a Harvest Job for a Harvest Source; by default the job is also put
    on the gather queue.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized harvest job
    :raises toolkit.NotFound: if the harvest source does not exist
    :raises HarvestSourceInactiveError: if the harvest source is inactive
    :raises HarvestJobExists: if an existing job is reported for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    source = HarvestSource.get(source_id)

    if not source:
        # Unknown source
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    if not source.active:
        # Inactive sources do not get jobs
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError('Can not create jobs on inactive sources')

    # An unrun or currently running job for this source blocks a new one
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)

    if run_it:
        send_to_gather = toolkit.get_action('harvest_send_job_to_gather_queue')
        send_to_gather(context, {'id': new_job.id})

    return harvest_job_dictize(new_job, context)
Ejemplo n.º 40
0
def harvest_source_update(context,data_dict):
    '''
    Applies the validated values of ``data_dict`` to an existing harvest
    source, saves it and returns it in dictized form.

    When the saved source is not active, its jobs with status ``New`` are
    set to ``Aborted``.

    :param id: the id of the harvest source to update

    :raises NotFound: if the harvest source does not exist
    :raises ValidationError: if validation fails (after a session rollback)
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    # Only values that are present and not None overwrite the source
    updatable = ('url', 'title', 'type', 'description', 'user_id',
                 'publisher_id')
    for attribute in updatable:
        if attribute in data and data[attribute] is not None:
            setattr(source, attribute, data[attribute])

    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    if not source.active:
        # Abort any pending jobs
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, pending_jobs.count())
        if pending_jobs:
            for pending in pending_jobs:
                pending.status = u'Aborted'
                pending.save()

    return harvest_source_dictize(source, context)
Ejemplo n.º 41
0
    def _create_source_and_job(self):
        '''
        Creates a ``csw`` harvest source for a fixed URL and a job for it,
        returned as ``(source, job)``. Under the ``publisher`` auth profile
        the fixture is given ``self.publisher.id`` as ``publisher_id``.
        '''
        source_fixture = dict(url=u'http://csw/GetCapabilities', type=u'csw')
        if config.get('ckan.harvest.auth.profile') == u'publisher':
            if 'publisher_id' not in source_fixture:
                source_fixture['publisher_id'] = self.publisher.id

        context = dict(model=model, session=model.Session, user=u'harvest')
        created = get_action('harvest_source_create')(context, source_fixture)

        source = HarvestSource.get(created['id'])
        assert source

        return source, self._create_job(source.id)
Ejemplo n.º 42
0
    def _create_source_and_job(self):
        '''
        Builds a ``csw`` source fixture, creates the harvest source through
        the ``harvest_source_create`` action and adds a job for it.

        :returns: tuple ``(source, job)``
        '''
        context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
        }
        source_fixture = {'url': u'http://csw/GetCapabilities', 'type': u'csw'}

        publisher_profile = \
            config.get('ckan.harvest.auth.profile') == u'publisher'
        if publisher_profile and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        create_source = get_action('harvest_source_create')
        source_dict = create_source(context, source_fixture)
        source = HarvestSource.get(source_dict['id'])
        assert source

        job = self._create_job(source.id)
        return source, job
Ejemplo n.º 43
0
def get_harvest_source_config(harvester_id):
    '''
    Returns the JSON-decoded config of a harvest source.

    Loading is best effort: if the source cannot be fetched or its config
    cannot be decoded, an empty dict is returned. For the keys
    ``default_groups``, ``private_datasets`` and ``validator_profiles`` a
    non-empty list value is replaced by its first element; empty lists are
    left untouched.

    :param harvester_id: value handed to ``HarvestSource.get``

    :returns: the decoded config, ``{}`` on failure
    '''
    source_config = {}
    keys_lookfor = [
        'default_groups',
        'private_datasets',
        'validator_profiles',
    ]
    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Deliberately best effort, but narrower than a bare ``except`` so
        # that KeyboardInterrupt and SystemExit still propagate.
        pass

    # convert single string element list to string
    if source_config:
        for key in keys_lookfor:
            value = source_config.get(key, '')
            # The emptiness check avoids an IndexError on ``[]``
            if isinstance(value, list) and value:
                source_config[key] = value[0]
    return source_config
Ejemplo n.º 44
0
def get_harvest_source_config(harvester_id):
    '''
    Loads the config of a harvest source and decodes it from JSON.

    Any failure while fetching the source or decoding the config results in
    an empty dict. List values stored under ``default_groups``,
    ``private_datasets`` or ``validator_profiles`` are collapsed to their
    first element; an empty list stays as it is.

    :param harvester_id: value handed to ``HarvestSource.get``

    :returns: the decoded config, ``{}`` on failure
    '''
    keys_lookfor = (
        'default_groups',
        'private_datasets',
        'validator_profiles',
    )

    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Best effort on purpose. ``Exception`` rather than a bare
        # ``except`` keeps KeyboardInterrupt/SystemExit from being
        # swallowed.
        return {}

    if not source_config:
        return source_config

    # convert single string element list to string
    for key in keys_lookfor:
        value = source_config.get(key, '')
        # Guard against ``[]``, which has no first element
        if isinstance(value, list) and value:
            source_config[key] = value[0]
    return source_config
Ejemplo n.º 45
0
    def _create_source_and_job(self, source_fixture):
        '''
        Creates a harvest source and a job for it.

        :param source_fixture: dict handed to the ``harvest_source_create``
            action; under the ``publisher`` auth profile it is given
            ``self.publisher.id`` as ``publisher_id`` when that key is
            missing

        :returns: tuple ``(source, job)``
        '''
        context = {u'user': u'harvest'}

        auth_profile = toolkit.config.get(u'ckan.harvest.auth.profile')
        if auth_profile == u'publisher' \
                and u'publisher_id' not in source_fixture:
            source_fixture[u'publisher_id'] = self.publisher.id

        create_source = toolkit.get_action(u'harvest_source_create')
        created = create_source(context, source_fixture)

        source = HarvestSource.get(created[u'id'])
        assert source

        return source, self._create_job(source.id)
Ejemplo n.º 46
0
def harvest_source_job_history_clear(context, data_dict):
    '''
    Deletes the jobs and harvest objects (with their errors and extras) of a
    harvest source while keeping the source itself, then reindexes the
    source.

    The datasets imported from the harvest source are NOT deleted.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: dict with the ``id`` of the cleared harvest source
    :raises NotFound: if the harvest source does not exist
    '''
    check_access('harvest_source_clear', context, data_dict)

    requested_id = data_dict.get('id')
    source = HarvestSource.get(requested_id)
    if not source:
        log.error('Harvest source %s does not exist', requested_id)
        raise NotFound('Harvest source %s does not exist' % requested_id)

    # From here on the id stored on the source object is used, not the
    # value received in data_dict.
    harvest_source_id = source.id

    model = context['model']

    sql_template = '''begin;
    delete from harvest_object_error where harvest_object_id
     in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id
     in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id
     in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    commit;
    '''
    model.Session.execute(
        sql_template.format(harvest_source_id=harvest_source_id))

    # Reindex the source so that its status object gets updated
    reindex = get_action('harvest_source_reindex')
    reindex(context, {'id': harvest_source_id})

    return {'id': harvest_source_id}
Ejemplo n.º 47
0
def harvest_source_job_history_clear(context, data_dict):
    '''
    Clears the job history of a harvest source: its jobs, gather errors,
    harvest objects and their errors and extras are deleted. The source
    itself and the datasets imported from it are kept.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: dict with the ``id`` of the cleared harvest source
    :raises NotFound: if the harvest source does not exist
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if source:
        # Continue with the id stored on the source object
        harvest_source_id = source.id
    else:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    model = context['model']

    # A single transaction removing everything that hangs off the source
    sql = '''begin;
    delete from harvest_object_error where harvest_object_id
     in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id
     in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id
     in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    commit;
    '''.format(harvest_source_id=harvest_source_id)
    model.Session.execute(sql)

    # The status object of the source lives in the index, so refresh it
    get_action('harvest_source_reindex')(context, {'id': harvest_source_id})

    result = {'id': harvest_source_id}
    return result
Ejemplo n.º 48
0
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the search index every document matching the query
    ``+harvest_source_id:"<id>" +site_id:"<ckan.site_id>"``.

    The deletion is committed when the ``ckan.search.solr_commit`` config
    option is true (it defaults to ``"true"``).

    :param id: the id of the harvest source to clear
    :type id: string

    :raises NotFound: if the harvest source does not exist
    :raises SearchIndexError: if deleting or committing fails
    '''
    check_access("harvest_source_clear", context, data_dict)
    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    # Use the id stored on the source object from here on
    harvest_source_id = source.id

    # NOTE(review): the connection is never closed in this function;
    # confirm whether the object returned by make_connection() needs it.
    conn = make_connection()
    query = """ +%s:"%s" +site_id:"%s" """ % ("harvest_source_id", harvest_source_id, config.get("ckan.site_id"))
    try:
        conn.delete_query(query)
        if asbool(config.get("ckan.search.solr_commit", "true")):
            conn.commit()
    except Exception as e:
        # ``as`` is valid on Python 2.6+ and 3; the comma form is a
        # SyntaxError on Python 3.
        log.exception(e)
        raise SearchIndexError(e)
Ejemplo n.º 49
0
    def after_dataset_show(self, context, data_dict):
        '''
        For a dataset of type ``DATASET_TYPE_NAME``, stores the result of the
        ``harvest_source_show_status`` action under ``data_dict['status']``.
        Any other dataset is returned unchanged.
        '''
        if "type" not in data_dict or data_dict["type"] != DATASET_TYPE_NAME:
            return data_dict

        # Harvest source dataset: the extra info comes from the
        # HarvestSource object
        source = HarvestSource.get(data_dict["id"])
        if not source:
            log.error("Harvest source not found for dataset {0}".format(
                data_dict["id"]))
            return data_dict

        st_action_name = "harvest_source_show_status"
        try:
            status_action = p.toolkit.get_action(st_action_name)
        except KeyError:
            # Retry once after clearing the actions cache
            logic.clear_actions_cache()
            status_action = p.toolkit.get_action(st_action_name)

        data_dict["status"] = status_action(context, {"id": source.id})
        return data_dict
Ejemplo n.º 50
0
def harvest_jobs_run(context, data_dict):
    '''
    Decides whether the current user may run the jobs of a harvest source.

    A ``source_id`` is required, and the user needs the ``update_dataset``
    permission on the publisher of that source.

    :param source_id: id of the harvest source whose jobs are to be run

    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises p.toolkit.ObjectNotFound: if the harvest source does not exist
    '''
    user = context.get('user')

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # This message has no placeholder, so it must not be %-formatted:
        # doing so raises TypeError instead of returning the denial.
        return {'success': False,
                'msg': _('Only sysadmins can run all harvest jobs')}

    source = HarvestSource.get(source_id)
    if not source:
        raise p.toolkit.ObjectNotFound

    # Check the user is admin/editor for the publisher - i.e. has
    # update_dataset permission
    check1 = ckan.new_authz.has_user_permission_for_group_or_org(
        source.publisher_id, user, 'update_dataset'
    )
    if not check1:
        return {'success': False,
                'msg': _('User %s not authorized to run jobs from source %s')
                % (str(user), source.id)}

    return {'success': True}
Ejemplo n.º 51
0
    def after_show(self, context, data_dict):
        '''
        Adds a ``status`` entry, taken from the
        ``harvest_source_show_status`` action, to datasets of type
        ``DATASET_TYPE_NAME``. Other datasets pass through untouched.
        '''
        is_harvest_source = ('type' in data_dict
                             and data_dict['type'] == DATASET_TYPE_NAME)
        if is_harvest_source:
            # The extra info is taken from the HarvestSource object
            source = HarvestSource.get(data_dict['id'])
            if not source:
                log.error('Harvest source not found for dataset {0}'.format(
                    data_dict['id']))
                return data_dict

            st_action_name = 'harvest_source_show_status'
            try:
                status_action = p.toolkit.get_action(st_action_name)
            except KeyError:
                # Clear the actions cache and look the action up again
                logic.clear_actions_cache()
                status_action = p.toolkit.get_action(st_action_name)

            status = status_action(context, {'id': source.id})
            data_dict['status'] = status

        return data_dict
Ejemplo n.º 52
0
def harvest_job_create(context,data_dict):
    '''
    Decides whether the current user may create a job for a harvest source.

    Anonymous users are refused and sysadmins are always allowed. Other
    users must belong to the publisher group that the source belongs to.

    :param source_id: id of the harvest source the job is created for

    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises NotFound: if the harvest source does not exist
    '''
    model = context['model']
    user = context.get('user')
    source_id = data_dict['source_id']

    if not user:
        return {'success': False, 'msg': _('Non-logged in users are not authorized to create harvest jobs')}

    if Authorizer().is_sysadmin(user):
        return {'success': True}

    user_obj = User.get(user)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    # The user has to be a member of the publisher owning the source
    if user_obj and source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher')]:
        return {'success': True}

    return {'success': False, 'msg': _('User %s not authorized to create a job for source %s') % (str(user),source.id)}
Ejemplo n.º 53
0
def _delete_harvest_source_object(context, data_dict):
    '''
    Flag the HarvestSource referenced by a harvest_source dataset as
    inactive and abort its jobs that are still in ``New`` status.

    The record is not removed, only marked ``active = False``.  This
    function performs no validation or authorization checks, so it is not
    meant to be called directly to delete harvest sources.

    :param data_dict: A standard package data_dict; its ``id`` is used to
        look up the HarvestSource

    :returns: The deactivated HarvestSource object
    :rtype: HarvestSource object

    :raises ObjectNotFound: if no harvest source matches the id
    '''

    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                       source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs.  The count is taken once and used as the
    # guard: testing the filter result itself for truthiness may succeed
    # even when it matches no rows, which logged "Aborting 0 jobs".
    jobs = HarvestJob.filter(source=source, status=u'New')
    pending_count = jobs.count() if jobs else 0
    if pending_count:
        log.info('Aborting %i jobs due to deleted harvest source',
                 pending_count)
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
Ejemplo n.º 54
0
def harvest_source_index_clear(context, data_dict):
    '''
    Delete from the search index every document that belongs to a harvest
    source on this site.

    :param id: the id of the harvest source whose documents are removed
    :type id: string

    :raises NotFound: if the harvest source does not exist
    :raises SearchIndexError: if the delete or commit on the index fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the id stored on the source object from here on
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % ('harvest_source_id', harvest_source_id,
                                            config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Committing is optional and controlled by configuration
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
    finally:
        # Release the connection whether or not the delete succeeded
        conn.close()
Ejemplo n.º 55
0
def harvest_source_index_clear(context, data_dict):
    '''
    Remove all search-index documents of one harvest source for this site.

    :param id: the id of the harvest source whose documents are removed
    :type id: string

    :raises NotFound: if the harvest source does not exist
    :raises SearchIndexError: if the delete or commit on the index fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Continue with the id held by the source object
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Only commit when the configuration asks for it (default: yes)
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
    finally:
        # Always close the connection, also when the delete failed
        conn.close()
Ejemplo n.º 56
0
def ogdch_shacl_validate(context, data_dict):  # noqa
    """
    validates a harvest source against a shacl shape

    :param data_dict: must contain ``harvest_source_id``, ``datapath``,
        ``resultpath``, ``shapefilepath``, ``csvpath`` and ``shaclcommand``

    :raises NotFound: if ``harvest_source_id`` is missing from ``data_dict``
        or does not match an existing harvest source
    :raises KeyError: if one of the other required keys is missing
    :raises RDFParserException: if the data at the harvest source url
        cannot be parsed
    """

    # get sources from data_dict
    if 'harvest_source_id' not in data_dict:
        raise NotFound('Configuration missing for harvest source')

    harvest_source_id = data_dict['harvest_source_id']
    harvest_source = HarvestSource.get(harvest_source_id)
    if not harvest_source:
        raise NotFound(
            'Harvest source {} does not exist'.format(harvest_source_id))

    # Read up front so a missing key fails before any parsing is attempted;
    # the values are not used further within this block.
    datapath = data_dict['datapath']
    resultpath = data_dict['resultpath']
    shapefilepath = data_dict['shapefilepath']
    csvpath = data_dict['csvpath']
    shaclcommand = data_dict['shaclcommand']

    log.info('shacl_validate called for source: {},'
             'configuration: {}'.format(harvest_source_id, data_dict))

    # get rdf parse config for harvest source; a source without any stored
    # configuration falls back to the default format instead of failing in
    # json.loads
    source_config = json.loads(harvest_source.config or '{}')
    rdf_format = source_config.get("rdf_format", "xml")

    # parse harvest_source
    data_rdfgraph = rdflib.Graph()

    # parse data from harvest source url
    try:
        data_rdfgraph.parse(harvest_source.url, format=rdf_format)
    except RDFParserException as e:
        raise RDFParserException(
            'Error parsing the RDF file during shacl validation: {0}'.format(
                e))
Ejemplo n.º 57
0
def _delete_harvest_source_object(context, data_dict):
    '''
    Deactivate the HarvestSource whose id is given in the data dict of the
    harvest_source dataset and abort its jobs in ``New`` status.

    As with datasets, the source object is not actually deleted, just
    flagged as inactive.  No validation or authorization is done here, so
    this function is not to be used directly to delete harvest sources.

    :param data_dict: A standard package data_dict; its ``id`` is used to
        look up the HarvestSource

    :returns: The deactivated HarvestSource object
    :rtype: HarvestSource object

    :raises ObjectNotFound: if no harvest source matches the id
    '''

    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs.  Guard on the actual row count: the filter
    # result may be truthy even when it matches nothing, which produced an
    # "Aborting 0 jobs" log entry.
    jobs = HarvestJob.filter(source=source, status=u'New')
    pending_count = jobs.count() if jobs else 0
    if pending_count:
        log.info('Aborting %i jobs due to deleted harvest source', pending_count)
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
def harvest_source_delete(context, data_dict):
    '''
    Flag a harvest source as inactive and abort its jobs in ``New`` status.

    The source record is kept; only its ``active`` flag is set to False.

    :param id: the id of the harvest source to delete
    :type id: string

    :returns: True once the source has been deactivated
    :rtype: bool

    :raises NotFound: if the harvest source does not exist
    '''
    log.info('Deleting harvest source: %r', data_dict)
    check_access('harvest_source_delete', context, data_dict)

    source_id = data_dict.get('id')
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs.  The row count is the guard because the
    # filter result may be truthy even when it matches no jobs.
    jobs = HarvestJob.filter(source=source, status=u'New')
    pending_count = jobs.count() if jobs else 0
    if pending_count:
        log.info('Aborting %i jobs due to deleted harvest source', pending_count)
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.info('Harvest source %s deleted', source_id)
    return True
Ejemplo n.º 59
0
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself.  This is useful to clean history of long running
    harvest sources to start again fresh.

    The steps are: remove the source's documents from the search index,
    run one SQL script (wrapped in ``begin`` / ``commit``) that deletes the
    harvest objects, jobs, errors and the packages created by the source
    together with their dependent rows, and finally call the
    ``harvest_source_reindex`` action for the source.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dict with the id of the cleared source: ``{'id': <id>}``
    :rtype: dict

    :raises NotFound: if the harvest source does not exist
    '''

    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # From here on use the id stored on the source object, which is what
    # gets interpolated into the SQL below.
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    model = context['model']

    # CKAN-2.6 or above: related don't exist any more
    if toolkit.check_ckan_version(max_version='2.5.99'):

        # Collect the ids of the `related` rows linked to this source's
        # datasets now; they are deleted by id near the end of the script.
        sql = '''select id from related where id in (
                  select related_id from related_dataset where dataset_id in (
                      select package_id from harvest_object
                      where harvest_source_id = '{harvest_source_id}'));'''.format(
            harvest_source_id=harvest_source_id)
        result = model.Session.execute(sql)
        ids = []
        for row in result:
            ids.append(row[0])
        # Rendered as a SQL tuple literal; with no ids this is ('')
        related_ids = "('" + "','".join(ids) + "')"

    # Start of the main script: mark the source's packages as 'to_delete'.
    # Every later statement selects packages by that state.
    # NOTE(review): those selects are not restricted to this source, so any
    # package already in state 'to_delete' would be affected too - confirm
    # that no other code path leaves packages in that state.
    sql = '''begin;
        update package set state = 'to_delete' where id in (
            select package_id from harvest_object
            where harvest_source_id = '{harvest_source_id}');'''.format(
        harvest_source_id=harvest_source_id)

    # CKAN-2.3 or above: delete resource views, resource revisions & resources
    if toolkit.check_ckan_version(min_version='2.3'):
        sql += '''
        delete from resource_view where resource_id in (
            select id from resource where package_id in (
                select id from package where state = 'to_delete'));
        delete from resource_revision where package_id in (
            select id from package where state = 'to_delete');
        delete from resource where package_id in (
            select id from package where state = 'to_delete');
        '''
    # Backwards-compatibility: support ResourceGroup (pre-CKAN-2.3)
    else:
        sql += '''
        delete from resource_revision where resource_group_id in (
            select id from resource_group where package_id in (
                select id from package where state = 'to_delete'));
        delete from resource where resource_group_id in (
            select id from resource_group where package_id in (
                select id from package where state = 'to_delete'));
        delete from resource_group_revision where package_id in (
            select id from package where state = 'to_delete');
        delete from resource_group where package_id in (
            select id from package where state = 'to_delete');
        '''
    # CKAN pre-2.5: authz models were removed in migration 078
    if toolkit.check_ckan_version(max_version='2.4.99'):
        sql += '''
        delete from package_role where package_id in (
            select id from package where state = 'to_delete');
        delete from user_object_role where id not in (
            select user_object_role_id from package_role)
            and context = 'Package';
        '''

    # Harvest bookkeeping rows (object errors/extras, objects, gather errors,
    # jobs) followed by the rows that reference the marked packages.
    sql += '''
    delete from harvest_object_error where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (
        select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_tag_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from member_revision where table_id in (
        select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from package_revision where id in (
        select id from package where state = 'to_delete');
    delete from package_tag where package_id in (
        select id from package where state = 'to_delete');
    delete from package_extra where package_id in (
        select id from package where state = 'to_delete');
    delete from package_relationship_revision where subject_package_id in (
        select id from package where state = 'to_delete');
    delete from package_relationship_revision where object_package_id in (
        select id from package where state = 'to_delete');
    delete from package_relationship where subject_package_id in (
        select id from package where state = 'to_delete');
    delete from package_relationship where object_package_id in (
        select id from package where state = 'to_delete');
    delete from member where table_id in (
        select id from package where state = 'to_delete');
     '''.format(harvest_source_id=harvest_source_id)

    # Same version test as the `related` lookup above, so `related_ids` is
    # defined whenever this branch is taken.  The packages themselves are
    # deleted last in both branches.
    if toolkit.check_ckan_version(max_version='2.5.99'):
        sql += '''
        delete from related_dataset where dataset_id in (
            select id from package where state = 'to_delete');
        delete from related where id in {related_ids};
        delete from package where id in (
            select id from package where state = 'to_delete');
        '''.format(related_ids=related_ids)
    else:
        # CKAN-2.6 or above: related don't exist any more
        sql += '''
        delete from package where id in (
            select id from package where state = 'to_delete');
        '''

    sql += '''
    commit;
    '''
    # The whole script is sent in a single execute call
    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    get_action('harvest_source_reindex')(context, {'id': harvest_source_id})

    return {'id': harvest_source_id}