예제 #1
0
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string

    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    sql = '''begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;'''.format(
        harvest_source_id=harvest_source_id)

    model = context['model']

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(context, {
        'id': harvest_source_id
    })

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
예제 #2
0
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''

    harvest_source_id = logic.get_or_bust(data_dict, 'id')
    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string'in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}
    if package_dict.get('config'):
        config = json.loads(package_dict['config'])
        for key, value in package_dict.iteritems():
            if key not in config:
                new_dict[key] = value
    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
예제 #3
0
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type==DATASET_TYPE_NAME) \
                            .filter(model.Package.state==u'active') \
                            .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'validate': False, 'ignore_auth': True})
        package_dict = logic.get_action('package_show')(context, {
            'id': package.id
        })
        log.debug('Updating search index for harvest source {0}'.format(
            package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(
        len(packages)))
예제 #4
0
def harvest_sources_reindex(context, data_dict):
    """
        Reindexes all harvest source datasets with the latest status
    """
    log.info("Reindexing all harvest sources")
    check_access("harvest_sources_reindex", context, data_dict)

    model = context["model"]

    packages = (
        model.Session.query(model.Package)
        .filter(model.Package.type == DATASET_TYPE_NAME)
        .filter(model.Package.state == u"active")
        .all()
    )

    package_index = PackageSearchIndex()
    for package in packages:
        if "extras_as_string" in context:
            del context["extras_as_string"]
        context.update({"validate": False, "ignore_auth": True})
        package_dict = logic.get_action("package_show")(context, {"id": package.id})
        log.debug("Updating search index for harvest source {0}".format(package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info("Updated search index for {0} harvest sources".format(len(packages)))
예제 #5
0
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type==DATASET_TYPE_NAME) \
                            .filter(model.Package.state==u'active') \
                            .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string'in context:
            del context['extras_as_string']
        context.update({'ignore_auth': True})
        package_dict = logic.get_action('harvest_source_show')(context,
            {'id': package.id})
        log.debug('Updating search index for harvest source {0}'.format(package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(len(packages)))
예제 #6
0
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''

    harvest_source_id = logic.get_or_bust(data_dict, 'id')

    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source_id
        })
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}

    try:
        config = json.loads(package_dict.get('config', ''))
    except ValueError:
        config = {}
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
예제 #7
0
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful to clean history of long running harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string

    """
    check_access("harvest_source_clear", context, data_dict)

    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;""".format(
        harvest_source_id=harvest_source_id
    )

    model = context["model"]

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({"validate": False, "ignore_auth": True})
    package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {"id": harvest_source_id}
예제 #8
0
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
예제 #9
0
def _update_search_index(package_id, log):
    '''
    Tells CKAN to update its search index for a given package.
    '''
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex
    package_index = PackageSearchIndex()
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session,
                'use_cache': False, 'validate': False}
    package = toolkit.get_action('package_show')(context_, {'id': package_id})
    package_index.index_package(package, defer_commit=False)
    log.info('Search indexed %s', package['name'])
예제 #10
0
def _update_search_index(package_id, log):
    """
    Tells CKAN to update its search index for a given package.
    """
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()
    context_ = {"model": model, "ignore_auth": True, "session": model.Session, "use_cache": False, "validate": False}
    package = toolkit.get_action("package_show")(context_, {"id": package_id})
    package_index.index_package(package, defer_commit=False)
    log.info("Search indexed %s", package["name"])
예제 #11
0
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()
        
        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()
        
        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()
            
            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)
                
            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(get_action('package_show')({'validate': False, 'ignore_auth': True}, {'id': source.id}))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
예제 #12
0
def harvest_source_reindex(context, data_dict):
    """Reindex a single harvest source"""

    harvest_source_id = logic.get_or_bust(data_dict, "id")
    defer_commit = context.get("defer_commit", False)

    if "extras_as_string" in context:
        del context["extras_as_string"]
    context.update({"ignore_auth": True})
    package_dict = logic.get_action("harvest_source_show")(context, {"id": harvest_source_id})
    log.debug("Updating search index for harvest source {0}".format(harvest_source_id))

    # Remove configuration values
    new_dict = {}
    if package_dict.get("config"):
        config = json.loads(package_dict["config"])
        for key, value in package_dict.iteritems():
            if key not in config:
                new_dict[key] = value
    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
예제 #13
0
    def import_stage(self, harvest_object):
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
        }

        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for harvest object: %s', harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        self._set_source_config(harvest_object.source.config)

        if self.force_import:
            status = 'change'
        else:
            status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = model.Session.query(HarvestObject) \
                                       .filter(HarvestObject.guid == harvest_object.guid) \
                                       .filter(HarvestObject.current == True).first() # noqa

        if status == 'delete':
            # Delete package
            context.update({
                'ignore_auth': True,
            })
            if harvest_object.package_id:
                p.toolkit.get_action('package_delete')(
                    context, {
                        'id': harvest_object.package_id
                    })
                log.info('Deleted package {0} with guid {1}'.format(
                    harvest_object.package_id, harvest_object.guid))

            return True

        # Check if it is a non ISO document
        original_document = self._get_object_extra(harvest_object,
                                                   'original_document')
        original_format = self._get_object_extra(harvest_object,
                                                 'original_format')
        if original_document and original_format:
            # DEPRECATED use the ISpatialHarvester interface method
            self.__base_transform_to_iso_called = False
            content = self.transform_to_iso(original_document, original_format,
                                            harvest_object)
            if not self.__base_transform_to_iso_called:
                log.warn(
                    'Deprecation warning: calling transform_to_iso directly is deprecated. '
                    +
                    'Please use the ISpatialHarvester interface method instead.'
                )

            for harvester in p.PluginImplementations(ISpatialHarvester):
                content = harvester.transform_to_iso(original_document,
                                                     original_format,
                                                     harvest_object)

            if content:
                harvest_object.content = content
            else:
                self._save_object_error('Transformation to ISO failed',
                                        harvest_object, 'Import')
                return False
        else:
            if harvest_object.content is None:
                self._save_object_error(
                    'Empty content for object {0}'.format(harvest_object.id),
                    harvest_object, 'Import')
                return False

            # Validate ISO document
            is_valid, profile, errors = self._validate_document(
                harvest_object.content, harvest_object)
            if not is_valid:
                # If validation errors were found, import will stop unless
                # configuration per source or per instance says otherwise
                continue_import = p.toolkit.asbool(config.get('ckanext.spatial.harvest.continue_on_validation_errors',
                                                              False)) or \
                    self.source_config.get('continue_on_validation_errors')
                if not continue_import:
                    return False

        # Parse ISO document
        try:

            iso_parser = ISODocument(harvest_object.content)
            iso_values = iso_parser.read_values()
        except Exception as e:
            self._save_object_error(
                'Error parsing ISO document for object {0}: {1}'.format(
                    harvest_object.id, six.text_type(e)), harvest_object,
                'Import')
            return False

        # Flag previous object as not current anymore
        if previous_object and not self.force_import:
            previous_object.current = False
            previous_object.add()

        # Update GUID with the one on the document
        iso_guid = iso_values['guid']
        if iso_guid and harvest_object.guid != iso_guid:
            # First make sure there already aren't current objects
            # with the same guid
            existing_object = model.Session.query(HarvestObject.id) \
                            .filter(HarvestObject.guid == iso_guid) \
                            .filter(HarvestObject.current == True).first() # noqa
            if existing_object:
                self._save_object_error(
                    'Object {0} already has this guid {1}'.format(
                        existing_object.id, iso_guid), harvest_object,
                    'Import')
                return False

            harvest_object.guid = iso_guid
            harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
            m = hashlib.md5()
            m.update(harvest_object.content.encode('utf8', 'ignore'))
            harvest_object.guid = m.hexdigest()
            harvest_object.add()

        # Get document modified date
        try:
            metadata_modified_date = dateutil.parser.parse(
                iso_values['metadata-date'], ignoretz=True)
        except ValueError:
            self._save_object_error(
                'Could not extract reference date for object {0} ({1})'.format(
                    harvest_object.id, iso_values['metadata-date']),
                harvest_object, 'Import')
            return False

        harvest_object.metadata_modified_date = metadata_modified_date
        harvest_object.add()

        # Build the package dict
        package_dict = self.get_package_dict(iso_values, harvest_object)
        for harvester in p.PluginImplementations(ISpatialHarvester):
            package_dict = harvester.get_package_dict(
                context, {
                    'package_dict': package_dict,
                    'iso_values': iso_values,
                    'xml_tree': iso_parser.xml_tree,
                    'harvest_object': harvest_object,
                })
        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        # Create / update the package
        context.update({
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        })

        if self._site_user and context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty, six.text_type]

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be be able to link the extent to the package.
            package_dict['id'] = six.text_type(uuid.uuid4())
            package_schema['id'] = [six.text_type]

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('Created new package %s with guid %s', package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % six.text_type(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':

            # Check if the modified date is more recent
            if not self.force_import and previous_object \
                    and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:

                # Assign the previous job id to the new object to
                # avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the
                # harvest object
                if ((config.get('ckanext.spatial.harvest.reindex_unchanged',
                                True) != 'False'
                     or self.source_config.get('reindex_unchanged') != 'False')
                        and harvest_object.package_id):
                    context.update({'validate': False, 'ignore_auth': True})
                    try:
                        package_dict = logic.get_action('package_show')(
                            context, {
                                'id': harvest_object.package_id
                            })
                    except p.toolkit.ObjectNotFound:
                        pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

                log.info('Document with GUID %s unchanged, skipping...' %
                         (harvest_object.guid))
            else:
                package_schema = logic.schema.default_update_package_schema()
                package_schema['tags'] = tag_schema
                context['schema'] = package_schema

                package_dict['id'] = harvest_object.package_id
                try:
                    package_id = p.toolkit.get_action('package_update')(
                        context, package_dict)
                    log.info('Updated package %s with guid %s', package_id,
                             harvest_object.guid)
                except p.toolkit.ValidationError as e:
                    self._save_object_error(
                        'Validation Error: %s' %
                        six.text_type(e.error_summary), harvest_object,
                        'Import')
                    return False

        model.Session.commit()

        return True
    def import_stage(self, harvest_object):

        log = logging.getLogger(__name__ + '.import')
        log.debug('%s: Import stage for harvest object: %s',
                  self.harvester_name(), harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        if not harvest_object.content:
            log.error('Harvest object contentless')
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_source_config(harvest_object.source.config)

        status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = Session.query(HarvestObject) \
                          .filter(HarvestObject.guid == harvest_object.guid) \
                          .filter(HarvestObject.current == True) \
                          .first()

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        if status == 'delete':
            # Delete package
            p.toolkit.get_action('package_delete')(
                context, {
                    'id': harvest_object.package_id
                })
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))

            return True

        # Flag previous object as not current anymore
        if previous_object:
            previous_object.current = False
            previous_object.add()

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
            self._save_object_error(
                'Missing GUID for object {0}'.format(harvest_object.id),
                harvest_object, 'Import')
            return False

        # pre-check to skip resource logic in case no changes occurred remotely
        if status == 'change':

            # Check if the document has changed
            m = hashlib.md5()
            m.update(previous_object.content.encode())
            old_md5 = m.hexdigest()

            m = hashlib.md5()
            m.update(harvest_object.content.encode())
            new_md5 = m.hexdigest()

            if old_md5 == new_md5:

                # Assign the previous job id to the new object to # avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                harvest_object.metadata_modified_date = previous_object.metadata_modified_date
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the harvest object
                context.update({'validate': False, 'ignore_auth': True})
                try:
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': harvest_object.package_id
                        })
                except p.toolkit.ObjectNotFound:
                    pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

                log.info('%s document with GUID %s unchanged, skipping...',
                         self.harvester_name(), harvest_object.guid)
                model.Session.commit()

                return "unchanged"

        # Build the package dict
        package_dict, metadata = self.create_package_dict(
            harvest_object.guid, harvest_object.content)

        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        package_dict['name'] = self._gen_new_name(package_dict['title'])

        # We need to get the owner organization (if any) from the harvest source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        self.attach_resources(metadata, package_dict, harvest_object)

        # Create / update the package

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        }
        if context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty]

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be be able to link the extent to the package.
            package_dict['id'] = uuid.uuid4().hex
            package_schema['id'] = []

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('%s: Created new package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':
            # we know the internal document did change, bc of a md5 hash comparison done above

            package_schema = logic.schema.default_update_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            package_dict['id'] = harvest_object.package_id
            try:
                package_id = p.toolkit.get_action('package_update')(
                    context, package_dict)
                log.info('%s updated package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        model.Session.commit()

        return True
예제 #15
0
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
예제 #16
0
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"), (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
예제 #17
0
def harvest_jobs_run(context,data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run',context,data_dict)

    session = context['session']

    source_id = data_dict.get('source_id',None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context,{'source_id':source_id,'status':u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string'in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(context,
                            {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context,{'id':job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs