Example #1
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful for cleaning up the history of long-running harvest sources so they can start fresh.

    :param id: the id of the harvest source to clear
    :type id: string

    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

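    # Everything below runs as a single SQL transaction: mark this source's
    # datasets for deletion, then purge its harvest objects, jobs and errors.
    # harvest_source_id was resolved via HarvestSource.get above, so the
    # formatted value is a known id rather than raw user input.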
    sql = '''begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;'''.format(
        harvest_source_id=harvest_source_id)

    model = context['model']

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(context, {
        'id': harvest_source_id
    })

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
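For context, a hedged sketch of calling this action from server-side code; the context keys and the source id below are placeholders, not values from the example, and it assumes a running CKAN application with the action registered as in ckanext-harvest.

# Minimal sketch of invoking the action above; assumes a CKAN app context.
# The user name and harvest source id are placeholders.
from ckan import model
from ckan.plugins import toolkit

context = {'model': model, 'session': model.Session, 'user': 'admin'}
result = toolkit.get_action('harvest_source_clear')(
    context, {'id': 'my-harvest-source-id'})
print(result)  # e.g. {'id': 'my-harvest-source-id'}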
Example #2
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type == DATASET_TYPE_NAME) \
                            .filter(model.Package.state == u'active') \
                            .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'ignore_auth': True})
        package_dict = logic.get_action('harvest_source_show')(context,
            {'id': package.id})
        log.debug('Updating search index for harvest source {0}'.format(package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(len(packages)))
Example #3
def harvest_sources_reindex(context, data_dict):
    """
        Reindexes all harvest source datasets with the latest status
    """
    log.info("Reindexing all harvest sources")
    check_access("harvest_sources_reindex", context, data_dict)

    model = context["model"]

    packages = (
        model.Session.query(model.Package)
        .filter(model.Package.type == DATASET_TYPE_NAME)
        .filter(model.Package.state == u"active")
        .all()
    )

    package_index = PackageSearchIndex()

    reindex_context = {"defer_commit": True}
    for package in packages:
        get_action("harvest_source_reindex")(reindex_context, {"id": package.id})

    package_index.commit()

    return True
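The `defer_commit=True` pattern used in these reindex examples batches index writes: each `index_package` call updates Solr but postpones the commit, and the single `package_index.commit()` at the end makes all changes visible at once. A minimal sketch of that batching idea, where the input package dicts are placeholders:

# Sketch of batched indexing with one commit at the end; assumes
# package_dicts is an iterable of dicts as returned by package_show.
from ckan.lib.search.index import PackageSearchIndex

def reindex_all(package_dicts):
    package_index = PackageSearchIndex()
    for package_dict in package_dicts:
        # Defer the Solr commit so we do not commit once per dataset
        package_index.index_package(package_dict, defer_commit=True)
    # One commit makes all the deferred updates visible together
    package_index.commit()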
Example #4
def harvest_sources_reindex(context, data_dict):
    """
        Reindexes all harvest source datasets with the latest status
    """
    log.info("Reindexing all harvest sources")
    check_access("harvest_sources_reindex", context, data_dict)

    model = context["model"]

    packages = (
        model.Session.query(model.Package)
        .filter(model.Package.type == DATASET_TYPE_NAME)
        .filter(model.Package.state == u"active")
        .all()
    )

    package_index = PackageSearchIndex()
    for package in packages:
        if "extras_as_string" in context:
            del context["extras_as_string"]
        context.update({"validate": False, "ignore_auth": True})
        package_dict = logic.get_action("package_show")(context, {"id": package.id})
        log.debug("Updating search index for harvest source {0}".format(package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info("Updated search index for {0} harvest sources".format(len(packages)))
Example #5
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type == DATASET_TYPE_NAME) \
                            .filter(model.Package.state == u'active') \
                            .all()

    package_index = PackageSearchIndex()

    reindex_context = {'defer_commit': True}
    for package in packages:
        get_action('harvest_source_reindex')(reindex_context, {
            'id': package.id
        })

    package_index.commit()

    return True
Example #6
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''

    harvest_source_id = logic.get_or_bust(data_dict, 'id')
    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}
    config = {}
    if package_dict.get('config'):
        config = json.loads(package_dict['config'])
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value
    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
Example #7
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source'''

    harvest_source_id = logic.get_or_bust(data_dict, 'id')

    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source_id
        })
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values
    new_dict = {}

    try:
        config = json.loads(package_dict.get('config', ''))
    except ValueError:
        config = {}
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
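Examples #6 and #7 strip keys that appear in the source's `config` JSON before indexing; #7 is the safer variant because it tolerates a missing or malformed `config`. A small illustration of the filtering step, using placeholder data rather than a real harvest source:

# Illustration of the config-stripping step from Example #7.
import json

package_dict = {'id': 'abc', 'name': 'my-source',
                'read_only': True, 'config': '{"read_only": true}'}
try:
    config = json.loads(package_dict.get('config', ''))
except ValueError:
    config = {}
new_dict = {k: v for k, v in package_dict.items() if k not in config}
print(sorted(new_dict))  # ['config', 'id', 'name'] - 'read_only' removed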
Example #8
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself.
    This is useful for cleaning up the history of long-running harvest sources so they can start fresh.

    :param id: the id of the harvest source to clear
    :type id: string

    """
    check_access("harvest_source_clear", context, data_dict)

    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (select id from package where state = 'to_delete' );
    delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
    delete from member_revision where table_id in (select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
    delete from package_revision where id in (select id from package where state = 'to_delete');
    delete from package_tag where package_id in (select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (select id from package where state = 'to_delete');
    delete from member where table_id in (select id from package where state = 'to_delete');
    delete from resource_group where package_id  in (select id from package where state = 'to_delete');
    delete from package where id in (select id from package where state = 'to_delete'); commit;""".format(
        harvest_source_id=harvest_source_id
    )

    model = context["model"]

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({"validate": False, "ignore_auth": True})
    package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {"id": harvest_source_id}
Example #9
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = str(self.args[1])
        source = HarvestSource.get(source_id)

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
Example #10
def _update_search_index(package_id, log):
    '''
    Tells CKAN to update its search index for a given package.
    '''
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex
    package_index = PackageSearchIndex()
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session,
                'use_cache': False, 'validate': False}
    package = toolkit.get_action('package_show')(context_, {'id': package_id})
    package_index.index_package(package, defer_commit=False)
    log.info('Search indexed %s', package['name'])
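A hedged usage sketch for the helper above, assuming it runs inside a CKAN process with `toolkit` imported at module level; the package id is a placeholder.

# Sketch of calling the helper above from, e.g., a maintenance task.
import logging

log = logging.getLogger(__name__)
_update_search_index('some-package-id', log)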
Example #11
def _update_search_index(package_id, log):
    """
    Tells CKAN to update its search index for a given package.
    """
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()
    context_ = {"model": model, "ignore_auth": True, "session": model.Session, "use_cache": False, "validate": False}
    package = toolkit.get_action("package_show")(context_, {"id": package_id})
    package_index.index_package(package, defer_commit=False)
    log.info("Search indexed %s", package["name"])
Example #12
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()
        
        source_id = str(self.args[1])
        source = HarvestSource.get(source_id)
        
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()
        
        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()
            
            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)
                
            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(get_action('package_show')({'validate': False, 'ignore_auth': True}, {'id': source.id}))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
Example #13
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type == DATASET_TYPE_NAME) \
                            .filter(model.Package.state == u'active') \
                            .all()

    package_index = PackageSearchIndex()
    for package in packages:
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'validate': False, 'ignore_auth': True})
        package_dict = logic.get_action('package_show')(context, {
            'id': package.id
        })
        log.debug('Updating search index for harvest source {0}'.format(
            package.id))
        package_index.index_package(package_dict, defer_commit=True)

    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(
        len(packages)))
Example #14
def harvest_source_reindex(context, data_dict):
    """Reindex a single harvest source"""

    harvest_source_id = logic.get_or_bust(data_dict, "id")
    defer_commit = context.get("defer_commit", False)

    if "extras_as_string" in context:
        del context["extras_as_string"]
    context.update({"ignore_auth": True})
    package_dict = logic.get_action("harvest_source_show")(context, {"id": harvest_source_id})
    log.debug("Updating search index for harvest source {0}".format(harvest_source_id))

    # Remove configuration values
    new_dict = {}
    config = {}
    if package_dict.get("config"):
        config = json.loads(package_dict["config"])
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value
    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
Example #15
def harvest_sources_reindex(context, data_dict):
    '''
        Reindexes all harvest source datasets with the latest status
    '''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    packages = model.Session.query(model.Package) \
                            .filter(model.Package.type == DATASET_TYPE_NAME) \
                            .filter(model.Package.state == u'active') \
                            .all()

    package_index = PackageSearchIndex()

    reindex_context = {'defer_commit': True}
    for package in packages:
        get_action('harvest_source_reindex')(reindex_context, {'id': package.id})

    package_index.commit()

    return True
Example #16
    def import_stage(self, harvest_object):

        log = logging.getLogger(__name__ + '.import')
        log.debug('%s: Import stage for harvest object: %s',
                  self.harvester_name(), harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        if not harvest_object.content:
            log.error('Harvest object contentless')
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_source_config(harvest_object.source.config)

        status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = Session.query(HarvestObject) \
                          .filter(HarvestObject.guid == harvest_object.guid) \
                          .filter(HarvestObject.current == True) \
                          .first()

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        if status == 'delete':
            # Delete package
            p.toolkit.get_action('package_delete')(
                context, {
                    'id': harvest_object.package_id
                })
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))

            return True

        # Flag previous object as not current anymore
        if previous_object:
            previous_object.current = False
            previous_object.add()

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
            self._save_object_error(
                'Missing GUID for object {0}'.format(harvest_object.id),
                harvest_object, 'Import')
            return False

        # pre-check to skip resource logic in case no changes occurred remotely
        if status == 'change':

            # Check if the document has changed
            m = hashlib.md5()
            m.update(previous_object.content.encode())
            old_md5 = m.hexdigest()

            m = hashlib.md5()
            m.update(harvest_object.content.encode())
            new_md5 = m.hexdigest()

            if old_md5 == new_md5:

                # Assign the previous job id to the new object to avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                harvest_object.metadata_modified_date = previous_object.metadata_modified_date
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the harvest object
                context.update({'validate': False, 'ignore_auth': True})
                try:
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': harvest_object.package_id
                        })
                except p.toolkit.ObjectNotFound:
                    pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

                log.info('%s document with GUID %s unchanged, skipping...',
                         self.harvester_name(), harvest_object.guid)
                model.Session.commit()

                return "unchanged"

        # Build the package dict
        package_dict, metadata = self.create_package_dict(
            harvest_object.guid, harvest_object.content)

        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        package_dict['name'] = self._gen_new_name(package_dict['title'])

        # We need to get the owner organization (if any) from the harvest source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        self.attach_resources(metadata, package_dict, harvest_object)

        # Create / update the package

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        }
        if context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty]

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be able to link the extent to the package.
            package_dict['id'] = uuid.uuid4().hex
            package_schema['id'] = []

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('%s: Created new package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':
            # We know the internal document did change, because of the MD5 hash comparison done above

            package_schema = logic.schema.default_update_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            package_dict['id'] = harvest_object.package_id
            try:
                package_id = p.toolkit.get_action('package_update')(
                    context, package_dict)
                log.info('%s updated package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        model.Session.commit()

        return True
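The `status == 'change'` branch above decides whether anything changed remotely by comparing MD5 digests of the previous and current harvest object content. The comparison in isolation, with placeholder strings:

# Stand-alone sketch of the MD5 comparison used above to detect
# unchanged documents; the contents are placeholders.
import hashlib

old_content = '<metadata>...</metadata>'
new_content = '<metadata>...</metadata>'

old_md5 = hashlib.md5(old_content.encode()).hexdigest()
new_md5 = hashlib.md5(new_content.encode()).hexdigest()

if old_md5 == new_md5:
    print('unchanged, skipping import')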
Example #17
    def import_stage(self, harvest_object):
        # The import stage actually creates the dataset.

        log.debug('In %s import_stage' % repr(self))

        # Get default values.
        harvester_config = self.load_config(harvest_object.source)

        # Get the metadata that we stored in the HarvestObject's content field.
        dataset = json.loads(harvest_object.content)

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        owner_org = None
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            owner_org = source_dataset.owner_org

        # Assemble basic information about the dataset.
        pkg = {
            "name":
            self.make_package_name(dataset["title"], harvest_object.guid,
                                   False),
            "state":
            "active",  # in case was previously deleted
            "owner_org":
            owner_org,
            "extras": [{
                "key": "source_url",
                "value": harvest_object.source.url,
            }, {
                "key": "source_title",
                "value": harvest_object.source.title,
            }, {
                "key": "source_identifier",
                "value": dataset["identifier"],
            }, {
                "key":
                "source_hash",
                "value":
                self.make_upstream_content_hash(dataset,
                                                harvest_object.source),
            }, {
                "key": "harvest_harvester_version",
                "value": self.HARVESTER_VERSION,
            }, {
                "key": "harvest_last_updated",
                "value": datetime.datetime.utcnow().isoformat(),
            }]
        }

        # Set default values from the harvester configuration. Do this before
        # applying values from the harvest source so that the values can be
        # overridden.
        self.set_extras(pkg, harvester_config["defaults"])

        # Set specific information about the dataset.
        self.set_dataset_info(pkg, dataset, harvester_config)

        # Set "overrides" values from the harvester configuration, overriding
        # anything found in the harvester source.
        self.set_extras(pkg, harvester_config["overrides"])

        # Try to update an existing package with the ID set in harvest_object.guid. If that GUID
        # corresponds with an existing package, get its current metadata.
        try:
            existing_pkg = get_action('package_show')(self.context(), {
                "id": harvest_object.guid
            })
        except NotFound:
            existing_pkg = None

        if existing_pkg:
            # Update the existing metadata with the new information.

            # But before doing that, try to avoid replacing existing resources with new resources
            # by assigning resource IDs where they match up.
            for res in pkg.get("resources", []):
                for existing_res in existing_pkg.get("resources", []):
                    if res["url"] == existing_res["url"]:
                        res["id"] = existing_res["id"]

            existing_pkg.update(
                pkg
            )  # preserve other fields that we're not setting, but clobber extras
            pkg = existing_pkg

            log.warn('updating package %s (%s) from %s' %
                     (pkg["name"], pkg["id"], harvest_object.source.url))
            pkg = get_action('package_update')(self.context(), pkg)
        else:
            # It doesn't exist yet. Create a new one.
            try:
                pkg = get_action('package_create')(self.context(), pkg)
                log.warn('created package %s (%s) from %s' %
                         (pkg["name"], pkg["id"], harvest_object.source.url))
            except Exception:
                log.error('failed to create package %s from %s' %
                          (pkg["name"], harvest_object.source.url))
                raise

        # Flag the other HarvestObjects linking to this package as not current anymore
        for ob in model.Session.query(HarvestObject).filter_by(
                package_id=pkg["id"]):
            ob.current = False
            ob.save()

        # Flag this HarvestObject as the current harvest object
        harvest_object.package_id = pkg['id']
        harvest_object.current = True
        harvest_object.save()

        # Now that the package and the harvest source are associated, re-index the
        # package so it knows it is part of the harvest source. The CKAN harvester
        # does this by creating the association before the package is saved by
        # overriding the GUID creation on a new package. That's too difficult.
        # So here we end up indexing twice.
        PackageSearchIndex().index_package(pkg)

        return True
Example #18
class DatasetHarvesterBase(HarvesterBase):
    '''
    A Harvester for datasets.
    '''

    # SUBCLASSES MUST IMPLEMENT
    #HARVESTER_VERSION = "1.0"
    #def info(self):
    #    return {
    #        'name': 'harvester_base',
    #        'title': 'Base Harvester',
    #        'description': 'Abstract base class for harvesters that pull in datasets.',
    #    }

    def validate_config(self, config):
        if not config:
            return config
        # Parse to validate the YAML; the parsed result is otherwise unused
        config_obj = yaml.safe_load(config)
        return config

    def context(self):
        # Reusing the dict across calls to action methods can be dangerous, so
        # create a new dict every time we need it.
        # Setting validate to False is critical for getting the harvester plugin
        # to set extra fields on the package during indexing (see ckanext/harvest/plugin.py
        # line 99, https://github.com/okfn/ckanext-harvest/blob/master/ckanext/harvest/plugin.py#L99).
        return {"user": "******", "ignore_auth": True, "validate": False}

    # SUBCLASSES MUST IMPLEMENT
    def load_remote_catalog(self, harvest_job):
        # Loads a remote data catalog. This function must return a JSON-able
        # list of dicts, each dict a dataset containing an 'identifier' field
        # with a locally unique identifier string and a 'title' field.
        raise Exception("Not implemented")

    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (like a /data.json file) for
        # a list of datasets to import.

        log.debug('In %s gather_stage (%s)' %
                  (repr(self), harvest_job.source.url))

        # Start gathering.
        source = self.load_remote_catalog(harvest_job)
        if len(source) == 0: return []

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_identifier,
        # which corresponds to the remote catalog's 'identifier' field.
        # Make a mapping so we know how to update existing records.
        existing_datasets = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except Exception:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "source_identifier")
            if sid:
                existing_datasets[sid] = pkg

        # Create HarvestObjects for any records in the remote catalog.

        object_ids = []
        seen_datasets = set()

        for dataset in source:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.

            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this dataset
                # in the package so we can avoid updating datasets that
                # don't look like they've changed.
                if pkg.get("state") == "active" \
                    and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, harvest_job.source):
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the remote catalog file.
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                content=json.dumps(dataset, sort_keys=True)
            )  # use sort_keys to preserve field order so hashes of this string are constant from run to run
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets: continue  # was just updated
            if pkg.get("state") == "deleted": continue  # already deleted
            pkg["state"] = "deleted"
            pkg["name"] = self.make_package_name(
                pkg["title"], pkg["id"], True
            )  # try to prevent name clash by giving it a "deleted-" name
            log.warn('deleting package %s (%s) because it is no longer in %s' %
                     (pkg["name"], pkg["id"], harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)

        return object_ids

    def fetch_stage(self, harvest_object):
        # Nothing to do in this stage because we captured complete
        # dataset metadata from the first request to the remote catalog file.
        return True

    # SUBCLASSES MUST IMPLEMENT
    def set_dataset_info(self, pkg, dataset, dataset_defaults):
        # Sets package metadata on 'pkg' using the remote catalog's metadata
        # in 'dataset' and default values as configured in 'dataset_defaults'.
        raise Exception("Not implemented.")

    def import_stage(self, harvest_object):
        # The import stage actually creates the dataset.

        log.debug('In %s import_stage' % repr(self))

        # Get default values.
        dataset_defaults = None
        try:
            source_config = yaml.safe_load(harvest_object.source.config)

            try:
                dataset_defaults = source_config["defaults"]
            except TypeError:
                pass
            except KeyError:
                pass
        except Exception as e:
            print(e)
        if not dataset_defaults: dataset_defaults = {}

        # Get the metadata that we stored in the HarvestObject's content field.
        h = HTMLParser.HTMLParser()
        dataset = json.loads(h.unescape(harvest_object.content))

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        owner_org = None
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            owner_org = source_dataset.owner_org

        # Assemble basic information about the dataset.
        pkg = {
            "name":
            self.make_package_name(dataset["title"], harvest_object.guid,
                                   False),
            "state":
            "active",  # in case was previously deleted
            "owner_org":
            owner_org,
            "extras": [{
                "key": "source_url",
                "value": harvest_object.source.url,
            }, {
                "key": "source_title",
                "value": harvest_object.source.title,
            }, {
                "key": "source_identifier",
                "value": dataset["identifier"],
            }, {
                "key":
                "source_hash",
                "value":
                self.make_upstream_content_hash(dataset,
                                                harvest_object.source),
            }, {
                "key": "harvest_harvester_version",
                "value": self.HARVESTER_VERSION,
            }]
        }

        # Set specific information about the dataset.
        self.set_dataset_info(pkg, dataset, dataset_defaults)

        # Try to update an existing package with the ID set in harvest_object.guid. If that GUID
        # corresponds with an existing package, get its current metadata.
        try:
            existing_pkg = get_action('package_show')(self.context(), {
                "id": harvest_object.guid
            })
        except NotFound:
            existing_pkg = None

        if existing_pkg:
            # Update the existing metadata with the new information.

            # But before doing that, try to avoid replacing existing resources with new resources
            # by assigning resource IDs where they match up.
            for res in pkg.get("resources", []):
                for existing_res in existing_pkg.get("resources", []):
                    if res["url"] == existing_res["url"]:
                        res["id"] = existing_res["id"]

            existing_pkg.update(
                pkg
            )  # preserve other fields that we're not setting, but clobber extras
            pkg = existing_pkg

            log.warn('updating package %s (%s) from %s' %
                     (pkg["name"], pkg["id"], harvest_object.source.url))
            pkg = get_action('package_update')(self.context(), pkg)
        else:
            # It doesn't exist yet. Create a new one.
            try:
                pkg = get_action('package_create')(self.context(), pkg)
                log.warn('created package %s (%s) from %s' %
                         (pkg["name"], pkg["id"], harvest_object.source.url))
            except Exception:
                log.error('failed to create package %s from %s' %
                          (pkg["name"], harvest_object.source.url))
                raise

        # Flag the other HarvestObjects linking to this package as not current anymore
        for ob in model.Session.query(HarvestObject).filter_by(
                package_id=pkg["id"]):
            ob.current = False
            ob.save()

        # Flag this HarvestObject as the current harvest object
        harvest_object.package_id = pkg['id']
        harvest_object.current = True
        harvest_object.save()

        # Now that the package and the harvest source are associated, re-index the
        # package so it knows it is part of the harvest source. The CKAN harvester
        # does this by creating the association before the package is saved by
        # overriding the GUID creation on a new package. That's too difficult.
        # So here we end up indexing twice.
        PackageSearchIndex().index_package(pkg)

        return True
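The `source_hash` extra above lets `gather_stage` skip datasets that have not changed upstream; `json.dumps(..., sort_keys=True)` is used when storing content so the serialization, and therefore the hash, is stable across runs. A sketch of that idea follows; the internals of `make_upstream_content_hash` are not shown in these examples, so this mirrors the approach rather than the project's exact code.

# Sketch of a stable content hash over a dataset dict. sort_keys=True
# keeps the JSON serialization constant from run to run, as noted in
# the gather_stage comment above.
import hashlib
import json

def content_hash(dataset):
    serialized = json.dumps(dataset, sort_keys=True)
    return hashlib.md5(serialized.encode('utf8')).hexdigest()

print(content_hash({'identifier': 'd1', 'title': 'Example'}))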
Example #19
    def get_package_search_index(self):
        if not self.package_index:
            self.package_index = PackageSearchIndex()
        return self.package_index
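Example #19 lazily creates the index object on first use and caches it on the instance. In modern Python the same effect can come from `functools.cached_property`; a sketch assuming Python 3.8+, not the project's actual code:

# Equivalent lazy initialization using functools.cached_property.
from functools import cached_property
from ckan.lib.search.index import PackageSearchIndex

class IndexHolder:
    @cached_property
    def package_index(self):
        return PackageSearchIndex()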
Example #20
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"), (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
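In `harvest_jobs_run`, a running job is flagged `Finished` once gathering is done and no harvest objects remain in states other than COMPLETE or ERROR. The state check in isolation, as a sketch with an assumed session and job id:

# Sketch of the "is this job done?" test used above; assumes a
# SQLAlchemy session and a job id as returned by harvest_job_list.
from sqlalchemy import and_
from ckanext.harvest.model import HarvestObject

def job_is_done(session, job_id):
    pending = session.query(HarvestObject.id) \
        .filter(HarvestObject.harvest_job_id == job_id) \
        .filter(and_(HarvestObject.state != u'COMPLETE',
                     HarvestObject.state != u'ERROR'))
    return pending.count() == 0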
Example #21
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #22
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(context,
                            {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #23
    def import_stage(self, harvest_object):
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
        }

        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for harvest object: %s', harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        self._set_source_config(harvest_object.source.config)

        if self.force_import:
            status = 'change'
        else:
            status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = model.Session.query(HarvestObject) \
                                       .filter(HarvestObject.guid == harvest_object.guid) \
                                       .filter(HarvestObject.current == True).first() # noqa

        if status == 'delete':
            # Delete package
            context.update({
                'ignore_auth': True,
            })
            if harvest_object.package_id:
                p.toolkit.get_action('package_delete')(
                    context, {
                        'id': harvest_object.package_id
                    })
                log.info('Deleted package {0} with guid {1}'.format(
                    harvest_object.package_id, harvest_object.guid))

            return True

        # Check if it is a non ISO document
        original_document = self._get_object_extra(harvest_object,
                                                   'original_document')
        original_format = self._get_object_extra(harvest_object,
                                                 'original_format')
        if original_document and original_format:
            # DEPRECATED use the ISpatialHarvester interface method
            self.__base_transform_to_iso_called = False
            content = self.transform_to_iso(original_document, original_format,
                                            harvest_object)
            if not self.__base_transform_to_iso_called:
                log.warn(
                    'Deprecation warning: calling transform_to_iso directly is deprecated. '
                    +
                    'Please use the ISpatialHarvester interface method instead.'
                )

            for harvester in p.PluginImplementations(ISpatialHarvester):
                content = harvester.transform_to_iso(original_document,
                                                     original_format,
                                                     harvest_object)

            if content:
                harvest_object.content = content
            else:
                self._save_object_error('Transformation to ISO failed',
                                        harvest_object, 'Import')
                return False
        else:
            if harvest_object.content is None:
                self._save_object_error(
                    'Empty content for object {0}'.format(harvest_object.id),
                    harvest_object, 'Import')
                return False

            # Validate ISO document
            is_valid, profile, errors = self._validate_document(
                harvest_object.content, harvest_object)
            if not is_valid:
                # If validation errors were found, import will stop unless
                # configuration per source or per instance says otherwise
                continue_import = p.toolkit.asbool(config.get('ckanext.spatial.harvest.continue_on_validation_errors',
                                                              False)) or \
                    self.source_config.get('continue_on_validation_errors')
                if not continue_import:
                    return False

        # Parse ISO document
        try:

            iso_parser = ISODocument(harvest_object.content)
            iso_values = iso_parser.read_values()
        except Exception as e:
            self._save_object_error(
                'Error parsing ISO document for object {0}: {1}'.format(
                    harvest_object.id, six.text_type(e)), harvest_object,
                'Import')
            return False

        # Flag previous object as not current anymore
        if previous_object and not self.force_import:
            previous_object.current = False
            previous_object.add()

        # Update GUID with the one on the document
        iso_guid = iso_values['guid']
        if iso_guid and harvest_object.guid != iso_guid:
            # First make sure there already aren't current objects
            # with the same guid
            existing_object = model.Session.query(HarvestObject.id) \
                            .filter(HarvestObject.guid == iso_guid) \
                            .filter(HarvestObject.current == True).first() # noqa
            if existing_object:
                self._save_object_error(
                    'Object {0} already has this guid {1}'.format(
                        existing_object.id, iso_guid), harvest_object,
                    'Import')
                return False

            harvest_object.guid = iso_guid
            harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
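            # Hashing the whole document gives a deterministic GUID, so
            # importing the same file twice maps to the same identifier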
            m = hashlib.md5()
            m.update(harvest_object.content.encode('utf8', 'ignore'))
            harvest_object.guid = m.hexdigest()
            harvest_object.add()

        # Get document modified date
        try:
            metadata_modified_date = dateutil.parser.parse(
                iso_values['metadata-date'], ignoretz=True)
        except ValueError:
            self._save_object_error(
                'Could not extract reference date for object {0} ({1})'.format(
                    harvest_object.id, iso_values['metadata-date']),
                harvest_object, 'Import')
            return False

        harvest_object.metadata_modified_date = metadata_modified_date
        harvest_object.add()
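        # This date is compared against the previous object's date in the
        # 'change' branch below to decide whether the document can be skipped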

        # Build the package dict
        package_dict = self.get_package_dict(iso_values, harvest_object)
        for harvester in p.PluginImplementations(ISpatialHarvester):
            package_dict = harvester.get_package_dict(
                context, {
                    'package_dict': package_dict,
                    'iso_values': iso_values,
                    'xml_tree': iso_parser.xml_tree,
                    'harvest_object': harvest_object,
                })
        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        # Create / update the package
        context.update({
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        })

        if self._site_user and context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like upper-case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty, six.text_type]
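        # Overriding the default name validators lets mixed-case tags from the
        # remote document through unchanged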

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise
            # ckanext-spatial won't be able to link the extent to the package.
            package_dict['id'] = six.text_type(uuid.uuid4())
            package_schema['id'] = [six.text_type]

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('Created new package %s with guid %s', package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % six.text_type(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':

            # Check if the modified date is more recent
            if not self.force_import and previous_object \
                    and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:

                # Assign the previous job id to the new object to
                # avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the
                # harvest object
                if ((config.get('ckanext.spatial.harvest.reindex_unchanged',
                                True) != 'False'
                     or self.source_config.get('reindex_unchanged') != 'False')
                        and harvest_object.package_id):
                    context.update({'validate': False, 'ignore_auth': True})
                    try:
                        package_dict = logic.get_action('package_show')(
                            context, {
                                'id': harvest_object.package_id
                            })
                    except p.toolkit.ObjectNotFound:
                        pass
                    else:
                        # Only runs when package_show succeeded above
                        for extra in package_dict.get('extras', []):
                            if extra['key'] == 'harvest_object_id':
                                extra['value'] = harvest_object.id
                        if package_dict:
                            package_index = PackageSearchIndex()
                            package_index.index_package(package_dict)

                log.info('Document with GUID %s unchanged, skipping...' %
                         (harvest_object.guid))
            else:
                package_schema = logic.schema.default_update_package_schema()
                package_schema['tags'] = tag_schema
                context['schema'] = package_schema

                package_dict['id'] = harvest_object.package_id
                try:
                    package_id = p.toolkit.get_action('package_update')(
                        context, package_dict)
                    log.info('Updated package %s with guid %s', package_id,
                             harvest_object.guid)
                except p.toolkit.ValidationError as e:
                    self._save_object_error(
                        'Validation Error: %s' %
                        six.text_type(e.error_summary), harvest_object,
                        'Import')
                    return False

        model.Session.commit()

        return True

    def import_stage(self, harvest_object):
        # The import stage actually creates the dataset.

        log.debug('In %s import_stage' % repr(self))

        if harvest_object.content is None:
            return True

        dataset = json.loads(harvest_object.content)
        schema_version = '1.0'  # default to '1.0'
        is_collection = False
        parent_pkg_id = ''
        catalog_extras = {}
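        # Object extras are set during the gather stage; catalog_* entries
        # carry catalog-level metadata that is copied onto every dataset below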
        for extra in harvest_object.extras:
            if extra.key == 'schema_version':
                schema_version = extra.value
            if extra.key == 'is_collection' and extra.value:
                is_collection = True
            if extra.key == 'collection_pkg_id' and extra.value:
                parent_pkg_id = extra.value
            if extra.key.startswith('catalog_'):
                catalog_extras[extra.key] = extra.value

        # If this dataset is part of a collection, check whether the parent
        # dataset exists. We don't support deeper hierarchies, so the check
        # does not apply to datasets that are themselves collections.
        if parent_pkg_id and not is_collection:
            parent_pkg = None
            try:
                parent_pkg = get_action('package_show')(self.context(), {
                    "id": parent_pkg_id
                })
            except NotFound:
                pass
            if not parent_pkg:
                parent_check_message = "isPartOf identifier '%s' not found." \
                    % dataset.get('isPartOf')
                self._save_object_error(parent_check_message, harvest_object,
                                        'Import')
                return None

        # Get default values.
        dataset_defaults = self.load_config(harvest_object.source)["defaults"]

        source_config = json.loads(harvest_object.source.config or '{}')
        validator_schema = source_config.get('validator_schema')
        if schema_version == '1.0' and validator_schema != 'non-federal':
            lowercase_conversion = True
        else:
            lowercase_conversion = False

        MAPPING = {
            "title": "title",
            "description": "notes",
            "keyword": "tags",
            "modified": "extras__modified",  # ! revision_timestamp
            "publisher": "extras__publisher",  # !owner_org
            "contactPoint": "maintainer",
            "mbox": "maintainer_email",
            "identifier": "extras__identifier",  # !id
            "accessLevel": "extras__accessLevel",
            "bureauCode": "extras__bureauCode",
            "programCode": "extras__programCode",
            "accessLevelComment": "extras__accessLevelComment",
            "license": "extras__license",  # !license_id 
            "spatial":
            "extras__spatial",  # Geometry not valid GeoJSON, not indexing
            "temporal": "extras__temporal",
            "theme": "extras__theme",
            "dataDictionary": "extras__dataDictionary",  # !data_dict
            "dataQuality": "extras__dataQuality",
            "accrualPeriodicity": "extras__accrualPeriodicity",
            "landingPage": "extras__landingPage",
            "language": "extras__language",
            "primaryITInvestmentUII":
            "extras__primaryITInvestmentUII",  # !PrimaryITInvestmentUII
            "references": "extras__references",
            "issued": "extras__issued",
            "systemOfRecords": "extras__systemOfRecords",
            "accessURL": None,
            "webService": None,
            "format": None,
            "distribution": None,
        }
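        # Mapping convention: a string value maps the data.json field onto a
        # CKAN field, the "extras__" prefix routes it into pkg["extras"] (see
        # the import loop below), and None marks fields handled as resources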

        MAPPING_V1_1 = {
            "title": "title",
            "description": "notes",
            "keyword": "tags",
            "modified": "extras__modified",  # ! revision_timestamp
            "publisher": "extras__publisher",  # !owner_org
            "contactPoint": {
                "fn": "maintainer",
                "hasEmail": "maintainer_email"
            },
            "identifier": "extras__identifier",  # !id
            "accessLevel": "extras__accessLevel",
            "bureauCode": "extras__bureauCode",
            "programCode": "extras__programCode",
            "rights": "extras__rights",
            "license": "extras__license",  # !license_id
            "spatial":
            "extras__spatial",  # Geometry not valid GeoJSON, not indexing
            "temporal": "extras__temporal",
            "theme": "extras__theme",
            "dataDictionary": "extras__dataDictionary",  # !data_dict
            "dataQuality": "extras__dataQuality",
            "accrualPeriodicity": "extras__accrualPeriodicity",
            "landingPage": "extras__landingPage",
            "language": "extras__language",
            "primaryITInvestmentUII":
            "extras__primaryITInvestmentUII",  # !PrimaryITInvestmentUII
            "references": "extras__references",
            "issued": "extras__issued",
            "systemOfRecords": "extras__systemOfRecords",
            "distribution": None,
        }
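        # In 1.1, a dict value such as "contactPoint" fans one nested object
        # out into several CKAN fields; see the isinstance(new_key, dict)
        # branch in the import loop below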

        SKIP = ["accessURL", "webService", "format",
                "distribution"]  # will go into pkg["resources"]
        # also skip the processed_how key; it was added to indicate how we processed the dataset.
        SKIP.append("processed_how")

        SKIP_V1_1 = ["@type", "isPartOf", "distribution"]
        SKIP_V1_1.append("processed_how")

        if lowercase_conversion:
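            # Federal 1.0 feeds are treated as case-insensitive: normalise any
            # key that matches the mapping (e.g. 'Title' -> 'title')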

            mapping_processed = {}
            for k, v in MAPPING.items():
                mapping_processed[k.lower()] = v

            skip_processed = [k.lower() for k in SKIP]

            dataset_processed = {'processed_how': ['lowercase']}
            for k, v in dataset.items():
                if k.lower() in mapping_processed:
                    dataset_processed[k.lower()] = v
                else:
                    dataset_processed[k] = v

            if 'distribution' in dataset and dataset[
                    'distribution'] is not None:
                dataset_processed['distribution'] = []
                for d in dataset['distribution']:
                    d_lower = {}
                    for k, v in d.items():
                        if k.lower() in mapping_processed:
                            d_lower[k.lower()] = v
                        else:
                            d_lower[k] = v
                    dataset_processed['distribution'].append(d_lower)
        else:
            dataset_processed = dataset
            mapping_processed = MAPPING
            skip_processed = SKIP

        if schema_version == '1.1':
            mapping_processed = MAPPING_V1_1
            skip_processed = SKIP_V1_1

        validate_message = self._validate_dataset(validator_schema,
                                                  schema_version,
                                                  dataset_processed)
        if validate_message:
            self._save_object_error(validate_message, harvest_object, 'Import')
            return None

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        owner_org = None
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            owner_org = source_dataset.owner_org

        source_config = json.loads(harvest_object.source.config or '{}')
        group_name = source_config.get('default_groups', '')

        # Assemble basic information about the dataset.

        pkg = {
            "state":
            "active",  # in case was previously deleted
            "owner_org":
            owner_org,
            "groups": [{
                "name": group_name
            }],
            "resources": [],
            "extras": [
                {
                    "key": "resource-type",
                    "value": "Dataset",
                },
                {
                    "key": "source_hash",
                    "value": self.make_upstream_content_hash(
                        dataset, harvest_object.source, catalog_extras,
                        schema_version),
                },
                {
                    "key": "source_datajson_identifier",
                    "value": True,
                },
                {
                    "key": "harvest_source_id",
                    "value": harvest_object.harvest_source_id,
                },
                {
                    "key": "harvest_object_id",
                    "value": harvest_object.id,
                },
                {
                    "key": "harvest_source_title",
                    "value": harvest_object.source.title,
                },
                {
                    "key": "source_schema_version",
                    "value": schema_version,
                },
            ]
        }

        extras = pkg["extras"]
        unmapped = []

        for key, value in dataset_processed.items():
            if key in skip_processed:
                continue
            new_key = mapping_processed.get(key)
            if not new_key:
                unmapped.append(key)
                continue

            # after schema 1.0+, we need to deal with multiple new_keys
            new_keys = []
            values = []
            if isinstance(new_key, dict):  # when schema is not 1.0
                _new_key_keys = new_key.keys()
                new_keys = new_key.values()
                values = []
                for _key in _new_key_keys:
                    values.append(value.get(_key))
            else:
                new_keys.append(new_key)
                values.append(value)

            if not any(values):
                continue

            mini_dataset = dict(zip(new_keys, values))
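            # e.g. new_key {"fn": "maintainer", "hasEmail": "maintainer_email"}
            # zipped with the contactPoint values yields
            # {"maintainer": ..., "maintainer_email": ...}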
            for mini_key, mini_value in mini_dataset.items():
                if not mini_value:
                    continue
                if mini_key.startswith('extras__'):
                    extras.append({"key": mini_key[8:], "value": mini_value})
                else:
                    pkg[mini_key] = mini_value

        # keep a fixed number (at most 100) of unmapped entries as extras
        if unmapped:
            unmapped.sort()
            del unmapped[100:]
            for key in unmapped:
                value = dataset_processed.get(key, "")
                if value is not None:
                    extras.append({"key": key, "value": value})

        # if theme is geospatial/Geospatial, we tag it in metadata_type.
        themes = self.find_extra(pkg, "theme")
        if themes and ('geospatial' in [x.lower() for x in themes]):
            extras.append({'key': 'metadata_type', 'value': 'geospatial'})

        if is_collection:
            extras.append({'key': 'collection_metadata', 'value': 'true'})
        elif parent_pkg_id:
            extras.append({
                'key': 'collection_package_id',
                'value': parent_pkg_id
            })

        for k, v in catalog_extras.items():
            extras.append({'key': k, 'value': v})

        # Set specific information about the dataset.
        self.set_dataset_info(pkg, dataset_processed, dataset_defaults,
                              schema_version)

        # Try to update an existing package with the ID set in harvest_object.guid. If that GUID
        # corresponds with an existing package, get its current metadata.
        try:
            existing_pkg = get_action('package_show')(self.context(), {
                "id": harvest_object.guid
            })
        except NotFound:
            existing_pkg = None

        if existing_pkg:
            # Update the existing metadata with the new information.

            # But before doing that, try to avoid replacing existing resources
            # with new resources by assigning resource IDs where they match up.
            for res in pkg.get("resources", []):
                for existing_res in existing_pkg.get("resources", []):
                    if res["url"] == existing_res["url"]:
                        res["id"] = existing_res["id"]
            pkg['groups'] = existing_pkg['groups']
            # Preserve other fields that we're not setting, but clobber extras
            existing_pkg.update(pkg)
            pkg = existing_pkg

            log.warn('updating package %s (%s) from %s' %
                     (pkg["name"], pkg["id"], harvest_object.source.url))
            pkg = get_action('package_update')(self.context(), pkg)
        else:
            # It doesn't exist yet. Create a new one.
            pkg['name'] = self.make_package_name(dataset_processed["title"],
                                                 harvest_object.guid)
            try:
                pkg = get_action('package_create')(self.context(), pkg)
                log.warn('created package %s (%s) from %s' %
                         (pkg["name"], pkg["id"], harvest_object.source.url))
            except Exception:
                log.error('failed to create package %s from %s' %
                          (pkg["name"], harvest_object.source.url))
                raise

        # Flag the other HarvestObjects linking to this package as not current anymore
        for ob in model.Session.query(HarvestObject).filter_by(
                package_id=pkg["id"]):
            ob.current = False
            ob.save()

        # Flag this HarvestObject as the current harvest object
        harvest_object.package_id = pkg['id']
        harvest_object.current = True
        harvest_object.save()

        # Now that the package and the harvest source are associated, re-index the
        # package so it knows it is part of the harvest source. The CKAN harvester
        # does this by creating the association before the package is saved by
        # overriding the GUID creation on a new package. That's too difficult.
        # So here we end up indexing twice.
        PackageSearchIndex().index_package(pkg)

        return True