Example #1
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string 
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(),
                             context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(guid=data.get('guid'),
                        content=data.get('content'),
                        job=data['job_id'],
                        harvest_source_id=data.get('source_id'),
                        package_id=data.get('package_id'),
                        extras=[
                            HarvestObjectExtra(key=k, value=v)
                            for k, v in data.get('extras', {}).items()
                        ])

    obj.save()
    return harvest_object_dictize(obj, context)
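
A caller might invoke this action roughly as in the following sketch (hedged: the
context, job id and field values are assumptions; only the action name and the
fields listed in the docstring above come from the example):

# Hypothetical usage of the harvest_object_create action via the plugins toolkit
import json
import ckan.plugins.toolkit as toolkit

context = {'user': 'harvest', 'ignore_auth': True}
data_dict = {
    'job_id': 'an-existing-harvest-job-id',             # assumed job id
    'guid': 'remote-dataset-guid',                       # optional
    'content': json.dumps({'title': 'Remote dataset'}),  # optional
    'extras': {'status': 'new'},                         # optional key/value pairs
}
harvest_object_dict = toolkit.get_action('harvest_object_create')(context, data_dict)
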
Example #2
    def setup_class(cls):
        try:
            from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        cls.content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content2)

        hoe = HarvestObjectExtra(key='original_document',
                                 value=cls.original_content2,
                                 object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        cls.object_id_1 = ho1.id
        cls.object_id_2 = ho2.id
Example #3
    def gather_stage(self, harvest_job):
        log.debug('In DotStatHarvester gather_stage')

        # For each row of data, use its ID as the GUID and save a harvest object
        # Return a list of the IDs of these new harvest objects
        try:
            harvest_obj_ids = []
            self._set_config(harvest_job.source.config)
            base_url = harvest_job.source.url

            try:
                # Get list of endpoint ids
                endpoints = self.get_endpoints(base_url)

            except (AccessTypeNotAvailableError, KeyError):
                log.debug('Endpoint function failed')
                endpoints = []  # nothing to gather if the endpoint list cannot be fetched

            # Make a harvest object for each dataset
            # Set the GUID to the dataset's ID (DF_SDG etc.)

            for agency_id, _id, version in endpoints:
                harvest_obj = HarvestObject(
                    guid="{}-{}".format(agency_id, _id),
                    job=harvest_job
                )

                harvest_obj.extras = [
                    HarvestObjectExtra(key='stats_guid',
                                       value=_id),
                    HarvestObjectExtra(key='version',
                                       value=version)
                ]
                harvest_obj.save()

                harvest_obj_ids.append(harvest_obj.id)

            log.debug('IDs: {}'.format(harvest_obj_ids))

            return harvest_obj_ids

        except Exception as e:
            self._save_gather_error(
                'Unable to get content for URL: %s: %s / %s' %
                (base_url, str(e), traceback.format_exc()), harvest_job)
Example #4
    def delete_geocat_ids(self, harvest_job, harvest_obj_ids,
                          packages_to_delete):
        delete_harvest_obj_ids = []
        for package_info in packages_to_delete:
            obj = HarvestObject(guid=package_info[1].name,
                                job=harvest_job,
                                extras=[
                                    HarvestObjectExtra(key='import_action',
                                                       value='delete')
                                ])
            obj.save()
            delete_harvest_obj_ids.append(obj.id)
        return delete_harvest_obj_ids
Example #5
    def gather_stage(self, harvest_job):

        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # save() commits the session, persisting obj and obj2 as well
            return [obj.id, obj2.id, obj3.id]

        return []
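
The obj3.save() call above persists obj and obj2 too: in CKAN's domain objects,
add() only queues the instance in the SQLAlchemy session, while save() adds the
instance and then commits the whole session. A rough sketch of that behaviour
(assumed from CKAN's DomainObject, not copied from it):

from ckan import model

def add(domain_object):
    # queue the object in the session; nothing is written yet
    model.Session.add(domain_object)

def save(domain_object):
    # add and commit; the commit also flushes any previously queued objects
    model.Session.add(domain_object)
    model.Session.commit()
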
Example #6
    def _make_harvest_objs(datasets):
        '''Create HarvestObject with Socrata dataset content.'''
        obj_ids = []
        guids = []
        for d in datasets:
            log.debug('Creating HarvestObject for {} {}'.format(
                d['resource']['name'], d['resource']['id']))
            obj = HarvestObject(
                guid=d['resource']['id'],
                job=harvest_job,
                content=json.dumps(d),
                extras=[HarvestObjectExtra(key='status', value='hi!')])
            obj.save()
            obj_ids.append(obj.id)
            guids.append(d['resource']['id'])
        return obj_ids, guids
Example #7
    def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
        '''
        Given a list of guids in the remote source, checks which in the DB
        need to be deleted

        To do so it queries all guids in the DB for this source and calculates
        the difference.

        For each of these creates a HarvestObject with the dataset id, marked
        for deletion.

        Returns a list with the ids of the Harvest Objects to delete.
        '''

        object_ids = []

        # Get all previous current guids and dataset ids for this source
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id)\
            .filter(
            HarvestObject.current == True  # noqa
        ).filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        guid_to_package_id = {}
        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = list(guid_to_package_id.keys())

        # Get objects/datasets to delete (ie in the DB but not in the source)
        guids_to_delete = set(guids_in_db) - set(guids_in_source)

        # Create a harvest object for each of them, flagged for deletion
        for guid in guids_to_delete:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HarvestObjectExtra(key='status', value='delete')])

            # Mark the rest of objects for this guid as not current
            model.Session.query(HarvestObject) \
                         .filter_by(guid=guid) \
                         .update({'current': False}, False)
            obj.save()
            object_ids.append(obj.id)

        return object_ids
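
For context, an import stage might act on the 'status' extra set above roughly as
follows; this is a hedged sketch, not the project's actual implementation, and the
handle_deletion() helper and its user handling are assumptions:

# Hypothetical import-stage helper that honours the ('status', 'delete') extra
import ckan.plugins.toolkit as toolkit

def handle_deletion(harvest_object, user_name):
    status = next((e.value for e in harvest_object.extras if e.key == 'status'), None)
    if status == 'delete' and harvest_object.package_id:
        context = {'user': user_name, 'ignore_auth': True}
        toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id})
        return True
    return False
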
Example #8
    def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
        # This is the same as the method in the base class, except that a different query is used.

        object_ids = []

        portal = self._get_portal_from_config(harvest_job.source.config)

        # Get all previous current guids and dataset ids for this harvested portal independent of
        # the harvest objects. This allows cleaning the harvest data without losing the
        # dataset mappings.
        # Build a subquery to get all the packages of the current portal first
        portal_packages = model.Session.query(model.PackageExtra.package_id.label('id')) \
            .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \
            .filter(model.PackageExtra.value == portal) \
            .subquery()

        # then get the extras.guid for those packages
        query = model.Session.query(model.PackageExtra.value, portal_packages.c.id) \
            .filter(model.PackageExtra.key == 'guid') \
            .filter(model.PackageExtra.package_id == portal_packages.c.id)

        guid_to_package_id = {}
        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = guid_to_package_id.keys()

        # Get objects/datasets to delete (ie in the DB but not in the source)
        guids_to_delete = set(guids_in_db) - set(guids_in_source)

        # Create a harvest object for each of them, flagged for deletion
        for guid in guids_to_delete:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HarvestObjectExtra(key='status', value='delete')])

            # Mark the rest of objects for this guid as not current
            model.Session.query(HarvestObject) \
                .filter_by(guid=guid) \
                .update({'current': False}, False)
            obj.save()
            object_ids.append(obj.id)

        return object_ids
Example #9
    def fetch_stage(self, harvest_object):

        # Check harvest object status
        status = self._get_object_extra(harvest_object,'status')

        if status == 'delete':
            # No need to fetch anything, just pass to the import stage
            return True

        # We need to fetch the remote document

        # Get location
        url = self._get_object_extra(harvest_object, 'waf_location')
        if not url:
            self._save_object_error(
                    'No location defined for object {0}'.format(harvest_object.id),
                    harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                    object=harvest_object,
                    key='original_document',
                    value=content)
            extra.save()

            extra = HOExtra(
                    object=harvest_object,
                    key='original_format',
                    value=document_format)
            extra.save()

        return True
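
The _get_object_extra() helper used above typically just scans the harvest
object's extras for a key; a minimal sketch (assumed, not copied from a specific
release) looks like this:

def _get_object_extra(harvest_object, key):
    '''Return the value of the first extra with the given key, or None.'''
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None
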
Example #10
    def _run_job_for_single_document(self, harvest_job, object_id):

        harvester = FisbrokerPlugin()

        # we circumvent gather_stage() and fetch_stage() and just load the
        # content with a known object_id and create the harvest object:
        url = harvest_job.source.url
        # _get_content() returns XML
        content = harvester._get_content(url)
        obj = HarvestObject(guid=object_id,
                            job=harvest_job,
                            content=content,
                            extras=[HarvestObjectExtra(key='status',value='new')])
        obj.save()

        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)

        harvest_job.status = u'Finished'
        harvest_job.save()

        return obj
Example #11
        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = u'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        return True


apache = parse.SkipTo(parse.CaselessLiteral(u'<a href='),
                      include=True).suppress() + parse.quotedString.setParseAction(
    parse.removeQuotes).setResultsName(u'url') + parse.SkipTo(u'</a>',
Example #12
class DatasetHarvesterBase(HarvesterBase):
    """
    A Harvester for datasets.
    """
    _user_name = None

    def validate_config(self, config):
        if not config:
            return config
        config_obj = yaml.load(config)
        return config

    def load_config(self, harvest_source):
        # Load the harvest source's configuration data. We expect it to be a YAML
        # string. Unfortunately I went ahead of CKAN on this. The stock CKAN harvester
        # only allows JSON in the configuration box. My fork is necessary for this
        # to work: https://github.com/joshdata/ckanext-harvest

        ret = {
            "filters":
            {},  # map data.json field name to list of values one of which must be present
            "defaults": {},
        }

        source_config = yaml.load(harvest_source.config)

        try:
            ret["filters"].update(source_config["filters"])
        except TypeError:
            pass
        except KeyError:
            pass

        try:
            ret["defaults"].update(source_config["defaults"])
        except TypeError:
            pass
        except KeyError:
            pass

        return ret
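
    # Illustrative only (not part of the original source): a harvest source
    # config that the load_config() above accepts is a YAML string with
    # optional "filters" and "defaults" mappings. The concrete field names
    # below ('publisher', 'license_id') are hypothetical.
    #
    #   filters:
    #     publisher: ["Department of Examples"]   # data.json field -> allowed values
    #   defaults:
    #     license_id: "cc-by"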

    def _get_user_name(self):
        if not self._user_name:
            user = p.toolkit.get_action('get_site_user')({
                'model': model,
                'ignore_auth': True
            }, {})
            self._user_name = user['name']

        return self._user_name

    def context(self):
        return {"user": self._get_user_name(), "ignore_auth": True}

    # SUBCLASSES MUST IMPLEMENT
    def load_remote_catalog(self, harvest_job):
        raise Exception("Not implemented")

    def extra_schema(self):
        return {
            'validator_schema': [ignore_empty, unicode, validate_schema],
        }

    def gather_stage(self, harvest_job):
        log.debug('In %s gather_stage (%s)' %
                  (repr(self), harvest_job.source.url))
        try:
            source_datasets, catalog_values = self.load_remote_catalog(
                harvest_job)
        except ValueError as e:
            self._save_gather_error("Error loading json content: %s." % (e),
                                    harvest_job)
            return []
        tmp_superThemes = [
            "agri", "educ", "econ", "ener", "envi", "gove", "heal", "intr",
            "just", "regi", "soci", "tech", "tran"
        ]
        ckan_host = ''
        # Call to config.ini to load superTheme list
        if 'CKAN_CONFIG' in environ:
            if path.exists(environ['CKAN_CONFIG']):
                try:
                    tmp_ckan_config = ConfigParser()
                    tmp_ckan_config.read(environ['CKAN_CONFIG'])
                except IOError:
                    log.warn(
                        'Error loading CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])
                except Exception:
                    log.warn(
                        'Unknown error loading CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])
                try:
                    ckan_host = tmp_ckan_config.get('app:main',
                                                    'ckan.site_url')
                except Exception:
                    log.warn(
                        'Error loading \"ckan.site_url\" from CKAN config.ini file [%s]. '
                        'Loading default SuperThemes', environ['CKAN_CONFIG'])
                # Get superThemeTaxonomy
                try:
                    if len(ckan_host) > 0:
                        stt_url = '{site_url}/superThemeTaxonomy.json'.format(
                            site_url=ckan_host)
                        superThemeTaxonomy = requests.get(stt_url)
                        superThemeTaxonomy = superThemeTaxonomy.json()
                        if len(superThemeTaxonomy) == 0:
                            raise Exception('SuperThemeTaxonomy JSON is empty')
                        if any('id' not in theme for theme in superThemeTaxonomy):
                            raise Exception(
                                'SuperThemeTaxonomy JSON does not contain an \"id\" field'
                            )
                        tmp_superThemes = [
                            theme['id'] for theme in superThemeTaxonomy
                        ]
                        log.info("superThemeTaxonomy loaded!")
                    else:
                        raise Exception(
                            'The field of config.ini \"site_url\" is empty.')
                except Exception as e:
                    log.warn("Error getting \"ThemeTaxonomy.json\", err: %s.",
                             e)
        superThemes = tmp_superThemes
        for dataset in source_datasets:
            # Delete if exists @type key
            try:
                del dataset['@type']
            except Exception:
                pass

            try:
                foo = dataset['theme']
                log.info('Theme exists and its value is: {0}.'.format(foo))
            except KeyError:
                log.warn('The field \"theme\" does not exist; defaulting to an empty list.')
                dataset.update({'theme': []})
            try:
                tags = dataset['keyword']
                themes = dataset['theme']
                if len(themes) > 0:
                    if type(tags) is list:
                        dataset['keyword'] = tags + themes
                    else:
                        dataset['keyword'] = [tags] + themes
            except KeyError:
                pass
            try:
                dataset.update({'author_email': dataset['publisher']['mbox']})
            except KeyError:
                log.warn(
                    'The \"publisher\" field for \"{0}\" has no \"mbox\" field.'
                    .format(dataset['title']))
                dataset.update({'author_email': "unknown"})
            except Exception:
                log.warn(
                    'The \"publisher\" field for \"{0}\" failed. This error is critical, '
                    'so the \"mbox\" field will be filled in to avoid future errors.'
                    .format(dataset['title']))
                dataset.update({'author_email': "unknown"})

            try:
                dataset.update({'author': dataset['publisher']['name']})
            except KeyError:
                log.warn(
                    'The \"publisher\" field for \"{0}\" has no \"name\" field.'
                    .format(dataset['title']))
                dataset.update({'author': "unknown"})
            except Exception:
                log.warn(
                    'The \"publisher\" field for \"{0}\" failed. This error is critical, '
                    'so the \"name\" field will be filled in to avoid future errors.'
                    .format(dataset['title']))
                dataset.update({'author': "unknown"})
            try:
                del dataset['publisher']
            except Exception:
                pass

            try:
                dataset.update(
                    {'maintainer_email': dataset['contactPoint']['hasEmail']})
                dataset.update({'maintainer': dataset['contactPoint']['fn']})
                del dataset['contactPoint']
            except Exception:
                dataset.update({'maintainer_email': ""})
                dataset.update({'maintainer': ""})
                del dataset['contactPoint']

        DATAJSON_SCHEMA = source_datasets
        schema_version = '1.2'
        parent_identifiers = set()
        child_identifiers = set()
        catalog_extras = {}
        if isinstance(catalog_values, dict):
            schema_version = '1.2'
            for dataset in source_datasets:
                parent_identifier = dataset.get('isPartOf')
                if parent_identifier:
                    parent_identifiers.add(parent_identifier)
                    child_identifiers.add(dataset.get('identifier'))

            # get a list of needed catalog values and put into hobj
            catalog_fields = ['title', 'description']
            catalog_extras = dict(('catalog_' + k, v)
                                  for (k, v) in catalog_values.iteritems()
                                  if k in catalog_fields)

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_identifier,
        # which corresponds to the remote catalog's 'identifier' field.
        # Make a mapping so we know how to update existing records.
        # Added: mark all existing parent datasets.
        existing_datasets = {}
        existing_parents = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "identifier")
            is_parent = self.find_extra(pkg, "collection_metadata")
            if sid:
                existing_datasets[sid] = pkg
            if is_parent and pkg.get("state") == "active":
                existing_parents[sid] = pkg

        # which parent has been demoted to child level?
        existing_parents_demoted = set(
            identifier for identifier in existing_parents.keys() \
            if identifier not in parent_identifiers)

        # which dataset has been promoted to parent level?
        existing_datasets_promoted = set(
            identifier for identifier in existing_datasets.keys() \
            if identifier in parent_identifiers \
            and identifier not in existing_parents.keys())

        # if there is any new parents, we will have to harvest parents
        # first, mark the status in harvest_source config, which
        # triggers a children harvest_job after parents job is finished.
        source = harvest_job.source
        source_config = json.loads(source.config or '{}')
        # run status: None, or parents_run, or children_run?
        run_status = source_config.get('datajson_collection')
        if parent_identifiers:
            for parent in parent_identifiers & child_identifiers:
                self._save_gather_error("Collection identifier '%s' \
                    cannot be isPartOf another collection." \
                                        % parent, harvest_job)

            new_parents = set(identifier for identifier in parent_identifiers \
                              if identifier not in existing_parents.keys())
            if new_parents:
                if not run_status:
                    # fresh start
                    run_status = 'parents_run'
                    source_config['datajson_collection'] = run_status
                    source.config = json.dumps(source_config)
                    source.save()
                elif run_status == 'children_run':
                    # it means new parents are tried and failed.
                    # but skip some which have previously reported with
                    # parent_identifiers & child_identifiers
                    for parent in new_parents - \
                            (parent_identifiers & child_identifiers):
                        self._save_gather_error("Collection identifier '%s' \
                            not found. Records which are part of this \
                            collection will not be harvested." \
                                                % parent, harvest_job)
                else:
                    # run_status was parents_run, and did not finish.
                    # something wrong but not sure what happened.
                    # let's leave it as it is, let it run one more time.
                    pass
            else:
                # all parents are already in place. run it as usual.
                run_status = None
        elif run_status:
            # need to clear run_status
            run_status = None
            source_config['datajson_collection'] = run_status
            source.config = json.dumps(source_config)
            source.save()

        # Create HarvestObjects for any records in the remote catalog.

        object_ids = []
        seen_datasets = set()
        unique_datasets = set()

        filters = self.load_config(harvest_job.source)["filters"]

        for dataset in source_datasets:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Check the config's filters to see if we should import this dataset.
            # For each filter, check that the value specified in the data.json file
            # is among the permitted values in the filter specification.
            matched_filters = True
            for k, v in filters.items():
                if dataset.get(k) not in v:
                    matched_filters = False
            if not matched_filters:
                continue

            if parent_identifiers and new_parents \
                    and dataset['identifier'] not in parent_identifiers \
                    and dataset.get('isPartOf') in new_parents:
                if run_status == 'parents_run':
                    # skip those whose parents still need to run.
                    continue
                else:
                    # which is 'children_run'.
                    # error out since parents got issues.
                    self._save_gather_error(
                        "Record with identifier '%s': isPartOf '%s' points to \
                        an erroneous record." %
                        (dataset['identifier'], dataset.get('isPartOf')),
                        harvest_job)
                    continue

            # Some sources contain duplicate identifiers; skip all except the first one
            if dataset['identifier'] in unique_datasets:
                self._save_gather_error(
                    "Duplicate entry ignored for identifier: '%s'." %
                    (dataset['identifier']), harvest_job)
                continue
            unique_datasets.add(dataset['identifier'])

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.

            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this dataset
                # in the package so we can avoid updating datasets that
                # don't look like they've changed.
                if pkg.get("state") == "active" \
                        and dataset['identifier'] not in existing_parents_demoted \
                        and dataset['identifier'] not in existing_datasets_promoted \
                        and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset,
                                                                                                   harvest_job.source,
                                                                                                   catalog_extras,
                                                                                                   schema_version):
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the remote catalog file.
            extras = [
                HarvestObjectExtra(key='schema_version', value=schema_version)
            ]
            if dataset['identifier'] in parent_identifiers:
                extras.append(
                    HarvestObjectExtra(key='is_collection', value=True))
            elif dataset.get('isPartOf'):
                parent_pkg_id = existing_parents[dataset.get('isPartOf')]['id']
                extras.append(
                    HarvestObjectExtra(key='collection_pkg_id',
                                       value=parent_pkg_id))
            # FIX EXTRAS
            # for k,v in

            for k, v in catalog_extras.iteritems():
                extras.append(HarvestObjectExtra(key=k, value=v))
            # ----
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                extras=extras,
                content=json.dumps(dataset, sort_keys=True)
            )  # use sort_keys to preserve field order so hashes of this string are constant from run to run
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets: continue  # was just updated
            if pkg.get("state") == "deleted": continue  # already deleted
            pkg["state"] = "deleted"
            log.warn('deleting package %s (%s) because it is no longer in %s' %
                     (pkg["name"], pkg["id"], harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)
            obj = HarvestObject(
                guid=pkg_id,
                package_id=pkg["id"],
                job=harvest_job,
            )
            obj.save()
            object_ids.append(obj.id)

        return object_ids
Example #13
        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                    object=harvest_object,
                    key='original_document',
                    value=content)
            extra.save()

            extra = HOExtra(
                    object=harvest_object,
                    key='original_format',
                    value=document_format)
            extra.save()

        return True


apache  = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
        + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url') \
        + parse.SkipTo("</a>", include=True).suppress() \
Example #14
    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (like a /data.json file) for
        # a list of datasets to import.

        log.debug('In %s gather_stage (%s)' %
                  (repr(self), harvest_job.source.url))

        # Start gathering.
        try:
            source_datasets, catalog_values = self.load_remote_catalog(
                harvest_job)
        except ValueError as e:
            self._save_gather_error("Error loading json content: %s." % (e),
                                    harvest_job)
            return []

        if len(source_datasets) == 0: return []

        DATAJSON_SCHEMA = {
            "https://project-open-data.cio.gov/v1.1/schema": '1.1',
        }

        # schema version is default 1.0, or a valid one (1.1, ...)
        schema_version = '1.0'
        parent_identifiers = set()
        child_identifiers = set()
        catalog_extras = {}
        if isinstance(catalog_values, dict):
            schema_value = catalog_values.get('conformsTo', '')
            if schema_value not in DATAJSON_SCHEMA.keys():
                self._save_gather_error('Error reading json schema value.' \
                    ' The given value is %s.' % ('empty' if schema_value == ''
                    else schema_value), harvest_job)
                return []
            schema_version = DATAJSON_SCHEMA.get(schema_value, '1.0')

            for dataset in source_datasets:
                parent_identifier = dataset.get('isPartOf')
                if parent_identifier:
                    parent_identifiers.add(parent_identifier)
                    child_identifiers.add(dataset.get('identifier'))

            # get a list of needed catalog values and put into hobj
            catalog_fields = ['@context', '@id', 'conformsTo', 'describedBy']
            catalog_extras = dict(('catalog_' + k, v)
                                  for (k, v) in catalog_values.iteritems()
                                  if k in catalog_fields)

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_identifier,
        # which corresponds to the remote catalog's 'identifier' field.
        # Make a mapping so we know how to update existing records.
        # Added: mark all existing parent datasets.
        existing_datasets = {}
        existing_parents = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "identifier")
            is_parent = self.find_extra(pkg, "collection_metadata")
            if sid:
                existing_datasets[sid] = pkg
            if is_parent and pkg.get("state") == "active":
                existing_parents[sid] = pkg

        # which parent has been demoted to child level?
        existing_parents_demoted = set(
            identifier for identifier in existing_parents.keys() \
            if identifier not in parent_identifiers)

        # which dataset has been promoted to parent level?
        existing_datasets_promoted = set(
                identifier for identifier in existing_datasets.keys() \
                if identifier in parent_identifiers \
                and identifier not in existing_parents.keys())

        source = harvest_job.source
        source_config = self.load_config(source)

        if parent_identifiers:
            for parent in parent_identifiers & child_identifiers:
                self._save_gather_error("Collection identifier '%s' \
                    cannot be isPartOf another collection." \
                    % parent, harvest_job)

            new_parents = set(identifier for identifier in parent_identifiers \
                if identifier not in existing_parents.keys())

        # Create HarvestObjects for any records in the remote catalog.

        object_ids = []
        seen_datasets = set()
        unique_datasets = set()

        filters = source_config["filters"]

        for dataset in source_datasets:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Check the config's filters to see if we should import this dataset.
            # For each filter, check that the value specified in the data.json file
            # is among the permitted values in the filter specification.
            matched_filters = True
            for k, v in filters.items():
                if dataset.get(k) not in v:
                    matched_filters = False
            if not matched_filters:
                continue

            # Some sources contain duplicate identifiers; skip all except the first one
            if dataset['identifier'] in unique_datasets:
                self._save_gather_error(
                    "Duplicate entry ignored for identifier: '%s'." %
                    (dataset['identifier']), harvest_job)
                continue
            unique_datasets.add(dataset['identifier'])

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.

            log.info('Check existing dataset: {}'.format(
                dataset['identifier']))
            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this dataset
                # in the package so we can avoid updating datasets that
                # don't look like they've changed.
                if pkg.get("state") == "active" \
                    and dataset['identifier'] not in existing_parents_demoted \
                    and dataset['identifier'] not in existing_datasets_promoted \
                    and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, source, catalog_extras, schema_version):
                    log.info('SKIP: {}'.format(dataset['identifier']))
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the remote catalog file.
            extras = [
                HarvestObjectExtra(key='schema_version', value=schema_version)
            ]
            if dataset['identifier'] in parent_identifiers:
                extras.append(
                    HarvestObjectExtra(key='is_collection', value=True))
            elif dataset.get('isPartOf'):
                is_part_of = dataset.get('isPartOf')
                existing_parent = existing_parents.get(is_part_of, None)
                if existing_parent is None:  # maybe the parent is not harvested yet
                    parent_pkg_id = 'IPO:{}'.format(is_part_of)
                else:
                    parent_pkg_id = existing_parent['id']
                extras.append(
                    HarvestObjectExtra(key='collection_pkg_id',
                                       value=parent_pkg_id))
            for k, v in catalog_extras.iteritems():
                extras.append(HarvestObjectExtra(key=k, value=v))

            log.info('Datajson creates a HO: {}'.format(dataset['identifier']))
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                extras=extras,
                content=json.dumps(dataset, sort_keys=True)
            )  # use sort_keys to preserve field order so hashes of this string are constant from run to run
            obj.save()

            # Parent datasets are sorted to the front of the list so they are harvested
            # first; the parent id is then used to associate the children with their parent.
            if dataset['identifier'] in parent_identifiers:
                object_ids.insert(0, obj.id)
            else:
                object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets: continue  # was just updated
            if pkg.get("state") == "deleted": continue  # already deleted
            pkg["state"] = "deleted"
            log.warn('deleting package %s (%s) because it is no longer in %s' %
                     (pkg["name"], pkg["id"], harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)
            obj = HarvestObject(
                guid=pkg_id,
                package_id=pkg["id"],
                job=harvest_job,
            )
            obj.save()
            object_ids.append(obj.id)

        return object_ids
Example #15
    def test_api(self, app):
        try:
            from ckanext.harvest.model import (
                HarvestObject,
                HarvestJob,
                HarvestSource,
                HarvestObjectExtra,
            )
        except ImportError:
            raise pytest.skip(
                "The harvester extension is needed for these tests")

        content1 = "<xml>Content 1</xml>"
        ho1 = HarvestObject(
            guid="test-ho-1",
            job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
            content=content1,
        )

        content2 = "<xml>Content 2</xml>"
        original_content2 = "<xml>Original Content 2</xml>"
        ho2 = HarvestObject(
            guid="test-ho-2",
            job=HarvestJob(source=HarvestSource(url="http://", type="xx")),
            content=content2,
        )

        hoe = HarvestObjectExtra(
            key="original_document", value=original_content2, object=ho2
        )

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        object_id_1 = ho1.id
        object_id_2 = ho2.id

        # Access object content
        url = "/harvest/object/{0}".format(object_id_1)
        r = app.get(url, status=200)
        assert(
            r.headers["Content-Type"] == "application/xml; charset=utf-8"
        )
        assert(
            r.body ==
            '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>'
        )

        # Access original content in object extra (if present)
        url = "/harvest/object/{0}/original".format(object_id_1)
        r = app.get(url, status=404)

        url = "/harvest/object/{0}/original".format(object_id_2)
        r = app.get(url, status=200)
        assert(
            r.headers["Content-Type"] == "application/xml; charset=utf-8"
        )
        assert(
            r.body ==
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            + "<xml>Original Content 2</xml>"
        )
Example #16
    def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
        # This is the same as the method in the base class, except that a different query is used.

        object_ids = []

        portal = self._get_portal_from_config(harvest_job.source.config)

        starttime = time.time()
        # Get all previous current guids and dataset ids for this harvested portal independent of
        # the harvest objects. This allows cleaning the harvest data without losing the
        # dataset mappings.
        # Build a subquery to get all active packages having a GUID first
        subquery = model.Session.query(model.PackageExtra.value, model.Package.id) \
            .join(model.Package, model.Package.id == model.PackageExtra.package_id)\
            .filter(model.Package.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.key == 'guid') \
            .subquery()
        # then get all active packages of the current portal and join with their GUIDs if
        # available (outer join)
        query = model.Session.query(model.Package.id, subquery.c.value) \
            .join(model.PackageExtra, model.PackageExtra.package_id == model.Package.id)\
            .outerjoin(subquery, subquery.c.id == model.Package.id)\
            .filter(model.Package.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \
            .filter(model.PackageExtra.value == portal)

        checkpoint_start = time.time()
        guid_to_package_id = {}
        for package_id, guid in query:
            if guid:
                guid_to_package_id[guid] = package_id
            # Also remove all packages without a GUID, use ID as GUID to share logic below
            else:
                guid_to_package_id[package_id] = package_id
        checkpoint_end = time.time()
        LOGGER.debug('Time for query harvest source related datasets : %s',
                     str(checkpoint_end - checkpoint_start))

        guids_in_db = guid_to_package_id.keys()

        # Get objects/datasets to delete (ie in the DB but not in the source)
        guids_to_delete = set(guids_in_db) - set(guids_in_source)

        # Create a harvest object for each of them, flagged for deletion
        for guid in guids_to_delete:
            obj = HarvestObject(guid=guid, job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HarvestObjectExtra(key='status',
                                                           value='delete')])

            # Mark the rest of objects for this guid as not current
            model.Session.query(HarvestObject) \
                .filter_by(guid=guid) \
                .update({'current': False}, False)
            obj.save()
            object_ids.append(obj.id)

        endtime = time.time()
        LOGGER.debug('Found %s packages for deletion. Time total: %s', len(guids_to_delete),
                     str(endtime - starttime))

        return object_ids
Example #17
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.'''

        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if not harvester_type == HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        harvest_job.gather_started = datetime.datetime.utcnow()
        assert harvest_job

        # instantiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status',value='change'),
                                            HarvestObjectExtra(key='type',value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages
Example #18
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id).\
                                    first()

        def create_extras(url, status):
            return [
                HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)
            ]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
Example #19
    def test_api(self):
        try:
            from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                               HarvestSource,
                                               HarvestObjectExtra)
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=content1)

        content2 = '<xml>Content 2</xml>'
        original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=content2)

        hoe = HarvestObjectExtra(
            key='original_document',
            value=original_content2,
            object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        object_id_1 = ho1.id
        object_id_2 = ho2.id

        app = self._get_test_app()

        # Test redirects for old URLs
        url = '/api/2/rest/harvestobject/{0}/xml'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}'.format(object_id_1)
                in r.headers['Location'])

        url = '/api/2/rest/harvestobject/{0}/html'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 301)
        assert ('/harvest/object/{0}/html'.format(object_id_1)
                in r.headers['Location'])

        # Access object content
        url = '/harvest/object/{0}'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n<xml>Content 1</xml>')

        # Access original content in object extra (if present)
        url = '/harvest/object/{0}/original'.format(object_id_1)
        r = app.get(url, status=404)
        assert_equals(r.status_int, 404)

        url = '/harvest/object/{0}/original'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'application/xml; charset=utf-8')
        assert_equals(
            r.body,
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            + '<xml>Original Content 2</xml>')

        # Access HTML transformation
        url = '/harvest/object/{0}/html'.format(object_id_1)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        url = '/harvest/object/{0}/html/original'.format(object_id_1)
        r = app.get(url, status=404)
        assert_equals(r.status_int, 404)

        url = '/harvest/object/{0}/html'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body

        url = '/harvest/object/{0}/html/original'.format(object_id_2)
        r = app.get(url)
        assert_equals(r.status_int, 200)
        assert_equals(r.headers['Content-Type'],
                      'text/html; charset=utf-8')
        assert 'GEMINI record about' in r.body
Example #20
            if sorted(previous_guids) == sorted(batch_guids):
                # Server does not support pagination or no more pages
                log.debug('Same content, no more pages')
                break

            page = page + 1

            previous_guids = batch_guids

        # Check datasets that need to be deleted
        guids_to_delete = set(guids_in_db) - set(guids_in_source)
        for guid in guids_to_delete:
            obj = HarvestObject(
                guid=guid, job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HarvestObjectExtra(key='status', value='delete')])
            ids.append(obj.id)
            model.Session.query(HarvestObject).\
                filter_by(guid=guid).\
                update({'current': False}, False)
            obj.save()

        return ids

    def fetch_stage(self, harvest_object):
        return True

    def import_stage(self, harvest_object):
        log.debug('In DCATJSONHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
    def gather_stage(self, harvest_job):
        log.debug('In DCATHarvester gather_stage')

        ids = []

        # Get the previous guids for this source
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = guid_to_package_id.keys()
        guids_in_source = []

        # Get file contents
        url = harvest_job.source.url

        previous_guids = []
        page = 1
        while True:

            try:
                content, content_type = self._get_content_and_type(
                    url, harvest_job, page)
            except requests.exceptions.HTTPError as error:
                if error.response.status_code == 404:
                    if page > 1:
                        # Server returned a 404 after the first page, no more
                        # records
                        log.debug('404 after first page, no more pages')
                        break
                    else:
                        # Proper 404
                        msg = 'Could not get content. Server responded with 404 Not Found'
                        self._save_gather_error(msg, harvest_job)
                        return None
                else:
                    # This should never happen. Raising just in case.
                    raise

            if not content:
                return None

            try:

                batch_guids = []
                for guid, as_string in self._get_guids_and_datasets(content):
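                    # _get_guids_and_datasets() (not shown in this excerpt) is
                    # assumed to yield (guid, dataset-as-string) pairs, using
                    # the string 'ABORT' as a sentinel for datasets to skip.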
                    '''
                    When ABORT is received from datanorgeHarvester.py, the
                    dataset is skipped because it is not transport-related.
                    NOTE: This way of filtering transport-related datasets
                    should be changed once DIFI's new API is available; the
                    current API cannot filter on category, so it has to be
                    done manually like this. This is also used by Geonorge.
                    '''

                    if as_string == 'ABORT':
                        log.debug('Dataset {0} skipped, not relevant'.format(
                            guid.encode('utf8')))
                        continue

                    log.debug('Got identifier: {0}'.format(
                        guid.encode('utf8')))

                    batch_guids.append(guid)

                    if guid not in previous_guids:

                        if guid in guids_in_db:
                            # Dataset needs to be updated
                            obj = HarvestObject(
                                guid=guid,
                                job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                content=as_string,
                                extras=[
                                    HarvestObjectExtra(key='status',
                                                       value='change')
                                ])
                        else:
                            # Dataset needs to be created
                            obj = HarvestObject(guid=guid,
                                                job=harvest_job,
                                                content=as_string,
                                                extras=[
                                                    HarvestObjectExtra(
                                                        key='status',
                                                        value='new')
                                                ])
                        obj.save()
                        ids.append(obj.id)

                if len(batch_guids) > 0:
                    guids_in_source.extend(
                        set(batch_guids) - set(previous_guids))
                else:
                    log.debug('Empty document, no more records')
                    # Empty document, no more ids
                    break

            except ValueError as e:
                msg = 'Error parsing file: {0}'.format(str(e))
                self._save_gather_error(msg, harvest_job)
                return None
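
Note: the DCAT gather stages above call a _get_guids_and_datasets() helper that is not included in these excerpts. A minimal sketch of what such a helper might look like, assuming a data.json-style catalog with a top-level 'dataset' list and an 'identifier' field on each dataset, is:

import hashlib
import json


def _get_guids_and_datasets(content):
    # Hypothetical sketch: parse a data.json-style catalog and yield
    # (guid, dataset-as-string) pairs, using 'identifier' as the guid when
    # present and a content hash otherwise.
    doc = json.loads(content)
    datasets = doc.get('dataset', []) if isinstance(doc, dict) else doc
    for dataset in datasets:
        as_string = json.dumps(dataset)
        guid = dataset.get('identifier')
        if not guid:
            # No identifier published: fall back to a hash of the dataset
            guid = hashlib.sha1(as_string.encode('utf8')).hexdigest()
        yield guid, as_string

A real implementation would follow whatever identifier convention the harvested catalog actually uses; the important part is that each dataset gets a stable guid and a serialised copy of its content.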
Example #22
0
    def gather_stage(self, harvest_job):
        log.debug('In DCATJSONHarvester gather_stage')

        ids = []

        # Get the previous guids for this source
        query = \
            model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = guid_to_package_id.keys()
        guids_in_source = []

        # Get file contents
        url = harvest_job.source.url

        previous_guids = []
        page = 1
        while True:

            try:
                content, content_type = \
                    self._get_content_and_type(url, harvest_job, page)
            except requests.exceptions.HTTPError as error:
                if error.response.status_code == 404:
                    if page > 1:
                        # Server returned a 404 after the first page, no more
                        # records
                        log.debug('404 after first page, no more pages')
                        break
                    else:
                        # Proper 404
                        msg = 'Could not get content. Server responded with ' \
                            '404 Not Found'
                        self._save_gather_error(msg, harvest_job)
                        return None
                else:
                    # This should never happen. Raising just in case.
                    raise

            if not content:
                return None

            try:

                batch_guids = []
                for guid, as_string in self._get_guids_and_datasets(content):

                    log.debug('Got identifier: {0}'
                              .format(guid.encode('utf8')))
                    batch_guids.append(guid)

                    if guid not in previous_guids:

                        if guid in guids_in_db:
                            # Dataset needs to be updated
                            obj = HarvestObject(
                                guid=guid, job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                content=as_string,
                                extras=[HarvestObjectExtra(key='status',
                                                           value='change')])
                        else:
                            # Dataset needs to be created
                            obj = HarvestObject(
                                guid=guid, job=harvest_job,
                                content=as_string,
                                extras=[HarvestObjectExtra(key='status',
                                                           value='new')])
                        obj.save()
                        ids.append(obj.id)

                if len(batch_guids) > 0:
                    guids_in_source.extend(set(batch_guids)
                                           - set(previous_guids))
                else:
                    log.debug('Empty document, no more records')
                    # Empty document, no more ids
                    break

            except ValueError as e:
                msg = 'Error parsing file: {0}'.format(str(e))
                self._save_gather_error(msg, harvest_job)
                return None
    def import_stage(self, harvest_object):
        log.debug('In DotStatHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            self._save_object_error('No harvest object received',
                                    harvest_object)
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            base_url = harvest_object.source.url
            # Parse the SDMX as XML with bs4
            soup = BeautifulSoup(harvest_object.content, 'xml')

            # Make a package dict
            pkg_dict = {}
            pkg_dict['id'] = harvest_object.guid

            # Added thematic string
            pkg_dict['thematic_area_string'] = ["Official Statistics"]

            # Open license for all dotStat resources
            pkg_dict['license_id'] = "other-open"

            # Get owner_org if there is one
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })
            owner_org = source_dataset.get('owner_org')
            pkg_dict['owner_org'] = owner_org

            # Match other fields with tags in XML structure
            agency_id = self.config['agencyId']
            stats_guid = self._get_object_extra(harvest_object, 'stats_guid')

            structure = soup.find('Dataflow')
            pkg_dict['title'] = structure.find('Name', {"xml:lang" : "en"}).text
            pkg_dict['publisher_name'] = structure['agencyID']
            pkg_dict['version'] = structure['version']

            # Need to change url to point to Data Explorer
            de_url = 'https://stats.pacificdata.org/vis?locale=en&dataflow[datasourceId]=SPC2&dataflow[agencyId]={}&dataflow[dataflowId]={}&dataflow[version]={}'.format(
                agency_id,
                stats_guid,
                structure['version']
            )
            pkg_dict['source'] = de_url


            # Look for a link to a metadata data dictionary (if available)
            annotation = structure.find('Annotations')
            annots = annotation.find_all('Annotation') if annotation else []
            metaurl = None
            for annot in annots:
                metalink = annot.find('AnnotationType')
                if metalink is not None and metalink.text == 'EXT_RESOURCE':
                    metaurl = annot.find('AnnotationText', {'xml:lang': 'en'}).text.split('|')[1]

            # Set the default CSV data resource, and add the metadata PDF if it exists
            csv_resource = {
                'url':
                'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                    agency_id,
                    stats_guid,
                    structure['version']
                ),
                'format': 'CSV',
                'mimetype': 'CSV',
                'description': 'All data for {}'.format(pkg_dict['title']),
                'name': '{} Data CSV'.format(pkg_dict['title'])
            }
            pkg_dict['resources'] = [csv_resource]

            if metaurl:
                pkg_dict['resources'].append({
                    'url': metaurl,
                    'format': 'PDF',
                    'mimetype': 'PDF',
                    'description': 'Detailed metadata dictionary for {}'.format(pkg_dict['title']),
                    'name': '{} Metadata PDF'.format(pkg_dict['title'])
                })


            # Get notes/description if it exists
            try:
                desc = structure.find('Description', {"xml:lang": "en"}).text
                desc += '\nFind more Pacific data on PDH.stat : https://stats.pacificdata.org/'
                pkg_dict['notes'] = desc
            except Exception as e:
                log.error("An error occured: {}".format(e))
                pkg_dict['notes'] = 'Find more Pacific data on PDH.stat : https://stats.pacificdata.org/'

            # Add tags from CategoryScheme and ConceptScheme
            # List of uninteresting tags
            generic_schemes = ['Time', 'Frequency', 'Observation value', 'Observation Status', 'Confidentiality status', 'Unit of measure', 'Unit multiplier', 'Base period', 'Comment',
                'Decimals', 'Data source', 'Pacific Island Countries and territories', 'Indicator', 'Transformation', 'Reporting type', 'Composite breakdown']
            tag_strings = []
            
            # For finding Category Schemes for tags
            schemes = soup.find('CategorySchemes')
            if schemes is not None:
                catschemes = schemes.find_all('CategoryScheme')
                for catscheme in catschemes:
                    cats = catscheme.find_all('Category')
                    for cat in cats:
                        found = cat.find('Name', {'xml:lang': 'en'}).text
                        if found not in tag_strings:
                            tag_strings.append(found)
           
            # For finding Concept Schemes for tags
            concepts = soup.find('Concepts')
            if concepts is not None:
                concschemes = concepts.find_all('ConceptScheme')
                for concscheme in concschemes:
                    concepts = concscheme.find_all('Concept')
                    for concept in concepts:
                        found = concept.find('Name', {'xml:lang': 'en'}).text
                        if found not in tag_strings:
                            tag_strings.append(found)

            # Tag cleaning
            psp_mapping = {
                'Industry and Services': ['pacific-skills', 'industry', 'training'],
                'Education level': ['pacific-skills', 'education', 'training'],
                'Occupation': ['pacific-skills', 'occupation'],
                'Disability': ['pacific-skills', 'disability'],
                'Economic sector': ['pacific-skills', 'industry', 'training'],
                'Labour force status': ['pacific-skills', 'employment'],
                'Employment status': ['pacific-skills', 'employment'],
                'Labour and employment status': ['pacific-skills', 'employment']
            }

            if len(tag_strings) > 0:
                # Bring in PSP tags
                for tag in tag_strings:
                    if tag in list(psp_mapping.keys()):
                        tag_strings.extend(psp_mapping[tag])
                # Remove duplicates
                tag_strings = list(set(tag_strings))
                # Remove tags found in generic_schemes list
                tags = [x.lower() for x in tag_strings if x not in generic_schemes]
                # Make a string of tags for CKAN
                pkg_dict['tag_string'] = ', '.join([munge_tag(tag) for tag in tags])
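                # Hypothetical example: a tag_strings list of ['Education level']
                # is extended via psp_mapping with 'pacific-skills', 'education'
                # and 'training'; after deduplication, filtering and munge_tag()
                # the tag_string comes out roughly as
                # 'education-level, pacific-skills, education, training'
                # (set ordering is not guaranteed).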

            
            '''
            May need modifying when DF_SDG is broken into several DFs
            This gets the list of indicators for SDG-related dataflows
            Stores the list of strings in 'alternate_identifier' field
            '''
            codelist = soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES'})
            if codelist is not None:
                pkg_dict['alternate_identifier'] = []
                for indic in codelist.findAll('Name', {"xml:lang" : "en"}):
                    if not indic or indic.text == 'SDG Indicator or Series':
                        continue
                    pkg_dict['alternate_identifier'].append(indic.text)
            '''
            When support for metadata endpoints arrives in .Stat, here is how more metadata may be found:
            # Use the metadata/flow endpoint
            metadata = requests.get('{}metadata/data/{}/all?detail=full'.format(base_url, harvest_object.guid))

            # Parse with bs4
            parsed = BeautifulSoup(metadata.text, 'xml')

            # Now search for tags which may be useful as metadata
            # example: getting the name and definition of metadata set
            # (may need tweaking depending on SPC's metadata setup)

            # We can get name from the metadata structure
            set = parsed.find('MetadataSet')
            pkg_dict['name'] = set.find('Name').text

            # Then we can go to the reported attribute structure for more details
            detail = set.find('ReportedAttribute', attrs={'id': 'DEF'})
            pkg_dict['notes'] = detail.find('StructuredText', attrs={'lang': 'en'})
            source_details = set.find('ReportedAttribute', attrs={'id': 'SOURCE_DEF'})
            pkg_dict['source'] = source_details.find('StructuredText', attrs={'lang': 'en'})
            '''

            log.debug('package dict: %s' % pkg_dict)
            content_hash = str(_hashify(pkg_dict))
            harvest_object.extras = [
                HarvestObjectExtra(key='content_hash',
                                   value=content_hash)
            ]

            harvest_object.save()

            prev_object = model.Session.query(HarvestObject).filter(
                HarvestObject.source == harvest_object.source,
                HarvestObject.guid == harvest_object.guid,
                ~HarvestObject.import_finished.is_(None)).order_by(
                    HarvestObject.import_finished.desc()).first()
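            # prev_object is the most recent harvest object for this guid and
            # source whose import has finished; its stored content_hash is
            # compared below so unchanged content can be skipped.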

            obj_hash = (self._get_object_extra(prev_object, 'content_hash')
                        if prev_object else None)
            if obj_hash and obj_hash == content_hash:
                log.debug('Content is not changed. Skip..')
                return True

            # Create or update the package
            return self._create_or_update_package(
                pkg_dict, harvest_object, package_dict_form='package_show')
        except Exception as e:
            self._save_object_error(('Exception in import stage: %r / %s' %
                                     (e, traceback.format_exc())),
                                    harvest_object)
            return False
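
The content-hash check in the import stage above relies on a _hashify() helper that is not shown in this excerpt. A minimal sketch, assuming all it needs to do is produce a stable fingerprint of the package dict, could be:

import hashlib
import json


def _hashify(pkg_dict):
    # Hypothetical sketch: serialise the dict deterministically and hash it,
    # so two runs over identical content produce the same fingerprint.
    serialized = json.dumps(pkg_dict, sort_keys=True, default=str)
    return hashlib.sha256(serialized.encode('utf8')).hexdigest()

Because the serialisation is sorted and deterministic, an unchanged package dict always yields the same hash, which is what makes the 'Content is not changed. Skip..' shortcut safe.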