def test_serialize(self):
     node = _get_inventory_doc('test_inventory.xml').dataset_nodes().next()
     node_str = InventoryDocument.serialize_node(node)
     print node_str
     node_ = InventoryDocument.parse_xml_string(node_str)
     # test the round-trip
     node_str_ = InventoryDocument.serialize_node(node_)
     assert_equal(node_str.strip(), node_str_.strip())
Beispiel #2
0
 def test_serialize(self):
     node = _get_inventory_doc('test_inventory.xml').dataset_nodes().next()
     node_str = InventoryDocument.serialize_node(node)
     print node_str
     node_ = InventoryDocument.parse_xml_string(node_str)
     # test the round-trip
     node_str_ = InventoryDocument.serialize_node(node_)
     assert_equal(node_str.strip(), node_str_.strip())
Beispiel #3
0
class InventoryHarvester(DguHarvesterBase):
    '''
    Harvesting of LGA Inventories from a single XML document provided at a
    URL.
    '''
    implements(IHarvester)

    # Extras key under which a dataset's inventory identifier is stored.
    IDENTIFIER_KEY = 'inventory_identifier'

    def info(self):
        '''
        Returns a descriptor with information about the harvester.

        :returns: dict with the keys ``name``, ``title`` and ``description``
        '''
        return {
            "name":
            "inventory",
            "title":
            "Inventory XML",
            "description":
            "Dataset metadata published according to the Inventory XML format: http://schemas.opendata.esd.org.uk/Inventory with XSD: https://github.com/datagovuk/ckanext-dgu-local/blob/master/ckanext/dgulocal/data/inventory.xsd"
        }

    def gather_stage(self, harvest_job):
        '''
        Fetches the single inventory document containing all of the
        datasets to be created/modified.

        If the document cannot be downloaded, or cannot be parsed/validated,
        a gather error is saved against the job and None is returned.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        # Imported at call time rather than at module level.
        from ckanext.harvest.model import (HarvestJob, HarvestObject,
                                           HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)

        from ckanext.dgulocal.lib.geo import get_boundary
        from ckan import model

        self.last_run = None

        log.debug('Resolving source: %s', harvest_job.source.url)
        try:
            # NOTE(review): no timeout is passed, so an unresponsive server
            # can stall the gather stage - consider adding one.
            req = requests.get(harvest_job.source.url)
            # Only called for its side effect of raising on an HTTP error
            # status; its return value is not useful, so it is not kept.
            req.raise_for_status()
        except requests.exceptions.RequestException as e:
            # e.g. requests.exceptions.ConnectionError
            self._save_gather_error(
                'Failed to get content from URL: %s Error:%s %s' %
                (harvest_job.source.url, e.__class__.__name__, e), harvest_job)
            return None

        try:
            doc = InventoryDocument(req.content)
        except InventoryXmlError as e:
            self._save_gather_error(
                'Failed to parse or validate the XML document: %s %s' %
                (e.__class__.__name__, e), harvest_job)
            return None
Beispiel #4
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        :param harvest_object: its ``content`` is parsed as inventory dataset
            XML; ``guid`` is stored as the harvest source reference and
            ``job.source.publisher_id`` is used when a name must be generated
        :param package_dict_defaults: defaults object; its ``merge()`` is
            called with the harvested values
        :param source_config: not used by this method
        :param existing_dataset: the previously harvested dataset (its
            ``resources`` supply ids to reuse, matched by URL) or a false
            value if there is none
        :returns: the package dict
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(title=inv_dataset['title'],
                   notes=inv_dataset['description'],
                   state='active' if inv_dataset['active'] else 'deleted',
                   resources=[],
                   extras={
                       self.IDENTIFIER_KEY: inv_dataset['identifier'],
                       'harvest_source_reference': harvest_object.guid
                   })

        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                # Look for a registered licence whose URL equals the rights
                # value.
                for licence in register.values():
                    if licence.url == rights:
                        pkg['license_id'] = licence.id
                        break
                else:
                    # No match: just save the rights value as it is. (This
                    # used to store and log the whole licence register
                    # object instead of the unrecognised value.)
                    pkg['license_id'] = rights
                    log.info('Did not recognize license %r', rights)
        else:
            pkg['license_id'] = None

        # Resources - only the active ones are carried over
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # Map of URL -> id for the existing resources, so that a resource
        # whose URL is unchanged keeps its id on update.
        if existing_dataset:
            existing_resource_urls = dict((r.url, r.id)
                                          for r in existing_dataset.resources)
        else:
            existing_resource_urls = {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            # Use the known format's display name, falling back to the raw
            # mimetype when the mimetype is not recognised.
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            if inv_resource['resource_type'] == 'Data':
                resource_type = 'file'
            else:
                resource_type = 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {
                'url': inv_resource['url'],
                'format': format_,
                'description': description,
                'resource_type': resource_type,
                'schema-url': schema_url,
                'schema-type': schema_type,
            }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(
                self.munge_title_to_name('%s %s' %
                                         (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            # A secondary theme is only recorded when exactly two themes
            # were returned.
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Beispiel #5
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        :param harvest_object: its ``content`` is parsed as inventory dataset
            XML; ``guid`` is stored as the harvest source reference and
            ``job.source.publisher_id`` is used when a name must be generated
        :param package_dict_defaults: defaults object; its ``merge()`` is
            called with the harvested values
        :param source_config: not used in the code shown here
        :param existing_dataset: the previously harvested dataset (its
            ``resources`` supply ids to reuse, matched by URL) or a false
            value if there is none
        '''
        # Imported at call time rather than at module level.
        import ckanext.dgu.lib.theme as dgutheme
        from ckan.lib.helpers import resource_formats
        from ckan import model
        # NOTE(review): HOExtra and HarvestGatherError are not referenced in
        # the part of the method visible here.
        from ckanext.harvest.model import (HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)

        # Lookup table used below to turn a resource mimetype into a format.
        res_formats = resource_formats()

        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(title=inv_dataset['title'],
                   notes=inv_dataset['description'],
                   state='active' if inv_dataset['active'] else 'deleted',
                   resources=[],
                   extras={
                       self.IDENTIFIER_KEY: inv_dataset['identifier'],
                       'harvest_source_reference': harvest_object.guid
                   })
        # License
        rights = inv_dataset.get('rights')
        if rights:
            # The helper yields a license id plus a second value; when that
            # second value is truthy it is saved in the 'licence' extra.
            license_id, licence = \
                dgu_helpers.get_licence_fields_from_free_text(rights)
            pkg['license_id'] = license_id
            if licence:
                pkg['extras']['licence'] = licence
                log.info('Custom licence %r', rights)
        else:
            pkg['license_id'] = ''

        # Resources - only the active ones are carried over
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # Map of URL -> id for the existing resources, so that a resource
        # whose URL is unchanged keeps its id on update.
        existing_resource_urls = dict((r.url, r.id)
                                      for r in existing_dataset.resources) \
                                 if existing_dataset else {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            # The mimetype is lower-cased and stripped before the lookup.
            format_ = res_formats.get(inv_resource['mimetype'].lower().strip())
            if format_:
                # presumably element 1 of the entry is the display name of
                # the format - TODO confirm against resource_formats()
                format_ = format_[1]
            else:
                # Unknown mimetype: keep the raw value as the format.
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {
                'url': inv_resource['url'],
                'format': format_,
                'description': description,
                'resource_type': resource_type,
                'schema-url': schema_url,
                'schema-type': schema_type,
            }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self._gen_new_name('%s %s' %
                                             (pkg['title'], publisher_abbrev))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        try:
            themes = dgutheme.categorize_package(pkg)
            log.debug('%s given themes: %r', pkg['name'], themes)
        except ImportError, e:
            # An ImportError during categorisation is treated as "no themes".
            log.debug('Theme cannot be given: %s', e)
            themes = []
        # NOTE(review): the code visible here stops at this point - `themes`
        # is not yet applied to pkg and nothing is returned; the rest of the
        # method is presumably missing from this excerpt.
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        :param harvest_object: its ``content`` is parsed as inventory dataset
            XML; ``guid`` is stored as the harvest source reference and
            ``job.source.publisher_id`` is used when a name must be generated
        :param package_dict_defaults: defaults object; its ``merge()`` is
            called with the harvested values
        :param source_config: not used by this method
        :param existing_dataset: the previously harvested dataset (its
            ``resources`` supply ids to reuse, matched by URL) or a false
            value if there is none
        :returns: the package dict
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content)
        )

        pkg = dict(
            title=inv_dataset['title'],
            notes=inv_dataset['description'],
            state='active' if inv_dataset['active'] else 'deleted',
            resources=[],
            extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                    'harvest_source_reference': harvest_object.guid
                    }
            )

        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                # Search the register for a licence with this exact URL.
                for licence in register.values():
                    if licence.url == rights:
                        pkg['license_id'] = licence.id
                        break
                else:
                    # Nothing matched, so just save the rights value as it
                    # is. Previously the licence register object itself was
                    # stored and logged here, not the unrecognised value.
                    pkg['license_id'] = rights
                    log.info('Did not recognize license %r', rights)
        else:
            pkg['license_id'] = None

        # Resources - inactive ones are dropped
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # URL -> id of the resources already on the dataset; lets a
        # re-harvested resource with the same URL keep its id.
        if existing_dataset:
            existing_resource_urls = dict((r.url, r.id)
                                          for r in existing_dataset.resources)
        else:
            existing_resource_urls = {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            # Prefer the display name of a known format; otherwise the raw
            # mimetype is used as the format.
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            if inv_resource['resource_type'] == 'Data':
                resource_type = 'file'
            else:
                resource_type = 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {'url': inv_resource['url'],
                   'format': format_,
                   'description': description,
                   'resource_type': resource_type,
                   'schema-url': schema_url,
                   'schema-type': schema_type,
                   }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(self.munge_title_to_name(
                '%s %s' % (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            # The secondary theme is stored only when exactly two themes
            # came back.
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Beispiel #7
0
def _get_inventory_doc(inventory_xml_filename):
    '''
    Builds an InventoryDocument from a file in the ``data`` directory that
    sits next to this module.

    :param inventory_xml_filename: file name, relative to the ``data``
        directory
    :returns: InventoryDocument constructed from the file's content
    '''
    data_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'data'))
    filepath = os.path.join(data_dir, inventory_xml_filename)
    # Read inside a with-block so the file handle is closed promptly instead
    # of being left for the garbage collector.
    with open(filepath, 'r') as xml_file:
        xml_content = xml_file.read()
    return InventoryDocument(xml_content)