def test_serialize(self):
    '''
    Check that a dataset node survives a serialize -> parse -> serialize
    round-trip unchanged (ignoring leading/trailing whitespace).
    '''
    inventory = _get_inventory_doc('test_inventory.xml')
    first_node = next(inventory.dataset_nodes())

    serialized = InventoryDocument.serialize_node(first_node)
    print(serialized)

    # test the round-trip: parse the serialized text, serialize it again
    # and compare the two serializations
    reparsed_node = InventoryDocument.parse_xml_string(serialized)
    reserialized = InventoryDocument.serialize_node(reparsed_node)

    assert_equal(serialized.strip(), reserialized.strip())
class InventoryHarvester(DguHarvesterBase):
    '''
    Harvesting of LGA Inventories from a single XML document provided at a
    URL.
    '''
    implements(IHarvester)

    # Key name used for a dataset's inventory identifier.
    IDENTIFIER_KEY = 'inventory_identifier'

    def info(self):
        '''
        Returns a descriptor with information about the harvester.

        :returns: dict with the keys "name", "title" and "description"
        '''
        return {
            "name": "inventory",
            "title": "Inventory XML",
            "description": "Dataset metadata published according to the "
                           "Inventory XML format: "
                           "http://schemas.opendata.esd.org.uk/Inventory "
                           "with XSD: "
                           "https://github.com/datagovuk/ckanext-dgu-local/"
                           "blob/master/ckanext/dgulocal/data/inventory.xsd"
        }

    def gather_stage(self, harvest_job):
        '''
        Fetches the single inventory document containing all of the
        datasets to be created/modified.

        The document is downloaded from ``harvest_job.source.url`` and
        handed to InventoryDocument. If either step fails, a gather error
        is saved against the job and None is returned.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids (per the original contract).
                  NOTE(review): the code visible here ends once the
                  document has been parsed, so only the two ``return None``
                  failure paths are shown - confirm the success path.
        '''
        from ckanext.harvest.model import (HarvestJob, HarvestObject,
                                           HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)
        from ckanext.dgulocal.lib.geo import get_boundary
        from ckan import model

        self.last_run = None

        log.debug('Resolving source: %s', harvest_job.source.url)
        try:
            req = requests.get(harvest_job.source.url)
            # Called only for the exception it raises on an HTTP error
            # status; its return value carries no information, so it is
            # not bound to a name.
            req.raise_for_status()
        except requests.exceptions.RequestException as e:
            # e.g. requests.exceptions.ConnectionError
            self._save_gather_error(
                'Failed to get content from URL: %s Error:%s %s' %
                (harvest_job.source.url, e.__class__.__name__, e),
                harvest_job)
            return None

        try:
            doc = InventoryDocument(req.content)
        except InventoryXmlError as e:
            self._save_gather_error(
                'Failed to parse or validate the XML document: %s %s' %
                (e.__class__.__name__, e),
                harvest_job)
            return None
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
      package_dict = package_dict_defaults.merge(package_dict_harvested)

    :param harvest_object: harvest object; its ``content`` (dataset XML),
        ``guid`` and ``job.source.publisher_id`` are read
    :param package_dict_defaults: defaults object; its ``merge()`` is called
        with the harvested package dict
    :param source_config: not used by this method
    :param existing_dataset: the existing dataset (its ``resources`` are
        read for ``url``/``id``), or a false value if there is none
    :returns: the package dict, with ``extras`` converted by
        self.extras_from_dict()
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))

    pkg = dict(title=inv_dataset['title'],
               notes=inv_dataset['description'],
               state='active' if inv_dataset['active'] else 'deleted',
               resources=[],
               extras={
                   self.IDENTIFIER_KEY: inv_dataset['identifier'],
                   'harvest_source_reference': harvest_object.guid
               })

    # License
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            # Look for a registered license whose URL matches the rights
            for license_ in register.values():
                if license_.url == rights:
                    pkg['license_id'] = license_.id
                    break
            else:
                # No match: just save the rights value as it is. (The
                # license register object itself must not be stored or
                # logged here - it is not a license id.)
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # Map of url -> id so that a resource keeps its id across harvests
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    # The mime-type lookup does not depend on the resource, so fetch it once
    formats_by_mime_type = Formats.by_mime_type()
    for inv_resource in inv_resources:
        format_ = formats_by_mime_type.get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Unknown mime type: fall back to the raw value
            format_ = inv_resource['mimetype']

        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']

        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'

        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''

        res = {
            'url': inv_resource['url'],
            'format': format_,
            'description': description,
            'resource_type': resource_type,
            'schema-url': schema_url,
            'schema-type': schema_type,
        }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(
            self.munge_title_to_name(
                '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            # Stored as the text of a one-element JSON-style list
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
      package_dict = package_dict_defaults.merge(package_dict_harvested)

    :param harvest_object: harvest object; its ``content`` (dataset XML),
        ``guid`` and ``job.source.publisher_id`` are read
    :param package_dict_defaults: defaults object; its ``merge()`` is called
        with the harvested package dict
    :param source_config: not used in the code visible here
    :param existing_dataset: the existing dataset (its ``resources`` are
        read for ``url``/``id``), or a false value if there is none

    NOTE(review): the code visible here ends after the themes have been
    computed - nothing is done with ``themes`` and no package dict is
    returned. Confirm against the complete method.
    '''
    import ckanext.dgu.lib.theme as dgutheme
    from ckan.lib.helpers import resource_formats
    from ckan import model
    from ckanext.harvest.model import (HarvestObjectExtra as HOExtra,
                                       HarvestGatherError)
    # Fetched once, before the resource loop, and used for every resource
    res_formats = resource_formats()

    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))

    pkg = dict(title=inv_dataset['title'],
               notes=inv_dataset['description'],
               state='active' if inv_dataset['active'] else 'deleted',
               resources=[],
               extras={
                   self.IDENTIFIER_KEY: inv_dataset['identifier'],
                   'harvest_source_reference': harvest_object.guid
               })

    # License
    rights = inv_dataset.get('rights')
    if rights:
        # The helper splits the rights text into a license id and a second
        # value which, when set, is kept in the 'licence' extra.
        license_id, licence = \
            dgu_helpers.get_licence_fields_from_free_text(rights)
        pkg['license_id'] = license_id
        if licence:
            pkg['extras']['licence'] = licence
            log.info('Custom licence %r', rights)
    else:
        pkg['license_id'] = ''

    # Resources - only the ones marked active are harvested
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # Map of url -> id so that a resource keeps its id across harvests
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        # Look the format up by normalised (lower-cased, stripped) mime type
        format_ = res_formats.get(inv_resource['mimetype'].lower().strip())
        if format_:
            # presumably element 1 is the format name to store - confirm
            # against ckan.lib.helpers.resource_formats()
            format_ = format_[1]
        else:
            # Unknown mime type: fall back to the raw value
            format_ = inv_resource['mimetype']

        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']

        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'

        # Schema - only recorded when the resource declares conformance
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''

        res = {
            'url': inv_resource['url'],
            'format': format_,
            'description': description,
            'resource_type': resource_type,
            'schema-url': schema_url,
            'schema-type': schema_type,
        }
        # Reuse the id of an existing resource that has the same URL
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions - each is stored as a single
    # space-separated string, or '' when there are none
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self._gen_new_name('%s %s' % (pkg['title'], publisher_abbrev))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    try:
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
    except ImportError, e:
        # Theming is best-effort: carry on with no themes
        log.debug('Theme cannot be given: %s', e)
        themes = []
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
      package_dict = package_dict_defaults.merge(package_dict_harvested)

    :param harvest_object: harvest object; its ``content`` (dataset XML),
        ``guid`` and ``job.source.publisher_id`` are read
    :param package_dict_defaults: defaults object; its ``merge()`` is called
        with the harvested package dict
    :param source_config: not used by this method
    :param existing_dataset: the existing dataset (its ``resources`` are
        read for ``url``/``id``), or a false value if there is none
    :returns: the package dict, with ``extras`` converted by
        self.extras_from_dict()
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content)
    )

    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid
                }
    )

    # License
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            # Try to match the rights URL against the registered licenses
            for known_license in register.values():
                if known_license.url == rights:
                    pkg['license_id'] = known_license.id
                    break
            else:
                # Nothing matched: just save the rights value as it is.
                # Storing/logging ``register`` here would put the whole
                # license register object where a license id is expected.
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    # Map of url -> id so that a resource keeps its id across harvests
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    # Loop-invariant lookup table, fetched once for all resources
    mime_type_formats = Formats.by_mime_type()
    for inv_resource in inv_resources:
        format_ = mime_type_formats.get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Unknown mime type: fall back to the raw value
            format_ = inv_resource['mimetype']

        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']

        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'

        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''

        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(self.munge_title_to_name(
            '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            # Stored as the text of a one-element JSON-style list
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def _get_inventory_doc(inventory_xml_filename):
    '''
    Reads an inventory XML file from the ``data`` directory that sits next
    to this module and wraps its contents in an InventoryDocument.

    :param inventory_xml_filename: name of the file inside ``data``
    :returns: InventoryDocument built from the file's contents
    '''
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
    filepath = os.path.join(path, inventory_xml_filename)
    # Read inside a ``with`` block so the file handle is always closed,
    # instead of being left open until garbage collection.
    with open(filepath, 'r') as xml_file:
        xml_content = xml_file.read()
    return InventoryDocument(xml_content)