def test_munge_tag_multiple_pass(self): """Munge a list of tags muliple times gives expected results.""" for org, exp in self.munge_list: first_munge = munge_tag(org) assert_equal(first_munge, exp) second_munge = munge_tag(first_munge) assert_equal(second_munge, exp)
def test_munge_tag_multiple_pass(self):
    '''Munging a list of tags multiple times gives expected results.'''
    for org, exp in self.munge_list:
        first_munge = munge_tag(org)
        assert first_munge == exp
        second_munge = munge_tag(first_munge)
        assert second_munge == exp

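# A minimal sketch of the fixture the two tests above assume: ``munge_list``
# pairs a raw tag with the result expected from CKAN's munge_tag (lowercased,
# disallowed characters stripped, spaces turned into hyphens). The pairs
# below are illustrative assumptions, not the project's real fixture.
from ckan.lib.munge import munge_tag

munge_list = [
    ('unchanged', 'unchanged'),
    ('Capital Letters', 'capital-letters'),
    ('some spaces  here', 'some-spaces--here'),
]

# Both tests rely on munge_tag being idempotent: munging an already-munged
# tag must leave it untouched, so a second pass still equals the expectation.
for org, exp in munge_list:
    assert munge_tag(org) == munge_tag(munge_tag(org)) == exp
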
def _generate_term_translations(self, lang_index, file_path):
    '''Generate term translations for the given language from the metadata file'''
    try:
        translations = []

        de_cols = self._get_col_dict_array(0, file_path)
        other_cols = self._get_col_dict_array(lang_index, file_path)

        log.debug(de_cols)
        log.debug(other_cols)

        keys = ['title', 'notes', 'author', 'maintainer', 'license_id']

        for col_idx in range(len(de_cols)):
            for key in keys:
                translations.append({
                    'lang_code': self.LANG_CODES[lang_index],
                    'term': de_cols[col_idx][key],
                    'term_translation': other_cols[col_idx][key]
                })

            de_tags = de_cols[col_idx]['tags'].split(u', ')
            other_tags = other_cols[col_idx]['tags'].split(u', ')

            # Tags are paired positionally, so skip them when the two
            # languages declare a different number of tags
            if len(de_tags) == len(other_tags):
                for tag_idx in range(len(de_tags)):
                    translations.append({
                        'lang_code': self.LANG_CODES[lang_index],
                        'term': munge_tag(de_tags[tag_idx]),
                        'term_translation': munge_tag(other_tags[tag_idx])
                    })

        for lang, org in self.ORGANIZATION.items():
            if lang != 'de':
                for field in ['name', 'description']:
                    translations.append({
                        'lang_code': lang,
                        'term': self.ORGANIZATION['de'][field],
                        'term_translation': org[field]
                    })

        for lang, groups in self.GROUPS.items():
            if lang != u'de':
                for idx, group in enumerate(groups):
                    translations.append({
                        'lang_code': lang,
                        'term': self.GROUPS[u'de'][idx],
                        'term_translation': group
                    })

        return translations
    except Exception as e:
        log.exception(e)
        raise

def _generate_tag_translations(self, lang, tags, orig_tags):
    tag_trans = []
    for idx, tag in enumerate(tags):
        if tag:
            tag_trans.append({
                'lang_code': lang,
                'term': munge_tag(orig_tags[idx]),
                'term_translation': munge_tag(tag)
            })
    return tag_trans

def parse_set(self, dataset):
    '''Parse one dataset and its resources and return them as a dict'''
    log.debug('parsing dataset')

    dataset_attrs = dataset.find('dataset_attributes')
    metadata = {
        'id': dataset.get('id')
    }
    for attr in self.DATASET_ATTRIBUTES:
        metadata[attr] = dataset_attrs.find(attr).find('de').text
    log.debug(metadata)

    if 'name' in metadata:
        metadata['name'] = munge_tag(metadata['name'])

    metadata['resources'] = self._build_resources_list(dataset)
    metadata['translations'] = self._build_term_translations(dataset)
    log.debug(metadata)

    return metadata

def _generate_term_translations(self, base_data, dataset):
    '''Return all the term_translations for a given dataset'''
    translations = []
    for data in dataset:
        if base_data.find('title') != data.find('title'):
            lang = data.get('{http://www.w3.org/XML/1998/namespace}lang')

            for base_group, group in zip(self._get_data_groups(base_data),
                                         self._get_data_groups(data)):
                translations.append({
                    'lang_code': lang,
                    'term': base_group,
                    'term_translation': group
                })

            for base_tag, tag in zip(self._generate_tags_array(base_data),
                                     self._generate_tags_array(data)):
                translations.append({
                    'lang_code': lang,
                    'term': munge_tag(base_tag),
                    'term_translation': munge_tag(tag)
                })

            for key in ['title', 'author', 'maintainer', 'description']:
                if (base_data.find(key) is not None and
                        data.find(key) is not None):
                    translations.append({
                        'lang_code': lang,
                        'term': base_data.find(key).text,
                        'term_translation': data.find(key).text
                    })

    for lang, org in self.ORGANIZATION.items():
        if lang != u'de':
            for field in ['name', 'description']:
                translations.append({
                    'lang_code': lang,
                    'term': self.ORGANIZATION[u'de'][field],
                    'term_translation': org[field]
                })

    return translations

def gather_stage(self, harvest_job):
    log.debug('In FOPHHarvester gather_stage')
    try:
        file_path = self._fetch_metadata_file()
        ids = []

        de_cols = self._get_col_dict_array(0, file_path)
        for col in de_cols:
            # Construct the metadata dict for the dataset on CKAN
            metadata = {
                'datasetID': col[u'id'],
                'title': col[u'title'],
                'url': col[u'url'],
                'notes': col[u'notes'],
                'author': col[u'author'],
                'author_email': col[u'author_email'],
                'maintainer': col[u'maintainer'],
                'maintainer_email': col[u'maintainer_email'],
                'license_id': col[u'license_id'].lower(),
                'version': col[u'version'],
                'translations': [],
                'tags': []
            }

            tags = col[u'tags'].split(u', ')
            tags = [munge_tag(tag) for tag in tags]
            metadata['tags'] = tags

            metadata['resources'] = self._generate_resources_dict_array(
                col[u'id'])
            metadata['resources'][0]['version'] = col[u'version']
            log.debug(metadata['resources'])

            # Adding term translations
            metadata['translations'].extend(
                self._generate_term_translations(1, file_path))  # fr
            metadata['translations'].extend(
                self._generate_term_translations(2, file_path))  # it
            metadata['translations'].extend(
                self._generate_term_translations(3, file_path))  # en
            log.debug(metadata['translations'])

            obj = HarvestObject(
                guid=self._create_uuid(col[u'id']),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + col[u'id'] + ' to the queue')
            ids.append(obj.id)

        log.debug(de_cols)
    except Exception:
        return False
    return ids

def infer_tags(self, values):
    tags = []
    theme_keywords = []
    place_keywords = []
    stratum_keywords = []
    temporal_keywords = []

    if len(values.get('keywords', [])):
        key = values['keywords'][0]

        for theme in key.get('theme-keyword', []):
            if re.match(r'^[\w .-]+$', theme) is None:
                theme = munge_tag(theme)
            if theme not in tags:
                tags.append(theme)
            if theme not in theme_keywords:
                theme_keywords.append(theme)

        for place in key.get('place-keyword', []):
            if re.match(r'^[\w .-]+$', place) is None:
                place = munge_tag(place)
            if place not in place_keywords:
                place_keywords.append(place)

        for stratum in key.get('stratum-keyword', []):
            if re.match(r'^[\w .-]+$', stratum) is None:
                stratum = munge_tag(stratum)
            if stratum not in stratum_keywords:
                stratum_keywords.append(stratum)

        for temporal in key.get('temporal-keyword', []):
            if re.match(r'^[\w .-]+$', temporal) is None:
                temporal = munge_tag(temporal)
            if temporal not in temporal_keywords:
                temporal_keywords.append(temporal)

    values['tags'] = tags
    values['theme-keywords'] = theme_keywords
    values['place-keywords'] = place_keywords
    values['stratum-keywords'] = stratum_keywords
    values['temporal-keywords'] = temporal_keywords

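# Illustration of the whitelist gate used in infer_tags (sample keywords are
# assumptions): values already matching ``^[\w .-]+$`` are kept verbatim,
# anything else is normalised through CKAN's munge_tag.
import re
from ckan.lib.munge import munge_tag

for keyword in ['Water Quality', 'lakes & rivers']:
    if re.match(r'^[\w .-]+$', keyword) is None:
        keyword = munge_tag(keyword)
    print(keyword)
# -> 'Water Quality' passes through; 'lakes & rivers' becomes 'lakes--rivers'
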
def get_package_dict(self, iso_values, harvest_object):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details

    Extensions willing to modify the dict should do so implementing the
    ISpatialHarvester interface

        import ckan.plugins as p
        from ckanext.spatial.interfaces import ISpatialHarvester

        class MyHarvester(p.SingletonPlugin):

            p.implements(ISpatialHarvester, inherit=True)

            def get_package_dict(self, context, data_dict):

                package_dict = data_dict['package_dict']

                package_dict['extras'].append(
                    {'key': 'my-custom-extra', 'value': 'my-custom-value'}
                )

                return package_dict

    If a dict is not returned by this function, the import stage will be
    cancelled.

    :param iso_values: Dictionary with parsed values from the ISO 19139
        XML document
    :type iso_values: dict
    :param harvest_object: HarvestObject domain object (with access to
        job and source objects)
    :type harvest_object: HarvestObject

    :returns: A dataset dictionary (package_dict)
    :rtype: dict
    '''

    tags = []

    if 'tags' in iso_values:
        do_clean = self.source_config.get('clean_tags')
        tags_val = [
            munge_tag(tag) if do_clean else tag[:100]
            for tag in iso_values['tags']
        ]
        tags = [{'name': tag} for tag in tags_val]

    # Add default_tags from config
    default_tags = self.source_config.get('default_tags', [])
    if default_tags:
        for tag in default_tags:
            tags.append({'name': tag})

    package_dict = {
        'title': iso_values['title'],
        'notes': iso_values['abstract'],
        'tags': tags,
        'resources': [],
    }

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        package_dict['owner_org'] = source_dataset.owner_org

    # Package name
    package = harvest_object.package
    if package is None or package.title != iso_values['title']:
        name = self._gen_new_name(iso_values['title'])
        if not name:
            name = self._gen_new_name(six.text_type(iso_values['guid']))
        if not name:
            raise Exception(
                'Could not generate a unique name from the title or the '
                'GUID. Please choose a more unique title.')
        package_dict['name'] = name
    else:
        package_dict['name'] = package.name

    extras = {
        'guid': harvest_object.guid,
        'spatial_harvester': True,
    }

    # Just add some of the metadata as extras, not the whole lot
    for name in [
            # Essentials
            'spatial-reference-system',
            'guid',
            # Usefuls
            'dataset-reference-date',
            'metadata-language',  # Language
            'metadata-date',  # Released
            'coupled-resource',
            'contact-email',
            'frequency-of-update',
            'spatial-data-service-type',
    ]:
        extras[name] = iso_values[name]

    if len(iso_values.get('progress', [])):
        extras['progress'] = iso_values['progress'][0]
    else:
        extras['progress'] = ''

    if len(iso_values.get('resource-type', [])):
        extras['resource-type'] = iso_values['resource-type'][0]
    else:
        extras['resource-type'] = ''

    extras['licence'] = iso_values.get('use-constraints', '')

    def _extract_first_license_url(licences):
        for licence in licences:
            o = urlparse(licence)
            if o.scheme and o.netloc:
                return licence
        return None

    if len(extras['licence']):
        license_url_extracted = _extract_first_license_url(
            extras['licence'])
        if license_url_extracted:
            extras['licence_url'] = license_url_extracted

    # Metadata license ID check for package
    use_constraints = iso_values.get('use-constraints')
    if use_constraints:
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        license_list = p.toolkit.get_action('license_list')(context, {})

        for constraint in use_constraints:
            package_license = None
            for license in license_list:
                if constraint.lower() == license.get('id') or \
                        constraint == license.get('url'):
                    package_license = license.get('id')
                    break
            if package_license:
                package_dict['license_id'] = package_license
                break

    extras['access_constraints'] = iso_values.get(
        'limitations-on-public-access', '')

    # Graphic preview
    browse_graphic = iso_values.get('browse-graphic')
    if browse_graphic:
        browse_graphic = browse_graphic[0]
        extras['graphic-preview-file'] = browse_graphic.get('file')
        if browse_graphic.get('description'):
            extras['graphic-preview-description'] = browse_graphic.get(
                'description')
        if browse_graphic.get('type'):
            extras['graphic-preview-type'] = browse_graphic.get('type')

    for key in ['temporal-extent-begin', 'temporal-extent-end']:
        if len(iso_values[key]) > 0:
            extras[key] = iso_values[key][0]

    # Save responsible organization roles
    if iso_values['responsible-organisation']:
        parties = {}
        for party in iso_values['responsible-organisation']:
            if party['organisation-name'] in parties:
                if party['role'] not in parties[party['organisation-name']]:
                    parties[party['organisation-name']].append(party['role'])
            else:
                parties[party['organisation-name']] = [party['role']]
        extras['responsible-party'] = [
            {'name': k, 'roles': v} for k, v in parties.items()
        ]

    if len(iso_values['bbox']) > 0:
        bbox = iso_values['bbox'][0]
        extras['bbox-east-long'] = bbox['east']
        extras['bbox-north-lat'] = bbox['north']
        extras['bbox-south-lat'] = bbox['south']
        extras['bbox-west-long'] = bbox['west']

        try:
            xmin = float(bbox['west'])
            xmax = float(bbox['east'])
            ymin = float(bbox['south'])
            ymax = float(bbox['north'])
        except ValueError as e:
            self._save_object_error(
                'Error parsing bounding box value: {0}'.format(
                    six.text_type(e)),
                harvest_object, 'Import')
        else:
            # Construct a GeoJSON extent so ckanext-spatial can register
            # the extent geometry

            # Some publishers define the same two corners for the bbox
            # (ie a point), that causes problems in the search if stored
            # as polygon
            if xmin == xmax or ymin == ymax:
                extent_string = Template(
                    '{"type": "Point", "coordinates": [$x, $y]}'
                ).substitute(x=xmin, y=ymin)
                self._save_object_error(
                    'Point extent defined instead of polygon',
                    harvest_object, 'Import')
            else:
                extent_string = self.extent_template.substitute(
                    xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax)

            extras['spatial'] = extent_string.strip()
    else:
        log.debug('No spatial extent defined for this object')

    resource_locators = iso_values.get('resource-locator', []) + \
        iso_values.get('resource-locator-identification', [])

    if len(resource_locators):
        for resource_locator in resource_locators:
            url = resource_locator.get('url', '').strip()
            if url:
                resource = {}
                resource['format'] = guess_resource_format(url)
                if resource['format'] == 'wms' and config.get(
                        'ckanext.spatial.harvest.validate_wms', False):
                    # Check if the service is a view service
                    test_url = url.split('?')[0] if '?' in url else url
                    if self._is_wms(test_url):
                        resource['verified'] = True
                        resource['verified_date'] = datetime.now().isoformat()

                resource.update({
                    'url': url,
                    'name': resource_locator.get('name') or
                        p.toolkit._('Unnamed resource'),
                    'description': resource_locator.get('description') or '',
                    'resource_locator_protocol':
                        resource_locator.get('protocol') or '',
                    'resource_locator_function':
                        resource_locator.get('function') or '',
                })
                package_dict['resources'].append(resource)

    # Add default_extras from config
    default_extras = self.source_config.get('default_extras', {})
    if default_extras:
        override_extras = self.source_config.get('override_extras', False)
        for key, value in default_extras.items():
            log.debug('Processing extra %s', key)
            if key not in extras or override_extras:
                # Look for replacement strings
                if isinstance(value, six.string_types):
                    value = value.format(
                        harvest_source_id=harvest_object.job.source.id,
                        harvest_source_url=harvest_object.job.source.url.strip('/'),
                        harvest_source_title=harvest_object.job.source.title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id)
                extras[key] = value

    extras_as_dict = []
    for key, value in extras.items():
        if isinstance(value, (list, dict)):
            extras_as_dict.append({'key': key, 'value': json.dumps(value)})
        else:
            extras_as_dict.append({'key': key, 'value': value})

    package_dict['extras'] = extras_as_dict

    return package_dict

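# Hedged sketch of the ``clean_tags`` switch used at the top of
# get_package_dict. ``source_config`` is the harvest source's parsed JSON
# configuration; the sample values below are illustrative assumptions.
from ckan.lib.munge import munge_tag

source_config = {'clean_tags': True, 'default_tags': ['geodata']}

do_clean = source_config.get('clean_tags')
iso_tags = ['Land Use', 'Inland Waters']
tags_val = [munge_tag(tag) if do_clean else tag[:100] for tag in iso_tags]
tags = [{'name': tag} for tag in tags_val]
# With clean_tags on, tags holds munged names such as {'name': 'land-use'};
# with it off, keywords are only truncated to CKAN's 100-character tag limit.
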
def test_munge_tag(self): """Munge a list of tags gives expected results.""" for org, exp in self.munge_list: munge = munge_tag(org) assert_equal(munge, exp)
def test_munge_tag(self):
    '''Munging a list of tags gives expected results.'''
    for org, exp in self.munge_list:
        munge = munge_tag(org)
        assert munge == exp

def parse_dataset(self, dataset_dict, dataset_ref):

    dataset_dict['extras'] = []
    dataset_dict['resources'] = []

    # Basic fields
    for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
            ('url', DCAT.landingPage),
            ('version', OWL.versionInfo),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict[key] = value

    if not dataset_dict.get('version'):
        # adms:version was supported on the first version of the DCAT-AP
        value = self._object_value(dataset_ref, ADMS.version)
        if value:
            dataset_dict['version'] = value

    # Tags
    keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []

    # Split keywords with commas
    keywords_with_commas = [k for k in keywords if ',' in k]
    for keyword in keywords_with_commas:
        keywords.remove(keyword)
        keywords.extend([k.strip() for k in keyword.split(',')])

    # Clean tags with munge_tag only when configured to do so
    do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
    tags_val = [munge_tag(tag) if do_clean else tag for tag in keywords]
    tags = [{'name': tag} for tag in tags_val]
    dataset_dict['tags'] = tags

    # Extras

    # Simple values
    for key, predicate in (
            ('issued', DCT.issued),
            ('modified', DCT.modified),
            ('identifier', DCT.identifier),
            ('version_notes', ADMS.versionNotes),
            ('frequency', DCT.accrualPeriodicity),
            ('access_rights', DCT.accessRights),
            ('provenance', DCT.provenance),
            ('dcat_type', DCT.type),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict['extras'].append({'key': key, 'value': value})

    # Lists
    for key, predicate in (
            ('language', DCT.language),
            ('theme', DCAT.theme),
            ('alternate_identifier', ADMS.identifier),
            ('conforms_to', DCT.conformsTo),
            ('documentation', FOAF.page),
            ('related_resource', DCT.relation),
            ('has_version', DCT.hasVersion),
            ('is_version_of', DCT.isVersionOf),
            ('source', DCT.source),
            ('sample', ADMS.sample),
    ):
        values = self._object_value_list(dataset_ref, predicate)
        if values:
            dataset_dict['extras'].append({'key': key,
                                           'value': json.dumps(values)})

    # Contact details
    contact = self._contact_details(dataset_ref, DCAT.contactPoint)
    if not contact:
        # adms:contactPoint was supported on the first version of DCAT-AP
        contact = self._contact_details(dataset_ref, ADMS.contactPoint)

    if contact:
        for key in ('uri', 'name', 'email'):
            if contact.get(key):
                dataset_dict['extras'].append(
                    {'key': 'contact_{0}'.format(key),
                     'value': contact.get(key)})

    # Publisher
    publisher = self._publisher(dataset_ref, DCT.publisher)
    for key in ('uri', 'name', 'email', 'url', 'type'):
        if publisher.get(key):
            dataset_dict['extras'].append(
                {'key': 'publisher_{0}'.format(key),
                 'value': publisher.get(key)})

    # Temporal
    start, end = self._time_interval(dataset_ref, DCT.temporal)
    if start:
        dataset_dict['extras'].append(
            {'key': 'temporal_start', 'value': start})
    if end:
        dataset_dict['extras'].append(
            {'key': 'temporal_end', 'value': end})

    # Spatial
    spatial = self._spatial(dataset_ref, DCT.spatial)
    for key in ('uri', 'text', 'geom'):
        if spatial.get(key):
            dataset_dict['extras'].append(
                {'key': 'spatial_{0}'.format(key) if key != 'geom'
                 else 'spatial',
                 'value': spatial.get(key)})

    # Dataset URI (explicitly show the missing ones)
    dataset_uri = (unicode(dataset_ref)
                   if isinstance(dataset_ref, rdflib.term.URIRef)
                   else '')
    dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

    # License
    if 'license_id' not in dataset_dict:
        dataset_dict['license_id'] = self._license(dataset_ref)

    # Source Catalog
    if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)):
        catalog_src = self._get_source_catalog(dataset_ref)
        if catalog_src is not None:
            src_data = self._extract_catalog_dict(catalog_src)
            dataset_dict['extras'].extend(src_data)

    # Resources
    for distribution in self._distributions(dataset_ref):

        resource_dict = {}

        # Simple values
        for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
                ('download_url', DCAT.downloadURL),
                ('issued', DCT.issued),
                ('modified', DCT.modified),
                ('status', ADMS.status),
                ('rights', DCT.rights),
                ('license', DCT.license),
        ):
            value = self._object_value(distribution, predicate)
            if value:
                resource_dict[key] = value

        resource_dict['url'] = (
            self._object_value(distribution, DCAT.accessURL) or
            self._object_value(distribution, DCAT.downloadURL))

        # Lists
        for key, predicate in (
                ('language', DCT.language),
                ('documentation', FOAF.page),
                ('conforms_to', DCT.conformsTo),
        ):
            values = self._object_value_list(distribution, predicate)
            if values:
                resource_dict[key] = json.dumps(values)

        # Format and media type
        normalize_ckan_format = config.get(
            'ckanext.dcat.normalize_ckan_format', True)
        imt, label = self._distribution_format(distribution,
                                               normalize_ckan_format)

        if imt:
            resource_dict['mimetype'] = imt

        if label:
            resource_dict['format'] = label
        elif imt:
            resource_dict['format'] = imt

        # Size
        size = self._object_value_int(distribution, DCAT.byteSize)
        if size is not None:
            resource_dict['size'] = size

        # Checksum
        for checksum in self.g.objects(distribution, SPDX.checksum):
            algorithm = self._object_value(checksum, SPDX.algorithm)
            checksum_value = self._object_value(checksum, SPDX.checksumValue)
            if algorithm:
                resource_dict['hash_algorithm'] = algorithm
            if checksum_value:
                resource_dict['hash'] = checksum_value

        # Distribution URI (explicitly show the missing ones)
        resource_dict['uri'] = (unicode(distribution)
                                if isinstance(distribution,
                                              rdflib.term.URIRef)
                                else '')

        dataset_dict['resources'].append(resource_dict)

    if self.compatibility_mode:
        # Tweak the resulting dict to make it compatible with previous
        # versions of the ckanext-dcat parsers
        for extra in dataset_dict['extras']:
            if extra['key'] in ('issued', 'modified', 'publisher_name',
                                'publisher_email',):
                extra['key'] = 'dcat_' + extra['key']

            if extra['key'] == 'language':
                extra['value'] = ','.join(
                    sorted(json.loads(extra['value'])))

    return dataset_dict

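# Hedged illustration of the keyword handling in parse_dataset (sample values
# are assumptions): a single dcat:keyword such as 'rivers, lakes' is split
# into separate tags, and munge_tag is only applied when the
# ckanext.dcat.clean_tags option is enabled.
from ckan.lib.munge import munge_tag

keywords = ['Hydrology', 'rivers, lakes']

keywords_with_commas = [k for k in keywords if ',' in k]
for keyword in keywords_with_commas:
    keywords.remove(keyword)
    keywords.extend([k.strip() for k in keyword.split(',')])
# keywords is now ['Hydrology', 'rivers', 'lakes']

do_clean = True  # stands in for toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
tags = [{'name': munge_tag(tag) if do_clean else tag} for tag in keywords]
# [{'name': 'hydrology'}, {'name': 'rivers'}, {'name': 'lakes'}]
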
def _build_term_translations(self, dataset):
    """Generate meaningful term translations for all translated values"""
    translations = []
    langs = ['fr', 'it', 'en']

    dataset_attrs = dataset.find('dataset_attributes')
    for attr in self.DATASET_ATTRIBUTES:
        term = dataset_attrs.find(attr).find('de').text
        log.debug('Create translation for %s' % term)
        if attr == 'tags':
            for lang in langs:
                trans = dataset_attrs.find(attr).find(lang).text
                # Tags are split and translated individually; the distinct
                # loop names below keep the full German string in ``term``
                # intact for the following languages
                split_term = self._clean_values(term.split(','))
                split_trans = self._clean_values(trans.split(','))
                if len(split_term) == len(split_trans):
                    for de_tag, trans_tag in zip(split_term, split_trans):
                        log.debug(
                            'Term (tag): %s, Translation (%s): %s'
                            % (de_tag, lang, trans_tag)
                        )
                        translations.append({
                            u'lang_code': lang,
                            u'term': munge_tag(de_tag),
                            u'term_translation': munge_tag(trans_tag)
                        })
        else:
            for lang in langs:
                trans = dataset_attrs.find(attr).find(lang).text
                if term != trans:
                    log.debug(
                        'Term: %s, Translation (%s): %s'
                        % (term, lang, trans)
                    )
                    translations.append({
                        u'lang_code': lang,
                        u'term': term,
                        u'term_translation': trans
                    })

    resources = dataset.findall('resource')
    for resource in resources:
        for attr in self.RESOURCE_ATTRIBUTES:
            res_attr = resource.find('resource_attributes')
            term = res_attr.find(attr).find('de').text
            log.debug('Create translation for %s' % term)
            for lang in langs:
                trans = res_attr.find(attr).find(lang).text
                if term != trans:
                    log.debug(
                        'Term: %s, Translation (%s): %s'
                        % (term, lang, trans)
                    )
                    translations.append({
                        u'lang_code': lang,
                        u'term': term,
                        u'term_translation': trans
                    })

    return translations

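# Hedged illustration of the positional tag pairing in
# _build_term_translations (sample tags are assumptions): the German and
# translated tag lists are zipped index by index, so translation rows are
# only emitted when both languages declare the same number of tags.
from ckan.lib.munge import munge_tag

de_tags = ['Wasser', 'Seen']
fr_tags = ['eau', 'lacs']

rows = []
if len(de_tags) == len(fr_tags):
    rows = [{
        u'lang_code': 'fr',
        u'term': munge_tag(de),
        u'term_translation': munge_tag(fr),
    } for de, fr in zip(de_tags, fr_tags)]
# rows[0] == {u'lang_code': 'fr', u'term': 'wasser', u'term_translation': 'eau'}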