def test_munge_tag_multiple_pass(self):
    '''Munge a list of tags multiple times gives expected results.

    Checks that munge_tag is idempotent: munging an already-munged tag
    must not change it again.
    '''
    for org, exp in self.munge_list:
        first_munge = munge_tag(org)
        assert_equal(first_munge, exp)
        # a second pass over the munged value must be a no-op
        second_munge = munge_tag(first_munge)
        assert_equal(second_munge, exp)
def test_munge_tag_multiple_pass(original, expected):
    """Munging a tag twice gives the same result as munging it once."""
    once = munge_tag(original)
    assert once == expected
    # idempotence: re-munging an already-munged value is a no-op
    assert munge_tag(once) == expected
def test_munge_tag_muliple_pass(self):
    # NOTE(review): the method name misspells 'multiple'; kept as-is since
    # test runners may reference it by name.
    '''Munge a list of tags multiple times gives expected results.

    Checks that munge_tag is idempotent for every fixture pair.
    '''
    for org, exp in self.munge_list:
        first_munge = munge_tag(org)
        nose_tools.assert_equal(first_munge, exp)
        # a second pass over the munged value must be a no-op
        second_munge = munge_tag(first_munge)
        nose_tools.assert_equal(second_munge, exp)
def _extract_tags_and_extras(self, content):
    """Split harvested metadata into CKAN tags and extras.

    Keys covered by the explicit field mapping are skipped; 'type' and
    'subject' values become tags (split on ';' when given as a single
    string); everything else lands in extras. Date-like values are
    normalised to naive ISO format.

    :returns: tuple (tags, extras) -- tags is a list of munged tag
        strings, extras a list of (key, value) pairs.
    """
    extras = []
    tags = []
    for key, value in content.iteritems():  # Python 2 dict iteration
        # keys handled by the field mapping are not extras
        if key in self._get_mapping().values():
            continue
        if key in ['type', 'subject']:
            if type(value) is list:
                tags.extend(value)
            else:
                tags.extend(value.split(';'))
            continue
        # unwrap single-element lists; falsy values become None
        if value and type(value) is list:
            value = value[0]
        if not value:
            value = None
        if key.endswith('date') and value:
            # the ckan indexer can't handle timezone-aware datetime objects
            try:
                from dateutil.parser import parse
                date_value = parse(value)
                date_without_tz = date_value.replace(tzinfo=None)
                value = date_without_tz.isoformat()
            except (ValueError, TypeError):
                # NOTE(review): an unparseable date drops the key from
                # extras entirely instead of keeping the raw value --
                # confirm this is intended
                continue
        extras.append((key, value))
    # truncate to CKAN's 100-char tag limit before munging
    tags = [munge_tag(tag[:100]) for tag in tags]
    return (tags, extras)
def _extract_tags_and_extras(self, content):
    """Split harvested metadata into CKAN tags and extras.

    Keys covered by the field mapping are skipped; 'type' and 'subject'
    values become tags (split on ';' when given as a single string);
    everything else lands in extras. Date-like values are normalised to
    naive ISO format because the CKAN indexer cannot handle
    timezone-aware datetimes.

    :returns: tuple (tags, extras) -- tags is a list of munged tag
        strings, extras a list of (key, value) pairs.
    """
    extras = []
    tags = []
    # hoisted: the mapping is loop-invariant, and set membership avoids
    # rebuilding a throwaway list on every iteration
    mapped_keys = set(self._get_mapping().values())
    for key, value in content.items():
        if key in mapped_keys:
            continue
        if key in ('type', 'subject'):
            if isinstance(value, list):
                tags.extend(value)
            else:
                tags.extend(value.split(';'))
            continue
        # unwrap single-element lists; falsy values become None
        if value and isinstance(value, list):
            value = value[0]
        if not value:
            value = None
        if key.endswith('date') and value:
            # the ckan indexer can't handle timezone-aware datetime objects
            try:
                from dateutil.parser import parse
                date_value = parse(value)
                value = date_value.replace(tzinfo=None).isoformat()
            except (ValueError, TypeError):
                # unparseable dates are dropped entirely (no extra entry)
                continue
        extras.append((key, value))
    # truncate to CKAN's 100-char tag limit before munging
    tags = [munge_tag(tag[:100]) for tag in tags]
    return (tags, extras)
def _clean_tags(self, tags):
    """Munge tags and drop those that munge to the empty string.

    Handles two input shapes:
    - package_show form: a list of dicts with a 'name' key (munged in
      place, order preserved)
    - REST form: a list of plain strings (munged and de-duplicated)

    The shape is detected via EAFP: indexing a plain string with
    ``t['name']`` raises TypeError, which switches to the REST branch.
    """
    try:
        def _update_tag(tag_dict, key, newvalue):
            # update the dict and return it
            tag_dict[key] = newvalue
            return tag_dict

        # assume it's in the package_show form
        tags = [_update_tag(t, 'name', munge_tag(t['name']))
                for t in tags if munge_tag(t['name']) != '']

    except TypeError:
        # a TypeError is raised if `t` above is a string
        # REST format: 'tags' is a list of strings
        tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
        tags = list(set(tags))
        return tags

    return tags
def _clean_tags(self, tags):
    """Munge tag values and drop any that munge to the empty string.

    Accepts either package_show-style tags (list of dicts with a
    'name' key) or REST-style tags (list of plain strings); the
    REST form is additionally de-duplicated.
    """
    try:
        # package_show form: munge each dict's 'name' in place, keeping
        # only entries whose munged name is non-empty
        def _with_munged_name(tag_dict):
            tag_dict['name'] = munge_tag(tag_dict['name'])
            return tag_dict

        tags = [_with_munged_name(t) for t in tags
                if munge_tag(t['name']) != '']
    except TypeError:
        # indexing a plain string with ['name'] raises TypeError, so we
        # are in the REST format: munge, filter and de-duplicate
        tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
        return list(set(tags))

    return tags
def map_to_ogdch_keywords(geocat_keywords):
    """Regroup geocat keyword dicts into per-language ogdch lists.

    Only the four supported languages are kept; empty keywords and the
    catalog marker 'opendata.swiss' are filtered out; every kept
    keyword is munged.
    """
    ogdch_keywords = {'fr': [], 'de': [], 'en': [], 'it': []}
    for keyword in geocat_keywords:
        for lang, geocat_keyword in keyword.items():
            if lang not in ogdch_keywords:
                continue
            if geocat_keyword and geocat_keyword != 'opendata.swiss':
                ogdch_keywords[lang].append(munge_tag(geocat_keyword))
    return ogdch_keywords
def _keywords(self, subject, predicate):
    """Collect keyword literals for ``subject`` grouped by language.

    :returns: dict mapping language code (None for literals without a
        language tag) to a list of munged keyword strings.
    """
    keywords = {}
    for keyword_node in self.g.objects(subject, predicate):
        lang = keyword_node.language
        # munge the literal's text (Python 2 ``unicode``)
        keyword = munge_tag(unicode(keyword_node))
        keywords.setdefault(lang, []).append(keyword)
    return keywords
def import_stage(self, harvest_object):
    """Import a harvested NADA record as a CKAN package.

    Loads the DDI XML into a package dict, moves unmapped fields to
    extras, sets URL/license/tags/resources and delegates to
    _create_or_update_package.

    :returns: True on success, False on error.
    """
    log.debug('In NadaHarvester import_stage')
    self._set_config(harvest_object.job.source.config)
    # NOTE(review): this guard cannot trigger usefully -- harvest_object
    # was already dereferenced on the line above
    if not harvest_object:
        log.error('No harvest object received')
        self._save_object_error('No harvest object received', harvest_object)
        return False
    try:
        base_url = harvest_object.source.url.rstrip('/')
        ckan_metadata = DdiCkanMetadata()
        pkg_dict = ckan_metadata.load(harvest_object.content)
        # move unrecognised attributes into extras
        pkg_dict = self._convert_to_extras(pkg_dict)
        # update URL with NADA catalog link
        catalog_path = self._get_catalog_path(harvest_object.guid)
        pkg_dict['url'] = base_url + catalog_path
        # set license from harvester config or use CKAN instance default
        if 'license' in self.config:
            pkg_dict['license_id'] = self.config['license']
        else:
            pkg_dict['license_id'] = config.get(
                'ckanext.ddi.default_license', '')
        # munge string tags, truncated to CKAN's 100-char limit
        tags = []
        for tag in pkg_dict['tags']:
            if isinstance(tag, basestring):  # Python 2 ``basestring``
                tags.append(munge_tag(tag[:100]))
        pkg_dict['tags'] = tags
        pkg_dict['version'] = pkg_dict['version'][:100]
        # add resources: DDI XML and the NADA catalog entry
        resources = [
            {
                'url': base_url + self._get_ddi_api(harvest_object.guid),
                'name': 'DDI XML of %s' % pkg_dict['title'],
                'format': 'xml'
            },
            {
                'url': pkg_dict['url'],
                'name': 'NADA catalog entry',
                'format': 'html'
            },
        ]
        pkg_dict['resources'] = resources
        log.debug('package dict: %s' % pkg_dict)
        return self._create_or_update_package(pkg_dict, harvest_object)
    except Exception, e:  # Python 2 except syntax
        self._save_object_error(('Exception in import stage: %r / %s'
                                 % (e, traceback.format_exc())),
                                harvest_object)
        return False
def _keywords(self, subject, predicate): keywords = {} # initialize the keywords with empty lists for all languages for lang in dh.get_langs(): keywords[lang] = [] for keyword_node in self.g.objects(subject, predicate): lang = keyword_node.language keyword = munge_tag(unicode(keyword_node)) keywords.setdefault(lang, []).append(keyword) return keywords
def do_load_regions(g, vocab_name):
    """Extract region concepts (identifier plus language labels) from ``g``.

    The identifier is the munged last path segment of the region URI;
    each label carries the literal's language and text.
    """
    concepts = []
    for region in g.subjects(None, URIRef(REGION_TYPE)):
        identifier = munge_tag(region.split('/')[-1])
        labels = [
            {'lang': literal.language, 'text': literal.value}
            for literal in g.objects(region, URIRef(NAME_TYPE))
        ]
        concepts.append({'name': identifier, 'labels': labels})
    log.info(f'Loaded {len(concepts)} regions')
    return concepts
def validator(key, data, errors, context):
    """Normalise a JSON-encoded multilingual keyword field in place.

    Skips the field when earlier validators already recorded errors for
    it. Rebuilds the value so that every configured form language is
    present (empty list when missing) and every keyword is munged, then
    writes the result back as JSON.

    Note: ``schema`` is a closure variable from the enclosing scope.
    """
    if errors[key]:
        return
    value = json.loads(data[key])
    new_value = {}
    for lang in schema['form_languages']:
        # membership test on the dict directly instead of .keys()
        if lang in value:
            new_value[lang] = [munge_tag(keyword) for keyword in value[lang]]
        else:
            new_value[lang] = []
    data[key] = json.dumps(new_value)
def do_load_regions(g, vocab_name):
    """Load region identifiers and their language labels from graph ``g``.

    :returns: tuple (pref_labels, concepts) -- concepts is a list of
        munged identifiers, pref_labels a flat list of label dicts.
    """
    concepts = []
    pref_labels = []
    for reg in g.subjects(None, URIRef(REGION_TYPE)):
        names = list(g.objects(reg, URIRef(NAME_TYPE)))
        # identifier is the last URI path segment (Python 2 ``unicode``)
        identifier = munge_tag(unicode(reg).split('/')[-1])
        concepts.append(identifier)
        for n in names:
            label = {'name': identifier,
                     'lang': n.language,
                     'localized_text': n.value}
            pref_labels.append(label)
    # NOTE(review): both logged and printed -- the print looks like a
    # leftover debug statement
    log.info('Loaded %d regions', len(concepts))
    print('Loaded %d regions' % len(concepts))
    return pref_labels, concepts
def importCmd(self, path=None):
    """Walk ``path`` and import every PDF (plus its metadata XML) as a
    CKAN package with an attached PDF and XML resource.

    For each PDF: parses the sibling ``<base>.xml`` metadata, extracts
    full text via Tika, derives tags from selected metadata fields, and
    creates/updates the package. Errors in one directory are printed
    and the walk continues.
    """
    self.ckan = self._ckan_connect()
    if (path is None):
        print "Argument 'path' must be set"
        self.helpCmd()
        sys.exit(1)
    for root, dirs, files in os.walk(path):
        for dir_name in dirs:
            try:
                dir_path = os.path.join(root, dir_name)
                print "dir_path: %s" % dir_path
                for file_name in os.listdir(dir_path):
                    file_path = os.path.join(dir_path, file_name)
                    # only process regular PDF files
                    if not file_path.endswith('.pdf') or not os.path.isfile(file_path):
                        continue
                    base_name = file_name.split('.')[0]
                    meta_xml_path = os.path.join(dir_path, base_name + '.xml')
                    metadata = self._parse_metadata(meta_xml_path)
                    # read fulltext with tika
                    metadata['full_text_search'] = self.tika_parser.parse_with_tika(file_path)
                    print "FULLTEXT: %s" % metadata['full_text_search']
                    # add tags to structure
                    tags = [
                        metadata.get('source', '').replace('#', ' ').replace('-', ' '),
                        metadata.get('contributor'),
                        metadata.get('creator'),
                        metadata.get('publisher'),
                        metadata.get('pdf_image_color_mode'),
                        metadata.get('pdf_image_color_space'),
                        metadata.get('pdf_image_format'),
                        metadata.get('pdf_image_resolution'),
                    ]
                    # NOTE(review): `tag and tag is not None` is redundant --
                    # truthiness already excludes None
                    tags = [munge_tag(tag) for tag in tags if tag and tag is not None]
                    metadata['tags'] = [{'name': tag} for tag in set(tags)]
                    pkg = self._create_or_update_package(base_name, metadata)
                    self._attach_file(pkg['id'], file_name, file_name, file_path, metadata, 'PDF')
                    self._attach_file(pkg['id'], base_name + '.xml', 'Metadata XML', meta_xml_path, format='XML')
            except Exception, e:  # Python 2 except syntax; `e` is unused
                traceback.print_exc()
def process(self, record):
    """Create or update a CKAN publication package from a Mendeley
    bibliography record.

    Maps bibliography fields onto a CKAN package dict (tags munged,
    authors split on ' and ', optional identifiers collected with a
    scheme prefix), then calls package_update or package_create
    depending on whether the package already exists.
    """
    # (removed a no-op ``record = record`` self-assignment)
    data_dict = {
        'id': record['ID'],
        'title': record['title'].strip('{}'),
        'name': munge_title_to_name(record['ID'] + record['title']),
        'notes': record['abstract'],
        'harvest_source': 'MENDELEY',
        'creator': record['author'].replace(',', '').split(' and '),
        'tag_string': ','.join(munge_tag(tag)
                               for tag in record['keywords'].split(',')),
        'owner_org': tk.config.get(
            'ckanext.ingestor.config.mendeley_bib.owner_org', 'iaea'),
        'type': 'publications'
    }

    # optional identifiers, prefixed with their scheme
    identifiers = []
    if 'doi' in record:
        identifiers.append('doi:' + record['doi'])
    if 'isbn' in record:
        identifiers.append('isbn:' + record['isbn'])
    if 'pmid' in record:
        identifiers.append('pmid:' + record['pmid'])
    data_dict['identifier'] = identifiers

    # optional single-valued fields are wrapped in lists
    if 'editor' in record:
        data_dict['contributor'] = [record['editor']]
    if 'publisher' in record:
        data_dict['publisher'] = [record['publisher']]
    if 'language' in record:
        data_dict['language'] = [record['language']]
    data_dict['source'] = record.get('url')

    user = tk.get_action('get_site_user')({'ignore_auth': True})
    existing = model.Package.get(data_dict['id'])
    action = tk.get_action(
        'package_update' if existing else 'package_create')
    action({'ignore_auth': True, 'user': user['name']}, data_dict)
def _extract_tags_and_extras(self, content):
    """Split harvested metadata into CKAN tags and extras.

    'type' and 'subject' values become tags (split on ';' when given
    as a single string); keys covered by the field mapping are skipped;
    everything else becomes an extra.

    :returns: tuple (tags, extras)
    """
    extras = []
    tags = []
    for key, value in content.iteritems():  # Python 2 dict iteration
        if key in self._get_mapping().values():
            continue
        if key in ["type", "subject"]:
            if type(value) is list:
                tags.extend(value)
            else:
                tags.extend(value.split(";"))
            continue
        # unwrap single-element lists; falsy values become None
        if value and type(value) is list:
            value = value[0]
        if not value:
            value = None
        extras.append((key, value))
    # truncate to CKAN's 100-char tag limit before munging
    tags = [munge_tag(tag[:100]) for tag in tags]
    return (tags, extras)
def _nonEpos_extract_tags_and_extras(self, content):
    """Split non-EPOS harvested metadata into CKAN tags and extras.

    'type' and 'subject' values become tags (split on ';' when given
    as a single string); keys covered by the field mapping are skipped;
    everything else becomes an extra.

    :returns: tuple (tags, extras)
    """
    extras = []
    tags = []
    for key, value in content.iteritems():  # Python 2 dict iteration
        if key in self._get_mapping().values():
            continue
        if key in ['type', 'subject']:
            if type(value) is list:
                tags.extend(value)
            else:
                tags.extend(value.split(';'))
            continue
        # unwrap single-element lists; falsy values become None
        if value and type(value) is list:
            value = value[0]
        if not value:
            value = None
        extras.append((key, value))
    # truncate to CKAN's 100-char tag limit before munging
    tags = [munge_tag(tag[:100]) for tag in tags]
    return (tags, extras)
def parse_dataset(self, dataset_dict, dataset_ref):  # noqa
    """Map an RDF dataset node onto a CKAN dataset dict.

    Fills tags, keywords, groups (themes), languages, contact points,
    publishers, relations, temporals, see-alsos and resources from the
    graph ``self.g``.
    """
    log.debug("Parsing dataset '%r'" % dataset_ref)

    dataset_dict['temporals'] = []
    dataset_dict['tags'] = []
    dataset_dict['extras'] = []
    dataset_dict['resources'] = []
    dataset_dict['relations'] = []
    dataset_dict['see_alsos'] = []

    # Basic fields
    for key, predicate in (
            ('identifier', DCT.identifier),
            ('accrual_periodicity', DCT.accrualPeriodicity),
            ('spatial_uri', DCT.spatial),
            ('spatial', DCT.spatial),
            ('url', DCAT.landingPage),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict[key] = value

    # Timestamp fields (normalised via _clean_datetime)
    for key, predicate in (
            ('issued', DCT.issued),
            ('modified', DCT.modified),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict[key] = self._clean_datetime(value)

    # Multilingual basic fields
    for key, predicate in (
            ('title', DCT.title),
            ('description', DCT.description),
    ):
        value = self._object_value(dataset_ref, predicate, multilang=True)
        if value:
            dataset_dict[key] = value

    # Tags (munged; Python 2 ``unicode``)
    keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
    for keyword in keywords:
        dataset_dict['tags'].append({'name': munge_tag(unicode(keyword))})

    # Keywords
    dataset_dict['keywords'] = self._keywords(dataset_ref, DCAT.keyword)

    # Themes
    dcat_theme_urls = self._object_value_list(dataset_ref, DCAT.theme)
    if dcat_theme_urls:
        dataset_dict['groups'] = []
        for dcat_theme_url in dcat_theme_urls:
            # group name is the slug matched out of the theme URL
            search_result = slug_id_pattern.search(dcat_theme_url)
            dcat_theme_slug = search_result.group()
            dataset_dict['groups'].append({'name': dcat_theme_slug})

    # Languages
    languages = self._object_value_list(dataset_ref, DCT.language)
    if languages:
        dataset_dict['language'] = languages

    # Contact details
    dataset_dict['contact_points'] = self._contact_points(
        dataset_ref, DCAT.contactPoint)

    # Publisher
    dataset_dict['publishers'] = self._publishers(
        dataset_ref, DCT.publisher)

    # Relations
    dataset_dict['relations'] = self._relations(dataset_ref, DCT.relation)

    # Temporal
    dataset_dict['temporals'] = self._temporals(dataset_ref, DCT.temporal)

    # References
    see_alsos = self._object_value_list(dataset_ref, RDFS.seeAlso)
    for see_also in see_alsos:
        dataset_dict['see_alsos'].append({'dataset_identifier': see_also})

    # Dataset URI (explicitly show the missing ones)
    dataset_uri = (unicode(dataset_ref) if isinstance(
        dataset_ref, rdflib.term.URIRef) else '')
    dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

    # Resources
    for distribution in self._distributions(dataset_ref):
        resource_dict = {
            'media_type': '',
            'language': [],
        }

        # Simple values
        for key, predicate in (
                ('identifier', DCT.identifier),
                ('format', DCT['format']),
                ('mimetype', DCAT.mediaType),
                ('media_type', DCAT.mediaType),
                ('download_url', DCAT.downloadURL),
                ('url', DCAT.accessURL),
                ('rights', DCT.rights),
                ('license', DCT.license),
        ):
            value = self._object_value(distribution, predicate)
            if value:
                resource_dict[key] = value

        # if media type is not set, use format as fallback
        if (not resource_dict.get('media_type') and
                resource_dict.get('format')):
            resource_dict['media_type'] = resource_dict['format']

        # Timestamp fields
        for key, predicate in (
                ('issued', DCT.issued),
                ('modified', DCT.modified),
        ):
            value = self._object_value(distribution, predicate)
            if value:
                resource_dict[key] = self._clean_datetime(value)

        # Multilingual fields
        for key, predicate in (
                ('title', DCT.title),
                ('description', DCT.description),
        ):
            value = self._object_value(distribution, predicate,
                                       multilang=True)
            if value:
                resource_dict[key] = value

        # access URL wins over download URL; empty string when neither set
        resource_dict['url'] = (
            self._object_value(distribution, DCAT.accessURL) or
            self._object_value(distribution, DCAT.downloadURL) or '')

        # languages
        for language in self._object_value_list(distribution, DCT.language):
            resource_dict['language'].append(language)

        # byteSize
        byte_size = self._object_value_int(distribution, DCAT.byteSize)
        if byte_size is not None:
            resource_dict['byte_size'] = byte_size

        # Distribution URI (explicitly show the missing ones)
        resource_dict['uri'] = (unicode(distribution) if isinstance(
            distribution, rdflib.term.URIRef) else '')

        dataset_dict['resources'].append(resource_dict)

    log.debug("Parsed dataset '%r': %s" % (dataset_ref, dataset_dict))

    return dataset_dict
def _build_package_dict(self, context, harvest_object):
    '''
    Build and return a package_dict suitable for use with CKAN
    `package_create` and `package_update`.

    The harvest object's JSON content is a Socrata-style record with
    'resource', 'classification' and 'metadata' sections; extras are
    appended in a fixed order.
    '''
    # Local harvest source organization
    source_dataset = toolkit.get_action('package_show')(
        context.copy(), {
            'id': harvest_object.source.id
        })
    local_org = source_dataset.get('owner_org')

    res = json.loads(harvest_object.content)

    package_dict = {
        'title': res['resource']['name'],
        'name': self._gen_new_name(res['resource']['name']),
        'url': res.get('permalink', ''),
        'notes': res['resource'].get('description', ''),
        'author': res['resource']['attribution'],
        'tags': [],
        'extras': [],
        'identifier': res['resource']['id'],
        'owner_org': local_org,
        'resources': [],
    }

    # Add tags (plain + domain tags, munged to CKAN-safe names)
    package_dict['tags'] = \
        [{'name': munge_tag(t)}
         for t in res['classification'].get('tags', [])
         + res['classification'].get('domain_tags', [])]

    # Add domain_metadata to extras
    package_dict['extras'].extend(res['classification'].get(
        'domain_metadata', []))

    # Add source createdAt to extras
    package_dict['extras'].append({
        'key': 'source_created_at',
        'value': res['resource']['createdAt']
    })

    # Add source updatedAt to extras
    package_dict['extras'].append({
        'key': 'source_updated_at',
        'value': res['resource']['updatedAt']
    })

    # Add owner_display_name to extras
    package_dict['extras'].append({
        'key': 'owner_display_name',
        'value': res.get('owner', {}).get('display_name')
    })

    # Add categories to extras
    package_dict['extras'].append({
        'key': 'categories',
        'value': [
            t for t in res['classification'].get('categories', [])
            + res['classification'].get('domain_categories', [])
        ],
    })

    # Add Socrata metadata.license if available
    if res['metadata'].get('license', False):
        package_dict['extras'].append({
            'key': 'license',
            'value': res['metadata']['license']
        })

    # Add provenance
    if res['resource'].get('provenance', False):
        package_dict['provenance'] = res['resource']['provenance']

    # Resources: a single CSV download endpoint
    package_dict['resources'] = [{
        'url': DOWNLOAD_ENDPOINT_TEMPLATE.format(
            domain=urlparse(harvest_object.source.url).hostname,
            resource_id=res['resource']['id']),
        'format': 'CSV'
    }]

    return package_dict
def import_stage(self, harvest_object):
    """Import a harvested NADA record as a CKAN package.

    Loads the DDI XML content into a package dict, moves unmapped
    fields to extras, fills in URL/license/tags/resources and hands the
    dict to _create_or_update_package.

    :returns: True on success, False on error.
    """
    log.debug('In NadaHarvester import_stage')
    self._set_config(harvest_object.job.source.config)
    # NOTE(review): this guard cannot trigger usefully -- harvest_object
    # was already dereferenced on the line above
    if not harvest_object:
        log.error('No harvest object received')
        self._save_object_error(
            'No harvest object received',
            harvest_object
        )
        return False

    try:
        base_url = harvest_object.source.url.rstrip('/')

        # Get a class which maps ckan metadata to the DDI equivalent
        ckan_metadata = DdiCkanMetadata()
        # Extract metadata content from XML DDI
        # put it in a dictionary
        pkg_dict = ckan_metadata.load(harvest_object.content)

        # Go through the dictionary and put 'unrecognised' attributes
        # into a field called 'extras' (any field which isn't in
        # DEFAULT ATTRIBUTES)
        pkg_dict = self._convert_to_extras(pkg_dict)

        # update URL with NADA catalog link
        catalog_path = self._get_catalog_path(harvest_object.guid)
        pkg_dict['url'] = base_url + catalog_path

        # set license from harvester config or use CKAN instance default
        if 'license' in self.config:
            pkg_dict['license_id'] = self.config['license']
        else:
            pkg_dict['license_id'] = config.get(
                'ckanext.ddi.default_license', ''
            )

        # Add tags if necessary (munged, truncated to CKAN's 100 chars)
        tags = []
        for tag in pkg_dict['tags']:
            if isinstance(tag, basestring):  # Python 2 ``basestring``
                tags.append(munge_tag(tag[:100]))
        pkg_dict['tags'] = tags

        pkg_dict['version'] = pkg_dict['version'][:100]

        # add resources
        # basically sources
        resources = [
            {
                'url': base_url + self._get_ddi_api(harvest_object.guid),
                'name': 'DDI XML of %s' % pkg_dict['title'],
                'format': 'xml'
            },
            {
                'url': pkg_dict['url'],
                'name': 'NADA catalog entry',
                'format': 'html'
            },
        ]
        pkg_dict['resources'] = resources

        log.debug('package dict: %s' % pkg_dict)

        # Now create the package
        return self._create_or_update_package(pkg_dict, harvest_object)
    except Exception, e:  # Python 2 except syntax
        self._save_object_error(
            (
                'Exception in import stage: %r / %s'
                % (e, traceback.format_exc())
            ),
            harvest_object
        )
        return False
def munge_tags(package_dict):
    """Munge every tag name in ``package_dict['tags']`` in place.

    Falsy tag entries are skipped, names munging to '__' (pure padding)
    are dropped, and duplicates are removed while preserving order.
    """
    munged = []
    for tag in package_dict.get('tags', []):
        if not tag:
            continue
        name = munge_tag(tag['name'])
        if name != '__':  # '__' means the original was only padding
            munged.append(name)
    unique_names = remove_duplicates_in_a_list(munged)
    package_dict['tags'] = [{'name': name} for name in unique_names]
def _build_package_dict(self, context, harvest_object):
    '''
    Build and return a package_dict suitable for use with CKAN
    `package_create` and `package_update`.
    '''
    # Organization that owns the local harvest source
    source_dataset = toolkit.get_action('package_show')(
        context.copy(), {'id': harvest_object.source.id}
    )
    local_org = source_dataset.get('owner_org')

    res = json.loads(harvest_object.content)
    resource = res['resource']
    classification = res['classification']

    package_dict = {
        'title': resource['name'],
        'name': self._gen_new_name(resource['name']),
        'url': res.get('permalink', ''),
        'notes': resource.get('description', ''),
        'author': resource['attribution'],
        'tags': [],
        'extras': [],
        'identifier': resource['id'],
        'owner_org': local_org,
        'resources': [],
    }

    # Tags: plain + domain tags, munged to CKAN-safe names
    all_tags = (classification.get('tags', [])
                + classification.get('domain_tags', []))
    package_dict['tags'] = [{'name': munge_tag(t)} for t in all_tags]

    # Extras, appended in a fixed order
    extras = package_dict['extras']
    extras.extend(classification.get('domain_metadata', []))
    extras.append({
        'key': 'source_created_at',
        'value': resource['createdAt']
    })
    extras.append({
        'key': 'source_updated_at',
        'value': resource['updatedAt']
    })
    extras.append({
        'key': 'owner_display_name',
        'value': res.get('owner', {}).get('display_name')
    })
    extras.append({
        'key': 'categories',
        'value': (classification.get('categories', [])
                  + classification.get('domain_categories', [])),
    })

    # Socrata metadata.license, when available
    if res['metadata'].get('license', False):
        extras.append({
            'key': 'license',
            'value': res['metadata']['license']
        })

    # Provenance
    if resource.get('provenance', False):
        package_dict['provenance'] = resource['provenance']

    # Resources: a single CSV download endpoint
    package_dict['resources'] = [{
        'url': DOWNLOAD_ENDPOINT_TEMPLATE.format(
            domain=urlparse(harvest_object.source.url).hostname,
            resource_id=resource['id']),
        'format': 'CSV'
    }]

    return package_dict
def munge_tag(self):
    """API endpoint: munge the 'tag' (falling back to 'name') request
    parameter and return the munged value."""
    raw_tag = request.params.get("tag") or request.params.get("name")
    return self._finish_ok(munge.munge_tag(raw_tag))
def test_munge_tag(self):
    '''A single munge pass over the fixture list gives expected results.'''
    for raw, expected in self.munge_list:
        nose_tools.assert_equal(munge_tag(raw), expected)
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an exisiting one according to the
    package dictionary provided.

    The package dictionary should look like the REST API response for a
    package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if the
        create or update occurred ok, 'unchanged' if it didn't need
        updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts
    '''
    try:
        # Change default schema
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]  # Python 2 ``unicode``
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # optionally munge and de-duplicate tags before the write
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date
            if not 'metadata_modified' in package_dict or \
                    package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                package_dict.setdefault('name', existing_package_dict['name'])

                new_package = get_action('package_update_rest')(context, package_dict)

            else:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except ValidationError,e:  # Python 2 except syntax
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
        # NOTE(review): no explicit return here -- the caller receives
        # None rather than False on validation errors; confirm callers
        # treat None as failure
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)
    if 'records' in master_data:
        records = master_data['records']
        set_name = master_data['set_name']
        for rec in records:
            identifier, metadata, _ = rec
            if metadata:
                name = metadata['title'][0] if len(metadata['title'])\
                    else identifier
                title = name
                # slugify the title: ASCII-fold, lowercase, spaces to
                # underscores, max 35 chars, letters and '_' only
                norm_title = unicodedata.normalize('NFKD', name)\
                    .encode('ASCII', 'ignore')\
                    .lower().replace(' ', '_')[:35]
                slug = ''.join(e for e in norm_title
                               if e in string.ascii_letters + '_')
                name = slug
                creator = metadata['creator'][0]\
                    if len(metadata['creator']) else ''
                description = metadata['description'][0]\
                    if len(metadata['description']) else ''
                pkg = Package.by_name(name)
                if not pkg:
                    pkg = Package(name=name, title=title)
                extras = {}
                for met in metadata.items():
                    key, value = met
                    if len(value) > 0:
                        if key == 'subject' or key == 'type':
                            # subject/type entries become munged tags
                            for tag in value:
                                if tag:
                                    tag = munge_tag(tag[:100])
                                    tag_obj = model.Tag.by_name(tag)
                                    if not tag_obj:
                                        tag_obj = model.Tag(name=tag)
                                    if tag_obj:
                                        pkgtag = model.PackageTag(
                                            tag=tag_obj, package=pkg)
                                        Session.add(tag_obj)
                                        Session.add(pkgtag)
                        else:
                            # everything else is joined into one extra
                            extras[key] = ' '.join(value)
                pkg.author = creator
                pkg.author_email = creator
                pkg.title = title
                pkg.notes = description
                pkg.extras = extras
                # link back to the OAI-PMH GetRecord endpoint
                pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                    % (harvest_object.job.source.url, identifier)
                pkg.save()
                harvest_object.package_id = pkg.id
                Session.add(harvest_object)
                setup_default_user_roles(pkg)
                # pick the last http:// identifier as the resource URL
                url = ''
                for ids in metadata['identifier']:
                    if ids.startswith('http://'):
                        url = ids
                title = metadata['title'][0] if len(metadata['title'])\
                    else ''
                description = metadata['description'][0]\
                    if len(metadata['description']) else ''
                pkg.add_resource(url, description=description, name=title)
                group.add_package_by_name(pkg.name)
                # also file the package under a "<domain> - <set>" subgroup
                subg_name = "%s - %s" % (domain, set_name)
                subgroup = Group.by_name(subg_name)
                if not subgroup:
                    subgroup = Group(name=subg_name, description=subg_name)
                subgroup.add_package_by_name(pkg.name)
                Session.add(group)
                Session.add(subgroup)
                setup_default_user_roles(group)
                setup_default_user_roles(subgroup)
        model.repo.commit()
    else:
        self._save_object_error('Could not receive any objects from fetch!'
                                , harvest_object, stage='Import')
        return False
    return True
def parse_dataset(self, dataset_dict, dataset_ref):
    """Map a DCAT-AP dataset node onto a CKAN dataset dict.

    Core fields go directly onto the dict, everything else is stored in
    'extras' (lists JSON-encoded), and distributions become CKAN
    resources. In compatibility mode some extras keys are renamed for
    older ckanext-dcat consumers.
    """
    dataset_dict['extras'] = []
    dataset_dict['resources'] = []

    # Basic fields
    for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
            ('url', DCAT.landingPage),
            ('version', OWL.versionInfo),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict[key] = value

    if not dataset_dict.get('version'):
        # adms:version was supported on the first version of the DCAT-AP
        value = self._object_value(dataset_ref, ADMS.version)
        if value:
            dataset_dict['version'] = value

    # Tags
    keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
    # Split keywords with commas
    keywords_with_commas = [k for k in keywords if ',' in k]
    for keyword in keywords_with_commas:
        keywords.remove(keyword)
        keywords.extend([k.strip() for k in keyword.split(',')])

    # replace munge_tag to noop if there's no need to clean tags
    do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
    tags_val = [munge_tag(tag) if do_clean else tag for tag in keywords]
    tags = [{'name': tag} for tag in tags_val]
    dataset_dict['tags'] = tags

    # Extras

    # Simple values
    for key, predicate in (
            ('issued', DCT.issued),
            ('modified', DCT.modified),
            ('identifier', DCT.identifier),
            ('version_notes', ADMS.versionNotes),
            ('frequency', DCT.accrualPeriodicity),
            ('access_rights', DCT.accessRights),
            ('provenance', DCT.provenance),
            ('dcat_type', DCT.type),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict['extras'].append({'key': key, 'value': value})

    # Lists (JSON-encoded into a single extras value)
    for key, predicate, in (
            ('language', DCT.language),
            ('theme', DCAT.theme),
            ('alternate_identifier', ADMS.identifier),
            ('conforms_to', DCT.conformsTo),
            ('documentation', FOAF.page),
            ('related_resource', DCT.relation),
            ('has_version', DCT.hasVersion),
            ('is_version_of', DCT.isVersionOf),
            ('source', DCT.source),
            ('sample', ADMS.sample),
    ):
        values = self._object_value_list(dataset_ref, predicate)
        if values:
            dataset_dict['extras'].append({
                'key': key,
                'value': json.dumps(values)
            })

    # Contact details
    contact = self._contact_details(dataset_ref, DCAT.contactPoint)
    if not contact:
        # adms:contactPoint was supported on the first version of DCAT-AP
        contact = self._contact_details(dataset_ref, ADMS.contactPoint)
    if contact:
        for key in ('uri', 'name', 'email'):
            if contact.get(key):
                dataset_dict['extras'].append({
                    'key': 'contact_{0}'.format(key),
                    'value': contact.get(key)
                })

    # Publisher
    publisher = self._publisher(dataset_ref, DCT.publisher)
    for key in ('uri', 'name', 'email', 'url', 'type'):
        if publisher.get(key):
            dataset_dict['extras'].append({
                'key': 'publisher_{0}'.format(key),
                'value': publisher.get(key)
            })

    # Temporal
    start, end = self._time_interval(dataset_ref, DCT.temporal)
    if start:
        dataset_dict['extras'].append({
            'key': 'temporal_start',
            'value': start
        })
    if end:
        dataset_dict['extras'].append({
            'key': 'temporal_end',
            'value': end
        })

    # Spatial
    spatial = self._spatial(dataset_ref, DCT.spatial)
    for key in ('uri', 'text', 'geom'):
        if spatial.get(key):
            dataset_dict['extras'].append({
                'key': 'spatial_{0}'.format(key) if key != 'geom'
                       else 'spatial',
                'value': spatial.get(key)
            })

    # Dataset URI (explicitly show the missing ones)
    dataset_uri = (unicode(dataset_ref) if isinstance(
        dataset_ref, rdflib.term.URIRef) else '')
    dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

    # License
    if 'license_id' not in dataset_dict:
        dataset_dict['license_id'] = self._license(dataset_ref)

    # Source Catalog
    if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)):
        catalog_src = self._get_source_catalog(dataset_ref)
        if catalog_src is not None:
            src_data = self._extract_catalog_dict(catalog_src)
            dataset_dict['extras'].extend(src_data)

    # Resources
    for distribution in self._distributions(dataset_ref):
        resource_dict = {}

        # Simple values
        for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
                ('download_url', DCAT.downloadURL),
                ('issued', DCT.issued),
                ('modified', DCT.modified),
                ('status', ADMS.status),
                ('rights', DCT.rights),
                ('license', DCT.license),
        ):
            value = self._object_value(distribution, predicate)
            if value:
                resource_dict[key] = value

        # access URL wins over download URL
        resource_dict['url'] = (self._object_value(distribution,
                                                   DCAT.accessURL) or
                                self._object_value(distribution,
                                                   DCAT.downloadURL))

        # Lists
        for key, predicate in (
                ('language', DCT.language),
                ('documentation', FOAF.page),
                ('conforms_to', DCT.conformsTo),
        ):
            values = self._object_value_list(distribution, predicate)
            if values:
                resource_dict[key] = json.dumps(values)

        # Format and media type
        normalize_ckan_format = config.get(
            'ckanext.dcat.normalize_ckan_format', True)
        imt, label = self._distribution_format(distribution,
                                               normalize_ckan_format)
        if imt:
            resource_dict['mimetype'] = imt
        if label:
            resource_dict['format'] = label
        elif imt:
            resource_dict['format'] = imt

        # Size
        size = self._object_value_int(distribution, DCAT.byteSize)
        if size is not None:
            resource_dict['size'] = size

        # Checksum
        for checksum in self.g.objects(distribution, SPDX.checksum):
            algorithm = self._object_value(checksum, SPDX.algorithm)
            checksum_value = self._object_value(checksum, SPDX.checksumValue)
            if algorithm:
                resource_dict['hash_algorithm'] = algorithm
            if checksum_value:
                resource_dict['hash'] = checksum_value

        # Distribution URI (explicitly show the missing ones)
        resource_dict['uri'] = (unicode(distribution) if isinstance(
            distribution, rdflib.term.URIRef) else '')

        dataset_dict['resources'].append(resource_dict)

    if self.compatibility_mode:
        # Tweak the resulting dict to make it compatible with previous
        # versions of the ckanext-dcat parsers
        for extra in dataset_dict['extras']:
            if extra['key'] in (
                    'issued', 'modified', 'publisher_name',
                    'publisher_email',):
                extra['key'] = 'dcat_' + extra['key']
            if extra['key'] == 'language':
                extra['value'] = ','.join(
                    sorted(json.loads(extra['value'])))

    return dataset_dict
def handle_fluent_harvest_dictinary(self, field, iso_values, package_dict,
                                    schema, handled_fields, harvest_config):
    """Populate a fluent (multilingual) scheming field on ``package_dict``.

    For ``fluent_tags`` presets, converts the harvested keyword list into a
    ``{lang: [tags]}`` dict and mirrors the values into ``package_dict['tags']``.
    For other fluent presets, wraps the corresponding core field value into a
    ``{lang: value}`` dict. Fields already listed in ``handled_fields`` are
    skipped; the field name is appended to ``handled_fields`` once processed.

    :param field: scheming field definition (needs 'field_name', 'preset')
    :param iso_values: harvested metadata dict (ISO-derived)
    :param package_dict: dataset dict being built (mutated in place)
    :param schema: scheming schema, used to resolve configured languages
    :param handled_fields: list of field names already processed (mutated)
    :param harvest_config: harvest source config dict ('clean_tags' is read)
    """
    field_name = field['field_name']
    if field_name in handled_fields:
        return
    field_value = {}
    if not field.get('preset', '').startswith(u'fluent'):
        return
    # set default language, default to english
    default_language = iso_values.get('metadata-language', 'en')[0:2]
    if not default_language:
        default_language = 'en'
    # handle tag fields
    if field.get('preset', '') == u'fluent_tags':
        fluent_tags = iso_values.get(field_name, [])
        schema_languages = plugins.toolkit.h.fluent_form_languages(
            schema=schema)
        do_clean = toolkit.asbool(harvest_config.get('clean_tags', False))
        # init language key
        field_value = {sl: [] for sl in schema_languages}
        # process fluent_tags by convert list of language dictionaries into
        # a dictionary of language lists
        for t in fluent_tags:
            tobj = self.from_json(t.get('keyword', t))
            # numeric keywords are stringified so munge/append don't fail
            if isinstance(tobj, Number):
                tobj = str(tobj)
            if isinstance(tobj, dict):
                # already language-keyed: merge per configured language
                for key, value in tobj.items():
                    if key in schema_languages:
                        if do_clean:
                            if isinstance(value, list):
                                value = [
                                    munge.munge_tag(kw) for kw in value
                                ]
                            else:
                                value = munge.munge_tag(value)
                        field_value[key].append(value)
            else:
                # plain string: file it under the default language
                if do_clean:
                    tobj = munge.munge_tag(tobj)
                field_value[default_language].append(tobj)
        package_dict[field_name] = field_value
        # update tags with all values from fluent_tags
        # NOTE(review): only 'en' and 'fr' are mirrored into the free tag
        # list even though schema_languages may contain others — confirm
        # this is intentional (bilingual deployment assumption).
        tag_list = [t['name'] for t in package_dict['tags']]
        for item in field_value.get('en', []) + field_value.get('fr', []):
            if item not in tag_list:
                tag_list.append(item)
        package_dict['tags'] = [{'name': t} for t in tag_list]
    else:
        # Populate translated fields from core. this could have been done in
        # the spatial extensions. example 'title' -> 'title_translated'
        # strip trailing _translated part of field name
        if field_name.endswith(u'_translated'):
            package_fn = field_name[:-11]
        else:
            package_fn = field_name
        package_val = package_dict.get(package_fn, '')
        field_value = self.from_json(package_val)
        if isinstance(field_value, dict):
            # assume bilingual values already in data
            package_dict[field_name] = field_value
        else:
            # create bilingual dictionary. This will likely fail validation
            # as it does not contain all the languages
            package_dict[field_name] = {}
            package_dict[field_name][default_language] = field_value
    handled_fields.append(field_name)
def _clean_keywords(self, pkg_dict):
    """Return a copy of ``pkg_dict['keywords']`` with every tag munged.

    :param pkg_dict: package dict whose optional ``keywords`` entry maps
        language codes to lists of raw tag strings.
    :returns: dict mapping each language code to its list of munged tags;
        empty dict when ``keywords`` is absent.
    """
    clean_keywords = {}
    if 'keywords' in pkg_dict:
        # .items() instead of the Python-2-only .iteritems(): identical
        # behavior on Python 2, and keeps the method working on Python 3.
        for lang, tag_list in pkg_dict['keywords'].items():
            clean_keywords[lang] = [munge_tag(tag) for tag in tag_list]
    return clean_keywords
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in import stage: %s" % harvest_object.guid)
    if not harvest_object:
        logger.error('No harvest object received')
        self._save_object_error('No harvest object received')
        return False
    try:
        self._set_config(harvest_object.job.source.config)
        package_dict = json.loads(harvest_object.content)

        data_dict = {}
        data_dict['id'] = package_dict['id']
        data_dict['title'] = package_dict['title']
        data_dict['name'] = munge_title_to_name(package_dict['name'])
        data_dict['notes'] = markdown_extract(
            package_dict.get('description'))

        tags = package_dict.get('keyword', [])
        data_dict['tag_string'] = ', '.join(
            [munge_tag(tag) for tag in tags])
        data_dict['private'] = False

        # License URI ends with the license id; one known Drupal UUID is
        # mapped to the local 'sprep-public-license' id.
        license_id = package_dict.get('license',
                                      'cc-by').strip('/').split('/')[-1]
        if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
            license_id = 'sprep-public-license'
        data_dict['license_id'] = license_id or 'notspecified'

        data_dict['created'] = _parse_drupal_date(package_dict['issued'])
        data_dict['modified'] = _parse_drupal_date(
            package_dict['modified'])

        # Redacted contact emails come through as '*****@*****.**';
        # skip contact info entirely in that case.
        c_point, c_email = package_dict['contactPoint'][
            'fn'], package_dict['contactPoint']['hasEmail'].split(':')[-1]
        if c_email != '*****@*****.**':
            data_dict['contact_uri'] = c_point
            data_dict['contact_email'] = c_email

        data_dict['resources'] = []
        for res in package_dict.get('distribution', []):
            # res['issued'] = _parse_drupal_date(res.pop('created'))
            # res['modified'] = _parse_drupal_date(
            #     res.pop('last_modified').replace('Date changed ', '')
            # )
            res['url'] = res.get('downloadURL') or res.get('accessURL')
            res['name'] = res['title']
            res['description'] = markdown_extract(res.get('description'))
            data_dict['resources'].append(res)

        if 'spatial' in package_dict:
            data_dict['spatial'] = package_dict['spatial']
            try:
                # Convert "POLYGON ((x y, x y, ...))"-style text into a
                # GeoJSON Polygon. NOTE(review): a non-matching string
                # raises AttributeError (match() is None), which falls
                # through to the outer handler — confirm that is intended.
                data_dict['spatial'] = json.dumps({
                    "type":
                    "Polygon",
                    "coordinates":
                    [[[float(c) for c in pair.split()]
                      for pair in RE_SPATIAL.match(
                          data_dict['spatial']).group(1).split(', ')]]
                })
            except KeyError:
                pass

        # package_dict.pop('type')
        # add owner_org from the harvest source's dataset
        source_dataset = get_action('package_show')(
            {
                'ignore_auth': True
            }, {
                'id': harvest_object.source.id
            })
        owner_org = source_dataset.get('owner_org')
        data_dict['owner_org'] = owner_org

        # Default member country; override from 'isPartOf' (e.g. 'to.xyz'
        # -> 'to') and reassign to the matching '<country>-data' org.
        data_dict['member_countries'] = country_mapping[None]
        if 'isPartOf' in package_dict:
            country = package_dict['isPartOf'].split('.')[0]
            data_dict['member_countries'] = country_mapping.get(
                country, country_mapping[None])
            org = model.Session.query(
                model.Group).filter_by(name=country + '-data').first()
            if org:
                data_dict['owner_org'] = org.id

        data_dict['source'] = package_dict.get('landingPage')
        # (fixed: this assignment was duplicated in the original)
        data_dict['theme'] = package_dict.get('theme', [])
        data_dict['thematic_area_string'] = _map_theme_to_topic(
            data_dict['theme'])
        data_dict['harvest_source'] = 'SPREP'

        self._create_or_update_package(data_dict, harvest_object,
                                       'package_show')
        Session.commit()
        logger.debug("Finished record")
    # except Exception instead of a bare except: so SystemExit and
    # KeyboardInterrupt are not swallowed.
    except Exception:
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in import stage',
                                harvest_object)
        return False
    return True
def split_tags(tag):
    """Split a composite tag string on ',' and '>' and munge each piece.

    :param tag: raw tag string, e.g. ``"energy > solar, climate"``
    :returns: list of munged, non-empty tag strings (order preserved)
    """
    parts = []
    for piece in tag.split(','):
        parts.extend(piece.split('>'))
    # Munge each candidate once; the original called munge_tag twice per
    # part (once for the value, once for the emptiness check).
    munged = [munge_tag(piece) for piece in parts]
    return [m for m in munged if m != '']
def _clean_keywords(self, pkg_dict):
    """Return munged keywords per language, dropping 'opendata.swiss'.

    :param pkg_dict: package dict whose optional ``keywords`` entry maps
        language codes to lists of raw tag strings.
    :returns: dict mapping each language code to its munged tag list
        (the reserved 'opendata.swiss' tag is excluded); empty dict when
        no ``keywords`` key is present.
    """
    clean_keywords = {}
    if 'keywords' in pkg_dict:
        # .items() instead of the Python-2-only .iteritems(): identical
        # behavior on Python 2, and keeps the method working on Python 3.
        for lang, tag_list in pkg_dict['keywords'].items():
            clean_keywords[lang] = [munge_tag(tag) for tag in tag_list if tag != 'opendata.swiss']  # noqa
    return clean_keywords
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an exisiting one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts
    '''
    try:
        # Change default schema: allow the caller-supplied id through and
        # discard unexpected junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
            #TODO: use site user when available
            user_name = self.config.get('user', u'harvest')
        else:
            api_version = 2
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
        }

        # Munge and de-duplicate tags before validation.
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags]
        tags = list(set(tags))
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)
            # Check modified date; update only when the remote copy is
            # newer (or no modification date was provided).
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                new_package = get_action('package_update_rest')(context, package_dict)
            else:
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                return

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                    .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created
            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except ValidationError,e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def import_stage(self, harvest_object):
    """Import the metadata received in the fetch stage to a dataset and
    create groups if ones are defined. Fill in metadata from study and
    document description.

    :param harvest_object: HarvestObject whose content is a JSON dict
        expected to contain a 'url' pointing at the DDI XML record.
    :returns: False on fetch/parse errors (reported via
        _save_object_error); otherwise falls through after building the
        package and its resources.
    """
    try:
        xml_dict = {}
        xml_dict["source"] = harvest_object.content
        udict = json.loads(harvest_object.content)
        if "url" in udict:
            # Fetch the raw DDI XML and parse it with BeautifulSoup.
            f = urllib2.urlopen(udict["url"]).read()
            ddi_xml = BeautifulSoup(f, "xml")
        else:
            self._save_object_error("No url in content!", harvest_object)
            return False
    except urllib2.URLError:
        self._save_object_error("Could not fetch from url %s!" % udict["url"],
                                harvest_object)
        return False
    except etree.XMLSyntaxError:
        self._save_object_error("Unable to parse XML!", harvest_object)
        return False
    model.repo.new_revision()
    # Study description carries the primary citation; the document
    # description is the fallback for title/id.
    study_descr = ddi_xml.codeBook.stdyDscr
    document_info = ddi_xml.codeBook.docDscr.citation
    title = study_descr.citation.titlStmt.titl.string
    if not title:
        title = document_info.titlStmt.titl.string
    name = study_descr.citation.titlStmt.IDNo.string
    update = True
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name)
        update = False
    # Producer fallbacks: prodStmt -> rspStmt author -> rspStmt other id.
    producer = study_descr.citation.prodStmt.producer
    if not producer:
        producer = study_descr.citation.rspStmt.AuthEnty
    if not producer:
        producer = study_descr.citation.rspStmt.othId
    pkg.author = producer.string
    pkg.maintainer = producer.string
    if study_descr.citation.distStmt.contact:
        pkg.maintainer = study_descr.citation.distStmt.contact.string
    if document_info.titlStmt.IDNo:
        pkg.id = document_info.titlStmt.IDNo.string
    # Tags come from both 'keyword' and 'topcClas' subject elements,
    # optionally prefixed with their vocabulary name.
    keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
    keywords = list(set(keywords))
    for kw in keywords:
        if kw:
            vocab = None
            kw_str = ""
            if kw.string:
                kw_str = kw.string
            if "vocab" in kw.attrs:
                vocab = kw.attrs.get("vocab", None)
            if vocab and kw.string:
                kw_str = vocab + " " + kw.string
            pkg.add_tag_by_name(munge_tag(kw_str))
    if study_descr.stdyInfo.abstract:
        description_array = study_descr.stdyInfo.abstract("p")
    else:
        description_array = study_descr.citation.serStmt.serInfo("p")
    pkg.notes = "<br />".join([description.string
                               for description in description_array])
    pkg.title = title[:100]
    pkg.url = udict["url"]
    if not update:
        # New dataset: store the original XML in the object store and
        # attach it (plus the external holdings URI) as resources.
        ofs = get_ofs()
        nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
        idno = study_descr.citation.titlStmt.IDNo
        agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
        label = "%s/%s.xml" % (nowstr, agencyxml)
        ofs.put_stream(BUCKET, label, f, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file",
                                                          label=label)
        pkg.add_resource(url=fileurl, description="Original metadata record",
                         format="xml", size=len(f))
        pkg.add_resource(
            url=document_info.holdings["URI"] if "URI" in document_info.holdings else "",
            description=title
        )
    # Flatten every descendant element of both descriptions into a
    # name -> text dict ('p' elements are renamed to their parent).
    metas = {}
    descendants = [desc for desc in document_info.descendants] + \
                  [sdesc for sdesc in study_descr.descendants]
    for docextra in descendants:
        if isinstance(docextra, Tag):
            if docextra:
                if docextra.name == "p":
                    docextra.name = docextra.parent.name
                if not docextra.name in metas and docextra.string:
                    metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra)
                else:
                    if docextra.string:
                        metas[docextra.name] += (
                            " " + docextra.string if docextra.string else self._collect_attribs(docextra)
                        )
    if ddi_xml.codeBook.dataDscr and not update:
        # Export variable metadata and code values to two CSV resources.
        vars = ddi_xml.codeBook.dataDscr("var")
        heads = self._get_headers()
        c_heads = ["ID", "catValu", "labl", "catStat"]
        f_var = StringIO.StringIO()
        c_var = StringIO.StringIO()
        varwriter = csv.DictWriter(f_var, heads)
        codewriter = csv.DictWriter(c_var, c_heads)
        heading_row = {}
        for head in heads:
            heading_row[head] = head
        c_heading_row = {}
        for head in c_heads:
            c_heading_row[head] = head
        varwriter.writerow(heading_row)
        codewriter.writerow(c_heading_row)
        for var in vars:
            try:
                varwriter.writerow(self._construct_csv(var, heads))
                codewriter.writerows(self._create_code_rows(var))
            except ValueError, e:
                raise IOError("Failed to import DDI to CSV! %s" % e)
        f_var.flush()
        label = "%s/%s_var.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, f_var, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file",
                                                          label=label)
        pkg.add_resource(url=fileurl, description="Variable metadata",
                         format="csv", size=f_var.len)
        label = "%s/%s_code.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, c_var, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file",
                                                          label=label)
        pkg.add_resource(url=fileurl, description="Variable code values",
                         format="csv", size=c_var.len)
        # Re-read the variable CSV to fold variable labels into metas.
        f_var.seek(0)
        reader = csv.DictReader(f_var)
        for var in reader:
            metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]
def munge_tag(self):
    """Munge the 'tag' (falling back to 'name') request param and return it."""
    raw = request.params.get('tag') or request.params.get('name')
    return self._finish_ok(munge.munge_tag(raw))
def test_munge_tag(self):
    '''Munge a list of tags gives expected results.'''
    for original, expected in self.munge_list:
        assert_equal(munge_tag(original), expected)
def munge_tag(self):
    """Return the munged form of the 'tag' (or fallback 'name') request param."""
    tag_param = request.params.get('tag')
    if not tag_param:
        tag_param = request.params.get('name')
    return self._finish_ok(munge.munge_tag(tag_param))
def import_stage(self, harvest_object):
    """Build a CKAN package dict from a harvested SDMX Dataflow and
    create/update the dataset, skipping records whose content hash is
    unchanged since the last successful import.

    :param harvest_object: HarvestObject whose content is SDMX XML
    :returns: result of _create_or_update_package, True when skipped as
        unchanged, or False on error.
    """
    log.debug('In DotStatHarvester import_stage')
    self._set_config(harvest_object.job.source.config)
    if not harvest_object:
        log.error('No harvest object received')
        self._save_object_error('No harvest object received',
                                harvest_object)
        return False
    try:
        base_url = harvest_object.source.url
        # Parse the SDMX as XML with bs4
        soup = BeautifulSoup(harvest_object.content, 'xml')
        # Make a package dict
        pkg_dict = {}
        pkg_dict['id'] = harvest_object.guid
        # Added thematic string
        pkg_dict['thematic_area_string'] = ["Official Statistics"]
        # Open license for all dotStat resources
        pkg_dict['license_id'] = "other-open"
        # Get owner_org if there is one
        source_dataset = get_action('package_show')(
            {
                'ignore_auth': True
            }, {
                'id': harvest_object.source.id
            })
        owner_org = source_dataset.get('owner_org')
        pkg_dict['owner_org'] = owner_org
        # Match other fields with tags in XML structure
        agency_id = self.config['agencyId']
        stats_guid = self._get_object_extra(harvest_object, 'stats_guid')
        structure = soup.find('Dataflow')
        pkg_dict['title'] = structure.find('Name', {"xml:lang" : "en"}).text
        pkg_dict['publisher_name'] = structure['agencyID']
        pkg_dict['version'] = structure['version']
        # Need to change url to point to Data Explorer
        de_url = 'https://stats.pacificdata.org/vis?locale=en&dataflow[datasourceId]=SPC2&dataflow[agencyId]={}&dataflow[dataflowId]={}&dataflow[version]={}'.format(
            agency_id, stats_guid, structure['version']
        )
        pkg_dict['source'] = de_url
        # Set resource to metadata data dictionary (if available)
        annotation = structure.find('Annotations')
        annots = annotation.find_all('Annotation')
        metaurl = None
        for annot in annots:
            metalink = annot.find('AnnotationType')
            if metalink.text == 'EXT_RESOURCE':
                # EXT_RESOURCE text is '|'-separated; the second field is
                # the metadata document URL.
                metaurl = annot.find('AnnotationText',
                                     {'xml:lang':'en'}).text.split('|')[1]
        # Set default resource, and metadata pdf if it exists
        if metaurl:
            pkg_dict['resources'] = [
                {
                    'url': 'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                        agency_id, stats_guid, structure['version']
                    ),
                    'format': 'CSV',
                    'mimetype': 'CSV',
                    'description': 'All data for {}'.format(pkg_dict['title']),
                    'name': '{} Data CSV'.format(pkg_dict['title'])
                },
                {
                    'url': metaurl,
                    'format': 'PDF',
                    'mimetype': 'PDF',
                    'description': 'Detailed metadata dictionary for {}'.format(pkg_dict['title']),
                    'name': '{} Metadata PDF'.format(pkg_dict['title'])
                }]
        else:
            pkg_dict['resources'] = [
                {
                    'url': 'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                        agency_id, stats_guid, structure['version']
                    ),
                    'format': 'CSV',
                    'mimetype': 'CSV',
                    'description': 'All data for {}'.format(pkg_dict['title']),
                    'name': '{} Data CSV'.format(pkg_dict['title'])
                }]
        # Get notes/description if it exists
        try:
            desc = structure.find('Description', {"xml:lang": "en"}).text
            desc += '\nFind more Pacific data on PDH.stat : https://stats.pacificdata.org/'
            pkg_dict['notes'] = desc
        except Exception as e:
            log.error("An error occured: {}".format(e))
            pkg_dict['notes'] = 'Find more Pacific data on PDH.stat : https://stats.pacificdata.org/'
        # Add tags from CategoryScheme and ConceptScheme
        # List of uninteresting tags
        generic_schemes = ['Time', 'Frequency', 'Observation value',
                           'Observation Status', 'Confidentiality status',
                           'Unit of measure', 'Unit multiplier',
                           'Base period', 'Comment', 'Decimals',
                           'Data source',
                           'Pacific Island Countries and territories',
                           'Indicator', 'Transformation', 'Reporting type',
                           'Composite breakdown']
        tag_strings = []
        # For finding Category Schemes for tags
        schemes = soup.find('CategorySchemes')
        if schemes is not None:
            catschemes = schemes.find_all('CategoryScheme')
            for catscheme in catschemes:
                cats = catscheme.find_all('Category')
                for cat in cats:
                    found = cat.find('Name', {'xml:lang': 'en'}).text
                    if found not in tag_strings:
                        tag_strings.append(found)
        # For finding Concept Schemes for tags
        concepts = soup.find('Concepts')
        if concepts is not None:
            concschemes = concepts.find_all('ConceptScheme')
            for concscheme in concschemes:
                concepts = concscheme.find_all('Concept')
                for concept in concepts:
                    found = concept.find('Name', {'xml:lang': 'en'}).text
                    if found not in tag_strings:
                        tag_strings.append(found)
        # Tag cleaning: map selected schemes to Pacific Skills tags.
        psp_mapping = {
            'Industry and Services': ['pacific-skills', 'industry', 'training'],
            'Education level': ['pacific-skills', 'education', 'training'],
            'Occupation': ['pacific-skills', 'occupation'],
            'Disability': ['pacific-skills', 'disability'],
            'Economic sector': ['pacific-skills', 'industry', 'training'],
            'Labour force status': ['pacific-skills', 'employment'],
            'Employment status': ['pacific-skills', 'employment'],
            'Labour and employment status': ['pacific-skills', 'employment']
        }
        if len(tag_strings) > 0:
            # Bring in PSP tags
            for tag in tag_strings:
                if tag in list(psp_mapping.keys()):
                    tag_strings.extend(psp_mapping[tag])
            # Remove duplicates
            tag_strings = list(set(tag_strings))
            # Remove tags found in generic_schemes list
            tags = [x.lower() for x in tag_strings if x not in generic_schemes]
            # Make a string of tags for CKAN
            pkg_dict['tag_string'] = ', '.join([munge_tag(tag) for tag in tags])
        '''
        May need modifying when DF_SDG is broken into several DFs
        This gets the list of indicators for SDG-related dataflows
        Stores the list of strings in 'alternate_identifier' field
        '''
        if soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES' }) is not None:
            pkg_dict['alternate_identifier'] = []
            codelist = soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES'})
            for indic in codelist.findAll('Name', {"xml:lang" : "en"}):
                if not indic or indic.text == 'SDG Indicator or Series':
                    continue
                pkg_dict['alternate_identifier'].append(indic.text)
        '''
        When support for metadata endpoints arrives in .Stat, here is how more metadata may be found:
        # Use the metadata/flow endpoint
        metadata = requests.get('{}metadata/data/{}/all?detail=full'.format(base_url, harvest_object.guid))
        # Parse with bs4
        parsed = BeautifulSoup(metadata.text, 'xml')
        # Now search for tags which may be useful as metadata
        # example: getting the name and definition of metadata set
        # (may need tweaking depending on SPC's metadata setup)
        # We can get name from the metadata structure
        set = parsed.find('MetadataSet')
        pkg_dict['name'] = set.find('Name').text
        # Then we can go to the reported attribute structure for more details
        detail = set.find('ReportedAttribute', attrs={'id': 'DEF'})
        pkg_dict['notes'] = detail.find('StructuredText', attrs={'lang': 'en'})
        source_details = set.find('ReportedAttribute', attrs={'id': 'SOURCE_DEF'})
        pkg_dict['source'] = source_details.find('StructuredText', attrs={'lang': 'en'})
        '''
        log.debug('package dict: %s' % pkg_dict)
        # Store a content hash on the harvest object so the next run can
        # detect unchanged records and skip them.
        content_hash = str(_hashify(pkg_dict))
        harvest_object.extras = [
            HarvestObjectExtra(key='content_hash', value=content_hash)
        ]
        harvest_object.save()
        prev_object = model.Session.query(HarvestObject).filter(
            HarvestObject.source == harvest_object.source,
            HarvestObject.guid == harvest_object.guid,
            ~HarvestObject.import_finished.is_(None)).order_by(
                HarvestObject.import_finished.desc()).first()
        obj_hash = self._get_object_extra(prev_object, 'content_hash')
        if obj_hash and obj_hash == content_hash:
            log.debug('Content is not changed. Skip..')
            return True
        # Create or update the package
        return self._create_or_update_package(
            pkg_dict, harvest_object, package_dict_form='package_show')
    except Exception as e:
        self._save_object_error(('Exception in import stage: %r / %s' %
                                 (e, traceback.format_exc())),
                                harvest_object)
        return False
return [] else: projects = json.load(handle)['projects'] for project in projects: log.debug(project['project_info']) # add dataset for project metadata = { 'datasetID': self._get(project['project_info'], 'shortname'), 'title': self._get(project['project_info'], 'longname'), 'url': 'http://salsah.org/', 'notes': 'This project is part of SALSAH.', # 'author': , # 'maintainer': , # 'maintainer-email': , 'license_id': self._get(project['project_info'], 'ckan_license_id'), 'tags': [munge_tag(tag[:100]) for tag in self._get(project['project_info'], 'ckan_tags')], 'resources': [{ 'name': 'SALSAH API', 'resource_type': 'api', 'format': 'JSON', 'url': harvest_job.source.url.rstrip('/') + '?project=' + self._get(project['project_info'], 'shortname') }], 'groups': [self._get(project['project_info'], 'longname')], 'extras': [ ('level', 'Project') ] } pprint(metadata) obj = HarvestObject(
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if the
              create or update occurred ok, 'unchanged' if it didn't need
              updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema: allow the caller-supplied id through and
        # discard unexpected junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optionally munge and de-duplicate tags before validation.
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date; update only when the remote copy is
            # newer (or no modification date was provided).
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated'
                         % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])

                new_package = p.toolkit.get_action(
                    'package_update' if package_dict_form == 'package_show'
                    else 'package_update_rest')(context, package_dict)
            else:
                log.info('No changes to package with GUID %s, skipping...'
                         % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it'
                     % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError, e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if the
              create or update occurred ok, 'unchanged' if it didn't need
              updating or False if there were errors.
              (NOTE(review): on ValidationError this function logs, saves an
              object error and falls through, returning None — which is
              falsy, but not literally False. Confirm callers only test
              truthiness before changing.)

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema: allow the remote id to pass through
        # validation and silently drop unrecognised junk keys.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version (harvest source config may override the default)
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optionally normalise tags through munge_tag, dropping any that
        # munge to the empty string and de-duplicating the rest.
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(
                package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date. If the incoming dict carries no
            # modification date we always update; otherwise only when it is
            # newer than what is stored.
            if 'metadata_modified' not in package_dict or \
               package_dict['metadata_modified'] > \
               existing_package_dict.get('metadata_modified'):
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])

                new_package = p.toolkit.get_action(
                    'package_update'
                    if package_dict_form == 'package_show'
                    else 'package_update_rest')(context, package_dict)

            else:
                log.info(
                    'No changes to package with GUID %s, skipping...'
                    % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current
            # anymore, so only this harvest object is marked current below.
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id ==
                       bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(
                    package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = p.toolkit.get_action(
                'package_create'
                if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    # was `except ..., e:` — the `as` form is valid on Python 2.6+ and
    # required on Python 3, so this keeps the module forward-compatible.
    except p.toolkit.ValidationError as e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
metadata = { 'datasetID': self._get(project['project_info'], 'shortname'), 'title': self._get(project['project_info'], 'longname'), 'url': 'http://salsah.org/', 'notes': 'This project is part of SALSAH.', # 'author': , # 'maintainer': , # 'maintainer-email': , 'license_id': self._get(project['project_info'], 'ckan_license_id'), 'tags': [ munge_tag(tag[:100]) for tag in self._get( project['project_info'], 'ckan_tags') ], 'resources': [{ 'name': 'SALSAH API', 'resource_type': 'api', 'format': 'JSON', 'url': harvest_job.source.url.rstrip('/') + '?project=' + self._get(project['project_info'], 'shortname') }], 'groups': [self._get(project['project_info'], 'longname')], 'extras': [('level', 'Project')]