def drop_if_same_as_publisher(key, data, errors, context):
    """
    Validates the contact- and foi- data.

    If the value is the same as the one held on the dataset's publisher
    (the group referenced by ('groups', 0, 'name'/'id')), the value is
    dropped from ``data`` and ``errors`` and validation of this key is
    stopped. Otherwise it is kept and stored on the dataset as an extra
    field.
    """
    from ckan.model.group import Group

    extra_name = key[0]  # the field name part of the key tuple

    # Find a usable reference to the dataset's first group - prefer the
    # name, fall back to the id.
    publisher_ref = None
    for attr in ('name', 'id'):
        publisher_ref = data.get(('groups', 0, attr), None)
        if publisher_ref and publisher_ref is not missing:
            break
    if not publisher_ref:
        return

    publisher = Group.get(publisher_ref)
    if not publisher:
        return

    if publisher.extras.get(extra_name, None) == data[key]:
        # Value is identical to the publisher's - drop it from data and
        # errors and stop validating this key.
        data.pop(key, None)
        errors.pop(key, None)
        raise StopOnError
def get_publishers(self):
    """Return a dict mapping publisher group name to a template-ready
    info dict for every publisher the current user may use (all
    publishers for sysadmins, none for anonymous users)."""
    from ckan.model.group import Group

    if Authorizer().is_sysadmin(c.user):
        publishers = Group.all(group_type='publisher')
    elif c.userobj:
        # need to get c.userobj again as it may be detached from the
        # session since the last time we called get_groups (it caches)
        c.userobj = model.User.by_name(c.user)
        publishers = c.userobj.get_groups('publisher')
    else:
        # anonymous user shouldn't have access to this page anyway.
        publishers = []

    # Be explicit about which fields we make available in the template
    result = {}
    for grp in publishers:
        extras = grp.extras
        result[grp.name] = {
            'name': grp.name,
            'id': grp.id,
            'title': grp.title,
            'contact-name': extras.get('contact-name', ''),
            'contact-email': extras.get('contact-email', ''),
            'contact-phone': extras.get('contact-phone', ''),
            'foi-name': extras.get('foi-name', ''),
            'foi-email': extras.get('foi-email', ''),
            'foi-phone': extras.get('foi-phone', ''),
        }
    return result
def _setup_template_variables(self, context, data_dict=None, package_type=None):
    """Populate the template context (``c``) for the package form.

    Sets the available groups/organizations, licence options, the
    sysadmin flag, and whether the current user may change the package
    state.
    """
    log.debug("_setup_template_variables")
    c.update_frequency = update_frequency
    c.categorization = categorization
    c.groups_authz = get_action('group_list_authz')(context, data_dict)
    data_dict.update({'available_only': True})
    #c.groups_available = get_action('group_list_authz')(context, data_dict)
    c.groups_available = c.userobj and c.userobj.get_groups(
        'organization') or []
    c.licences = [('', '')] + model.Package.get_license_options()
    c.is_sysadmin = Authorizer().is_sysadmin(c.user)
    if c.is_sysadmin:
        # Sysadmins see every organization, not just their own.
        c.groups_available = Group.all('organization')
    # These are diagnostic traces, not fatal conditions: log at DEBUG
    # (was log.fatal, which would spam production logs at the highest
    # severity). Use lazy %-args so formatting is skipped when disabled.
    log.debug("is_sysadmin: %s", c.is_sysadmin)
    log.debug("groups: %s", c.groups_available)
    ## This is messy as auths take domain object not data_dict
    context_pkg = context.get('package', None)
    pkg = context_pkg or c.pkg
    if pkg:
        try:
            if not context_pkg:
                context['package'] = pkg
            check_access('package_change_state', context)
            c.auth_for_change_state = True
        except NotAuthorized:
            c.auth_for_change_state = False
def get_publishers(self):
    """Map each available publisher group's name to a dict of the
    fields the templates are allowed to see."""
    from ckan.model.group import Group

    if Authorizer().is_sysadmin(c.user):
        member_groups = Group.all(group_type='publisher')
    elif c.userobj:
        # need to get c.userobj again as it may be detached from the
        # session since the last time we called get_groups (it caches)
        c.userobj = model.User.by_name(c.user)
        member_groups = c.userobj.get_groups('publisher')
    else:
        # anonymous user shouldn't have access to this page anyway.
        member_groups = []

    def _template_fields(g):
        # Be explicit about which fields we make available in the template
        return {
            'name': g.name,
            'id': g.id,
            'title': g.title,
            'contact-name': g.extras.get('contact-name', ''),
            'contact-email': g.extras.get('contact-email', ''),
            'contact-phone': g.extras.get('contact-phone', ''),
            'foi-name': g.extras.get('foi-name', ''),
            'foi-email': g.extras.get('foi-email', ''),
            'foi-phone': g.extras.get('foi-phone', ''),
        }

    return dict((g.name, _template_fields(g)) for g in member_groups)
def _setup_template_variables(self, context, data_dict=None, package_type=None):
    """Populate the template context (``c``) for the package form.

    Sets the available groups/organizations, licence options, the
    sysadmin flag, and whether the current user may change the package
    state.
    """
    log.debug("_setup_template_variables")
    c.update_frequency = update_frequency
    c.categorization = categorization
    c.groups_authz = get_action('group_list_authz')(context, data_dict)
    data_dict.update({'available_only': True})
    #c.groups_available = get_action('group_list_authz')(context, data_dict)
    c.groups_available = c.userobj and c.userobj.get_groups('organization') or []
    c.licences = [('', '')] + model.Package.get_license_options()
    c.is_sysadmin = Authorizer().is_sysadmin(c.user)
    if c.is_sysadmin:
        # Sysadmins see every organization, not just their own.
        c.groups_available = Group.all('organization')
    # Diagnostic traces, not fatal conditions: log at DEBUG (was
    # log.fatal, which misreports routine output at the highest
    # severity). Lazy %-args avoid formatting when DEBUG is disabled.
    log.debug("is_sysadmin: %s", c.is_sysadmin)
    log.debug("groups: %s", c.groups_available)
    ## This is messy as auths take domain object not data_dict
    context_pkg = context.get('package', None)
    pkg = context_pkg or c.pkg
    if pkg:
        try:
            if not context_pkg:
                context['package'] = pkg
            check_access('package_change_state', context)
            c.auth_for_change_state = True
        except NotAuthorized:
            c.auth_for_change_state = False
def add_field__group_titles(cls, pkg_dict):
    '''Adds the group titles.

    Looks up every group referenced in pkg_dict['groups'] and stores the
    list of their titles under 'group_titles'. Refuses to overwrite an
    existing 'group_titles' key.
    '''
    groups = [Group.get(g) for g in pkg_dict['groups']]
    # Group titles. `dict.has_key()` is deprecated and removed in
    # Python 3 - use the `in` operator instead.
    if 'group_titles' not in pkg_dict:
        pkg_dict['group_titles'] = [g.title for g in groups]
    else:
        log.warning('Unable to add "group_titles" to index, as the datadict '
                    'already contains a key of that name')
def add_field__group_abbreviation(cls, pkg_dict):
    '''Adds any group abbreviation '''
    # NOTE(review): the unconditional `break` means only the FIRST group
    # in pkg_dict['groups'] is consulted for an 'abbreviation' extra,
    # even though Group.get() is still called for every listed group by
    # the comprehension - confirm this first-group-only behaviour is
    # intentional (a sibling variant of this function collects
    # abbreviations from all groups).
    abbr = None
    for g in [Group.get(g) for g in pkg_dict['groups']]:
        abbr = g.extras.get('abbreviation')
        break
    # Only index (and log) when an abbreviation was actually found.
    if abbr:
        pkg_dict['group_abbreviation'] = abbr
        log.debug('Abbreviations %s: %s', pkg_dict['name'], abbr)
def add_field__group_titles(cls, pkg_dict):
    """Adds the group titles.

    Looks up every group referenced in pkg_dict["groups"] and stores the
    list of their titles under "organization_titles". Refuses to
    overwrite an existing key of that name.
    """
    groups = [Group.get(g) for g in pkg_dict["groups"]]
    # Group titles. `dict.has_key()` is deprecated and removed in
    # Python 3 - use the `in` operator instead.
    if "organization_titles" not in pkg_dict:
        pkg_dict["organization_titles"] = [g.title for g in groups]
    else:
        log.warning(
            'Unable to add "organization_titles" to index, as the datadict '
            "already contains a key of that name"
        )
def add_field__group_titles(cls, pkg_dict):
    '''Adds the group titles.

    Looks up every group referenced in pkg_dict['groups'] and stores the
    list of their titles under 'organization_titles'. Refuses to
    overwrite an existing key of that name.
    '''
    groups = [Group.get(g) for g in pkg_dict['groups']]
    # Group titles. `dict.has_key()` is deprecated and removed in
    # Python 3 - use the `in` operator instead.
    if 'organization_titles' not in pkg_dict:
        pkg_dict['organization_titles'] = [g.title for g in groups]
    else:
        log.warning(
            'Unable to add "organization_titles" to index, as the datadict '
            'already contains a key of that name')
def add_field__group_abbreviation(cls, pkg_dict):
    '''Adds any group abbreviation '''
    # Resolve every referenced group, then keep only the non-empty
    # 'abbreviation' extras.
    resolved = [Group.get(name) for name in pkg_dict['groups']]
    abbreviations = [grp.extras.get('abbreviation') for grp in resolved]
    abbreviations = [a for a in abbreviations if a]
    if abbreviations:
        pkg_dict['group_abbreviation'] = abbreviations
        log.debug('Abbreviations %s: %s', pkg_dict['name'], abbreviations)
def populate_from_publisher_if_missing(key, data, errors, context):
    """
    If the data is missing, then populate from the publisher.
    """
    from ckan.model.group import Group

    if data[key] is not missing:
        # A value was supplied - nothing to do.
        return

    extra_name = key[0]  # extract the field name from the key tuple
    publisher = Group.get(data.get(('groups', 0, 'name'), None))
    if publisher:
        data[key] = publisher.extras.get(extra_name, None)
def _get_publishers(self):
    """Return the organizations available to the current user as a list
    of {'name', 'id', 'title'} dicts (all organizations for sysadmins,
    none for anonymous users)."""
    if ckan.new_authz.is_sysadmin(c.user):
        orgs = Group.all(group_type='organization')
    elif c.userobj:
        orgs = c.userobj.get_groups('organization')
    else:
        # anonymous user shouldn't have access to this page anyway.
        orgs = []
    # Be explicit about which fields we make available in the template
    visible = []
    for org in orgs:
        visible.append({
            'name': org.name,
            'id': org.id,
            'title': org.title,
        })
    return visible
def _get_publishers(self):
    """List the current user's organizations as template-ready dicts
    exposing only 'name', 'id' and 'title'."""
    if ckan.new_authz.is_sysadmin(c.user):
        raw_groups = Group.all(group_type='organization')
    elif c.userobj:
        raw_groups = c.userobj.get_groups('organization')
    else:
        # anonymous user shouldn't have access to this page anyway.
        raw_groups = []
    # Be explicit about which fields we make available in the template
    return [
        {'name': grp.name, 'id': grp.id, 'title': grp.title}
        for grp in raw_groups
    ]
def get_publishers(self):
    """Return a dict mapping publisher group name to a template-ready
    info dict.

    Sysadmins see all publishers. Otherwise, for each group where the
    user is an admin, all child publishers in the hierarchy are included
    too; editor groups are included directly.
    """
    from ckan.model.group import Group
    if Authorizer().is_sysadmin(c.user):
        groups = Group.all(group_type='publisher')
    elif c.userobj:
        # need to get c.userobj again as it may be detached from the
        # session since the last time we called get_groups (it caches)
        c.userobj = model.User.by_name(c.user)

        # For each group where the user is an admin, we should also include
        # all of the child publishers.
        admin_groups = set()
        for g in c.userobj.get_groups('publisher', 'admin'):
            for pub in publib.go_down_tree(g):
                admin_groups.add(pub)

        editor_groups = c.userobj.get_groups('publisher', 'editor')
        groups = list(admin_groups) + editor_groups
    else:
        # anonymous user shouldn't have access to this page anyway.
        groups = []

    # Be explicit about which fields we make available in the template
    groups = [{
        'name': g.name,
        'id': g.id,
        'title': g.title,
        'contact-name': g.extras.get('contact-name', ''),
        'contact-email': g.extras.get('contact-email', ''),
        'contact-phone': g.extras.get('contact-phone', ''),
        'foi-name': g.extras.get('foi-name', ''),
        'foi-email': g.extras.get('foi-email', ''),
        'foi-phone': g.extras.get('foi-phone', ''),
        # Bug fix: this previously read the 'foi-name' extra, so the
        # FOI web address was never surfaced.
        'foi-web': g.extras.get('foi-web', ''),
    } for g in groups]

    return dict((g['name'], g) for g in groups)
def get_publishers(self):
    """Map publisher (organization) name -> template info dict.

    Sysadmins get every organization; admins additionally get all child
    publishers below each group they administer; editors get their own
    groups; anonymous users get nothing.
    """
    from ckan.model.group import Group

    if dgu_helpers.is_sysadmin():
        orgs = Group.all(group_type='organization')
    elif c.userobj:
        # need to get c.userobj again as it may be detached from the
        # session since the last time we called get_groups (it caches)
        c.userobj = model.User.by_name(c.user)

        # For each group where the user is an admin, we should also include
        # all of the child publishers.
        admin_orgs = set()
        for admin_org in c.userobj.get_groups('organization', 'admin'):
            for descendant in publib.go_down_tree(admin_org):
                admin_orgs.add(descendant)

        orgs = list(admin_orgs) + c.userobj.get_groups('organization', 'editor')
    else:
        # anonymous user shouldn't have access to this page anyway.
        orgs = []

    # Be explicit about which fields we make available in the template
    result = {}
    for org in orgs:
        extras = org.extras
        result[org.name] = {
            'name': org.name,
            'id': org.id,
            'title': org.title,
            'contact-name': extras.get('contact-name', ''),
            'contact-email': extras.get('contact-email', ''),
            'contact-phone': extras.get('contact-phone', ''),
            'foi-name': extras.get('foi-name', ''),
            'foi-email': extras.get('foi-email', ''),
            'foi-phone': extras.get('foi-phone', ''),
            'foi-web': extras.get('foi-web', ''),
        }
    return result
def populate_theme_groups(instance, clean_existing=False):
    """
    For given instance, it finds groups from mapping corresponding to
    Dataset's themes, and will assign dataset to those groups.

    Existing groups will be removed, if clean_existing is set to True.

    This utilizes `ckanext.dcatapit.theme_group_mapping.add_new_groups`
    configuration option. If it's set to true, and mapped group doesn't
    exist, new group will be created.
    """
    # Should groups that are mapped but don't exist be created on the fly?
    add_new = toolkit.asbool(
        config.get(DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS))

    # Collect theme identifiers from the dataset's extras. The aggregate
    # key takes precedence over the plain 'theme' key (note the break).
    themes = []
    for ex in (instance.get('extras') or []):
        if ex['key'] == FIELD_THEMES_AGGREGATE:
            _t = ex['value']
            if isinstance(_t, list):
                themes.extend(_t)
            else:
                try:
                    # Preferred format: JSON list of
                    # {'theme': ..., 'subthemes': [...]} dicts.
                    tval = json.loads(_t)
                except Exception:
                    # Fall back to the legacy '{THEME1,THEME2}' format.
                    log.warning(f'Trying old themes format for {_t}')
                    tval = [{'theme': t, 'subthemes': []}
                            for t in _t.strip('{}').split(',')]
                for tv in tval:
                    themes.append(tv['theme'])
            # we don't need any other info - if there are 'themes' is ok
            # to bypass them
            break
        elif ex['key'] == 'theme':
            _t = ex['value']
            if isinstance(_t, list):
                themes.extend(_t)
            else:
                try:
                    tval = json.loads(_t)
                except Exception:
                    log.warning(f'Trying old themes format for {_t}')
                    tval = _t.strip('{}').split(',')
                themes.extend(tval)
            # dont break the for loop: if aggregates are there, they get
            # precedence
    if not themes:
        log.debug('no theme from %s', instance)
        return instance

    theme_map = get_theme_to_groups()
    if not theme_map:
        log.warning('Theme to group map is empty')
        return instance
    if not isinstance(themes, list):
        themes = [themes]

    # Union of every group name mapped from any of the dataset's themes.
    all_groups = set()
    for theme in themes:
        _groups = theme_map.get(theme)
        if not _groups:
            continue
        all_groups = all_groups.union(set(_groups))

    if clean_existing:
        _clean_groups(instance)

    # Resolve group names to Group objects, creating missing ones when
    # add_new is enabled (checking the session for ones added but not
    # yet flushed).
    groups = []
    for gname in all_groups:
        gname = gname.strip()
        if not gname:
            continue
        group = Group.get(gname) or _get_group_from_session(gname)
        if add_new and group is None:
            group = Group(name=gname)
            Session.add(group)
        if group:
            groups.append(group)

    if Session.new:
        # flush to db, refresh with ids
        Session.flush()
        groups = [(Group.get(g.name) if g.id is None else g) for g in groups]

    _add_groups(instance['id'], set(groups))
    Session.flush()
    return instance
def populate_theme_groups(instance, clean_existing=False):
    """
    For given instance, it finds groups from mapping corresponding to
    Dataset's themes, and will assign dataset to those groups.

    Existing groups will be removed, if clean_existing is set to True.

    This utilizes `ckanext.dcatapit.theme_group_mapping.add_new_groups`
    configuration option. If it's set to true, and mapped group doesn't
    exist, new group will be created.
    """
    # Should groups that are mapped but don't exist be created on the fly?
    add_new = toolkit.asbool(
        config.get(DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS))

    # Collect theme identifiers from the dataset's 'theme' extra.
    themes = []
    for ex in (instance.get('extras') or []):
        if ex['key'] == 'theme':
            _t = ex['value']
            if isinstance(_t, list):
                themes.extend(_t)
            else:
                try:
                    # Preferred format: JSON-encoded list of theme dicts.
                    tval = json.loads(_t)
                except Exception:
                    # Fall back to the legacy encoded-list format.
                    tval = [{'theme': t, 'subthemes': []}
                            for t in _decode_list(_t)]
                for tv in tval:
                    themes.append(tv['theme'])
    if not themes:
        log.debug("no theme from %s", instance)
        return instance

    theme_map = get_theme_to_groups()
    if not theme_map:
        log.warning("Theme to group map is empty")
        return instance
    if not isinstance(themes, list):
        themes = [themes]

    # Union of every group name mapped from any of the dataset's themes.
    all_groups = set()
    for theme in themes:
        _groups = theme_map.get(theme)
        if not _groups:
            continue
        all_groups = all_groups.union(set(_groups))

    if clean_existing:
        _clean_groups(instance)

    # Resolve group names to Group objects, creating missing ones when
    # add_new is enabled (checking the session for ones added but not
    # yet flushed).
    groups = []
    for gname in all_groups:
        gname = gname.strip()
        if not gname:
            continue
        group = Group.get(gname) or _get_group_from_session(gname)
        if add_new and group is None:
            group = Group(name=gname)
            Session.add(group)
        if group:
            groups.append(group)

    if Session.new:
        # flush to db, refresh with ids
        rev = Session.revision
        Session.flush()
        Session.revision = rev
        groups = [(Group.get(g.name) if g.id is None else g) for g in groups]

    _add_groups(instance['id'], set(groups))

    # preserve revision, since it's not a commit yet
    rev = Session.revision
    Session.flush()
    Session.revision = rev
    return instance
def parse_dataset(self, dataset_dict, dataset_ref):
    """Populate ``dataset_dict`` from the RDF graph node ``dataset_ref``
    using the IAEST DCAT profile.

    Fills tags, extras, resources, groups, basic fields, publisher info,
    licence and per-distribution resource dicts. Returns the updated
    ``dataset_dict``.
    """
    log.debug('Parsing Dataset with IAEST DCAT Profile')

    dataset_dict['tags'] = []
    dataset_dict['extras'] = []
    dataset_dict['resources'] = []
    dataset_dict['groups'] = []

    log.debug('Parsing Keyword')
    # Tags
    keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
    # Split keywords with commas
    keywords_with_commas = [k for k in keywords if ',' in k]
    for keyword in keywords_with_commas:
        keywords.remove(keyword)
        keywords.extend([k.strip() for k in keyword.split(',')])
    for keyword in keywords:
        dataset_dict['tags'].append({'name': keyword})

    # Basic fields
    log.debug('Parsing Basic Fields')
    for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
            ('url', DCAT.landingPage),
            ('version', OWL.versionInfo),
            ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            dataset_dict[key] = value

    # Last time dataset was modified
    log.debug("Parsing last time dataset modified")
    rdf_modified = self._object_value(dataset_ref, DCT.modified)
    # Convert these strings to two dates comparable, yyyy-mm-dd (10 characters)
    dataset_dict['metadata_modified'] = rdf_modified[:10]
    log.debug("Modified date in RDF: %s" % dataset_dict['metadata_modified'])

    # Publisher
    log.debug('Parsing publisher')
    publisher = self._publisher(dataset_ref, DCT.publisher)
    # NOTE(review): 'url' set from basic fields above is overwritten here
    # with the publisher's url - confirm that is intended.
    dataset_dict['maintainer'] = publisher.get('title')
    dataset_dict['author'] = publisher.get('title')
    dataset_dict['author_email'] = self._object_value(
        dataset_ref, DCAT.author_email)
    dataset_dict['url'] = publisher.get('url')

    log.debug('version')
    if not dataset_dict.get('version'):
        # adms:version was supported on the first version of the DCAT-AP
        value = self._object_value(dataset_ref, ADMS.version)
        if value:
            dataset_dict['version'] = value
    log.debug('version obtenida: %s', dataset_dict['version'])

    # Extras
    # TODO: review the 0X_ prefixed keys - some of them should carry accents.
    log.debug('Obteniendo Extras')
    for key, predicate in (
            ('01_IAEST_Tema estadistico', DCAT.tema_estadistico),
            ('02_IAEST_Unidad Estadistica', DCAT.unidad_estadistica),
            ('03_IAEST_Poblacion estadistica', DCAT.poblacion_estadistica),
            ('04_IAEST_Unidad de medida', DCAT.unidad_medida),
            ('06_IAEST_Periodo base', DCAT.periodo_base),
            ('07_IAEST_Tipo de operacion', DCAT.tipo_operacion),
            ('08_IAEST_Tipologia de datos de origen', DCAT.tipologia_datos_origen),
            ('09_IAEST_Fuente', DCAT.fuente),
            ('11_IAEST_Tratamiento estadistico', DCAT.tratamiento_estadistico),
            ('5_IAEST_Legislacion UE', DCAT.legislacion_ue),
            ('Data Dictionary URL0', DCAT.urlDictionary),
            ('Granularity', DCAT.granularity),
            ('LangES', DCT.language),
            ('Spatial', DCT.spatial),
            ('TemporalFrom', DCT.temporalFrom),
            ('TemporalUntil', DCT.temporalUntil),
            ('nameAragopedia', DCAT.name_aragopedia),
            ('shortUriAragopedia', DCAT.short_uri_aragopedia),
            ('typeAragopedia', DCAT.type_aragopedia),
            ('uriAragopedia', DCAT.uri_aragopedia),
            ('iaest_modified', DCT.modified),
            ):
        value = self._object_value(dataset_ref, predicate)
        log.debug(' Key: %s Value:%s', key, value)
        if value:
            dataset_dict['extras'].append({'key': key, 'value': value})
            # The data-dictionary URL gets a companion human-readable extra.
            if key == 'Data Dictionary URL0':
                dataset_dict['extras'].append({
                    'key': 'Data Dictionary',
                    'value': 'El diccionario del dato se encuentra en la siguiente url'
                })

    # Get frequency from the accrualPeriodicity node

    # Dataset URI (explicitly show the missing ones)
    dataset_uri = (unicode(dataset_ref) if isinstance(
        dataset_ref, rdflib.term.URIRef) else '')
    #dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

    # License
    license_id_final, license_title_final = self._license(dataset_ref)
    log.debug('Licencias obtenidas %s,%s',
              license_id_final, license_title_final)
    dataset_dict['license_id'] = license_id_final
    dataset_dict['license_title'] = license_title_final

    # Map each RDF theme onto a CKAN group via its dct:identifier.
    log.debug('Tratando themes: ...')
    for theme in self._themes(dataset_ref):
        theme_id = self._object_value(theme, DCT.identifier)
        log.debug('identifier: %s', theme_id)
        if theme_id:
            log.debug('Grupo incluido en RDF: %s', theme_id)
            # NOTE(review): Group.get can return None for an unknown
            # identifier, which would raise AttributeError on group.id
            # below - confirm theme identifiers are guaranteed to exist.
            group = Group.get(theme_id)
            log.debug('Grupo id: %s', group.id)
            dataset_dict['groups'].append({'id': group.id})
            log.debug('dataset_dict[groups]: %s', dataset_dict['groups'])

    log.debug('Procesando resources')
    # Resources
    for distribution in self._distributions(dataset_ref):
        resource_dict = {}

        # Simple values
        for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
                ('download_url', DCAT.downloadURL),
                ('issued', DCT.issued),
                ('modified', DCT.modified),
                ('status', ADMS.status),
                ('rights', DCT.rights),
                ('license', DCT.license),
                ):
            value = self._object_value(distribution, predicate)
            if value:
                resource_dict[key] = value

        # Prefer the access URL, falling back to the download URL.
        resource_dict['url'] = (
            self._object_value(distribution, DCAT.accessURL) or
            self._object_value(distribution, DCAT.downloadURL))

        # Lists (stored JSON-encoded)
        for key, predicate in (
                ('language', DCT.language),
                ('documentation', FOAF.page),
                ('conforms_to', DCT.conformsTo),
                ):
            values = self._object_value_list(distribution, predicate)
            if values:
                resource_dict[key] = json.dumps(values)

        # Format and media type
        normalize_ckan_format = config.get(
            'ckanext.iaest.normalize_ckan_format', True)
        imt, label = self._distribution_format(distribution,
                                               normalize_ckan_format)
        if imt:
            resource_dict['mimetype'] = imt
        if label:
            resource_dict['format'] = label
        elif imt:
            resource_dict['format'] = imt

        # Size
        size = self._object_value_int(distribution, DCAT.byteSize)
        if size is not None:
            resource_dict['size'] = size

        # Checksum
        for checksum in self.g.objects(distribution, SPDX.checksum):
            algorithm = self._object_value(checksum, SPDX.algorithm)
            checksum_value = self._object_value(checksum, SPDX.checksumValue)
            if algorithm:
                resource_dict['hash_algorithm'] = algorithm
            if checksum_value:
                resource_dict['hash'] = checksum_value

        # Distribution URI (explicitly show the missing ones)
        resource_dict['uri'] = (unicode(distribution) if isinstance(
            distribution, rdflib.term.URIRef) else '')

        dataset_dict['resources'].append(resource_dict)

    if self.compatibility_mode:
        # Tweak the resulting dict to make it compatible with previous
        # versions of the ckanext-dcat parsers
        for extra in dataset_dict['extras']:
            if extra['key'] in (
                    'issued', 'modified', 'publisher_name',
                    'publisher_email',):
                extra['key'] = 'dcat_' + extra['key']
            if extra['key'] == 'language':
                extra['value'] = ','.join(
                    sorted(json.loads(extra['value'])))

    return dataset_dict