def populate_resource_license(package_dict):
    """
    Set ``license_type`` on every resource of ``package_dict``.

    The license is resolved from the package-level ``license_id`` and from
    the ``license_url``/``licence_url``, ``license``/``licence`` and
    ``access_constraints`` extras. When none of them carries a value the
    default license is used; otherwise ``License.find_by_token`` selects it.

    :param package_dict: package dictionary; it is modified in place
    :returns: the same ``package_dict`` object
    """
    license_id = package_dict.get('license_id')
    license_url = None
    license_name = None
    access_constraints = None

    # 'extras' may be missing or None, hence the `or []`
    for extra in package_dict.get('extras') or []:
        key = extra['key']
        if key in ('license_url', 'licence_url'):
            license_url = extra['value']
        elif key in ('license', 'licence'):
            license_name = extra['value']
        elif key == 'access_constraints':
            access_constraints = extra['value']

    if not (access_constraints or license_id or license_name or license_url):
        resolved = License.get(License.DEFAULT_LICENSE)
    else:
        # find_by_token returns a pair; only its first element is needed here
        resolved, _ = License.find_by_token(access_constraints,
                                            license_name,
                                            license_id,
                                            license_url)

    # Treat a missing or None 'resources' entry like an empty list, the same
    # way 'extras' is treated above, instead of raising.
    for res in package_dict.get('resources') or []:
        res['license_type'] = resolved.uri
    return package_dict
def get_license_for_dcat(license_type):
    """
    Return the DCAT description of a license as a 6-tuple.

    The tuple is ``(license_type, default_name, document_uri, version, uri,
    names)`` where ``names`` maps language to name. The default license is
    used when ``license_type`` is empty, unknown, or resolves to a license
    that has no ``license_type`` of its own. If not even the default license
    can be found, an error is logged and
    ``(None, '-', None, None, None, None)`` is returned.
    """
    found = License.get(license_type or License.DEFAULT_LICENSE)
    if not (found and found.license_type):
        found = License.get(License.DEFAULT_LICENSE)

    if not found:
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, None

    names = {entry['lang']: entry['name'] for entry in found.get_names()}
    return (
        found.license_type,
        found.default_name,
        found.document_uri,
        found.version,
        found.uri,
        names,
    )
def get_license_from_dcat(license_uri, license_dct, prefname, **license_names):
    """
    Resolve a License from the pieces of information found in DCAT data.

    Lookups are attempted in this order and the first truthy hit wins:
    ``license_uri``, ``prefname``, each ``lang=name`` pair given in
    ``license_names``, then ``license_dct``. When nothing matches, the
    default license is returned.
    """
    # First try dcatapit info
    match = License.get(license_uri)
    if match:
        return match

    if prefname:
        match = License.get(prefname)
        if match:
            return match

    for lang, name in license_names.items():
        match = License.get_by_lang(lang, name)
        if match:
            return match

    if license_dct:
        # try and use DCT licence URI (usually level 2 in DCATAPIT voc)
        match = License.get(license_dct)
        if match:
            return match

    return License.get(License.DEFAULT_LICENSE)
def load(g, name, uri, eurovoc):
    """
    Load the vocabulary called ``name`` from graph ``g``.

    Licenses and subthemes are handled here: the existing rows are counted,
    cleared and reloaded, the session is committed, and a dict with the
    deleted/created counts is returned. Any other vocabulary name is
    delegated to ``do_load(g, name)``. ``uri`` is not used by this function.
    """
    if name == LICENSES_NAME:
        deleted = License.count()
        clear_licenses()
        load_licenses(g)
        created = License.count()
        Session.commit()
        return {'licenses_deleted': deleted, 'licenses_created': created}

    elif name == SUBTHEME_NAME:
        deleted = Subtheme.count()
        clear_subthemes()
        load_subthemes(None, eurovoc, themes_g=g)
        created = Subtheme.count()
        Session.commit()
        return {'subthemes_deleted': deleted, 'subthemes_created': created}

    return do_load(g, name)
def get_resource_licenses_tree(value, lang):
    """
    Build the option list for a license selector.

    One dict is produced per ``(license, label)`` pair returned by
    ``License.for_select(lang)``. The option whose URI equals ``value`` is
    flagged as selected; ``depth``/``depth_str``/``level`` are derived from
    the license ``rank_order`` so the caller can indent the entries.
    """
    return [
        {
            'selected': lic.uri == value,
            'value': lic.uri,
            # let's do indentation
            'text': label,
            'depth': lic.rank_order - 1,
            'depth_str': ' ' * (lic.rank_order - 1) or '',
            'level': lic.rank_order,
        }
        for lic, label in License.for_select(lang)
    ]
def load_licenses(g: Graph):
    """
    Replace the stored license tree with the one described by graph ``g``.

    All existing licenses are deleted first. A first pass creates one
    License per SKOS concept without a parent; a second pass links each
    license to its first ``skos:broader`` concept, if it has one.
    """
    License.delete_all()

    # Pass 1: create every license, parent left unset
    for concept in g.subjects(None, SKOS.Concept):
        rank = g.value(concept, CLVAPIT.hasRankOrder)
        version_info = g.value(concept, OWL.versionInfo)
        reference_doc = g.value(concept, DCATAPIT.referenceDoc)

        # exactMatch exists only in 2nd level
        license_type = g.value(concept, SKOS.exactMatch)
        if not license_type:
            # 3rd level, need to go up
            broader = g.value(concept, SKOS.broader)
            license_type = g.value(broader, SKOS.exactMatch)

        names = {label.language: label
                 for label in g.objects(concept, SKOS.prefLabel)}

        # last URI segment, cut at the first underscore
        last_segment = str(concept).split('/')[-1]
        license_path = last_segment.split('_')[0]

        log.debug('Adding license [%r] [%s]', concept, names.get('it', None))

        License.from_data(
            license_type or '',
            str(version_info) if version_info else None,
            uri=str(concept),
            path=license_path,
            document_uri=str(reference_doc) if reference_doc else None,
            rank_order=int(str(rank)),
            names=names,
            parent=None,  # parent will be set later
        )

    # Pass 2: every license exists now, so parents can be resolved
    for concept in g.subjects(None, SKOS.Concept):
        broader_concepts = list(g.objects(concept, SKOS.broader))
        if broader_concepts:
            License.get(concept).set_parent(broader_concepts[0])
def get_package_dict(self, iso_values, harvest_object):
    """
    Extend the parent harvester's package dict with DCAT-AP_IT extras.

    Values are read from ``iso_values``. When a value is missing, the
    fallback is looked up in the ``dcatapit_config`` entry of the source
    configuration and then in ``self._dcatapit_config``. Finally every
    resource receives a ``license_type`` URI resolved from the package
    license information.

    :param iso_values: ISO metadata values, indexed by string keys
    :param harvest_object: passed through to the parent implementation
    :returns: the package dict, with extras appended and resources updated
    """
    package_dict = super(DCATAPITCSWHarvester, self).get_package_dict(iso_values, harvest_object)

    frequency_map = self.source_config.get(
        'mapping_frequencies_to_mdr_vocabulary',
        utils._mapping_frequencies_to_mdr_vocabulary)
    language_map = self.source_config.get(
        'mapping_languages_to_mdr_vocabulary',
        utils._mapping_languages_to_mdr_vocabulary)

    dcatapit_config = self.source_config.get('dcatapit_config', self._dcatapit_config)
    controlled_vocabularies = dcatapit_config.get(
        'controlled_vocabularies',
        self._dcatapit_config.get('controlled_vocabularies'))
    agents = dcatapit_config.get('agents', self._dcatapit_config.get('agents'))

    def add_extra(key, value):
        # Append one key/value pair to the package extras.
        package_dict['extras'].append({'key': key, 'value': value})

    def configured(key):
        # Source-level dcatapit_config value, falling back to the built-in one.
        return dcatapit_config.get(key, self._dcatapit_config.get(key))

    def vocabulary_value(vocab_name, vocab_id_key, fallback_key):
        # Map the ISO keywords onto a controlled vocabulary; several hits are
        # de-duplicated and rendered as '{a,b}', one hit is returned as is,
        # and no hit falls back to the configured default.
        values = []
        if iso_values["keywords"]:
            default_vocab_id = self._dcatapit_config.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
            values = utils.get_controlled_vocabulary_values(
                vocab_name,
                controlled_vocabularies.get(vocab_id_key, default_vocab_id),
                iso_values["keywords"])

        if values and len(values) > 1:
            unique = list(set(values))
            return '{' + ','.join(str(v) for v in unique) + '}'
        if values and len(values) > 0:
            return values[0]
        return configured(fallback_key)

    def add_agent(role, name_key, identifier_key):
        # Resolve the responsible party for `role` and store name + code.
        parties = iso_values["cited-responsible-party"]
        agent_name, agent_code = utils.get_responsible_party(
            parties,
            agents.get(role, self._dcatapit_config.get('agents').get(role)))
        add_extra(name_key, agent_name)
        add_extra(identifier_key, agent_code or default_agent_code)

    # ------------------------------#
    #    MANDATORY FOR DCAT-AP_IT   #
    # ------------------------------#

    # -- identifier -- #
    identifier = iso_values["guid"]
    add_extra('identifier', identifier)

    # the part before the first ':' doubles as agent code when none is found
    default_agent_code = identifier.split(':')[0] if ':' in identifier else None

    # -- theme -- #
    dataset_themes = vocabulary_value('eu_themes', 'dcatapit_skos_theme_id', 'dataset_themes')
    log.info("Medatata harvested dataset themes: %r", dataset_themes)
    add_extra('theme', dataset_themes)

    # -- publisher -- #
    add_agent('publisher', 'publisher_name', 'publisher_identifier')

    # -- modified -- #
    add_extra('modified', iso_values["date-updated"] or iso_values["date-released"])

    # -- frequency -- #
    update_frequency = iso_values["frequency-of-update"]
    add_extra('frequency', frequency_map.get(update_frequency, configured('frequency')))

    # -- rights_holder -- #
    add_agent('owner', 'holder_name', 'holder_identifier')

    # -----------------------------------------------#
    #    OTHER FIELDS NOT MANDATORY FOR DCAT_AP-IT   #
    # -----------------------------------------------#

    # -- alternate_identifier nothing to do -- #

    # -- issued -- #
    add_extra('issued', iso_values["date-released"])

    # -- geographical_name -- #
    dataset_places = vocabulary_value('places', 'dcatapit_skos_places_id', 'dataset_places')
    if dataset_places:
        log.info("Medatata harvested dataset places: %r", dataset_places)
        add_extra('geographical_name', dataset_places)

    # -- geographical_geonames_url nothing to do -- #

    # -- language -- #
    dataset_languages = iso_values["dataset-language"]
    if dataset_languages and len(dataset_languages) > 0:
        candidates = (language_map.get(code, None) for code in dataset_languages)
        languages = [mapped for mapped in candidates if mapped]

        if len(languages) > 1:
            language = '{' + ','.join(str(item) for item in languages) + '}'
        elif len(languages) > 0:
            language = languages[0]
        else:
            language = configured('dataset_languages')

        log.info("Medatata harvested dataset languages: %r", language)
    else:
        language = dcatapit_config.get('dataset_language')
    add_extra('language', language)

    # -- temporal_coverage -- #
    temporal_keys = (
        ('temporal-extent-begin', 'temporal_start'),
        ('temporal-extent-end', 'temporal_end'),
    )
    for iso_key, extra_key in temporal_keys:
        if len(iso_values[iso_key]) > 0:
            add_extra(extra_key, iso_values[iso_key][0])

    # -- conforms_to -- #
    add_extra('conforms_to', iso_values["conformity-specification-title"])

    # -- creator -- #
    add_agent('author', 'creator_name', 'creator_identifier')

    # -- license handling -- #
    license_id = package_dict.get('license_id')
    license_url = None
    license_name = None
    access_constraints = None
    for extra in package_dict['extras']:
        if extra['key'] == 'license_url':
            license_url = extra['value']
        elif extra['key'] == 'license':
            license_name = extra['value']
        elif extra['key'] == 'access_constraints':
            access_constraints = extra['value']

    if access_constraints or license_id or license_name or license_url:
        resolved, _ = License.find_by_token(access_constraints, license_name, license_id, license_url)
    else:
        resolved = License.get(License.DEFAULT_LICENSE)

    for res in package_dict['resources']:
        res['license_type'] = resolved.uri

    # End of processing, return the modified package
    return package_dict
def get_license_for_dcat(license_type):
    """
    Return the DCAT description of a license as a 6-tuple.

    The tuple is ``(license_type, default_name, document_uri, version, uri,
    names)`` where ``names`` maps language to name. The default license is
    used when ``license_type`` is empty, unknown, or resolves to a license
    that has no ``license_type`` of its own.

    If the default license cannot be found either, an error is logged and
    ``(None, '-', None, None, None, None)`` is returned instead of failing
    with an ``AttributeError`` on ``None``.
    """
    l = License.get(license_type or License.DEFAULT_LICENSE)
    if not l or not l.license_type:
        l = License.get(License.DEFAULT_LICENSE)
    if not l:
        # Not even the default license is available; nothing to describe.
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, None
    names = {k['lang']: k['name'] for k in l.get_names()}
    return l.license_type, l.default_name, l.document_uri, l.version, l.uri, names
def create_base_dict(guid, metadata, config):
    """
    Build the base package dict and the extras dict for a StatWeb dataset.

    guid
        Not read by this function; the ``identifier`` extra is a freshly
        generated UUID4.
    metadata : StatWebMetadata
        The base statweb metadata object
    config : dict
        The configuration set at harvester level; only its ``groups`` entry
        is read here.

    Returns a ``(package_dict, extras)`` tuple. ``package_dict['resources']``
    is an empty list.
    """
    def dateformat(d):
        return d.strftime(r"%Y-%m-%d")

    # A missing or too-short start year falls back to 1970
    start_date = metadata.get_anno_inizio() or '1970'
    if len(start_date) < 4:
        # Logger.warn is a deprecated alias (removed in Python 3.13)
        log.warning(f"Bad annoinizio found: '{start_date}'")
        start_date = '1970'
    created = datetime.datetime(int(start_date), 1, 1)
    updated = parse_ultimo_aggiornamento(metadata)
    now = dateformat(datetime.datetime.now())

    # Look for a license whose default name ends with "(<licenza>)";
    # otherwise use the default license.
    lic_search = f'%({metadata.get_licenza()})'
    license_obj = License.q().filter(
        License.default_name.like(lic_search)).first() or License.get(
            License.DEFAULT_LICENSE)

    # Frequency parsing is best-effort: any failure yields 'UNKNOWN'
    try:
        freq = _parse_freq(metadata.get_frequenza().lower())
        if not freq:
            log.warning(
                f'Could not parse frequency "{metadata.get_frequenza()}"')
            freq = 'UNKNOWN'
    except Exception as e:
        log.warning(
            f'Error handling frequency "{metadata.get_frequenza()}": {e}')
        freq = 'UNKNOWN'

    package_dict = {
        'title': metadata.get_descrizione(),
        'groups': config.get('groups', [{
            'name': 'statistica'
        }]),
        'author': 'Servizio Statistica',
        'author_email': '*****@*****.**',
        'maintainer': 'Servizio Statistica',
        'maintainer_email': '*****@*****.**',
        'metadata_modified': now,
        # 'tags' is left out: tags do not appear to be populated
        'license_id': license_obj.default_name or 'cc-by',
        'license': metadata.get_licenza() or 'Creative Commons Attribution',
        'license_title': license_obj.default_name
        or 'Creative Commons Attribution 2.5 it',
        'license_url': license_obj.uri
        or 'http://creativecommons.org/licenses/by/2.5/it/',
        'isopen': True,
        'resources': []
    }

    extras = {
        'holder_name': 'Provincia Autonoma di Trento',
        'holder_identifier': 'p_TN',
        'identifier': str(uuid.uuid4()),
        'themes_aggregate': [{
            "subthemes": [],
            "theme": metadata.get_tema() or "OP_DATPRO"
        }],
        'geographical_name': 'ITA_TRT',
        'geographical_geonames_url': 'http://www.geonames.org/3165243',
        'temporal_start': dateformat(created),
        'frequency': freq,
        'issued': now,
        'modified': dateformat(updated),
        'encoding': 'UTF-8',
        'Algoritmo': metadata.get_algoritmo(),
        'Anno di inizio': metadata.get_anno_inizio(),
        'Measurement unit': metadata.get_um(),
    }

    # Temporal coverage is only emitted when a start year was provided;
    # the end of the interval is 31 December of the end year, when present.
    if metadata.get_anno_inizio():
        interval = {'temporal_start': dateformat(created)}
        if metadata.get_anno_fine():
            interval['temporal_end'] = dateformat(
                datetime.date(int(metadata.get_anno_fine()), 12, 31))
        extras['temporal_coverage'] = [interval]

    return package_dict, extras
def get_license_for_dcat(license_type):
    """
    Return the DCAT description of a license as a 6-tuple.

    The tuple is ``(license_type, default_name, document_uri, version, uri,
    names)`` where ``names`` is whatever ``License.get_names()`` returns.
    The default license is used when ``license_type`` is empty, unknown, or
    resolves to a license that has no ``license_type`` of its own.

    If the default license cannot be found either, an error is logged and
    ``(None, '-', None, None, None, [])`` is returned instead of failing
    with an ``AttributeError`` on ``None``.
    """
    l = License.get(license_type or License.DEFAULT_LICENSE)
    if not l or not l.license_type:
        l = License.get(License.DEFAULT_LICENSE)
    if not l:
        # Not even the default license is available; nothing to describe.
        # NOTE(review): the empty list assumes get_names() returns a list
        # here; confirm against callers.
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, []
    names = l.get_names()
    return l.license_type, l.default_name, l.document_uri, l.version, l.uri, names