def populate_resource_license(package_dict):
    """
    Set ``license_type`` on every resource of ``package_dict``.

    The license is resolved from the package's ``license_id`` and from the
    ``license``/``licence``, ``license_url``/``licence_url`` and
    ``access_constraints`` extras. When none of them carries a value, the
    default license is used.

    :param package_dict: package dict; mutated in place.
    :returns: the same ``package_dict``.
    """
    license_id = package_dict.get('license_id')
    license_url = None
    license_name = None
    access_constraints = None
    for extra in package_dict.get('extras') or []:
        key = extra['key']
        # both spellings are accepted for the license extras
        if key in ('license_url', 'licence_url'):
            license_url = extra['value']
        elif key in ('license', 'licence'):
            license_name = extra['value']
        elif key == 'access_constraints':
            access_constraints = extra['value']

    if access_constraints or license_id or license_name or license_url:
        # second element of the returned pair is not needed here
        resolved, _ = License.find_by_token(access_constraints, license_name,
                                            license_id, license_url)
    else:
        resolved = License.get(License.DEFAULT_LICENSE)

    # tolerate packages without a 'resources' key, same as for 'extras'
    for res in package_dict.get('resources') or []:
        res['license_type'] = resolved.uri
    return package_dict
def get_license_for_dcat(license_type):
    """
    Look up a license and return its DCAT-related fields as a 6-tuple.

    Falls back to the default license when the lookup fails or the license
    found has no ``license_type``. If the default license is missing as well,
    an error is logged and a placeholder tuple is returned.

    :returns: ``(license_type, default_name, document_uri, version, uri, names)``
        where ``names`` maps language to name.
    """
    found = License.get(license_type or License.DEFAULT_LICENSE)
    if not (found and found.license_type):
        found = License.get(License.DEFAULT_LICENSE)

    if not found:
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, None

    names = {entry['lang']: entry['name'] for entry in found.get_names()}
    return (
        found.license_type,
        found.default_name,
        found.document_uri,
        found.version,
        found.uri,
        names,
    )
Exemple #3
0
def get_license_from_dcat(license_uri, license_dct, prefname, **license_names):
    """
    Resolve a License from DCAT data, trying several lookups in turn.

    Lookup order: ``license_uri``, then ``prefname``, then each
    ``lang=name`` pair in ``license_names``, then ``license_dct``.
    The default license is returned when nothing matches.
    """
    # First try dcatapit info
    match = License.get(license_uri)
    if match:
        return match

    if prefname:
        match = License.get(prefname)
        if match:
            return match

    for lang, name in license_names.items():
        match = License.get_by_lang(lang, name)
        if match:
            return match

    if license_dct:
        # try and use DCT licence URI (usually level 2 in DCATAPIT voc)
        match = License.get(license_dct)
        if match:
            return match

    return License.get(License.DEFAULT_LICENSE)
def load(g, name, uri, eurovoc):
    """
    Load the vocabulary ``name`` from graph ``g``.

    Licenses and subthemes are handled here: existing rows are counted and
    cleared, the vocabulary is reloaded, the session is committed, and a dict
    with the before/after counts is returned. Any other vocabulary is
    delegated to ``do_load``. ``uri`` is not used by this function.
    """
    if name == LICENSES_NAME:
        deleted = License.count()
        clear_licenses()
        load_licenses(g)
        created = License.count()
        Session.commit()
        return {'licenses_deleted': deleted, 'licenses_created': created}

    if name == SUBTHEME_NAME:
        deleted = Subtheme.count()
        clear_subthemes()
        load_subthemes(None, eurovoc, themes_g=g)
        created = Subtheme.count()
        Session.commit()
        return {'subthemes_deleted': deleted, 'subthemes_created': created}

    return do_load(g, name)
Exemple #5
0
def get_resource_licenses_tree(value, lang):
    """
    Build the list of license options for a select widget.

    One dict is produced per ``(license, label)`` pair returned by
    ``License.for_select(lang)``. ``selected`` is true for the entry whose
    URI equals ``value``; ``depth``/``depth_str``/``level`` are derived from
    the license ``rank_order`` so the options can be indented.
    """
    return [
        {
            'selected': lic.uri == value,
            'value': lic.uri,
            # let's do indentation
            'text': label,
            'depth': lic.rank_order - 1,
            'depth_str': '  ' * (lic.rank_order - 1) or '',
            'level': lic.rank_order,
        }
        for lic, label in License.for_select(lang)
    ]
def load_licenses(g: Graph):
    """
    Rebuild the license tree in the db from the SKOS concepts in ``g``.

    All existing licenses are deleted first. A first pass creates one
    license per concept without a parent; a second pass links every concept
    that has a ``skos:broader`` object to the first such object.
    """
    License.delete_all()

    # first pass: create the licenses, parents are wired afterwards
    for concept in g.subjects(None, SKOS.Concept):
        rank_order = g.value(concept, CLVAPIT.hasRankOrder)
        version = g.value(concept, OWL.versionInfo)
        doc_uri = g.value(concept, DCATAPIT.referenceDoc)

        # exactMatch exists only in 2nd level
        license_type = g.value(concept, SKOS.exactMatch)
        if not license_type:
            # 3rd level, need to go up
            broader_concept = g.value(concept, SKOS.broader)
            license_type = g.value(broader_concept, SKOS.exactMatch)

        labels = {
            label.language: label
            for label in g.objects(concept, SKOS.prefLabel)
        }
        # last URI segment, cut at the first underscore
        last_segment = str(concept).split('/')[-1]
        license_path = last_segment.split('_')[0]

        log.debug('Adding license [%r] [%s]', concept, labels.get('it'))
        License.from_data(
            license_type or '',
            str(version) if version else None,
            uri=str(concept),
            path=license_path,
            document_uri=str(doc_uri) if doc_uri else None,
            rank_order=int(str(rank_order)),
            names=labels,
            parent=None,  # parent will be set later
        )

    # second pass: every license exists now, so parents can be resolved
    for concept in g.subjects(None, SKOS.Concept):
        broader = list(g.objects(concept, SKOS.broader))
        if not broader:
            continue
        License.get(concept).set_parent(broader[0])
Exemple #7
0
    def get_package_dict(self, iso_values, harvest_object):
        """
        Extend the parent harvester's package dict with DCAT-AP_IT extras.

        Starts from the dict returned by the parent class' ``get_package_dict``,
        appends extras derived from ``iso_values`` (falling back to the
        ``dcatapit_config`` of the harvest source and then to
        ``self._dcatapit_config``) and sets ``license_type`` on every resource.

        :param iso_values: values of the harvested ISO record, read by key
            (``guid``, ``keywords``, ``cited-responsible-party``, ...).
        :param harvest_object: passed unchanged to the parent implementation.
        :returns: the modified package dict.
        """
        package_dict = super(DCATAPITCSWHarvester, self).get_package_dict(iso_values, harvest_object)

        defaults = self._dcatapit_config

        frequency_mapping = self.source_config.get(
            'mapping_frequencies_to_mdr_vocabulary',
            utils._mapping_frequencies_to_mdr_vocabulary)
        language_mapping = self.source_config.get(
            'mapping_languages_to_mdr_vocabulary',
            utils._mapping_languages_to_mdr_vocabulary)

        dcatapit_config = self.source_config.get('dcatapit_config', defaults)

        controlled_vocabularies = dcatapit_config.get(
            'controlled_vocabularies', defaults.get('controlled_vocabularies'))
        agents = dcatapit_config.get('agents', defaults.get('agents'))

        extras = package_dict['extras']

        def add_extra(key, value):
            # extras are stored as a list of {'key': ..., 'value': ...} dicts
            extras.append({'key': key, 'value': value})

        # ------------------------------#
        #    MANDATORY FOR DCAT-AP_IT   #
        # ------------------------------#

        #  -- identifier -- #
        identifier = iso_values["guid"]
        add_extra('identifier', identifier)

        # the part before the first ':' is used when no agent code is found
        default_agent_code = identifier.split(':')[0] if ':' in identifier else None

        #  -- theme -- #
        dataset_themes = []
        if iso_values["keywords"]:
            default_vocab_id = defaults.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
            dataset_themes = utils.get_controlled_vocabulary_values(
                'eu_themes',
                controlled_vocabularies.get('dcatapit_skos_theme_id', default_vocab_id),
                iso_values["keywords"])

        if dataset_themes and len(dataset_themes) > 1:
            # several values: de-duplicate and render as '{a,b,...}'
            dataset_themes = '{' + ','.join(str(t) for t in set(dataset_themes)) + '}'
        elif dataset_themes:
            dataset_themes = dataset_themes[0]
        else:
            dataset_themes = dcatapit_config.get('dataset_themes', defaults.get('dataset_themes'))

        log.info("Medatata harvested dataset themes: %r", dataset_themes)
        add_extra('theme', dataset_themes)

        # the same responsible parties feed publisher, holder and creator
        responsible_parties = iso_values["cited-responsible-party"]
        default_agents = defaults.get('agents')

        #  -- publisher -- #
        agent_name, agent_code = utils.get_responsible_party(
            responsible_parties,
            agents.get('publisher', default_agents.get('publisher')))
        add_extra('publisher_name', agent_name)
        add_extra('publisher_identifier', agent_code or default_agent_code)

        #  -- modified -- #
        revision_date = iso_values["date-updated"] or iso_values["date-released"]
        add_extra('modified', revision_date)

        #  -- frequency -- #
        update_frequency = iso_values["frequency-of-update"]
        add_extra('frequency', frequency_mapping.get(
            update_frequency,
            dcatapit_config.get('frequency', defaults.get('frequency'))))

        #  -- rights_holder -- #
        agent_name, agent_code = utils.get_responsible_party(
            responsible_parties,
            agents.get('owner', default_agents.get('owner')))
        add_extra('holder_name', agent_name)
        add_extra('holder_identifier', agent_code or default_agent_code)

        # -----------------------------------------------#
        #    OTHER FIELDS NOT MANDATORY FOR DCAT_AP-IT   #
        # -----------------------------------------------#

        #  -- alternate_identifier nothing to do  -- #

        #  -- issued -- #
        add_extra('issued', iso_values["date-released"])

        #  -- geographical_name  -- #
        dataset_places = []
        if iso_values["keywords"]:
            # NOTE(review): the fallback vocabulary id for places is read from the
            # *theme* key ('dcatapit_skos_theme_id'); this looks like a copy/paste
            # of the theme section - confirm whether 'dcatapit_skos_places_id'
            # was intended. Left unchanged.
            default_vocab_id = defaults.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
            dataset_places = utils.get_controlled_vocabulary_values(
                'places',
                controlled_vocabularies.get('dcatapit_skos_places_id', default_vocab_id),
                iso_values["keywords"])

        if dataset_places and len(dataset_places) > 1:
            dataset_places = '{' + ','.join(str(p) for p in set(dataset_places)) + '}'
        elif dataset_places:
            dataset_places = dataset_places[0]
        else:
            dataset_places = dcatapit_config.get('dataset_places', defaults.get('dataset_places'))

        if dataset_places:
            log.info("Medatata harvested dataset places: %r", dataset_places)
            add_extra('geographical_name', dataset_places)

        #  -- geographical_geonames_url nothing to do  -- #

        #  -- language -- #
        dataset_languages = iso_values["dataset-language"]
        configured_languages = dcatapit_config.get(
            'dataset_languages', defaults.get('dataset_languages'))
        if dataset_languages:
            # keep only the languages that have a mapping
            languages = []
            for source_language in dataset_languages:
                mapped = language_mapping.get(source_language, None)
                if mapped:
                    languages.append(mapped)

            if len(languages) > 1:
                language = '{' + ','.join(str(lang) for lang in languages) + '}'
            elif languages:
                language = languages[0]
            else:
                language = configured_languages

            log.info("Medatata harvested dataset languages: %r", language)
        else:
            # An explicitly configured 'dataset_language' still wins; otherwise use
            # the same 'dataset_languages' fallback as above, so that a record
            # without languages does not end up with a None language.
            language = dcatapit_config.get('dataset_language') or configured_languages

        add_extra('language', language)

        #  -- temporal_coverage -- #
        temporal_keys = (
            ('temporal-extent-begin', 'temporal_start'),
            ('temporal-extent-end', 'temporal_end'),
        )
        for iso_key, extra_key in temporal_keys:
            if len(iso_values[iso_key]) > 0:
                # only the first value is used
                add_extra(extra_key, iso_values[iso_key][0])

        #  -- conforms_to -- #
        add_extra('conforms_to', iso_values["conformity-specification-title"])

        #  -- creator -- #
        agent_name, agent_code = utils.get_responsible_party(
            responsible_parties,
            agents.get('author', default_agents.get('author')))
        add_extra('creator_name', agent_name)
        add_extra('creator_identifier', agent_code or default_agent_code)

        #  -- license handling -- #
        license_id = package_dict.get('license_id')
        license_url = None
        license_name = None
        access_constraints = None
        for extra in extras:
            key = extra['key']
            # accept both spellings, as populate_resource_license does
            if key in ('license_url', 'licence_url'):
                license_url = extra['value']
            elif key in ('license', 'licence'):
                license_name = extra['value']
            elif key == 'access_constraints':
                access_constraints = extra['value']

        if access_constraints or license_id or license_name or license_url:
            # second element of the returned pair is not needed here
            resource_license, _ = License.find_by_token(
                access_constraints, license_name, license_id, license_url)
        else:
            resource_license = License.get(License.DEFAULT_LICENSE)

        for res in package_dict['resources']:
            res['license_type'] = resource_license.uri

        # End of processing, return the modified package
        return package_dict
Exemple #8
0
def get_license_for_dcat(license_type):
    """
    Look up a license and return its DCAT-related fields as a 6-tuple.

    Falls back to the default license when the lookup fails or the license
    found has no ``license_type``. If the default license cannot be found
    either, an error is logged and a placeholder tuple is returned instead of
    failing on a ``None`` license.

    :returns: ``(license_type, default_name, document_uri, version, uri, names)``
        where ``names`` maps language to name.
    """
    found = License.get(license_type or License.DEFAULT_LICENSE)
    if not found or not found.license_type:
        found = License.get(License.DEFAULT_LICENSE)
    if not found:
        # nothing to read the fields from
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, None
    names = {entry['lang']: entry['name'] for entry in found.get_names()}
    return (found.license_type, found.default_name, found.document_uri,
            found.version, found.uri, names)
def create_base_dict(guid, metadata, config):
    """
    Build the base package dict and the extras dict for a statweb dataset.

    guid :
       Not used by this function.

    metadata : StatWebMetadata
       The base statweb metadata object

    config : dict
       The configuration set at harvester level; only ``groups`` is read here.

    Returns a ``(package_dict, extras)`` tuple.
    """
    def dateformat(d):
        return d.strftime(r"%Y-%m-%d")

    start_date = metadata.get_anno_inizio() or '1970'
    created = None
    if len(start_date) >= 4:
        try:
            created = datetime.datetime(int(start_date), 1, 1)
        except ValueError:
            # non-numeric or out-of-range year: handled like a too-short one
            pass
    if created is None:
        log.warning(f"Bad annoinizio found: '{start_date}'")
        created = datetime.datetime(1970, 1, 1)

    updated = parse_ultimo_aggiornamento(metadata)

    now = dateformat(datetime.datetime.now())

    # match licenses whose default name ends with "(<licenza>)"
    lic_search = f'%({metadata.get_licenza()})'
    license_obj = License.q().filter(
        License.default_name.like(lic_search)).first() or License.get(
            License.DEFAULT_LICENSE)

    try:
        freq = _parse_freq(metadata.get_frequenza().lower())
        if not freq:
            log.warning(
                f'Could not parse frequency "{metadata.get_frequenza()}"')
            freq = 'UNKNOWN'
    except Exception as e:
        # best effort: any failure while reading the frequency maps to UNKNOWN
        log.warning(
            f'Error handling frequency "{metadata.get_frequenza()}": {e}')
        freq = 'UNKNOWN'

    package_dict = {
        'title': metadata.get_descrizione(),
        'groups': config.get('groups', [{
            'name': 'statistica'
        }]),
        'author': 'Servizio Statistica',
        'author_email': '*****@*****.**',
        'maintainer': 'Servizio Statistica',
        'maintainer_email': '*****@*****.**',
        'metadata_modified': now,
        # NOTE: 'tags' is not set - tags do not appear to be populated
        'license_id': license_obj.default_name or 'cc-by',
        'license': metadata.get_licenza() or 'Creative Commons Attribution',
        'license_title': license_obj.default_name
        or 'Creative Commons Attribution 2.5 it',
        'license_url': license_obj.uri
        or 'http://creativecommons.org/licenses/by/2.5/it/',
        'isopen': True,
        'resources': []
    }

    extras = {
        'holder_name': 'Provincia Autonoma di Trento',
        'holder_identifier': 'p_TN',
        # a fresh random identifier is generated on every call
        'identifier': str(uuid.uuid4()),
        'themes_aggregate': [{
            "subthemes": [],
            "theme": metadata.get_tema() or "OP_DATPRO"
        }],
        'geographical_name': 'ITA_TRT',
        'geographical_geonames_url': 'http://www.geonames.org/3165243',
        'temporal_start': dateformat(created),
        'frequency': freq,
        'issued': now,
        'modified': dateformat(updated),
        'encoding': 'UTF-8',
        'Algoritmo': metadata.get_algoritmo(),
        'Anno di inizio': metadata.get_anno_inizio(),
        'Measurement unit': metadata.get_um(),
    }

    # temporal_coverage is only added when a start year was provided
    if metadata.get_anno_inizio():
        interval = {'temporal_start': dateformat(created)}
        if metadata.get_anno_fine():
            interval['temporal_end'] = dateformat(
                datetime.date(int(metadata.get_anno_fine()), 12, 31))
        extras['temporal_coverage'] = [interval]

    return package_dict, extras
Exemple #10
0
def get_license_for_dcat(license_type):
    """
    Look up a license and return its DCAT-related fields as a 6-tuple.

    Falls back to the default license when the lookup fails or the license
    found has no ``license_type``. If the default license cannot be found
    either, an error is logged and a placeholder tuple is returned instead of
    failing on a ``None`` license.

    :returns: ``(license_type, default_name, document_uri, version, uri, names)``
        where ``names`` is whatever the license's ``get_names()`` returns.
    """
    found = License.get(license_type or License.DEFAULT_LICENSE)
    if not found or not found.license_type:
        found = License.get(License.DEFAULT_LICENSE)
    if not found:
        # nothing to read the fields from
        log.error('*** Licenses vocabulary has not been loaded ***')
        return None, '-', None, None, None, None
    names = found.get_names()
    return (found.license_type, found.default_name, found.document_uri,
            found.version, found.uri, names)