예제 #1
0
    def _amend_package(self, harvest_object):
        '''
        Amend package information.

        Deserialises the harvest object's package dict, tags it with the
        source portal and guarantees every resource has a (recent) license,
        then serialises the result back into the harvest object.
        '''
        package = json.loads(harvest_object.content)
        package.setdefault('extras', [])

        # Record which portal this dataset was harvested from.
        source_portal = self._get_portal_from_config(harvest_object.source.config)
        set_extras_field(package, EXTRA_KEY_HARVESTED_PORTAL, source_portal)

        # ensure all resources have a (recent) license
        for res in package.get('resources', []):
            prefix = u'{0}: Resource {1} of package {2} (GUID {3})'.format(
                harvest_object.source.title, res.get('uri', ''),
                package.get('name', ''), harvest_object.guid)

            current = res.get(RES_EXTRA_KEY_LICENSE, '')
            if current == '':
                # No license at all: fall back to the configured default.
                LOGGER.info(prefix + u' has no license. Adding default value.')
                res[RES_EXTRA_KEY_LICENSE] = self._get_fallback_license()
            elif self.licenses_upgrade:
                # A license is present; upgrade it to the recent DCAT list
                # if a mapping is known, otherwise keep the old value.
                replacement = self.licenses_upgrade.get(current, '')
                if replacement == '':
                    LOGGER.info(prefix + u' has a deprecated or unknown license {0}. '
                                u'Keeping old value.'.format(current))
                elif current != replacement:
                    LOGGER.info(prefix + u' had old license {0}. '
                                u'Updated value to recent DCAT list.'.format(current))
                    res[RES_EXTRA_KEY_LICENSE] = replacement

        # write changes back to harvest object content
        harvest_object.content = json.dumps(package)
예제 #2
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        """ Transforms DCAT-AP.de-Data to CKAN-Dictionary.

        Reads DCAT-AP.de properties of ``dataset_ref`` from the RDF graph
        (``self.g``) and writes them into ``dataset_dict`` (a CKAN package
        dict), mostly as extras fields. Returns the amended ``dataset_dict``.
        """
        # Manage different versions of DCATDE namespaces first.
        # Ensure that they are ordered from oldest to newest version, such that older values get overwritten
        # in case of multiple definitions
        dcatde_versions = [DCATDE_1_0, DCATDE]

        # geocodingText and legalbasisText got renamed, so handle them separately.
        # The extras key keeps the historical name; newer-namespace entries come
        # last so their values win when both versions are present.
        for key, predicate, in (
            ('legalbasisText', DCATDE_1_0.legalbasisText),
            ('geocodingText', DCATDE_1_0.geocodingText),
            ('legalbasisText', DCATDE.legalBasis),
            ('geocodingText', DCATDE.geocodingDescription),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                # NOTE(review): the overwrite-on-re-set behaviour presumes
                # ds_utils.set_extras_field replaces an existing key -- confirm.
                ds_utils.set_extras_field(dataset_dict, key,
                                          json.dumps(values))

        # iterate over all namespaces to import as much as possible
        for dcatde_namespace in dcatde_versions:
            # Simple additional fields (single string values)
            for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI',
                 dcatde_namespace.politicalGeocodingLevelURI),
            ):
                value = self._object_value(dataset_ref, predicate)
                if value:
                    ds_utils.set_extras_field(dataset_dict, key, value)

            # List fields (stored as JSON-encoded lists in extras)
            for key, predicate, in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI',
                 dcatde_namespace.politicalGeocodingURI),
            ):
                values = self._object_value_list(dataset_ref, predicate)
                if values:
                    ds_utils.set_extras_field(dataset_dict, key,
                                              json.dumps(values))

            # Contact roles specific to DCAT-AP.de
            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.originator, 'originator',
                                True)
            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.maintainer, 'maintainer',
                                False)

            # Add additional distribution fields
            for distribution in self.g.objects(dataset_ref, DCAT.distribution):
                for resource_dict in dataset_dict.get('resources', []):
                    # Match distribution in graph and distribution in ckan-dict
                    # via their URI (Python 2 `unicode` builtin).
                    if unicode(distribution) == resource_uri(resource_dict):
                        for key, predicate in (
                            ('licenseAttributionByText',
                             dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability',
                             dcatde_namespace.plannedAvailability)):
                            value = self._object_value(distribution, predicate)
                            if value:
                                ds_utils.insert_resource_extra(
                                    resource_dict, key, value)
        # -- end loop over dcatde namespaces --

        # additions in other namespaces than DCATDE
        self._parse_contact(dataset_dict, dataset_ref, DCT.contributor,
                            'contributor', True)
        self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author',
                            False)

        # dcat:landingPage
        landing_page = self._object_value(dataset_ref, DCAT.landingPage)
        if landing_page:
            ds_utils.set_extras_field(dataset_dict, 'metadata_original_html',
                                      landing_page)

        # dcat:contactPoint
        # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
        contact = self._object(dataset_ref, DCAT.contactPoint)
        self._add_maintainer_field(dataset_dict, contact, 'url', VCARD.hasURL)

        # NOTE(review): contact may be absent if there is no contactPoint --
        # presumably _object_value and _add_maintainer_field tolerate that; confirm.
        contact_tel = self._object_value(contact, VCARD.hasTelephone)
        if contact_tel:
            # Strip the tel: URI scheme before storing the phone number.
            ds_utils.insert(dataset_dict, 'maintainer_tel',
                            self._without_tel(contact_tel), True)

        self._add_maintainer_field(dataset_dict, contact, 'street',
                                   VCARD.hasStreetAddress)
        self._add_maintainer_field(dataset_dict, contact, 'city',
                                   VCARD.hasLocality)
        self._add_maintainer_field(dataset_dict, contact, 'zip',
                                   VCARD.hasPostalCode)
        self._add_maintainer_field(dataset_dict, contact, 'country',
                                   VCARD.hasCountryName)

        # Groups: map dcat:theme URIs with the known prefix to CKAN groups,
        # appending to any groups already present on the dataset.
        groups = self._get_dataset_value(dataset_dict, 'groups')

        if not groups:
            groups = []

        for obj in self.g.objects(dataset_ref, DCAT.theme):
            current_theme = unicode(obj)

            if current_theme.startswith(dcat_theme_prefix):
                group = current_theme.replace(dcat_theme_prefix, '').lower()
                groups.append({'id': group, 'name': group})

        dataset_dict['groups'] = groups

        return dataset_dict
예제 #3
0
    def amend_package(self, package, portal):
        """Tag *package* with the portal it was harvested from.

        Ensures the package has an ``extras`` list, then stores *portal*
        under the harvested-portal extras key.
        """
        package.setdefault('extras', [])
        set_extras_field(package, EXTRA_KEY_HARVESTED_PORTAL, portal)
예제 #4
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        """ Transforms DCAT-AP.de-Data to CKAN-Dictionary.

        Reads DCAT-AP.de properties of ``dataset_ref`` from the RDF graph
        (``self.g``) and writes them into ``dataset_dict`` (a CKAN package
        dict), mostly as extras fields. Returns the amended ``dataset_dict``.
        """

        # Different implementation of clean tags for keywords
        do_clean_tags = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
        if do_clean_tags:
            cleaned_tags = [
                _munge_tag(tag) for tag in self._keywords(dataset_ref)
            ]
            tags = [{'name': tag} for tag in cleaned_tags]
            # NOTE(review): this replaces any tags already present in
            # dataset_dict when cleaning is enabled -- confirm intended.
            dataset_dict['tags'] = tags

        # Manage different versions of DCATDE namespaces first.
        # Ensure that they are ordered from oldest to newest version, such that older values get overwritten
        # in case of multiple definitions
        dcatde_versions = [DCATDE_1_0, DCATDE_1_0_1, DCATDE]

        # iterate over all namespaces to import as much as possible
        for dcatde_namespace in dcatde_versions:
            # Simple additional fields (single string values)
            for key, predicate in (
                ('qualityProcessURI', dcatde_namespace.qualityProcessURI),
                ('politicalGeocodingLevelURI',
                 dcatde_namespace.politicalGeocodingLevelURI),
            ):
                value = self._object_value(dataset_ref, predicate)
                if value:
                    ds_utils.set_extras_field(dataset_dict, key, value)

            # geocodingText and legalbasisText got renamed after 1.0, so assign the respective names
            # (the extras key keeps the historical name in both cases).
            legalbasisTextProperty = dcatde_namespace.legalBasis
            geocodingTextProperty = dcatde_namespace.geocodingDescription

            if (dcatde_namespace == DCATDE_1_0):
                legalbasisTextProperty = DCATDE_1_0.legalbasisText
                geocodingTextProperty = DCATDE_1_0.geocodingText

            # List fields (stored as JSON-encoded lists in extras)
            for key, predicate, in (
                ('contributorID', dcatde_namespace.contributorID),
                ('politicalGeocodingURI',
                 dcatde_namespace.politicalGeocodingURI),
                ('legalbasisText', legalbasisTextProperty),
                ('geocodingText', geocodingTextProperty),
            ):
                values = self._object_value_list(dataset_ref, predicate)
                if values:
                    # NOTE(review): the overwrite-on-re-set behaviour presumes
                    # ds_utils.set_extras_field replaces an existing key -- confirm.
                    ds_utils.set_extras_field(dataset_dict, key,
                                              json.dumps(values))

            # Contact roles specific to DCAT-AP.de
            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.originator, 'originator',
                                True)
            self._parse_contact(dataset_dict, dataset_ref,
                                dcatde_namespace.maintainer, 'maintainer',
                                False)

            # Add additional distribution fields
            for distribution in self.g.objects(dataset_ref, DCAT.distribution):
                for resource_dict in dataset_dict.get('resources', []):
                    # Match distribution in graph and distribution in ckan-dict
                    # via their URI (Python 2 `unicode` builtin).
                    if unicode(distribution) == resource_dict.get('uri'):
                        for key, predicate in (
                            ('licenseAttributionByText',
                             dcatde_namespace.licenseAttributionByText),
                            ('plannedAvailability',
                             dcatde_namespace.plannedAvailability)):
                            value = self._object_value(distribution, predicate)
                            if value:
                                ds_utils.insert_resource_extra(
                                    resource_dict, key, value)
        # -- end loop over dcatde namespaces --

        # additions in other namespaces than DCATDE
        self._parse_contact(dataset_dict, dataset_ref, DCT.contributor,
                            'contributor', True)
        self._parse_contact(dataset_dict, dataset_ref, DCT.creator, 'author',
                            False)

        # Simple additional fields to DCAT-AP 1.1
        for key, predicate in (('metadata_original_html', DCAT.landingPage),
                               ('granularity', DCAT.granularity)):
            value = self._object_value(dataset_ref, predicate)
            if value:
                ds_utils.set_extras_field(dataset_dict, key, value)

        # dcat:contactPoint
        # TODO: dcat-ap adds the values to extras.contact_... . Maybe better than maintainer?
        self._parse_contact_vcard(dataset_dict, dataset_ref, DCAT.contactPoint,
                                  'maintainer')

        # Groups: map dcat:theme URIs with the known prefix to CKAN groups,
        # appending to any groups already present on the dataset.
        groups = self._get_dataset_value(dataset_dict, 'groups')

        if not groups:
            groups = []

        for obj in self.g.objects(dataset_ref, DCAT.theme):
            current_theme = unicode(obj)

            if current_theme.startswith(dcat_theme_prefix):
                group = current_theme.replace(dcat_theme_prefix, '').lower()
                groups.append({'id': group, 'name': group})

        dataset_dict['groups'] = groups

        return dataset_dict
예제 #5
0
    def migrate_contributor_identifier(self):
        ''' Add govdata-contributor-IDs to datasets that are missing one.

        For every gathered dataset, look up the contributor ID(s) stored on
        its owning organization and merge any missing ones into the dataset's
        own contributorID extras field. Only datasets that actually changed
        are written back.
        '''
        util.get_migrator_log().info('Migrating dcatde:contributorID' + (
            ' [dry run without saving]' if self.dry_run else ''))

        # Collect the IDs of all datasets that should be checked, timing it.
        starttime = time.time()
        package_obj_to_update = gather_dataset_ids()
        endtime = time.time()
        print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \
              (len(package_obj_to_update), str(endtime - starttime))

        # Fetch all organizations once, including extras, so each dataset's
        # contributor ID can be looked up locally instead of per dataset.
        organization_list = tk.get_action('organization_list')(
            self.create_context(), {
                'all_fields': True,
                'include_extras': True
            })
        updated_count = created_count = 0
        starttime = time.time()

        for dataset in self.iterate_datasets(package_obj_to_update.keys()):
            print u'Updating dataset: {}'.format(dataset['title'])

            # Find the dataset's organization in the pre-fetched list.
            dataset_org_id = dataset['organization']['id']
            dataset_org = next((item for item in organization_list
                                if item['id'] == dataset_org_id), None)
            if not dataset_org:
                print u'Did not find a Organization for ID: ' + dataset_org_id
                continue

            org_contributor_field = get_extras_field(dataset_org,
                                                     EXTRA_KEY_CONTRIBUTOR_ID)
            if not org_contributor_field:
                print u'Did not find a contributor ID for Organization: ' + dataset_org_id
                continue

            try:
                org_contributor_id_list = json.loads(
                    org_contributor_field['value'])
            except ValueError:
                # json.loads failed -> value is not an array but a single string
                org_contributor_id_list = [org_contributor_field['value']]

            dataset_contributor_field = get_extras_field(
                dataset, EXTRA_KEY_CONTRIBUTOR_ID)
            requires_update = False
            if not dataset_contributor_field:
                # Contributor-id field does not exist yet; take the
                # organization's list as-is.
                set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                 json.dumps(org_contributor_id_list))
                created_count = created_count + 1
                requires_update = True
            else:
                try:
                    current_ids_list = json.loads(
                        dataset_contributor_field['value'])
                except ValueError:
                    # json.loads failed -> value is not an array but a single string
                    current_ids_list = [dataset_contributor_field['value']]

                # Merge: append any organization IDs the dataset is missing.
                for contributor_id in org_contributor_id_list:
                    if contributor_id not in current_ids_list:
                        current_ids_list.append(contributor_id)
                        requires_update = True
                if requires_update:
                    updated_count = updated_count + 1
                    set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                     json.dumps(current_ids_list))

            # Only write back datasets that actually changed.
            # NOTE(review): dry-run handling is presumably done inside
            # self.update_dataset -- confirm it skips saving when dry_run.
            if requires_update:
                self.update_dataset(dataset)

        endtime = time.time()
        print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \
              created_count
        print "INFO: %s datasets were updated. Total time: %s." % (
            updated_count, str(endtime - starttime))

        util.get_migrator_log().info(
            'Finished migration of dcatde:contributorID' +
            (' [dry run without saving]' if self.dry_run else ''))