Example #1
    def metadata_original_portal(self, dataset):
        '''metadata_original_portal -> contributorID'''
        orig_field = ds_utils.get_extras_field(dataset,
                                               u'metadata_original_portal')
        target_field = ds_utils.get_extras_field(dataset,
                                                 EXTRA_KEY_HARVESTED_PORTAL)

        if orig_field:
            util.rename_extras_field_migration(dataset,
                                               u'metadata_original_portal',
                                               u'contributorID', True, False)
            if target_field is None:
                ds_utils.insert_new_extras_field(dataset,
                                                 EXTRA_KEY_HARVESTED_PORTAL,
                                                 orig_field['value'], False)
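
For context, the helpers above operate on CKAN's extras list, where each entry is a dict with a 'key' and a 'value'. A minimal sketch of the assumed input (the dataset content is illustrative, not taken from the source):

# Illustrative CKAN-style dataset; only the extras structure matters here.
dataset = {
    'name': 'example-dataset',
    'extras': [
        {'key': u'metadata_original_portal',
         'value': u'http://example.org/portal'},
    ],
}
# After metadata_original_portal() runs, the extra is renamed to
# u'contributorID' and a new extra with key EXTRA_KEY_HARVESTED_PORTAL
# carries the original portal value.
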
Example #2
    def contacts_role_veroeffentlichende_stelle(self, dataset):
        '''contacts.role.veroeffentlichende_stelle -> extras.publisher'''
        fields = util.get_extras_contacts_data(dataset,
                                               'veroeffentlichende_stelle')
        target_field = ds_utils.get_extras_field(dataset,
                                                 u'publisher_contacttype')

        # only add if the field hasn't been migrated before (check for added field)
        if target_field is None:
            if fields is not None:
                ds_utils.insert_new_extras_field(dataset, u'publisher_name',
                                                 fields.pop('name', ''), False)
                ds_utils.insert_new_extras_field(dataset, u'publisher_email',
                                                 fields.pop('email', ''),
                                                 False)
                ds_utils.insert_new_extras_field(dataset, u'publisher_url',
                                                 fields.pop('url', ''), False)

                util.update_extras_contacts_data(dataset,
                                                 'veroeffentlichende_stelle',
                                                 fields)

                # Additional field
                ds_utils.insert_new_extras_field(dataset,
                                                 u'publisher_contacttype',
                                                 u'Organization', False)
                util.move_extras_contacts_address(dataset,
                                                  'veroeffentlichende_stelle',
                                                  'publisher', fields)
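
The contact dict returned by util.get_extras_contacts_data is not shown in this snippet; judging from the pop() calls above, a plausible shape is (assumption):

# Assumed shape of the contact data for role 'veroeffentlichende_stelle':
fields = {
    'name': u'Musterbehoerde',
    'email': u'info@example.org',
    'url': u'http://example.org',
    # consumed later by move_extras_contacts_address
    'address': u'Musterstr. 1, 12345 Berlin',
}
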
Example #3
def update_extras_json_list_data(dataset, extras_field, check_key, expected_val, content):
    '''Updates an extras value with key extras_field. The field is expected to
    contain a list of dicts serialized as a JSON string.
    The method replaces the content of the list entry having
    check_key: expected_val.
    If the given content is non-empty and contains more key-value pairs than
    the checked pair, the entry is updated; otherwise it is dropped from the
    list. If the whole list becomes empty, the extras field is dropped.'''
    fld_content = ds_utils.get_extras_field(dataset, extras_field)

    if fld_content is not None:
        fld_list = json.loads(fld_content['value'], encoding='utf-8')

        # build a new list instead of mutating fld_list while iterating;
        # deleting by index during enumeration would skip the next entry
        new_fld_list = []
        for entry in fld_list:
            if entry.get(check_key) == expected_val:
                # update only if there are additional values given,
                # otherwise drop the entry (by not carrying it over)
                if content and content != {check_key: expected_val}:
                    new_fld_list.append(content)
            else:
                new_fld_list.append(entry)
        fld_list = new_fld_list
        if fld_list:
            fld_content['value'] = unicode(json.dumps(fld_list, sort_keys=True))
        else:
            # drop contacts if it became empty
            ds_utils.delete_extras_field(dataset, extras_field)
    else:
        log_warn(dataset, 'Could not update data, no field "' +
                 extras_field + '" in extras')
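
A hedged usage sketch; the extras key 'contacts' and the role value mirror the callers in Examples #2 and #4 but are assumptions here:

# Drop the 'ansprechpartner' entry from the JSON list in extras['contacts']:
# passing only the checked pair means there is no additional data, so the
# entry is removed instead of updated.
update_extras_json_list_data(dataset, u'contacts', 'role', 'ansprechpartner',
                             {'role': 'ansprechpartner'})
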
Example #4
    def contacts_role_ansprechpartner(self, dataset):
        '''contacts.role.ansprechpartner -> extras.maintainer'''
        fields = util.get_extras_contacts_data(dataset, 'ansprechpartner')
        target_field = ds_utils.get_extras_field(dataset,
                                                 u'maintainer_contacttype')

        # only add if the field hasn't been migrated before (check for added field)
        if target_field is None:
            if fields is not None:
                if fields.get('name') and fields.get('email'):
                    dataset['maintainer'] = fields.pop('name', '')
                    dataset['maintainer_email'] = fields.pop('email', '')
                    ds_utils.insert_new_extras_field(dataset,
                                                     u'maintainer_url',
                                                     fields.pop('url',
                                                                ''), False)

                    util.update_extras_contacts_data(dataset,
                                                     'ansprechpartner', fields)

                    # Additional field
                    ds_utils.insert_new_extras_field(
                        dataset, u'maintainer_contacttype', u'Organization',
                        False)

                util.move_extras_contacts_address(dataset, 'ansprechpartner',
                                                  'maintainer', fields)
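
In contrast to Example #2, the maintainer contact is written to core dataset fields, and only when both name and email are present; a hedged before/after sketch:

# Illustrative: given fields == {'name': u'Max Muster',
#                                'email': u'max@example.org'},
# the migration sets
#     dataset['maintainer'] = u'Max Muster'
#     dataset['maintainer_email'] = u'max@example.org'
# plus the extras maintainer_url and maintainer_contacttype. The address
# parts are moved via move_extras_contacts_address() in either case.
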
Example #5
def move_extras_contacts_address(dataset, role, new_role, contact_data=None):
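    '''Moves the address of contacts role <<role>> into separate extras
    fields named <<new_role>>_<<part>>, e.g. maintainer_street.'''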
    # load the data if no preloaded dict is available
    if contact_data is None:
        contact_data = get_extras_contacts_data(dataset, role)

    if contact_data is not None:
        if 'address' in contact_data:
            parsed_addr = addr_parse(contact_data['address'])
            keys = ['addressee', 'details', 'street', 'zip', 'city', 'country']

            # first, check if any of the new fields is present. If yes, skip
            # the movement to avoid corrupt datasets
            for k in keys:
                if ds_utils.get_extras_field(dataset, new_role + '_' + k):
                    return

            for k in keys:
                if k in parsed_addr:
                    ds_utils.insert_new_extras_field(dataset, new_role + '_' + k,
                                                     parsed_addr[k], False)

            addr_field_new = parsed_addr.get('unknown')
            if addr_field_new:
                contact_data['address'] = addr_field_new
                log_warn(dataset, u'The following address parts of role ' +
                         role + u' were not recognized: "' + addr_field_new
                         + u'"')
            else:
                del contact_data['address']

            update_extras_contacts_data(dataset, role, contact_data)
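
addr_parse() is not part of this snippet; from the keys used above, its return value is presumably a dict of recognized address parts plus an 'unknown' remainder (assumption):

# Assumed addr_parse() output for u'Musterstr. 1, 12345 Berlin, Hinterhof':
parsed_addr = {
    'street': u'Musterstr. 1',
    'zip': u'12345',
    'city': u'Berlin',
    'unknown': u'Hinterhof',  # kept in the contact's 'address' and logged
}
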
Example #6
def migrate_dates_field(dataset, from_field, to_field):
    '''extras.dates.<<from_field>> -> extras.<<to_field>>'''
    extras_dates = get_extras_dates_data(dataset, from_field)
    target_field = ds_utils.get_extras_field(dataset, to_field)

    if target_field is None and extras_dates:
        ds_utils.insert_new_extras_field(dataset, to_field,
                                         extras_dates.pop('date', ''), False)
        update_extras_dates_data(dataset, from_field, extras_dates)
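
A hedged usage sketch; the role and target field names are illustrative, not taken from the source:

# Move the date with role 'veroeffentlicht' from extras.dates to a top-level
# extras field (the names here are hypothetical examples of the pattern).
migrate_dates_field(dataset, 'veroeffentlicht', u'issued')
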
Example #7
    def geographical_granularity(self, dataset):
        '''geographical_granularity -> politicalGeocodingLevelURI'''
        valid_values = {
            'bund': 'federal',
            'land': 'state',
            'kommune': 'municipality',
            'stadt': 'municipality',

            # DCAT values (without URI part) stay the same
            'federal': 'federal',
            'state': 'state',
            'municipality': 'municipality',

            # Additional non-OGD value
            'kreis': 'administrativeDistrict'
        }

        geo_level = ds_utils.get_extras_field(dataset,
                                              'geographical_granularity')
        target_field = ds_utils.get_extras_field(
            dataset, u'politicalGeocodingLevelURI')

        # only add if the field hasn't been migrated before
        if target_field is None:
            if geo_level is not None:
                geo_level_value = geo_level['value'].lower()

                if geo_level_value in valid_values:
                    geo_level_value = (
                        'http://dcat-ap.de/def/politicalGeocoding/Level/' +
                        valid_values.get(geo_level_value))
                else:
                    util.log_error(
                        dataset, 'INVALID: politicalGeocodingLevelURI: ' +
                        geo_level_value)

                geo_level['value'] = geo_level_value

                util.rename_extras_field_migration(
                    dataset, u'geographical_granularity',
                    u'politicalGeocodingLevelURI', False)
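
A small self-contained illustration of the mapping above:

# 'Kommune' (case-insensitive) maps to the DCAT-AP.de municipality level URI.
valid_values = {'kommune': 'municipality'}
geo_level_value = u'Kommune'.lower()
uri = ('http://dcat-ap.de/def/politicalGeocoding/Level/' +
       valid_values[geo_level_value])
# uri == u'http://dcat-ap.de/def/politicalGeocoding/Level/municipality'
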
Example #8
def get_extras_json_list_data(dataset, extras_field, check_key, expected_val):
    '''Gets data from extras_field. The field is expected to contain a list of
    dicts serialized as a JSON string.
    This method returns the deserialized list entry having expected_val in
    check_key, or None if no such entry exists.'''
    fld_content = ds_utils.get_extras_field(dataset, extras_field)

    if fld_content is not None:
        fld_list = json.loads(fld_content['value'], encoding='utf-8')
        for entry in fld_list:
            if entry.get(check_key) == expected_val:
                return entry

    return None
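
A hedged usage sketch matching the callers in Examples #2 and #4:

# Fetch the publishing office entry from the JSON contacts list; returns the
# deserialized dict or None. The extras key 'contacts' is an assumption.
entry = get_extras_json_list_data(dataset, u'contacts', 'role',
                                  'veroeffentlichende_stelle')
if entry is not None:
    publisher_name = entry.get('name', '')
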
Example #9
    def languages(self, dataset):
        '''convert ISO 639-1 language codes to DCAT-AP conformant URIs (containing ISO 639-3 codes)'''
        field_name = u'language'

        # dataset
        language_field = ds_utils.get_extras_field(dataset, field_name)
        if language_field:
            util.update_language_in(dataset, language_field, 'value',
                                    'language')

        # resources
        if 'resources' in dataset and dataset['resources']:
            for resource in dataset['resources']:
                if resource.get(field_name):
                    util.update_language_in(dataset, resource, field_name,
                                            'Resource->language')
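
util.update_language_in is not shown here; presumably it rewrites a two-letter code into the EU publications-office language authority URI used by DCAT-AP, along the lines of (assumption):

# Assumed conversion performed by util.update_language_in:
#     u'de' -> u'http://publications.europa.eu/resource/authority/language/DEU'
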
Example #10
    def terms_of_use_attribution_text(self, dataset):
        '''
        Add the attribution text to every resource.
        Handles dataset['extras']['terms_of_use'] values such as
        "{\"attribution_text\": \"bla\"}".
        '''
        fieldname = u'terms_of_use'
        resources = dataset['resources']

        terms_of_use = ds_utils.get_extras_field(dataset, fieldname)
        if terms_of_use is not None:
            text = json.loads(
                terms_of_use.get('value')).get('attribution_text')

            if text and resources:
                for resource in resources:
                    if '__extras' not in resource:
                        resource['__extras'] = dict()
                    resource['__extras'][u'licenseAttributionByText'] = text
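
A minimal before/after sketch of the transformation (values are illustrative):

# Before: extras contains
#     {'key': u'terms_of_use', 'value': u'{"attribution_text": "bla"}'}
# After: every resource carries
#     resource['__extras'][u'licenseAttributionByText'] == u'bla'
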
Example #11
    def spatial_reference_text(self, dataset):
        '''spatial_reference.text -> extras.geocodingText'''
        spatial_reference = ds_utils.get_extras_field(dataset,
                                                      'spatial_reference')
        if spatial_reference is not None:
            sr_value = spatial_reference['value']
        else:
            sr_value = None

        if sr_value is not None:
            # Deserialize the JSON string into a dictionary
            sr_value_dict = json.loads(sr_value, encoding='utf-8')
            field = sr_value_dict.get('text')

            if field is not None:
                ds_utils.insert_new_extras_field(dataset, u'geocodingText',
                                                 field, True)

                sr_value_dict.pop('text', None)
                spatial_reference['value'] = unicode(
                    json.dumps(sr_value_dict, sort_keys=True))
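
A hedged sketch of the spatial_reference value; the 'nuts' key is an invented placeholder for whatever other keys the JSON may hold:

# Before: {'key': u'spatial_reference',
#          'value': u'{"nuts": "DE3", "text": "Berlin"}'}
# After:  a new extra geocodingText == u'Berlin' is inserted, and
#         spatial_reference's value is rewritten without the 'text' key:
#         u'{"nuts": "DE3"}'
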
Example #12
    def migrate_adms_identifier(self):
        util.get_migrator_log().info(
            'Migrating adms:identifier to dct:identifier' +
            (' [dry run without saving]' if self.dry_run else ''))

        for dataset in self.iterate_adms_id_datasets():
            # only migrate if dct:identifier is not already present
            if not dataset_utils.get_extras_field(dataset,
                                                  EXTRA_KEY_DCT_IDENTIFIER):
                util.rename_extras_field_migration(dataset,
                                                   EXTRA_KEY_ADMS_IDENTIFIER,
                                                   EXTRA_KEY_DCT_IDENTIFIER,
                                                   False)
                self.update_dataset(dataset)
            else:
                util.get_migrator_log().info(
                    '%sSkipping package as it already has a dct:identifier',
                    util.log_dataset_prefix(dataset))

        util.get_migrator_log().info(
            'Finished migration of adms:identifier to dct:identifier' +
            (' [dry run without saving]' if self.dry_run else ''))
Example #13
    def migrate_contributor_identifier(self):
        ''' Add GovData contributor IDs to datasets that are missing one. '''
        util.get_migrator_log().info('Migrating dcatde:contributorID' + (
            ' [dry run without saving]' if self.dry_run else ''))

        starttime = time.time()
        package_obj_to_update = gather_dataset_ids()
        endtime = time.time()
        print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \
              (len(package_obj_to_update), str(endtime - starttime))

        organization_list = tk.get_action('organization_list')(
            self.create_context(), {
                'all_fields': True,
                'include_extras': True
            })
        updated_count = created_count = 0
        starttime = time.time()

        for dataset in self.iterate_datasets(package_obj_to_update.keys()):
            print u'Updating dataset: {}'.format(dataset['title'])

            dataset_org_id = dataset['organization']['id']
            dataset_org = next((item for item in organization_list
                                if item['id'] == dataset_org_id), None)
            if not dataset_org:
                print u'Did not find an Organization for ID: ' + dataset_org_id
                continue

            org_contributor_field = get_extras_field(dataset_org,
                                                     EXTRA_KEY_CONTRIBUTOR_ID)
            if not org_contributor_field:
                print u'Did not find a contributor ID for Organization: ' + dataset_org_id
                continue

            try:
                org_contributor_id_list = json.loads(
                    org_contributor_field['value'])
            except ValueError:
                # json.loads failed -> value is not an array but a single string
                org_contributor_id_list = [org_contributor_field['value']]

            dataset_contributor_field = get_extras_field(
                dataset, EXTRA_KEY_CONTRIBUTOR_ID)
            requires_update = False
            if not dataset_contributor_field:
                # Contributor-id field does not exist yet
                set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                 json.dumps(org_contributor_id_list))
                created_count += 1
                requires_update = True
            else:
                try:
                    current_ids_list = json.loads(
                        dataset_contributor_field['value'])
                except ValueError:
                    # json.loads failed -> value is not an array but a single string
                    current_ids_list = [dataset_contributor_field['value']]

                for contributor_id in org_contributor_id_list:
                    if contributor_id not in current_ids_list:
                        current_ids_list.append(contributor_id)
                        requires_update = True
                if requires_update:
                    updated_count += 1
                    set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                     json.dumps(current_ids_list))

            if requires_update:
                self.update_dataset(dataset)

        endtime = time.time()
        print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \
              created_count
        print "INFO: %s datasets were updated. Total time: %s." % (
            updated_count, str(endtime - starttime))

        util.get_migrator_log().info(
            'Finished migration of dcatde:contributorID' +
            (' [dry run without saving]' if self.dry_run else ''))