Ejemplo n.º 1
0
 def test_munge_tag_multiple_pass(self):
     """Munging a list of tags multiple times gives the expected results."""
     for original, expected in self.munge_list:
         once = munge_tag(original)
         assert_equal(once, expected)
         # Feeding the already-munged value back in must not change it.
         twice = munge_tag(once)
         assert_equal(twice, expected)
Ejemplo n.º 2
0
 def test_munge_tag_multiple_pass(self):
     '''Munging a list of tags multiple times gives the expected results.'''
     for original, expected in self.munge_list:
         once = munge_tag(original)
         assert once == expected
         # Feeding the already-munged value back in must not change it.
         twice = munge_tag(once)
         assert twice == expected
Ejemplo n.º 3
0
    def _generate_term_translations(self, lang_index, file_path):
        '''
        Build the term translations from the base (index 0) columns into the
        language selected by ``lang_index``.

        Three groups of entries are produced:

        * per dataset column: the plain text fields and, when both languages
          list the same number of tags, the munged tags pairwise;
        * the organization name/description for every non-``de`` language;
        * the group names for every non-``de`` language.

        :param lang_index: index into ``self.LANG_CODES`` and into the
            columns returned by ``self._get_col_dict_array``
        :param file_path: path handed through to ``self._get_col_dict_array``

        :returns: list of dicts with the keys ``lang_code``, ``term`` and
            ``term_translation``
        :raises: any exception is logged and re-raised unchanged
        '''
        try:
            translations = []

            de_cols = self._get_col_dict_array(0, file_path)
            other_cols = self._get_col_dict_array(lang_index, file_path)

            log.debug(de_cols)
            log.debug(other_cols)

            keys = ['title', 'notes', 'author', 'maintainer', 'license_id']

            for col_idx, de_col in enumerate(de_cols):
                # Index instead of zip() so that a shorter translated column
                # list still fails loudly rather than being truncated.
                other_col = other_cols[col_idx]
                lang_code = self.LANG_CODES[lang_index]

                for key in keys:
                    translations.append({
                        'lang_code': lang_code,
                        'term': de_col[key],
                        'term_translation': other_col[key]
                    })

                de_tags = de_col['tags'].split(u', ')
                other_tags = other_col['tags'].split(u', ')

                # Tags can only be paired up when both languages list the
                # same number of them; otherwise they are skipped.
                if len(de_tags) == len(other_tags):
                    for de_tag, other_tag in zip(de_tags, other_tags):
                        translations.append({
                            'lang_code': lang_code,
                            'term': munge_tag(de_tag),
                            'term_translation': munge_tag(other_tag)
                        })

            for lang, org in self.ORGANIZATION.items():
                if lang != 'de':
                    for field in ['name', 'description']:
                        translations.append({
                            'lang_code': lang,
                            'term': self.ORGANIZATION['de'][field],
                            'term_translation': org[field]
                        })

            # .items() (not the Python-2-only .iteritems()) keeps this loop
            # consistent with the ORGANIZATION loop above.
            for lang, groups in self.GROUPS.items():
                if lang != u'de':
                    for idx, group in enumerate(groups):
                        translations.append({
                            'lang_code': lang,
                            'term': self.GROUPS[u'de'][idx],
                            'term_translation': group
                        })

            return translations

        except Exception as e:
            log.exception(e)
            raise
Ejemplo n.º 4
0
 def _generate_tag_translations(self, lang, tags, orig_tags):
     """Pair every non-empty tag with the original tag at the same position.

     Both sides are passed through ``munge_tag``; falsy entries in ``tags``
     are skipped.
     """
     return [
         {
             'lang_code': lang,
             'term': munge_tag(orig_tags[position]),
             'term_translation': munge_tag(translated),
         }
         for position, translated in enumerate(tags)
         if translated
     ]
Ejemplo n.º 5
0
    def parse_set(self, dataset):
        '''
        Turn one dataset element (including its resources) into a metadata
        dict and return it
        '''
        log.debug('parsing dataset')

        attributes_node = dataset.find('dataset_attributes')

        # Start with the id and add the 'de' text of every known attribute.
        metadata = {'id': dataset.get('id')}
        metadata.update(
            (attr, attributes_node.find(attr).find('de').text)
            for attr in self.DATASET_ATTRIBUTES
        )

        log.debug(metadata)

        # Resources and translations are only attached to named datasets.
        if 'name' in metadata:
            munged_name = munge_tag(metadata['name'])
            metadata['name'] = munged_name
            metadata['resources'] = self._build_resources_list(dataset)
            metadata['translations'] = self._build_term_translations(dataset)

        log.debug(metadata)

        return metadata
Ejemplo n.º 6
0
    def _generate_term_translations(self, base_data, dataset):
        '''
        Collect the term translations for every translated entry of a dataset

        Entries whose title element equals the one of ``base_data`` are
        skipped; for all others groups, munged tags, a fixed set of text
        fields and the organization details are translated.
        '''
        lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
        text_fields = ['title', 'author', 'maintainer', 'description']
        translations = []

        for data in dataset:
            if base_data.find('title') != data.find('title'):
                lang = data.get(lang_attr)

                base_groups = self._get_data_groups(base_data)
                groups = self._get_data_groups(data)
                for base_group, group in zip(base_groups, groups):
                    entry = {
                        'lang_code': lang,
                        'term': base_group,
                        'term_translation': group
                    }
                    translations.append(entry)

                base_tags = self._generate_tags_array(base_data)
                tags = self._generate_tags_array(data)
                for base_tag, tag in zip(base_tags, tags):
                    entry = {
                        'lang_code': lang,
                        'term': munge_tag(base_tag),
                        'term_translation': munge_tag(tag)
                    }
                    translations.append(entry)

                for key in text_fields:
                    base_field = base_data.find(key)
                    field = data.find(key)
                    # Only fields present on both sides can be translated.
                    if base_field is None or field is None:
                        continue
                    entry = {
                        'lang_code': lang,
                        'term': base_field.text,
                        'term_translation': field.text
                    }
                    translations.append(entry)

                base_org = self.ORGANIZATION[u'de'] if False else None
                for org_lang, org in self.ORGANIZATION.items():
                    if org_lang == u'de':
                        continue
                    for field in ['name', 'description']:
                        entry = {
                            'lang_code': org_lang,
                            'term': self.ORGANIZATION[u'de'][field],
                            'term_translation': org[field]
                        }
                        translations.append(entry)

        return translations
Ejemplo n.º 7
0
    def _generate_term_translations(self, base_data, dataset):
        '''
        Return the term translations of all translated entries of a dataset
        '''
        translations = []

        def record(lang_code, term, term_translation):
            # Append a single translation entry in the expected dict layout.
            translations.append({
                'lang_code': lang_code,
                'term': term,
                'term_translation': term_translation
            })

        for data in dataset:
            if base_data.find('title') != data.find('title'):
                lang = data.get('{http://www.w3.org/XML/1998/namespace}lang')

                # Groups are paired by position.
                group_pairs = zip(
                    self._get_data_groups(base_data),
                    self._get_data_groups(data))
                for base_group, group in group_pairs:
                    record(lang, base_group, group)

                # Tags are paired by position and munged on both sides.
                tag_pairs = zip(
                    self._generate_tags_array(base_data),
                    self._generate_tags_array(data))
                for base_tag, tag in tag_pairs:
                    record(lang, munge_tag(base_tag), munge_tag(tag))

                # Plain text fields, only when present on both sides.
                for key in ['title', 'author', 'maintainer', 'description']:
                    base_node = base_data.find(key)
                    node = data.find(key)
                    if base_node is not None and node is not None:
                        record(lang, base_node.text, node.text)

                # Organization details for every language except German.
                for org_lang, org in self.ORGANIZATION.items():
                    if org_lang != u'de':
                        for field in ['name', 'description']:
                            record(
                                org_lang,
                                self.ORGANIZATION[u'de'][field],
                                org[field])

        return translations
Ejemplo n.º 8
0
    def gather_stage(self, harvest_job):
        '''
        Fetch the metadata file and queue one HarvestObject per dataset.

        :param harvest_job: the job the created HarvestObjects are attached to

        :returns: list with the ids of the saved HarvestObjects, or ``False``
            when any step raised an exception (the exception is logged)
        '''
        log.debug('In FOPHHarvester gather_stage')
        try:
            file_path = self._fetch_metadata_file()
            ids = []

            de_cols = self._get_col_dict_array(0, file_path)

            # The term translations depend only on the metadata file, not on
            # the individual dataset, so build them once instead of once per
            # dataset.
            translations = []
            if de_cols:
                for lang_index in (1, 2, 3):  # fr, it, en
                    translations.extend(
                        self._generate_term_translations(
                            lang_index, file_path))
                log.debug(translations)

            for col in de_cols:
                # Construct the metadata dict for the dataset on CKAN
                metadata = {
                    'datasetID': col[u'id'],
                    'title': col[u'title'],
                    'url': col[u'url'],
                    'notes': col[u'notes'],
                    'author': col[u'author'],
                    'author_email': col[u'author_email'],
                    'maintainer': col[u'maintainer'],
                    'maintainer_email': col[u'maintainer_email'],
                    'license_id': col[u'license_id'].lower(),
                    'version': col[u'version'],
                    'translations': translations,
                    'tags': [
                        munge_tag(tag) for tag in col[u'tags'].split(u', ')
                    ]
                }

                metadata['resources'] = self._generate_resources_dict_array(
                    col[u'id'])
                # The dataset version is copied onto the first resource.
                metadata['resources'][0]['version'] = col[u'version']
                log.debug(metadata['resources'])

                obj = HarvestObject(
                    guid=self._create_uuid(col[u'id']),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + col[u'id'] + ' to the queue')
                ids.append(obj.id)

            log.debug(de_cols)
        except Exception:
            # Keep the best-effort contract (return False) but leave a trace
            # of what went wrong instead of swallowing it silently.
            log.exception('Gather stage failed')
            return False
        return ids
    def infer_tags(self, values):
        '''
        Derive the tag and keyword lists from the first entry of
        ``values['keywords']`` and store them back into ``values``.

        Every keyword that does not consist solely of word characters,
        spaces, dots and dashes is passed through ``munge_tag``; duplicates
        are dropped while the original order is kept. ``values['tags']``
        receives the theme keywords.

        When ``values`` has no (or an empty) ``keywords`` entry, all five
        result lists are set to empty lists.

        :param values: dict that is updated in place with the keys ``tags``,
            ``theme-keywords``, ``place-keywords``, ``stratum-keywords`` and
            ``temporal-keywords``
        '''
        valid_keyword = re.compile(r'^[\w .-]+$')

        keywords = values.get('keywords', [])
        # Fall back to an empty mapping so missing keywords yield empty lists
        # instead of an UnboundLocalError.
        key = keywords[0] if len(keywords) else {}

        def _collect(keyword_type):
            # Clean and de-duplicate the keywords of one type, keeping order.
            collected = []
            for keyword in key.get(keyword_type, []):
                if valid_keyword.match(keyword) is None:
                    keyword = munge_tag(keyword)
                if keyword not in collected:
                    collected.append(keyword)
            return collected

        theme_keywords = _collect('theme-keyword')
        place_keywords = _collect('place-keyword')
        stratum_keywords = _collect('stratum-keyword')
        temporal_keywords = _collect('temporal-keyword')

        # Tags mirror the theme keywords but live in their own list object.
        values['tags'] = list(theme_keywords)
        values['theme-keywords'] = theme_keywords
        values['place-keywords'] = place_keywords
        values['stratum-keywords'] = stratum_keywords
        values['temporal-keywords'] = temporal_keywords
Ejemplo n.º 10
0
    def get_package_dict(self, iso_values, harvest_object):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        Extensions willing to modify the dict should do so implementing the
        ISpatialHarvester interface

            import ckan.plugins as p
            from ckanext.spatial.interfaces import ISpatialHarvester

            class MyHarvester(p.SingletonPlugin):

                p.implements(ISpatialHarvester, inherit=True)

                def get_package_dict(self, context, data_dict):

                    package_dict = data_dict['package_dict']

                    package_dict['extras'].append(
                        {'key': 'my-custom-extra', 'value': 'my-custom-value'}
                    )

                    return package_dict

        If a dict is not returned by this function, the import stage will be cancelled.

        :param iso_values: Dictionary with parsed values from the ISO 19139
            XML document
        :type iso_values: dict
        :param harvest_object: HarvestObject domain object (with access to
            job and source objects)
        :type harvest_object: HarvestObject

        :returns: A dataset dictionary (package_dict)
        :rtype: dict
        :raises Exception: if no unique package name can be generated from
            either the title or the GUID
        '''

        tags = []

        if 'tags' in iso_values:
            # Either clean the tags or just cut them down to 100 characters
            do_clean = self.source_config.get('clean_tags')
            tags_val = [
                munge_tag(tag) if do_clean else tag[:100]
                for tag in iso_values['tags']
            ]
            tags = [{'name': tag} for tag in tags_val]

        # Add default_tags from config
        default_tags = self.source_config.get('default_tags', [])
        if default_tags:
            for tag in default_tags:
                tags.append({'name': tag})

        package_dict = {
            'title': iso_values['title'],
            'notes': iso_values['abstract'],
            'tags': tags,
            'resources': [],
        }

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        # Package name: only generate a new one for new packages or when the
        # title changed, otherwise keep the existing name
        package = harvest_object.package
        if package is None or package.title != iso_values['title']:
            name = self._gen_new_name(iso_values['title'])
            if not name:
                name = self._gen_new_name(six.text_type(iso_values['guid']))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. '
                    'Please choose a more unique title.')
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        extras = {
            'guid': harvest_object.guid,
            'spatial_harvester': True,
        }

        # Just add some of the metadata as extras, not the whole lot
        for name in [
                # Essentials
                'spatial-reference-system',
                'guid',
                # Usefuls
                'dataset-reference-date',
                'metadata-language',  # Language
                'metadata-date',  # Released
                'coupled-resource',
                'contact-email',
                'frequency-of-update',
                'spatial-data-service-type',
        ]:
            extras[name] = iso_values[name]

        # For list-valued fields only the first entry is stored
        if len(iso_values.get('progress', [])):
            extras['progress'] = iso_values['progress'][0]
        else:
            extras['progress'] = ''

        if len(iso_values.get('resource-type', [])):
            extras['resource-type'] = iso_values['resource-type'][0]
        else:
            extras['resource-type'] = ''

        extras['licence'] = iso_values.get('use-constraints', '')

        def _extract_first_license_url(licences):
            # Return the first entry that parses as a URL with scheme and host
            for licence in licences:
                o = urlparse(licence)
                if o.scheme and o.netloc:
                    return licence
            return None

        if len(extras['licence']):
            license_url_extracted = _extract_first_license_url(
                extras['licence'])
            if license_url_extracted:
                extras['licence_url'] = license_url_extracted

        # Metadata license ID check for package
        use_constraints = iso_values.get('use-constraints')
        if use_constraints:

            context = {
                'model': model,
                'session': model.Session,
                'user': self._get_user_name()
            }
            license_list = p.toolkit.get_action('license_list')(context, {})

            # The first constraint matching a registered license (by id or
            # by URL) decides the license_id of the package
            for constraint in use_constraints:
                package_license = None

                for license in license_list:
                    if constraint.lower() == license.get(
                            'id') or constraint == license.get('url'):
                        package_license = license.get('id')
                        break

                if package_license:
                    package_dict['license_id'] = package_license
                    break

        extras['access_constraints'] = iso_values.get(
            'limitations-on-public-access', '')

        # Graphic preview
        browse_graphic = iso_values.get('browse-graphic')
        if browse_graphic:
            browse_graphic = browse_graphic[0]
            extras['graphic-preview-file'] = browse_graphic.get('file')
            if browse_graphic.get('description'):
                extras['graphic-preview-description'] = browse_graphic.get(
                    'description')
            if browse_graphic.get('type'):
                extras['graphic-preview-type'] = browse_graphic.get('type')

        for key in ['temporal-extent-begin', 'temporal-extent-end']:
            if len(iso_values[key]) > 0:
                extras[key] = iso_values[key][0]

        # Save responsible organization roles
        if iso_values['responsible-organisation']:
            # Maps organisation name -> list of distinct roles
            parties = {}
            for party in iso_values['responsible-organisation']:
                if party['organisation-name'] in parties:
                    if not party['role'] in parties[
                            party['organisation-name']]:
                        parties[party['organisation-name']].append(
                            party['role'])
                else:
                    parties[party['organisation-name']] = [party['role']]
            extras['responsible-party'] = [{
                'name': k,
                'roles': v
            } for k, v in parties.items()]

        if len(iso_values['bbox']) > 0:
            bbox = iso_values['bbox'][0]
            extras['bbox-east-long'] = bbox['east']
            extras['bbox-north-lat'] = bbox['north']
            extras['bbox-south-lat'] = bbox['south']
            extras['bbox-west-long'] = bbox['west']

            try:
                xmin = float(bbox['west'])
                xmax = float(bbox['east'])
                ymin = float(bbox['south'])
                ymax = float(bbox['north'])
            except ValueError as e:
                self._save_object_error(
                    'Error parsing bounding box value: {0}'.format(
                        six.text_type(e)), harvest_object, 'Import')
            else:
                # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry

                # Some publishers define the same two corners for the bbox (ie a point),
                # that causes problems in the search if stored as polygon
                if xmin == xmax or ymin == ymax:
                    extent_string = Template(
                        '{"type": "Point", "coordinates": [$x, $y]}'
                    ).substitute(x=xmin, y=ymin)
                    self._save_object_error(
                        'Point extent defined instead of polygon',
                        harvest_object, 'Import')
                else:
                    extent_string = self.extent_template.substitute(xmin=xmin,
                                                                    ymin=ymin,
                                                                    xmax=xmax,
                                                                    ymax=ymax)

                extras['spatial'] = extent_string.strip()
        else:
            log.debug('No spatial extent defined for this object')

        resource_locators = iso_values.get('resource-locator', []) +\
            iso_values.get('resource-locator-identification', [])

        if len(resource_locators):
            # Config values may arrive as strings ('true'/'false'), so they
            # have to be converted before being used as a boolean
            validate_wms = p.toolkit.asbool(
                config.get('ckanext.spatial.harvest.validate_wms', False))

            for resource_locator in resource_locators:
                url = resource_locator.get('url', '').strip()
                if url:
                    resource = {}
                    resource['format'] = guess_resource_format(url)
                    if resource['format'] == 'wms' and validate_wms:
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now(
                            ).isoformat()

                    resource.update({
                        'url':
                        url,
                        'name':
                        resource_locator.get('name')
                        or p.toolkit._('Unnamed resource'),
                        'description':
                        resource_locator.get('description') or '',
                        'resource_locator_protocol':
                        resource_locator.get('protocol') or '',
                        'resource_locator_function':
                        resource_locator.get('function') or '',
                    })
                    package_dict['resources'].append(resource)

        # Add default_extras from config
        default_extras = self.source_config.get('default_extras', {})
        if default_extras:
            override_extras = self.source_config.get('override_extras', False)
            for key, value in default_extras.items():
                log.debug('Processing extra %s', key)
                if key not in extras or override_extras:
                    # Look for replacement strings
                    if isinstance(value, six.string_types):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.url.
                            strip('/'),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id)
                    extras[key] = value

        # Extras are stored as a list of key/value dicts; structured values
        # are serialized to JSON
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, (list, dict)):
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})
            else:
                extras_as_dict.append({'key': key, 'value': value})

        package_dict['extras'] = extras_as_dict

        return package_dict
Ejemplo n.º 11
0
 def test_munge_tag(self):
     """Munging a list of tags gives the expected results."""
     for original, expected in self.munge_list:
         assert_equal(munge_tag(original), expected)
Ejemplo n.º 12
0
 def test_munge_tag(self):
     '''Munging a list of tags gives the expected results.'''
     for original, expected in self.munge_list:
         munged = munge_tag(original)
         assert munged == expected
Ejemplo n.º 13
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        '''
        Fill ``dataset_dict`` with the values found in the graph for
        ``dataset_ref`` and return it.

        Basic fields and tags are set directly on the dict, everything else
        is appended to ``dataset_dict['extras']`` as key/value dicts, and
        every distribution becomes an entry in ``dataset_dict['resources']``.
        Both ``extras`` and ``resources`` are reset to empty lists first.

        :param dataset_dict: dict to populate (modified in place)
        :param dataset_ref: graph node of the dataset to read from

        :returns: the populated ``dataset_dict``
        '''

        dataset_dict['extras'] = []
        dataset_dict['resources'] = []

        # Basic fields
        for key, predicate in (
                ('title', DCT.title),
                ('notes', DCT.description),
                ('url', DCAT.landingPage),
                ('version', OWL.versionInfo),
                ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict[key] = value

        if not dataset_dict.get('version'):
            # adms:version was supported on the first version of the DCAT-AP
            value = self._object_value(dataset_ref, ADMS.version)
            if value:
                dataset_dict['version'] = value

        # Tags
        keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
        # Split keywords with commas
        keywords_with_commas = [k for k in keywords if ',' in k]
        for keyword in keywords_with_commas:
            keywords.remove(keyword)
            keywords.extend([k.strip() for k in keyword.split(',')])

        # replace munge_tag to noop if there's no need to clean tags
        do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
        tags_val = [munge_tag(tag) if do_clean else tag for tag in keywords]
        tags = [{'name': tag} for tag in tags_val]
        dataset_dict['tags'] = tags

        # Extras

        #  Simple values
        for key, predicate in (
                ('issued', DCT.issued),
                ('modified', DCT.modified),
                ('identifier', DCT.identifier),
                ('version_notes', ADMS.versionNotes),
                ('frequency', DCT.accrualPeriodicity),
                ('access_rights', DCT.accessRights),
                ('provenance', DCT.provenance),
                ('dcat_type', DCT.type),
                ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict['extras'].append({'key': key, 'value': value})

        #  Lists (stored JSON-encoded)
        for key, predicate, in (
                ('language', DCT.language),
                ('theme', DCAT.theme),
                ('alternate_identifier', ADMS.identifier),
                ('conforms_to', DCT.conformsTo),
                ('documentation', FOAF.page),
                ('related_resource', DCT.relation),
                ('has_version', DCT.hasVersion),
                ('is_version_of', DCT.isVersionOf),
                ('source', DCT.source),
                ('sample', ADMS.sample),
                ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                dataset_dict['extras'].append({'key': key,
                                               'value': json.dumps(values)})

        # Contact details
        contact = self._contact_details(dataset_ref, DCAT.contactPoint)
        if not contact:
            # adms:contactPoint was supported on the first version of DCAT-AP
            contact = self._contact_details(dataset_ref, ADMS.contactPoint)

        if contact:
            for key in ('uri', 'name', 'email'):
                if contact.get(key):
                    dataset_dict['extras'].append(
                        {'key': 'contact_{0}'.format(key),
                         'value': contact.get(key)})

        # Publisher
        publisher = self._publisher(dataset_ref, DCT.publisher)
        for key in ('uri', 'name', 'email', 'url', 'type'):
            if publisher.get(key):
                dataset_dict['extras'].append(
                    {'key': 'publisher_{0}'.format(key),
                     'value': publisher.get(key)})

        # Temporal
        start, end = self._time_interval(dataset_ref, DCT.temporal)
        if start:
            dataset_dict['extras'].append(
                {'key': 'temporal_start', 'value': start})
        if end:
            dataset_dict['extras'].append(
                {'key': 'temporal_end', 'value': end})

        # Spatial (the geometry is stored under the plain 'spatial' key)
        spatial = self._spatial(dataset_ref, DCT.spatial)
        for key in ('uri', 'text', 'geom'):
            if spatial.get(key):
                dataset_dict['extras'].append(
                    {'key': 'spatial_{0}'.format(key) if key != 'geom' else 'spatial',
                     'value': spatial.get(key)})

        # Dataset URI (explicitly show the missing ones)
        dataset_uri = (unicode(dataset_ref)
                       if isinstance(dataset_ref, rdflib.term.URIRef)
                       else '')
        dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

        # License
        if 'license_id' not in dataset_dict:
            dataset_dict['license_id'] = self._license(dataset_ref)

        # Source Catalog
        if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)):
            catalog_src = self._get_source_catalog(dataset_ref)
            if catalog_src is not None:
                src_data = self._extract_catalog_dict(catalog_src)
                dataset_dict['extras'].extend(src_data)

        # Config values may arrive as strings ('true'/'false'), so convert
        # the flag like the other config switches above. It does not depend
        # on the distribution, hence it is read once before the loop.
        normalize_ckan_format = toolkit.asbool(config.get(
            'ckanext.dcat.normalize_ckan_format', True))

        # Resources
        for distribution in self._distributions(dataset_ref):

            resource_dict = {}

            #  Simple values
            for key, predicate in (
                    ('name', DCT.title),
                    ('description', DCT.description),
                    ('download_url', DCAT.downloadURL),
                    ('issued', DCT.issued),
                    ('modified', DCT.modified),
                    ('status', ADMS.status),
                    ('rights', DCT.rights),
                    ('license', DCT.license),
                    ):
                value = self._object_value(distribution, predicate)
                if value:
                    resource_dict[key] = value

            # The access URL takes precedence over the download URL
            resource_dict['url'] = (self._object_value(distribution,
                                                       DCAT.accessURL) or
                                    self._object_value(distribution,
                                                       DCAT.downloadURL))
            #  Lists
            for key, predicate in (
                    ('language', DCT.language),
                    ('documentation', FOAF.page),
                    ('conforms_to', DCT.conformsTo),
                    ):
                values = self._object_value_list(distribution, predicate)
                if values:
                    resource_dict[key] = json.dumps(values)

            # Format and media type
            imt, label = self._distribution_format(distribution,
                                                   normalize_ckan_format)

            if imt:
                resource_dict['mimetype'] = imt

            # Prefer the label as format, fall back to the media type
            if label:
                resource_dict['format'] = label
            elif imt:
                resource_dict['format'] = imt

            # Size
            size = self._object_value_int(distribution, DCAT.byteSize)
            if size is not None:
                resource_dict['size'] = size

            # Checksum
            for checksum in self.g.objects(distribution, SPDX.checksum):
                algorithm = self._object_value(checksum, SPDX.algorithm)
                checksum_value = self._object_value(checksum, SPDX.checksumValue)
                if algorithm:
                    resource_dict['hash_algorithm'] = algorithm
                if checksum_value:
                    resource_dict['hash'] = checksum_value

            # Distribution URI (explicitly show the missing ones)
            resource_dict['uri'] = (unicode(distribution)
                                    if isinstance(distribution,
                                                  rdflib.term.URIRef)
                                    else '')

            dataset_dict['resources'].append(resource_dict)

        if self.compatibility_mode:
            # Tweak the resulting dict to make it compatible with previous
            # versions of the ckanext-dcat parsers
            for extra in dataset_dict['extras']:
                if extra['key'] in ('issued', 'modified', 'publisher_name',
                                    'publisher_email',):

                    extra['key'] = 'dcat_' + extra['key']

                if extra['key'] == 'language':
                    extra['value'] = ','.join(
                        sorted(json.loads(extra['value'])))

        return dataset_dict
Ejemplo n.º 14
0
    def _build_term_translations(self, dataset):
        """
        Generate meaningful term translations for all translated values

        The ``de`` text of every dataset and resource attribute is the term;
        its ``fr``, ``it`` and ``en`` texts are the translations. Tags are
        split on commas and translated one by one (munged on both sides),
        but only when both languages contain the same number of tags. All
        other values are only recorded when the translation differs from
        the term.

        :param dataset: dataset element holding a ``dataset_attributes``
            child and any number of ``resource`` children

        :returns: list of dicts with the keys ``lang_code``, ``term`` and
            ``term_translation``
        """
        translations = []
        langs = ['fr', 'it', 'en']

        dataset_attrs = dataset.find('dataset_attributes')
        for attr in self.DATASET_ATTRIBUTES:
            attr_node = dataset_attrs.find(attr)
            term = attr_node.find('de').text
            log.debug('Create translation for %s' % term)
            if attr == 'tags':
                # Tags are split and translated individually. The German
                # tags are split once, up front, and the loop below uses its
                # own variable names so that `term` is never overwritten
                # between languages.
                split_term = self._clean_values(term.split(','))
                for lang in langs:
                    trans = attr_node.find(lang).text
                    split_trans = self._clean_values(trans.split(','))

                    if len(split_term) == len(split_trans):
                        for tag, tag_trans in zip(split_term, split_trans):
                            log.debug(
                                'Term (tag): %s, Translation (%s): %s'
                                % (tag, lang, tag_trans)
                            )
                            translations.append({
                                u'lang_code': lang,
                                u'term': munge_tag(tag),
                                u'term_translation': munge_tag(tag_trans)
                            })
            else:
                for lang in langs:
                    trans = attr_node.find(lang).text
                    if term != trans:
                        log.debug(
                            'Term: %s, Translation (%s): %s'
                            % (term, lang, trans)
                        )
                        translations.append({
                            u'lang_code': lang,
                            u'term': term,
                            u'term_translation': trans
                        })
        resources = dataset.findall('resource')
        for resource in resources:
            # Every attribute of a resource lives under the same node.
            res_attr = resource.find('resource_attributes')
            for attr in self.RESOURCE_ATTRIBUTES:
                attr_node = res_attr.find(attr)
                term = attr_node.find('de').text
                log.debug('Create translation for %s' % term)
                for lang in langs:
                    trans = attr_node.find(lang).text
                    if term != trans:
                        log.debug(
                            'Term: %s, Translation (%s): %s'
                            % (term, lang, trans)
                        )
                        translations.append({
                            u'lang_code': lang,
                            u'term': term,
                            u'term_translation': trans
                        })
        return translations