Ejemplo n.º 1
0
    def _get_facet_item_label_with_translation(self, dataset_facet_field, default_facet_label):
        '''
        Translate the default label of a facet item using the scheming schema.

        Falls back to `self._translate_facet_item_label` when the facet
        declares its own facet items, when no schema is found, when the schema
        has no usable field list, or when no schema field matches the facet.

        :param dataset_facet_field: the name of facet field in the dataset
        :param default_facet_label: the default label of the facet item; when
            it is None the label of the facet field itself is looked up
        :returns: the choice label / translated field label, or
            `default_facet_label` when the matching field carries no label
        '''
        from ckanext.scheming import helpers as scheming_helpers
        package_type = self._get_dataset_type_of_facet(dataset_facet_field)
        schema = scheming_helpers.scheming_get_dataset_schema(package_type)

        # If a facet has both `facet_items` and `dataset_type`, `facet_items` wins
        facet_items = self._get_facet_items_of_facet(
            dataset_facet_field, self.additional_facets)
        if facet_items is not None or schema is None:
            return self._translate_facet_item_label(dataset_facet_field, default_facet_label)

        # Remove the prefixes from the facet name.
        # NOTE(review): once 'extras_' has been stripped, 'res_extras_' can
        # normally no longer occur, so the second replace is usually a no-op;
        # kept as-is to preserve the existing field matching - confirm intent.
        schema_name = dataset_facet_field.replace('extras_', '')
        schema_name = schema_name.replace('res_extras_', '')

        # Switch between resource and dataset fields
        if schema_name.startswith('res_') and 'resource_fields' in schema:
            fields_from_schema = schema['resource_fields']
        elif 'dataset_fields' in schema:
            fields_from_schema = schema['dataset_fields']
        else:
            return self._translate_facet_item_label(dataset_facet_field, default_facet_label)

        for field in fields_from_schema:
            if field['field_name'] != schema_name:
                continue

            # An item key was given (see facet_list.html): resolve it through
            # the field choices, if the field has any.
            if default_facet_label is not None:
                if 'choices' in field:
                    return scheming_helpers.scheming_choices_label(
                        field['choices'], default_facet_label)
                if 'choices_helper' in field:
                    from ckantoolkit import h
                    choices_fn = getattr(h, field['choices_helper'])
                    return scheming_helpers.scheming_choices_label(
                        choices_fn(field), default_facet_label)
                return default_facet_label

            # No item key: return the (possibly translated) label of the field.
            label = field.get('label')
            if label is None:
                # Checked before len(): len(None) used to raise TypeError here.
                return default_facet_label
            # NOTE(review): a dict with a single language is returned as-is,
            # as before (`len(...) > 1`) - confirm this is intended.
            if isinstance(label, dict) and len(label) > 1:
                language = scheming_helpers.lang()
                if language in label:
                    translated = label[language]
                    if translated is not None:
                        return translated
                    return default_facet_label
            return label

        return self._translate_facet_item_label(dataset_facet_field, default_facet_label)
Ejemplo n.º 2
0
    def before_index(self, data_dict):
        """Add derived search fields to the dict about to be indexed.

        Depending on the keys present in `data_dict`, this fills in:

        * ``res_format_label``: the schema label of every value in
          ``res_format`` that has one;
        * ``frequency_id`` / ``frequency_label``: built from the JSON encoded
          ``frequency`` value (its ``type`` and ``value`` entries);
        * ``theme`` / ``theme_id`` / ``theme_es`` / ``theme_gl``: the theme
          keys, their ids and the 'es' and 'gl' labels of the matching schema
          choices.

        :param data_dict: the dict being indexed; it is modified in place
        :returns: the same `data_dict`
        """
        schema = sh.scheming_get_schema('dataset', 'dataset')

        if 'res_format' in data_dict:
            # Schema definition of the resource `format` field
            format_field = sh.scheming_field_by_name(
                schema.get('resource_fields'), 'format')

            # New SOLR field holding the labels
            format_labels = []
            data_dict['res_format_label'] = format_labels
            for format_value in data_dict['res_format']:
                format_label = sh.scheming_choices_label(
                    format_field['choices'], format_value)
                if format_label:
                    format_labels.append(format_label)

        raw_frequency = data_dict.get('frequency')
        if raw_frequency:
            parsed = json.loads(raw_frequency)
            ftype, fvalue = parsed['type'], parsed['value']
            data_dict['frequency_id'] = '{value}-{type}'.format(
                type=ftype, value=fvalue)
            data_dict['frequency_label'] = \
                helpers.csc_dataset_display_frequency(fvalue, ftype)

        if 'theme' in data_dict:
            # Schema definition of the dataset `theme` field
            theme_field = sh.scheming_field_by_name(
                schema.get('dataset_fields'), 'theme')

            # Keep the stored value, then reset every theme related field
            raw_themes = data_dict['theme']
            for key in ('theme', 'theme_id', 'theme_es', 'theme_gl'):
                data_dict[key] = []

            # The stored value looks like '["a", "b"]': drop the brackets and
            # split on the quoted separator to recover the keys.
            unbracketed = raw_themes.replace('[', '').replace(']', '')
            for chunk in unbracketed.split('", "'):
                term = chunk.replace('"', '')
                data_dict['theme'].append(term)
                data_dict['theme_id'].append(helpers.csc_theme_id(term))
                # Collect the label of every schema choice matching the key
                for option in theme_field.get('choices'):
                    if option['value'] != term:
                        continue
                    data_dict['theme_es'].append(option['label']['es'])
                    data_dict['theme_gl'].append(option['label']['gl'])

        return data_dict
Ejemplo n.º 3
0
    def after_search(self, search_results, search_params):
        """Post-process search results: label the sector facet, hide 'tags'.

        Every item of the ``aafc_sector`` facet gets its ``display_name``
        replaced by the label of the matching choice of the ``aafc_sector``
        scheming preset. The 'tags' entry is then removed from
        ``c.facet_titles``.

        :param search_results: the search result dict; modified in place
        :param search_params: the search parameters (unused)
        :returns: the same `search_results`
        """
        facets = search_results.get('search_facets')
        if not facets:
            # Nothing to relabel; skip the preset lookup entirely.
            return search_results

        if 'aafc_sector' in facets:
            preset = sh.scheming_get_preset("aafc_sector")
            choices = sh.scheming_field_choices(preset)
            for item in facets['aafc_sector']['items']:
                item['display_name'] = sh.scheming_choices_label(
                    choices, item['name'])

        try:
            # A default avoids a KeyError when no 'tags' title is registered.
            c.facet_titles.pop('tags', None)
        except (AttributeError, RuntimeError):
            # Best effort, as in the original: these errors from `c` are
            # deliberately ignored (presumably `c` is not always usable here).
            pass

        return search_results
Ejemplo n.º 4
0
    def metadata_download(self, package_id):
        """Return the metadata of a package as CSV text.

        The package is fetched with the ``package_show`` action; a 404 is
        raised through `abort` when it is not found or not authorized. The CSV
        has a ``Field,Value`` header followed by one row per field of the
        "dataset" scheming schema. When `response` exposes headers, they are
        set so the output is served as a CSV attachment.

        :param package_id: id passed to ``package_show``; also used in the
            attachment file name
        :returns: the CSV content as a string
        """
        context = {
            'model': model,
            'session': model.Session,
            'user': p.toolkit.c.user
        }
        try:
            result = get_action('package_show')(context, {'id': package_id})
        except (ObjectNotFound, NotAuthorized):
            abort(404, _('Package not found'))

        schema = helpers.scheming_get_dataset_schema("dataset")
        dataset_fields = schema['dataset_fields']

        if hasattr(response, u'headers'):
            disposition = 'attachment; filename="{name}-metadata.csv"'.format(
                name=package_id)
            response.headers['Content-Type'] = 'text/csv'
            response.headers['Content-disposition'] = disposition

        buf = StringIO.StringIO()
        writer = csv.writer(buf, encoding='utf-8')
        writer.writerow(['Field', 'Value'])

        for field in dataset_fields:
            field_name = field['field_name']

            if field_name == 'tag_string':
                tags = self.get_package_tags(result.get('tags'))
                row = [helpers.scheming_language_text(field['label']), tags]
            elif field_name == 'owner_org':
                # The row heading comes from configuration, not the schema
                org_alias = str(
                    config.get('ckan.organization_alias', 'Organization'))
                row = [org_alias, result['organization']['title']]
            elif field_name == 'groups':
                # Configured alias, pluralised with a trailing 's'
                group_alias = str(config.get('ckan.group_alias',
                                             'Group')) + 's'
                row = [group_alias,
                       self.get_package_groups(result.get('groups'))]
            else:
                choices = helpers.scheming_field_choices(field)
                if choices:
                    # Choice fields show the label of the stored value
                    chosen = helpers.scheming_choices_label(
                        choices, result.get(field_name))
                    row = [helpers.scheming_language_text(field['label']),
                           chosen]
                else:
                    row = [helpers.scheming_language_text(field['label']),
                           result.get(field_name)]

            writer.writerow(row)

        return buf.getvalue()
Ejemplo n.º 5
0
def dge_list_themes(themes=None):
    '''
    Given a list of theme values, get their translated labels.

    Themes without a label in the schema are left out of the result.

    :param themes: theme values; None or an empty list yields an empty result
    :type string list

    :returns: (theme id, label) pairs, in the order of `themes`
    :rtype (string, string) list
    '''
    # The default (None) used to be iterated directly, raising TypeError.
    if not themes:
        return []

    dataset = sh.scheming_get_schema('dataset', 'dataset')
    theme_field = sh.scheming_field_by_name(dataset.get('dataset_fields'),
                                            'theme')
    choices = theme_field['choices']

    label_list = []
    for theme in themes:
        label = sh.scheming_choices_label(choices, theme)
        if label:
            label_list.append((dge_theme_id(theme), label))
    return label_list
Ejemplo n.º 6
0
def dge_resource_format_label(res_format=None):
    '''
    Given a resource format, get its label from the dataset schema.

    :param res_format: format value
    :type string

    :returns: the label of the format, or `res_format` unchanged when it is
        empty/None or has no label in the schema
    :rtype string
    '''
    # Test the argument: the previous `if format:` checked the builtin
    # `format`, which is always truthy, so the guard never applied.
    if res_format:
        dataset = sh.scheming_get_schema('dataset', 'dataset')
        formats = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                            'format')
        res_format_label = sh.scheming_choices_label(formats['choices'],
                                                     res_format)
        if res_format_label:
            return res_format_label
    return res_format
Ejemplo n.º 7
0
def dge_list_reduce_resource_format_label(resources=None, field_name='format'):
    '''
    Given a resource list, get the labels of their formats.

    The values of `field_name` are collected with `h.dict_list_reduce` and
    each one is translated to its label in the dataset schema; values without
    a label are left out.

    :param resources: resource dicts; None or an empty list yields an empty
        result
    :type dict list

    :param field_name: field_name of resource
    :type string

    :returns: labels of the formats found in `resources`
    :rtype string list
    '''
    # The default (None) used to be handed straight to dict_list_reduce.
    if not resources:
        return []

    format_list = h.dict_list_reduce(resources, field_name)
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    formats = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                        'format')
    label_list = []
    for res_format in format_list:
        res_format_label = sh.scheming_choices_label(formats['choices'],
                                                     res_format)
        if res_format_label:
            label_list.append(res_format_label)
    return label_list
Ejemplo n.º 8
0
    def before_index(self, data_dict):
        """Add derived search fields to the dict about to be indexed.

        Depending on the keys present in `data_dict`, this fills in:

        * ``res_format_label``: the schema label of every value in
          ``res_format`` that has one;
        * ``publisher`` / ``publisher_display_name`` and the
          ``administration_level`` fields (code plus one entry per language
          taken from ``TRANSLATED_UNITS``);
        * ``theme`` / ``theme_id`` and one ``theme_<lang>`` list per language
          with the labels of the matching schema choices.

        :param data_dict: the dict being indexed; it is modified in place
        :returns: the same `data_dict`
        """
        languages = ('es', 'en', 'ca', 'eu', 'gl')
        schema = sh.scheming_get_schema('dataset', 'dataset')

        if 'res_format' in data_dict:
            # Schema definition of the resource `format` field
            format_field = sh.scheming_field_by_name(
                schema.get('resource_fields'), 'format')

            # New SOLR field holding the labels
            format_labels = []
            data_dict['res_format_label'] = format_labels
            for format_value in data_dict['res_format']:
                format_label = sh.scheming_choices_label(
                    format_field['choices'], format_value)
                if format_label:
                    format_labels.append(format_label)

        if 'publisher' in data_dict:
            organismo = data_dict['publisher']
            # The organization is resolved differently on the frontend
            if is_frontend():
                lookup = toolkit.get_action('dge_organization_publisher')
                publisher = lookup({'model': model}, {'id': organismo})
            else:
                publisher = h.get_organization(organismo)
            data_dict['publisher'] = publisher.get('id')
            data_dict['publisher_display_name'] = publisher.get('display_name')

            level_code = \
                helpers.dge_get_organization_administration_level_code(
                    publisher)
            # Unknown or missing codes fall back to the default unit
            if not (level_code and level_code in TRANSLATED_UNITS):
                level_code = DEFAULT_UNIT
            data_dict['administration_level'] = level_code
            for lang in languages:
                data_dict['administration_level_' + lang] = \
                    TRANSLATED_UNITS[level_code][lang] or ''

        if 'theme' in data_dict:
            # Schema definition of the dataset `theme` field
            theme_field = sh.scheming_field_by_name(
                schema.get('dataset_fields'), 'theme')

            # Keep the stored value, then reset every theme related field
            raw_themes = data_dict['theme']
            data_dict['theme'] = []
            data_dict['theme_id'] = []
            for lang in languages:
                data_dict['theme_' + lang] = []

            # The stored value looks like '["a", "b"]': drop the brackets and
            # split on the quoted separator to recover the keys.
            unbracketed = raw_themes.replace('[', '').replace(']', '')
            for chunk in unbracketed.split('", "'):
                term = chunk.replace('"', '')
                data_dict['theme'].append(term)
                data_dict['theme_id'].append(helpers.dge_theme_id(term))
                # Collect the labels of every schema choice matching the key
                for option in theme_field.get('choices'):
                    if option['value'] != term:
                        continue
                    for lang in languages:
                        data_dict['theme_' + lang].append(
                            option['label'][lang])

        return data_dict
    def graph_from_dataset(self, dataset_dict, dataset_ref):
        '''
        Given a CKAN dataset dict, creates an RDF graph

        The class RDFLib graph (accessible via `self.g`) should be updated on
        this method

        `dataset_dict` is a dict with the dataset metadata like the one
        returned by `package_show`. `dataset_ref` is an rdflib URIRef object
        that must be used to reference the dataset when working with the graph.

        Nothing is returned. Any exception raised while building the graph is
        caught at the end of the method and logged with `log.error`, so the
        graph may be left partially populated.

        `dataset_dict` is also used as a cache: publishers resolved from the
        organization extras and resource format labels resolved from the
        schema are stored back under EXPORT_AVAILABLE_PUBLISHERS and
        EXPORT_AVAILABLE_RESOURCE_FORMATS.
        '''
        method_log_prefix = '[%s][graph_from_dataset]' % type(
            self).__name__
        #log.debug('%s Init method. Inputs dataset_dict=%r, dataset_ref=%r' % (method_log_prefix, dataset_dict, dataset_ref))
        #log.debug('%s Init method. Inputs, dataset_ref=%r' % (method_log_prefix, dataset_ref))
        try:
            g = self.g

            # Register every known namespace prefix on the graph
            for prefix, namespace in namespaces.iteritems():
                g.bind(prefix, namespace)

            g.add((dataset_ref, RDF.type, DCAT.Dataset))

            # Title
            self._add_translated_triple_field_from_dict(
                dataset_dict, dataset_ref, DCT.title, DS_TITLE_TRANSLATED, None)

            # Description
            self._add_translated_triple_field_from_dict(
                dataset_dict, dataset_ref, DCT.description, DS_DESCRIPTION, None)

            # Theme
            value = self._get_dict_value(dataset_dict, DS_THEME)
            if value:
                # Per-theme details (label, description, dcat_ap, notation)
                # are read from the dataset dict; missing themes yield Nones
                themes = dataset_dict.get(EXPORT_AVAILABLE_THEMES, {})
                for theme in value:
                    #self._add_resource_list_triple(dataset_ref, DCAT.theme, value)
                    theme_values = themes.get(theme, {})
                    labels = theme_values.get('label')
                    descriptions = theme_values.get('description')
                    dcat_ap = theme_values.get('dcat_ap')
                    notation = theme_values.get('notation')
                    self._add_resource_list_triple(
                        dataset_ref, DCAT.theme, theme, labels, descriptions, dcat_ap, notation)

            # Tags
            for tag in dataset_dict.get('tags', []):
                self.g.add(
                    (dataset_ref, DCAT.keyword, Literal(tag['name'])))

            # Identifier
            self._add_triple_from_dict(
                dataset_dict, dataset_ref, DCT.identifier, DS_IDENTIFIER, None, False, False)

            # Issued, Modified dates
            self._add_date_triple(dataset_ref, DCT.issued, self._get_value_from_dict(
                dataset_dict, DS_ISSUED_DATE, ['metadata_created']))
            self._add_date_triple(dataset_ref, DCT.modified, self._get_value_from_dict(
                dataset_dict, DS_MODIFIED_DATE, ['metadata_modified']))
            self._add_date_triple(dataset_ref, DCT.valid, self._get_value_from_dict(
                dataset_dict, DS_VALID, None))

            # Accrual periodicity
            frequency = dataset_dict.get(DS_FREQUENCY)
            if frequency:
                # Maps the frequency 'type' to the TIME property holding the
                # duration value
                ftypes = {'seconds': TIME.seconds,
                          'minutes': TIME.minutes,
                          'hours': TIME.hours,
                          'days': TIME.days,
                          'weeks': TIME.weeks,
                          'months': TIME.months,
                          'years': TIME.years}
                ftype = frequency.get('type')
                fvalue = frequency.get('value')
                if ftype and ftype in ftypes.keys() and fvalue:
                    duration = BNode()
                    # From here on `frequency` is the blank node, no longer
                    # the dict read from the dataset
                    frequency = BNode()
                    g.add((frequency, RDF.type, DCT.Frequency))
                    g.add((duration, RDF.type, TIME.DurationDescription))
                    g.add((dataset_ref, DCT.accrualPeriodicity, frequency))
                    g.add((frequency, RDF.value, duration))
                    g.add((duration, ftypes.get(ftype), Literal(
                        fvalue, datatype=XSD.decimal)))

            # Languages
            self._add_triple_from_dict(
                dataset_dict, dataset_ref, DCT.language, DS_LANGUAGE, None, True, False)

            # Publisher
            # `publisher` is a three item list: [title or URI, prefixed URI,
            # notation]. It is taken from the publishers cached in the dataset
            # dict, else built from the owner organization, else from the
            # configured catalog publisher.
            pub_dir3 = False
            publishers = dataset_dict.get(
                EXPORT_AVAILABLE_PUBLISHERS, {})
            organization_id = dataset_dict.get('owner_org')
            if organization_id in publishers:
                publisher = publishers.get(organization_id)
            else:
                org = h.get_organization(organization_id, False)
                publisher = [None, None, None]
                if org:
                    publisher = [org.get('title'), None, None]
                    if org['extras']:
                        # Look for the organization extra holding the notation
                        for extra in org.get('extras'):
                            if extra and 'key' in extra and extra['key'] == ORG_PROP_ID_UD_ORGANICA:
                                notation = extra.get('value')
                                if notation and notation != '':
                                    pub_dir3 = True
                                    publisher[1] = PUBLISHER_PREFIX + notation
                                    publisher[2] = notation
                if pub_dir3:
                    # Cache the resolved publisher in the dataset dict
                    publishers[organization_id] = publisher
                    dataset_dict[EXPORT_AVAILABLE_PUBLISHERS] = publishers
                else:
                    # Fall back to the publisher configured for the catalog
                    organizations = cdh.csc_dcat_organizations_available()
                    publisher_ref = config.get('ckanext.csc_dcat.catalog.publisher', None)
                    if publisher_ref and len(publisher_ref.strip()) > 0:
                        publisher_ref = publisher_ref.strip()
                        publisher = [publisher_ref, None, None]
                        # The last path segment of the configured value,
                        # upper-cased, is the key into `organizations`
                        s_publisher = publisher_ref.upper().split('/')
                        if s_publisher and len(s_publisher) > 0:
                            organization_minhap = s_publisher[-1]
                            org = organizations.get(organization_minhap, None)
                            if org:
                                publisher = [org[1], PUBLISHER_PREFIX +
                                        organization_minhap, organization_minhap]
            if publisher[1]:
                self._add_resource_list_triple(
                        dataset_ref, DCT.publisher, publisher[1], publisher[0], None, None, publisher[2])
            else:
                # NOTE(review): publisher[0] can still be None here (no
                # organization found and no configured publisher) - confirm
                # URIRef copes with that.
                g.add((dataset_ref, DCT.publisher, URIRef(publisher[0])))

            # Spatial Coverage
            value = self._get_dict_value(dataset_dict, DS_SPATIAL)
            if value:
                self._add_resource_list_triple(
                    dataset_ref, DCT.spatial, value)

            # Temporal
            temporal_coverage = self._get_dataset_value(
                dataset_dict, DS_TEMPORAL_COVERAGE)
            # Counter used to build one PeriodOfTime URI per coverage entry
            i = 1
            if temporal_coverage:
                for key, value in temporal_coverage.items():
                    if (value):
                        start = end = None
                        if 'from' in value:
                            start = value.get('from')
                        if 'to' in value:
                            end = value.get('to')
                        # Entries without any date are skipped
                        if start or end:
                            temporal_extent = URIRef(
                                "%s/%s-%s" % (dataset_ref, 'PeriodOfTime', i))
                            g.add(
                                (temporal_extent, RDF.type, DCT.PeriodOfTime))
                            if start:
                                self._add_date_triple(
                                    temporal_extent, SCHEMA.startDate, start)
                            if end:
                                self._add_date_triple(
                                    temporal_extent, SCHEMA.endDate, end)
                            g.add((dataset_ref, DCT.temporal, temporal_extent))
                            i = i+1

            # References
            value = self._get_dict_value(dataset_dict, DS_REFERENCE)
            if value:
                self._add_resource_list_triple(
                    dataset_ref, DCT.references, value)

            # Conforms To
            value = self._get_dict_value(dataset_dict, DS_NORMATIVE)
            if value:
                self._add_resource_list_triple(
                    dataset_ref, DCT.conformsTo, value)

            # License (dataset license)
            if dataset_dict.get(DS_LICENSE):
                g.add((dataset_ref, DCT.license, URIRef(
                        dataset_dict.get(DS_LICENSE))))

            # Distributions/Resources
            for resource_dict in dataset_dict.get('resources', []):
                # The distribution URI hangs from the dataset URI
                uri_resource = '%s/resource/%s' % (
                    dataset_ref, resource_dict['id'])
                distribution = URIRef(uri_resource)
                g.add((dataset_ref, DCAT.distribution, distribution))
                g.add((distribution, RDF.type, DCAT.Distribution))

                # Identifier
                self._add_triple_from_dict(
                    resource_dict, distribution, DCT.identifier, DS_RESOURCE_IDENTIFIER, None, False, False)

                # Title
                self._add_translated_triple_field_from_dict(
                    resource_dict, distribution, DCT.title, DS_RESOURCE_NAME_TRANSLATED, None)

                # License (dataset license)
                if dataset_dict.get(DS_LICENSE):
                    g.add((distribution, DCT.license, URIRef(
                        dataset_dict.get(DS_LICENSE))))

                # Access URL
                if resource_dict.get(DS_RESOURCE_ACCESS_URL):
                    g.add((distribution, DCAT.accessURL, Literal(
                        resource_dict.get(DS_RESOURCE_ACCESS_URL), datatype=XSD.anyURI)))

                # Format
                if resource_dict.get(DS_RESOURCE_FORMAT, None):
                    imt = URIRef("%s/format" % uri_resource)
                    g.add((imt, RDF.type, DCT.IMT))
                    g.add((distribution, DCT['format'], imt))

                    format = resource_dict.get(
                        DS_RESOURCE_FORMAT, None)
                    # Format labels are cached in the dataset dict; on a miss
                    # the label is looked up in the schema and stored back
                    formats = dataset_dict.get(
                        EXPORT_AVAILABLE_RESOURCE_FORMATS, {})
                    label = None
                    if format and format in formats:
                        label = formats.get(format, None)
                    else:
                        _dataset = sh.scheming_get_schema(
                            'dataset', 'dataset')
                        res_format = sh.scheming_field_by_name(_dataset.get('resource_fields'),
                                                               'format')
                        formats[format] = sh.scheming_choices_label(
                            res_format['choices'], format)
                        label = formats.get(format, None)
                        dataset_dict[EXPORT_AVAILABLE_RESOURCE_FORMATS] = formats
                    if label:
                        g.add((imt, RDFS.label, Literal(label)))
                    g.add((imt, RDF.value, Literal(
                        resource_dict[DS_RESOURCE_FORMAT])))

                # Size
                if resource_dict.get(DS_RESOURCE_BYTE_SIZE):
                    try:
                        g.add((distribution, DCAT.byteSize,
                               Literal(float(resource_dict[DS_RESOURCE_BYTE_SIZE]),
                                       datatype=XSD.decimal)))
                    except (ValueError, TypeError):
                        # Not a number: keep the raw value as a plain literal
                        g.add((distribution, DCAT.byteSize,
                               Literal(resource_dict[DS_RESOURCE_BYTE_SIZE])))
                # Relation
                # NOTE(review): read from the dataset dict, not from
                # resource_dict, so every distribution gets the same value -
                # confirm this is intended.
                value = self._get_dict_value(
                    dataset_dict, DS_NORMATIVE)
                if value:
                    self._add_resource_list_triple(
                        distribution, DCT.relation, value)

        except Exception, e:
            # Errors are logged, not propagated
            log.error("%s [dataset_ref: %s]. Unexpected Error %s: %s" % (
                method_log_prefix, dataset_ref, type(e).__name__, e))
Ejemplo n.º 10
0
def dge_harvest_catalog_show(context, data_dict):
    '''
    Export the catalog datasets to a file in the requested format.

    Pages through ``_dge_harvest_search_ckan_datasets`` collecting dataset
    dicts, serializes them (RDF through ``DGERDFSerializer``, CSV through
    ``losser``) and writes the serialized text to the target file path.

    :param context: context passed to ``toolkit.check_access`` and to the
        dataset search
    :param data_dict: request parameters. ``format`` selects the output
        format, ``limit`` (optional, default -1 meaning no cap) bounds the
        number of datasets requested, and ``page`` is overwritten while
        paging through the search results.

    Any exception raised while exporting is logged and swallowed.
    '''
    method_log_prefix = '[%s][dge_harvest_catalog_show]' % __name__
    output = None
    try:
        log.debug('%s Init method. Inputs context=%s, data_dict=%s' %
                  (method_log_prefix, context, data_dict))
        ini = datetime.datetime.now()
        toolkit.check_access('dge_harvest_catalog_show', context, data_dict)

        page = 1
        data_dict['page'] = page
        limit = data_dict.get('limit', -1)
        _format = data_dict.get('format')
        columnsfilepath = None
        if _format == RDF_FORMAT:
            filepath = config.get('ckanext.dge_harvest.rdf.filepath',
                                  '/tmp/catalog.rdf')
        elif _format == CSV_FORMAT:
            filepath = config.get('ckanext.dge_harvest.csv.filepath',
                                  '/tmp/catalog.csv')
            columnsfilepath = config.get(
                'ckanext.dge_harvest.csv.columns.filepath',
                '/usr/lib/ckan/default/src/ckanext-dge-harvest/ckanext/dge_harvest/commands/columns.json'
            )
        else:
            filepath = '/tmp/catalog.' + _format

        # First page: establishes how many datasets have to be collected.
        query = _dge_harvest_search_ckan_datasets(context, data_dict)
        dataset_dicts = query['results']
        total_datasets = query['count']
        log.debug('%s Total_datasets obtenidos en la query: %s' %
                  (method_log_prefix, total_datasets))
        if limit > -1 and limit < total_datasets:
            total_datasets = limit
        num = len(dataset_dicts)
        log.debug('%s Total_datasets a exportar: %s' %
                  (method_log_prefix, total_datasets))

        # Remaining pages.
        while total_datasets > num:
            page = page + 1
            data_dict['page'] = page
            query = _dge_harvest_search_ckan_datasets(context, data_dict)
            page_results = query['results']
            if not page_results:
                # The search ran dry before reaching the reported count;
                # stop here instead of requesting pages forever.
                break
            dataset_dicts.extend(page_results)
            total_datasets = query['count']
            # Re-apply the cap: the fresh count must not discard the limit.
            if limit > -1 and limit < total_datasets:
                total_datasets = limit
            num = len(dataset_dicts)
            log.debug('%s Total_datasets obtenidos en la query: %s' %
                      (method_log_prefix, total_datasets))
            log.debug('%s Total_datasets a exportar: %s' %
                      (method_log_prefix, num))

        if _format == RDF_FORMAT:
            serializer = DGERDFSerializer()
            output = serializer.serialize_catalog(
                {},
                dataset_dicts,
                _format=data_dict.get('format'),
                pagination_info=None)
        elif _format == CSV_FORMAT and columnsfilepath:
            # Per-export caches, so each organization title and each format
            # label is resolved only once.
            organizations = {}
            formats = {}
            themes = dhh.dge_harvest_dict_theme_option_label()
            spatial_coverages = \
                dhh.dge_harvest_dict_spatial_coverage_option_label()
            _dataset = sh.scheming_get_schema('dataset', 'dataset')
            res_format = sh.scheming_field_by_name(
                _dataset.get('resource_fields'), 'format')
            format_values = res_format['choices']
            datasets = []
            for dataset in dataset_dicts:
                ds = {}

                # URL
                ds['url'] = dataset_uri(dataset)

                # Description
                descriptions = _from_dict_to_string(
                    dataset.get(dhc.DS_DESCRIPTION, None))
                ds['description'] = _encode_value(descriptions, True)

                # Title
                titles = _from_dict_to_string(
                    dataset.get(dhc.DS_TITLE_TRANSLATED, None))
                ds['title'] = _encode_value(titles, True)

                # Theme (Spanish labels; themes without one are skipped,
                # same rule as the spatial coverage below)
                theme_values = dataset.get(dhc.DS_THEME, None)
                if theme_values:
                    theme_labels = []
                    for theme_key in theme_values:
                        theme = themes.get(theme_key)
                        if theme and theme.get('label') and theme.get(
                                'label').get('es'):
                            theme_labels.append(theme.get('label').get('es'))
                    ds['theme'] = _encode_value(
                        _from_list_to_string(theme_labels), True)

                # Keywords
                tags = dataset.get(dhc.DS_TAGS)
                if tags:
                    tag_names = [tag.get('name', None) for tag in tags
                                 if tag.get('name', None)]
                    ds['tags'] = _encode_value(
                        MAIN_SEPARATOR.join(tag_names) if tag_names else None,
                        True)

                # Identifier
                ds['identifier'] = _encode_value(
                    dataset.get('identifier', None), True)

                # Created
                ds['issued_date'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_ISSUED_DATE, None)))

                # Modified
                ds['modified_date'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_MODIFIED_DATE, None)))

                # Accrual periodicity
                frequency = dataset.get(dhc.DS_FREQUENCY)
                if frequency:
                    # Missing parts become '' so the text 'None' never ends
                    # up inside the exported value.
                    stype = frequency.get('type') or ''
                    if stype:
                        stype = 'http://www.w3.org/2006/time#' + stype
                    svalue = frequency.get('value')
                    if svalue is None:
                        svalue = ''
                    sfrequency = '[TYPE]%s[VALUE]%s' % (stype, svalue)
                    ds['frequency'] = _encode_value(sfrequency, True)

                # Language
                languages = _from_list_to_string(dataset.get(dhc.DS_LANGUAGE))
                ds['language'] = _encode_value(languages, True)

                # Publisher
                publisher = dataset.get(dhc.DS_PUBLISHER, None)
                if publisher:
                    if publisher not in organizations:
                        organization = h.get_organization(publisher, False)
                        if organization:
                            organizations[publisher] = organization.get(
                                'title',
                                organization.get('display_name', None))
                    if publisher in organizations:
                        ds['publisher'] = _encode_value(
                            organizations.get(publisher, None), True)

                # License
                ds['license_id'] = _encode_value(dataset.get(dhc.DS_LICENSE),
                                                 True)

                # Spatial
                spatial_values = dataset.get(dhc.DS_SPATIAL, None)
                if spatial_values:
                    spatial_labels = []
                    for spatial_key in spatial_values:
                        spatial = spatial_coverages.get(spatial_key)
                        if spatial and spatial.get('label') and spatial.get(
                                'label').get('es'):
                            spatial_labels.append(
                                spatial.get('label').get('es'))
                    ds['spatial'] = _encode_value(
                        _from_list_to_string(spatial_labels), True)

                # Temporal
                temporal_coverage = dataset.get(dhc.DS_TEMPORAL_COVERAGE)
                if temporal_coverage:
                    periods = []
                    for tc in temporal_coverage.values():
                        if not tc:
                            continue
                        tc_from = _from_iso8601_date_to_string(
                            tc.get('from', None))
                        tc_to = _from_iso8601_date_to_string(
                            tc.get('to', None))
                        if tc_from or tc_to:
                            periods.append('%s-%s' % ((tc_from or ''),
                                                      (tc_to or '')))
                    ds['coverage_new'] = _encode_value(
                        MAIN_SEPARATOR.join(periods) if periods else None,
                        True)

                # Valid
                ds['valid'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_VALID, None)), True)

                # References
                references = _from_list_to_string(
                    dataset.get(dhc.DS_REFERENCE, None))
                ds['references'] = _encode_value(references, True)

                # Normative
                conforms_to = _from_list_to_string(
                    dataset.get(dhc.DS_NORMATIVE, None))
                ds['conforms_to'] = _encode_value(conforms_to, True)

                # Resources: each one is flattened to
                # name + [ACCESS_URL] + [MEDIA_TYPE] + [BYTE_SIZE] + [RELATION]
                sresources = []
                for resource in dataset.get(dhc.DS_RESOURCES) or []:
                    if not resource:
                        continue
                    name = _from_dict_to_string(
                        resource.get(dhc.DS_RESOURCE_NAME_TRANSLATED, None),
                        'TITLE_') or ''

                    url = resource.get(dhc.DS_RESOURCE_ACCESS_URL) or ''
                    if url:
                        url = '[ACCESS_URL]%s' % (url)

                    media_type = ''
                    format_value = resource.get(dhc.DS_RESOURCE_FORMAT, None)
                    if format_value:
                        if format_value not in formats:
                            formats[format_value] = \
                                sh.scheming_choices_label(
                                    format_values, format_value)
                        format_label = formats.get(format_value, None)
                        if format_label:
                            media_type = '[MEDIA_TYPE]%s' % (format_label)

                    size = resource.get(dhc.DS_RESOURCE_BYTE_SIZE) or ''
                    if size:
                        size = '[BYTE_SIZE]%s' % (size)

                    relation = _from_list_to_string(
                        resource.get(dhc.DS_RESOURCE_RELATION, None),
                        SECONDARY_SEPARATOR)
                    relations = ''
                    if relation:
                        relations = '[RELATION]%s' % (relation)

                    sresource = '%s%s%s%s%s' % (name, url, media_type,
                                                size, relations)
                    if sresource:
                        sresources.append(sresource)
                # Built from this dataset's resources only, never from a
                # value left over by one of the sections above.
                ds['resources'] = _encode_value(
                    MAIN_SEPARATOR.join(sresources) if sresources else None,
                    True)

                datasets.append(ds)
            num = len(datasets)
            log.debug('%s Numero de datasets con datos a exportar...%s' %
                      (method_log_prefix, num))
            output = losser.losser.table(datasets,
                                         columnsfilepath,
                                         csv=True,
                                         pretty=False)

        # Only touch the target file when there is something to write, so a
        # failed or unsupported export does not truncate a previous one.
        if filepath and output is not None:
            try:
                with open(filepath, "w") as catalog_file:
                    catalog_file.write(output)
            except Exception as e:
                # Writing stays best-effort, but the failure is now logged.
                log.error("%s Error writing catalog file %s. %s: %s" %
                          (method_log_prefix, filepath, type(e).__name__, e))

        end = datetime.datetime.now()
        log.debug(
            "%s Time in serialize %s catalog [%s] with %s datasets ... %s milliseconds"
            % (method_log_prefix, _format, filepath, total_datasets,
               int((end - ini).total_seconds() * 1000)))
    except Exception as e:
        log.error("%s Exception %s: %s" %
                  (method_log_prefix, type(e).__name__, e))
        output = None
    # NOTE(review): `output` is built but there is no return statement here,
    # so callers receive None - confirm whether `return output` is intended.
Ejemplo n.º 11
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        The JSON in ``harvest_object.content`` is mapped onto a CKAN package
        dict, which is created or updated, and default resource views are
        then created for each stored resource.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        # Guard first: the debug line below dereferences harvest_object.
        if not harvest_object:
            logger.error('No harvest object received')
            # Same call shape as the error path at the end of this method.
            self._save_object_error('No harvest object received',
                                    harvest_object)
            return False
        logger.debug("in import stage: %s" % harvest_object.guid)
        try:
            self._set_config(harvest_object.job.source.config)

            package_dict = json.loads(harvest_object.content)
            data_dict = {}
            data_dict['id'] = package_dict['id']
            data_dict['title'] = package_dict['title']
            data_dict['name'] = munge_title_to_name(package_dict['name'])

            data_dict['notes'] = markdown_extract(
                package_dict.get('description'))

            tags = package_dict.get('keyword', [])
            data_dict['tag_string'] = ', '.join(
                [munge_tag(tag) for tag in tags])

            data_dict['private'] = False

            # The license arrives as a URL-like value; keep its last segment.
            license_id = package_dict.get('license',
                                          'cc-by').strip('/').split('/')[-1]

            if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
                license_id = 'sprep-public-license'
            data_dict['license_id'] = license_id or 'notspecified'

            data_dict['created'] = _parse_drupal_date(package_dict['issued'])
            data_dict['modified'] = _parse_drupal_date(
                package_dict['modified'])

            contact = package_dict['contactPoint']
            c_point = contact['fn']
            # 'hasEmail' is split on ':' and only the last part is kept.
            c_email = contact['hasEmail'].split(':')[-1]
            if c_email != '*****@*****.**':
                data_dict['contact_uri'] = c_point
                data_dict['contact_email'] = c_email

            data_dict['resources'] = []
            for res in package_dict.get('distribution', []):
                res['url'] = res.get('downloadURL') or res.get('accessURL')
                res['name'] = res['title']
                res['description'] = markdown_extract(res.get('description'))
                data_dict['resources'].append(res)

            if 'spatial' in package_dict:
                data_dict['spatial'] = package_dict.pop('spatial')

                try:
                    # Build a GeoJSON polygon from the coordinate pairs that
                    # RE_SPATIAL captures in its first group.
                    geometry = {
                        "type":
                        "Polygon",
                        "coordinates":
                        [[[float(c) for c in pair.split()]
                          for pair in RE_SPATIAL.match(
                              data_dict['spatial']).group(1).split(', ')]]
                    }
                    shape = shapely.geometry.asShape(geometry)
                    if shape.is_valid and shape.is_closed:
                        data_dict['spatial'] = json.dumps(geometry)
                    else:
                        del data_dict['spatial']

                except KeyError:
                    pass
                except (AttributeError, ValueError):
                    # No regex match or non-numeric coordinates: drop it.
                    del data_dict['spatial']

            # add owner_org
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })

            owner_org = source_dataset.get('owner_org')
            data_dict['owner_org'] = owner_org
            data_dict['member_countries'] = country_mapping[None]
            if 'isPartOf' in package_dict:
                country = package_dict['isPartOf'].split('.')[0]
                data_dict['member_countries'] = country_mapping.get(
                    country, country_mapping[None])
                org = model.Session.query(
                    model.Group).filter_by(name=country + '-data').first()
                if org:
                    data_dict['owner_org'] = org.id

            # NOTE(review): 'spatial' was popped from package_dict above, so
            # the old "'spatial' in package_dict" branch here could never run
            # and has been removed. As before, the country extent below is
            # always looked up and, when found, replaces any geometry parsed
            # from the package - confirm that precedence is intended.
            schema = sh.scheming_get_dataset_schema('dataset')
            choices = sh.scheming_field_by_name(
                schema['dataset_fields'], 'member_countries')['choices']
            member_country = sh.scheming_choices_label(
                choices, data_dict['member_countries'])
            if member_country:
                spatial = get_extent_for_country(member_country)
                if spatial:
                    data_dict['spatial'] = spatial['value']

            data_dict['source'] = package_dict.get('landingPage')

            data_dict['theme'] = package_dict.get('theme', [])

            data_dict['thematic_area_string'] = _map_theme_to_topic(
                data_dict['theme'])

            data_dict['harvest_source'] = 'SPREP'

            self._create_or_update_package(data_dict, harvest_object,
                                           'package_show')

            Session.commit()
            stored_package = get_action('package_show')({
                'ignore_auth': True
            }, {
                'id': data_dict['id']
            })
            for res in stored_package.get('resources', []):
                get_action('resource_create_default_resource_views')(
                    {
                        'ignore_auth': True
                    }, {
                        'package': stored_package,
                        'resource': res
                    })

            logger.debug("Finished record")
        except Exception:
            # Any failure is recorded against the harvest object; unlike a
            # bare except, SystemExit / KeyboardInterrupt still propagate.
            logger.exception('Something went wrong!')
            self._save_object_error('Exception in import stage',
                                    harvest_object)
            return False
        return True