Ejemplo n.º 1
0
 def test_parse_european_frequencies_as_url(self, freq, expected):
     assert frequency_from_rdf(str(freq)) == expected
Ejemplo n.º 2
0
 def test_parse_european_frequencies_as_resource(self, freq, expected):
     g = Graph()
     resource = RdfResource(g, freq)
     assert frequency_from_rdf(resource) == expected
Ejemplo n.º 3
0
 def test_parse_european_frequencies(self, freq, expected):
     assert frequency_from_rdf(freq) == expected
Ejemplo n.º 4
0
 def test_parse_dublin_core_frequencies_as_url(self, freq, expected):
     assert frequency_from_rdf(str(freq)) == expected
Ejemplo n.º 5
0
 def test_parse_dublin_core_frequencies(self, freq, expected):
     assert frequency_from_rdf(freq) == expected
Ejemplo n.º 6
0
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        if type(data) == list:
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect license
        default_license = dataset.license or License.default()
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in data['extras']:
            key = extra['key']
            value = extra['value']
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == 'spatial':
                # GeoJSON representation (Polygon or Point)
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                qs = qs.valid_at(datetime.now())
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras['ckan:spatial-text'] = value
                    log.debug('spatial-text value not handled: %s', value)
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                dataset.extras['ckan:spatial-uri'] = value
                log.debug('spatial-uri value not handled: %s', value)
            elif key == 'frequency':
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    dataset.extras['ckan:frequency'] = value
                    log.debug('frequency value not handled: %s', value)
            # Temporal coverage start
            elif key == 'temporal_start':
                temporal_start = daterange_start(value)
            # Temporal coverage end
            elif key == 'temporal_end':
                temporal_end = daterange_end(value)
            else:
                dataset.extras[extra['key']] = value

        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.extras['remote_url'] = self.dataset_url(data['name'])
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                dataset.extras['ckan:source'] = data['url']
            else:
                # use declared `url` as `remote_url` if any
                dataset.extras['remote_url'] = url

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset