Exemple #1
0
 def pre_validate(self, form):
     '''
     Check that the field data, when present, is a valid URL.

     Falsy data is accepted without any check. `form` is not used here.

     Returns True on success and raises `validators.ValidationError`
     (with the 'Invalid URL' message passed through `_`) when
     `uris.validate` rejects the data.
     '''
     if not self.data:
         return True
     try:
         uris.validate(self.data)
     except uris.ValidationError:
         raise validators.ValidationError(_('Invalid URL'))
     return True
Exemple #2
0
 def pre_validate(self, form):
     '''
     Check that the field data, when present, is a valid URL.

     Falsy data is accepted without any check. `form` is not used here.

     Returns True on success and raises `validators.ValidationError`
     (with the 'Invalid URL' message passed through `_`) when
     `uris.validate` rejects the data.
     '''
     if not self.data:
         return True
     try:
         uris.validate(self.data)
     except uris.ValidationError:
         raise validators.ValidationError(_('Invalid URL'))
     return True
Exemple #3
0
def dataset_from_rdf(graph, dataset=None, node=None):
    '''
    Populate a dataset from a RDF/DCAT graph.

    A new `Dataset` is instantiated when `dataset` is missing (or falsy),
    otherwise the given one is updated in place. When `node` is None,
    the first dcat:Dataset found in `graph` is used.

    Returns the created or updated dataset; nothing is saved here.
    '''
    dataset = dataset or Dataset()

    if node is None:
        # Assume first match is the only match
        node = graph.value(predicate=RDF.type, object=DCAT.Dataset)

    root = graph.resource(node)

    dataset.title = rdf_value(root, DCT.title)
    # dct:abstract is sometimes used in place of dct:description
    raw_description = root.value(DCT.description) or root.value(DCT.abstract)
    dataset.description = sanitize_html(raw_description)
    dataset.frequency = frequency_from_rdf(root.value(DCT.accrualPeriodicity))
    # Current values are passed as third argument to rdf_value
    dataset.created_at = rdf_value(root, DCT.issued, dataset.created_at)
    dataset.last_modified = rdf_value(root, DCT.modified, dataset.last_modified)

    alt_label = rdf_value(root, SKOS.altLabel)
    if alt_label:
        dataset.acronym = alt_label

    # Tags are made of keywords plus themes which are not RDF resources
    keywords = [keyword.toPython() for keyword in root.objects(DCAT.keyword)]
    for theme in root.objects(DCAT.theme):
        if not isinstance(theme, RdfResource):
            keywords.append(theme.toPython())
    dataset.tags = list(set(keywords))

    dct_identifier = rdf_value(root, DCT.identifier)
    if dct_identifier:
        dataset.extras['dct:identifier'] = dct_identifier

    if isinstance(root.identifier, URIRef):
        dataset.extras['uri'] = root.identifier.toPython()

    landing_page = url_from_rdf(root, DCAT.landingPage)
    if landing_page:
        try:
            uris.validate(landing_page)
        except uris.ValidationError:
            # An invalid landing page is ignored on purpose
            pass
        else:
            dataset.extras['remote_url'] = landing_page

    dataset.temporal_coverage = temporal_from_rdf(root.value(DCT.temporal))

    # Collect license candidates declared on distributions
    # while registering each distribution as a resource
    found_licenses = set()
    for distribution in root.objects(DCAT.distribution | DCAT.distributions):
        resource_from_rdf(distribution, dataset)
        for predicate in (DCT.license, DCT.rights):
            term = distribution.value(predicate)
            if isinstance(term, (URIRef, Literal)):
                found_licenses.add(term.toPython())
            elif isinstance(term, RdfResource):
                found_licenses.add(term.identifier.toPython())

    fallback_license = dataset.license or License.default()
    declared_license = rdf_value(root, DCT.license)
    dataset.license = License.guess(
        declared_license, *found_licenses, default=fallback_license
    )

    return dataset
Exemple #4
0
 def validate(self, value):
     '''
     Run the parent field validation, then validate `value` as a URL.

     Only the `private`, `local`, `schemes` and `tlds` attributes which
     are not None are forwarded to `uris.validate`; unset options are
     simply not passed.

     A `uris.ValidationError` is reported through `self.error`.
     '''
     super(URLField, self).validate(value)
     kwargs = {}
     for name in ('private', 'local', 'schemes', 'tlds'):
         option = getattr(self, name)
         if option is not None:
             kwargs[name] = option
     try:
         uris.validate(value, **kwargs)
     except uris.ValidationError as e:
         # Python 3 exceptions have no `message` attribute:
         # use it when available, fall back to the string form otherwise
         self.error(getattr(e, 'message', None) or str(e))
Exemple #5
0
 def validate(self, value):
     '''
     Run the parent field validation, then validate `value` as a URL.

     Only the `private`, `local`, `schemes` and `tlds` attributes which
     are not None are forwarded to `uris.validate`; unset options are
     simply not passed.

     A `uris.ValidationError` is reported through `self.error`.
     '''
     super(URLField, self).validate(value)
     kwargs = {}
     for name in ('private', 'local', 'schemes', 'tlds'):
         option = getattr(self, name)
         if option is not None:
             kwargs[name] = option
     try:
         uris.validate(value, **kwargs)
     except uris.ValidationError as e:
         # Python 3 exceptions have no `message` attribute:
         # use it when available, fall back to the string form otherwise
         self.error(getattr(e, 'message', None) or str(e))
Exemple #6
0
 def converter(value):
     '''
     Convert `value` into a validated URL.

     None is passed through untouched. When `value` contains no '://'
     and `default_scheme` (taken from the enclosing scope) is truthy,
     the stripped value is prefixed with that scheme first.

     Returns whatever `uris.validate` returns and raises `Invalid`
     with the error text when validation fails.
     '''
     if value is None:
         return None
     if '://' not in value and default_scheme:
         stripped = value.strip()
         value = '://'.join((default_scheme, stripped))
     try:
         validated = uris.validate(value)
     except uris.ValidationError as e:
         raise Invalid(str(e))
     return validated
Exemple #7
0
 def converter(value):
     '''
     Convert `value` into a validated URL.

     None is passed through untouched. When `value` contains no '://'
     and `default_scheme` (taken from the enclosing scope) is truthy,
     the stripped value is prefixed with that scheme first.

     Returns whatever `uris.validate` returns and raises `Invalid`
     with the error text when validation fails.
     '''
     if value is None:
         return value
     if '://' not in value and default_scheme:
         value = '://'.join((default_scheme, value.strip()))
     try:
         return uris.validate(value)
     except uris.ValidationError as e:
         # Python 3 exceptions have no `message` attribute:
         # use it when available, fall back to the string form otherwise
         raise Invalid(getattr(e, 'message', None) or str(e))
Exemple #8
0
def frequency_from_rdf(term):
    '''
    Extract a frequency from a RDF term.

    `term` may be a string (turned into a URIRef when it validates as a URL),
    a RdfResource (its identifier is used) or a URIRef.

    URIs containing EUFREQ are looked up in EU_RDF_REQUENCIES; for any other
    URIRef the last element returned by `namespace_manager.compute_qname`
    is returned. Anything else yields None.
    '''
    if isinstance(term, basestring):
        try:
            term = URIRef(uris.validate(term))
        except uris.ValidationError:
            # Not a valid URL: keep the term unchanged
            pass
    if isinstance(term, RdfResource):
        term = term.identifier
    if not isinstance(term, URIRef):
        return None
    if EUFREQ in term:
        return EU_RDF_REQUENCIES.get(term)
    _prefix, _namespace, local_name = namespace_manager.compute_qname(term)
    return local_name
Exemple #9
0
def frequency_from_rdf(term):
    '''
    Extract a frequency from a RDF term.

    `term` may be a string (turned into a URIRef when it validates as a URL),
    a RdfResource (its identifier is used) or a URIRef.

    URIs containing EUFREQ are looked up in EU_RDF_REQUENCIES; for any other
    URIRef the last element returned by `namespace_manager.compute_qname`
    is returned. Anything else yields None.
    '''
    if isinstance(term, str):
        try:
            term = URIRef(uris.validate(term))
        except uris.ValidationError:
            # Not a valid URL: keep the term unchanged
            pass
    if isinstance(term, RdfResource):
        term = term.identifier
    if not isinstance(term, URIRef):
        return None
    if EUFREQ in term:
        return EU_RDF_REQUENCIES.get(term)
    _prefix, _namespace, local_name = namespace_manager.compute_qname(term)
    return local_name
Exemple #10
0
def test_local_should_not_validate_private_urls(url):
    '''
    `uris.validate(url, local=True)` must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, local=True)
Exemple #11
0
def test_private_should_validate_public_and_private_urls(url):
    '''
    `uris.validate(url, private=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, private=True)
    assert validated == url
Exemple #12
0
def test_default_should_not_validate_local_hosts(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #13
0
def test_default_should_not_validate_unknown_tlds(tld):
    '''
    An http URL on `somewhere.<tld>` must be rejected with default options.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate)
Exemple #14
0
def test_default_should_validate_default_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be returned unchanged with default options.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    validated = uris.validate(candidate)
    assert validated == candidate
Exemple #15
0
def test_with_credentials(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url
Exemple #16
0
def test_private_should_validate_public_and_private_urls(url):
    '''
    `uris.validate(url, private=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, private=True)
    assert validated == url
Exemple #17
0
def test_should_not_validate_multicast_urls(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #18
0
def test_default_should_not_validate_local_hosts(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #19
0
def test_default_should_not_validate_private_urls(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #20
0
def test_default_should_not_validate_unknown_tlds(tld):
    '''
    An http URL on `somewhere.<tld>` must be rejected with default options.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate)
Exemple #21
0
def test_default_should_not_validate_non_default_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be rejected with default options.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate)
Exemple #22
0
def test_default_should_validate_default_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be returned unchanged with default options.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    validated = uris.validate(candidate)
    assert validated == candidate
Exemple #23
0
def test_custom_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be returned unchanged
    when validated with `schemes=CUSTOM_SCHEMES`.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    validated = uris.validate(candidate, schemes=CUSTOM_SCHEMES)
    assert validated == candidate
Exemple #24
0
def test_custom_tlds(tld):
    '''
    `http://somewhere.<tld>` must be returned unchanged
    when validated with `tlds=CUSTOM_TLDS`.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    validated = uris.validate(candidate, tlds=CUSTOM_TLDS)
    assert validated == candidate
Exemple #25
0
def test_local_should_validate_public_and_local_urls(url):
    '''
    `uris.validate(url, local=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, local=True)
    assert validated == url
Exemple #26
0
    def process(self, item):
        '''
        Build or refresh the dataset matching a remote package.

        The package is fetched with the `package_show` action, validated
        against `self.schema`, then its core attributes, organization,
        license, tags, extras, temporal coverage, remote URL and resources
        are copied onto the dataset returned by `self.get_dataset`.

        :param item: harvest item; its `remote_id` is used for the lookup
            and then overwritten with the package `id`.
        :returns: the populated dataset (the dataset is not saved here,
            but an organization created on the fly is).
        :raises HarvestSkipException: when the package has no resource.
        '''
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        # The validated payload may come wrapped in a list
        if isinstance(data, list):
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource (missing, None or empty)
        if not data.get('resources'):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect Org: reuse the organization matching the acronym
        # or create (and save) a new one from the remote metadata
        org_data = data['organization']
        organization = Organization.objects(acronym=org_data['name']).first()
        if not organization:
            organization = Organization()
            organization.acronym = org_data['name']
            organization.name = org_data['title']
            organization.description = org_data['description']
            organization.save()
        dataset.organization = organization

        # Detect license
        default_license = self.harvest_config.get('license', License.default())
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        # The source hostname is added as an extra tag
        dataset.tags.append(urlparse(self.source.url).hostname)

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.frequency = 'unknown'
        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            key, value = extra['key'], extra['value']
            if key == 'spatial':
                # GeoJSON representation (Polygon or Point)
                # NOTE: parsed but currently not applied to the dataset
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location
                log.debug('spatial-text value not handled')
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                log.debug('spatial-uri value not handled')
            elif key == 'frequency':
                # Update frequency (was a stray debug `print` statement)
                log.debug('frequency value not handled: %s', value)
            elif key == 'temporal_start':
                # Temporal coverage start (not stored as an extra)
                temporal_start = daterange_start(value)
                continue
            elif key == 'temporal_end':
                # Temporal coverage end (not stored as an extra)
                temporal_end = daterange_end(value)
                continue
            # Every other extra, handled or not, is kept as-is
            dataset.extras[key] = value

        # We don't want spatial to be added on harvester:
        # zones only come from the `geozones` harvest configuration.
        # NOTE: the former Polygon/MultiPolygon handling of `spatial_geom`
        # was disabled and is therefore not performed here.
        if self.harvest_config.get('geozones', False):
            dataset.spatial = SpatialCoverage()
            dataset.spatial.zones = []
            for zone in self.harvest_config.get('geozones'):
                geo_zone = GeoZone.objects.get(id=zone)
                dataset.spatial.zones.append(geo_zone)

        # Temporal coverage requires both boundaries
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                # Invalid declared URL: fall back on the dataset page
                # and keep the raw value for reference
                dataset.extras['remote_url'] = self.dataset_url(data['name'])
                dataset.extras['ckan:source'] = data['url']
            else:
                dataset.extras['remote_url'] = url

        dataset.extras['harvest:name'] = self.source.name

        # IDs known before this run, used to detect removed resources
        current_resources = [
            str(resource.id) for resource in dataset.resources
        ]
        fetched_resources = []
        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue

            # Ignore invalid Resources
            try:
                uris.validate(res['url'])
            except uris.ValidationError:
                continue

            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue

            fetched_resources.append(str(res['id']))
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        # Clean up old resources removed from source
        for resource_id in current_resources:
            if resource_id in fetched_resources:
                continue
            try:
                resource = get_by(dataset.resources, 'id',
                                  UUID(resource_id))
            except Exception:
                log.error('Unable to parse resource ID %s', resource_id)
                continue
            # Nothing is removed on dry runs
            if resource and not self.dryrun:
                dataset.resources.remove(resource)

        return dataset
Exemple #27
0
def test_local_should_not_validate_private_urls(url):
    '''
    `uris.validate(url, local=True)` must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, local=True)
Exemple #28
0
def test_default_should_validate_public_ips(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url
Exemple #29
0
def test_private_local_should_validate_any_valid_urls(url):
    '''
    `uris.validate(url, local=True, private=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, local=True, private=True)
    assert validated == url
Exemple #30
0
def test_default_should_not_validate_non_default_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be rejected with default options.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate)
Exemple #31
0
def test_custom_schemes(scheme):
    '''
    `<scheme>://somewhere.com` must be returned unchanged
    when validated with `schemes=CUSTOM_SCHEMES`.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    validated = uris.validate(candidate, schemes=CUSTOM_SCHEMES)
    assert validated == candidate
Exemple #32
0
def test_default_should_not_validate_private_urls(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #33
0
def test_custom_schemes_should_not_validate_defaults(scheme):
    '''
    `<scheme>://somewhere.com` must be rejected
    when validated with `schemes=CUSTOM_SCHEMES`.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate, schemes=CUSTOM_SCHEMES)
Exemple #34
0
def test_should_not_validate_multicast_urls(url):
    '''
    `uris.validate(url)` with default options must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
Exemple #35
0
def test_custom_tlds(tld):
    '''
    `http://somewhere.<tld>` must be returned unchanged
    when validated with `tlds=CUSTOM_TLDS`.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    validated = uris.validate(candidate, tlds=CUSTOM_TLDS)
    assert validated == candidate
Exemple #36
0
def test_local_should_validate_public_and_local_urls(url):
    '''
    `uris.validate(url, local=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, local=True)
    assert validated == url
Exemple #37
0
def test_custom_tlds_should_not_validate_defaults(tld):
    '''
    `http://somewhere.<tld>` must be rejected
    when validated with `tlds=CUSTOM_TLDS`.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate, tlds=CUSTOM_TLDS)
Exemple #38
0
def test_private_local_should_validate_any_valid_urls(url):
    '''
    `uris.validate(url, local=True, private=True)` must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url, local=True, private=True)
    assert validated == url
Exemple #39
0
def test_with_credentials(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url
Exemple #40
0
def test_custom_schemes_should_not_validate_defaults(scheme):
    '''
    `<scheme>://somewhere.com` must be rejected
    when validated with `schemes=CUSTOM_SCHEMES`.

    `scheme` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = '{}://somewhere.com'.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate, schemes=CUSTOM_SCHEMES)
Exemple #41
0
def test_validate_strip_url():
    '''
    `uris.validate` must return the URL without its surrounding whitespace.
    '''
    padded = '  http://somewhere.com  '
    validated = uris.validate(padded)
    assert validated == 'http://somewhere.com'
Exemple #42
0
def test_custom_tlds_should_not_validate_defaults(tld):
    '''
    `http://somewhere.<tld>` must be rejected
    when validated with `tlds=CUSTOM_TLDS`.

    `tld` is injected by pytest (fixture/parametrization not visible here).
    '''
    candidate = 'http://somewhere.{}'.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(candidate, tlds=CUSTOM_TLDS)
Exemple #43
0
    def process(self, item):
        '''
        Build or refresh the dataset matching a remote package.

        The package is fetched with the `package_show` action, validated
        against `self.schema`, then its core attributes, license, tags,
        extras, spatial and temporal coverage, remote URL and resources
        are copied onto the dataset returned by `self.get_dataset`.

        :param item: harvest item; its `remote_id` is used for the lookup
            and then overwritten with the package `id`.
        :returns: the populated dataset (it is not saved here).
        :raises HarvestSkipException: when the package has no resource.
        :raises HarvestException: when the `spatial` extra holds a geometry
            which is neither a Polygon nor a MultiPolygon.
        '''
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        # The validated payload may come wrapped in a list
        if isinstance(data, list):
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource (missing, None or empty)
        if not data.get('resources'):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect license, keeping the current one as fallback
        default_license = dataset.license or License.default()
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in data['extras']:
            key = extra['key']
            value = extra['value']
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == 'spatial':
                # GeoJSON representation (Polygon or Point)
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location:
                # only used when it matches exactly one currently valid zone
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                qs = qs.valid_at(datetime.now())
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras['ckan:spatial-text'] = value
                    log.debug('spatial-text value not handled: %s', value)
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                dataset.extras['ckan:spatial-uri'] = value
                log.debug('spatial-uri value not handled: %s', value)
            elif key == 'frequency':
                # Update frequency: try the RDF term first,
                # then a known frequency identifier
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    dataset.extras['ckan:frequency'] = value
                    log.debug('frequency value not handled: %s', value)
            elif key == 'temporal_start':
                # Temporal coverage start
                temporal_start = daterange_start(value)
            elif key == 'temporal_end':
                # Temporal coverage end
                temporal_end = daterange_end(value)
            else:
                # Any other extra is kept as-is
                dataset.extras[key] = value

        # Any spatial information replaces the existing coverage
        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            # Geometries are always stored as a MultiPolygon
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        # Temporal coverage requires both boundaries
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL: defaults to the dataset page on the source
        dataset.extras['remote_url'] = self.dataset_url(data['name'])
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                # Invalid declared URL: keep the raw value for reference
                dataset.extras['ckan:source'] = data['url']
            else:
                # use declared `url` as `remote_url` if any
                dataset.extras['remote_url'] = url

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset
Exemple #44
0
def test_with_credentials_disabled(url):
    '''
    `uris.validate(url, credentials=False)` must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, credentials=False)
Exemple #45
0
def test_validate_strip_url():
    '''
    `uris.validate` must return the URL without its surrounding whitespace.
    '''
    padded = '  http://somewhere.com  '
    validated = uris.validate(padded)
    assert validated == 'http://somewhere.com'
Exemple #46
0
def test_default_should_validate_public_urls_with_utf8_tld(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url
Exemple #47
0
def test_default_should_validate_public_ips(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url
Exemple #48
0
def test_with_credentials_disabled(url):
    '''
    `uris.validate(url, credentials=False)` must raise `uris.ValidationError`.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, credentials=False)
Exemple #49
0
def test_default_should_validate_public_urls_with_utf8_tld(url):
    '''
    `uris.validate(url)` with default options must return `url` unchanged.

    `url` is injected by pytest (fixture/parametrization not visible here).
    '''
    validated = uris.validate(url)
    assert validated == url