def temporal_from_literal(text):
    '''
    Parse a temporal coverage from a literal, i.e. either:
    - an ISO date range
    - a single ISO date period (month, year)
    '''
    if text.count('/') == 1:
        # This is an ISO date range as recommended by Gov.uk
        # http://guidance.data.gov.uk/dcat_fields.html
        start, end = text.split('/')
        return db.DateRange(
            start=parse_dt(start).date(),
            end=parse_dt(end).date()
        )
    else:
        separators = text.count('-')
        if separators == 0:
            # This is a year
            return db.DateRange(
                start=date(int(text), 1, 1),
                end=date(int(text), 12, 31)
            )
        elif separators == 1:
            # This is a month
            dt = parse_dt(text).date()
            return db.DateRange(
                start=dt.replace(day=1),
                end=dt.replace(day=calendar.monthrange(dt.year, dt.month)[1])
            )
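# Usage sketch for the parser above (assumes `db.DateRange`, a dateutil-based
# `parse_dt`, and the stdlib `calendar`/`date` are in scope as in the module):
#
#   temporal_from_literal('2007-03-01/2008-05-11')
#   # -> DateRange(start=date(2007, 3, 1), end=date(2008, 5, 11))
#   temporal_from_literal('2017')     # -> DateRange(2017-01-01 .. 2017-12-31)
#   temporal_from_literal('2017-06')  # -> DateRange(2017-06-01 .. 2017-06-30)
#
# Note: a literal with two or more dashes (e.g. a full ISO date) matches no
# branch, so the function falls through and implicitly returns None.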
def temporal_from_resource(resource):
    '''
    Parse a temporal coverage from an RDF class/resource, i.e. either:
    - a `dct:PeriodOfTime` with schema.org `startDate` and `endDate` properties
    - an inline gov.uk Time Interval value
    - a URI reference to the gov.uk Time Interval ontology
      http://reference.data.gov.uk/
    '''
    if isinstance(resource.identifier, URIRef):
        # Fetch the remote ontology if necessary
        g = Graph().parse(str(resource.identifier))
        resource = g.resource(resource.identifier)
    if resource.value(SCHEMA.startDate):
        return db.DateRange(
            start=resource.value(SCHEMA.startDate).toPython(),
            end=resource.value(SCHEMA.endDate).toPython()
        )
    elif resource.value(SCV.min):
        return db.DateRange(
            start=resource.value(SCV.min).toPython(),
            end=resource.value(SCV.max).toPython()
        )
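# A minimal sketch exercising the first (`schema:startDate`/`endDate`) branch.
# The helper below is hypothetical, not part of the module; it assumes rdflib
# and the same `SCHEMA` (http://schema.org/) namespace used above:
def _example_temporal_from_resource():
    from rdflib import Graph, BNode, Literal
    from rdflib.namespace import XSD

    g = Graph()
    node = BNode()  # a blank node, so no remote ontology fetch is triggered
    g.add((node, SCHEMA.startDate, Literal('2016-01-01', datatype=XSD.date)))
    g.add((node, SCHEMA.endDate, Literal('2016-12-31', datatype=XSD.date)))
    # Both literals come back as datetime.date via .toPython()
    return temporal_from_resource(g.resource(node))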
def process_formdata(self, valuelist):
    if valuelist and valuelist[0]:
        start, end = valuelist[0].split(' - ')
        self.data = db.DateRange(
            start=parse(start, yearfirst=True).date(),
            end=parse(end, yearfirst=True).date(),
        )
    else:
        self.data = None
def test_initial_values(self):
    Fake, FakeForm = self.factory()

    dr = db.DateRange(start=date.today() - timedelta(days=1),
                      end=date.today())
    fake = Fake(daterange=dr)
    form = FakeForm(None, fake)

    expected = ' - '.join([to_iso_date(dr.start), to_iso_date(dr.end)])
    self.assertEqual(form.daterange._value(), expected)
def process_formdata(self, valuelist):
    if valuelist and valuelist[0]:
        value = valuelist[0]
        if isinstance(value, str):
            start, end = value.split(' - ')
            self.data = db.DateRange(
                start=parse(start, yearfirst=True).date(),
                end=parse(end, yearfirst=True).date(),
            )
        elif 'start' in value and 'end' in value:
            self.data = db.DateRange(
                start=parse(value['start'], yearfirst=True).date(),
                end=parse(value['end'], yearfirst=True).date(),
            )
        else:
            raise validators.ValidationError(
                _('Unable to parse date range'))
    else:
        self.data = None
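# This field accepts two input shapes (sketch; `parse` is dateutil's parser
# and `_` the usual gettext alias):
#
#   field.process_formdata(['1984-06-07 - 1984-06-10'])              # string
#   field.process_formdata([{'start': '1984-06-07',
#                            'end': '1984-06-10'}])                  # JSON
#
# Both set field.data to db.DateRange(start=date(1984, 6, 7),
#                                     end=date(1984, 6, 10)).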
def test_temporal_coverage(self):
    start = faker.past_date(start_date='-30d')
    end = faker.future_date(end_date='+30d')
    temporal_coverage = db.DateRange(start=start, end=end)
    dataset = DatasetFactory(temporal_coverage=temporal_coverage)

    d = dataset_to_rdf(dataset)
    pot = d.value(DCT.temporal)

    assert pot.value(RDF.type).identifier == DCT.PeriodOfTime
    assert pot.value(SCHEMA.startDate).toPython() == start
    assert pot.value(SCHEMA.endDate).toPython() == end
def test_with_valid_dates(self):
    Fake, FakeForm = self.factory()

    start = date.today() - timedelta(days=1)
    end = date.today()
    expected = ' - '.join([to_iso_date(start), to_iso_date(end)])

    fake = Fake()
    form = FakeForm(MultiDict({'daterange': expected}))

    form.validate()
    self.assertEqual(form.errors, {})

    form.populate_obj(fake)
    self.assertEqual(fake.daterange, db.DateRange(start=start, end=end))
def test_with_valid_dates_from_json(self):
    Fake, FakeForm = self.factory()

    start = date.today() - timedelta(days=1)
    end = date.today()

    fake = Fake()
    form = FakeForm.from_json({
        'daterange': {
            'start': to_iso_date(start),
            'end': to_iso_date(end),
        }
    })

    form.validate()
    self.assertEqual(form.errors, {})

    form.populate_obj(fake)
    self.assertEqual(fake.daterange, db.DateRange(start=start, end=end))
def test_daterange_before_1900(self):
    '''Daterange filter should display the range in an adaptive format'''
    g.lang_code = 'en'
    iso2date = lambda s: date(*[int(v) for v in s.split('-')])
    dr = lambda s, e: db.DateRange(start=iso2date(s), end=iso2date(e))
    specs = (
        (dr('1234-02-01', '1234-02-01'), '1234/02/01'),
        (dr('1232-01-01', '1232-01-31'), '1232/01'),
        (dr('1232-01-01', '1232-01-14'), '1232/01/01 to 1232/01/14'),
        (dr('1232-01-01', '1232-03-31'), '1232/01 to 1232/03'),
        (dr('1232-01-01', '1232-02-29'), '1232/01 to 1232/02'),
        (dr('1232-01-01', '1232-12-31'), '1232'),
        (dr('1232-01-01', '1234-12-31'), '1232 to 1234'),
        (dr('1232-02-02', '1234-12-25'), '1232/02/02 to 1234/12/25'),
    )
    for given, expected in specs:
        self.assertEqual(
            render_template_string('{{given|daterange}}', given=given),
            expected)
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if there is no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled: %s', extra['value'])
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled: %s', extra['value'])
        # Update frequency
        elif extra['key'] == 'frequency':
            log.debug('frequency value not handled: %s', extra['value'])
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            temporal_end = daterange_end(extra['value'])
            continue
        dataset.extras[extra['key']] = extra['value']

    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api'
                             else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def dr(start, end, **kwargs):
    return db.DateRange(start=iso2date(start), end=iso2date(end), **kwargs)
def process(self, item):
    response = self.get(item.remote_id)
    encoding = chardet.detect(response.content)['encoding']
    xml = self.parse_xml(response.content.decode(encoding))
    metadata = xml['metadata']

    # Resolve the remote id from the metadata
    item.remote_id = metadata['id']
    dataset = self.get_dataset(metadata['id'])

    dataset.title = metadata['title']
    dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
    dataset.description = metadata['notes']
    dataset.private = metadata['private']
    dataset.tags = sorted(set(metadata['tags']))

    if metadata.get('license_id'):
        dataset.license = License.objects.get(id=metadata['license_id'])

    if (metadata.get('temporal_coverage_from')
            and metadata.get('temporal_coverage_to')):
        dataset.temporal_coverage = db.DateRange(
            start=metadata['temporal_coverage_from'],
            end=metadata['temporal_coverage_to'])

    if (metadata.get('territorial_coverage_code')
            or metadata.get('territorial_coverage_granularity')):
        dataset.spatial = SpatialCoverage()
        if metadata.get('territorial_coverage_granularity'):
            dataset.spatial.granularity = GRANULARITIES.get(
                metadata['territorial_coverage_granularity'])
        if metadata.get('territorial_coverage_code'):
            dataset.spatial.zones = [
                ZONES[metadata['territorial_coverage_code']]
            ]

    dataset.resources = []
    # The 'cle' ("key") pseudo-resource holds a SHA-256 checksum
    # for the CSV resource; it is consumed here, not kept as a resource
    cle = get_by(metadata['resources'], 'format', 'cle')
    for row in metadata['resources']:
        if row['format'] == 'cle':
            continue
        resource = Resource(
            title=row['name'],
            description=(row['description'] + '\n\n' + SSL_COMMENT).strip(),
            filetype='remote',
            url=row['url'],
            format=row['format'])
        if resource.format == 'csv' and cle:
            resource.checksum = Checksum(type='sha256',
                                         value=self.get(cle['url']).text)
        if row.get('last_modified'):
            resource.modified = row['last_modified']
        dataset.resources.append(resource)

    if metadata.get('author'):
        dataset.extras['author'] = metadata['author']
    if metadata.get('author_email'):
        dataset.extras['author_email'] = metadata['author_email']
    if metadata.get('maintainer'):
        dataset.extras['maintainer'] = metadata['maintainer']
    if metadata.get('maintainer_email'):
        dataset.extras['maintainer_email'] = metadata['maintainer_email']
    for extra in metadata['extras']:
        dataset.extras[extra['key']] = extra['value']

    return dataset
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if there is no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect license
    default_license = dataset.license or License.default()
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom, spatial_zone = None, None

    for extra in data['extras']:
        key = extra['key']
        value = extra['value']
        if value is None or (isinstance(value, str) and not value.strip()):
            # Skip empty extras
            continue
        elif key == 'spatial':
            # GeoJSON representation (Polygon or Point)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
            qs = qs.valid_at(datetime.now())
            if qs.count() == 1:
                spatial_zone = qs.first()
            else:
                dataset.extras['ckan:spatial-text'] = value
                log.debug('spatial-text value not handled: %s', value)
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            dataset.extras['ckan:spatial-uri'] = value
            log.debug('spatial-uri value not handled: %s', value)
        elif key == 'frequency':
            # Update frequency
            freq = frequency_from_rdf(value)
            if freq:
                dataset.frequency = freq
            elif value in UPDATE_FREQUENCIES:
                dataset.frequency = value
            else:
                dataset.extras['ckan:frequency'] = value
                log.debug('frequency value not handled: %s', value)
        elif key == 'temporal_start':
            # Temporal coverage start
            temporal_start = daterange_start(value)
        elif key == 'temporal_end':
            # Temporal coverage end
            temporal_end = daterange_end(value)
        else:
            dataset.extras[key] = value

    if spatial_geom or spatial_zone:
        dataset.spatial = SpatialCoverage()

    if spatial_zone:
        dataset.spatial.zones = [spatial_zone]

    if spatial_geom:
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    dataset.extras['remote_url'] = self.dataset_url(data['name'])
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['ckan:source'] = data['url']
        else:
            # Use the declared `url` as `remote_url` if valid
            dataset.extras['remote_url'] = url

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def dr(start, end):
    return db.DateRange(start=iso2date(start), end=iso2date(end))
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if there is no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect the organization, creating it if necessary
    organization_acronym = data['organization']['name']
    orgObj = Organization.objects(acronym=organization_acronym).first()
    if orgObj:
        dataset.organization = orgObj
    else:
        orgObj = Organization()
        orgObj.acronym = organization_acronym
        orgObj.name = data['organization']['title']
        orgObj.description = data['organization']['description']
        orgObj.save()
        dataset.organization = orgObj

    # Detect license
    default_license = self.harvest_config.get('license', License.default())
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    dataset.tags.append(urlparse(self.source.url).hostname)

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.frequency = 'unknown'
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
        # Update frequency
        elif extra['key'] == 'frequency':
            log.debug('frequency value not handled: %s', extra['value'])
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            temporal_end = daterange_end(extra['value'])
            continue
        dataset.extras[extra['key']] = extra['value']

    # Spatial coverage comes from the harvester configuration,
    # not from the harvested source
    if self.harvest_config.get('geozones', False):
        dataset.spatial = SpatialCoverage()
        dataset.spatial.zones = []
        for zone in self.harvest_config.get('geozones'):
            geo_zone = GeoZone.objects.get(id=zone)
            dataset.spatial.zones.append(geo_zone)

    # if spatial_geom:
    #     dataset.spatial = SpatialCoverage()
    #     if spatial_geom['type'] == 'Polygon':
    #         coordinates = [spatial_geom['coordinates']]
    #     elif spatial_geom['type'] == 'MultiPolygon':
    #         coordinates = spatial_geom['coordinates']
    #     else:
    #         raise HarvestException('Unsupported spatial geometry')
    #     dataset.spatial.geom = {
    #         'type': 'MultiPolygon',
    #         'coordinates': coordinates
    #     }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['remote_url'] = self.dataset_url(data['name'])
            dataset.extras['ckan:source'] = data['url']
        else:
            dataset.extras['remote_url'] = url

    dataset.extras['harvest:name'] = self.source.name

    current_resources = [
        str(resource.id) for resource in dataset.resources
    ]
    fetched_resources = []

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue

        # Ignore invalid resources
        try:
            url = uris.validate(res['url'])
        except uris.ValidationError:
            continue

        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue

        fetched_resources.append(str(res['id']))
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created

    # Clean up old resources removed from the source
    for resource_id in current_resources:
        if resource_id not in fetched_resources:
            try:
                resource = get_by(dataset.resources, 'id',
                                  UUID(resource_id))
            except Exception:
                log.error('Unable to parse resource ID %s', resource_id)
                continue
            if resource and not self.dryrun:
                dataset.resources.remove(resource)

    return dataset
def remote_datasets(self):
    response = self.get('package_list')
    for name in response['result']:
        details = self.get('package_show', {'id': name})['result']
        dataset = self.get_harvested(Dataset, details['id'])

        # Core attributes
        dataset.slug = details['name']
        dataset.title = details['title']
        dataset.description = details.get('notes', 'No description')
        dataset.license = (License.objects(id=details['license_id']).first()
                           or License.objects.get(id='notspecified'))
        dataset.tags = [tag['name'].lower() for tag in details['tags']]

        dataset.frequency = self.map('frequency', details) or 'unknown'
        dataset.created_at = parse(details['metadata_created'])
        dataset.last_modified = parse(details['metadata_modified'])

        if any_field(details, 'territorial_coverage',
                     'territorial_coverage_granularity'):
            coverage = TerritorialCoverage(
                codes=[
                    code.strip()
                    for code in details.get('territorial_coverage',
                                            '').split(',')
                    if code.strip()
                ],
                granularity=self.map('territorial_coverage_granularity',
                                     details),
            )
            dataset.extras['territorial_coverage'] = coverage
            try:
                dataset.spatial = territorial_to_spatial(dataset)
            except Exception as e:
                log.error('Error while processing spatial coverage '
                          'for %s: %s', dataset.title, e)

        if all_field(details, 'temporal_coverage_from',
                     'temporal_coverage_to'):
            try:
                dataset.temporal_coverage = db.DateRange(
                    start=daterange_start(
                        details.get('temporal_coverage_from')),
                    end=daterange_end(details.get('temporal_coverage_to')),
                )
            except Exception:
                log.error('Unable to parse temporal coverage '
                          'for dataset %s', details['id'])

        # Organization
        if details.get('organization'):
            dataset.organization = self.get_harvested(
                Organization, details['organization']['id'], False)
        else:
            # Need to fetch the user from the roles
            roles = self.get('roles_show',
                             {'domain_object': name})['result']['roles']
            for role in roles:
                if role['role'] == 'admin' and role['context'] == 'Package':
                    dataset.owner = self.get_harvested(User, role['user_id'])
                    break

        # Supplier
        if details.get('supplier_id'):
            dataset.supplier = self.get_harvested(Organization,
                                                  details['supplier_id'],
                                                  False)

        # Remote URL
        if details.get('url'):
            dataset.extras['remote_url'] = details['url']

        # Extras
        if 'extras' in details:
            extra_mapping = self.harvester.mapping.get('from_extras', {})
            for extra in details['extras']:
                if extra['key'] in self.harvester.mapping:
                    value = self.harvester.mapping[extra['key']].get(
                        extra['value'])
                else:
                    value = extra['value']
                if extra['key'] in extra_mapping:
                    setattr(dataset, extra_mapping[extra['key']], value)
                else:
                    dataset.extras[extra['key']] = value

        # Resources
        for res in details['resources']:
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.url = res['url']
            resource.description = res.get('description')
            resource.format = res.get('format')
            resource.hash = res.get('hash')
            resource.created = parse(res['created'])
            resource.modified = parse(res['revision_timestamp'])
            resource.published = resource.published or resource.created

        yield dataset

        if dataset.id:
            followers = self.get('dataset_follower_list',
                                 {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], False)
                if user:
                    follow, created = FollowDataset.objects.get_or_create(
                        follower=user, following=dataset)