def load(filename, drop=False):
    '''Load a GeoZones Bundle.

    Downloads the bundle when `filename` is an HTTP(S) URL, extracts the
    xz-compressed tarball into a temporary directory, then loads levels
    from `levels.json` and zones from `zones.json` into the GeoLevel and
    GeoZone collections.

    :param filename: local path or URL of the bundle archive.
    :param drop: when True, drop existing collections before loading.
    '''
    tmp = tempfile.mkdtemp()
    try:
        if filename.startswith('http'):
            log.info('Downloading GeoZones bundle: %s', filename)
            filename, _ = urlretrieve(filename, join(tmp, 'geozones.tar.xz'))
        log.info('Extracting GeoZones bundle')
        # NOTE(review): extractall() trusts archive member paths; fine for a
        # trusted bundle, unsafe for arbitrary input.
        with contextlib.closing(lzma.LZMAFile(filename)) as xz:
            with tarfile.open(fileobj=xz) as f:
                f.extractall(tmp)
        log.info('Loading GeoZones levels')
        if drop:
            log.info('Dropping existing levels')
            GeoLevel.drop_collection()
        log.info('Loading levels.json')
        total = 0
        with open(join(tmp, 'levels.json')) as fp:
            levels = json.load(fp)
        for level in levels:
            GeoLevel.objects.create(id=level['id'], name=level['label'],
                                    parents=level['parents'])
            total += 1
        log.info('Loaded {0} levels'.format(total))
        if drop:
            log.info('Dropping existing spatial zones')
            GeoZone.drop_collection()
        log.info('Loading zones.json')
        total = 0
        with open(join(tmp, 'zones.json')) as fp:
            geozones = json.load(fp)
        for zone in geozones['features']:
            props = zone['properties']
            GeoZone.objects.create(
                id=zone['id'],
                level=props['level'],
                code=props['code'],
                name=props['name'],
                keys=props['keys'],
                parents=props['parents'],
                population=props.get('population'),
                area=props.get('area'),
                geom=zone['geometry']
            )
            total += 1
        log.info('Loaded {0} zones'.format(total))
    finally:
        # Fix: the temporary directory used to leak whenever any step above
        # raised; always clean it up.
        log.info('Cleaning temporary working directory')
        shutil.rmtree(tmp)
def load(filename, drop=False):
    '''Load a GeoZones Bundle.

    Downloads the bundle when `filename` is an HTTP(S) URL, extracts it into
    the shared `tmp` storage, then streams `levels.msgpack` and
    `zones.msgpack` into the GeoLevel and GeoZone collections.

    :param filename: local path or URL of the bundle archive.
    :param drop: when True, drop existing collections before loading.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))
    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)
    log.info('Loading GeoZones levels')
    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()
    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Fix: start the counter at 0 so an empty stream no longer raises
    # NameError in the log call below.
    total = 0
    # Fix: open in binary mode -- msgpack is a binary format and the
    # Unpacker expects bytes, not decoded text.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(
                id=level['id'],
                name=level['label'],
                parents=level['parents']
            )
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))
    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()
    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, geozone in enumerate(unpacker, start=1):
            GeoZone.objects.create(
                id=geozone['_id'],
                level=geozone['level'],
                code=geozone['code'],
                name=geozone['name'],
                keys=geozone.get('keys'),
                parents=geozone.get('parents'),
                population=geozone.get('population'),
                dbpedia=geozone.get('dbpedia'),
                logo=geozone.get('flag') or geozone.get('blazon'),
                wikipedia=geozone.get('wikipedia'),
                area=geozone.get('area'),
                geom=geozone['geom']
            )
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))
    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
def load(filename, drop=False):
    '''Load a GeoZones Bundle.

    Downloads the bundle when `filename` is an HTTP(S) URL, extracts it into
    the shared `tmp` storage, then streams `levels.msgpack` (including each
    level's optional `admin_level`) and `zones.msgpack` into MongoDB.

    :param filename: local path or URL of the bundle archive.
    :param drop: when True, drop existing collections before loading.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))
    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)
    log.info('Loading GeoZones levels')
    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()
    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Fix: start the counter at 0 so an empty stream no longer raises
    # NameError in the log call below.
    total = 0
    # Fix: open in binary mode -- msgpack is a binary format and the
    # Unpacker expects bytes, not decoded text.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(id=level['id'],
                                    name=level['label'],
                                    parents=level['parents'],
                                    admin_level=level.get('admin_level'))
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))
    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()
    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, geozone in enumerate(unpacker, start=1):
            GeoZone.objects.create(id=geozone['_id'],
                                   level=geozone['level'],
                                   code=geozone['code'],
                                   name=geozone['name'],
                                   keys=geozone.get('keys'),
                                   parents=geozone.get('parents'),
                                   population=geozone.get('population'),
                                   dbpedia=geozone.get('dbpedia'),
                                   logo=(geozone.get('flag')
                                         or geozone.get('blazon')),
                                   wikipedia=geozone.get('wikipedia'),
                                   area=geozone.get('area'),
                                   geom=geozone['geom'])
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))
    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
def attach_zone(insee_code, organization_id_or_slug, level="fr/town"):
    """Attach a zone restricted to level for a given organization."""
    # Accepts either the database id or the slug of the organization.
    org = Organization.objects.get_by_id_or_slug(organization_id_or_slug)
    zone = GeoZone.objects(code=insee_code, level=level).first()
    message = "Attaching {organization} with {geozone.name}"
    log.info(message.format(organization=org, geozone=zone))
    org.zone = zone.id
    org.save()
    log.info("Done")
def attach_zone(insee_code, organization_id_or_slug, level='fr/town'):
    '''Attach a zone restricted to level for a given organization.'''
    # Resolve the organization first (id or slug), then the zone.
    target = Organization.objects.get_by_id_or_slug(organization_id_or_slug)
    matched_zone = GeoZone.objects(code=insee_code, level=level).first()
    log.info('Attaching {organization} with {geozone.name}'.format(
        organization=target, geozone=matched_zone))
    target.zone = matched_zone.id
    target.save()
    log.info('Done')
def check_for_territories(query):
    """Return matching fr/town zones for `query`, biggest first."""
    territories_active = current_app.config.get('ACTIVATE_TERRITORIES')
    if not query or len(query) < 4 or not territories_active:
        return GeoZone.objects.none()
    # If it's a code, try INSEE/postal, otherwise use the name.
    towns = GeoZone.objects(level='fr/town')
    if len(query) == 5 and query.isdigit():
        # Match both INSEE and postal codes.
        matching = towns(db.Q(code=query) | db.Q(keys__postal__contains=query))
    else:
        # Check names starting with query or exact match.
        matching = towns(db.Q(name__istartswith=query) |
                         db.Q(name__iexact=query))
    # Sort matching results by population and area.
    return matching.order_by('-population', '-area')
def sitemap_urls():
    """Yield a sitemap entry for every territory of every handled level."""
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return
    for level in current_app.config.get('HANDLED_LEVELS'):
        zones = GeoZone.objects(level=level).only(
            'id', 'code', 'validity', 'slug')
        for zone in zones:
            # Remove 'fr:' manually from the level.
            payload = {
                'level_name': level[3:],
                'id': zone.id,
                'code': zone.code,
                'slug': zone.slug,
                'validity': zone.validity,
            }
            territory = dict_to_namedtuple('Territory', payload)
            yield ('territories.territory', {'territory': territory},
                   None, 'weekly', 0.5)
def render_home():
    """Render the territories home page with all currently valid regions."""
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return abort(404)

    def _ascii_name(zone):
        # Sort accent-insensitively by folding names to ASCII bytes.
        return unicodedata.normalize('NFD', zone.name).encode(
            'ascii', 'ignore')

    highest_level = current_app.config['HANDLED_LEVELS'][-1]
    valid_regions = GeoZone.objects(level=highest_level).valid_at(
        date.today())
    regions = sorted(valid_regions, key=_ascii_name)
    geojson = {
        'type': 'FeatureCollection',
        'features': [region.toGeoJSON() for region in regions],
    }
    return theme.render('territories/home.html',
                        geojson=geojson, regions=regions)
def check_for_territories(query):
    """
    Return a geozone queryset of territories given the `query`.

    Results are sorted by population and area (biggest first).
    """
    if not query or not current_app.config.get('ACTIVATE_TERRITORIES'):
        return []
    term = query.lower()
    digits_only = term.isdigit()
    size = len(term)
    combined = db.Q()
    for level in current_app.config.get('HANDLED_LEVELS'):
        if level == 'country':
            continue  # Level not fully handled yet.
        criteria = db.Q(level=level)
        is_county = level == 'fr:departement'
        if size == 2 and is_county and (digits_only or term in ('2a', '2b')):
            # Counties + Corsica.
            criteria &= db.Q(code=term)
        elif size == 3 and is_county and digits_only:
            # French DROM-COM.
            criteria &= db.Q(code=term)
        elif (size == 5 and level == 'fr:commune'
                and (digits_only or term.startswith(('2a', '2b')))):
            # INSEE code then postal codes with Corsica exceptions.
            criteria &= db.Q(code=term) | db.Q(keys__postal__contains=term)
        elif size >= 4:
            # Check names starting with query or exact match.
            criteria &= (db.Q(name__istartswith=term) |
                         db.Q(name__iexact=term))
        else:
            continue
        # Meta Q object, ready to be passed to a queryset.
        combined |= criteria
    if combined.empty:
        return []
    # Sort matching results by population and area.
    return GeoZone.objects(combined).order_by('-population', '-area')
def serialize(cls, dataset):
    """Build the indexable document (plain dict) for `dataset`.

    Resolves the owning organization or user, flattens metrics, extras,
    temporal coverage and spatial zones into JSON-serializable values.
    NOTE(review): assumes every zone level is a key of `admin_levels` --
    confirm against the zones loader.
    """
    organization = None
    owner = None
    if dataset.organization:
        org = Organization.objects(id=dataset.organization.id).first()
        organization = {
            'id': str(org.id),
            'name': org.name,
            'public_service': 1 if org.public_service else 0,
            'followers': org.metrics.get('followers', 0)
        }
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
    document = {
        'id': str(dataset.id),
        'title': dataset.title,
        'description': dataset.description,
        'acronym': dataset.acronym or None,
        'url': dataset.display_url,
        'tags': dataset.tags,
        'license': getattr(dataset.license, 'id', None),
        'badges': [badge.kind for badge in dataset.badges],
        'frequency': dataset.frequency,
        'created_at': to_iso_datetime(dataset.created_at),
        'views': dataset.metrics.get('views', 0),
        'followers': dataset.metrics.get('followers', 0),
        'reuses': dataset.metrics.get('reuses', 0),
        'featured': 1 if dataset.featured else 0,
        'resources_count': len(dataset.resources),
        'organization': organization,
        'owner': str(owner.id) if owner else None,
        'format': [r.format.lower() for r in dataset.resources if r.format],
        'schema': [r.schema.get('name') for r in dataset.resources
                   if r.schema]
    }
    # Extras may hold datetimes; convert them so the document stays
    # JSON-serializable.
    extras = {}
    for key, value in dataset.extras.items():
        extras[key] = to_iso_datetime(value) if isinstance(
            value, datetime.datetime) else value
    document.update({'extras': extras})
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        start = to_iso_datetime(dataset.temporal_coverage.start)
        end = to_iso_datetime(dataset.temporal_coverage.end)
        document.update({
            'temporal_coverage_start': start,
            'temporal_coverage_end': end,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        # NOTE(review): coverage_level is computed but never used in this
        # version -- looks like leftover from the weighted variant.
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parents are indexed by id only (no label needed for filtering).
        geozones.extend([{'id': p} for p in parents])
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
        })
    return document
def process(self, item):
    """Harvest a single CKAN dataset referenced by `item`.

    Fetches the remote package, validates it against the schema, maps the
    CKAN attributes/extras onto a local Dataset and syncs its resources.

    :raises HarvestSkipException: when the remote dataset has no resource.
    :raises HarvestException: on unsupported spatial geometry types.
    """
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)
    # Fix: idiomatic type check -- `isinstance` instead of `type(...) ==`.
    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)
    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect license
    default_license = dataset.license or License.default()
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom, spatial_zone = None, None

    for extra in data['extras']:
        key = extra['key']
        value = extra['value']
        if value is None or (isinstance(value, str) and not value.strip()):
            # Skip empty extras
            continue
        elif key == 'spatial':
            # GeoJSON representation (Polygon or Point)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
            qs = qs.valid_at(datetime.now())
            if qs.count() == 1:
                spatial_zone = qs.first()
            else:
                dataset.extras['ckan:spatial-text'] = value
                log.debug('spatial-text value not handled: %s', value)
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            dataset.extras['ckan:spatial-uri'] = value
            log.debug('spatial-uri value not handled: %s', value)
        elif key == 'frequency':
            # Update frequency
            freq = frequency_from_rdf(value)
            if freq:
                dataset.frequency = freq
            elif value in UPDATE_FREQUENCIES:
                dataset.frequency = value
            else:
                dataset.extras['ckan:frequency'] = value
                log.debug('frequency value not handled: %s', value)
        # Temporal coverage start
        elif key == 'temporal_start':
            temporal_start = daterange_start(value)
        # Temporal coverage end
        elif key == 'temporal_end':
            temporal_end = daterange_end(value)
        else:
            dataset.extras[extra['key']] = value

    if spatial_geom or spatial_zone:
        dataset.spatial = SpatialCoverage()

    if spatial_zone:
        dataset.spatial.zones = [spatial_zone]

    if spatial_geom:
        # Always store a MultiPolygon, wrapping a single Polygon if needed.
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    dataset.extras['remote_url'] = self.dataset_url(data['name'])
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['ckan:source'] = data['url']
        else:
            # use declared `url` as `remote_url` if any
            dataset.extras['remote_url'] = url

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        # Broad catch is deliberate best-effort: a malformed remote ID
        # must not abort the whole dataset.
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def load(filename, drop=False):
    '''Load a GeoZones Bundle.

    Downloads the bundle when `filename` is an HTTP(S) URL, extracts it into
    the shared `tmp` storage, then streams `levels.msgpack` and
    `zones.msgpack` (header record skipped) into MongoDB, slugifying zone
    names and normalizing degenerate geometries to None.

    :param filename: local path or URL of the bundle archive.
    :param drop: when True, drop existing collections before loading.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))
    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)
    log.info('Loading GeoZones levels')
    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()
    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Fix: start counters at 0 so an empty stream no longer raises
    # NameError in the log calls below.
    total = 0
    # Fix: open in binary mode -- msgpack is a binary format and the
    # Unpacker expects bytes, not decoded text.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(
                id=level['id'],
                name=level['label'],
                parents=level['parents'],
                admin_level=level.get('admin_level')
            )
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))
    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()
    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        # Fix: `next(unpacker)` instead of Python-2-only `unpacker.next()`.
        next(unpacker)  # Skip headers.
        # Fix: enumerate started at 0, under-reporting the zone count by
        # one; start=1 reports the true number of processed records.
        for total, geozone in enumerate(unpacker, start=1):
            # Treat missing or empty GeometryCollection geometries as None.
            if not geozone.get('geom') or (
                    geozone['geom']['type'] == 'GeometryCollection'
                    and not geozone['geom']['geometries']):
                geom = None
            else:
                geom = geozone['geom']
            params = {
                'id': geozone['_id'],
                'slug': slugify.slugify(geozone['name'], separator='-'),
                'level': geozone['level'],
                'code': geozone['code'],
                'name': geozone['name'],
                'keys': geozone.get('keys'),
                'parents': geozone.get('parents'),
                'ancestors': geozone.get('ancestors'),
                'successors': geozone.get('successors'),
                'validity': geozone.get('validity'),
                'population': geozone.get('population'),
                'dbpedia': geozone.get('dbpedia'),
                'flag': geozone.get('flag'),
                'blazon': geozone.get('blazon'),
                'wikipedia': geozone.get('wikipedia'),
                'area': geozone.get('area'),
                'geom': geom
            }
            try:
                GeoZone.objects.create(**params)
            except errors.ValidationError as e:
                # Skip invalid zones but keep loading the rest.
                log.warning('Validation error (%s) for %s with %s',
                            e, geozone, params)
                continue
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))
    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
def serialize(cls, dataset):
    """Build the search document for `dataset`, including suggest weights.

    Temporal weight grows with coverage span (capped); spatial weight grows
    as coverage gets coarser. NOTE(review): assumes `admin_levels` maps
    every indexed zone level -- confirm against the zones loader.
    """
    organization = None
    owner = None
    image_url = None
    # Defaults apply when no temporal/spatial coverage is set.
    spatial_weight = DEFAULT_SPATIAL_WEIGHT
    temporal_weight = DEFAULT_TEMPORAL_WEIGHT
    if dataset.organization:
        organization = Organization.objects(
            id=dataset.organization.id).first()
        image_url = organization.logo(40, external=True)
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
        image_url = owner.avatar(40, external=True)
    certified = organization and organization.certified
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': getattr(dataset.license, 'id', None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        'tag_suggest': dataset.tags,
        'resources': [
            {
                'title': r.title,
                'description': r.description,
                'format': r.format,
                'type': r.type,
            } for r in dataset.resources],
        'format_suggest': [r.format.lower() for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) +
                [str(dataset.id)],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'acronym': dataset.acronym,
                'image_url': image_url,
            },
        },
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime(
            '%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'featured': dataset.featured,
        'from_certified': certified,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        # Coverage dates are stored as ordinals; the weight is the span in
        # years, capped at MAX_TEMPORAL_WEIGHT.
        start = dataset.temporal_coverage.start.toordinal()
        end = dataset.temporal_coverage.end.toordinal()
        temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
        document.update({
            'temporal_coverage': {'start': start, 'end': end},
            'temporal_weight': temporal_weight,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            # Keep the finest (smallest) admin level seen.
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parents are indexed by id only.
        geozones.extend([{'id': p} for p in parents])
        spatial_weight = ADMIN_LEVEL_MAX / coverage_level
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
            'spatial_weight': spatial_weight,
        })
    document['dataset_suggest']['weight'] = cls.get_suggest_weight(
        temporal_weight, spatial_weight, dataset.featured)
    if dataset.acronym:
        document['dataset_suggest']['input'].append(dataset.acronym)
    return document
def zone_labelizer(value):
    """Turn a zone identifier into its GeoZone document.

    Returns the matching GeoZone when `value` is a non-empty string id,
    and `value` unchanged otherwise (including when no zone matches).
    """
    # Fix: `basestring` only exists on Python 2 and raises NameError on
    # Python 3; `str` keeps the same intent on modern interpreters.
    if value and isinstance(value, str):
        return GeoZone.objects(id=value).first() or value
    return value
def serialize(cls, dataset):
    """Build the search document (plain dict) for `dataset`.

    Indexes core attributes, resources, producer info, raw extras, plus
    optional temporal coverage (as date ordinals) and spatial zones.
    """
    org_id = (str(dataset.organization.id)
              if dataset.organization is not None else None)
    # Pick the producer's image: organization logo, else owner avatar.
    if dataset.organization:
        image_url = dataset.organization.logo(40)
    elif dataset.owner:
        image_url = dataset.owner.avatar(40)
    else:
        image_url = None
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': (dataset.license.id
                    if dataset.license is not None else None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        'tag_suggest': dataset.tags,
        'resources': [
            {
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
        'format_suggest': [r.format.lower() for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': org_id,
        'owner': str(dataset.owner.id) if dataset.owner else None,
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) + [dataset.id],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'image_url': image_url,
            },
        },
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime(
            '%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'extras': dataset.extras,
        'featured': dataset.featured,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        document.update({
            'temporal_coverage': {
                'start': dataset.temporal_coverage.start.toordinal(),
                'end': dataset.temporal_coverage.end.toordinal(),
            }
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zones = GeoZone.objects(
            id__in=[z.id for z in dataset.spatial.zones])
        parents = set()
        geozones = []
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
        # Parents are indexed by id only.
        geozones.extend([{'id': p} for p in parents])
        document.update({
            'geozones': geozones,
            # 'geom': dataset.spatial.geom,
            'granularity': dataset.spatial.granularity,
        })
    return document
def migrate_zones_ids():
    """Migrate zones from old to new ids in datasets.

    Should only be run once with the new version of geozones w/ geohisto.
    """
    counter_datasets = 0
    counter_zones = 0
    counter_towns = 0
    counter_counties = 0
    counter_regions = 0
    counter_drom = 0
    counter_dromcom = 0
    # Special country-subset zones are resolved once up front.
    drom_zone = GeoZone.objects(id='country-subset:fr:drom').first()
    dromcom_zone = GeoZone.objects(id='country-subset:fr:dromcom').first()
    for dataset in Dataset.objects.all():
        if dataset.spatial and dataset.spatial.zones:
            counter_datasets += 1
            new_zones = []
            for zone in dataset.spatial.zones:
                # Old ids look like 'fr/town/12345'; new ones use ':'.
                if zone.id.startswith('fr/'):
                    counter_zones += 1
                    country, kind, zone_id = zone.id.split('/')
                    zone_id = zone_id.upper()  # Corsica 2a/b case.
                    if kind == 'town':
                        counter_towns += 1
                        new_zones.append(
                            GeoZone.objects(code=zone_id,
                                            level='fr:commune').valid_at(
                                date.today()).first())
                    elif kind == 'county':
                        counter_counties += 1
                        new_zones.append(
                            GeoZone.objects(code=zone_id,
                                            level='fr:departement').valid_at(
                                date.today()).first())
                    elif kind == 'region':
                        counter_regions += 1
                        # Only link to pre-2016 regions which kept the
                        # same id.
                        new_zones.append(
                            GeoZone.objects(code=zone_id,
                                            level='fr:region').first())
                    else:
                        new_zones.append(zone)
                elif zone.id.startswith('country-subset/fr'):
                    counter_zones += 1
                    subset, country, kind = zone.id.split('/')
                    if kind == 'dom':
                        counter_drom += 1
                        new_zones.append(drom_zone)
                    elif kind == 'domtom':
                        counter_dromcom += 1
                        new_zones.append(dromcom_zone)
                else:
                    # Already-migrated or unknown ids are kept as-is.
                    new_zones.append(zone)
            # Unresolved lookups (None) are silently dropped here.
            dataset.update(spatial=SpatialCoverage(
                zones=[z.id for z in new_zones if z]))
    print('{} datasets and {} zones affected.'.format(counter_datasets,
                                                      counter_zones))
    print('{} town zones, {} county zones and {} region zones updated.'.format(
        counter_towns, counter_counties, counter_regions))
    print('{} DROM zones, {} DROM-COM zones updated.'.format(
        counter_drom, counter_dromcom))
    log.info('Done')
def sitemap_urls():
    """Yield a sitemap entry per French town when territories are active."""
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return
    towns = GeoZone.objects(level='fr/town').only('code')
    for town in towns:
        yield ('territories.territory', {'territory': town},
               None, "weekly", 0.5)
def sitemap_urls():
    """Yield a sitemap entry for every French town territory page."""
    for zone in GeoZone.objects(level='fr/town').only('code'):
        entry = {'territory': zone}
        yield ('territories.territory', entry, None, "weekly", 0.5)
def migrate_zones_ids():
    """Migrate zones from old to new ids in datasets.

    Should only be run once with the new version of geozones w/ geohisto.
    """
    counter = Counter()
    # Special country-subset zones are resolved once up front.
    drom_zone = GeoZone.objects(id='country-subset:fr:drom').first()
    dromcom_zone = GeoZone.objects(id='country-subset:fr:dromcom').first()
    # Iter over datasets with zones
    for dataset in Dataset.objects(spatial__zones__gt=[]):
        counter['datasets'] += 1
        new_zones = []
        for zone in dataset.spatial.zones:
            # Old ids look like 'fr/town/12345'; new ones use ':'.
            if zone.id.startswith('fr/'):
                counter['zones'] += 1
                country, kind, zone_id = zone.id.split('/')
                zone_id = zone_id.upper()  # Corsica 2a/b case.
                if kind == 'town':
                    counter['towns'] += 1
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:commune').valid_at(
                            date.today()).first())
                elif kind == 'county':
                    counter['counties'] += 1
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:departement').valid_at(
                            date.today()).first())
                elif kind == 'region':
                    counter['regions'] += 1
                    # Only link to pre-2016 regions which kept the same id.
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:region').first())
                elif kind == 'epci':
                    counter['epcis'] += 1
                    # EPCIs are matched at the dataset's creation date.
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:epci').valid_at(
                            dataset.created_at.date()).first())
                else:
                    new_zones.append(zone)
            elif zone.id.startswith('country-subset/fr'):
                counter['zones'] += 1
                subset, country, kind = zone.id.split('/')
                if kind == 'dom':
                    counter['drom'] += 1
                    new_zones.append(drom_zone)
                elif kind == 'domtom':
                    counter['dromcom'] += 1
                    new_zones.append(dromcom_zone)
            elif zone.id.startswith('country/'):
                counter['zones'] += 1
                counter['countries'] += 1
                # Countries only need the separator swapped.
                new_zones.append(zone.id.replace('/', ':'))
            elif zone.id.startswith('country-group/'):
                counter['zones'] += 1
                counter['countrygroups'] += 1
                new_zones.append(zone.id.replace('/', ':'))
            else:
                new_zones.append(zone)
        # new_zones mixes documents and raw id strings; unresolved (None)
        # entries are dropped.
        dataset.update(spatial=SpatialCoverage(
            granularity=dataset.spatial.granularity,
            zones=[getattr(z, 'id', z) for z in new_zones if z]))
    log.info(Formatter().vformat('''Summary
    Processed {zones} zones in {datasets} datasets:
    - {countrygroups} country groups (World/UE)
    - {countries} countries
    - France:
        - {regions} regions
        - {counties} counties
        - {epcis} EPCIs
        - {towns} towns
        - {drom} DROM
        - {dromcom} DROM-COM
    ''', (), counter))
    log.info('Done')
def sitemap_urls():
    """Yield territory sitemap entries when the feature is enabled."""
    enabled = current_app.config.get('ACTIVATE_TERRITORIES')
    if not enabled:
        return
    for zone in GeoZone.objects(level='fr/town').only('code'):
        yield ('territories.territory', {'territory': zone}, None,
               "weekly", 0.5)
def serialize(cls, dataset):
    """Build the search document for `dataset`, including suggest weights.

    Temporal weight grows with coverage span (capped); spatial weight grows
    as coverage gets coarser. NOTE(review): assumes `admin_levels` maps
    every indexed zone level -- confirm against the zones loader.
    """
    organization = None
    owner = None
    image_url = None
    # Defaults apply when no temporal/spatial coverage is set.
    spatial_weight = DEFAULT_SPATIAL_WEIGHT
    temporal_weight = DEFAULT_TEMPORAL_WEIGHT
    if dataset.organization:
        organization = Organization.objects(
            id=dataset.organization.id).first()
        image_url = organization.logo(40, external=True)
    elif dataset.owner:
        owner = User.objects(id=dataset.owner.id).first()
        image_url = owner.avatar(40, external=True)
    certified = organization and organization.certified
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': getattr(dataset.license, 'id', None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        'tag_suggest': dataset.tags,
        'resources': [{
            'title': r.title,
            'description': r.description,
            'format': r.format,
        } for r in dataset.resources],
        'format_suggest': [r.format.lower() for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': str(organization.id) if organization else None,
        'owner': str(owner.id) if owner else None,
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) + [dataset.id],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'acronym': dataset.acronym,
                'image_url': image_url,
            },
        },
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime(
            '%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'featured': dataset.featured,
        'from_certified': certified,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        # Coverage dates stored as ordinals; weight is the span in years,
        # capped at MAX_TEMPORAL_WEIGHT.
        start = dataset.temporal_coverage.start.toordinal()
        end = dataset.temporal_coverage.end.toordinal()
        temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
        document.update({
            'temporal_coverage': {
                'start': start,
                'end': end
            },
            'temporal_weight': temporal_weight,
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zone_ids = [z.id for z in dataset.spatial.zones]
        zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
        parents = set()
        geozones = []
        coverage_level = ADMIN_LEVEL_MAX
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
            # Keep the finest (smallest) admin level seen.
            coverage_level = min(coverage_level, admin_levels[zone.level])
        # Parents are indexed by id only.
        geozones.extend([{'id': p} for p in parents])
        spatial_weight = ADMIN_LEVEL_MAX / coverage_level
        document.update({
            'geozones': geozones,
            'granularity': dataset.spatial.granularity,
            'spatial_weight': spatial_weight,
        })
    document['dataset_suggest']['weight'] = cls.get_suggest_weight(
        temporal_weight, spatial_weight, dataset.featured)
    if dataset.acronym:
        document['dataset_suggest']['input'].append(dataset.acronym)
    return document
def serialize(cls, dataset):
    """Build the search document (plain dict) for `dataset`.

    Indexes core attributes, resources, producer info, raw extras, plus
    optional temporal coverage (as date ordinals) and spatial zones.
    """
    org_id = (str(dataset.organization.id)
              if dataset.organization is not None else None)
    # Pick the producer's image: organization logo, else owner avatar.
    if dataset.organization:
        image_url = dataset.organization.logo(40)
    elif dataset.owner:
        image_url = dataset.owner.avatar(40)
    else:
        image_url = None
    document = {
        'title': dataset.title,
        'description': dataset.description,
        'license': (dataset.license.id
                    if dataset.license is not None else None),
        'tags': dataset.tags,
        'badges': [badge.kind for badge in dataset.badges],
        'tag_suggest': dataset.tags,
        'resources': [{
            'title': r.title,
            'description': r.description,
            'format': r.format,
        } for r in dataset.resources],
        'format_suggest': [r.format.lower() for r in dataset.resources
                           if r.format],
        'frequency': dataset.frequency,
        'organization': org_id,
        'owner': str(dataset.owner.id) if dataset.owner else None,
        'dataset_suggest': {
            'input': cls.completer_tokenize(dataset.title) + [dataset.id],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'image_url': image_url,
            },
        },
        'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        'last_modified': dataset.last_modified.strftime(
            '%Y-%m-%dT%H:%M:%S'),
        'metrics': dataset.metrics,
        'extras': dataset.extras,
        'featured': dataset.featured,
    }
    if (dataset.temporal_coverage is not None
            and dataset.temporal_coverage.start
            and dataset.temporal_coverage.end):
        document.update({
            'temporal_coverage': {
                'start': dataset.temporal_coverage.start.toordinal(),
                'end': dataset.temporal_coverage.end.toordinal(),
            }
        })
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zones = GeoZone.objects(
            id__in=[z.id for z in dataset.spatial.zones])
        parents = set()
        geozones = []
        for zone in zones:
            geozones.append({
                'id': zone.id,
                'name': zone.name,
                'keys': zone.keys_values
            })
            parents |= set(zone.parents)
        # Parents are indexed by id only.
        geozones.extend([{'id': p} for p in parents])
        document.update({
            'geozones': geozones,
            # 'geom': dataset.spatial.geom,
            'granularity': dataset.spatial.granularity,
        })
    return document
def serialize(cls, dataset):
    """Build the search document (plain dict) for `dataset`.

    This variant also indexes the supplier organization when it differs
    from the producer organization.
    """
    org_id = str(dataset.organization.id) \
        if dataset.organization is not None else None
    supplier_id = str(dataset.supplier.id) \
        if dataset.supplier is not None else None
    # Only keep the supplier when it differs from the organization.
    supplier_id = supplier_id if supplier_id != org_id else None
    # Pick the producer's image: organization logo, else owner avatar.
    if dataset.organization:
        image_url = dataset.organization.logo(40)
    elif dataset.owner:
        image_url = dataset.owner.avatar(40)
    else:
        image_url = None
    document = {
        "title": dataset.title,
        "description": dataset.description,
        "license": (dataset.license.id
                    if dataset.license is not None else None),
        "tags": dataset.tags,
        "badges": [badge.kind for badge in dataset.badges],
        "tag_suggest": dataset.tags,
        "resources": [
            {"title": r.title,
             "description": r.description,
             "format": r.format}
            for r in dataset.resources
        ],
        "format_suggest": [r.format.lower() for r in dataset.resources
                           if r.format],
        "frequency": dataset.frequency,
        "organization": org_id,
        "owner": str(dataset.owner.id) if dataset.owner else None,
        "supplier": supplier_id,
        "dataset_suggest": {
            "input": cls.completer_tokenize(dataset.title) + [dataset.id],
            "output": dataset.title,
            "payload": {"id": str(dataset.id),
                        "slug": dataset.slug,
                        "image_url": image_url},
        },
        "created": dataset.created_at.strftime("%Y-%m-%dT%H:%M:%S"),
        "last_modified": dataset.last_modified.strftime(
            "%Y-%m-%dT%H:%M:%S"),
        "metrics": dataset.metrics,
        "extras": dataset.extras,
        "featured": dataset.featured,
    }
    if dataset.temporal_coverage is not None \
            and dataset.temporal_coverage.start \
            and dataset.temporal_coverage.end:
        document.update(
            {
                "temporal_coverage": {
                    "start": dataset.temporal_coverage.start.toordinal(),
                    "end": dataset.temporal_coverage.end.toordinal(),
                }
            }
        )
    if dataset.spatial is not None:
        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        zones = GeoZone.objects(
            id__in=[z.id for z in dataset.spatial.zones])
        parents = set()
        geozones = []
        for zone in zones:
            geozones.append({"id": zone.id,
                             "name": zone.name,
                             "code": zone.code})
            parents |= set(zone.parents)
        # Parents are indexed by id only.
        geozones.extend([{"id": p} for p in parents])
        document.update(
            {
                "geozones": geozones,
                # 'geom': dataset.spatial.geom,
                "granularity": dataset.spatial.granularity,
            }
        )
    return document
def zone_labelizer(value):
    """Resolve a zone id string to its GeoZone; pass GeoZone through.

    Any other type yields None (implicit return).
    """
    if isinstance(value, GeoZone):
        return value
    if isinstance(value, str):
        return GeoZone.objects(id=value).first()