forked from etalab/geozones
/
geo.py
203 lines (174 loc) · 7.63 KB
/
geo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# -*- coding: utf-8 -*-
from os.path import join, basename
from zipfile import ZipFile
import fiona
from fiona.crs import to_string
from shapely.geometry import shape, MultiPolygon
from shapely.ops import cascaded_union
from tools import warning, error, info, success, extract_meta_from_headers
class Level(object):
    '''
    Handle a level declaration and its processing.

    A level is one node of the territories hierarchy (e.g. country group,
    country, country subset). It knows its parents and children and carries
    the extractors, postprocessors and aggregates used to build its zones.
    '''
    def __init__(self, id, label, *parents):
        # TODO: handle multiple parents
        self.id = id
        self.label = label
        self.parents = parents
        self.children = []
        # (url, func) pairs registered through the decorators below.
        self.extractors = []
        self.postprocessors = []
        # (id, label, zones, properties) tuples registered via aggregate().
        self.aggregates = []
        # Register this level as a child of each declared parent.
        for parent in parents:
            parent.children.append(self)

    def extractor(self, url, simplify=None):
        '''
        Register a dataset and its extractor.

        The decorated function should have the following signature:
        ``function(polygon)`` where polygon will be a Shapely extracted
        polygon with GeoJSON interface.
        It should return a dictionary with extracted attributes, at least:

        - name
        - code

        The simplify parameter is documented here (we use `0.005` for France):
        http://toblerity.org/shapely/manual.html#object.simplify
        '''
        def wrapper(func):
            # Stash the tolerance on the function so process_dataset can use it.
            func.simplify = simplify
            self.extractors.append((url, func))
            return func
        return wrapper

    def postprocessor(self, url=None):
        '''
        Register a non geospatial dataset and its processor.

        The decorated function is called as ``function(db, filepath)`` where
        ``filepath`` is ``None`` when no ``url`` was given.
        '''
        def wrapper(func):
            self.postprocessors.append((url, func))
            return func
        return wrapper

    @property
    def urls(self):
        '''The required datasets URLs list'''
        # Postprocessors may have no URL (url=None); filter those out.
        return [url for url, _ in self.extractors + self.postprocessors if url]

    def aggregate(self, id, label, zones, **properties):
        '''Register an aggregate for this level'''
        self.aggregates.append((id, label, zones, properties))

    def traverse(self):
        '''Deep tree traversal (breadth-first), yielding this level first'''
        levels = [self]
        children = []
        while len(levels) > 0:
            for level in levels:
                yield level
                children.extend(level.children)
            levels, children = children, []

    def load(self, workdir, db):
        '''Extract and store territories from every registered dataset for this level'''
        loaded = 0
        for url, extractor in self.extractors:
            loaded += self.process_dataset(workdir, db, url, extractor)
        success('Loaded {0} zones for level {1}'.format(loaded, self.id))
        return loaded

    def process_dataset(self, workdir, db, url, extractor):
        '''Extract territories from a given file for a given level with a given extractor function'''
        loaded = 0
        filename = join(workdir, basename(url))
        # Identify the shapefile to avoid multiple file error on GDAL 2
        with ZipFile(filename) as z:
            candidates = [n for n in z.namelist() if n.endswith('.shp')]
            if len(candidates) != 1:
                raise ValueError('Unable to find a unique shapefile into {0}'.format(filename))
            shp = candidates[0]
        with fiona.open('/{0}'.format(shp), vfs='zip://{0}'.format(filename), encoding='utf8') as collection:
            info('Extracting {0} elements from {1} ({2} {3})'.format(
                len(collection), basename(filename), collection.driver, to_string(collection.crs)
            ))
            for polygon in collection:
                try:
                    zone = extractor(polygon)
                    if not zone:
                        continue
                    # Drop null values so they are not persisted as keys.
                    zone['keys'] = dict((k, v) for k, v in zone.get('keys', {}).items() if v is not None)
                    geom = shape(polygon['geometry'])
                    if extractor.simplify:
                        geom = geom.simplify(extractor.simplify)
                    # Normalize storage to MultiPolygon; reject anything else.
                    if geom.geom_type == 'Polygon':
                        geom = MultiPolygon([geom])
                    elif geom.geom_type != 'MultiPolygon':
                        warning('Unsupported geometry type "{0}" for "{1}"'.format(geom.geom_type, zone['name']))
                        continue
                    zoneid = '/'.join((self.id, zone['code']))
                    zone.update(_id=zoneid, level=self.id, geom=geom.__geo_interface__)
                    db.find_one_and_replace({'_id': zoneid}, zone, upsert=True)
                    loaded += 1
                except Exception as e:
                    # Best-effort: a single bad polygon must not abort the dataset.
                    error('Error extracting polygon {0}: {1}', polygon['properties'], str(e))
        info('Loaded {0} zones for level {1} from file {2}'.format(loaded, self.id, filename))
        return loaded

    def build_aggregates(self, db):
        '''Build and store every aggregate registered on this level; return the count'''
        processed = 0
        for code, name, zones, properties in self.aggregates:
            info('Building aggregate "{0}" (level={1}, code={2})'.format(name, self.id, code))
            zone = self.build_aggregate(code, name, zones, properties, db)
            db.find_one_and_replace({'_id': zone['_id']}, zone, upsert=True)
            processed += 1
        return processed

    def build_aggregate(self, code, name, zones, properties, db):
        '''
        Build a single aggregate zone document from the given zone ids.

        A zone id ending with ``/*`` is a wildcard resolved to every stored
        zone of that level (aggregated recursively). Invalid or empty
        geometries are skipped with a warning. Population and area are the
        sums over the contributing zones that define them.
        '''
        geoms = []
        populations = []
        areas = []
        for zoneid in zones:
            # Resolve wildcard
            if zoneid.endswith('/*'):
                level = zoneid.replace('/*', '')
                ids = db.distinct('_id', {'level': level})
                resolved = self.build_aggregate(code, name, ids, properties, db)
                geoms.append(shape(resolved['geom']))
                if resolved.get('population'):
                    populations.append(resolved['population'])
                if resolved.get('area'):
                    areas.append(resolved['area'])
            else:
                zone = db.find_one({'_id': zoneid})
                if not zone:
                    warning('Zone {0} not found'.format(zoneid))
                    continue
                shp = shape(zone['geom'])
                if not shp.is_valid:
                    warning('Skipping invalid polygon for {0}'.format(zone['name']))
                    continue
                if shp.is_empty:
                    warning('Skipping empty polygon for {0}'.format(zone['name']))
                    continue
                geoms.append(shp)
                if zone.get('population'):
                    populations.append(zone['population'])
                if zone.get('area'):
                    areas.append(zone['area'])
        # NOTE(review): cascaded_union was removed in Shapely 2.0 in favor of
        # unary_union; kept here because the file targets the older Shapely API.
        geom = cascaded_union(geoms)
        if geom.geom_type == 'Polygon':
            geom = MultiPolygon([geom])
        data = {
            '_id': '/'.join((self.id, code)),
            'code': code,
            'level': self.id,
            'name': name,
            'population': sum(populations),
            'area': sum(areas),
            'geom': geom.__geo_interface__
        }
        data.update(properties)
        return data

    def postprocess(self, workdir, db, only=None):
        '''
        Perform postprocessing.

        When ``only`` is given, run just the postprocessor whose function
        name matches it.
        '''
        for url, processor in self.postprocessors:
            if only is not None and processor.__name__ != only:
                continue
            filepath = None
            if url:
                # The local filename comes from the remote headers metadata.
                filename, _ = extract_meta_from_headers(url)
                filepath = join(workdir, filename)
            processor(db, filepath)
# Force translatables string extraction
# `_` is an identity marker: it lets gettext-style tooling collect the
# labels below for translation without needing a real i18n setup at import.
_ = lambda s: s # noqa
# Register first levels
# The hierarchy is: country-group -> country -> country-subset.
root = country_group = Level('country-group', _('Country group'))
country = Level('country', _('Country'), country_group)
country_subset = Level('country-subset', _('Country subset'), country)