def create_from_osm_file(cls, filename, output_dir, precision=None):
    '''
    Build a point index for coarse-grained reverse geocoding from an OSM
    file (planet or some other bounds) containing relations and their
    dependencies.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other bounds.
    '''
    if precision is None:
        precision = cls.GEOHASH_PRECISION

    index = cls(save_dir=output_dir, precision=precision)

    num_points = 0
    for element_id, props, deps in parse_osm(filename):
        # Normalize all keys/values to unicode
        props = dict((safe_decode(key), safe_decode(value))
                     for key, value in six.iteritems(props))
        node_id = long(element_id.split(':')[-1])

        lat, lon = props.get('lat'), props.get('lon')
        if lat is None or lon is None:
            continue
        lat, lon = latlon_to_decimal(lat, lon)
        if lat is None or lon is None:
            continue

        # Nudge points sitting on the antimeridian back into range
        if isclose(lon, 180.0):
            lon = 179.999

        # Keep only whitelisted properties: exact keys, or keys whose
        # namespace prefix matches a "<prefix>:*" pattern
        filtered = {}
        for key, value in six.iteritems(props):
            if key in ('id', 'type') or key in cls.include_property_patterns:
                filtered[key] = value
            elif six.u(':') in key and six.u('{}:*').format(key.split(six.u(':'), 1)[0]) in cls.include_property_patterns:
                filtered[key] = value
        props = filtered

        props['type'] = 'node'
        props['id'] = node_id

        index.add_point(lat, lon, props)
        if num_points % 1000 == 0 and num_points > 0:
            print('did {} points'.format(num_points))
        num_points += 1

    return index
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)

    ensure_dir(scratch_dir)

    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)

    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    # Build the two candidate polygon sets: Quattroshapes and Zetashapes
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # Feed every polygon name into the IDF index so that common tokens
    # get down-weighted during name comparison
    for idx in (zs, qs):
        for i, (props, poly) in enumerate(idx.polygons):
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)

    # Also feed OSM name-tag values into the IDF index.
    # NOTE(review): attrs.iteritems() is Python 2-only; sibling blocks in
    # this file use six.iteritems for the same purpose
    for key, attrs, deps in parse_osm(filename):
        for k, v in attrs.iteritems():
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # Track which polygons get matched to an OSM point
    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for node_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue

        is_neighborhood = attrs.get('place') == 'neighbourhood'

        ranks = []
        osm_names = []

        # Collect all name variants for this point: the plain name tags
        # plus any language-suffixed keys (e.g. name:en)
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)

        for name_key in OSM_NAME_TAGS:
            osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])

        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)
            if candidates:
                max_sim = 0.0
                arg_max = None

                # Cache of regex-normalized polygon names keyed by candidate index
                normalized_qs_names = {}

                for osm_name in osm_names:
                    contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts for c in safe_decode(osm_name)))

                    for i in candidates:
                        props, poly = idx.polygons[i]
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name

                        level = props.get(QuattroshapesReverseGeocoder.LEVEL)
                        # A point tagged place=neighbourhood should only be
                        # matched against neighborhood-level polygons
                        if is_neighborhood and level != 'neighborhood':
                            continue

                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                        if sim > max_sim:
                            max_sim = sim
                            arg_max = (max_sim, props, poly.context, idx, i)

                if arg_max:
                    ranks.append(arg_max)

        # Keep the single best-scoring candidate across both polygon sets
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]

            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'

            attrs['source'] = 'osm'

            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    # Add unmatched neighborhood-level polygons directly; unmatched
    # local-admin polygons are dropped
    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
def create_from_osm_file(cls, filename, output_dir, index_filename=None, polys_filename=DEFAULT_POLYS_FILENAME):
    '''
    Given an OSM file (planet or some other bounds) containing relations
    and their dependencies, create an R-tree index for coarse-grained
    reverse geocoding.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other bounds.
    '''
    # NOTE(review): polys_filename is accepted but never used in this body —
    # presumably consumed by cls(...) elsewhere or vestigial; confirm
    index = cls(save_dir=output_dir, index_filename=index_filename)

    reader = cls.polygon_reader(filename)
    polygons = reader.polygons()

    logger = logging.getLogger('osm.reverse_geocode')

    for element_id, props, admin_center, outer_polys, inner_polys in polygons:
        # Keep only whitelisted properties: exact keys or keys whose
        # namespace prefix matches a "<prefix>:*" pattern
        props = {k: v for k, v in six.iteritems(props)
                 if k in cls.include_property_patterns
                 or (six.u(':') in k and six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}

        id_type, element_id = osm_type_and_id(element_id)

        test_point = None
        if admin_center:
            admin_center_props = {k: v for k, v in six.iteritems(admin_center)
                                  if k in ('id', 'type', 'lat', 'lon')
                                  or k in cls.include_property_patterns
                                  or (six.u(':') in k and six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
            if cls.fix_invalid_polygons:
                # Use the admin_centre node as a known-interior test point
                # when repairing invalid polygons
                center_lat, center_lon = latlon_to_decimal(admin_center_props['lat'], admin_center_props['lon'])
                test_point = Point(center_lon, center_lat)
            props['admin_center'] = admin_center_props

        if inner_polys and not outer_polys:
            logger.warn('inner polygons with no outer')
            continue

        if len(outer_polys) == 1 and not inner_polys:
            # Fast path: single exterior ring, no holes
            poly = cls.to_polygon(outer_polys[0])
            if poly is None or not poly.bounds or len(poly.bounds) != 4:
                continue
            if poly.type != 'MultiPolygon':
                index.index_polygon(poly)
            else:
                for p in poly:
                    index.index_polygon(p)
        else:
            multi = []
            inner = []

            # Validate inner polygons (holes)
            for p in inner_polys:
                poly = cls.to_polygon(p)
                if poly is None or not poly.bounds or len(poly.bounds) != 4 or not poly.is_valid:
                    continue
                if poly.type != 'MultiPolygon':
                    inner.append(poly)
                else:
                    inner.extend(poly)

            # Validate outer polygons
            for p in outer_polys:
                poly = cls.to_polygon(p, test_point=test_point)
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue

                interior = []
                try:
                    # Figure out which outer polygon contains each inner polygon
                    interior = [p2 for p2 in inner if poly.contains(p2)]
                except TopologicalError:
                    continue

                if interior:
                    # Polygon with holes constructor
                    poly = cls.to_polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior], test_point=test_point)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4:
                        continue

                # R-tree only stores the bounding box, so add the whole polygon
                if poly.type != 'MultiPolygon':
                    index.index_polygon(poly)
                    multi.append(poly)
                else:
                    for p in poly:
                        index.index_polygon(p)
                    multi.extend(poly)

            if len(multi) > 1:
                poly = MultiPolygon(multi)
            elif multi:
                poly = multi[0]
            else:
                # No valid outer polygon survived validation
                continue

        if index.simplify_polygons:
            poly = index.simplify_polygon(poly)

        index.add_polygon(poly, props)

    return index
def polygons(self, properties_only=False):
    '''
    Generator which yields tuples like:

    (element_id, properties, admin_center, outer_polygons, inner_polygons)

    or, when properties_only=True:

    (element_id, properties, admin_center)

    element_id is the OSM way/relation id plus WAY_OFFSET/RELATION_OFFSET
    respectively. A polygon is a list of coordinate tuples, suitable for
    passing to shapely's Polygon constructor but may be used for other
    purposes. outer_polygons is a list of the exterior polygons for this
    boundary. inner_polygons is a list of "holes" in the exterior polygons
    although donuts and donut-holes need to be matched by the caller using
    something like shapely's contains.

    Appears to rely on parse_osm emitting all nodes before ways before
    relations: node arrays are built up, binary-searched by ways, and
    freed as soon as the first relation is seen.
    '''
    i = 0
    for element_id, props, deps in parse_osm(self.filename, dependencies=True):
        # Normalize all keys/values to unicode
        props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
        if element_id.startswith('node'):
            node_id = long(element_id.split(':')[-1])
            lat = props.get('lat')
            lon = props.get('lon')
            if lat is None or lon is None:
                continue
            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue
            # Clamp poles/antimeridian slightly inward
            if isclose(lat, 90.0):
                lat = 89.999
            if isclose(lon, 180.0):
                lon = 179.999

            # Named place nodes are retained so relations can later
            # reference them via admin_centre/label roles
            if 'name' in props and 'place' in props:
                self.nodes[node_id] = props

            # Nodes are stored in a sorted array, coordinate indices are simply
            # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
            # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
            self.coords.append(lon)
            self.coords.append(lat)
            self.node_ids.append(node_id)
        elif element_id.startswith('way'):
            way_id = long(element_id.split(':')[-1])
            # Get node indices by binary search
            try:
                node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
            except ValueError:
                # One of this way's nodes was never seen; skip the way
                continue
            # Way ids stored in a sorted array
            self.way_ids.append(way_id)
            # way_deps is the list of dependent node ids
            # way_coords is a copy of coords indexed by way ids
            for node_id, node_index in izip(deps, node_indices):
                self.way_deps.append(node_id)
                self.way_coords.append(self.coords[node_index * 2])
                self.way_coords.append(self.coords[node_index * 2 + 1])
            self.way_indptr.append(len(self.way_deps))

            # A closed way (first node == last node) that passes the
            # polygon filter is itself a boundary polygon
            if deps[0] == deps[-1] and self.include_polygon(props):
                way_id_offset = WAY_OFFSET + way_id
                if not properties_only:
                    outer_polys = self.create_polygons([way_id])
                    inner_polys = []
                    yield way_id_offset, props, {}, outer_polys, inner_polys
                else:
                    yield way_id_offset, props, {}
        elif element_id.startswith('relation'):
            # First relation seen: node arrays are no longer needed, free them
            if self.node_ids is not None:
                self.node_ids = None
            if self.coords is not None:
                self.coords = None
            relation_id = long(element_id.split(':')[-1])
            # Skip empty, filtered-out, and multilinestring relations
            if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
                continue
            outer_ways = []
            inner_ways = []
            admin_centers = []
            for elem_id, elem_type, role in deps:
                if role in ('outer', '') and elem_type == 'way':
                    # Empty role is treated as outer
                    outer_ways.append(elem_id)
                elif role == 'inner' and elem_type == 'way':
                    inner_ways.append(elem_id)
                elif role == 'admin_centre' and elem_type == 'node':
                    val = self.nodes.get(long(elem_id))
                    if val is not None:
                        val['type'] = 'node'
                        val['id'] = long(elem_id)
                        admin_centers.append(val)
                elif role == 'label' and elem_type == 'node':
                    # Merge label-node properties into the relation when the
                    # names match (case-insensitively), without overwriting
                    val = self.nodes.get(long(elem_id))
                    if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
                        props.update({k: v for k, v in six.iteritems(val) if k not in props})

            # Only use the admin center when it is unambiguous
            admin_center = {}
            if len(admin_centers) == 1:
                admin_center = admin_centers[0]

            relation_id_offset = RELATION_OFFSET + relation_id

            if not properties_only:
                outer_polys = self.create_polygons(outer_ways)
                inner_polys = self.create_polygons(inner_ways)
                yield relation_id_offset, props, admin_center, outer_polys, inner_polys
            else:
                yield relation_id_offset, props, admin_center

        if i % 1000 == 0 and i > 0:
            self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
        i += 1
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)

    logger = logging.getLogger('neighborhoods')

    qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    # Build the candidate polygon sets: ClickThatHood, OSM boundaries,
    # and Quattroshapes
    logger.info('Creating ClickThatHood neighborhoods')
    cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
    logger.info('Creating OSM neighborhoods')
    osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)

    country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

    osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
    osm_admin_rtree.cache_size = 1000

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # Feed every polygon name into the IDF index so common tokens get
    # down-weighted during name comparison
    for idx in (cth, qs, osmn):
        for i in xrange(idx.i):
            props = idx.get_properties(i)
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)

    # Also feed all OSM name-tag values into the IDF index
    for key, attrs, deps in parse_osm(filename):
        for k, v in six.iteritems(attrs):
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # OSM neighborhood polygons are added directly, no matching required
    for i in six.moves.xrange(osmn.i):
        props = osmn.get_properties(i)
        poly = osmn.get_polygon(i)
        props['source'] = 'osm'
        props['component'] = AddressFormatter.SUBURB
        props['polygon_type'] = 'neighborhood'
        index.index_polygon(poly.context)
        index.add_polygon(poly.context, props)

    # Track which polygons get matched to an OSM point
    qs.matched = [False] * qs.i
    cth.matched = [False] * cth.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
    num_polys = 0
    for element_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue

        id_type, element_id = element_id.split(':')
        element_id = long(element_id)
        # BUGFIX: these previously wrote to a stale `props` dict left over
        # from an earlier loop; the element being annotated is `attrs`
        attrs['type'] = id_type
        attrs['id'] = element_id

        possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
        is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)

        country, candidate_languages = country_rtree.country_and_languages(lat, lon)

        component_name = None
        component_name = osm_address_components.component_from_properties(country, attrs)

        ranks = []
        osm_names = []

        # Collect all name variants for this point: plain name tags plus
        # language-suffixed keys (e.g. name:en)
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)

        for name_key in OSM_NAME_TAGS:
            osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])

        for idx in (cth, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if candidates:
                max_sim = 0.0
                arg_max = None

                # Cache of regex-normalized polygon names keyed by candidate index
                normalized_qs_names = {}

                for osm_name in osm_names:
                    contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts for c in safe_decode(osm_name)))

                    for i in candidates:
                        props = idx.get_properties(i)
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name

                        # A neighborhood point should only match
                        # neighborhood-level Quattroshapes polygons
                        if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                            continue

                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                        if sim > max_sim:
                            max_sim = sim
                            poly = idx.get_polygon(i)
                            arg_max = (max_sim, props, poly.context, idx, i)

                if arg_max:
                    ranks.append(arg_max)

        # Keep the single best-scoring candidate across both polygon sets
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]

            existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
            existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)

            # Skip points already covered by a same-named OSM admin or
            # neighborhood boundary at city level or below
            skip_node = False
            for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                for poly_index, osm_props in enumerate(boundaries):
                    containing_component = None
                    name = osm_props.get('name')
                    # Only exact name matches here since we're comparing OSM to OSM
                    if name and name.lower() != attrs.get('name', '').lower():
                        continue
                    if boundaries is existing_neighborhood_boundaries:
                        containing_component = AddressFormatter.SUBURB
                        skip_node = True
                        break
                    else:
                        containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]
                        containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)
                        if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
                            skip_node = True
                            break
                if skip_node:
                    break

            # Skip this element
            if skip_node:
                continue

            if idx is cth:
                if props['component'] == AddressFormatter.SUBURB:
                    attrs['polygon_type'] = 'neighborhood'
                elif props['component'] == AddressFormatter.CITY_DISTRICT:
                    attrs['polygon_type'] = 'local_admin'
                else:
                    continue
                source = 'osm_cth'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'

            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
            component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)

            attrs['component'] = component
            attrs['source'] = source

            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    # Add unmatched neighborhood-level polygons directly; unmatched
    # local-admin polygons are dropped
    for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
        for i in xrange(idx.i):
            props = idx.get_properties(i)
            poly = idx.get_polygon(i)
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is cth:
                component = props['component']
                if component == AddressFormatter.SUBURB:
                    props['polygon_type'] = 'neighborhood'
                elif component == AddressFormatter.CITY_DISTRICT:
                    props['polygon_type'] = 'local_admin'
                else:
                    continue
            elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                component = AddressFormatter.SUBURB
                name = props.get('name')
                if not name:
                    continue
                for pattern, repl in cls.regex_replacements:
                    name = pattern.sub(repl, name)
                props['name'] = name
                if cls.quattroshapes_city_district_regex.match(name):
                    component = AddressFormatter.CITY_DISTRICT
                props['component'] = component
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
def create_from_osm_file(cls, filename, output_dir, index_filename=None, polys_filename=DEFAULT_POLYS_FILENAME):
    '''
    Build an R-tree index for coarse-grained reverse geocoding from an OSM
    file (planet or some other bounds) containing relations and their
    dependencies.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other bounds.
    '''
    # NOTE(review): polys_filename is accepted but unused in this body
    index = cls(save_dir=output_dir, index_filename=index_filename)

    logger = logging.getLogger('osm.reverse_geocode')

    reader = cls.polygon_reader(filename)

    for element_id, props, admin_center, outer_polys, inner_polys in reader.polygons():
        # Keep only whitelisted properties: exact keys or keys whose
        # namespace prefix matches a "<prefix>:*" pattern
        props = {key: value for key, value in six.iteritems(props)
                 if key in cls.include_property_patterns or
                 (six.u(':') in key and
                  six.u('{}:*').format(key.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}

        id_type, element_id = osm_type_and_id(element_id)

        test_point = None
        if admin_center:
            admin_center_props = {key: value for key, value in six.iteritems(admin_center)
                                  if key in ('id', 'type', 'lat', 'lon') or
                                  key in cls.include_property_patterns or
                                  (six.u(':') in key and
                                   six.u('{}:*').format(key.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
            if cls.fix_invalid_polygons:
                # Use the admin_centre node as a known-interior test point
                # when repairing invalid polygons
                center_lat, center_lon = latlon_to_decimal(admin_center_props['lat'], admin_center_props['lon'])
                test_point = Point(center_lon, center_lat)
            props['admin_center'] = admin_center_props

        if inner_polys and not outer_polys:
            logger.warn('inner polygons with no outer')
            continue

        if len(outer_polys) == 1 and not inner_polys:
            # Fast path: single exterior ring with no holes
            poly = cls.to_polygon(outer_polys[0])
            if poly is None or not poly.bounds or len(poly.bounds) != 4:
                continue
            if poly.type == 'MultiPolygon':
                for part in poly:
                    index.index_polygon(part)
            else:
                index.index_polygon(poly)
        else:
            collected = []
            holes = []

            # Validate the holes first
            for ring in inner_polys:
                hole = cls.to_polygon(ring)
                if hole is None or not hole.bounds or len(hole.bounds) != 4 or not hole.is_valid:
                    continue
                if hole.type == 'MultiPolygon':
                    holes.extend(hole)
                else:
                    holes.append(hole)

            # Validate each exterior ring and punch in whichever holes it contains
            for ring in outer_polys:
                poly = cls.to_polygon(ring, test_point=test_point)
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue

                try:
                    contained = [hole for hole in holes if poly.contains(hole)]
                except TopologicalError:
                    continue

                if contained:
                    # Rebuild as a polygon-with-holes
                    poly = cls.to_polygon(ring, [zip(*hole.exterior.coords.xy) for hole in contained], test_point=test_point)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4:
                        continue

                # The R-tree stores only bounding boxes, so index each
                # simple polygon but keep the full geometry for storage
                if poly.type == 'MultiPolygon':
                    for part in poly:
                        index.index_polygon(part)
                    collected.extend(poly)
                else:
                    index.index_polygon(poly)
                    collected.append(poly)

            if not collected:
                continue
            poly = MultiPolygon(collected) if len(collected) > 1 else collected[0]

        if index.simplify_polygons:
            poly = index.simplify_polygon(poly)

        index.add_polygon(poly, props)

    return index
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)

    ensure_dir(scratch_dir)

    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)

    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    # Build the two candidate polygon sets: Quattroshapes and Zetashapes
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(
        quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # Feed every polygon name into the IDF index so common tokens get
    # down-weighted during name comparison
    for idx in (zs, qs):
        for i, (props, poly) in enumerate(idx.polygons):
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)

    # Also feed OSM name-tag values into the IDF index.
    # NOTE(review): attrs.iteritems() is Python 2-only; sibling blocks in
    # this file use six.iteritems for the same purpose
    for key, attrs, deps in parse_osm(filename):
        for k, v in attrs.iteritems():
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # Track which polygons get matched to an OSM point
    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for node_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue

        is_neighborhood = attrs.get('place') == 'neighbourhood'

        ranks = []
        osm_names = []

        # Collect all name variants for this point: plain name tags plus
        # language-suffixed keys (e.g. name:en)
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)

        for name_key in OSM_NAME_TAGS:
            osm_names.extend([
                v for k, v in attrs.iteritems()
                if k.startswith('{}:'.format(name_key))
            ])

        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if candidates:
                max_sim = 0.0
                arg_max = None

                # Cache of regex-normalized polygon names keyed by candidate index
                normalized_qs_names = {}

                for osm_name in osm_names:
                    contains_ideographs = any(
                        ((char_scripts[ord(c)] or '').lower()
                         in ideographic_scripts
                         for c in safe_decode(osm_name)))

                    for i in candidates:
                        props, poly = idx.polygons[i]
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name

                        # A neighborhood point should only match
                        # neighborhood-level Quattroshapes polygons
                        if is_neighborhood and idx is qs and props.get(
                                QuattroshapesReverseGeocoder.LEVEL
                        ) != 'neighborhood':
                            continue

                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(
                                osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(
                                osm_name, name)

                        if sim > max_sim:
                            max_sim = sim
                            arg_max = (max_sim, props, poly.context, idx, i)

                if arg_max:
                    ranks.append(arg_max)

        # Keep the single best-scoring candidate across both polygon sets
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]

            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
                source = 'osm_zeta'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'

            attrs['source'] = source

            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    # Add unmatched neighborhood-level polygons directly; unmatched
    # local-admin polygons are dropped
    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
def polygons(self):
    '''
    Generator which yields tuples like:

    (relation_id, properties, outer_polygons, inner_polygons)

    At this point a polygon is a list of coordinate tuples, suitable for
    passing to shapely's Polygon constructor but may be used for other
    purposes. outer_polygons is a list of the exterior polygons for this
    boundary. inner_polygons is a list of "holes" in the exterior polygons
    although donuts and donut-holes need to be matched by the caller using
    something like shapely's contains.

    Appears to rely on parse_osm emitting all nodes before ways before
    relations: node arrays are built up, binary-searched by ways, and
    freed as soon as the first relation is seen.
    '''
    i = 0
    for element_id, props, deps in parse_osm(self.filename, dependencies=True):
        if element_id.startswith('node'):
            node_id = long(element_id.split(':')[-1])
            lat = props.get('lat')
            lon = props.get('lon')
            if lat is None or lon is None:
                continue
            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue
            # Nodes are stored in a sorted array, coordinate indices are simply
            # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
            # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
            self.coords.append(lon)
            self.coords.append(lat)
            self.node_ids.append(node_id)
        elif element_id.startswith('way'):
            way_id = long(element_id.split(':')[-1])
            # Get node indices by binary search
            try:
                node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
            except ValueError:
                # One of this way's nodes was never seen; skip the way
                continue
            # Way ids stored in a sorted array
            self.way_ids.append(way_id)
            # way_deps is the list of dependent node ids
            # way_coords is a copy of coords indexed by way ids
            for node_id, node_index in izip(deps, node_indices):
                self.way_deps.append(node_id)
                self.way_coords.append(self.coords[node_index * 2])
                self.way_coords.append(self.coords[node_index * 2 + 1])
            self.way_indptr.append(len(self.way_deps))
        elif element_id.startswith('relation'):
            # First relation seen: node arrays are no longer needed, free them
            if self.node_ids is not None:
                self.node_ids = None
            if self.coords is not None:
                self.coords = None
            relation_id = long(element_id.split(':')[-1])
            # Skip empty, non-boundary, and multilinestring relations
            if len(deps) == 0 or not props.get('boundary') or props.get('type', '').lower() == 'multilinestring':
                continue
            outer_ways = []
            inner_ways = []
            for way_id, role in deps:
                if role == 'outer':
                    outer_ways.append(way_id)
                elif role == 'inner':
                    inner_ways.append(way_id)

            outer_polys = self.create_polygons(outer_ways)
            inner_polys = self.create_polygons(inner_ways)

            yield relation_id, props, outer_polys, inner_polys

        if i % 1000 == 0 and i > 0:
            self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
        i += 1
def polygons(self):
    '''
    Yield (relation_id, properties, outer_polygons, inner_polygons) for
    each boundary relation in the file.

    A polygon here is a list of coordinate tuples suitable for shapely's
    Polygon constructor. outer_polygons are the exterior rings for the
    boundary; inner_polygons are "holes", which the caller must match to
    their containing exteriors (e.g. with shapely's contains).
    '''
    num_processed = 0
    for element_id, props, deps in parse_osm(self.filename, dependencies=True):
        if element_id.startswith('node'):
            osm_id = long(element_id.split(':')[-1])
            lat, lon = props.get('lat'), props.get('lon')
            if lat is None or lon is None:
                continue
            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue
            # Coordinates live in a flat array parallel to the sorted node
            # id array, as [lon, lat, lon, lat, ...], so node i's pair
            # starts at index 2 * i. Stored lon-first for geometry purposes.
            self.coords.append(lon)
            self.coords.append(lat)
            self.node_ids.append(osm_id)
        elif element_id.startswith('way'):
            osm_id = long(element_id.split(':')[-1])
            # Resolve each dependent node id to its array index; a missing
            # node means the way can't be reconstructed, so skip it
            try:
                indices = [self.binary_search(self.node_ids, dep) for dep in deps]
            except ValueError:
                continue
            # Way ids are kept in a sorted array as well
            self.way_ids.append(osm_id)
            # way_deps records the dependent node ids; way_coords copies
            # their coordinates so ways can be rebuilt later without the
            # node arrays
            for dep, node_idx in izip(deps, indices):
                self.way_deps.append(dep)
                self.way_coords.append(self.coords[node_idx * 2])
                self.way_coords.append(self.coords[node_idx * 2 + 1])
            self.way_indptr.append(len(self.way_deps))
        elif element_id.startswith('relation'):
            # Relations come last in the input; the node arrays are no
            # longer needed once the first relation appears
            if self.node_ids is not None:
                self.node_ids = None
            if self.coords is not None:
                self.coords = None
            rel_id = long(element_id.split(':')[-1])
            # Only non-empty boundary relations that aren't multilinestrings
            if not deps or not props.get('boundary') or props.get('type', '').lower() == 'multilinestring':
                continue

            outer_ways, inner_ways = [], []
            for member_way_id, role in deps:
                if role == 'outer':
                    outer_ways.append(member_way_id)
                elif role == 'inner':
                    inner_ways.append(member_way_id)

            yield (rel_id,
                   props,
                   self.create_polygons(outer_ways),
                   self.create_polygons(inner_ways))

        if num_processed % 1000 == 0 and num_processed > 0:
            self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], num_processed))
        num_processed += 1
def polygons(self, properties_only=False):
    '''
    Generator which yields tuples like:

    (element_id, properties, admin_center, outer_polygons, inner_polygons)

    or, when properties_only=True:

    (element_id, properties, admin_center)

    element_id is the OSM way/relation id plus WAY_OFFSET/RELATION_OFFSET
    respectively. A polygon is a list of coordinate tuples, suitable for
    passing to shapely's Polygon constructor but may be used for other
    purposes. outer_polygons is a list of the exterior polygons for this
    boundary. inner_polygons is a list of "holes" in the exterior polygons
    although donuts and donut-holes need to be matched by the caller using
    something like shapely's contains.

    Appears to rely on parse_osm emitting all nodes before ways before
    relations: node arrays are built up, binary-searched by ways, and
    freed as soon as the first relation is seen.
    '''
    i = 0
    for element_id, props, deps in parse_osm(self.filename, dependencies=True):
        # Normalize all keys/values to unicode
        props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
        if element_id.startswith('node'):
            node_id = long(element_id.split(':')[-1])
            lat = props.get('lat')
            lon = props.get('lon')
            if lat is None or lon is None:
                continue
            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue
            # Clamp poles/antimeridian slightly inward
            if isclose(lat, 90.0):
                lat = 89.999
            if isclose(lon, 180.0):
                lon = 179.999

            # Named place nodes are retained so relations can later
            # reference them via admin_centre/label roles
            if 'name' in props and 'place' in props:
                self.nodes[node_id] = props

            # Nodes are stored in a sorted array, coordinate indices are simply
            # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
            # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
            self.coords.append(lon)
            self.coords.append(lat)
            self.node_ids.append(node_id)
        elif element_id.startswith('way'):
            way_id = long(element_id.split(':')[-1])
            # Get node indices by binary search
            try:
                node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
            except ValueError:
                # One of this way's nodes was never seen; skip the way
                continue
            # Way ids stored in a sorted array
            self.way_ids.append(way_id)
            # way_deps is the list of dependent node ids
            # way_coords is a copy of coords indexed by way ids
            for node_id, node_index in izip(deps, node_indices):
                self.way_deps.append(node_id)
                self.way_coords.append(self.coords[node_index * 2])
                self.way_coords.append(self.coords[node_index * 2 + 1])
            self.way_indptr.append(len(self.way_deps))

            # A closed way (first node == last node) that passes the
            # polygon filter is itself a boundary polygon
            if deps[0] == deps[-1] and self.include_polygon(props):
                way_id_offset = WAY_OFFSET + way_id
                if not properties_only:
                    outer_polys = self.create_polygons([way_id])
                    inner_polys = []
                    yield way_id_offset, props, {}, outer_polys, inner_polys
                else:
                    yield way_id_offset, props, {}
        elif element_id.startswith('relation'):
            # First relation seen: node arrays are no longer needed, free them
            if self.node_ids is not None:
                self.node_ids = None
            if self.coords is not None:
                self.coords = None
            relation_id = long(element_id.split(':')[-1])
            # Skip empty, filtered-out, and multilinestring relations
            if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
                continue
            outer_ways = []
            inner_ways = []
            admin_centers = []
            for elem_id, elem_type, role in deps:
                if role in ('outer', '') and elem_type == 'way':
                    # Empty role is treated as outer
                    outer_ways.append(elem_id)
                elif role == 'inner' and elem_type == 'way':
                    inner_ways.append(elem_id)
                elif role == 'admin_centre' and elem_type == 'node':
                    val = self.nodes.get(long(elem_id))
                    if val is not None:
                        val['type'] = 'node'
                        val['id'] = long(elem_id)
                        admin_centers.append(val)
                elif role == 'label' and elem_type == 'node':
                    # Merge label-node properties into the relation when the
                    # names match (case-insensitively), without overwriting
                    val = self.nodes.get(long(elem_id))
                    if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
                        props.update({k: v for k, v in six.iteritems(val) if k not in props})

            # Only use the admin center when it is unambiguous
            admin_center = {}
            if len(admin_centers) == 1:
                admin_center = admin_centers[0]

            relation_id_offset = RELATION_OFFSET + relation_id

            if not properties_only:
                outer_polys = self.create_polygons(outer_ways)
                inner_polys = self.create_polygons(inner_ways)
                yield relation_id_offset, props, admin_center, outer_polys, inner_polys
            else:
                yield relation_id_offset, props, admin_center

        if i % 1000 == 0 and i > 0:
            self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
        i += 1