コード例 #1
0
    def create_from_osm_file(cls, filename, output_dir, precision=None):
        '''
        Build a point index from the elements of an OSM file.

        Every element yielded by parse_osm that has usable lat/lon
        coordinates is added to the index through add_point. Only the
        whitelisted properties are kept, plus the element's type (always
        'node' here) and its numeric id.

        filename: path of the OSM file handed to parse_osm
        output_dir: passed to the index constructor as save_dir
        precision: passed to the index constructor; defaults to
            cls.GEOHASH_PRECISION when None

        Returns the populated index instance.
        '''
        if precision is None:
            precision = cls.GEOHASH_PRECISION
        index = cls(save_dir=output_dir, precision=precision)

        def is_included(key):
            # A key is kept when it is id/type, is whitelisted verbatim, or
            # its namespace (text before the first colon) is whitelisted
            # in the form "namespace:*"
            if key in ('id', 'type') or key in cls.include_property_patterns:
                return True
            if six.u(':') not in key:
                return False
            namespace = key.split(six.u(':'), 1)[0]
            return six.u('{}:*').format(namespace) in cls.include_property_patterns

        num_points = 0
        for element_id, raw_props, deps in parse_osm(filename):
            decoded = dict(
                (safe_decode(key), safe_decode(value))
                for key, value in six.iteritems(raw_props)
            )

            # Element ids look like "<type>:<number>"; keep the numeric part
            node_id = long(element_id.split(':')[-1])

            raw_lat, raw_lon = decoded.get('lat'), decoded.get('lon')
            if raw_lat is None or raw_lon is None:
                continue
            lat, lon = latlon_to_decimal(raw_lat, raw_lon)
            if lat is None or lon is None:
                continue

            # Nudge a longitude of 180 just inside the boundary
            if isclose(lon, 180.0):
                lon = 179.999

            point_props = dict(
                (key, value)
                for key, value in six.iteritems(decoded)
                if is_included(key)
            )
            point_props['type'] = 'node'
            point_props['id'] = node_id

            index.add_point(lat, lon, point_props)

            # Progress is reported before counting the current point
            if num_points > 0 and num_points % 1000 == 0:
                print('did {} points'.format(num_points))
            num_points += 1

        return index
コード例 #2
0
ファイル: reverse_geocode.py プロジェクト: riordan/libpostal
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries), match those points to
        Quattroshapes and Zetashapes polygons by name and build a polygon
        index for neighborhood-level reverse geocoding.

        Steps:
        1. Build the Quattroshapes and Zetashapes neighborhood indexes.
        2. Build an IDF index over polygon names and OSM name tags.
        3. For every named OSM element with coordinates, find candidate
           polygons at that location, score name similarity and, if the
           best score reaches cls.DUPE_THRESHOLD, index the polygon with
           the OSM attributes (source 'osm').
        4. Index the remaining unmatched neighborhood polygons with their
           own properties (source 'zetashapes' / 'quattroshapes').

        filename: OSM file passed to parse_osm (read twice)
        quattroshapes_dir: passed to the Quattroshapes index builder
        output_dir: passed to the index constructor as save_dir
        scratch_dir: working directory; created if missing

        Returns the populated index instance.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        # Feed every polygon name into the IDF statistics
        for idx in (zs, qs):
            for props, _ in idx.polygons:
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        # ...and every OSM name tag (including suffixed variants such as "name:en")
        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.items():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        # Track which polygons were claimed by an OSM element
        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            ranks = []
            osm_names = []

            # Plain name tags first, then the language/variant-suffixed ones
            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in attrs.items() if k.startswith('{}:'.format(name_key))])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    # Cache of candidate index -> normalized polygon name
                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            # Zetashapes polygons are always treated as
                            # neighborhoods below, so the level filter is only
                            # meaningful for Quattroshapes candidates.
                            level = props.get(QuattroshapesReverseGeocoder.LEVEL)
                            if is_neighborhood and idx is qs and level != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            # Best-scoring polygon across both sources wins
            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                attrs['source'] = 'osm'
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        # Add the polygons no OSM element claimed, using their own properties
        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
コード例 #3
0
    def create_from_osm_file(cls, filename, output_dir,
                             index_filename=None,
                             polys_filename=DEFAULT_POLYS_FILENAME):
        '''
        Given an OSM file (planet or some other bounds) containing relations
        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        For every boundary produced by cls.polygon_reader(filename).polygons()
        the outer rings are converted to polygons (with any contained inner
        rings cut out as holes), each polygon is indexed, and the resulting
        (multi)polygon is stored with the whitelisted properties.

        filename: OSM file passed to cls.polygon_reader
        output_dir: passed to the index constructor as save_dir
        index_filename: passed to the index constructor
        polys_filename: accepted for interface compatibility; not used in
            this method

        Returns the populated index instance.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other bounds.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename)

        reader = cls.polygon_reader(filename)
        polygons = reader.polygons()

        logger = logging.getLogger('osm.reverse_geocode')

        for element_id, props, admin_center, outer_polys, inner_polys in polygons:
            # Keep keys whitelisted verbatim or by namespace ("prefix:*")
            props = {k: v for k, v in six.iteritems(props)
                     if k in cls.include_property_patterns or (six.u(':') in k and
                     six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}

            # NOTE(review): id_type/element_id are computed but never used
            # below — confirm whether they were meant to be stored in props.
            id_type, element_id = osm_type_and_id(element_id)

            test_point = None

            if admin_center:
                admin_center_props = {k: v for k, v in six.iteritems(admin_center)
                                      if k in ('id', 'type', 'lat', 'lon') or k in cls.include_property_patterns or (six.u(':') in k and
                                      six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}

                if cls.fix_invalid_polygons:
                    # The admin center is handed to to_polygon as test_point
                    center_lat, center_lon = latlon_to_decimal(admin_center_props['lat'], admin_center_props['lon'])
                    test_point = Point(center_lon, center_lat)

                props['admin_center'] = admin_center_props

            if inner_polys and not outer_polys:
                logger.warning('inner polygons with no outer')
                continue
            if len(outer_polys) == 1 and not inner_polys:
                # Simple case: a single ring without holes
                poly = cls.to_polygon(outer_polys[0])
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue
                if poly.type != 'MultiPolygon':
                    index.index_polygon(poly)
                else:
                    for part in poly.geoms:
                        index.index_polygon(part)
            else:
                multi = []
                inner = []
                # Validate inner polygons (holes)
                for p in inner_polys:
                    poly = cls.to_polygon(p)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4 or not poly.is_valid:
                        continue

                    if poly.type != 'MultiPolygon':
                        inner.append(poly)
                    else:
                        inner.extend(poly.geoms)

                # Validate outer polygons
                for p in outer_polys:
                    poly = cls.to_polygon(p, test_point=test_point)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4:
                        continue

                    interior = []
                    try:
                        # Figure out which outer polygon contains each inner polygon
                        interior = [p2 for p2 in inner if poly.contains(p2)]
                    except TopologicalError:
                        continue

                    if interior:
                        # Polygon with holes constructor; materialize each
                        # hole's coordinates since zip() is a one-shot
                        # iterator on Python 3
                        holes = [list(zip(*p2.exterior.coords.xy)) for p2 in interior]
                        poly = cls.to_polygon(p, holes, test_point=test_point)
                        if poly is None or not poly.bounds or len(poly.bounds) != 4:
                            continue
                    # R-tree only stores the bounding box, so add the whole polygon
                    if poly.type != 'MultiPolygon':
                        index.index_polygon(poly)
                        multi.append(poly)
                    else:
                        for part in poly.geoms:
                            index.index_polygon(part)
                        multi.extend(poly.geoms)

                if len(multi) > 1:
                    poly = MultiPolygon(multi)
                elif multi:
                    poly = multi[0]
                else:
                    # No usable outer polygon for this element
                    continue
            if index.simplify_polygons:
                poly = index.simplify_polygon(poly)
            index.add_polygon(poly, props)

        return index
コード例 #4
0
    def polygons(self, properties_only=False):
        '''
        Generator over the closed ways and relations of self.filename
        that pass self.include_polygon.

        Yields tuples like:

        (element_id, properties, admin_center, outer_polygons, inner_polygons)

        or, when properties_only is true, just:

        (element_id, properties, admin_center)

        element_id is the way id plus WAY_OFFSET or the relation id plus
        RELATION_OFFSET. admin_center is the stored properties dict of the
        relation's admin_centre node when exactly one such node is known,
        otherwise an empty dict (always empty for ways).

        At this point a polygon is a list of coordinate tuples,
        suitable for passing to shapely's Polygon constructor
        but may be used for other purposes.

        outer_polygons is a list of the exterior polygons for this
        boundary. inner_polygons is a list of "holes" in the exterior
        polygons although donuts and donut-holes need to be matched
        by the caller using something like shapely's contains.
        '''
        i = 0

        for element_id, props, deps in parse_osm(self.filename,
                                                 dependencies=True):
            props = {
                safe_decode(k): safe_decode(v)
                for k, v in six.iteritems(props)
            }
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue

                # Nudge coordinates sitting exactly on the pole/antimeridian
                # just inside the boundary
                if isclose(lat, 90.0):
                    lat = 89.999

                if isclose(lon, 180.0):
                    lon = 179.999

                # Only named place nodes are kept with their properties;
                # they are looked up later for admin_centre/label members
                if 'name' in props and 'place' in props:
                    self.nodes[node_id] = props

                # Nodes are stored in a sorted array, coordinate indices are simply
                # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
                # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Get node indices by binary search; a way referencing an
                # unknown node (ValueError) is skipped entirely
                try:
                    node_indices = [
                        self.binary_search(self.node_ids, node_id)
                        for node_id in deps
                    ]
                except ValueError:
                    continue

                # Way ids stored in a sorted array
                self.way_ids.append(way_id)

                # way_deps is the list of dependent node ids
                # way_coords is a copy of coords indexed by way ids
                for node_id, node_index in izip(deps, node_indices):
                    self.way_deps.append(node_id)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                self.way_indptr.append(len(self.way_deps))

                # A closed way (first node == last node) can be a polygon on
                # its own. The length check keeps a way without any node
                # references from raising IndexError.
                if len(deps) > 0 and deps[0] == deps[-1] and self.include_polygon(props):
                    way_id_offset = WAY_OFFSET + way_id
                    if not properties_only:
                        outer_polys = self.create_polygons([way_id])
                        inner_polys = []
                        yield way_id_offset, props, {}, outer_polys, inner_polys
                    else:
                        yield way_id_offset, props, {}

            elif element_id.startswith('relation'):
                # Node lookups are not used in this branch (ways already
                # copied their coordinates into way_coords), so the node
                # arrays are dropped — presumably to free memory
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])
                if (len(deps) == 0 or not self.include_polygon(props) or
                        props.get('type', '').lower() == 'multilinestring'):
                    continue

                outer_ways = []
                inner_ways = []
                admin_centers = []

                for elem_id, elem_type, role in deps:
                    # Way members with an empty role are treated as outer rings
                    if role in ('outer', '') and elem_type == 'way':
                        outer_ways.append(elem_id)
                    elif role == 'inner' and elem_type == 'way':
                        inner_ways.append(elem_id)
                    elif role == 'admin_centre' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None:
                            val['type'] = 'node'
                            val['id'] = long(elem_id)
                            admin_centers.append(val)
                    elif role == 'label' and elem_type == 'node':
                        # A label node with the same name (case-insensitive)
                        # contributes any properties the relation lacks
                        val = self.nodes.get(long(elem_id))
                        if val is not None and val.get(
                                'name', six.u('')).lower() == props.get(
                                    'name', six.u('')).lower():
                            props.update({
                                k: v
                                for k, v in six.iteritems(val)
                                if k not in props
                            })

                # Only an unambiguous (single) admin center is reported
                admin_center = {}
                if len(admin_centers) == 1:
                    admin_center = admin_centers[0]

                relation_id_offset = RELATION_OFFSET + relation_id
                if not properties_only:
                    outer_polys = self.create_polygons(outer_ways)
                    inner_polys = self.create_polygons(inner_ways)
                    yield relation_id_offset, props, admin_center, outer_polys, inner_polys
                else:
                    yield relation_id_offset, props, admin_center
            # Progress counter; elements skipped via continue are not counted
            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(
                    element_id.split(':')[0], i))
            i += 1
コード例 #5
0
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries), combine OSM neighborhood
        borders with ClickThatHood and Quattroshapes polygons into a single
        polygon index for neighborhood-level reverse geocoding.

        Steps:
        1. Build the ClickThatHood, OSM-neighborhood and Quattroshapes
           indexes and load the country and OSM admin R-trees.
        2. Build an IDF index over polygon names and OSM name tags.
        3. Add every OSM neighborhood border polygon (source 'osm').
        4. For every named OSM element with coordinates, pick the best
           name-matching ClickThatHood/Quattroshapes polygon at that
           location; if it reaches cls.DUPE_THRESHOLD and the element is not
           already represented by a same-named OSM boundary, index the
           polygon with the OSM attributes.
        5. Add the remaining unmatched ClickThatHood polygons and
           neighborhood-level Quattroshapes polygons with their own
           properties.

        filename: OSM file passed to parse_osm (read twice)
        quattroshapes_dir: Quattroshapes input; also the parent of the
            'qs_neighborhoods' scratch directory
        country_rtree_dir: directory OSMCountryReverseGeocoder is loaded from
        osm_rtree_dir: directory OSMReverseGeocoder is loaded from
        osm_neighborhood_borders_file: input for the OSM neighborhood index
        output_dir: passed to the index constructor as save_dir

        Returns the populated index instance.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        logger = logging.getLogger('neighborhoods')

        qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)

        logger.info('Creating ClickThatHood neighborhoods')
        cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()

        logger.info('Creating OSM neighborhoods')
        osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)

        logger.info('Creating Quattroshapes neighborhoods')
        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)

        country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

        osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
        osm_admin_rtree.cache_size = 1000

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        # Feed every polygon name into the IDF statistics
        for idx in (cth, qs, osmn):
            for i in six.moves.xrange(idx.i):
                props = idx.get_properties(i)
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        # ...and every OSM name tag (including suffixed variants such as "name:en")
        for key, attrs, deps in parse_osm(filename):
            for k, v in six.iteritems(attrs):
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        # OSM neighborhood borders are added unconditionally
        for i in six.moves.xrange(osmn.i):
            props = osmn.get_properties(i)
            poly = osmn.get_polygon(i)

            props['source'] = 'osm'
            props['component'] = AddressFormatter.SUBURB
            props['polygon_type'] = 'neighborhood'

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

        # Track which polygons were claimed by an OSM element
        qs.matched = [False] * qs.i
        cth.matched = [False] * cth.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
        num_polys = 0
        for element_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            id_type, element_id = element_id.split(':')
            element_id = long(element_id)

            # Tag the OSM element itself. These used to be written to
            # `props`, a stale dict left over from an earlier loop.
            attrs['type'] = id_type
            attrs['id'] = element_id

            # NOTE(review): possible_neighborhood is computed but not used below
            possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
            is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)

            country, candidate_languages = country_rtree.country_and_languages(lat, lon)

            component_name = osm_address_components.component_from_properties(country, attrs)

            ranks = []
            osm_names = []

            # Plain name tags first, then the language/variant-suffixed ones
            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])

            for idx in (cth, qs):
                candidates = idx.get_candidate_polygons(lat, lon, return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    # Cache of candidate index -> normalized polygon name
                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props = idx.get_properties(i)
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            # OSM neighborhoods may only match Quattroshapes
                            # polygons at the neighborhood level
                            if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                poly = idx.get_polygon(i)
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            # Best-scoring polygon across both sources wins
            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
                existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)

                skip_node = False

                # Skip the element when a same-named OSM boundary already
                # covers this location: either an OSM neighborhood border, or
                # an admin boundary whose component differs from the
                # element's and orders at or before CITY in component_order
                for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                    for poly_index, osm_props in enumerate(boundaries):
                        containing_component = None
                        name = osm_props.get('name')
                        # Only exact name matches here since we're comparing OSM to OSM
                        if name and name.lower() != attrs.get('name', '').lower():
                            continue

                        if boundaries is existing_neighborhood_boundaries:
                            containing_component = AddressFormatter.SUBURB
                            skip_node = True
                            break
                        else:
                            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]

                            containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)

                        if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
                            skip_node = True
                            break
                    if skip_node:
                        break

                # Skip this element
                if skip_node:
                    continue

                if idx is cth:
                    if props['component'] == AddressFormatter.SUBURB:
                        attrs['polygon_type'] = 'neighborhood'
                    elif props['component'] == AddressFormatter.CITY_DISTRICT:
                        attrs['polygon_type'] = 'local_admin'
                    else:
                        continue
                    source = 'osm_cth'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)

                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
                component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
                attrs['component'] = component

                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        # Add the polygons no OSM element claimed, using their own properties
        for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
            for i in six.moves.xrange(idx.i):
                props = idx.get_properties(i)
                poly = idx.get_polygon(i)
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is cth:
                    component = props['component']
                    if component == AddressFormatter.SUBURB:
                        props['polygon_type'] = 'neighborhood'
                    elif component == AddressFormatter.CITY_DISTRICT:
                        props['polygon_type'] = 'local_admin'
                    else:
                        continue
                elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    component = AddressFormatter.SUBURB
                    name = props.get('name')
                    if not name:
                        continue
                    for pattern, repl in cls.regex_replacements:
                        name = pattern.sub(repl, name)

                    props['name'] = name

                    # Names matching the city-district pattern are relabeled
                    if cls.quattroshapes_city_district_regex.match(name):
                        component = AddressFormatter.CITY_DISTRICT

                    props['component'] = component
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
コード例 #6
0
    def create_from_osm_file(cls,
                             filename,
                             output_dir,
                             index_filename=None,
                             polys_filename=DEFAULT_POLYS_FILENAME):
        '''
        Given an OSM file (planet or some other bounds) containing relations
        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other bounds.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename)

        reader = cls.polygon_reader(filename)
        polygons = reader.polygons()

        logger = logging.getLogger('osm.reverse_geocode')

        for element_id, props, admin_center, outer_polys, inner_polys in polygons:
            props = {
                k: v
                for k, v in six.iteritems(props)
                if k in cls.include_property_patterns or
                (six.u(':') in k and six.u('{}:*').format(
                    k.split(six.u(':'), 1)[0]) in cls.include_property_patterns
                 )
            }

            id_type, element_id = osm_type_and_id(element_id)

            test_point = None

            if admin_center:
                admin_center_props = {
                    k: v
                    for k, v in six.iteritems(admin_center)
                    if k in ('id', 'type', 'lat',
                             'lon') or k in cls.include_property_patterns or
                    (six.u(':') in k
                     and six.u('{}:*').format(k.split(six.u(
                         ':'), 1)[0]) in cls.include_property_patterns)
                }

                if cls.fix_invalid_polygons:
                    center_lat, center_lon = latlon_to_decimal(
                        admin_center_props['lat'], admin_center_props['lon'])
                    test_point = Point(center_lon, center_lat)

                props['admin_center'] = admin_center_props

            if inner_polys and not outer_polys:
                logger.warn('inner polygons with no outer')
                continue
            if len(outer_polys) == 1 and not inner_polys:
                poly = cls.to_polygon(outer_polys[0])
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue
                if poly.type != 'MultiPolygon':
                    index.index_polygon(poly)
                else:
                    for p in poly:
                        index.index_polygon(p)
            else:
                multi = []
                inner = []
                # Validate inner polygons (holes)
                for p in inner_polys:
                    poly = cls.to_polygon(p)
                    if poly is None or not poly.bounds or len(
                            poly.bounds) != 4 or not poly.is_valid:
                        continue

                    if poly.type != 'MultiPolygon':
                        inner.append(poly)
                    else:
                        inner.extend(poly)

                # Validate outer polygons
                for p in outer_polys:
                    poly = cls.to_polygon(p, test_point=test_point)
                    if poly is None or not poly.bounds or len(
                            poly.bounds) != 4:
                        continue

                    interior = []
                    try:
                        # Figure out which outer polygon contains each inner polygon
                        interior = [p2 for p2 in inner if poly.contains(p2)]
                    except TopologicalError:
                        continue

                    if interior:
                        # Polygon with holes constructor
                        poly = cls.to_polygon(
                            p,
                            [zip(*p2.exterior.coords.xy) for p2 in interior],
                            test_point=test_point)
                        if poly is None or not poly.bounds or len(
                                poly.bounds) != 4:
                            continue
                    # R-tree only stores the bounding box, so add the whole polygon
                    if poly.type != 'MultiPolygon':
                        index.index_polygon(poly)
                        multi.append(poly)
                    else:
                        for p in poly:
                            index.index_polygon(p)
                        multi.extend(poly)

                if len(multi) > 1:
                    poly = MultiPolygon(multi)
                elif multi:
                    poly = multi[0]
                else:
                    continue
            if index.simplify_polygons:
                poly = index.simplify_polygon(poly)
            index.add_polygon(poly, props)

        return index
コード例 #7
0
    def create_from_osm_and_quattroshapes(cls,
                                          filename,
                                          quattroshapes_dir,
                                          output_dir,
                                          scratch_dir=SCRATCH_DIR):
        '''
        Build a neighborhood index by pairing named OSM points with
        Quattroshapes/Zetashapes polygons.

        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries) and their dependencies, create
        an R-tree index for coarse-grained reverse geocoding.

        The work is done in several passes:

        1. Build the Quattroshapes (qs) and Zetashapes (zs) polygon indices.
        2. Build an IDF index from the polygon names and from every OSM
           tag whose key starts with one of OSM_NAME_TAGS.
        3. For each named OSM element, look up candidate polygons at its
           lat/lon in both indices, score each candidate name against the
           OSM names, and if the best score reaches cls.DUPE_THRESHOLD add
           that polygon to the result index carrying the OSM attributes.
        4. Add the polygons that were never matched, keeping only those
           classified as neighborhoods.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.

        :param filename: OSM file, read with parse_osm (three times in total)
        :param quattroshapes_dir: passed to
            QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index
        :param output_dir: passed to the class constructor as save_dir
        :param scratch_dir: working directory; a 'qs_neighborhoods'
            subdirectory of it is handed to the Quattroshapes index builder
        :return: the newly created index (an instance of cls)
        '''
        index = cls(save_dir=output_dir)

        # presumably creates the directory if it is missing -- verify against ensure_dir
        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(
            quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        # Indexed by code point below (char_scripts[ord(c)]) to get a script name
        char_scripts = get_chars_by_script()

        # IDF pass over the polygon sources: every polygon that has a 'name'
        # property contributes its word counts
        for idx in (zs, qs):
            for i, (props, poly) in enumerate(idx.polygons):
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        # IDF pass over the OSM file: every tag whose key starts with one of
        # OSM_NAME_TAGS contributes its value
        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.iteritems():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        # One "already matched to OSM" flag per polygon position
        # (assumes idx.i is the number of polygons held by the index -- TODO confirm)
        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            # NOTE(review): only ValueError is handled; if attrs is a plain dict,
            # a missing 'lat' or 'lon' key raises KeyError and is not caught here
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            # Elements without a plain 'name' tag are skipped entirely
            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            # ranks collects at most one best-candidate tuple per polygon source
            ranks = []
            osm_names = []

            # Values stored under the exact OSM_NAME_TAGS keys come first...
            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            # ...followed by values of any key of the form '<name tag>:<suffix>'
            for name_key in OSM_NAME_TAGS:
                osm_names.extend([
                    v for k, v in attrs.iteritems()
                    if k.startswith('{}:'.format(name_key))
                ])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat,
                                                        lon,
                                                        return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    # Cache of candidate index -> polygon name after
                    # cls.regex_replacements; despite the "qs" in its name it is
                    # used for whichever source idx currently refers to
                    normalized_qs_names = {}

                    # Note: this loop rebinds osm_name, which was first assigned
                    # from attrs.get('name') above
                    for osm_name in osm_names:

                        contains_ideographs = any(
                            ((char_scripts[ord(c)] or '').lower()
                             in ideographic_scripts
                             for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            # An OSM place=neighbourhood is only compared against
                            # Quattroshapes polygons whose LEVEL is 'neighborhood'
                            if is_neighborhood and idx is qs and props.get(
                                    QuattroshapesReverseGeocoder.LEVEL
                            ) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(
                                    osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(
                                    osm_name, name)

                            # Strictly-greater comparison: the first candidate to
                            # reach a given score is kept on ties, and a score of
                            # 0.0 never produces a match
                            if sim > max_sim:
                                max_sim = sim
                                # poly.context (not poly itself) is what gets stored
                                arg_max = (max_sim, props, poly.context, idx,
                                           i)

                    if arg_max:
                        ranks.append(arg_max)

            # Highest similarity first; only the top entry is considered
            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                # poly here is the poly.context value saved in arg_max above
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                    source = 'osm_zeta'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                # The matched polygon is stored with the OSM attributes rather
                # than with the polygon source's own properties
                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            # NOTE(review): counts every OSM element that had coordinates and a
            # name, whether or not it matched a polygon; the "> 0" test is always
            # true because the counter was just incremented
            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        # Final pass: add polygons that no OSM element claimed
        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                # Every Zetashapes polygon is treated as a neighborhood;
                # Quattroshapes polygons only when their LEVEL says so
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL,
                                          None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
コード例 #8
0
    def polygons(self):
        '''
        Stream the OSM file once and lazily produce, for every boundary
        relation, a tuple of the form:

        (relation_id, properties, outer_polygons, inner_polygons)

        A polygon at this stage is just a list of coordinate tuples. That
        is enough to hand to shapely's Polygon constructor, but nothing
        here requires shapely.

        outer_polygons holds the exterior rings of the boundary and
        inner_polygons holds the "holes". Which hole belongs to which
        exterior ring is not decided here; the caller has to pair them
        up, e.g. with shapely's contains.
        '''
        seen = 0

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                raw_lat, raw_lon = props.get('lat'), props.get('lon')
                if raw_lat is None or raw_lon is None:
                    continue
                lat, lon = latlon_to_decimal(raw_lat, raw_lon)
                if lat is None or lon is None:
                    continue
                # Node ids live in a sorted array and coordinates in a flat
                # [lon, lat, lon, lat ...] array, so the coordinates of the
                # node at position i start at offset 2 * i. Longitude comes
                # first because geometry code wants x before y.
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Locate every referenced node by binary search; a way that
                # references an unknown node is dropped as a whole
                try:
                    positions = [self.binary_search(self.node_ids, ref) for ref in deps]
                except ValueError:
                    continue

                # Way ids are kept in a sorted array as well
                self.way_ids.append(way_id)

                # way_deps records the node ids of the way, way_coords a copy
                # of their coordinates laid out per way
                for ref, position in izip(deps, positions):
                    lon_slot = position * 2
                    self.way_deps.append(ref)
                    self.way_coords.append(self.coords[lon_slot])
                    self.way_coords.append(self.coords[lon_slot + 1])

                # Running end offset of this way inside way_deps
                self.way_indptr.append(len(self.way_deps))
            elif element_id.startswith('relation'):
                # The node-level arrays are dropped once relations start
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])

                # Only non-empty boundary relations that are not
                # multilinestrings are turned into polygons
                if len(deps) == 0:
                    continue
                if not props.get('boundary'):
                    continue
                if props.get('type', '').lower() == 'multilinestring':
                    continue

                outer_ways = [member_id for member_id, role in deps if role == 'outer']
                inner_ways = [member_id for member_id, role in deps if role == 'inner']

                yield (relation_id, props,
                       self.create_polygons(outer_ways),
                       self.create_polygons(inner_ways))

            # Progress report; elements skipped with continue above are not counted
            if seen > 0 and seen % 1000 == 0:
                self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], seen))
            seen += 1
コード例 #9
0
    def polygons(self):
        '''
        Generator over the boundary relations of the OSM file. Each item
        is a tuple:

        (relation_id, properties, outer_polygons, inner_polygons)

        Polygons are returned as plain lists of coordinate tuples, which
        can be given to shapely's Polygon constructor or used in any
        other way.

        outer_polygons lists the exterior polygons of the boundary and
        inner_polygons lists the "holes" cut out of them. Donuts are not
        matched to their donut-holes here; the caller is expected to do
        that, for instance with shapely's contains.
        '''
        processed = 0

        for element_id, props, deps in parse_osm(self.filename,
                                                 dependencies=True):
            if element_id.startswith('node'):
                id_text = element_id.split(':')[-1]
                node_id = long(id_text)
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue
                # coords is a flat [lon, lat, lon, lat ...] array kept in
                # step with the sorted node_ids array: node number i owns
                # slots 2 * i and 2 * i + 1. lon precedes lat for geometry
                # purposes.
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                id_text = element_id.split(':')[-1]
                way_id = long(id_text)

                # Binary-search the position of each member node; if any
                # lookup raises ValueError the whole way is ignored
                node_indices = []
                try:
                    for member_node in deps:
                        node_indices.append(
                            self.binary_search(self.node_ids, member_node))
                except ValueError:
                    continue

                # Sorted array of way ids
                self.way_ids.append(way_id)

                # way_deps: member node ids, way_coords: their coordinates
                # copied out of coords in way order
                for member_node, node_index in izip(deps, node_indices):
                    self.way_deps.append(member_node)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                way_end = len(self.way_deps)
                self.way_indptr.append(way_end)
            elif element_id.startswith('relation'):
                # From the first relation on, the node arrays are released
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                id_text = element_id.split(':')[-1]
                relation_id = long(id_text)

                # Keep only relations that have members, carry a boundary
                # tag and are not of type multilinestring
                if not (len(deps) != 0 and props.get('boundary')
                        and props.get('type', '').lower() != 'multilinestring'):
                    continue

                outer_ways = []
                inner_ways = []

                for member in deps:
                    member_id, role = member
                    if role == 'outer':
                        outer_ways.append(member_id)
                    elif role == 'inner':
                        inner_ways.append(member_id)

                outer_polys = self.create_polygons(outer_ways)
                inner_polys = self.create_polygons(inner_ways)

                yield relation_id, props, outer_polys, inner_polys

            # Periodic progress message (skipped elements bypass the counter)
            if processed % 1000 == 0 and processed > 0:
                element_kind = element_id.split(':')[0]
                self.logger.info('doing {}s, at {}'.format(
                    element_kind, processed))
            processed += 1
コード例 #10
0
ファイル: admin_boundaries.py プロジェクト: BERENZ/libpostal
    def polygons(self, properties_only=False):
        '''
        Generator over the polygon-bearing elements of the OSM file:
        closed ways and relations accepted by self.include_polygon.

        With properties_only=False (the default) it yields 5-tuples:

        (element_id, properties, admin_center, outer_polygons, inner_polygons)

        With properties_only=True no geometry is built and it yields
        3-tuples:

        (element_id, properties, admin_center)

        element_id is WAY_OFFSET + way id for closed ways and
        RELATION_OFFSET + relation id for relations.

        admin_center is always an empty dict for ways. For relations it is
        the property dict of the member node with role 'admin_centre'
        (with 'type' and 'id' added) when exactly one such node is known
        to the reader, otherwise an empty dict. Only nodes that have both
        a 'name' and a 'place' tag are remembered for this purpose.

        At this point a polygon is a list of coordinate tuples,
        suitable for passing to shapely's Polygon constructor
        but may be used for other purposes.

        outer_polygons is a list of the exterior polygons for this
        boundary. inner_polygons is a list of "holes" in the exterior
        polygons although donuts and donut-holes need to be matched
        by the caller using something like shapely's contains.
        '''
        i = 0

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue

                # Nudge coordinates lying exactly on the upper bounds
                # (latitude 90, longitude 180) just inside the range
                if isclose(lat, 90.0):
                    lat = 89.999

                if isclose(lon, 180.0):
                    lon = 179.999

                # Named place nodes are remembered so that relations can
                # refer to them as admin_centre / label members later
                if 'name' in props and 'place' in props:
                    self.nodes[node_id] = props

                # Nodes are stored in a sorted array, coordinate indices are simply
                # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
                # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Get node indices by binary search
                try:
                    node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
                except ValueError:
                    continue

                # Way ids stored in a sorted array
                self.way_ids.append(way_id)

                # way_deps is the list of dependent node ids
                # way_coords is a copy of coords indexed by way ids
                for node_id, node_index in izip(deps, node_indices):
                    self.way_deps.append(node_id)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                self.way_indptr.append(len(self.way_deps))

                # A way whose first and last node coincide is a closed ring
                # and can be a polygon on its own. The length check keeps a
                # way without any node references from raising IndexError
                # and terminating the generator.
                if len(deps) > 0 and deps[0] == deps[-1] and self.include_polygon(props):
                    way_id_offset = WAY_OFFSET + way_id
                    if not properties_only:
                        outer_polys = self.create_polygons([way_id])
                        inner_polys = []
                        yield way_id_offset, props, {}, outer_polys, inner_polys
                    else:
                        yield way_id_offset, props, {}

            elif element_id.startswith('relation'):
                # The node-level arrays are released once relations start
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])
                if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
                    continue

                outer_ways = []
                inner_ways = []
                admin_centers = []

                for elem_id, elem_type, role in deps:
                    # Way members with an empty role are treated as outer rings
                    if role in ('outer', '') and elem_type == 'way':
                        outer_ways.append(elem_id)
                    elif role == 'inner' and elem_type == 'way':
                        inner_ways.append(elem_id)
                    elif role == 'admin_centre' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None:
                            # val is the dict stored in self.nodes, so these
                            # two keys are added to the stored node as well
                            val['type'] = 'node'
                            val['id'] = long(elem_id)
                            admin_centers.append(val)
                    elif role == 'label' and elem_type == 'node':
                        # A label node with the same name (ignoring case)
                        # fills in tags the relation itself does not have
                        val = self.nodes.get(long(elem_id))
                        if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
                            props.update({k: v for k, v in six.iteritems(val)
                                          if k not in props})

                # An admin center is only reported when it is unambiguous
                admin_center = {}
                if len(admin_centers) == 1:
                    admin_center = admin_centers[0]

                relation_id_offset = RELATION_OFFSET + relation_id
                if not properties_only:
                    outer_polys = self.create_polygons(outer_ways)
                    inner_polys = self.create_polygons(inner_ways)
                    yield relation_id_offset, props, admin_center, outer_polys, inner_polys
                else:
                    yield relation_id_offset, props, admin_center
            # Progress report; elements skipped with continue above are not counted
            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
            i += 1