    def create_neighborhoods_index(cls):
        scratch_dir = cls.SCRATCH_DIR
        repo_path = os.path.join(scratch_dir, 'click_that_hood')
        cls.clone_repo(repo_path)

        data_path = os.path.join(repo_path, 'public', 'data')

        neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
        ensure_dir(neighborhoods_dir)

        index = cls(save_dir=neighborhoods_dir)

        for c in cls.config['files']:
            filename = c['filename']
            component = c['component']

            path = os.path.join(data_path, filename)
            features = json.load(open(path))['features']
            for f in features:
                f['properties']['component'] = component

            try:
                index.add_geojson_like_file(features)
            except ValueError:
                continue

        return index
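
All of these snippets call an ensure_dir helper before writing into a directory. Its definition isn't included in the examples; a minimal sketch of what such a helper typically does (an assumption about the helper, not its actual source) is:

import os

def ensure_dir(path):
    # Create the directory, including any missing parents, if it doesn't exist yet.
    # Sketch only: the real helper used by these examples may behave differently.
    if not os.path.isdir(path):
        os.makedirs(path)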
Example #2
def download_cldr(temp_dir=None):
    if os.path.exists(CLDR_DIR):
        shutil.rmtree(CLDR_DIR)
    ensure_dir(CLDR_DIR)

    if not temp_dir:
        temp_dir = tempfile.gettempdir()

    cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])

    subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
    subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
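
The snippet above shells out to wget and unzip; a standard-library-only sketch of the same download-and-extract step (reusing the example's CLDR_URL, CLDR_DIR, and ensure_dir names, with a hypothetical function name) could look like:

import os
import shutil
import tempfile
import zipfile
from six.moves.urllib.request import urlretrieve

def download_cldr_stdlib(temp_dir=None):
    # Hypothetical alternative to the subprocess-based download_cldr above.
    if os.path.exists(CLDR_DIR):
        shutil.rmtree(CLDR_DIR)
    ensure_dir(CLDR_DIR)

    temp_dir = temp_dir or tempfile.gettempdir()
    cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])

    # Download the archive, then extract it into CLDR_DIR.
    urlretrieve(CLDR_URL, cldr_filename)
    with zipfile.ZipFile(cldr_filename) as zf:
        zf.extractall(CLDR_DIR)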
Example #3
    def download_file(self, wof_id):
        s3_path, filename = self.path_and_filename(wof_id)

        local_path = self.local_path(wof_id)
        local_dir = os.path.dirname(local_path)

        s3_key = six.u('/').join(('data', s3_path, filename))
        try:
            bucket = self.WOF_S3_BUCKET
            self.s3.head_object(Bucket=bucket, Key=s3_key)
            ensure_dir(local_dir)
            if not os.path.exists(local_path):
                self.s3.download_file(self.WOF_S3_BUCKET, s3_key, local_path)
            return True
        except Exception:
            return False
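
The download_file method above calls head_object and download_file on self.s3, which matches the interface of a boto3 S3 client; a standalone sketch of the same check-then-download pattern (the function name and the way the client is constructed are illustrative, not taken from the example) is:

import os
import boto3

def fetch_if_exists(bucket, key, local_path):
    # Sketch of the pattern above: confirm the object exists, make sure the
    # target directory is present, then download only if the file is missing.
    s3 = boto3.client('s3')
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except Exception:
        return False
    ensure_dir(os.path.dirname(local_path))
    if not os.path.exists(local_path):
        s3.download_file(bucket, key, local_path)
    return True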
def download_wof_postcodes(wof_dir):
    ensure_dir(wof_dir)

    clone_repo(wof_dir, WOF_PLACE_DATA_REPO)

    response = requests.get(SEED_URLS_JSON)
    if response.ok:
        content = json.loads(response.content)

        for d in content:
            repo_name = d['name']

            if int(d.get('count', 0)) > 0:
                repo = d['url']
                print('doing {}'.format(repo_name))

                repo_dir = clone_repo(wof_dir, repo)

            else:
                print('skipping {}'.format(repo_name))
Example #6
    def create_zetashapes_neighborhoods_index(cls):
        scratch_dir = cls.SCRATCH_DIR
        repo_path = os.path.join(scratch_dir, 'neighborhoods')
        cls.clone_repo(repo_path)

        neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
        ensure_dir(neighborhoods_dir)

        index = GeohashPolygonIndex()

        have_geonames = set()
        is_neighborhood = set()

        for filename in os.listdir(repo_path):
            path = os.path.join(repo_path, filename)
            base_name = filename.split('.')[0].split('gn-')[-1]
            if filename.endswith('.geojson') and filename.startswith('gn-'):
                have_geonames.add(base_name)
            elif filename.endswith('metadata.json'):
                data = json.load(open(os.path.join(repo_path, filename)))
                if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'):
                    is_neighborhood.add(base_name)

        for filename in os.listdir(repo_path):
            if not filename.endswith('.geojson'):
                continue
            base_name = filename.rsplit('.geojson')[0]
            if base_name in have_geonames:
                f = open(os.path.join(repo_path, 'gn-{}'.format(filename)))
            elif base_name in is_neighborhood:
                f = open(os.path.join(repo_path, filename))
            else:
                continue
            index.add_geojson_like_file(json.load(f)['features'])

        return index
Example #8
    def __init__(self, filename, db_dir):
        self.filename = filename

        self.node_ids = array.array('l')

        self.logger = logging.getLogger('osm.intersections')

        # Store these in a LevelDB
        ensure_dir(db_dir)
        ways_dir = os.path.join(db_dir, 'ways')
        ensure_dir(ways_dir)
        nodes_dir = os.path.join(db_dir, 'nodes')
        ensure_dir(nodes_dir)
        self.way_props = LevelDB(ways_dir)
        self.node_props = LevelDB(nodes_dir)

        # These form a graph and should always have the same length
        self.intersection_edges_nodes = array.array('l')
        self.intersection_edges_ways = array.array('l')
Example #10
    def __init__(self, wof_dir, cache_size=10000, **s3_args):
        self.wof_dir = wof_dir
        self.admin_dir = os.path.join(wof_dir, 'admin')
        ensure_dir(self.admin_dir)
        self.client = WhosOnFirst(self.admin_dir, **s3_args)
Example #11
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries) and their dependencies, create
        an R-tree index for coarse-grained reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (zs, qs):
            for i, (props, poly) in enumerate(idx.polygons):
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.iteritems():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            level = props.get(QuattroshapesReverseGeocoder.LEVEL)
                            if is_neighborhood and level != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                attrs['source'] = 'osm'
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
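
The matching loop above weights name tokens with an IDFIndex built from cls.count_words, neither of which is defined in these snippets. A minimal sketch of the underlying idea, using a hypothetical class and assuming update() receives a token-to-count mapping (as the idf.update(doc) calls suggest), is:

import math
from collections import defaultdict

class SimpleIDFIndex(object):
    # Hypothetical stand-in for IDFIndex: tracks how many documents (names)
    # contain each token and exposes a smoothed inverse document frequency.
    def __init__(self):
        self.doc_frequency = defaultdict(int)
        self.num_docs = 0

    def update(self, doc):
        # doc maps token -> count for one name, e.g. {'east': 1, 'village': 1}
        self.num_docs += 1
        for token in doc:
            self.doc_frequency[token] += 1

    def idf(self, token):
        return math.log(float(self.num_docs + 1) / (self.doc_frequency.get(token, 0) + 1))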
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries) and their dependencies, create
        an R-tree index for coarse-grained reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        logger = logging.getLogger('neighborhoods')

        qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)

        logger.info('Creating ClickThatHood neighborhoods')
        cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()

        logger.info('Creating OSM neighborhoods')
        osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)

        logger.info('Creating Quattroshapes neighborhoods')
        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)

        country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

        osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
        osm_admin_rtree.cache_size = 1000

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (cth, qs, osmn):
            for i in xrange(idx.i):
                props = idx.get_properties(i)
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in six.iteritems(attrs):
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        for i in six.moves.xrange(osmn.i):
            props = osmn.get_properties(i)
            poly = osmn.get_polygon(i)

            props['source'] = 'osm'
            props['component'] = AddressFormatter.SUBURB
            props['polygon_type'] = 'neighborhood'

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

        qs.matched = [False] * qs.i
        cth.matched = [False] * cth.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
        num_polys = 0
        for element_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            id_type, element_id = element_id.split(':')
            element_id = long(element_id)

            attrs['type'] = id_type
            attrs['id'] = element_id

            possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
            is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)

            country, candidate_languages = country_rtree.country_and_languages(lat, lon)

            component_name = None

            component_name = osm_address_components.component_from_properties(country, attrs)

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])

            for idx in (cth, qs):
                candidates = idx.get_candidate_polygons(lat, lon, return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props = idx.get_properties(i)
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                poly = idx.get_polygon(i)
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
                existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)

                skip_node = False

                for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                    for poly_index, osm_props in enumerate(boundaries):
                        containing_component = None
                        name = osm_props.get('name')
                        # Only exact name matches here since we're comparing OSM to OSM
                        if name and name.lower() != attrs.get('name', '').lower():
                            continue

                        if boundaries is existing_neighborhood_boundaries:
                            containing_component = AddressFormatter.SUBURB
                            skip_node = True
                            break
                        else:
                            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]

                            containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)

                        if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
                            skip_node = True
                            break
                    if skip_node:
                        break

                # Skip this element
                if skip_node:
                    continue

                if idx is cth:
                    if props['component'] == AddressFormatter.SUBURB:
                        attrs['polygon_type'] = 'neighborhood'
                    elif props['component'] == AddressFormatter.CITY_DISTRICT:
                        attrs['polygon_type'] = 'local_admin'
                    else:
                        continue
                    source = 'osm_cth'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)

                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
                component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
                attrs['component'] = component

                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
            for i in xrange(idx.i):
                props = idx.get_properties(i)
                poly = idx.get_polygon(i)
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is cth:
                    component = props['component']
                    if component == AddressFormatter.SUBURB:
                        props['polygon_type'] = 'neighborhood'
                    elif component == AddressFormatter.CITY_DISTRICT:
                        props['polygon_type'] = 'local_admin'
                    else:
                        continue
                elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    component = AddressFormatter.SUBURB
                    name = props.get('name')
                    if not name:
                        continue
                    for pattern, repl in cls.regex_replacements:
                        name = pattern.sub(repl, name)

                    props['name'] = name

                    if cls.quattroshapes_city_district_regex.match(name):
                        component = AddressFormatter.CITY_DISTRICT

                    props['component'] = component
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
    def create_neighborhoods_index(cls, osm_neighborhoods_file):
        scratch_dir = cls.SCRATCH_DIR
        neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
        ensure_dir(neighborhoods_dir)

        return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir)
        zip_path = filename + '.zip'
        zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])

        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)

        download_pre_release_downloads(out_dir)

        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '--out-dir',
                        required=True,
                        help='Output directory')

    parser.add_argument('--all', action='store_true',
                        default=False, help='Download all completed OpenAddresses files')

    args = parser.parse_args()
    ensure_dir(args.out_dir)

    if args.all:
        openaddresses_download_all_files(args.out_dir)
    else:
        openaddresses_download_configured_files(args.out_dir)
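
Run as a script, this entry point needs only an output directory, plus --all to fetch every completed file rather than just the configured sources; e.g. an invocation along the lines of python <script>.py -o ./openaddresses --all (the script's actual filename isn't shown in the example).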
Example #15
    def create_from_osm_and_quattroshapes(cls,
                                          filename,
                                          quattroshapes_dir,
                                          output_dir,
                                          scratch_dir=SCRATCH_DIR):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries) and their dependencies, create
        an R-tree index for coarse-grained reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(
            quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (zs, qs):
            for i, (props, poly) in enumerate(idx.polygons):
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.iteritems():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([
                    v for k, v in attrs.iteritems()
                    if k.startswith('{}:'.format(name_key))
                ])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat,
                                                        lon,
                                                        return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(
                            ((char_scripts[ord(c)] or '').lower()
                             in ideographic_scripts
                             for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            if is_neighborhood and idx is qs and props.get(
                                    QuattroshapesReverseGeocoder.LEVEL
                            ) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(
                                    osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(
                                    osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                arg_max = (max_sim, props, poly.context, idx,
                                           i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                    source = 'osm_zeta'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL,
                                          None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)

        download_pre_release_downloads(out_dir)

        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-o',
                        '--out-dir',
                        required=True,
                        help='Output directory')

    parser.add_argument('--all',
                        action='store_true',
                        default=False,
                        help='Download all completed OpenAddresses files')

    args = parser.parse_args()
    ensure_dir(args.out_dir)

    if args.all:
        openaddresses_download_all_files(args.out_dir)
    else:
        openaddresses_download_configured_files(args.out_dir)