Example #1
def _add_fixed_alt_names(locations_by_name):
    for real_name, alt_names, resolution in (
        # Countries
        (
            'United States of America',
            ('USA', 'U.S.A.', 'US', 'U.S.', 'United States', 'the United States', 'America'),
            ResolutionTypes.COUNTRY
        ),
        ('United Kingdom', ('Great Britain', 'Britain', 'UK', 'U.K.'), ResolutionTypes.COUNTRY),
        ('South Korea', ('Korea',), ResolutionTypes.COUNTRY),
        ('North Korea', ('Korea',), ResolutionTypes.COUNTRY),
        ('The Netherlands', ('Netherlands', 'Holland',), ResolutionTypes.COUNTRY),
        ("Cote d'Ivoire", ('Ivory Coast',), ResolutionTypes.COUNTRY),
        ('Palestinian Territory', ('Palestine',), ResolutionTypes.COUNTRY),
        # Cities
        ('New York City', ('NYC', 'N.Y.C.'), ResolutionTypes.CITY),
        ('Los Angeles', ('LA', 'L.A.'), ResolutionTypes.CITY),
    ):
        locations = [
            loc for loc in locations_by_name[standardize_loc_name(real_name)].itervalues()
            if loc['resolution'] == resolution
        ]
        assert len(locations) == 1
        location = locations[0]

        for alt_name in alt_names:
            locations_by_name[standardize_loc_name(alt_name)][location['id']] = location
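All of these examples share the same index shape: a map from a standardized location name to a dict of {location id: location record}, so registering an alternate name simply points another key at the same record. A minimal, self-contained sketch of that aliasing; standardize_loc_name is stubbed as lower-casing and the id/resolution values are illustrative, since the real helpers are not part of these snippets:

from collections import defaultdict


def standardize_loc_name(name):
    # Assumed behaviour; the real helper is not shown in these examples.
    return name.strip().lower()


locations_by_name = defaultdict(dict)

# One location record, registered under its primary name...
usa = {'id': '6252001', 'resolution': 'country', 'name': 'united states of america'}
locations_by_name[standardize_loc_name('United States of America')][usa['id']] = usa

# ...and aliased under each alternate name. Every key points at the same dict object.
for alt_name in ('USA', 'U.S.', 'America'):
    locations_by_name[standardize_loc_name(alt_name)][usa['id']] = usa

assert locations_by_name['usa']['6252001'] is usa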
Example #2
def _add_state_abbreviations(filepath, locations_by_name):
    """
    The two-letter abbreviation of a US state can also be used as its name, so add the
    abbreviations to the locations map.
    """
    with open(filepath) as states_file:
        csv_reader = csv.reader(states_file, delimiter='\t')
        for state, abbrev in csv_reader:
            assert len(abbrev) == 2
            state = standardize_loc_name(state)

            state_found = False
            for candidate in locations_by_name[state].itervalues():
                if (
                    candidate['resolution'] == ResolutionTypes.ADMIN_1 and
                    candidate['admin_level_1'] == state and
                    candidate['country_code'] == u'US'
                ):
                    for abbrev_name in (abbrev, '%s.%s.' % (abbrev[0], abbrev[1])):
                        abbrev_name = standardize_loc_name(abbrev_name)
                        locations_by_name[abbrev_name][candidate['id']] = candidate
                    state_found = True
                    break

            if not state_found:
                raise ValueError('no US state found matching %r' % state)
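For each state row, the loop above registers both the bare abbreviation and a dotted variant as lookup keys. A quick sketch of that fan-out, again with standardize_loc_name stubbed as lower-casing (an assumption):

def standardize_loc_name(name):
    return name.strip().lower()  # assumed behaviour


abbrev = 'CA'
variants = (abbrev, '%s.%s.' % (abbrev[0], abbrev[1]))   # ('CA', 'C.A.')
print([standardize_loc_name(v) for v in variants])       # ['ca', 'c.a.']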
Example #3
def _find_single_location(name, country, resolution):
    name = standardize_loc_name(name)
    matches = [
        loc for loc in _LOCATIONS_BY_NAME[name].itervalues()
        if (
            loc['name'] == name and
            loc['country'] == standardize_loc_name(country) and
            loc['resolution'] == resolution
        )
    ]
    assert len(matches) == 1
    return matches[0]
Example #4
def _name_search(self, name, resolution=None):
    name = standardize_loc_name(name)
    return {
        id_: loc.copy()
        for id_, loc in self._locations_by_name.get(name, {}).iteritems()
        if not resolution or loc['resolution'] == resolution
    }
Example #5
def _add_fixed_alt_names():
    for (real_name, country, resolution), alt_names in FIXED_ALTERNATE_NAMES.iteritems():
        location = _find_single_location(real_name, country, resolution)
        for alt_name in alt_names:
            _LOCATIONS_BY_NAME[standardize_loc_name(alt_name)][location['id']] = location
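Here the aliases come from a module-level constant rather than an inline tuple. Judging by how it is unpacked, FIXED_ALTERNATE_NAMES maps (real name, country, resolution) tuples to tuples of alternate names; the entries and the ResolutionTypes stub below are illustrative only:

class ResolutionTypes(object):
    # Stub with assumed values; the real constants live elsewhere in the project.
    COUNTRY = 'country'
    CITY = 'city'


FIXED_ALTERNATE_NAMES = {
    ('United States', 'United States', ResolutionTypes.COUNTRY): ('USA', 'U.S.', 'America'),
    ('New York City', 'United States', ResolutionTypes.CITY): ('NYC', 'N.Y.C.'),
}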
Example #6
def _load_admin1_data(filepath, countries_by_code):
    admin1_by_code = {}

    with open(filepath) as admin1_file:
        reader = csv.reader(admin1_file,
                            dialect='excel-tab',
                            quoting=csv.QUOTE_NONE)
        for (full_admin1_code, name, ascii_name, geoname_id) in reader:
            standard_name = standardize_loc_name(name)
            if not geoname_id or not standard_name:
                continue

            country_code, admin1_code = full_admin1_code.split('.')
            country = countries_by_code[country_code]
            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.ADMIN_1,
                'name': standard_name,
                'country_code': country_code,
                'country': country['name'],
                'country_id': country['id'],
                'population': 0,
            }

            _LOCATIONS_BY_NAME[standard_name][geoname_id] = data
            for alt_name in set(get_alt_punc_names(standard_name)):
                _LOCATIONS_BY_NAME[alt_name][geoname_id] = data

            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data
            admin1_by_code[full_admin1_code] = data

    return admin1_by_code
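The admin-1 file is tab-separated with four columns (full admin-1 code, name, ASCII name, geoname id), and the full code splits on the dot into its country and admin-1 parts. A one-row parse in that format (the sample values are illustrative):

row = 'US.CA\tCalifornia\tCalifornia\t5332921'
full_admin1_code, name, ascii_name, geoname_id = row.split('\t')

country_code, admin1_code = full_admin1_code.split('.')
print('%s %s %s %s' % (country_code, admin1_code, name, geoname_id))  # US CA California 5332921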
Example #7
def _name_search(self, name, resolution=None):
    name = standardize_loc_name(name)
    if name in DataSource.CONTINENTS or name in DataSource.OCEANS:
        return {}
    return {
        id_: loc.copy() for id_, loc in self._locations_by_name[name].iteritems()
        if not resolution or loc['resolution'] == resolution
    }
Example #8
def _add_alternate_names(filepath):
    _add_fixed_alt_names()

    if not os.path.isfile(filepath):
        return

    with open(filepath) as alt_names_file:
        alt_names_by_id = json.load(alt_names_file)

    for id_, alt_names in alt_names_by_id.iteritems():
        location = _LOCATIONS_BY_ID[id_]
        if location['population'] >= _MIN_POPULATION_FOR_ALT_WIKI_NAMES:
            for alt_name in alt_names:
                _LOCATIONS_BY_NAME[standardize_loc_name(alt_name)][id_] = location
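The alternate-names file read above is a JSON object keyed by location id, each value being a list of alternate names; only locations at or above _MIN_POPULATION_FOR_ALT_WIKI_NAMES pick the names up. A hypothetical payload in that shape (ids and names are illustrative):

import json

alt_names_by_id = json.loads('{"5128581": ["The Big Apple", "New Amsterdam"]}')
for id_, alt_names in alt_names_by_id.items():
    print('%s -> %s' % (id_, ', '.join(alt_names)))  # 5128581 -> The Big Apple, New Amsterdam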
Example #9
def _load_city_data(filepath, countries_by_code, admin1_by_code,
                    admin2_by_code):
    with open(filepath) as city_file:
        reader = csv.reader(city_file,
                            dialect='excel-tab',
                            quoting=csv.QUOTE_NONE)
        for (geoname_id, name, ascii_name, alternate_names, latitude,
             longitude, feature_class, feature_code, country_code, cc2,
             admin1_code, admin2_code, admin3_code, admin4_code, population,
             elevation, dem, timezone, modification_date) in reader:
            if feature_code.upper() not in _KEEP_FEATURE_CODES:
                continue

            standard_name = standardize_loc_name(name)
            if not geoname_id or not standard_name:
                continue

            admin1 = admin1_by_code.get('%s.%s' % (country_code, admin1_code))
            admin2 = admin2_by_code.get(
                '%s.%s.%s' % (country_code, admin1_code, admin2_code))
            country = countries_by_code[country_code]
            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.CITY,
                'name': standard_name,
                'country_code': country_code,
                'country': country['name'],
                'country_id': country['id'],
                'admin_level_1': admin1['name'] if admin1 else None,
                'admin_level_1_id': admin1['id'] if admin1 else None,
                'admin_level_2': admin2['name'] if admin2 else None,
                'admin_level_2_id': admin2['id'] if admin2 else None,
                'population': int(population),
                'latitude': float(latitude),
                'longitude': float(longitude),
            }

            _LOCATIONS_BY_NAME[standard_name][geoname_id] = data
            for alt_name in set(get_alt_punc_names(standard_name)):
                _LOCATIONS_BY_NAME[alt_name][geoname_id] = data

            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data

            if admin1:
                admin1['population'] += int(population)
            if admin2:
                admin2['population'] += int(population)
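Admin-1 and admin-2 records are created with population 0 (see Example #6) and accumulate the populations of the cities loaded here. A minimal illustration of that roll-up with plain dicts and made-up numbers:

admin1 = {'name': 'california', 'population': 0}

cities = [
    {'name': 'los angeles', 'population': 3898747, 'admin1': admin1},
    {'name': 'san francisco', 'population': 873965, 'admin1': admin1},
]

for city in cities:
    # Mirrors the admin1['population'] += int(population) step above.
    city['admin1']['population'] += city['population']

print(admin1['population'])  # 4772712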
Example #10
def _load_country_data(filepath):
    countries_by_code = {}

    with open(filepath) as country_file:
        reader = csv.reader(country_file,
                            dialect='excel-tab',
                            quoting=csv.QUOTE_NONE)
        for row in reader:
            if row[0].startswith('#'):
                continue

            (iso, iso3, isonumeric, fips, name, capital, areakm2, population,
             continent_code, tld, currency_code, currency_name, phone,
             postal_code_format, postal_code_regex, languages, geoname_id,
             neighbors, equivalent_fips_code) = row
            standard_name = standardize_loc_name(name)
            if not geoname_id or not standard_name:
                continue

            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.COUNTRY,
                'name': standard_name,
                'country_code': iso,
                'country': standard_name,
                'country_id': geoname_id,
                'population': int(population),
                'neighbor_country_codes': neighbors.split(','),
            }

            _LOCATIONS_BY_NAME[standard_name][geoname_id] = data
            for alt_name in set(get_alt_punc_names(standard_name)):
                _LOCATIONS_BY_NAME[alt_name][geoname_id] = data

            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data
            countries_by_code[iso] = data

    for country in _LOCATIONS_BY_ID.itervalues():
        country['neighbor_country_ids'] = [
            countries_by_code[code]['country_id']
            for code in country['neighbor_country_codes']
            if code in countries_by_code
        ]
        del country['neighbor_country_codes']

    return countries_by_code
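After every country row is loaded, the second pass turns each record's neighbor ISO codes into neighbor ids and drops the code list, skipping codes that never made it into the map. A self-contained sketch of that conversion with two made-up records:

countries_by_code = {
    'US': {'country_id': '6252001', 'neighbor_country_codes': ['CA', 'MX']},
    'CA': {'country_id': '6251999', 'neighbor_country_codes': ['US']},
}

for country in countries_by_code.values():
    country['neighbor_country_ids'] = [
        countries_by_code[code]['country_id']
        for code in country['neighbor_country_codes']
        if code in countries_by_code  # 'MX' was never loaded here, so it is dropped
    ]
    del country['neighbor_country_codes']

print(countries_by_code['US']['neighbor_country_ids'])  # ['6251999']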
Example #11
def _load_main_data(filepath, alt_names_by_id):
    locations_by_name = defaultdict(dict)
    locations_by_id = {}

    with open(filepath) as loc_file:
        csv_reader = csv.reader(loc_file, delimiter='\t')
        keys = next(csv_reader)
        last_importance = 1.

        for row in csv_reader:
            assert len(row) == 23
            loc_info = dict(zip(keys, row))
            importance = float(loc_info['importance'])
            assert importance <= last_importance
            last_importance = importance

            resolution = _get_resolution(loc_info)
            if not resolution:
                continue

            data = dict(
                id=int(loc_info['osm_id']),
                resolution=resolution,
                name=standardize_loc_name(loc_info['name']),
                latitude=float(loc_info['lat']),
                longitude=float(loc_info['lon']),
                importance=importance,
                city=standardize_loc_name(loc_info['city']),
                admin_level_2=standardize_loc_name(loc_info['county']),
                admin_level_1=standardize_loc_name(loc_info['state']),
                country=standardize_loc_name(loc_info['country']),
                country_code=loc_info['country_code'].upper(),
            )

            if _should_skip_location(data, locations_by_name):
                continue

            alt_osm_names = [
                name for name in loc_info['alternative_names'].split(',') if _is_ascii(name)
            ]
            alt_wiki_names = alt_names_by_id[data['id']]
            alt_punc_name = get_alt_punc_names(loc_info['name'])

            all_names = set(
                standardize_loc_name(name)
                for name in [loc_info['name']] + alt_osm_names + alt_wiki_names + alt_punc_name
            )
            for name in all_names:
                locations_by_name[name][data['id']] = data

            assert data['id'] not in locations_by_id
            locations_by_id[data['id']] = data

    return locations_by_name, locations_by_id
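This loader keeps the header row and zips it against every data row, so columns are accessed by name rather than position (the file has 23 columns, per the assertion). A trimmed demonstration of that dict(zip(keys, row)) step; the column subset and values are illustrative:

keys = ['osm_id', 'name', 'lat', 'lon', 'importance']
row = ['240109189', 'Berlin', '52.5170365', '13.3888599', '0.93']

loc_info = dict(zip(keys, row))
print('%s %s' % (loc_info['name'], float(loc_info['lat'])))  # Berlin 52.5170365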
Example #12
def _add_missing_countries(filepath, locations_by_name, locations_by_id):
    """
    Some countries appear as the country of another location but never as a row of their
    own. Add these precalculated country records to the data.
    """
    if not os.path.isfile(filepath):
        return

    with open(filepath) as country_file:
        missing_countries = json.load(country_file)

    for country in missing_countries:
        alt_wiki_names = country['alt_names']
        del country['alt_names']

        for alt_name in set(
            standardize_loc_name(name)
            for name in [country['name']] + alt_wiki_names + get_alt_punc_names(country['name'])
        ):
            locations_by_name[alt_name][country['id']] = country

        assert country['id'] not in locations_by_id
        locations_by_id[country['id']] = country