def _add_fixed_alt_names(locations_by_name):
    """Register hard-coded alternate spellings for well-known locations.

    For each (canonical name, alternate spellings, resolution) entry below,
    look up the single matching record in ``locations_by_name`` and register
    it again under every alternate spelling so searches for e.g. 'USA' find
    'United States of America'.
    """
    fixed_entries = (
        # Countries
        (
            'United States of America',
            ('USA', 'U.S.A.', 'US', 'U.S.', 'United States',
             'the United States', 'America'),
            ResolutionTypes.COUNTRY
        ),
        ('United Kingdom', ('Great Britain', 'Britain', 'UK', 'U.K.'),
         ResolutionTypes.COUNTRY),
        ('South Korea', ('Korea',), ResolutionTypes.COUNTRY),
        ('North Korea', ('Korea',), ResolutionTypes.COUNTRY),
        ('The Netherlands', ('Netherlands', 'Holland',),
         ResolutionTypes.COUNTRY),
        ("Cote d'Ivoire", ('Ivory Coast',), ResolutionTypes.COUNTRY),
        ('Palestinian Territory', ('Palestine',), ResolutionTypes.COUNTRY),
        # Cities
        ('New York City', ('NYC', 'N.Y.C.'), ResolutionTypes.CITY),
        ('Los Angeles', ('LA', 'L.A.'), ResolutionTypes.CITY),
    )
    for real_name, alt_names, resolution in fixed_entries:
        candidates = [
            loc
            for loc in locations_by_name[
                standardize_loc_name(real_name)].itervalues()
            if loc['resolution'] == resolution
        ]
        # Exactly one record must match the canonical name at this resolution.
        assert len(candidates) == 1
        match = candidates[0]
        for alt_name in alt_names:
            locations_by_name[standardize_loc_name(alt_name)][
                match['id']] = match
def _add_state_abbreviations(filepath, locations_by_name):
    """
    We think the abbreviation for a US state can be a name, so add them to the
    locations map.

    ``filepath`` is a tab-separated file of (state name, two-letter
    abbreviation) rows. Each state is registered under both the bare
    abbreviation ('NY') and the dotted form ('N.Y.'). Raises ValueError if a
    listed state has no matching US ADMIN_1 record.
    """
    with open(filepath) as states_file:
        for state_name, abbrev in csv.reader(states_file, delimiter='\t'):
            assert len(abbrev) == 2
            standard_state = standardize_loc_name(state_name)
            matched = False
            for candidate in locations_by_name[standard_state].itervalues():
                is_us_state = (
                    candidate['resolution'] == ResolutionTypes.ADMIN_1 and
                    candidate['admin_level_1'] == standard_state and
                    candidate['country_code'] == u'US'
                )
                if not is_us_state:
                    continue
                # Register both the plain and the dotted abbreviation forms.
                dotted = '%s.%s.' % (abbrev[0], abbrev[1])
                for abbrev_form in (abbrev, dotted):
                    key = standardize_loc_name(abbrev_form)
                    locations_by_name[key][candidate['id']] = candidate
                matched = True
                break
            if not matched:
                raise ValueError
def _find_single_location(name, country, resolution):
    """Return the unique location record matching name, country and resolution.

    Asserts that exactly one record in _LOCATIONS_BY_NAME matches all three
    criteria after name standardization.
    """
    standard_name = standardize_loc_name(name)
    standard_country = standardize_loc_name(country)
    matches = [
        loc for loc in _LOCATIONS_BY_NAME[standard_name].itervalues()
        if (loc['name'] == standard_name
            and loc['country'] == standard_country
            and loc['resolution'] == resolution)
    ]
    assert len(matches) == 1
    return matches[0]
def _name_search(self, name, resolution=None):
    """Return copies of all locations registered under ``name``.

    If ``resolution`` is given, only locations of that resolution are
    returned. The result maps location id -> shallow copy of the record, so
    callers can mutate entries without touching the shared index.
    """
    standard_name = standardize_loc_name(name)
    candidates = self._locations_by_name.get(standard_name, {})
    results = {}
    for id_, loc in candidates.iteritems():
        if resolution and loc['resolution'] != resolution:
            continue
        results[id_] = loc.copy()
    return results
def _add_fixed_alt_names():
    """Register the hard-coded alternate names from FIXED_ALTERNATE_NAMES.

    Each key identifies one canonical location (name, country, resolution);
    its record is re-registered in _LOCATIONS_BY_NAME under every alternate
    spelling listed for it.
    """
    for key, alt_names in FIXED_ALTERNATE_NAMES.iteritems():
        real_name, country, resolution = key
        location = _find_single_location(real_name, country, resolution)
        for alt in alt_names:
            standard_alt = standardize_loc_name(alt)
            _LOCATIONS_BY_NAME[standard_alt][location['id']] = location
def _load_admin1_data(filepath, countries_by_code):
    """Load first-level administrative divisions from a GeoNames-style TSV.

    Registers each division in the module-level _LOCATIONS_BY_NAME and
    _LOCATIONS_BY_ID indexes and returns a dict mapping the full admin1 code
    (e.g. 'US.NY') to the record. Population starts at 0 and is accumulated
    later from city data.
    """
    admin1_by_code = {}
    with open(filepath) as admin1_file:
        rows = csv.reader(admin1_file, dialect='excel-tab',
                          quoting=csv.QUOTE_NONE)
        for full_admin1_code, name, ascii_name, geoname_id in rows:
            standard_name = standardize_loc_name(name)
            # Skip rows missing an id or a usable name.
            if not geoname_id or not standard_name:
                continue
            country_code, _ = full_admin1_code.split('.')
            country = countries_by_code[country_code]
            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.ADMIN_1,
                'name': standard_name,
                'country_code': country_code,
                'country': country['name'],
                'country_id': country['id'],
                'population': 0,
            }
            _LOCATIONS_BY_NAME[standard_name][geoname_id] = data
            for alt_name in set(get_alt_punc_names(standard_name)):
                _LOCATIONS_BY_NAME[alt_name][geoname_id] = data
            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data
            admin1_by_code[full_admin1_code] = data
    return admin1_by_code
def _name_search(self, name, resolution=None):
    """Return copies of all locations registered under ``name``.

    Continent and ocean names are deliberately not searchable and return an
    empty result. If ``resolution`` is given, only locations of that
    resolution are returned; values are shallow copies so callers cannot
    mutate the shared index.
    """
    name = standardize_loc_name(name)
    if name in DataSource.CONTINENTS or name in DataSource.OCEANS:
        return {}
    # Use .get() instead of indexing: on a defaultdict-backed map a plain
    # [name] lookup would insert an empty entry as a side effect of every
    # miss (and raise KeyError on a plain dict). This also matches the
    # other _name_search implementation in this file.
    return {
        id_: loc.copy()
        for id_, loc in self._locations_by_name.get(name, {}).iteritems()
        if not resolution or loc['resolution'] == resolution
    }
def _add_alternate_names(filepath):
    """Register alternate names: fixed spellings plus wiki-derived aliases.

    The fixed alternates are always added. ``filepath``, if present, is a
    JSON file mapping location id -> list of alternate names; those are only
    applied to locations whose population meets
    _MIN_POPULATION_FOR_ALT_WIKI_NAMES.
    """
    _add_fixed_alt_names()
    if not os.path.isfile(filepath):
        # Wiki alternate names are optional.
        return
    with open(filepath) as alt_names_file:
        alt_names_by_id = json.load(alt_names_file)
    for id_, alt_names in alt_names_by_id.iteritems():
        location = _LOCATIONS_BY_ID[id_]
        if location['population'] < _MIN_POPULATION_FOR_ALT_WIKI_NAMES:
            continue
        for alt_name in alt_names:
            standard_alt = standardize_loc_name(alt_name)
            _LOCATIONS_BY_NAME[standard_alt][id_] = location
def _load_city_data(filepath, countries_by_code, admin1_by_code,
                    admin2_by_code):
    """Load city records from a GeoNames-style TSV dump.

    Only rows whose feature code is in _KEEP_FEATURE_CODES are kept. Each
    city is registered in _LOCATIONS_BY_NAME (under its standardized name and
    punctuation variants) and in _LOCATIONS_BY_ID, and its population is
    rolled up into its admin level 1 and 2 areas when those are known.
    """
    with open(filepath) as city_file:
        rows = csv.reader(city_file, dialect='excel-tab',
                          quoting=csv.QUOTE_NONE)
        for (geoname_id, name, ascii_name, alternate_names, latitude,
             longitude, feature_class, feature_code, country_code, cc2,
             admin1_code, admin2_code, admin3_code, admin4_code, population,
             elevation, dem, timezone, modification_date) in rows:
            if feature_code.upper() not in _KEEP_FEATURE_CODES:
                continue
            standard_name = standardize_loc_name(name)
            # Skip rows missing an id or a usable name.
            if not geoname_id or not standard_name:
                continue
            # Admin areas are keyed by dotted codes, e.g. 'US.NY' / 'US.NY.061'.
            admin1 = admin1_by_code.get('%s.%s' % (country_code, admin1_code))
            admin2 = admin2_by_code.get(
                '%s.%s.%s' % (country_code, admin1_code, admin2_code))
            country = countries_by_code[country_code]
            city_population = int(population)
            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.CITY,
                'name': standard_name,
                'country_code': country_code,
                'country': country['name'],
                'country_id': country['id'],
                'admin_level_1': admin1['name'] if admin1 else None,
                'admin_level_1_id': admin1['id'] if admin1 else None,
                'admin_level_2': admin2['name'] if admin2 else None,
                'admin_level_2_id': admin2['id'] if admin2 else None,
                'population': city_population,
                'latitude': float(latitude),
                'longitude': float(longitude),
            }
            index_names = set([standard_name])
            index_names.update(get_alt_punc_names(standard_name))
            for index_name in index_names:
                _LOCATIONS_BY_NAME[index_name][geoname_id] = data
            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data
            # Accumulate city populations into the containing admin areas.
            for admin_area in (admin1, admin2):
                if admin_area:
                    admin_area['population'] += city_population
def _load_country_data(filepath):
    """Load country records from a GeoNames-style countryInfo TSV.

    Registers each country in _LOCATIONS_BY_NAME (under its standardized
    name and punctuation variants) and in _LOCATIONS_BY_ID, then resolves
    each country's neighbor ISO codes into neighbor geoname ids. Returns a
    dict mapping ISO country code -> country record.
    """
    countries_by_code = {}
    with open(filepath) as country_file:
        reader = csv.reader(country_file, dialect='excel-tab',
                            quoting=csv.QUOTE_NONE)
        for row in reader:
            # Header/comment lines in the dump start with '#'.
            if row[0].startswith('#'):
                continue
            (iso, iso3, isonumeric, fips, name, capital, areakm2, population,
             continent_code, tld, currency_code, currency_name, phone,
             postal_code_format, postal_code_regex, languages, geoname_id,
             neighbors, equivalent_fips_code) = row
            standard_name = standardize_loc_name(name)
            # Skip rows missing an id or a usable name.
            if not geoname_id or not standard_name:
                continue
            data = {
                'id': geoname_id,
                'resolution': ResolutionTypes.COUNTRY,
                'name': standard_name,
                'country_code': iso,
                'country': standard_name,
                'country_id': geoname_id,
                'population': int(population),
                # Temporary field, replaced by neighbor_country_ids below.
                'neighbor_country_codes': neighbors.split(','),
            }
            _LOCATIONS_BY_NAME[standard_name][geoname_id] = data
            for alt_name in set(get_alt_punc_names(standard_name)):
                _LOCATIONS_BY_NAME[alt_name][geoname_id] = data
            assert geoname_id not in _LOCATIONS_BY_ID
            _LOCATIONS_BY_ID[geoname_id] = data
            countries_by_code[iso] = data
    # Second pass: resolve neighbor ISO codes to geoname ids. Iterate
    # countries_by_code rather than _LOCATIONS_BY_ID so this only touches
    # the records created above — iterating the global index would break if
    # any non-country location had already been loaded, since only country
    # records carry 'neighbor_country_codes'.
    for country in countries_by_code.itervalues():
        country['neighbor_country_ids'] = [
            countries_by_code[code]['country_id']
            for code in country['neighbor_country_codes']
            if code in countries_by_code
        ]
        del country['neighbor_country_codes']
    return countries_by_code
def _load_main_data(filepath, alt_names_by_id):
    """Load the main OSM-derived location dump.

    ``filepath`` is a TSV whose first row names the 23 columns;
    ``alt_names_by_id`` maps osm_id -> extra alternate names. Returns
    (locations_by_name, locations_by_id), where locations_by_name maps each
    standardized name/alias to {osm_id: record}.
    """
    locations_by_name = defaultdict(dict)
    locations_by_id = {}
    with open(filepath) as loc_file:
        rows = csv.reader(loc_file, delimiter='\t')
        keys = next(rows)
        previous_importance = 1.
        for row in rows:
            assert len(row) == 23
            loc_info = dict(zip(keys, row))
            importance = float(loc_info['importance'])
            # The dump is expected to be sorted by descending importance.
            assert importance <= previous_importance
            previous_importance = importance
            resolution = _get_resolution(loc_info)
            if not resolution:
                continue
            osm_id = int(loc_info['osm_id'])
            data = {
                'id': osm_id,
                'resolution': resolution,
                'name': standardize_loc_name(loc_info['name']),
                'latitude': float(loc_info['lat']),
                'longitude': float(loc_info['lon']),
                'importance': importance,
                'city': standardize_loc_name(loc_info['city']),
                'admin_level_2': standardize_loc_name(loc_info['county']),
                'admin_level_1': standardize_loc_name(loc_info['state']),
                'country': standardize_loc_name(loc_info['country']),
                'country_code': loc_info['country_code'].upper(),
            }
            if _should_skip_location(data, locations_by_name):
                continue
            # Non-ASCII OSM alternative names are dropped.
            alt_osm_names = [
                raw for raw in loc_info['alternative_names'].split(',')
                if _is_ascii(raw)
            ]
            candidate_names = (
                [loc_info['name']] + alt_osm_names + alt_names_by_id[osm_id] +
                get_alt_punc_names(loc_info['name'])
            )
            for alias in set(
                    standardize_loc_name(raw) for raw in candidate_names):
                locations_by_name[alias][osm_id] = data
            assert osm_id not in locations_by_id
            locations_by_id[osm_id] = data
    return locations_by_name, locations_by_id
def _add_missing_countries(filepath, locations_by_name, locations_by_id):
    """
    Some countries appear as countries for another location, but don't appear
    as a distinct row themselves. Add these precalculated countries to the
    data.

    ``filepath``, if present, is a JSON list of country records each carrying
    an extra 'alt_names' list, which is stripped off and used (together with
    the name's punctuation variants) to index the record.
    """
    if not os.path.isfile(filepath):
        # The precalculated file is optional.
        return
    with open(filepath) as country_file:
        missing_countries = json.load(country_file)
    for country in missing_countries:
        # 'alt_names' is index-only metadata, not part of the record.
        alt_wiki_names = country.pop('alt_names')
        candidate_names = (
            [country['name']] + alt_wiki_names +
            get_alt_punc_names(country['name'])
        )
        for alias in set(
                standardize_loc_name(raw) for raw in candidate_names):
            locations_by_name[alias][country['id']] = country
        assert country['id'] not in locations_by_id
        locations_by_id[country['id']] = country