def export_ne0(simp: int):
    """Export one GeoJSON file per ne0 (country) at the given simplification.

    simp: simplification level; 0 means the unsimplified source file
    (no ``-N`` suffix on the input filename).
    """
    levels = read_json(id_dir / 'ne012.json')
    simp_str = f'-{simp}' if simp else ''
    countries = read_json(geojson_dir / f'countries{simp_str}.geojson')['features']

    # Index features by Natural Earth id, lower-casing all property keys.
    features_by_id = {}
    for feature in countries:
        prop = feature['properties']
        # Iterate over a snapshot of the keys: inserting new (lower-cased)
        # keys while iterating the dict itself can skip entries after a
        # hash-table resize.
        for key in list(prop):
            prop[key.lower()] = prop.pop(key)
        features_by_id[prop['ne_id']] = feature

    for ne0, ne0_data in levels.items():
        feature_data = features_by_id[ne0_data['ne_id']]
        fix_props(feature_data, ne0_data, ne0)
        filename = ne0[4:].lower()  # strip the 'ne0:' prefix
        export_subdir = export_geojson_dir / f'{simp_map[simp]}' / 'ne0'
        export_subdir.mkdir(exist_ok=True, parents=True)
        # Bug fix: `filename` was computed but unused and every feature was
        # written to the same literal '(unknown).geojson' path, so each write
        # clobbered the previous one.
        write_json(export_subdir / f'{filename}.geojson', feature_data)
    print(f'{len(countries)} ne0 GeoJSONs exported, simplification: {simp}')
def export_ne3(simp: int):
    """Export one GeoJSON per ne3 (state/province), grouped per-country.

    simp: simplification level; 0 means the unsimplified source file.
    """
    simp_str = f'-{simp}' if simp else ''
    states = read_json(geojson_dir / f'states{simp_str}.geojson')['features']

    # Index features by Natural Earth id, lower-casing all property keys.
    features_by_id = {}
    for feature in states:
        prop = feature['properties']
        # Fix: iterate a snapshot of the keys — inserting lower-cased keys
        # while iterating the dict itself can skip entries after a resize.
        for key in list(prop):
            prop[key.lower()] = prop.pop(key)
        features_by_id[prop['ne_id']] = feature

    for country_json in ne3_dir.glob('*.json'):
        country_code = country_json.stem
        country_level_3 = read_json(country_json)
        for ne3, ne3_data in country_level_3.items():
            feature_data = features_by_id[ne3_data['ne_id']]
            fix_props(feature_data, ne3_data, ne3)
            # ne3 codes look like 'ne3:US-CA' -> country 'us', state 'ca';
            # sanity-check the country part against the source filename.
            ne3_start, ne3_end = ne3.split(':')[1].split('-')
            assert ne3_start.lower() == country_code
            export_subdir = export_geojson_dir / f'{simp_map[simp]}' / 'ne3' / country_code
            export_subdir.mkdir(exist_ok=True, parents=True)
            write_json(export_subdir / f'{ne3_end.lower()}.geojson', feature_data)
    print(f'{len(states)} ne3 GeoJSONs exported, simplification: {simp}')
def generate_iso1_list():
    """Write docs/iso1_list.md: one markdown table row per ISO 3166-1 entry."""
    iso1_json = read_json(export_dir / 'iso1.json')
    iso2_json = read_json(export_dir / 'iso2.json')
    # Countries with at least one ISO2 subdivision get a link to their list.
    iso2_country_codes = {code.split('-')[0] for code in iso2_json}

    parts = [
        '# ISO 3166-1 list\n'
        'Name | ISO1 | ISO2 | GeoJSON | OSM | Wikidata | Wikipedia | population \n'
        '--- | --- | --- | --- | --- | --- | --- | --: \n'
    ]
    for entry in sorted(iso1_json.values(), key=lambda e: e['name']):
        row = build_row_data(entry)
        iso1 = entry['iso1']
        geojson_link = f'[GeoJSON](../export/geojson/q7/{entry["geojson_path"]})'
        iso_2_link = f'[ISO2](iso2_list/{iso1}.md)' if iso1 in iso2_country_codes else ''
        parts.append(
            f'{row["name"]} | {iso1} | {iso_2_link} | {geojson_link} | '
            f'{row["osm_link"]} | {row["wikidata_link"]} | '
            f'{row["wikipedia_link"]} | {row["population_str"]}'
            f'\n')
    write_file(docs_dir / 'iso1_list.md', ''.join(parts))
def collect_iso():
    """Collect ISO1/ISO2 features from downloaded WAM GeoJSONs into ndjson files.

    Populates the module-level osm->iso1/iso2/wikidata maps (with manual
    overrides from fixes/custom_osm.json applied on top), then scans every
    downloaded ``*.GeoJson`` file and appends matching features to the
    iso1/iso2 collected ndjson outputs via ``add_iso``.
    """
    global osm_iso1_map, osm_iso2_map, osm_wd_map
    osm_iso1_map = get_osm_iso1_map()
    osm_iso2_map = get_osm_iso2_map()
    osm_wd_map = get_osm_wd_map()

    # Manual per-relation overrides; JSON keys are stringified OSM ids.
    custom_osm = read_json(fixes_dir / 'custom_osm.json')
    custom_iso1 = {int(k): v['iso1'] for k, v in custom_osm.items() if 'iso1' in v}
    custom_iso2 = {int(k): v['iso2'] for k, v in custom_osm.items() if 'iso2' in v}
    custom_wd = {
        int(k): v['wikidata_id'] for k, v in custom_osm.items() if 'wikidata_id' in v
    }
    osm_iso1_map.update(custom_iso1)
    osm_iso2_map.update(custom_iso2)
    osm_wd_map.update(custom_wd)

    # COUNTRY env var limits the scan to one country's download subdirectory.
    download_dir = wam_geojson_download_dir
    if os.environ.get('COUNTRY'):
        download_dir = download_dir / os.environ.get('COUNTRY')
    geojson_files = download_dir.glob('**/*.GeoJson')  # strange capitalization inside zips

    collected_dir.mkdir(parents=True, exist_ok=True)
    geojson_files_sorted = sorted(geojson_files, key=lambda p: p.stem, reverse=False)

    # Fix: use context managers so the output files are closed even if
    # add_iso or read_json raises part-way through.
    with open(iso1_collected_path, 'w') as iso1_found, \
            open(iso2_collected_path, 'w') as iso2_found:
        for f in geojson_files_sorted:
            print(f.parent.stem, f.stem)
            try:
                features = read_json(f)['features']
            except json.decoder.JSONDecodeError as e:
                print(f' Error reading {f.stem} {e}')
                continue
            add_iso(features, iso1_found, iso2_found)

        # add features from osm_missing
        if not os.environ.get('COUNTRY'):
            print('osm_missing_features')
            osm_missing_features = get_osm_missing_features()
            add_iso(osm_missing_features, iso1_found, iso2_found, is_fixes=True)
def get_geometry_from_medium_high(geojson_path):
    """Return the geometry for *geojson_path*, preferring the medium export.

    Falls back to the high-quality export when the medium file is missing
    or has an empty geometry.
    """
    medium_path = export_dir / 'geojson' / 'medium' / geojson_path
    if medium_path.is_file():
        geometry = read_json(medium_path)['geometry']
        if geometry:
            print(' using geometry from medium geojson')
            return geometry
    high_path = export_dir / 'geojson' / 'high' / geojson_path
    print(' using geometry from high geojson')
    return read_json(high_path)['geometry']
def save_wam_population():
    """Fetch and save Wikidata population data for every ISO1/ISO2 region.

    Reads the wikidata ids out of the high-quality iso1/iso2 geojson exports,
    queries populations for the union of both id sets, and writes the result
    to wam_data_dir/population.json.
    """
    # Idiom fix: the filename literals had f-prefixes with no placeholders.
    features = read_json(simp_dir / 'high' / 'iso1.geojson')['features']
    iso1_ids = {f['properties']['wikidata_id'] for f in features}
    features = read_json(simp_dir / 'high' / 'iso2.geojson')['features']
    iso2_ids = {f['properties']['wikidata_id'] for f in features}

    all_ids = list(iso1_ids.union(iso2_ids))
    population_data = get_population(all_ids)
    wam_data_dir.mkdir(parents=True, exist_ok=True)
    write_json(wam_data_dir / 'population.json', population_data,
               indent=2, sort_keys=True)
def generate_fips_list():
    """Write docs/fips_list.md: US counties grouped by state, one table each."""
    fips_json = read_json(export_dir / 'fips.json')
    counties = sorted(fips_json.values(), key=lambda c: c['name'])
    state_by_code = get_state_codes()[0]

    parts = [
        '# US county FIPS code list\n',
        '[GeoJSON for all counties](../export/geojson/q5/fips_all.geojson)',
    ]

    # Group counties by state code, states in ascending code order.
    county_by_state = {}
    for county in sorted(counties, key=lambda c: c['state_code']):
        county_by_state.setdefault(county['state_code'], []).append(county)

    for state_code, members in county_by_state.items():
        parts.append(f'\n\n#### {state_by_code[state_code]}, state code: {state_code}\n')
        parts.append('Name | FIPS | GeoJSON | population \n')
        parts.append('--- | --- | --- | --: \n')
        for county in sorted(members, key=lambda c: c['name']):
            fips = county['fips']
            population = county['population']
            population_str = f'{population:,}' if population else ''
            geojson_link = f'[GeoJSON](../export/geojson/q8/fips/{fips[:2]}/{fips}.geojson)'
            parts.append(f'{county["name"]} | {fips} | {geojson_link} | {population_str}\n')

    write_file(docs_dir / 'fips_list.md', ''.join(parts))
def get_osm_wd_map():
    """Map OSM relation id -> Wikidata entity id, cached as osm_wd_map.json.

    Combines two SPARQL queries: entities with an ISO1 code (P297) and
    entities with an ISO2 code (P300), both restricted to those that also
    carry an OSM relation id (P402).
    """
    file_path = wikidata_dir / 'osm_wd_map.json'
    if file_path.is_file():
        return read_json(file_path)  # reuse the cached result from a prior run

    endpoint_url = "https://query.wikidata.org/sparql"
    iso1_query = """SELECT ?region ?osm WHERE { ?region wdt:P297 ?iso1; wdt:P402 ?osm. }"""
    iso2_query = """SELECT ?region ?osm WHERE { ?region wdt:P300 ?iso2; wdt:P402 ?osm. }"""
    bindings = (
        get_results(endpoint_url, iso1_query)["results"]["bindings"]
        + get_results(endpoint_url, iso2_query)["results"]["bindings"]
    )

    osm_wd_map = {}
    for binding in bindings:
        osm_id = int(binding['osm']['value'])
        # entity URI looks like http://www.wikidata.org/entity/Q30 -> 'Q30'
        osm_wd_map[osm_id] = binding['region']['value'].split('/')[-1]

    write_json(file_path, osm_wd_map, indent=2)
    return osm_wd_map
def generate_br_muni_list():
    """Write docs/br_muni_list.md: Brazilian municipalities grouped by state."""
    br_muni_json = read_json(export_dir / 'br_muni.json')
    munis = sorted(br_muni_json.values(), key=lambda m: m['name'])

    parts = [
        '# Brazil municipality IBGE code list\n',
        '[GeoJSON for all municipalities](../export/geojson/q5/br_muni_all.geojson)',
    ]

    # Group municipalities by state code, states in ascending order.
    by_state = {}
    for muni in sorted(munis, key=lambda m: m['state_code']):
        by_state.setdefault(muni['state_code'], []).append(muni)

    for state_code, members in by_state.items():
        parts.append(f'\n\n#### {state_code}\n')
        parts.append('Name | IBGE code | GeoJSON | population \n')
        parts.append('--- | --- | --- | --: \n')
        for muni in sorted(members, key=lambda m: m['name']):
            population = muni['population']
            population_str = f'{population:,}' if population else ''
            geojson_link = f'[GeoJSON](../geojson/q8/{muni["geojson_path"]})'
            parts.append(
                f'{muni["name_long"]} | {muni["ibge_code"]} | {geojson_link} | {population_str}\n')

    docs_dir.mkdir(parents=True, exist_ok=True)
    write_file(docs_dir / 'br_muni_list.md', ''.join(parts))
def generate_iso2_list_country(iso1):
    """Write docs/iso2_list/<iso1>.md: one country's ISO 3166-2 entries by level."""
    iso2_json = read_json(export_dir / 'iso2.json')
    country_items = [
        item for item in iso2_json.values()
        if item['iso2'].split('-')[0] == iso1
    ]

    parts = [f'# ISO 3166-2 list: {iso1}\n']

    # Group entries by admin level, lowest level first.
    by_level = {}
    for item in sorted(country_items, key=lambda i: i['admin_level']):
        by_level.setdefault(item['admin_level'], []).append(item)

    for level, level_items in by_level.items():
        parts.append(f'\n\n#### Level {level}\n')
        parts.append(
            'Name | ISO2 | GeoJSON | OSM | Wikidata | Wikipedia | population \n'
            '--- | --- | --- | --- | --- | --- | --: \n')
        for item in sorted(level_items, key=lambda i: i['name']):
            row = build_row_data(item)
            geojson_link = f'[GeoJSON](../../export/geojson/q8/{item["geojson_path"]})'
            parts.append(
                f'{row["name"]} | {item["iso2"]} | {geojson_link} | '
                f'{row["osm_link"]} | {row["wikidata_link"]} | '
                f'{row["wikipedia_link"]} | {row["population_str"]}'
                f'\n')

    write_file(docs_dir / 'iso2_list' / f'{iso1}.md', ''.join(parts))
def generate_iso2_list():
    """Regenerate docs/iso2_list from scratch: one markdown file per country."""
    iso2_json = read_json(export_dir / 'iso2.json')
    country_codes = sorted({code.split('-')[0] for code in iso2_json})

    subdir = docs_dir / 'iso2_list'
    shutil.rmtree(subdir, ignore_errors=True)  # start from an empty directory
    subdir.mkdir(parents=True)

    for country_code in country_codes:
        generate_iso2_list_country(country_code)
def export_ne2(simp: int):
    """Export one GeoJSON file per ne2 (subunit) at the given simplification.

    Walks the ne012 hierarchy (ne0 -> sub1 -> sub2) and writes the matching
    Natural Earth subunit feature for every ne2 entry.
    """
    levels = read_json(id_dir / 'ne012.json')
    simp_str = f'-{simp}' if simp else ''
    subunits = read_json(geojson_dir / f'subunits{simp_str}.geojson')['features']

    # Index features by Natural Earth id, lower-casing all property keys.
    features_by_id = {}
    for feature in subunits:
        prop = feature['properties']
        # Fix: iterate a snapshot of the keys — inserting lower-cased keys
        # while iterating the dict itself can skip entries after a resize.
        for key in list(prop):
            prop[key.lower()] = prop.pop(key)
        features_by_id[prop['ne_id']] = feature

    counter = 0
    for ne0, ne0_data in levels.items():
        if 'sub1' not in ne0_data:
            continue
        for ne1, ne1_data in ne0_data['sub1'].items():
            if 'sub2' not in ne1_data:
                continue
            for ne2, ne2_data in ne1_data['sub2'].items():
                feature_data = features_by_id[ne2_data['ne_id']]
                fix_props(feature_data, ne2_data, ne2)
                filename = ne2[4:].lower()  # strip the 'ne2:' prefix
                export_subdir = export_geojson_dir / f'{simp_map[simp]}' / 'ne2'
                export_subdir.mkdir(exist_ok=True, parents=True)
                # Bug fix: `filename` was computed but unused and every
                # feature was written to the same literal '(unknown).geojson'
                # path, clobbering all previous exports.
                write_json(export_subdir / f'{filename}.geojson', feature_data)
                counter += 1
    print(f'{counter} ne2 GeoJSONs exported, simplification: {simp}')
def split_geojson(iso_level: int, simp_level: str):
    """Split a combined isoN geojson into per-ISO-code feature lists and export them.

    iso_level: 1 or 2 (ISO 3166-1 vs 3166-2).
    simp_level: simplification level name, e.g. 'high'/'medium' — selects the
        input file under simp_dir.

    Side effects: lazily populates the module-level population_map, and for
    non-'high' levels reloads the module-level iso1_json/iso2_json caches
    (presumably written by the 'high' pass first — TODO confirm ordering).
    """
    global population_map, iso1_json, iso2_json
    if not population_map:
        population_map = read_json(wam_data_dir / 'population.json')
    if simp_level != 'high':
        iso1_json = read_json(export_dir / 'iso1.json')
        iso2_json = read_json(export_dir / 'iso2.json')

    print(f'Splitting iso{iso_level} to level: {simp_level}')
    file_path = simp_dir / simp_level / f'iso{iso_level}.geojson'
    features = read_json(file_path)['features']
    # Sort by admin_level so lower (more authoritative) levels come first.
    features_sorted = sorted(features, key=lambda i: i['properties']['admin_level'])

    features_by_iso = dict()
    for feature in features_sorted:
        feature_processed = process_feature_properties(feature, iso_level, simp_level)
        if feature_processed is None:
            continue
        feature_clean = feature_processed['feature']
        iso = feature_processed['iso']
        # Drop features whose ISO code fails the level-appropriate regex check.
        if iso_level == 1:
            if not validate_iso1(iso):
                print(f'invalid iso1: {iso}')
                continue
        else:
            if not validate_iso2(iso):
                print(f'invalid iso2: {iso}')
                continue
        features_by_iso.setdefault(iso, list())
        features_by_iso[iso].append(feature_clean)

    # Collapse multiple candidate features per ISO code, then write outputs.
    deduplicated_by_iso = deduplicate_features_by_iso(features_by_iso)
    write_json_and_geojsons(deduplicated_by_iso, iso_level, simp_level)
def generate_country_list():
    """Write docs/country_list.md: nested list of ne0 -> ne1 -> ne2 regions.

    Each entry links to its medium-quality GeoJSON export; countries that
    have an ne3 file additionally link to their states/provinces page.
    """
    levels = read_json(ne_id_dir / 'ne012.json')
    doc_md = '# Country list'
    # Top level: countries (ne0), sorted by display name.
    for ne0, ne0_data in sorted(levels.items(), key=lambda item: item[1]['name']):
        name = ne0_data['name']
        code = ne0[4:].lower()  # strip the 'ne0:' prefix
        doc_md += (
            f'\n{name}{md_space}'
            f'code: **{ne0}**{md_space}'
            f'[view](../export/geojson/medium/ne0/{code}.geojson){md_space}')
        # Link the per-country ne3 page only when its JSON exists.
        if (ne3_dir / f'{code}.json').is_file():
            doc_md += f'[states/provinces](country_list_ne3/{code}.md)'
        doc_md += '\n\n'
        if 'sub1' not in ne0_data:
            continue
        # Second level: units (ne1), sorted by name.
        sub1 = ne0_data['sub1']
        for ne1, ne1_data in sorted(sub1.items(), key=lambda item: item[1]['name']):
            name = ne1_data['name']
            level = ne1[:3]  # 'ne1'
            code = ne1[4:].lower()
            doc_md += (
                f' - {name}{md_space}'
                f'code: **{ne1}**{md_space}'
                f'[view](../export/geojson/medium/{level}/{code}.geojson){md_space} '
                f'\n\n')
            if 'sub2' not in ne1_data:
                continue
            # Third level: subunits (ne2), sorted by name.
            sub2 = ne1_data['sub2']
            for ne2, ne2_data in sorted(sub2.items(), key=lambda item: item[1]['name']):
                name = ne2_data['name']
                level = ne2[:3]  # 'ne2'
                code = ne2[4:].lower()
                doc_md += (
                    f' - {name}{md_space}'
                    f'code: **{ne2}**{md_space}'
                    f'[view](../export/geojson/medium/{level}/{code}.geojson){md_space}'
                    f'\n\n')
    write_file(docs_dir / 'country_list.md', doc_md)
    print(f'country_list.md updated')
def process_ne3():
    """Build per-country ne3 JSON files from the Natural Earth states layer.

    For every cleaned state feature, records name / ne_id / wikidata_id /
    population under the key 'ne3:<state_iso>' in its country's file
    (ne3_dir/<country_iso>.json).
    """
    countries = read_json(geojson_dir / 'countries.geojson')['features']
    states = read_json(geojson_dir / 'states.geojson')['features']
    print(f'{len(states)} states')

    adm_iso_map = create_adm_iso_map(countries)
    processed_states = build_states(states, adm_iso_map)
    clean_duplicate_states(processed_states)

    ne3_data = dict()
    for feature in processed_states:
        # build_states stores its cleaned values under the '_clean' key.
        prop = feature['properties']['_clean']
        state_iso = prop['state_iso']
        state_name = prop['state_name']
        country_iso = prop['country_iso']
        country_name = prop['country_name']
        ne3 = f'ne3:{state_iso}'
        print(f'{country_name}; {state_name}; {ne3}')
        ne3_data.setdefault(country_iso, {})
        ne3_data[country_iso][ne3] = {
            'name': state_name,
            'ne_id': prop['ne_id'],
            'wikidata_id': prop['wikidata_id'],
            'population': prop['population'],
        }

    # Fix: create the output directory once instead of on every loop pass.
    ne3_dir.mkdir(exist_ok=True, parents=True)
    for country_iso, country_states in ne3_data.items():
        filename = f'{country_iso.lower()}.json'
        write_json(ne3_dir / filename, country_states, indent=2, sort_keys=True)
def download_all_regions():
    """Download region GeoJSONs for every country listed in the WAM config."""
    config = read_json(wam_data_dir / 'config.json')
    for country_code, country_data in config.items():
        print(country_data['name'])
        # USA is handled separately below with a lower maximum admin level.
        if country_code == 'USA':
            continue
        if download_country(country_code, 2, 8):
            time.sleep(60)  # throttle only after an actual download happened
    download_country('USA', 2, 6)
def get_all_ids(
    get_countries: bool = True,
    get_units: bool = True,
    get_subunits: bool = True,
    get_states: bool = True,
):
    """Collect the sorted, de-duplicated Wikidata ids from selected NE layers.

    Each boolean flag toggles loading of the corresponding Natural Earth
    GeoJSON layer; disabled layers contribute nothing. Features with no
    'wikidataid' property are skipped. Returns a sorted list of id strings.
    """
    countries = read_json(geojson_dir / 'countries.geojson')['features'] if get_countries else []
    units = read_json(geojson_dir / 'units.geojson')['features'] if get_units else []
    subunits = read_json(geojson_dir / 'subunits.geojson')['features'] if get_subunits else []
    states = read_json(geojson_dir / 'states.geojson')['features'] if get_states else []

    all_ids = set()
    for feature in countries + units + subunits + states:
        prop = feature['properties']
        # Fix: iterate a snapshot of the keys — inserting lower-cased keys
        # while iterating the dict itself can skip entries after a resize.
        for key in list(prop):
            prop[key.lower()] = prop.pop(key)
        wikidata_id = prop.get('wikidataid')
        if wikidata_id:
            all_ids.add(wikidata_id)
    return sorted(all_ids)
def generate_ne3_md(country_iso):
    """Write docs/country_list_ne3/<iso>.md listing one country's ne3 regions.

    country_iso: the country's ISO code (any case); used both to locate the
    ne3 JSON file and to look up the display name in ne012.json.
    """
    filename = f'{country_iso.lower()}.json'
    level3 = read_json(ne3_dir / filename)
    level012 = read_json(ne_id_dir / 'ne012.json')
    country_name = level012[f'ne0:{country_iso.upper()}']['name']

    doc_md = f'# {country_name} states/provinces/counties'
    for ne3, ne3_data in sorted(level3.items(), key=lambda item: item[1]['name']):
        name = ne3_data['name']
        # 'ne3:US-CA' -> ('US', 'CA'); sanity-check the country part against
        # the file we loaded.
        ne3_country, ne3_state = ne3.split(':')[1].split('-')
        assert country_iso.lower() == ne3_country.lower()
        doc_md += (
            f'\n{name}{md_space}'
            f'code: **{ne3}**{md_space}'
            f'[view](../../export/geojson/medium/ne3/{country_iso.lower()}/{ne3_state.lower()}.geojson){md_space}'
            f'\n\n')
    write_file(docs_dir / 'country_list_ne3' / f'{country_iso}.md', doc_md)
def get_osm_missing_features():
    """Load hand-maintained fix features from fixes/osm_missing/*.geojson.

    Accepts both single-Feature files and FeatureCollections; every feature
    is cleaned with clean_tags_overpass before being returned.
    """
    features = []
    for geojson_path in (fixes_dir / 'osm_missing').glob('*.geojson'):
        geojson = read_json(geojson_path)
        if geojson['type'] == 'Feature':
            clean_tags_overpass(geojson)
            features.append(geojson)
        if geojson['type'] == 'FeatureCollection':
            for member in geojson['features']:
                clean_tags_overpass(member)
                features.append(member)
    return features
def generate_fips_list():
    """Write docs/fips_list.md: US counties grouped by state.

    This variant keys states by their integer FIPS state code and shows the
    state's postal abbreviation in each section heading.
    """
    fips_json = read_json(export_dir / 'fips.json')
    counties = sorted(fips_json.values(), key=lambda k: k['name'])
    states_by_int = get_state_data()[0]

    doc_md = f'# US county FIPS code list\n'
    doc_md += '[GeoJSON for all counties](../export/geojson/q5/fips_all.geojson)'

    # Group counties by integer state code, states in ascending order.
    county_by_state = {}
    for item in sorted(counties, key=lambda k: k['state_code_int']):
        state_code_int = item['state_code_int']
        county_by_state.setdefault(state_code_int, [])
        county_by_state[state_code_int].append(item)

    for state_code_int, state_items in county_by_state.items():
        state_name = states_by_int[state_code_int]['name']
        state_code_postal = states_by_int[state_code_int]['postal_code']
        doc_md += f'\n\n#### {state_name} - {state_code_postal}, state code: {state_code_int}\n'
        doc_md += 'Name | FIPS | GeoJSON | population \n'
        doc_md += '--- | --- | --- | --: \n'
        for item in sorted(state_items, key=lambda k: k['name']):
            fips = item['fips']
            name_long = item['name_long']
            population = item['population']
            geojson_path = item['geojson_path']
            # Blank table cell when the population is missing or zero.
            population_str = ''
            if population:
                population_str = f'{population:,}'
            geojson_link = f'[GeoJSON](../geojson/q8/{geojson_path})'
            doc_md += f'{name_long} | {fips} | {geojson_link} | {population_str}\n'

    docs_dir.mkdir(parents=True, exist_ok=True)
    write_file(docs_dir / 'fips_list.md', doc_md)
def get_osm_iso1_map():
    """Map OSM relation id -> ISO 3166-1 code, cached as osm_iso1_map.json.

    Queries Wikidata for every entity carrying both an ISO 3166-1 alpha-2
    code (P297) and an OSM relation id (P402).
    """
    file_path = wikidata_dir / 'osm_iso1_map.json'
    if file_path.is_file():
        return read_json(file_path)  # reuse the cached result from a prior run

    endpoint_url = "https://query.wikidata.org/sparql"
    # Fix: the SELECT clause previously listed a stray ?iso2 variable that is
    # never bound in the WHERE pattern (copy-paste from the iso2 query).
    query = """SELECT ?region ?iso1 ?osm WHERE { ?region wdt:P297 ?iso1; wdt:P402 ?osm. }"""
    results = get_results(endpoint_url, query)

    osm_iso1_map = {}
    for result in results["results"]["bindings"]:
        iso1 = result['iso1']['value']
        osm = int(result['osm']['value'])
        osm_iso1_map[osm] = iso1

    write_json(file_path, osm_iso1_map, indent=2)
    return osm_iso1_map
import shutil import sys from country_levels_lib.fips import fips_utils from country_levels_lib.config import export_dir, fixes_dir from country_levels_lib.geo import calculate_centroid, find_timezone from country_levels_lib.utils import read_json, osm_url, write_json, wikidata_url from country_levels_lib.wam.wam_collect import validate_iso1, validate_iso2, simp_dir from country_levels_lib.wam.wam_download import wam_data_dir from area import area population_map = None population_fixes = read_json(fixes_dir / 'population.json') timezone_fixes = read_json(fixes_dir / 'timezone.json') us_states_by_postal = fips_utils.get_state_data()[1] iso1_json = None iso2_json = None def split_geojson(iso_level: int, simp_level: str): global population_map, iso1_json, iso2_json if not population_map: population_map = read_json(wam_data_dir / 'population.json') if simp_level != 'high': iso1_json = read_json(export_dir / 'iso1.json') iso2_json = read_json(export_dir / 'iso2.json') print(f'Splitting iso{iso_level} to level: {simp_level}') file_path = simp_dir / simp_level / f'iso{iso_level}.geojson'
def process_ne012():
    """Build the ne012.json hierarchy (ne0 countries -> ne1 units -> ne2 subunits).

    Reads the three Natural Earth layers, lower-cases their property keys,
    and nests each layer's entries under its parent via setdefault, then
    applies the cleanup/fix passes and writes id_dir/ne012.json.
    """
    countries = read_json(geojson_dir / 'countries.geojson')['features']
    print(f'{len(countries)} countries')
    units = read_json(geojson_dir / 'units.geojson')['features']
    print(f'{len(units)} units')
    subunits = read_json(geojson_dir / 'subunits.geojson')['features']
    print(f'{len(subunits)} subunits')

    wikidata_population = read_json(wikidata_dir / 'population.json')
    adm_iso_map = create_adm_iso_map(countries)
    levels = dict()

    # Level 0: countries.
    for feature in countries:
        prop = feature['properties']
        for key in prop:
            prop[key.lower()] = prop.pop(key)
        country_name = prop['admin']
        country_iso = adm_iso_map[prop['adm0_a3']]
        validate_iso_012(country_iso)
        ne_id = prop['ne_id']
        assert type(ne_id) == int
        wikidata_id = prop.get('wikidataid')
        population = calculate_population(prop, wikidata_population)
        ne0 = f'ne0:{country_iso}'
        levels.setdefault(
            ne0,
            {
                'name': country_name,
                'ne_id': ne_id,
                'wikidata_id': wikidata_id,
                'population': population,
                'sub1': {},
            },
        )

    # Level 1: units, attached under their country's 'sub1' dict.
    for feature in units:
        prop = feature['properties']
        for key in prop:
            prop[key.lower()] = prop.pop(key)
        country_iso = adm_iso_map[prop['adm0_a3']]
        validate_iso_012(country_iso)
        unit_name = prop['geounit']
        unit_iso = prop['gu_a3']
        validate_iso_012(unit_iso)
        ne_id = prop['ne_id']
        assert type(ne_id) == int
        wikidata_id = prop.get('wikidataid')
        population = calculate_population(prop, wikidata_population)
        ne0 = f'ne0:{country_iso}'
        ne1 = f'ne1:{unit_iso}'
        sub1 = levels[ne0]['sub1']
        sub1.setdefault(
            ne1,
            {
                'name': unit_name,
                'ne_id': ne_id,
                'wikidata_id': wikidata_id,
                'population': population,
                'sub2': {},
            },
        )

    # Level 2: subunits, attached under their unit's 'sub2' dict.
    for feature in subunits:
        prop = feature['properties']
        for key in prop:
            prop[key.lower()] = prop.pop(key)
        country_iso = adm_iso_map[prop['adm0_a3']]
        validate_iso_012(country_iso)
        unit_iso = prop['gu_a3']
        validate_iso_012(unit_iso)
        subunit_name = prop['subunit']
        subunit_iso = prop['su_a3']
        validate_iso_012(subunit_iso)
        ne_id = prop['ne_id']
        assert type(ne_id) == int
        wikidata_id = prop.get('wikidataid')
        population = calculate_population(prop, wikidata_population)
        ne0 = f'ne0:{country_iso}'
        ne1 = f'ne1:{unit_iso}'
        ne2 = f'ne2:{subunit_iso}'
        sub1 = levels[ne0]['sub1']
        sub2 = sub1[ne1]['sub2']
        sub2.setdefault(
            ne2,
            {
                'name': subunit_name,
                'ne_id': ne_id,
                'wikidata_id': wikidata_id,
                'population': population,
            },
        )

    # Post-processing passes over the assembled hierarchy.
    cleanup_sub2(levels)
    cleanup_sub1(levels)
    one_to_one_fix(levels)

    id_dir.mkdir(exist_ok=True, parents=True)
    write_json(id_dir / 'ne012.json', levels, indent=2, sort_keys=True)
def process_fips_quality(quality):
    """Process one quality level (5/7/8) of the FIPS county GeoJSON exports.

    Rewrites each county feature's properties into the project schema,
    writes one GeoJSON per county under q<quality>/fips/<state>/, plus a
    combined fips_all.geojson; the fips.json index is written only for
    quality 5 so it is produced exactly once.
    """
    assert quality in [5, 7, 8]
    print(f'Processing FIPS county GeoJSON {quality_map[quality]}')
    features = read_json(
        fips_geojson_dir / f'counties_{quality_map[quality]}.geojson')['features']
    counties_by_str = get_county_data()[1]
    states_by_code = get_state_codes()[0]

    geojson_export_dir = export_dir / 'geojson' / f'q{quality}' / 'fips'
    shutil.rmtree(geojson_export_dir, ignore_errors=True)

    new_features = list()
    json_data = dict()
    count = 0
    for feature in features:
        prop = feature['properties']
        full_code_str = prop['GEOID']
        state_code = int(prop['STATEFP'])
        county_code = int(prop['COUNTYFP'])
        # skip minor islands without state code found in 500k dataset
        if state_code not in states_by_code:
            continue
        # Cross-check the GeoJSON against the census county table.
        county_data = counties_by_str[full_code_str]
        assert county_data['county_code'] == county_code
        assert county_data['state_code'] == state_code
        name = county_data['name']
        population = county_data['population']
        countrylevel_id = f'fips:{full_code_str}'
        # These raw census keys are replaced by the normalized schema below;
        # everything else stays nested under 'census_data'.
        for key in ['NAME', 'GEOID', 'STATEFP', 'COUNTYFP']:
            del prop[key]
        new_prop = {
            'name': name,
            'fips': full_code_str,
            'state_code': state_code,
            'county_code': county_code,
            'population': population,
            'countrylevel_id': countrylevel_id,
            'census_data': prop,
        }
        feature['properties'] = new_prop
        new_features.append(feature)
        # Per-county file lives under its 2-digit state prefix directory.
        state_code_str = full_code_str[:2]
        state_subdir = geojson_export_dir / state_code_str
        state_subdir.mkdir(parents=True, exist_ok=True)
        write_json(state_subdir / f'{full_code_str}.geojson', feature)
        count += 1
        # Index entry omits the bulky census_data blob.
        json_data[full_code_str] = {
            k: v for k, v in new_prop.items() if k != 'census_data'
        }
        json_data[full_code_str][
            'geojson_path'] = f'fips/{state_code_str}/{full_code_str}.geojson'

    write_json(
        export_dir / 'geojson' / f'q{quality}' / 'fips_all.geojson',
        {
            "type": "FeatureCollection",
            "features": new_features
        },
    )
    if quality == 5:  # only write the file once
        write_json(export_dir / f'fips.json', json_data, indent=2, sort_keys=True)

    assert count == len(counties_by_str)
    print(f' {count} GeoJSON processed')
def build_states(states: list, adm_iso_map: dict):
    """Clean Natural Earth state features and return the ones worth keeping.

    Lower-cases property keys, resolves each state's ISO 3166-2 code (with
    manual fixes and Wikidata overrides), skips minor islands / unnamed /
    code-less entries, and stores the normalized values under the '_clean'
    property of each kept feature.
    """
    wikidata_population = read_json(wikidata_dir / 'population.json')
    wikidata_iso_ne3 = read_json(wikidata_dir / 'iso_ne3.json')
    clean_states = list()
    for feature in states:
        prop = feature['properties']
        for key in prop:
            prop[key.lower()] = prop.pop(key)
        country_name = prop['admin']
        country_iso = adm_iso_map[prop['adm0_a3']]
        validate_iso_012(country_iso)
        state_name = prop['name']
        # Apply manual corrections to the NE-provided ISO 3166-2 code.
        state_iso = fix_iso_3_codes.get(prop['iso_3166_2'], prop['iso_3166_2'])
        wikidata_id = prop.get('wikidataid')
        wikidata_url = f'https://www.wikidata.org/wiki/{wikidata_id}'
        population = wikidata_population.get(wikidata_id, 0)
        # A Wikidata-sourced ISO code, when present, wins over the NE one.
        wikidata_iso = wikidata_iso_ne3.get(wikidata_id)
        if wikidata_iso is not None and wikidata_iso != state_iso:
            state_iso = wikidata_iso
        ne_id = prop['ne_id']
        assert type(ne_id) == int
        # skipping minor island
        if prop['featurecla'] == 'Admin-1 minor island':
            continue
        # skipping unnamed places (right now the same as minor islands)
        if state_name is None:
            continue
        # NE uses the '-99' placeholder for missing codes.
        if state_iso.startswith('-99-'):
            continue
        # check if state's iso code matches country's iso code
        country_iso_from_state, country_code_from_state = state_iso.split('-')
        if country_iso_from_state != country_iso:
            # print(
            #     f'ci: {country_iso} si:{state_iso} cn:{country_name} sn: {state_name} {wikidata_url}'
            # )
            # country_iso = country_iso_from_state
            # Trust the country-level ISO: rebuild the state code on it.
            state_iso = f'{country_iso}-{country_code_from_state}'
        # clean up state_iso
        state_iso = state_iso.replace('~', '')
        # regex check state_iso
        validate_iso_3(state_iso)
        prop['_clean'] = {
            'country_name': country_name,
            'country_iso': country_iso,
            'state_name': state_name,
            'state_iso': state_iso,
            'ne_id': ne_id,
            'population': population,
            'wikidata_id': wikidata_id,
        }
        clean_states.append(feature)
    return clean_states
def process_fips_quality(quality):
    """Process one named quality level of the FIPS county GeoJSON exports.

    This variant keys quality by name (the fips.json index is written for
    'high' only) and enriches each county with centroid, timezone and area.
    """
    print(f'Processing FIPS county GeoJSON {quality_map[quality]}')
    features = read_json(fips_geojson_dir / f'counties_{quality_map[quality]}.geojson')['features']
    counties_by_str = get_county_data()[1]
    states_by_int = get_state_data()[0]

    geojson_export_dir = export_dir / 'geojson' / quality / 'fips'
    shutil.rmtree(geojson_export_dir, ignore_errors=True)

    new_features = list()
    json_data = dict()
    count = 0
    for feature in features:
        prop = feature['properties']
        full_code_str = prop['GEOID']
        state_code_int = int(prop['STATEFP'])
        county_code = int(prop['COUNTYFP'])
        # skip minor islands without state code found in 500k dataset
        if state_code_int not in states_by_int:
            continue
        # Derived geo attributes: centroid drives the timezone lookup;
        # area combines land (ALAND) and water (AWATER) square meters.
        centroid = calculate_centroid(feature)
        timezone = find_timezone(centroid['lon'], centroid['lat'])
        area_m2 = int(prop['ALAND'] + prop['AWATER'])
        # Cross-check against the census county table.
        county_data = counties_by_str[full_code_str]
        assert county_data['county_code'] == county_code
        assert county_data['state_code_int'] == state_code_int
        name = county_data['name']
        name_long = county_data['name_long']
        state_code_int = county_data['state_code_int']
        state_code_postal = county_data['state_code_postal']
        state_code_iso = county_data['state_code_iso']
        population = county_data['population']
        countrylevel_id = f'fips:{full_code_str}'
        # Raw census keys replaced by the normalized schema below; the rest
        # stays nested under 'census_data'.
        for key in ['NAME', 'GEOID', 'STATEFP', 'COUNTYFP']:
            del prop[key]
        new_prop = {
            'name': name,
            'name_long': name_long,
            'fips': full_code_str,
            'state_code_int': state_code_int,
            'state_code_postal': state_code_postal,
            'state_code_iso': state_code_iso,
            'county_code': county_code,
            'population': population,
            'countrylevel_id': countrylevel_id,
            'census_data': prop,
            'center_lat': round(centroid['lat'], 2),
            'center_lon': round(centroid['lon'], 2),
            'area_m2': area_m2,
            'timezone': timezone,
        }
        feature['properties'] = new_prop
        new_features.append(feature)
        # Per-county file lives under its 2-digit state prefix directory.
        state_code_str = full_code_str[:2]
        state_subdir = geojson_export_dir / state_code_str
        state_subdir.mkdir(parents=True, exist_ok=True)
        write_json(state_subdir / f'{full_code_str}.geojson', feature)
        count += 1
        # NOTE: unlike the other variant, the index keeps census_data here.
        json_data[full_code_str] = new_prop
        # json_data[full_code_str] = {k: v for k, v in new_prop.items() if k != 'census_data'}
        json_data[full_code_str]['geojson_path'] = f'fips/{state_code_str}/{full_code_str}.geojson'

    write_json(
        export_dir / 'geojson' / quality / 'fips_all.geojson',
        {"type": "FeatureCollection", "features": new_features},
    )
    if quality == 'high':  # only write the file once
        write_json(export_dir / f'fips.json', json_data, indent=2, sort_keys=True)

    assert count == len(counties_by_str)
    print(f' {count} GeoJSON processed')
def split_geojson(iso_level: int, simp_level, *, debug: bool = False):
    """Split a combined WAM isoN geojson into one exported file per ISO code.

    iso_level: 1 or 2 (ISO 3166-1 vs 3166-2).
    simp_level: numeric simplification level; the isoN.json index is only
        written at level 5.
    debug: when True, keep processing duplicate ISO codes so the duplicate
        report at the end sees all of them.
    """
    assert iso_level in [1, 2]
    print(f'Splitting iso{iso_level} to level: q{simp_level}')
    file_path = wam_geojson_simp_dir / f'iso{iso_level}-{simp_level}.geojson'
    features = read_json(file_path)['features']
    # Sort by admin_level so the lowest (most authoritative) level wins the
    # first-seen deduplication below.
    features_sorted = sorted(features, key=lambda i: i['properties']['admin_level'])

    level_subdir = export_dir / 'geojson' / f'q{simp_level}' / f'iso{iso_level}'
    level_subdir.mkdir(parents=True)

    population_map = read_json(wam_data_dir / 'population.json')

    json_data = dict()
    seen = dict()  # iso -> list of candidate property dicts (for dup report)
    for feature in features_sorted:
        prop = feature['properties']
        alltags = prop['alltags']

        name = prop.pop('name')
        osm_id = int(prop.pop('id'))
        iso = prop.pop(f'iso{iso_level}')
        admin_level = int(prop.pop('admin_level'))
        wikidata_id = prop.pop('wikidata_id', None)
        countrylevel_id = f'iso{iso_level}:{iso}'
        population = population_map.get(wikidata_id)

        # Wikipedia id may live in the top-level props or in alltags; the
        # top-level value takes precedence when both exist.
        wikipedia_from_prop = prop.pop('wikipedia', None)
        wikipedia_from_alltags = alltags.pop('wikipedia', None)
        if (wikipedia_from_prop and wikipedia_from_alltags
                and wikipedia_from_prop != wikipedia_from_alltags):
            print(wikipedia_from_prop, wikipedia_from_alltags)
        wikipedia_id = wikipedia_from_alltags
        if wikipedia_from_prop:
            wikipedia_id = wikipedia_from_prop

        # Strip noise that should not end up in the exported files.
        del feature['bbox']
        for key in ['boundary', 'note', 'rpath', 'srid', 'timestamp']:
            prop.pop(key, None)
        for key in [
                'ISO3166-1',
                'ISO3166-1:alpha2',
                'ISO3166-1:numeric',
                'ISO3166-2',
                'ISO3166-2:alpha2',
                'ISO3166-2:numeric',
                'land_area',
                'wikidata',
        ]:
            alltags.pop(key, None)

        seen.setdefault(iso, list())
        # First feature per ISO wins (lowest admin_level due to the sort).
        if seen[iso] and not debug:
            # print(f'  duplicate {iso}, skipping')
            continue

        new_prop = {
            'name': name,
            f'iso{iso_level}': iso,
            'admin_level': admin_level,
            'osm_id': osm_id,
            'wikidata_id': wikidata_id,
            'wikipedia_id': wikipedia_id,
            'population': population,
            'countrylevel_id': countrylevel_id,
            'osm_data': prop,
        }
        new_prop_without_osm_data = {
            k: v
            for k, v in new_prop.items() if k != 'osm_data'
        }
        feature['properties'] = new_prop
        seen[iso].append(new_prop_without_osm_data)
        json_data[iso] = new_prop_without_osm_data

        if iso_level == 1:
            if not validate_iso1(iso):
                print(f'invalid iso1: {iso}')
                continue
            write_json(level_subdir / f'{iso}.geojson', feature)
            json_data[iso]['geojson_path'] = f'iso1/{iso}.geojson'
        else:
            if not validate_iso2(iso):
                print(f'invalid iso2: {iso}')
                continue
            # 'US-CA' -> subdirectory 'US', file 'US-CA.geojson'
            iso2_start, iso2_end = iso.split('-')
            iso2_subdir = level_subdir / iso2_start
            iso2_subdir.mkdir(exist_ok=True)
            write_json(level_subdir / iso2_start / f'{iso}.geojson', feature)
            json_data[iso]['geojson_path'] = f'iso2/{iso2_start}/{iso}.geojson'

    if simp_level == 5:
        write_json(export_dir / f'iso{iso_level}.json',
                   json_data,
                   indent=2,
                   sort_keys=True)

    # # if debug:
    # debug duplicates, fixed by sorting by admin_level
    debug_dir = geojson_dir / 'wam' / 'debug' / f'iso{iso_level}'
    shutil.rmtree(debug_dir, ignore_errors=True)
    debug_dir.mkdir(parents=True)

    # choose lowest admin level from available ones
    for iso, iso_matches in seen.items():
        if len(iso_matches) != 1:
            matches_sorted = sorted(iso_matches, key=lambda i: i['admin_level'])
            print(f'duplicate iso{iso_level}: {iso}')
            for match in matches_sorted:
                name = match['name']
                osm_id = match['osm_id']
                url = osm_url(osm_id)
                admin_level = match['admin_level']
                print(f' {name} {admin_level} {url}')
def process_br_muni_quality(quality):
    """Process one quality level (5/7/8) of the Brazil municipality GeoJSON.

    Rewrites each municipality's properties into the project schema (with
    centroid and timezone), writes per-municipality files under
    q<quality>/br_muni/<state>/, a combined br_muni_all.geojson, and — for
    quality 7 only — the br_muni.json index.
    """
    assert quality in [5, 7, 8]
    print(f'Processing BR_Muni county GeoJSON {quality}')
    features = read_json(geojson_dir / 'br_muni' / 'simp' / f'simp-{quality}.geojson')['features']

    geojson_export_dir = export_dir / 'geojson' / f'q{quality}' / 'br_muni'
    shutil.rmtree(geojson_export_dir, ignore_errors=True)

    json_data = dict()
    for feature in features:
        prop = feature['properties']
        name = prop.pop('name')
        name_long = prop.pop('name_long')
        population = int(prop.pop('population'))
        state_code = prop.pop('state')
        ibge_code = prop.pop('ibge_code')
        assert not prop  # make sure we removed everything from the original properties
        countrylevel_id = f'br_muni:{ibge_code}'
        # Centroid drives the timezone lookup.
        centroid = calculate_centroid(feature)
        timezone = find_timezone(centroid['lon'], centroid['lat'])
        new_prop = {
            'name': name,
            'name_long': name_long,
            'state_code': state_code,
            'state_code_iso': f'iso2:BR-{state_code}',
            'ibge_code': ibge_code,
            'population': population,
            'countrylevel_id': countrylevel_id,
            'center_lat': round(centroid['lat'], 2),
            'center_lon': round(centroid['lon'], 2),
            'timezone': timezone,
        }
        feature['properties'] = new_prop

        # Per-municipality file under its state subdirectory.
        state_subdir = geojson_export_dir / state_code
        state_subdir.mkdir(parents=True, exist_ok=True)
        write_json(state_subdir / f'{ibge_code}.geojson', feature)

        json_data[ibge_code] = dict(new_prop)
        json_data[ibge_code][
            'geojson_path'] = f'br_muni/{state_code}/{ibge_code}.geojson'

    write_json(
        export_dir / 'geojson' / f'q{quality}' / 'br_muni_all.geojson',
        {
            "type": "FeatureCollection",
            "features": features
        },
    )
    if quality == 7:  # only write the file once
        write_json(export_dir / f'br_muni.json', json_data, indent=2, sort_keys=True)
    print(f' {len(features)} GeoJSON processed')
get_osm_iso1_map, get_osm_wd_map, get_osm_iso2_map, ) from country_levels_lib.wikidata.wikidata_population import get_population collected_dir = geojson_dir / 'wam' / 'collected' iso1_collected_path = collected_dir / 'iso1.ndjson' iso2_collected_path = collected_dir / 'iso2.ndjson' simp_dir = geojson_dir / 'wam' / 'simp' osm_iso1_map = {} osm_iso2_map = {} osm_wd_map = {} skip_osm_features = {int(i) for i in read_json(fixes_dir / 'skip_osm.json')} iso1_regex = re.compile('[A-Z]{2}') iso2_regex = re.compile('[A-Z]{2}-[A-Z0-9]{1,4}') def collect_iso(): global osm_iso1_map, osm_iso2_map, osm_wd_map osm_iso1_map = get_osm_iso1_map() osm_iso2_map = get_osm_iso2_map() osm_wd_map = get_osm_wd_map() custom_osm = read_json(fixes_dir / 'custom_osm.json') custom_iso1 = { int(k): v['iso1']