def extend_country_spec():
    """Register dependent territories under their sovereign in COUNTRY_SPEC.

    WIKIP_ISO3166 is walked in sorted CC2 order. Every record whose
    'sovereignity' value is itself a key of WIKIP_ISO3166 gets its country
    name appended to the 'sub' list, and its CC2 appended to the 'sub_cc2'
    list, of the COUNTRY_SPEC entry named after the sovereign country.
    The entry and both lists are created when missing; values already
    listed are not added twice. COUNTRY_SPEC is modified in place and
    each newly added country name is reported on stdout.
    """
    print('[+] patch COUNTRY_SPEC dict')
    for code, rec in sorted(WIKIP_ISO3166.items()):
        sov_code = rec['sovereignity']
        if sov_code not in WIKIP_ISO3166:
            # not dependent from another listed country: nothing to register
            continue
        sov_name = WIKIP_ISO3166[sov_code]['country_name']
        # create the sovereign entry and its two lists on first use
        spec = COUNTRY_SPEC.setdefault(sov_name, {})
        sub_names = spec.setdefault('sub', [])
        sub_codes = spec.setdefault('sub_cc2', [])
        dep_name = rec['country_name']
        if dep_name not in sub_names:
            sub_names.append(dep_name)
            print('> country %s (%s) added under %s' % (dep_name, code, sov_name))
        if code not in sub_codes:
            sub_codes.append(code)
def patch_wikip_iso3166():
    """Patch the WIKIP_ISO3166 dict in place, in six ordered steps.

    1) add records for COUNTRY_SPEC entries whose 'cc2' is not yet a key,
       for the CC2_ALIAS aliases, and (1bis) for the CC2 found in
       WIKIP_MSISDN
    2) rename countries according to COUNTRY_RENAME
    3) force TLD fields to lower case and alpha codes to upper case
    4) report names returned by country_name_canon() that are shared by
       records stored under different CC2
    5) align country names / URLs with COUNTRY_SPEC and check the
       'sovereignity' of every CC2 listed under 'sub_cc2'
    6) store the name variants of each record under its 'nameset' key

    Every change and every anomaly is printed on stdout; nothing is
    returned.

    Raises:
        AssertionError: in step 2, when a renamed country is listed in
            COUNTRY_SPEC_CC2 with a 'cc2' different from the CC2 of the
            record being renamed.
        KeyError: when a CC2_ALIAS target, or a 'cc2' referenced from
            COUNTRY_SPEC_CC2, is not a key of WIKIP_ISO3166.
    """
    print('[+] patch Wikipedia ISO3166 dict: WIKIP_ISO3166')
    #
    # 1) add entries
    for country, infos in sorted(COUNTRY_SPEC.items()):
        if 'cc2' in infos and infos['cc2'] not in WIKIP_ISO3166:
            # start from the record template, then take over every field
            # that COUNTRY_SPEC provides under the same key
            r = dict(REC_ISO3166)
            for k in r:
                if k in infos:
                    r[k] = infos[k]
            r['country_name'] = country
            r['code_alpha_2'] = infos['cc2']
            if 'url' in infos:
                r['country_url'] = infos['url']
            WIKIP_ISO3166[r['code_alpha_2']] = r
            print('> CC2 %s, %s added' % (r['code_alpha_2'], r['country_name']))
    #
    for new, old in sorted(CC2_ALIAS.items()):
        if new not in WIKIP_ISO3166:
            # the alias shares the very same record object as the aliased CC2
            WIKIP_ISO3166[new] = WIKIP_ISO3166[old]
            print('> CC2 %s, alias to %s' % (new, old))
    #
    # 1bis) add more entries, extracted from the international telephone
    # numbering listing
    for pref, infos in sorted(WIKIP_MSISDN.items()):
        for cc2, name, url in sorted(infos):
            if cc2 not in WIKIP_ISO3166:
                r = dict(REC_ISO3166)
                r['code_alpha_2'] = cc2
                r['country_name'] = name
                r['country_url'] = url
                WIKIP_ISO3166[cc2] = r
                print('> CC2 %s, %s added from WIKIP_MSISDN' % (cc2, name))
    #
    # 2) patch country names
    for cc2, infos in sorted(WIKIP_ISO3166.items()):
        for oldname, newname in sorted(COUNTRY_RENAME.items()):
            if country_match(infos['country_name'], oldname):
                infos['country_name'] = newname
                if newname in COUNTRY_SPEC_CC2:
                    # explicit raise instead of an assert statement, so that
                    # the consistency check is not stripped under python -O
                    if COUNTRY_SPEC[newname]['cc2'] != cc2:
                        raise AssertionError(
                            'CC2 mismatch for %s: %s in COUNTRY_SPEC, %s in WIKIP_ISO3166'
                            % (newname, COUNTRY_SPEC[newname]['cc2'], cc2))
                    # NOTE(review): nesting rebuilt from a flattened source;
                    # confirm the URL update belongs under the CC2 check
                    if 'url' in COUNTRY_SPEC[newname]:
                        infos['country_url'] = COUNTRY_SPEC[newname]['url']
                print('> country name changed from %s to %s, CC2 %s' % (oldname, newname, cc2))
    #
    # 3) ensure all tld are lower case, and CC codes are upper case
    for infos in WIKIP_ISO3166.values():
        infos['cc_tld'] = infos['cc_tld'].lower()
        infos['cc_tld_url'] = infos['cc_tld_url'].lower()
        infos['code_alpha_2'] = infos['code_alpha_2'].upper()
        infos['code_alpha_3'] = infos['code_alpha_3'].upper()
    #
    # 4) ensure all canon names do not collide
    # the sorted CC2 list is computed once and reused to label collisions
    cc2_sorted = sorted(WIKIP_ISO3166)
    names = [country_name_canon(WIKIP_ISO3166[cc2]['country_name']) for cc2 in cc2_sorted]
    for i, nameset in enumerate(names[:-1]):
        # each record is only compared against the ones following it
        names_after = names[1+i:]
        for name in nameset:
            for j, nameset_totest in enumerate(names_after):
                if name in nameset_totest:
                    print('>>> country name collision %s / %s'\
                          % (cc2_sorted[i], cc2_sorted[i+1+j]))
    #
    # 5) ensure all overseas, sub-territories and other geographic specificities
    # are referenced correctly, and verify sovereignity
    for country in COUNTRY_SPEC_CC2:
        cc2 = COUNTRY_SPEC[country]['cc2']
        wc = WIKIP_ISO3166[cc2]
        if country != wc['country_name']:
            print('> country name changed from %s to %s, CC2 %s' % (wc['country_name'], country, cc2))
            wc['country_name'] = country
            # NOTE(review): nesting rebuilt from a flattened source; confirm
            # the URL update is meant to happen only on a name change
            if 'url' in COUNTRY_SPEC[country]:
                wc['country_url'] = COUNTRY_SPEC[country]['url']
        if 'sub_cc2' in COUNTRY_SPEC[country]:
            for cc2_s in COUNTRY_SPEC[country]['sub_cc2']:
                if cc2_s not in WIKIP_ISO3166:
                    print('>>> missing CC2 %s, part of %s' % (cc2_s, country))
                else:
                    wc_s = WIKIP_ISO3166[cc2_s]
                    if wc_s['sovereignity'] == '':
                        # no sovereignity recorded yet: take it from COUNTRY_SPEC
                        wc_s['sovereignity'] = cc2
                    elif wc_s['sovereignity'] != cc2:
                        print('>>> CC2 %s, %s, sovereignity mismatch %s / %s'\
                              % (cc2_s, wc_s['country_name'], wc_s['sovereignity'], cc2))
    #
    # 6) keep track of country name variants
    for cc2, infos in sorted(WIKIP_ISO3166.items()):
        infos['nameset'] = country_name_canon(infos['country_name'])
        if infos['state_name']:
            infos['nameset'].update( country_name_canon(infos['state_name']) )