Ejemplo n.º 1
0
def extend_country_spec():
    """Record dependent territories under their sovereign country in COUNTRY_SPEC.

    Walks WIKIP_ISO3166 in CC2 order. For every record whose 'sovereignity'
    value is itself a key of WIKIP_ISO3166, the COUNTRY_SPEC entry named after
    the sovereign country (created empty when absent) gets:
      - the record's country name appended to its 'sub' list,
      - the record's CC2 key appended to its 'sub_cc2' list,
    each only when not already listed. COUNTRY_SPEC is modified in place and
    nothing is returned.
    """
    print('[+] patch COUNTRY_SPEC dict')
    for code, rec in sorted(WIKIP_ISO3166.items()):
        sov_code = rec['sovereignity']
        if sov_code not in WIKIP_ISO3166:
            # no known sovereign record for this entry: nothing to link
            continue
        #
        # country dependent from another one
        sov_name = WIKIP_ISO3166[sov_code]['country_name']
        dep_name = rec['country_name']
        spec = COUNTRY_SPEC.setdefault(sov_name, {})
        dep_names = spec.setdefault('sub', [])
        dep_codes = spec.setdefault('sub_cc2', [])
        if dep_name not in dep_names:
            dep_names.append(dep_name)
            print('> country %s (%s) added under %s' % (dep_name, code, sov_name))
        if code not in dep_codes:
            dep_codes.append(code)
Ejemplo n.º 2
0
def _wikip_add_missing_entries():
    """Step 1: add WIKIP_ISO3166 records for CC2 codes it does not list yet."""
    #
    # 1) add entries, from the COUNTRY_SPEC definitions
    for country, infos in sorted(COUNTRY_SPEC.items()):
        if 'cc2' in infos and infos['cc2'] not in WIKIP_ISO3166:
            # start from the record template and take over every field
            # that COUNTRY_SPEC provides under the same key
            r = dict(REC_ISO3166)
            for k in r:
                if k in infos:
                    r[k] = infos[k]
            r['country_name'] = country
            r['code_alpha_2'] = infos['cc2']
            if 'url' in infos:
                r['country_url'] = infos['url']
            WIKIP_ISO3166[r['code_alpha_2']] = r
            print('> CC2 %s, %s added' % (r['code_alpha_2'], r['country_name']))
    #
    # aliases reference the very same record object as the aliased code,
    # so every later in-place patch applies to both keys
    for new, old in sorted(CC2_ALIAS.items()):
        if new not in WIKIP_ISO3166:
            WIKIP_ISO3166[new] = WIKIP_ISO3166[old]
            print('> CC2 %s, alias to %s' % (new, old))
    #
    # 1bis) add more entries, extracted from the international telephone numbering listing
    for pref, infos in sorted(WIKIP_MSISDN.items()):
        for cc2, name, url in sorted(infos):
            if cc2 not in WIKIP_ISO3166:
                r = dict(REC_ISO3166)
                r['code_alpha_2']  = cc2
                r['country_name']  = name
                r['country_url']   = url
                WIKIP_ISO3166[cc2] = r
                print('> CC2 %s, %s added from WIKIP_MSISDN' % (cc2, name))


def _wikip_rename_countries():
    """Step 2: apply COUNTRY_RENAME to the country names stored in WIKIP_ISO3166."""
    # the rename table is not modified here: sort it once for all records
    renames = sorted(COUNTRY_RENAME.items())
    for cc2, infos in sorted(WIKIP_ISO3166.items()):
        for oldname, newname in renames:
            if country_match(infos['country_name'], oldname):
                infos['country_name'] = newname
                if newname in COUNTRY_SPEC_CC2:
                    spec = COUNTRY_SPEC[newname]
                    # explicit raise instead of an assert statement, so the
                    # consistency check is still enforced under "python -O"
                    if spec['cc2'] != cc2:
                        raise AssertionError(
                            'country %s: CC2 %s in COUNTRY_SPEC, but renamed record has CC2 %s'
                            % (newname, spec['cc2'], cc2))
                    if 'url' in spec:
                        infos['country_url'] = spec['url']
                print('> country name changed from %s to %s, CC2 %s' % (oldname, newname, cc2))


def _wikip_normalize_case():
    """Step 3: force TLD fields to lower case and country codes to upper case."""
    for infos in WIKIP_ISO3166.values():
        infos['cc_tld']       = infos['cc_tld'].lower()
        infos['cc_tld_url']   = infos['cc_tld_url'].lower()
        infos['code_alpha_2'] = infos['code_alpha_2'].upper()
        infos['code_alpha_3'] = infos['code_alpha_3'].upper()


def _wikip_report_name_collisions():
    """Step 4: print a warning for each canonical name shared by two CC2 entries."""
    # sort the codes once: they index both the name sets and the messages
    codes = sorted(WIKIP_ISO3166)
    names = [country_name_canon(WIKIP_ISO3166[cc2]['country_name']) for cc2 in codes]
    for i, nameset in enumerate(names[:-1]):
        # only compare against later entries, so each pair is checked once
        following = names[i+1:]
        for name in nameset:
            for j, nameset_totest in enumerate(following, i+1):
                if name in nameset_totest:
                    print('>>> country name collision %s / %s'\
                          % (codes[i], codes[j]))


def _wikip_link_sub_territories():
    """Step 5: align names with COUNTRY_SPEC and set or verify sovereignity of 'sub_cc2' codes."""
    for country in COUNTRY_SPEC_CC2:
        spec = COUNTRY_SPEC[country]
        cc2  = spec['cc2']
        wc   = WIKIP_ISO3166[cc2]
        if country != wc['country_name']:
            # the COUNTRY_SPEC name takes precedence over the stored one
            print('> country name changed from %s to %s, CC2 %s' % (wc['country_name'], country, cc2))
            wc['country_name'] = country
            if 'url' in spec:
                wc['country_url'] = spec['url']
        if 'sub_cc2' in spec:
            for cc2_s in spec['sub_cc2']:
                if cc2_s not in WIKIP_ISO3166:
                    print('>>> missing CC2 %s, part of %s' % (cc2_s, country))
                else:
                    wc_s = WIKIP_ISO3166[cc2_s]
                    if wc_s['sovereignity'] == '':
                        wc_s['sovereignity'] = cc2
                    elif wc_s['sovereignity'] != cc2:
                        # an existing, different value is reported, not overwritten
                        print('>>> CC2 %s, %s, sovereignity mismatch %s / %s'\
                              % (cc2_s, wc_s['country_name'], wc_s['sovereignity'], cc2))


def _wikip_build_namesets():
    """Step 6: store the canonical name variants of each record under 'nameset'."""
    for cc2, infos in sorted(WIKIP_ISO3166.items()):
        infos['nameset'] = country_name_canon(infos['country_name'])
        if infos['state_name']:
            infos['nameset'].update( country_name_canon(infos['state_name']) )


def patch_wikip_iso3166():
    """Patch the WIKIP_ISO3166 dict in place, reporting every change on stdout.

    The steps run in this order:
      1) add records for CC2 codes found in COUNTRY_SPEC, CC2_ALIAS and
         WIKIP_MSISDN but missing from WIKIP_ISO3166
      2) rename countries according to COUNTRY_RENAME
      3) ensure all tld are lower case, and CC codes are upper case
      4) report canonical country names colliding between two CC2 entries
      5) align country names with COUNTRY_SPEC, and set or verify the
         sovereignity of the codes listed under 'sub_cc2'
      6) keep track of country name variants in each record's 'nameset'

    Returns nothing. Raises AssertionError when a renamed country is listed in
    COUNTRY_SPEC_CC2 with a CC2 code different from the renamed record's key,
    and KeyError when a referenced code or field is missing from the tables.
    """
    print('[+] patch Wikipedia ISO3166 dict: WIKIP_ISO3166')
    _wikip_add_missing_entries()
    _wikip_rename_countries()
    _wikip_normalize_case()
    _wikip_report_name_collisions()
    _wikip_link_sub_territories()
    _wikip_build_namesets()