def __add_to_two_level_mappings(self):
    """
    Group every ISO string returned by get_L_possible_isos() by region and
    store the grouping in DTwoLevelMappings['cldr_alphabets.alphabets'].

    The region of an ISO string is:

    * item 0 of the DCountries entry for its territory, when the ISO
      string has a territory ('Unknown' if the territory is not a key
      of DCountries);
    * otherwise item 1 of the 'LCountry' value of the ISOCodes record
      for its language ('Unknown' if that lookup raises KeyError).

    The stored value is a list of (region, [iso, ...]) tuples sorted by
    region name.
    """
    # TODO: ADD TO DTwoLevelMappings!!!
    by_region = {}

    for iso in get_L_possible_isos():
        parsed = ISOTools.split(iso)

        if parsed.territory:
            from iso_tools.ISOCodes import DCountries
            # TODO: ALLOW FOR i18n etc!!!
            region = DCountries.get(parsed.territory, ['Unknown'])[0]
        else:
            # OPEN ISSUE: Use LCountry[2] here, to use
            # continent rather than country??
            try:
                region = ISOCodes.get_D_iso(parsed.lang)['LCountry'][1]
            except KeyError:
                region = 'Unknown'

        if region not in by_region:
            by_region[region] = []
        by_region[region].append(iso)

    from char_data.data_processors.consts import DTwoLevelMappings
    # HACK! The regions are unique dict keys, so sorting the items
    # orders the (region, values) tuples by region alone.
    DTwoLevelMappings['cldr_alphabets.alphabets'] = sorted(by_region.items())
def split(self, s):
    """
    Parse the ISO string `s` with RE_ISO and return
    ISOCode(lang, script, territory, variant), built from groups 1-4
    of the match.

    When SPLIT_CHECKING is truthy, each component that is present is
    validated with `assert`:

    * the language must be in ISOCodes, and a three-letter language
      must not have a 'part1' entry in its ISOCodes record (i.e. the
      caller should have used the shorter code);
    * the script must be a key of self.DScript2Name;
    * the territory must be in self.STerritories.

    Languages that pass are added to self.SOKISOs so that they are not
    checked again.

    Raises AssertionError if `s` does not match RE_ISO or a check fails.
    """
    match = RE_ISO.match(s)
    assert match, "invalid ISO code format: %s" % s

    parts = match.group(1, 2, 3, 4)
    lang, script, territory, variant = parts

    if not SPLIT_CHECKING:
        return ISOCode(*parts)

    if lang and lang not in self.SOKISOs:
        assert lang in ISOCodes, \
            "lang code %s not valid!" % lang

        # Shorten if possible to a two-letter code
        if len(lang) == 3:
            DISO = ISOCodes.get_D_iso(lang)
            assert 'part1' not in DISO, \
                "lang code %s should be shortened to %s!" % (lang, DISO['part1'])

        # This ISO checking is very slow, so
        # remember this ISO as OK for next time
        self.SOKISOs.add(lang)

    if script:
        assert script in self.DScript2Name, \
            "script %s not valid!" % script

    if territory:
        assert territory in self.STerritories, \
            "territory code %s not valid!" % territory

    return ISOCode(*parts)
def get_lang_name(self, part3, typ=LANG_DEFAULT, default=KeyError):
    """
    Finds the localized name for ISO 639-3 code `part3`,
    converting first to ISO 639-1 if necessary

    `part3` is looked up in self.D['DLangs'] as given. Only when it is
    not a key there is the ISOCodes record consulted, and the record's
    'part1' code (if it has one) used as the key instead.

    If ISOCodes does not know `part3`, the KeyError is re-raised when
    `default` is KeyError; otherwise `default` is returned.

    The final lookup is delegated to get(DLangs, default, [code, typ])
    (presumably it walks the keys and falls back to `default` --
    confirm against the helper's definition).

    NOTE: See also ISOPrettifier for ISO string aware
    (e.g. "en_Latn-AU")-style pretty formatting!

    TODO: Add support for profiles/scripts/variants, e.g. "zh_CN"!
    TODO: Make sure CLDR doesn't use ISO 639-2!
    """
    DLangs = self.D['DLangs']
    code = part3

    if part3 not in DLangs:
        # Get the ISO-639-1 two-letter code
        try:
            DISO = ISOCodes.get_D_iso(part3, add_alternates=False)
        except KeyError:
            if default == KeyError:
                raise
            return default

        # WARNING! Falls back to the original code
        # when there is no two-letter equivalent
        code = DISO['part1'] if 'part1' in DISO else part3

    return get(DLangs, default, [code, typ])
def get_L_part3(self):
    """
    Return a sorted list of the language codes in self.D['DLangs'].

    Two-letter codes are converted with ISOCodes.to_part3(); all other
    codes are passed through unchanged. The code 'bh' is left out.
    """
    def as_part3(iso):
        # Only two-letter codes need converting
        return ISOCodes.to_part3(iso) if len(iso) == 2 else iso

    return sorted(
        as_part3(iso)
        for iso in self.D['DLangs']
        # Bihari not in 639-3 HACK!
        if iso != 'bh'
    )
def get_L_pretty(self, s):
    """
    get the localized names of the language, script,
    territory+variant (if specified)

    `s` is split with ISOTools.split(); the result is a 4-tuple
    (language name, script name, territory name, variant name) in
    which every part missing from `s` is None.

    Fallbacks passed as `default` to the name lookups:

    * language: the ISOCodes record's 'short_name', or its 'long_name'
      when there is no 'short_name' ('long_name' is always read);
    * territory: item 0 of the DCountries entry for the territory, or
      the territory code itself when DCountries has no entry;
    * variant: the variant string itself.
    """
    from iso_tools.ISOTools import ISOTools
    lang, script, territory, variant = ISOTools.split(s)

    if lang:
        DISO = ISOCodes.get_D_iso(lang)

    territory_default = DCountries.get(territory, [territory])[0]

    lang_name = script_name = territory_name = variant_name = None

    if lang:
        lang_default = DISO.get('short_name', DISO['long_name'])
        lang_name = self.get_lang_name(lang, default=lang_default)

    if script:
        script_name = self.get_script_name(script)

    if territory:
        territory_name = self.get_territory_name(
            territory, default=territory_default
        )

    if variant:
        variant_name = self.get_variant_name(variant, default=variant)

    return (lang_name, script_name, territory_name, variant_name)
def get_D_lang_names_to_iso(self, make_case_insensitive=False):
    """
    Build a dict mapping every localized language name in
    self.D['DLangs'] to an ISO code.

    The code stored for a name is the 'part1' entry of the language's
    ISOCodes record when the record has one, otherwise the key the name
    was stored under in self.D['DLangs']. Keys that ISOCodes does not
    know (KeyError) are used unchanged.

    If the same name occurs more than once, the last occurrence wins.

    make_case_insensitive: when true, names are lower-cased before being
        used as keys of the result.
    """
    DRtn = {}

    for part3, DLang in self.D['DLangs'].items():
        if not DLang:
            # No names to add, so no ISO lookup is needed either
            continue

        # The code only depends on the language, so look it up once
        # per language rather than once for every name of the language
        try:
            DISO = ISOCodes.get_D_iso(part3, add_alternates=False)
        except KeyError:
            DISO = {}

        # Try to convert to two-char codes
        code = DISO['part1'] if 'part1' in DISO else part3

        for loc_lang_name in DLang.values():
            if make_case_insensitive:
                loc_lang_name = loc_lang_name.lower()
            DRtn[loc_lang_name] = code

    return DRtn
def join(self, part3=None, script=None, territory=None, variant=None, part2=None):
    """
    Build an ISO string of the form `lang[_Script][-TERRITORY][|variant]`
    from its components. Every component is optional; separators are
    only emitted between components that are present.

    part3/part2: the language code. `part3` takes priority when both
        are given. `part3` must be in ISOCodes; `part2` is converted
        with ISOCodes.to_part3(). The code 'und' is omitted, and a
        three-letter code is replaced by the 'part1' entry of its
        ISOCodes record when there is one. Results are remembered in
        self.DPart3Cache/self.DPart2Cache; nothing is cached when no
        language is given.
    script: either a four-letter title-case code that is a key of
        self.DScript2Name, or a name that is looked up lower-cased in
        self.DName2Script.
    territory: either a two-letter upper-case or all-digit code that
        is in self.STerritories, or a name that is looked up
        lower-cased in DRevCountries.
    variant: appended unchanged after a '|'
        (currently no checking, but it may pay
        to check if a localization is available etc)

    Raises AssertionError if a component is not valid.
    """
    _lang = _script = _territory = _variant = None

    # Process ISO
    if part3 or part2:
        # part3 has priority over part2, both for the
        # cache that's consulted and for the conversion
        DCache = self.DPart3Cache if part3 else self.DPart2Cache
        key = part3 if part3 else part2

        if key in DCache:
            _lang = DCache[key]
        else:
            if part3:
                _lang = part3
                assert _lang in ISOCodes, \
                    "language code %s not valid!" % _lang
            else:
                _lang = ISOCodes.to_part3(part2)

            # "undefined" iso codes will be omitted
            if _lang == 'und':
                _lang = None

            # Shorten if possible to a two-letter code
            if _lang and len(_lang) == 3:
                DISO = ISOCodes.get_D_iso(_lang)
                if DISO and 'part1' in DISO:
                    _lang = DISO['part1']

            # Cache the result for later
            DCache[key] = _lang

    # Process script, converting from the
    # English script name if necessary
    if script:
        if len(script) == 4 and script.istitle():
            assert script in self.DScript2Name, \
                "script %s not valid!" % script
            _script = script
        else:
            script_name = script.lower()
            assert script_name in self.DName2Script, \
                "script name %s not valid!" % script
            _script = self.DName2Script[script_name]

    # Process Country
    if territory:
        looks_like_code = (
            (len(territory) == 2 and territory.isupper())
            or territory.isdigit()
        )
        if looks_like_code:
            # Just verify the territory code is valid
            assert territory in self.STerritories, \
                "territory code %s not valid!" % territory
            _territory = territory
        else:
            # Convert to a territory code from
            # the English territory name
            territory_name = territory.lower()
            assert territory_name in DRevCountries, \
                "territory name %s was not found!" % territory
            _territory = DRevCountries[territory_name]

    # Process Variant
    if variant:
        _variant = variant

    lang_part = _lang if _lang else ''
    script_part = ('_' if _lang else '') + _script if _script else ''
    territory_part = (
        ('-' if _script or _lang else '') + _territory if _territory else ''
    )
    variant_part = '|' + _variant if _variant else ''

    return '%s%s%s%s' % (lang_part, script_part, territory_part, variant_part)
def get_D_name_to_iso():
    """
    Build a dict mapping lower-cased language names to ISO codes.

    For every code in ISOCodes, the record's 'short_name', 'long_name',
    and any names under DAlt['language name'] (including their
    'alternate language name' entries) are mapped to the record's
    'part1' code when it has one, otherwise to the code itself.

    Afterwards:

    * every name mapped to 'cmn' is re-mapped to 'zh';
    * for a name containing ', ' (e.g. "chinese, hakka"), the part
      after the first ', ' is added as an extra name unless that name
      is already present;
    * a few fixed names and a table of Wiktionary header names are
      added, overriding any existing entries with the same name.
    """
    from iso_tools.ISOCodes import ISOCodes

    D = {}

    for part3 in ISOCodes:
        DISO = ISOCodes.get_D_iso(part3, add_alternates=True)

        # HACK: shorten the code!!!
        code = DISO['part1'] if 'part1' in DISO else part3

        # Gather all of this language's names, in the
        # order they're to be written to the result
        LNames = [DISO['short_name'], DISO['long_name']]

        DAlt = DISO['DAlt']
        if 'language name' in DAlt:
            for name, DName in DAlt['language name'].items():
                LNames.append(name)
                if 'alternate language name' in DName:
                    LNames.extend(DName['alternate language name'])

        for name in LNames:
            D[name.lower()] = code

    # Iterate over a snapshot, as names are added during the loop
    for name in list(D):
        # HACK: Always use the generic "zh"
        # Chinese code for Mandarin Chinese!
        if D[name] == 'cmn':
            D[name] = 'zh'

        # HACK: Fix e.g. "chinese, hakka" -> "hakka"
        if ', ' in name:
            short_name = name.partition(', ')[2]
            D.setdefault(short_name, D[name])

    D['cantonese'] = 'yue'
    D['mandarin'] = 'zh'  # HACK!
    D['translingual'] = 'mul'

    # Codes from Wiktionary headers with more than
    # ~400 entries that aren't in the above data
    DExc = {
        'middle english': 'enm',  # 21579
        'ancient greek': 'grc',  # 19189
        #'norman': 'nrf',  # 10941
        'old english': 'ang',  # 8847
        'old french': 'fro',  # 7636
        'old armenian': 'xcl',  # 6945
        'khmer': 'khm',  # 4121
        'middle french': 'frm',  # 4049
        'interlingua': 'ina',  # 3212
        'livonian': 'liv',  # 2995
        'old irish': 'sga',  # 2920
        'west frisian': 'fry',  # 2865
        # 'westrobothnian': 2735
        'egyptian': 'egy',  # 2622
        'old church slavonic': 'chu',  # 2419
        'middle dutch': 'dum',  # 1958
        'aramaic': 'arc',  # 1755
        'alemannic german': 'gsw',  # 1585
        'swazi': 'ssw',  # 1408
        'vilamovian': 'wym',  # 1385
        'old high german': 'goh',  # 1334
        'tocharian b': 'txb',  # 1258
        'punjabi': 'pan',  # 1240
        'ottoman turkish': 'ota',  # 1106
        'saterland frisian': 'stq',  # 1085
        'german low german': 'nds',  # 945
        'haitian creole': 'hat',  # 913
        'chichewa': 'nya',  # 881
        'mauritian creole': 'mfe',  # 852
        'dupaningan agta': 'duo',  # 702
        'zazaki': 'zza',  # 688
        # 'old swedish': 640
        'bikol central': 'bcl',  # 607
        # 'tarantino': 593
        'old occitan': 'pro',  # 580
        # 'old portuguese': 573
        'north frisian': 'frr',  # 567
        # 'bourguignon': 539
        'taos': 'twf',  # 523
        'romani': 'rom',  # 494
        # 'central franconian': 486
        'okinawan': 'ryu',  # 429
        'ojibwe': 'oji',  # 403
    }

    for lang_name, part3 in DExc.items():
        DISO = ISOCodes.get_D_iso(part3, add_alternates=True)
        # Shorten the code where a two-letter one exists
        D[lang_name] = DISO['part1'] if 'part1' in DISO else part3

    return D