def _create_lang_data():
    """Generates language data from CLDR plus extensions.

    Scripts are collected per language from four sources, in this order:

    1. Every 'lang-script' pair reported for each known CLDR region.  These
       are the only scripts treated as "used"; pairs whose lang is 'und'
       are reported on stdout and skipped.
    2. The scripts CLDR lists for each known language.
    3. The likely script of every language seen so far, unless
       is_excluded_script() rejects it.
    4. Every non-excluded Unicode script code, attached to the language that
       likely-subtag resolution of 'und-<script>' yields.  When that
       language is 'und' the script is recorded under 'und' only if no
       earlier step has seen it.

    Returns:
      A dict mapping lang to a tuple of:
      - a set of scripts used in some region
      - a set of scripts not used in any region.
      Both sets are fresh objects owned by the caller.
    """
    all_lang_scripts = collections.defaultdict(set)
    used_lang_scripts = collections.defaultdict(set)
    known_scripts = set()
    all_langs = set()

    # 1. Scripts that are actually in use in some region.
    for region in cldr_data.known_regions():
        for lang_script in cldr_data.region_to_lang_scripts(region):
            # Assumes each entry is exactly '<lang>-<script>'; any other
            # number of hyphens makes the unpacking raise ValueError.
            lang, script = lang_script.split('-')
            known_scripts.add(script)
            if lang == 'und':
                print('used lang is und for script %s in region %s'
                      % (script, region))
                continue
            used_lang_scripts[lang].add(script)
            all_lang_scripts[lang].add(script)
            all_langs.add(lang)

    # 2. Scripts CLDR associates with the language, region or not.
    for lang in cldr_data.known_langs():
        lang_scripts = cldr_data.lang_to_scripts(lang)
        all_lang_scripts[lang] |= lang_scripts
        known_scripts |= lang_scripts
        all_langs.add(lang)

    # 3. The likely script of each language collected so far.
    for lang in all_langs:
        script = cldr_data.get_likely_script(lang)
        if not is_excluded_script(script):
            all_lang_scripts[lang].add(script)

    # 4. Make sure every Unicode script ends up attached to something.
    for script in unicode_data.all_script_codes():
        if is_excluded_script(script):
            continue
        lang = cldr_data.get_likely_subtags('und-' + script)[0]
        if lang != 'und':
            all_lang_scripts[lang].add(script)
        elif script not in known_scripts:
            print('adding script with unknown language %s' % script)
            # lang is 'und' here, so the script is filed under 'und'.
            all_lang_scripts[lang].add(script)
        else:
            print('### script %s with unknown language already seen' % script)

    # Take the union of the key sets instead of concatenating keys():
    # that works on both Python 2 and 3 and visits each language once.
    # Membership tests and .get() are used below so the defaultdicts do
    # not grow new empty entries.
    lang_data = {}
    for lang in set(used_lang_scripts) | set(all_lang_scripts):
        used = set(used_lang_scripts.get(lang, ()))
        unused = set(all_lang_scripts.get(lang, ())) - used
        lang_data[lang] = (used, unused)

    return lang_data
def _create_lang_data():
    """Generates language data from CLDR plus extensions.

    Scripts are collected per language from these sources, in this order:

    1. Every 'lang-script' pair reported for each known CLDR region.  These
       are the only scripts treated as "used"; pairs whose lang is 'und'
       are skipped.
    2. The scripts CLDR lists for each known language.
    3. The likely script of every language seen so far, unless
       is_excluded_script() rejects it.
    4. Every non-excluded Unicode script code, attached to the language that
       likely-subtag resolution of 'und-<script>' yields.  When that
       language is 'und' the script is recorded under 'und' only if no
       earlier step has seen it.
    5. A hard-coded patch adding Jpan to the scripts of 'ryu'.

    Diagnostic messages are written to stdout only when the module-level
    _DEBUG flag is true.

    Returns:
      A dict mapping lang to a tuple of:
      - a set of scripts used in some region
      - a set of scripts not used in any region.
      Both sets are fresh objects owned by the caller.
    """
    all_lang_scripts = collections.defaultdict(set)
    used_lang_scripts = collections.defaultdict(set)
    known_scripts = set()
    all_langs = set()

    # 1. Scripts that are actually in use in some region.
    for region in cldr_data.known_regions():
        for lang_script in cldr_data.region_to_lang_scripts(region):
            # Assumes each entry is exactly '<lang>-<script>'; any other
            # number of hyphens makes the unpacking raise ValueError.
            lang, script = lang_script.split('-')
            known_scripts.add(script)
            if lang == 'und':
                if _DEBUG:
                    print('used lang is und for script %s in region %s'
                          % (script, region))
                continue
            used_lang_scripts[lang].add(script)
            all_lang_scripts[lang].add(script)
            all_langs.add(lang)

    # 2. Scripts CLDR associates with the language, region or not.
    for lang in cldr_data.known_langs():
        lang_scripts = cldr_data.lang_to_scripts(lang)
        all_lang_scripts[lang] |= lang_scripts
        known_scripts |= lang_scripts
        all_langs.add(lang)

    # 3. The likely script of each language collected so far.
    for lang in all_langs:
        script = cldr_data.get_likely_script(lang)
        if not is_excluded_script(script):
            all_lang_scripts[lang].add(script)

    # 4. Make sure every Unicode script ends up attached to something.
    for script in unicode_data.all_script_codes():
        if is_excluded_script(script):
            continue
        lang = cldr_data.get_likely_subtags('und-' + script)[0]
        if lang != 'und':
            if _DEBUG and script not in all_lang_scripts[lang]:
                print('# adding likely lang %s for script %s' % (lang, script))
            all_lang_scripts[lang].add(script)
        elif script not in known_scripts:
            if _DEBUG:
                print('# adding script with unknown language %s' % script)
            # lang is 'und' here, so the script is filed under 'und'.
            all_lang_scripts[lang].add(script)
        elif _DEBUG:
            print('### script %s with unknown language already seen' % script)

    # 5. Patch: ensure ryu-Jpan exists.
    # Okinawan can be written in either Kana or a combination of Hira and
    # Kanji.  Rather than take a strong position on this, add a mapping to
    # Jpan.
    all_lang_scripts['ryu'].add('Jpan')

    # Take the union of the key sets instead of concatenating keys():
    # that works on both Python 2 and 3 and visits each language once.
    # Membership tests and .get() are used below so the defaultdicts do
    # not grow new empty entries.
    lang_data = {}
    for lang in set(used_lang_scripts) | set(all_lang_scripts):
        used = set(used_lang_scripts.get(lang, ()))
        unused = set(all_lang_scripts.get(lang, ())) - used
        lang_data[lang] = (used, unused)

    return lang_data