def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used for a lang in some region count as 'used'.
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        # 'und' carries no language information; only record the script.
        print('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Scripts CLDR knows for a lang (whether or not used in a region).
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Add each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Ensure every non-excluded script is reachable from some lang, falling
  # back to 'und' when no likely language exists for the script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      print('adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    else:
      print('### script %s with unknown language already seen' % script)

  # Use a set union here: the original concatenated dict key lists, which
  # both iterated duplicate langs and fails outright on Python 3 dict views.
  all_langs = set(used_lang_scripts) | set(all_lang_scripts)
  lang_data = {}
  for lang in all_langs:
    if lang in used_lang_scripts:
      # Every used script was also added to all_lang_scripts above, so the
      # difference is exactly the scripts never used in any region.
      unused_set = all_lang_scripts[lang] - used_lang_scripts[lang]
      lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set)
    else:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())
  return lang_data
def _create_script_to_default_lang(lang_script_data):
  """Iterates over all the scripts in lang_script_data, and returns a map
  from each script to the default language code, generally based on cldr
  likely subtag data.  This assigns 'en' to Latn by fiat (cldr defaults to
  'und').  Some other scripts (e.g. Dsrt) just get 'und'.

  This checks that the default lang for a script actually uses that script
  in lang_script_data, when the default lang is not 'und'.
  """
  script_to_default_lang = {}
  all_scripts = set()
  script_to_used = collections.defaultdict(set)
  script_to_unused = collections.defaultdict(set)

  # Invert lang->scripts into script->langs, split by used/unused.
  for lang in lang_script_data:
    used, unused = lang_script_data[lang]
    all_scripts |= used
    all_scripts |= unused
    for script in used:
      script_to_used[script].add(lang)
    for script in unused:
      script_to_unused[script].add(lang)

  # Add scripts without langs.
  all_scripts.add('Zsym')
  all_scripts.add('Qaae')

  for script in sorted(all_scripts):
    default_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    # NOTE: Kthi/Khar/Brah were once forced to 'und' here because CLDR has
    # no names for them and two are collectives; that override is disabled.
    if default_lang == 'und':
      if script == 'Latn':
        default_lang = 'en'  # cultural bias...
      else:
        print('no default lang for script %s' % script)
        # Fall back to an arbitrary lang that uses the script, preferring
        # one that uses it in some region.  next(iter(...)) replaces the
        # Python-2-only iterator .next() method.
        langs = script_to_used[script]
        if langs:
          default_lang = next(iter(langs))
          print('using used lang %s from %s' % (default_lang, langs))
        else:
          langs = script_to_unused[script]
          if langs:
            default_lang = next(iter(langs))
            print('using unused lang %s from %s' % (default_lang, langs))
          else:
            print('defaulting to \'und\'')
    else:
      # Sanity-check that the likely lang really uses this script.
      used, unused = lang_script_data[default_lang]
      assert script in used or script in unused
    script_to_default_lang[script] = default_lang
  return script_to_default_lang
def _create_script_to_default_lang(lang_script_data):
  """Iterates over all the scripts in lang_script_data, and returns a map
  from each script to the default language code, generally based on cldr
  likely subtag data.  This assigns 'en' to Latn by fiat (cldr defaults to
  'und').  Some other scripts (e.g. Dsrt) just get 'und'.

  This checks that the default lang for a script actually uses that script
  in lang_script_data, when the default lang is not 'und'.
  """
  script_to_default_lang = {}
  all_scripts = set()
  script_to_used = collections.defaultdict(set)
  script_to_unused = collections.defaultdict(set)

  # Invert lang->scripts into script->langs, split by used/unused.
  for lang in lang_script_data:
    used, unused = lang_script_data[lang]
    all_scripts |= used
    all_scripts |= unused
    for script in used:
      script_to_used[script].add(lang)
    for script in unused:
      script_to_unused[script].add(lang)

  # Add scripts without langs.
  all_scripts.add('Zsym')
  all_scripts.add('Zsye')

  # Patch Klingon as default lang for (unused) script pIqaD
  script_to_used['Piqd'].add('tlh')

  for script in sorted(all_scripts):
    default_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if default_lang == 'und':
      if script == 'Latn':
        default_lang = 'en'  # cultural bias...
      else:
        _log('no default lang for script %s' % script)
        # Fall back to an arbitrary lang that uses the script, preferring
        # one that uses it in some region.  next(iter(...)) replaces the
        # Python-2-only iterator .next() method.
        langs = script_to_used[script]
        if langs:
          default_lang = next(iter(langs))
          _log('using used lang %s from %s' % (default_lang, langs))
        else:
          langs = script_to_unused[script]
          if langs:
            default_lang = next(iter(langs))
            _log('using unused lang %s from %s' % (default_lang, langs))
          else:
            _log('defaulting to \'und\'')
    else:
      # Sanity-check that the likely lang really uses this script.
      used, unused = lang_script_data[default_lang]
      assert script in used or script in unused
    script_to_default_lang[script] = default_lang
  return script_to_default_lang
def _create_script_to_default_lang(lang_script_data):
  """Iterates over all the scripts in lang_script_data, and returns a map
  from each script to the default language code, generally based on cldr
  likely subtag data.  This assigns 'en' to Latn by fiat (cldr defaults to
  'und').  Some other scripts (e.g. Dsrt) just get 'und'.

  This checks that the default lang for a script actually uses that script
  in lang_script_data, when the default lang is not 'und'.
  """
  script_to_default_lang = {}
  all_scripts = set()
  script_to_used = collections.defaultdict(set)
  script_to_unused = collections.defaultdict(set)

  # Invert lang->scripts into script->langs, split by used/unused.
  for lang in lang_script_data:
    used, unused = lang_script_data[lang]
    all_scripts |= used
    all_scripts |= unused
    for script in used:
      script_to_used[script].add(lang)
    for script in unused:
      script_to_unused[script].add(lang)

  # Add scripts without langs.
  all_scripts.add('Zsym')
  all_scripts.add('Zsye')

  # Patch Klingon as default lang for (unused) script pIqaD
  script_to_used['Piqd'].add('tlh')

  for script in sorted(all_scripts):
    default_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if default_lang == 'und':
      if script == 'Latn':
        default_lang = 'en'  # cultural bias...
      else:
        print('no default lang for script %s' % script)
        # Fall back to an arbitrary lang that uses the script, preferring
        # one that uses it in some region.  next(iter(...)) replaces the
        # Python-2-only iterator .next() method.
        langs = script_to_used[script]
        if langs:
          default_lang = next(iter(langs))
          print('using used lang %s from %s' % (default_lang, langs))
        else:
          langs = script_to_unused[script]
          if langs:
            default_lang = next(iter(langs))
            print('using unused lang %s from %s' % (default_lang, langs))
          else:
            print('defaulting to \'und\'')
    else:
      # Sanity-check that the likely lang really uses this script.
      used, unused = lang_script_data[default_lang]
      assert script in used or script in unused
    script_to_default_lang[script] = default_lang
  return script_to_default_lang
def get_family_id_to_default_lang_tag(family_id_to_lang_tags):
  """Return a mapping from family id to default lang tag, for families
  that have multiple lang tags.  This is based on likely subtags and
  the script of the family (Latn for LGC).
  """
  # TODO(dougfelt): this reintroduces language tags that we'd previously
  # filtered out. We should not be doing this here. Figure out a better way
  # to handle this.
  family_id_to_default_lang_tag = {}
  # .items() replaces Python-2-only .iteritems(); behavior is the same.
  for family_id, lang_tags in family_id_to_lang_tags.items():
    parts = family_id.split('-')
    if len(parts) == 1:
      # 'sans' or 'serif'
      script = 'Latn'
    else:
      script = parts[1].capitalize()
    lang = cldr_data.get_likely_subtags('und-' + script)[0]

    # CLDR has no names for these, and two are collectives, so it's simpler
    # to omit them.
    if script in ['Kthi', 'Khar', 'Brah']:
      print('likely lang for %s is %s, replace with und' % (script, lang))
      lang = 'und'

    if lang == 'und':
      # special case
      if script == 'Latn':
        lang_tag = 'en'
      elif script == 'Aran':
        lang_tag = 'ur'
      else:
        lang_tag = 'und' + '-' + script
    elif lang not in lang_tags:
      lang_tag = lang + '-' + script
      if lang_tag not in lang_tags:
        print('Akk, lang and lang_scr \'%s\' not listed for family %s' % (
            lang_tag, family_id))
    else:
      lang_tag = lang
    family_id_to_default_lang_tag[family_id] = lang_tag
  return family_id_to_default_lang_tag
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used for a lang in some region count as 'used'.
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        # 'und' carries no language information; only record the script.
        _log('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Scripts CLDR knows for a lang (whether or not used in a region).
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Add each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Ensure every non-excluded script is reachable from some lang, falling
  # back to 'und' when no likely language exists for the script.
  for script in unicode_data.all_scripts():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if script not in all_lang_scripts[lang]:
        _log('adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      _log('adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    else:
      _log('script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Patch: see noto-fonts#133 comment on June 8th.
  all_lang_scripts['tlh'] |= {'Latn', 'Piqd'}

  # Use a set union here: the original concatenated dict key lists, which
  # both iterated duplicate langs and fails outright on Python 3 dict views.
  all_langs = set(used_lang_scripts) | set(all_lang_scripts)
  lang_data = {}
  for lang in all_langs:
    if lang in used_lang_scripts:
      # Every used script was also added to all_lang_scripts above, so the
      # difference is exactly the scripts never used in any region.
      unused_set = all_lang_scripts[lang] - used_lang_scripts[lang]
      lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set)
    else:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())
  return lang_data
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used for a lang in some region count as 'used'.
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        # 'und' carries no language information; only record the script.
        if _DEBUG:
          print(
              'used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Scripts CLDR knows for a lang (whether or not used in a region).
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Add each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Ensure every non-excluded script is reachable from some lang, falling
  # back to 'und' when no likely language exists for the script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if _DEBUG and script not in all_lang_scripts[lang]:
        print('# adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      if _DEBUG:
        print('# adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    elif _DEBUG:
      print('### script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Use a set union here: the original concatenated dict key lists, which
  # both iterated duplicate langs and fails outright on Python 3 dict views.
  all_langs = set(used_lang_scripts) | set(all_lang_scripts)
  lang_data = {}
  for lang in all_langs:
    if lang in used_lang_scripts:
      # Every used script was also added to all_lang_scripts above, so the
      # difference is exactly the scripts never used in any region.
      unused_set = all_lang_scripts[lang] - used_lang_scripts[lang]
      lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set)
    else:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())
  return lang_data
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region
  """
  # Get additional scripts for a lang by using get_likely_subtags from script
  # to lang.  This might not be the same as the likely script for a lang, but
  # it does indicate the language can be written in the script, or so we
  # assume.
  lang_to_additional_script = {}
  for script in supported_scripts:
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      lang_to_additional_script[lang] = script

  unsupported_scripts = set()
  lang_data = {}
  used_lang_scripts = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      if script == 'Kana':
        # Noto treats Japanese kana as part of Jpan coverage.
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported_scripts.add(script)
      used_lang_scripts[lang].add(script)

  if unsupported_scripts:
    print('used scripts that are not supported: %s'
          % ', '.join(sorted(unsupported_scripts)))

  known_langs = set(cldr_data.known_langs())
  for lang in lang_to_additional_script:
    if not lang in known_langs:
      print('lang %s not in known langs' % lang)
      known_langs.add(lang)

  for lang in known_langs:
    if lang in ['ryu', 'ain']:
      # These are written with Japanese scripts only.
      all_scripts = set(['Jpan'])
    else:
      all_scripts = set(cldr_data.lang_to_scripts(lang))

    # add additional scripts for lang
    if lang in lang_to_additional_script:
      script = lang_to_additional_script[lang]
      if script not in all_scripts:
        print('cldr data does not have script %s for lang %s' % (script, lang))
        all_scripts.add(script)

    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue

    used_scripts = used_lang_scripts[lang]
    if not used_scripts:
      # Not used in any region: fall back to the likely script, if known.
      script = cldr_data.get_likely_script(lang)
      if script != 'Zzzz':
        used_scripts = set([script])

    unused_scripts = all_scripts - used_scripts
    lang_data[lang] = (used_scripts, unused_scripts)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample.  Would be easier to check if
  # I just had sample names independent of language names, but then harder to
  # remove the languages.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if not lang in lang_data:
      print('patched out lang %s not present' % lang)
    else:
      print('patch out lang %s' % lang)
      del lang_data[lang]

  return lang_data
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region
  """
  # Get additional scripts for a lang by using get_likely_subtags from script
  # to lang.  This might not be the same as the likely script for a lang, but
  # it does indicate the language can be written in the script, or so we
  # assume.
  lang_to_additional_script = {}
  for script in supported_scripts:
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      lang_to_additional_script[lang] = script

  unsupported_scripts = set()
  lang_data = {}
  used_lang_scripts = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      if script == 'Kana':
        # Noto treats Japanese kana as part of Jpan coverage.
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported_scripts.add(script)
      used_lang_scripts[lang].add(script)

  if unsupported_scripts:
    print('used scripts that are not supported: %s' % ', '.join(
        sorted(unsupported_scripts)))

  known_langs = set(cldr_data.known_langs())
  for lang in lang_to_additional_script:
    if not lang in known_langs:
      print('lang %s not in known langs, adding' % lang)
      known_langs.add(lang)

  for lang in known_langs:
    if lang in ['ryu', 'ain']:
      # These are written with Japanese scripts only.
      all_scripts = set(['Jpan'])
    else:
      all_scripts = set(cldr_data.lang_to_scripts(lang))

    # add additional scripts for lang
    if lang in lang_to_additional_script:
      script = lang_to_additional_script[lang]
      if script not in all_scripts:
        print('cldr data does not have script %s for lang %s' % (
            script, lang))
        all_scripts.add(script)

    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue

    used_scripts = used_lang_scripts[lang]
    if not used_scripts:
      # Not used in any region: fall back to the likely script, if known.
      script = cldr_data.get_likely_script(lang)
      if script != 'Zzzz':
        used_scripts = set([script])

    unused_scripts = all_scripts - used_scripts
    lang_data[lang] = (used_scripts, unused_scripts)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample.  Would be easier to check if
  # I just had sample names independent of language names, but then harder to
  # remove the languages.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if not lang in lang_data:
      print('patched out lang %s not present' % lang)
    else:
      print('patch out lang %s' % lang)
      del lang_data[lang]

  return lang_data