def get_family_id_to_regions(family_id_to_lang_scr_to_sample_key):
  """Map each family id to the set of two-letter regions it serves.

  A family serves a region when CLDR lists any of the family's
  lang-scripts as used in that region.  Returns a defaultdict(set) from
  family id to region codes.  Lang-scripts with no region mapping are
  reported once, except undefined ('und...') ones.
  """
  # Invert CLDR's region -> lang-scripts data into lang-script -> regions.
  lang_scr_to_regions = collections.defaultdict(set)
  for region in sorted(cldr_data.known_regions()):
    if region == 'ZZ':
      continue
    if len(region) > 2:  # e.g. world
      print('skipping region %s' % region)
      continue
    for lang_scr in cldr_data.region_to_lang_scripts(region):
      lang_scr_to_regions[lang_scr].add(region)

  family_id_to_regions = collections.defaultdict(set)
  warnings = set()
  for family_id, lang_scr_to_sample_key in (
      family_id_to_lang_scr_to_sample_key.iteritems()):
    for lang_scr in lang_scr_to_sample_key:
      if lang_scr in lang_scr_to_regions:
        family_id_to_regions[family_id].update(lang_scr_to_regions[lang_scr])
      elif not lang_scr.startswith('und'):
        # don't warn about undefined languages
        warnings.add(lang_scr)

  for lang_scr in sorted(warnings):
    print('no mapping from %s to any region' % lang_scr)
  return family_id_to_regions
def get_family_id_to_regions(family_id_to_lang_scr_to_sample_key):
  """Compute, for every family id, the set of regions it applies to.

  A family applies to a region when CLDR lists one of the family's
  lang-scripts as used in that region.  Lang-scripts that map to no
  region are reported once, except undefined ('und...') ones.
  """
  # First invert CLDR: region -> lang-scripts becomes lang-script -> regions.
  regions_for_lang_scr = collections.defaultdict(set)
  for region in sorted(cldr_data.known_regions()):
    if region == 'ZZ':
      continue
    if len(region) > 2:  # e.g. world
      print('skipping region %s' % region)
      continue
    for lang_scr in cldr_data.region_to_lang_scripts(region):
      regions_for_lang_scr[lang_scr].add(region)

  family_id_to_regions = collections.defaultdict(set)
  unmapped = set()
  for family_id, lang_scr_to_sample_key in (
      family_id_to_lang_scr_to_sample_key.iteritems()):
    for lang_scr in lang_scr_to_sample_key:
      if lang_scr in regions_for_lang_scr:
        for region in regions_for_lang_scr[lang_scr]:
          family_id_to_regions[family_id].add(region)
      else:
        # don't warn about undefined languages
        if not lang_scr.startswith('und'):
          unmapped.add(lang_scr)

  for lang_scr in sorted(unmapped):
    print('no mapping from %s to any region' % lang_scr)
  return family_id_to_regions
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used in some region, from CLDR's region data.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        print('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # All scripts CLDR knows for each lang, used in a region or not.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Sweep every script code and attach it to its likely language; scripts
  # whose likely language is unknown are recorded under 'und'.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      print('adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    else:
      print('### script %s with unknown language already seen' % script)

  # Assemble the result.  The original concatenated keys() lists, which is
  # Python-2-only and visited every lang present in both dicts twice;
  # iterate the union of keys once instead.  Every used lang is also in
  # all_lang_scripts (both are populated together above), so the result is
  # identical: (used scripts, known-but-unused scripts) per lang.
  lang_data = {}
  for lang in set(all_lang_scripts) | set(used_lang_scripts):
    used = used_lang_scripts.get(lang, set())
    lang_data[lang] = (used.copy(), all_lang_scripts.get(lang, set()) - used)
  return lang_data
def get_region_to_family_ids(script_to_family_ids):
  """Map each two-letter region to the family ids whose script is used there.

  Region 'ZZ' (unknown) and macro regions (codes longer than two chars)
  are skipped; 'Kana' is folded into 'Jpan' before the lookup.
  """
  region_to_family_ids = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    if region == 'ZZ':
      continue
    if len(region) > 2:
      print('skipping region %s' % region)
      continue
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan script' % lang_script)
        script = 'Jpan'
      if script in script_to_family_ids:
        region_to_family_ids[region].update(script_to_family_ids[script])
      else:
        print('unsupported script %s for lang %s in region %s' % (script, lang, region))
  return region_to_family_ids
def get_region_to_family_ids(script_to_family_ids):
  """Return a defaultdict(set) from region code to family ids used there.

  Skips 'ZZ' and macro regions, remaps 'Kana' to 'Jpan', and reports
  scripts with no corresponding family.
  """
  result = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    if region == 'ZZ':
      continue
    if len(region) > 2:
      print('skipping region %s' % region)
      continue
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan script' % lang_script)
        script = 'Jpan'
      if script not in script_to_family_ids:
        print('unsupported script %s for lang %s in region %s' % (
            script, lang, region))
        continue
      result[region].update(script_to_family_ids[script])
  return result
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used in some region, from CLDR's region data.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        _log('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # All scripts CLDR knows for each lang, used in a region or not.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Sweep every script and attach it to its likely language; scripts whose
  # likely language is unknown are recorded under 'und'.
  for script in unicode_data.all_scripts():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if script not in all_lang_scripts[lang]:
        _log('adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      _log('adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    else:
      _log('script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Patch: see noto-fonts#133 comment on June 8th.
  all_lang_scripts['tlh'] |= {'Latn', 'Piqd'}

  # Assemble the result.  The original concatenated keys() lists, which is
  # Python-2-only and visited every lang present in both dicts twice;
  # iterate the union of keys once instead.  Every used lang is also in
  # all_lang_scripts (both are populated together above), so the result is
  # identical: (used scripts, known-but-unused scripts) per lang.
  lang_data = {}
  for lang in set(all_lang_scripts) | set(used_lang_scripts):
    used = used_lang_scripts.get(lang, set())
    lang_data[lang] = (used.copy(), all_lang_scripts.get(lang, set()) - used)
  return lang_data
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Scripts actually used in some region, from CLDR's region data.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        if _DEBUG:
          print('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # All scripts CLDR knows for each lang, used in a region or not.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Each lang's likely script, unless excluded.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Sweep every script code and attach it to its likely language; scripts
  # whose likely language is unknown are recorded under 'und'.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if _DEBUG and script not in all_lang_scripts[lang]:
        print('# adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      if _DEBUG:
        print('# adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    elif _DEBUG:
      print('### script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Assemble the result.  The original concatenated keys() lists, which is
  # Python-2-only and visited every lang present in both dicts twice;
  # iterate the union of keys once instead.  Every used lang is also in
  # all_lang_scripts (both are populated together above), so the result is
  # identical: (used scripts, known-but-unused scripts) per lang.
  lang_data = {}
  for lang in set(all_lang_scripts) | set(used_lang_scripts):
    used = used_lang_scripts.get(lang, set())
    lang_data[lang] = (used.copy(), all_lang_scripts.get(lang, set()) - used)
  return lang_data
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region"""
  # Ask get_likely_subtags which lang each supported script resolves to.
  # That might not be the lang's own likely script, but it does indicate
  # the lang can be written in the script, or so we assume.
  extra_script_for_lang = {}
  for script in supported_scripts:
    likely_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if likely_lang != 'und':
      extra_script_for_lang[likely_lang] = script

  # Collect the scripts in actual regional use, folding Kana into Jpan.
  used_lang_scripts = collections.defaultdict(set)
  unsupported = set()
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported.add(script)
      used_lang_scripts[lang].add(script)
  if unsupported:
    print('used scripts that are not supported: %s' % ', '.join(
        sorted(unsupported)))

  known_langs = set(cldr_data.known_langs())
  for lang in extra_script_for_lang:
    if lang not in known_langs:
      print('lang %s not in known langs' % lang)
      known_langs.add(lang)

  lang_data = {}
  for lang in known_langs:
    # Okinawan and Ainu are treated as written in Jpan only.
    if lang in ('ryu', 'ain'):
      all_scripts = {'Jpan'}
    else:
      all_scripts = set(cldr_data.lang_to_scripts(lang))
    # add additional scripts for lang
    if lang in extra_script_for_lang:
      extra = extra_script_for_lang[lang]
      if extra not in all_scripts:
        print('cldr data does not have script %s for lang %s' % (extra, lang))
        all_scripts.add(extra)
    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue
    used_scripts = used_lang_scripts[lang]
    if not used_scripts:
      likely = cldr_data.get_likely_script(lang)
      if likely != 'Zzzz':
        used_scripts = {likely}
    lang_data[lang] = (used_scripts, all_scripts - used_scripts)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample. Would be easier to check if I just had
  # sample names independent of language names, but then harder to remove the languages.
  # NOTE(review): 'aii-Cyrl' is a lang-script pair while lang_data keys are
  # bare langs, so it can never match here — confirm whether 'aii' was meant.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if lang not in lang_data:
      print('patched out lang %s not present' % lang)
    else:
      print('patch out lang %s' % lang)
      del lang_data[lang]
  return lang_data
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region"""
  # For each supported script, use get_likely_subtags to find a likely
  # lang.  That may differ from the lang's own likely script, but we take
  # it to mean the lang can be written in the script.
  additional = {}
  for script in supported_scripts:
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      additional[lang] = script

  used_lang_scripts = collections.defaultdict(set)
  unsupported_scripts = set()
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported_scripts.add(script)
      used_lang_scripts[lang].add(script)
  if unsupported_scripts:
    print('used scripts that are not supported: %s' % ', '.join(
        sorted(unsupported_scripts)))

  known_langs = set(cldr_data.known_langs())
  for lang in additional:
    if lang not in known_langs:
      print('lang %s not in known langs, adding' % lang)
      known_langs.add(lang)

  lang_data = {}
  for lang in known_langs:
    # Okinawan and Ainu are treated as written in Jpan only.
    all_scripts = ({'Jpan'} if lang in ('ryu', 'ain')
                   else set(cldr_data.lang_to_scripts(lang)))
    # add additional scripts for lang
    if lang in additional:
      script = additional[lang]
      if script not in all_scripts:
        print('cldr data does not have script %s for lang %s' % (
            script, lang))
        all_scripts.add(script)
    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue
    used = used_lang_scripts[lang]
    if not used:
      likely = cldr_data.get_likely_script(lang)
      if likely != 'Zzzz':
        used = {likely}
    lang_data[lang] = (used, all_scripts - used)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample. Would be easier to check if I just had
  # sample names independent of language names, but then harder to remove the languages.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if lang in lang_data:
      print('patch out lang %s' % lang)
      del lang_data[lang]
    else:
      print('patched out lang %s not present' % lang)
  return lang_data