def _init_lang_for_script_map():
    """Fill in the default language for each script.

    Walks the locale tags returned by
    cldr_data.get_lang_scrs_by_decreasing_global_lit_pop() in the order
    given (by that function's name, decreasing global literate
    population) and records, for every script, the language of the first
    tag seen with that script.  Scripts already present in
    _lang_for_script_map are left untouched.
    """
    for _, loc_tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop():
        lsrv = cldr_data.loc_tag_to_lsrv(loc_tag)
        # lsrv[0] is the language, lsrv[1] the script.  setdefault keeps
        # whichever language was registered first for the script.
        _lang_for_script_map.setdefault(lsrv[1], lsrv[0])
def _init_lang_for_script_map():
    """Populate _lang_for_script_map with one language per script.

    Locale tags are taken in the order produced by
    cldr_data.get_lang_scrs_by_decreasing_global_lit_pop(); the first tag
    encountered for a script decides that script's language, and later
    tags with the same script are ignored.
    """
    ordered_tags = [
        tag
        for _, tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()
    ]
    for tag in ordered_tags:
        parts = cldr_data.loc_tag_to_lsrv(tag)
        script_code = parts[1]
        if script_code in _lang_for_script_map:
            # An earlier tag already claimed this script.
            continue
        _lang_for_script_map[script_code] = parts[0]
def select_rare_chars_for_loc(script, locs_with_rare_chars,
                              shared_lang_threshold, char_to_lang_map):
    """Pick, per locale, the rare chars shared with the fewest languages.

    Args:
      script: script code; only locale tags containing this string are
        considered.
      locs_with_rare_chars: mapping from hyphen-separated locale tag to
        an iterable of that locale's rare chars.
      shared_lang_threshold: initial upper bound, for every locale, on
        how many languages a char may be shared with and still be
        selected.
      char_to_lang_map: mapping from char to the collection of locale
        tags using it.

    Returns:
      A list of (locale tag, set of selected chars) tuples, in the order
      the locales are returned by
      cldr_data.get_lang_scrs_by_decreasing_global_lit_pop() (decreasing
      literate population).  Locales with no char at or below their
      current threshold are omitted.
    """
    # Every locale starts with the same sharing limit; limits are
    # tightened below as more populous locales claim chars.
    sharing_limit = dict.fromkeys(locs_with_rare_chars, shared_lang_threshold)

    ordered_tags = [
        tag
        for _, tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()
    ]

    result = []
    for cldr_tag in ordered_tags:
        if script not in cldr_tag:
            continue
        tag = cldr_tag.replace('_', '-')
        if tag not in locs_with_rare_chars:
            continue

        # Number of languages each rare char of this locale is shared with.
        share_counts = {
            cp: len(char_to_lang_map[cp]) for cp in locs_with_rare_chars[tag]
        }
        limit = sharing_limit[tag]
        eligible = [n for n in share_counts.values() if n <= limit]
        if not eligible:
            continue

        # Keep only the chars most specific to this locale, i.e. those
        # shared with the fewest languages (usually unique to it).
        fewest = min(eligible)
        chosen = {cp for cp, n in share_counts.items() if n == fewest}
        result.append((tag, chosen))

        # Every locale sharing one of the chosen chars must from now on
        # do at least as well to have chars of its own selected.
        for cp in chosen:
            for sharing_tag in char_to_lang_map[cp]:
                if sharing_limit[sharing_tag] > fewest:
                    sharing_limit[sharing_tag] = fewest

    return result
def select_rare_chars_for_loc(script, locs_with_rare_chars,
                              shared_lang_threshold, char_to_lang_map):
    """Return (loc, selected rare chars) pairs for locales using script.

    Locales are visited in the order given by
    cldr_data.get_lang_scrs_by_decreasing_global_lit_pop() (decreasing
    literate population), and the result list keeps that order.  For each
    locale whose tag contains script and is a key of
    locs_with_rare_chars, the chars shared with the fewest languages
    (according to char_to_lang_map) are selected, provided that count
    does not exceed the locale's current threshold.  All thresholds start
    at shared_lang_threshold and are lowered for every locale that shares
    a selected char.
    """
    threshold_for = {
        tag: shared_lang_threshold for tag in locs_with_rare_chars
    }
    picks = []

    for _, raw_tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop():
        if script not in raw_tag:
            continue
        tag = raw_tag.replace('_', '-')
        if tag not in locs_with_rare_chars:
            continue

        best_count = threshold_for[tag]
        best_chars = set()
        for cp in locs_with_rare_chars[tag]:
            share_count = len(char_to_lang_map[cp])
            if share_count > best_count:
                # Shared too widely to compete with what we have.
                continue
            if share_count < best_count:
                # Strictly more specific: discard the previous candidates.
                best_chars = set()
                best_count = share_count
            best_chars.add(cp)

        if not best_chars:
            continue
        picks.append((tag, best_chars))

        # Tighten the threshold of every locale sharing a selected char.
        for cp in best_chars:
            for sharing_tag in char_to_lang_map[cp]:
                if threshold_for[sharing_tag] > best_count:
                    threshold_for[sharing_tag] = best_count

    return picks