def _is_letter_or_decimal_digit(cp):
    """Return True if unicode_data.category(cp) is a letter (L*) or 'Nd'."""
    cat = unicode_data.category(cp)
    return cat[0] == 'L' or cat == 'Nd'


def _filter_cldr_exemplars(exemplar_list, no_latin):
    """Return the single-character letter/digit entries of exemplar_list.

    Multi-character exemplar strings are dropped.  When no_latin is true the
    characters 'd' and 'f' are dropped as well.  The result is always a list,
    so it behaves the same whether or not the builtin filter() is lazy.
    """
    kept = []
    for cp in exemplar_list:
        if len(cp) != 1:
            continue
        if not _is_letter_or_decimal_digit(cp):
            continue
        if no_latin and cp in 'df':
            continue
        kept.append(cp)
    return kept


def _split_unique_and_duplicates(chars):
    """Return (first occurrences in original order, set of repeated items)."""
    seen = set()
    repeated = set()
    unique = []
    for cp in chars:
        if cp in seen:
            repeated.add(cp)
        else:
            seen.add(cp)
            unique.append(cp)
    return unique, repeated


def _add_cldr_exemplars(script_map):
    """Fill script_map from the .xml files under <dir>/main in CLDR_DIR."""
    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue

            rel_path = path.join(data_dir, filename)
            exemplar_list = cldr_data.get_exemplar_from_file(rel_path)
            if not exemplar_list:
                if _VERBOSE:
                    print(' no exemplar list for %s' % rel_path)
                continue

            # filename[:-4] strips the '.xml' suffix checked above.
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print(' no lsrv for %s' % rel_path)
                continue

            # Note: src is <directory>/<filename>, without the 'main' part.
            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print(' no script for %s' % rel_path)
                continue

            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            loc_to_exemplar_info = script_map[script]
            if loc_tag in loc_to_exemplar_info:
                # First source seen for a locale tag wins.
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
                continue

            # fix exemplars that look incorrect
            no_latin = script == 'Arab' and 'd' in exemplar_list
            if no_latin and _VERBOSE:
                print('found \'d\' in %s for %s' % (src, lsrv))

            # exclude exemplar strings, and restrict to letters and digits
            filtered_exemplar_list = _filter_cldr_exemplars(
                exemplar_list, no_latin)

            # some exemplar lists don't surround strings with curly braces,
            # and end up with duplicate characters.  Flag these
            fixed_exemplar_list, dup_chars = _split_unique_and_duplicates(
                filtered_exemplar_list)
            if dup_chars and _VERBOSE:
                print('duplicate exemplars in %s: %s' % (
                    src, ', '.join([
                        u'\u200e%s\u200e (%x)' % (cp, ord(cp))
                        for cp in dup_chars])))

            loc_to_exemplar_info[loc_tag] = (
                lsrv, src, tuple(fixed_exemplar_list))


def _add_extra_exemplars(script_map):
    """Fill script_map from extra_locale_data.EXEMPLARS for missing locales."""
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        # Raises ValueError unless the tag has exactly two '-' separated parts.
        lang, script = loc_tag.split('-')
        lsrv = (lang, script, None, None)
        loc_to_exemplar_info = script_map[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in loc_to_exemplar_info:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
            continue

        # restrict to letters, except for zsym
        if 'Zsym' not in loc_tag:
            # A list (not filter()) so that len() below is always valid.
            filtered_exemplar_list = [
                cp for cp in exemplar_list if _is_letter_or_decimal_digit(cp)]
            if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)
        else:
            filtered_exemplar_list = exemplar_list
        loc_to_exemplar_info[loc_tag] = (
            lsrv, src, tuple(filtered_exemplar_list))


def get_script_to_exemplar_data_map():
    """Return a map from script to a dict of per-locale exemplar data.

    The result is a collections.defaultdict(dict).  Each value maps a locale
    tag to a 3-tuple of:
      - locale tuple (lang, script, region, variant)
      - source of the exemplar data: '<directory>/<filename>' for CLDR files,
        or '[extra locale data]/<loc_tag>' for supplemental data
      - tuple of the exemplar chars

    CLDR files are read from the 'common', 'seed' and 'exemplars' directories
    in that order, then extra_locale_data.EXEMPLARS is consulted; the first
    entry recorded for a locale tag is kept and later ones are skipped.
    Diagnostics are printed only when _VERBOSE is set.

    NOTE(review): this file contains a second, identical definition of this
    function further down; the later definition is the one that ends up bound.
    """
    script_map = collections.defaultdict(dict)
    _add_cldr_exemplars(script_map)
    # supplement with extra locale data
    _add_extra_exemplars(script_map)
    return script_map
def get_script_to_exemplar_data_map():
    """Return a map from script to a dict of per-locale exemplar data.

    The returned collections.defaultdict(dict) is keyed by script; each value
    is a dict keyed by locale tag whose values are 3-tuples of:
      - locale tuple (lang, script, region, variant)
      - source of the exemplar data: '<directory>/<filename>' for CLDR files,
        or '[extra locale data]/<loc_tag>' for supplemental data
      - tuple of the exemplar chars

    Sources are visited in a fixed order ('common', 'seed', 'exemplars', then
    extra_locale_data.EXEMPLARS) and the first entry stored for a locale tag
    is never overwritten.  Diagnostics are printed only when _VERBOSE is set.

    NOTE(review): an identical definition of this function appears earlier in
    this file; being later, this one is the definition that stays bound.
    """

    def letter_or_digit(cp):
        # General category L* (any letter) or Nd (decimal digit).
        cat = unicode_data.category(cp)
        return cat[0] == 'L' or cat == 'Nd'

    script_map = collections.defaultdict(dict)

    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue

            exemplar_list = cldr_data.get_exemplar_from_file(
                path.join(data_dir, filename))
            if not exemplar_list:
                if _VERBOSE:
                    print(' no exemplar list for %s' % path.join(
                        data_dir, filename))
                continue

            # Drop the '.xml' suffix to get the locale tag of the file.
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print(' no lsrv for %s' % path.join(data_dir, filename))
                continue

            # Recorded source omits the 'main' path component.
            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print(' no script for %s' % path.join(data_dir, filename))
                continue

            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            loc_to_exemplar_info = script_map[script]
            if loc_tag in loc_to_exemplar_info:
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
                continue

            # fix exemplars that look incorrect
            if script == 'Arab' and 'd' in exemplar_list:
                if _VERBOSE:
                    print('found \'d\' in %s for %s' % (src, lsrv))
                no_latin = True
            else:
                no_latin = False

            # exclude exemplar strings, and restrict to letters and digits.
            # Built as a list so the result does not depend on whether the
            # builtin filter() is eager or lazy.
            filtered_exemplar_list = [
                cp for cp in exemplar_list
                if len(cp) == 1
                and letter_or_digit(cp)
                and not (no_latin and cp in 'df')
            ]

            # some exemplar lists don't surround strings with curly braces,
            # and end up with duplicate characters.  Flag these
            exemplar_chars = set()
            dup_chars = set()
            fixed_exemplar_list = []
            for cp in filtered_exemplar_list:
                if cp in exemplar_chars:
                    dup_chars.add(cp)
                else:
                    exemplar_chars.add(cp)
                    fixed_exemplar_list.append(cp)
            if dup_chars and _VERBOSE:
                print('duplicate exemplars in %s: %s' % (src, ', '.join([
                    u'\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in dup_chars
                ])))

            loc_to_exemplar_info[loc_tag] = (
                lsrv, src, tuple(fixed_exemplar_list))

    # supplement with extra locale data
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        # Unpacking raises ValueError unless the tag is exactly 'lang-script'.
        lang, script = loc_tag.split('-')
        lsrv = (lang, script, None, None)
        loc_to_exemplar_info = script_map[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in loc_to_exemplar_info:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
            continue

        # restrict to letters, except for zsym
        if 'Zsym' not in loc_tag:
            # len() is taken below, so this must be a real list rather than
            # a lazy filter object.
            filtered_exemplar_list = [
                cp for cp in exemplar_list if letter_or_digit(cp)]
            if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)
        else:
            filtered_exemplar_list = exemplar_list
        loc_to_exemplar_info[loc_tag] = (
            lsrv, src, tuple(filtered_exemplar_list))

    return script_map