def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Example:

    toupper /
       (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
       …
       (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
       (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' % map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                + unicode_utils.ucs_symbol(code_point) \
                + ',' \
                + unicode_utils.ucs_symbol(mapped) \
                + ')'
            if len(line + map_string) > max_column:
                i18n_file.write(line + '/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line + '\n')
    i18n_file.write('\n')
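# A minimal way to drive output_charmap, for illustration. The stub below
# is an assumption standing in for glibc's real unicode_utils module,
# which supplies ucs_symbol() and the UNICODE_ATTRIBUTES dictionary
# parsed from UnicodeData.txt:
import io
import types

unicode_utils = types.SimpleNamespace(
    # toy attribute table covering only a..d
    UNICODE_ATTRIBUTES={cp: {} for cp in range(0x0061, 0x0065)},
    ucs_symbol=lambda cp: ('<U{:04X}>' if cp <= 0xFFFF
                           else '<U{:08X}>').format(cp))

buf = io.StringIO()
# map function: uppercase via Python's own case tables (fine for a..d)
output_charmap(buf, 'toupper', lambda cp: ord(chr(cp).upper()))
print(buf.getvalue(), end='')
# With the stubs above this prints:
#
# toupper /
#    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>)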
def process_width(outfile, ulines, elines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt
    '''
    width_dict = {}
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] == "Cf":
            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                int(fields[0], 16)) + '\t0'
    for line in elines:
        # If an entry in EastAsianWidth.txt is found, it overrides entries in
        # UnicodeData.txt:
        fields = line.split(";")
        if not '..' in fields[0]:
            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                int(fields[0], 16)) + '\t2'
        else:
            code_points = fields[0].split("..")
            for key in range(int(code_points[0], 16),
                             int(code_points[1], 16) + 1):
                if key in width_dict:
                    del width_dict[key]
            width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
                unicode_utils.ucs_symbol(int(code_points[0], 16)),
                unicode_utils.ucs_symbol(int(code_points[1], 16)))
    for key in sorted(width_dict):
        outfile.write(width_dict[key] + '\n')
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition.startswith('<font>'):
            decomposition = decomposition[7:]
            decomposed_code_points = [[int(x, 16)
                                       for x in decomposition.split(' ')]]
            if decomposed_code_points[0]:
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for index in range(0, len(decomposed_code_points)):
                    if index > 0:
                        translit_file.write(';')
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                    for decomposed_code_point in decomposed_code_points[index]:
                        translit_file.write('{:s}'.format(
                            unicode_utils.ucs_symbol(decomposed_code_point)))
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                translit_file.write(' % {:s}\n'.format(name))
    translit_file.write('\n')
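# For reference, U+2102 DOUBLE-STRUCK CAPITAL C carries the decomposition
# '<font> 0043' in UnicodeData.txt, so the loop above emits the line:
#
#   <U2102> <U0043> % DOUBLE-STRUCK CAPITAL C
#
# Multi-code-point decompositions would additionally be wrapped in
# double quotes.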
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
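# The while-True block above computes a fixed point of special_decompose:
# it is applied first to the whole sequence, then element by element,
# until neither pass changes anything. The same idiom recurs in several
# functions below. A condensed sketch of the idea (special_decompose is
# assumed to map a list of code points to a decomposed list):
def decompose_to_fixed_point(code_points, special_decompose):
    '''Sketch: collect successive decomposition steps; every step
    becomes one transliteration variant in the output.'''
    steps = [code_points]
    while True:
        whole = special_decompose(steps[-1])
        if whole != steps[-1]:
            steps.append(whole)
            continue
        elementwise = []
        for code_point in steps[-1]:
            elementwise += special_decompose([code_point])
        if elementwise == steps[-1]:
            return steps
        steps.append(elementwise)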
def process_width(outfile, ulines, elines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt
    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = unicode_utils.ucs_symbol(key) + '\t2'
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                int(fields[0], 16)) + '\t0'
    # handle special cases for compatibility
    for key in list(range(0x1160, 0x1200)) + list(range(0x3248, 0x3250)) + \
            list(range(0x4DC0, 0x4E00)) + list((0x00AD,)):
        if key in width_dict:
            del width_dict[key]
    width_dict[0x1160] = '{:s}...{:s}\t0'.format(
        unicode_utils.ucs_symbol(0x1160),
        unicode_utils.ucs_symbol(0x11FF))
    width_dict[0x3248] = '{:s}...{:s}\t2'.format(
        unicode_utils.ucs_symbol(0x3248),
        unicode_utils.ucs_symbol(0x324F))
    width_dict[0x4DC0] = '{:s}...{:s}\t2'.format(
        unicode_utils.ucs_symbol(0x4DC0),
        unicode_utils.ucs_symbol(0x4DFF))
    for key in sorted(width_dict):
        outfile.write(width_dict[key] + '\n')
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <*****@*****.**>
        #     * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #     ranges, so they become printable and carry a width. Comment out
        #     surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16) + 1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + JAMO_INITIAL_SHORT_NAME[index1] \
                + JAMO_MEDIAL_SHORT_NAME[index2] \
                + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16) - 64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i + 63),
            convert_to_hex(i),
            name))
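# The divmod arithmetic above is the standard Hangul syllable
# decomposition: each of the 11172 syllables U+AC00…U+D7A3 is an
# (initial, medial, final) triple, with 21 medials and 28 finals
# (including "no final"), hence the divmods by 28 and 21. A worked
# example as a sketch, assuming the three JAMO_*_SHORT_NAME tables from
# this script are in scope with the usual short names
# (JAMO_INITIAL_SHORT_NAME[0] == 'G', JAMO_MEDIAL_SHORT_NAME[0] == 'A',
# JAMO_FINAL_SHORT_NAME[1] == 'G'):
def hangul_syllable_name(code_point):
    '''Recompute the name of a single Hangul syllable code point.'''
    index2, index3 = divmod(code_point - 0xAC00, 28)
    index1, index2 = divmod(index2, 21)
    return ('HANGUL SYLLABLE '
            + JAMO_INITIAL_SHORT_NAME[index1]
            + JAMO_MEDIAL_SHORT_NAME[index2]
            + JAMO_FINAL_SHORT_NAME[index3])

# hangul_syllable_name(0xAC00) == 'HANGUL SYLLABLE GA'
# hangul_syllable_name(0xAC01) == 'HANGUL SYLLABLE GAG'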
def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where characters
    are decomposed and combining characters are stripped from the
    decompositions.
    '''
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if special_decompose([code_point]) != [code_point]:
            decomposed_code_points = [special_decompose([code_point])]
        else:
            decomposed_code_points = [canonical_decompose(code_point)]
        if decomposed_code_points[0]:
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(special_decomposed_code_points)
            for index in range(0, len(decomposed_code_points)):
                decomposed_code_points[index] = [
                    x for x in decomposed_code_points[index]
                    if not is_combining_remove(x)]
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(
                unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
            translit_file.write('\n')
    translit_file.write('\n')
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition.startswith('<circle>'):
            decomposition = decomposition[9:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} "<U0028>'.format(
                unicode_utils.ucs_symbol(code_point)))
            for decomposed_code_point in decomposed_code_points:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            translit_file.write('<U0029>"\n')
    translit_file.write('\n')
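# For example, U+2460 CIRCLED DIGIT ONE has the decomposition
# '<circle> 0031' in UnicodeData.txt, so the loop above emits:
#
#   % CIRCLED DIGIT ONE
#   <U2460> "<U0028><U0031><U0029>"
#
# i.e. the circled character transliterates to '(1)'.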
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition.startswith('<fraction>'):
            decomposition = decomposition[11:]
            decomposed_code_points = [[int(x, 16)
                                       for x in decomposition.split(' ')]]
            if decomposed_code_points[0]:
                decomposed_code_points[0] = [0x0020] \
                    + decomposed_code_points[0] \
                    + [0x0020]
                while True:
                    special_decomposed_code_points = special_decompose(
                        decomposed_code_points[-1])
                    if (special_decomposed_code_points
                            != decomposed_code_points[-1]):
                        decomposed_code_points.append(
                            special_decomposed_code_points)
                        continue
                    special_decomposed_code_points = []
                    for decomposed_code_point in decomposed_code_points[-1]:
                        special_decomposed_code_points += special_decompose(
                            [decomposed_code_point])
                    if (special_decomposed_code_points
                            == decomposed_code_points[-1]):
                        break
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for index in range(0, len(decomposed_code_points)):
                    if index > 0:
                        translit_file.write(';')
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                    for decomposed_code_point in decomposed_code_points[index]:
                        translit_file.write('{:s}'.format(
                            unicode_utils.ucs_symbol(decomposed_code_point)))
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                translit_file.write('\n')
    translit_file.write('\n')
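# For example, U+00BD VULGAR FRACTION ONE HALF has the decomposition
# '<fraction> 0031 2044 0032'. It is padded with spaces and then run
# through special_decompose; assuming that function maps U+2044 FRACTION
# SLASH to U+002F, the output is:
#
#   % VULGAR FRACTION ONE HALF
#   <U00BD> "<U0020><U0031><U2044><U0032><U0020>";"<U0020><U0031><U002F><U0032><U0020>"
#
# i.e. ' 1⁄2 ' with a plain '/' variant as the fallback.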
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' % class_name)
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line + range_string) > max_column:
                i18n_file.write(line + '/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line + '\n')
    i18n_file.write('\n')
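# output_charclass leans on a code_point_ranges() helper that is not
# shown in this excerpt. A minimal sketch of what it is assumed to do:
# collect the code points satisfying the predicate and group consecutive
# ones into runs (unicode_utils.ucs_symbol_range is likewise assumed to
# join the two endpoint symbols with '..'):
def code_point_ranges(is_class_function):
    '''Sketch (assumption): return runs of consecutive code points for
    which is_class_function is true, each run as [first, ..., last].'''
    cp_ranges = []
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if is_class_function(code_point):
            if cp_ranges and cp_ranges[-1][-1] == code_point - 1:
                cp_ranges[-1].append(code_point)
            else:
                cp_ranges.append([code_point])
    return cp_ranges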
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(set(ocharmap) - set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap) - set(ncharmap)):
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          % len(set(ncharmap) - set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap) - set(ocharmap)):
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
def output_combining_remove(translit_file):
    '''Write the section of the translit_combining file where combining
    characters are replaced by empty strings.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        if is_combining_remove(code_point):
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} ""\n'.format(
                unicode_utils.ucs_symbol(code_point)))
    translit_file.write('\n')
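# Assuming is_combining_remove() is true for ordinary combining marks
# such as U+0300, the emitted entries look like:
#
#   % COMBINING GRAVE ACCENT
#   <U0300> ""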
def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.
    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = 2
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0
    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            del width_dict[key]  # default width is 1
    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key]  # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
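# A toy invocation of this version, with a stubbed unicode_utils module
# and hand-written input lines (all assumptions, purely to show the
# run-length compressed output):
import io
import types

unicode_utils = types.SimpleNamespace(
    ucs_symbol=lambda cp: ('<U{:04X}>' if cp <= 0xFFFF
                           else '<U{:08X}>').format(cp))

out = io.StringIO()
process_width(
    out,
    ulines=['0300;COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;;;;;'],
    elines=['1100..115F;W'],
    plines=[])
print(out.getvalue(), end='')
# Prints these tab-separated lines (the last three ranges come from the
# hard-coded special cases above):
#   <U0300>            0
#   <U1100>...<U115F>  2
#   <U1160>...<U11FF>  0
#   <U3248>...<U324F>  2
#   <U4DC0>...<U4DFF>  2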
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition.startswith('<square>'):
            decomposition = decomposition[9:]
            decomposed_code_points = [[int(x, 16)
                                       for x in decomposition.split(' ')]]
            if decomposed_code_points[0]:
                while True:
                    special_decomposed_code_points = special_decompose(
                        decomposed_code_points[-1])
                    if (special_decomposed_code_points
                            != decomposed_code_points[-1]):
                        decomposed_code_points.append(
                            special_decomposed_code_points)
                        continue
                    special_decomposed_code_points = []
                    for decomposed_code_point in decomposed_code_points[-1]:
                        special_decomposed_code_points += special_decompose(
                            [decomposed_code_point])
                    if (special_decomposed_code_points
                            == decomposed_code_points[-1]):
                        break
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for index in range(0, len(decomposed_code_points)):
                    if index > 0:
                        translit_file.write(';')
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                    for decomposed_code_point in decomposed_code_points[index]:
                        translit_file.write('{:s}'.format(
                            unicode_utils.ucs_symbol(decomposed_code_point)))
                    if len(decomposed_code_points[index]) > 1:
                        translit_file.write('"')
                translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if len(decomposed_code_points) != 1:
                sys.stderr.write(
                    'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
                        code_point, name, decomposition))
                exit(1)
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for decomposed_code_point in decomposed_code_points:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            translit_file.write('\n')
    translit_file.write('\n')
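# Sample entries the two loops above produce, taking U+3382 SQUARE MU A
# (decomposition '<square> 03BC 0041') and U+F900 (decomposition '8C48')
# as inputs, and assuming special_decompose offers a Latin 'u' fallback
# for U+03BC:
#
#   % SQUARE MU A
#   <U3382> "<U03BC><U0041>";"<U0075><U0041>"
#   % CJK COMPATIBILITY IDEOGRAPH-F900
#   <UF900> <U8C48>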
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7] + '>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
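# process_charmap() and process_range() both call convert_to_hex(),
# which is not shown in this excerpt. A plausible sketch, assuming it
# renders the UTF-8 encoding of the code point in the /xNN notation used
# by glibc charmaps (this matches the samples in the docstring above):
def convert_to_hex(code_point):
    '''Sketch: UTF-8 encode the code point and format the bytes as
    /xNN, e.g. 0x3400 -> '/xe3/x90/x80' and 0x10 -> '/x10'.'''
    return ''.join('/x{:02x}'.format(byte)
                   for byte in chr(code_point).encode('utf-8'))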
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the
    new file
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          % len(set(owidth) - set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth) - set(nwidth)):
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          % len(set(nwidth) - set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth) - set(owidth)):
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))