Ejemplo n.º 1
0
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Writes the header line "<map_name> /", then one "(from,to)" pair
    for every key of unicode_utils.UNICODE_ATTRIBUTES which
    map_function maps to a different value.  Pairs are separated by
    ";"; a line is closed with "/" as soon as appending the next pair
    would make it longer than 75 characters.  The section always ends
    with an empty line.

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    indent = '   '
    width_limit = 75
    i18n_file.write('%s /\n' % map_name)
    pending = indent
    for source in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        target = map_function(source)
        if source == target:
            # Identity mappings are not listed.
            continue
        if pending.strip():
            # Something is already on the line: separate the entries.
            pending += ';'
        pair = ''.join((
            '(',
            unicode_utils.ucs_symbol(source),
            ',',
            unicode_utils.ucs_symbol(target),
            ')'))
        if len(pending) + len(pair) > width_limit:
            # Flush the full line with a continuation marker.
            i18n_file.write(pending + '/\n')
            pending = indent
        pending += pair
    if pending.strip():
        i18n_file.write(pending + '\n')
    i18n_file.write('\n')
Ejemplo n.º 2
0
def process_width(outfile, ulines, elines):
    '''Write width entries collected from two sets of input lines.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt.  Both are split on ";".  A line of ulines whose
    field 4 is "NSM" or whose field 2 is "Cf" yields a width 0 entry.
    Every line of elines yields a width 2 entry, either for a single
    code point or for a "first..last" range, and replaces any entries
    already stored for the code points it covers.  The entries are
    written to outfile sorted by (first) code point, one per line.
    '''
    widths = {}
    for record in ulines:
        columns = record.split(';')
        if columns[4] != 'NSM' and columns[2] != 'Cf':
            continue
        code_point = int(columns[0], 16)
        widths[code_point] = unicode_utils.ucs_symbol(code_point) + '\t0'

    for record in elines:
        # If an entry in EastAsianWidth.txt is found, it overrides entries
        # in UnicodeData.txt:
        spec = record.split(';')[0]
        if '..' in spec:
            bounds = spec.split('..')
            first = int(bounds[0], 16)
            last = int(bounds[1], 16)
            # Drop everything stored inside the range, then store one
            # entry for the whole range under its first code point.
            for covered in range(first, last + 1):
                widths.pop(covered, None)
            widths[first] = '{:s}...{:s}\t2'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last))
        else:
            code_point = int(spec, 16)
            widths[code_point] = unicode_utils.ucs_symbol(code_point) + '\t2'

    for code_point in sorted(widths):
        outfile.write(widths[code_point] + '\n')
Ejemplo n.º 3
0
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    After the "<map_name> /" header, every key of
    unicode_utils.UNICODE_ATTRIBUTES that map_function changes is
    written as a "(from,to)" pair.  Pairs are joined with ";" and
    lines longer than 75 characters are avoided by breaking with a
    trailing "/".  An empty line terminates the section.

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    column_limit = 75
    margin = '   '
    i18n_file.write('%s /\n' % map_name)
    # Lazily pair each code point with its image under map_function.
    mappings = ((code_point, map_function(code_point))
                for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES))
    buffered = margin
    for code_point, mapped in mappings:
        if code_point == mapped:
            continue
        if buffered.strip():
            buffered += ';'
        source_symbol = unicode_utils.ucs_symbol(code_point)
        target_symbol = unicode_utils.ucs_symbol(mapped)
        entry = '(' + source_symbol + ',' + target_symbol + ')'
        if len(buffered + entry) > column_limit:
            # The entry does not fit any more: emit a continued line.
            i18n_file.write(buffered + '/\n')
            buffered = margin
        buffered += entry
    if buffered.strip():
        i18n_file.write(buffered + '\n')
    i18n_file.write('\n')
Ejemplo n.º 4
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    For every entry of unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with "<font>", one line is written: the symbol
    of the code point, the symbols of the decomposition (in double
    quotes when there is more than one) and the character name as a
    trailing "% name" comment.  The output is framed by empty lines.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        entry = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = entry['name']
        decomposition = entry['decomposition']
        if not decomposition.startswith('<font>'):
            continue
        # Skip the "<font> " tag (7 characters) and parse the hex values.
        alternatives = [[int(hex_value, 16)
                         for hex_value in decomposition[7:].split(' ')]]
        if not alternatives[0]:
            continue
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        for position, sequence in enumerate(alternatives):
            if position > 0:
                translit_file.write(';')
            needs_quotes = len(sequence) > 1
            if needs_quotes:
                translit_file.write('"')
            for member in sequence:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(member)))
            if needs_quotes:
                translit_file.write('"')
        translit_file.write(' % {:s}\n'.format(name))
    translit_file.write('\n')
Ejemplo n.º 5
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Only characters whose decomposition is tagged "<font>" are handled.
    Each produces a line of the form

        <symbol> <replacement> % <name>

    where a replacement of more than one code point is wrapped in
    double quotes.  An empty line is written before and after.
    '''
    def write_sequence(sequence):
        # Emit one replacement, quoted only if it has several symbols.
        multiple = len(sequence) > 1
        if multiple:
            translit_file.write('"')
        for value in sequence:
            translit_file.write('{:s}'.format(
                unicode_utils.ucs_symbol(value)))
        if multiple:
            translit_file.write('"')

    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        attributes = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = attributes['name']
        decomposition = attributes['decomposition']
        if decomposition.startswith('<font>'):
            # Drop the tag and the blank following it.
            remainder = decomposition[7:]
            replacements = [[int(part, 16) for part in remainder.split(' ')]]
            if replacements[0]:
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for number, replacement in enumerate(replacements):
                    if number > 0:
                        translit_file.write(';')
                    write_sequence(replacement)
                translit_file.write(' % {:s}\n'.format(name))
    translit_file.write('\n')
Ejemplo n.º 6
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    For each key of unicode_utils.UNICODE_ATTRIBUTES a list of
    alternative replacements is built, starting with the result of
    compatibility_decompose() and extended by repeatedly applying
    special_decompose() until nothing changes any more.  If that yields
    a replacement, a "% name" comment line and a line with the symbol
    and all alternatives (each in double quotes, separated by ";") are
    written.  Otherwise non-Arabic ligatures are passed to
    special_ligature_decompose(); a ligature that comes back unchanged
    is reported with a warning on standard output.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        alternatives = [compatibility_decompose(code_point)]
        if alternatives[0]:
            while True:
                latest = alternatives[-1]
                # First try to decompose the latest sequence as a whole ...
                as_whole = special_decompose(latest)
                if as_whole != latest:
                    alternatives.append(as_whole)
                    continue
                # ... then one code point at a time.
                per_item = []
                for item in latest:
                    per_item += special_decompose([item])
                if per_item == latest:
                    break
                alternatives.append(per_item)
        elif special_decompose([code_point]) != [code_point]:
            alternatives[0] = special_decompose([code_point])

        if alternatives[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for position, sequence in enumerate(alternatives):
                if position > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for item in sequence:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(item)))
                translit_file.write('"')
            translit_file.write('\n')
            continue

        if 'LIGATURE' not in name or 'ARABIC' in name:
            continue
        ligature_parts = special_ligature_decompose(code_point)
        if ligature_parts[0] == code_point:
            print('Warning: unhandled ligature: {:x} {:s}'.format(
                code_point, name))
            continue
        translit_file.write('% {:s}\n'.format(name))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        translit_file.write('"')
        for item in ligature_parts:
            translit_file.write('{:s}'.format(
                unicode_utils.ucs_symbol(item)))
        translit_file.write('"')
        translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 7
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Every key of unicode_utils.UNICODE_ATTRIBUTES is run through
    compatibility_decompose(); the result is refined with
    special_decompose() (on the whole sequence and then per code
    point) until a fixed point is reached, each intermediate result
    being kept as a further alternative.  Characters with at least one
    replacement get a "% name" comment and a rule line listing all
    alternatives in double quotes separated by ";".  Characters without
    one whose name contains "LIGATURE" but not "ARABIC" are handed to
    special_ligature_decompose(); if that returns the character itself
    a warning is printed instead of a rule.
    '''
    def write_rule_start(code_point, name):
        # Comment line with the name, then the symbol being replaced.
        translit_file.write('% {:s}\n'.format(name))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))

    def write_quoted(sequence):
        # A replacement is always quoted here, whatever its length.
        translit_file.write('"')
        for value in sequence:
            translit_file.write('{:s}'.format(
                unicode_utils.ucs_symbol(value)))
        translit_file.write('"')

    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        results = [compatibility_decompose(code_point)]
        if not results[0]:
            if special_decompose([code_point]) != [code_point]:
                results[0] = special_decompose([code_point])
        else:
            finished = False
            while not finished:
                previous = results[-1]
                refined = special_decompose(previous)
                if refined == previous:
                    # The whole sequence is stable; try its members.
                    refined = []
                    for member in previous:
                        refined += special_decompose([member])
                    finished = refined == previous
                if not finished:
                    results.append(refined)
        if results[0]:
            write_rule_start(code_point, name)
            for number, sequence in enumerate(results):
                if number > 0:
                    translit_file.write(';')
                write_quoted(sequence)
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            components = special_ligature_decompose(code_point)
            if components[0] != code_point:
                write_rule_start(code_point, name)
                write_quoted(components)
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
Ejemplo n.º 8
0
def process_width(outfile, ulines, elines):
    '''Write width entries collected from two sets of input lines.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt.  Every code point named by elines (singly or
    as a "first..last" range) first gets a width 2 entry.  Lines of
    ulines whose field 4 is "NSM" or whose field 2 is "Cf", "Me" or
    "Mn" then set (or overwrite with) a width 0 entry.  Finally a few
    hard-coded blocks are replaced by range entries for compatibility
    and everything is written to outfile in code point order.
    '''
    widths = {}
    for record in elines:
        spec = record.split(';')[0]
        bounds = spec.split('..') if '..' in spec else (spec, spec)
        for code_point in range(int(bounds[0], 16), int(bounds[1], 16) + 1):
            widths[code_point] = unicode_utils.ucs_symbol(code_point) + '\t2'
    for record in ulines:
        columns = record.split(';')
        if columns[4] != 'NSM' and columns[2] not in ('Cf', 'Me', 'Mn'):
            continue
        code_point = int(columns[0], 16)
        widths[code_point] = unicode_utils.ucs_symbol(code_point) + '\t0'

    # handle special cases for compatibility
    dropped_blocks = (
        range(0x1160, 0x1200),
        range(0x3248, 0x3250),
        range(0x4DC0, 0x4E00),
        (0x00AD,),
    )
    for block in dropped_blocks:
        for code_point in block:
            widths.pop(code_point, None)
    # (first, last, template) of the range entries replacing those blocks.
    fixed_ranges = (
        (0x1160, 0x11FF, '{:s}...{:s}\t0'),
        (0x3248, 0x324F, '{:s}...{:s}\t2'),
        (0x4DC0, 0x4DFF, '{:s}...{:s}\t2'),
    )
    for first, last, template in fixed_ranges:
        widths[first] = template.format(
            unicode_utils.ucs_symbol(first), unicode_utils.ucs_symbol(last))

    for code_point in sorted(widths):
        outfile.write(widths[code_point] + '\n')
Ejemplo n.º 9
0
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start and end are the first and last code point of the range as
    hexadecimal strings (they are parsed with int(x, 16)), outfile is
    the file object written to and name is the name of the range.

    If name contains "Hangul Syllable", one line is written for every
    code point of the range, with the syllable name assembled from the
    JAMO_*_SHORT_NAME tables.  Any other range is written in chunks of
    at most 64 code points; the last chunk always ends exactly at end,
    so the final code point is covered even when it starts a new chunk.
    '''
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <*****@*****.**>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(first, last + 1):
            # Split the offset from 0xAC00 into the three name table
            # indexes: 28 finals per medial, 21 medials per initial.
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    #
    # The stop value of range() is exclusive, so iterate up to last + 1;
    # otherwise the final code point is lost whenever it would begin a
    # chunk of its own (last - first being a multiple of 64).
    for i in range(first, last + 1, 64):
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            # Clamp the final chunk to the end of the range.
            unicode_utils.ucs_symbol(min(i + 63, last)),
            convert_to_hex(i),
            name))
Ejemplo n.º 10
0
def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where
    characters are decomposed and combining characters stripped from
    the decompositions.

    For every key of unicode_utils.UNICODE_ATTRIBUTES the first
    replacement is special_decompose() of the character if that changes
    it, else canonical_decompose().  Further alternatives are added by
    re-applying special_decompose() until nothing changes; afterwards
    all code points for which is_combining_remove() is true are
    dropped.  Characters whose first replacement is not empty get a
    "% name" comment and a rule line; alternatives are separated by
    ";" and quoted when longer than one code point.
    '''
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if special_decompose([code_point]) != [code_point]:
            alternatives = [special_decompose([code_point])]
        else:
            alternatives = [canonical_decompose(code_point)]
        if not alternatives[0]:
            continue
        while True:
            latest = alternatives[-1]
            # Decompose the latest sequence as a whole first ...
            as_whole = special_decompose(latest)
            if as_whole != latest:
                alternatives.append(as_whole)
                continue
            # ... and code point by code point if that changed nothing.
            per_item = []
            for item in latest:
                per_item += special_decompose([item])
            if per_item == latest:
                break
            alternatives.append(per_item)
        # Strip the combining characters from every alternative.
        alternatives = [
            [item for item in sequence if not is_combining_remove(item)]
            for sequence in alternatives]
        if not alternatives[0]:
            continue
        translit_file.write('% {:s}\n'.format(
            unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        for position, sequence in enumerate(alternatives):
            if position > 0:
                translit_file.write(';')
            needs_quotes = len(sequence) > 1
            if needs_quotes:
                translit_file.write('"')
            for item in sequence:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(item)))
            if needs_quotes:
                translit_file.write('"')
        translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 11
0
def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where
    characters are decomposed and combining characters stripped from
    the decompositions.

    Each key of unicode_utils.UNICODE_ATTRIBUTES is decomposed with
    special_decompose() or, if that leaves it unchanged, with
    canonical_decompose().  The result is refined with
    special_decompose() until stable, every step being kept as an
    alternative, and code points accepted by is_combining_remove() are
    filtered out.  A rule preceded by a "% name" comment is written
    whenever the first alternative is not empty.
    '''
    def write_alternative(sequence):
        # Quote a replacement only if it consists of several symbols.
        multiple = len(sequence) > 1
        if multiple:
            translit_file.write('"')
        for value in sequence:
            translit_file.write('{:s}'.format(
                unicode_utils.ucs_symbol(value)))
        if multiple:
            translit_file.write('"')

    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if special_decompose([code_point]) == [code_point]:
            results = [canonical_decompose(code_point)]
        else:
            results = [special_decompose([code_point])]
        if results[0]:
            finished = False
            while not finished:
                previous = results[-1]
                refined = special_decompose(previous)
                if refined == previous:
                    # Stable as a whole: retry member by member.
                    refined = []
                    for member in previous:
                        refined += special_decompose([member])
                    finished = refined == previous
                if not finished:
                    results.append(refined)
            for number, sequence in enumerate(results):
                results[number] = [
                    member for member in sequence
                    if not is_combining_remove(member)]
        if results[0]:
            translit_file.write('% {:s}\n'.format(
                unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for number, sequence in enumerate(results):
                if number > 0:
                    translit_file.write(';')
                write_alternative(sequence)
            translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 12
0
def output_transliteration(translit_file):
    """Write the new transliteration to the output file

    Handles the entries of unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with "<circle>": a "% name" comment line is
    followed by a rule replacing the character with its decomposition
    enclosed between <U0028> and <U0029>, all inside double quotes.
    An empty line is written before and after the rules.
    """
    translit_file.write("\n")
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        entry = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = entry["name"]
        decomposition = entry["decomposition"]
        if not decomposition.startswith("<circle>"):
            continue
        # Cut off the "<circle> " tag (9 characters) before parsing.
        enclosed = [int(part, 16) for part in decomposition[9:].split(" ")]
        translit_file.write("% {:s}\n".format(name))
        translit_file.write(
            '{:s} "<U0028>'.format(unicode_utils.ucs_symbol(code_point))
        )
        for value in enclosed:
            translit_file.write("{:s}".format(unicode_utils.ucs_symbol(value)))
        translit_file.write('<U0029>"\n')
    translit_file.write("\n")
Ejemplo n.º 13
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Handles the entries of unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with "<fraction>".  The decomposition is
    surrounded by spaces (0x0020) and then refined with
    special_decompose() until it no longer changes, keeping every step
    as an alternative.  A "% name" comment and a rule line are written
    per character; alternatives are separated by ";" and quoted when
    they contain more than one code point.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        entry = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = entry['name']
        decomposition = entry['decomposition']
        if not decomposition.startswith('<fraction>'):
            continue
        # Skip the "<fraction> " tag (11 characters).
        parsed = [int(part, 16) for part in decomposition[11:].split(' ')]
        if not parsed:
            continue
        alternatives = [[0x0020] + parsed + [0x0020]]
        while True:
            latest = alternatives[-1]
            # Whole sequence first, single code points afterwards.
            as_whole = special_decompose(latest)
            if as_whole != latest:
                alternatives.append(as_whole)
                continue
            per_item = []
            for item in latest:
                per_item += special_decompose([item])
            if per_item == latest:
                break
            alternatives.append(per_item)
        translit_file.write('% {:s}\n'.format(name))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        for position, sequence in enumerate(alternatives):
            if position > 0:
                translit_file.write(';')
            needs_quotes = len(sequence) > 1
            if needs_quotes:
                translit_file.write('"')
            for item in sequence:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(item)))
            if needs_quotes:
                translit_file.write('"')
        translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 14
0
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    The ranges come from code_point_ranges(is_class_function); nothing
    at all is written when that result is empty.  Otherwise the header
    "<class_name> /" is followed by the ranges separated by ";", with
    lines broken by a trailing "/" before they would exceed 75
    characters, and by a final empty line.

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if not cp_ranges:
        return
    indent = '   '
    width_limit = 75
    i18n_file.write('%s /\n' % class_name)
    pending = indent
    for members in cp_ranges:
        if pending.strip():
            # Something is already on the line: separate the entries.
            pending += ';'
        if len(members) == 1:
            item = unicode_utils.ucs_symbol(members[0])
        else:
            item = unicode_utils.ucs_symbol_range(members[0], members[-1])
        if len(pending) + len(item) > width_limit:
            # Flush the full line with a continuation marker.
            i18n_file.write(pending + '/\n')
            pending = indent
        pending += item
    if pending.strip():
        i18n_file.write(pending + '\n')
    i18n_file.write('\n')
Ejemplo n.º 15
0
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    code_point_ranges(is_class_function) supplies the ranges.  If there
    are none, no output is produced.  Otherwise "<class_name> /" is
    written, then every range (a single symbol for a one-element range,
    a symbol range otherwise) separated by ";" on lines of at most 75
    characters continued with "/", then an empty line.

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    def render(code_point_range):
        # One-element ranges are written as a plain symbol.
        if len(code_point_range) == 1:
            return unicode_utils.ucs_symbol(code_point_range[0])
        return unicode_utils.ucs_symbol_range(
            code_point_range[0], code_point_range[-1])

    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        column_limit = 75
        margin = '   '
        i18n_file.write('%s /\n' % class_name)
        buffered = margin
        for code_point_range in cp_ranges:
            if buffered.strip():
                buffered += ';'
            rendered = render(code_point_range)
            if len(buffered + rendered) > column_limit:
                # The entry does not fit any more: emit a continued line.
                i18n_file.write(buffered + '/\n')
                buffered = margin
            buffered += rendered
        if buffered.strip():
            i18n_file.write(buffered + '\n')
        i18n_file.write('\n')
Ejemplo n.º 16
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Only characters whose decomposition is tagged "<fraction>" are
    handled.  Their decomposition is padded with a space (0x0020) on
    both sides; special_decompose() is then applied to the whole
    sequence and to its members until the result is stable, each step
    adding an alternative.  Every character gets a "% name" comment
    line and a rule line with the alternatives separated by ";".
    '''
    def write_alternative(sequence):
        # Quote a replacement only if it consists of several symbols.
        multiple = len(sequence) > 1
        if multiple:
            translit_file.write('"')
        for value in sequence:
            translit_file.write('{:s}'.format(
                unicode_utils.ucs_symbol(value)))
        if multiple:
            translit_file.write('"')

    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        attributes = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = attributes['name']
        decomposition = attributes['decomposition']
        if decomposition.startswith('<fraction>'):
            # Drop the tag and the blank following it.
            remainder = decomposition[11:]
            results = [[int(part, 16) for part in remainder.split(' ')]]
            if results[0]:
                results[0] = [0x0020] + results[0] + [0x0020]
                finished = False
                while not finished:
                    previous = results[-1]
                    refined = special_decompose(previous)
                    if refined == previous:
                        # Stable as a whole: retry member by member.
                        refined = []
                        for member in previous:
                            refined += special_decompose([member])
                        finished = refined == previous
                    if not finished:
                        results.append(refined)
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for number, sequence in enumerate(results):
                    if number > 0:
                        translit_file.write(';')
                    write_alternative(sequence)
                translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 17
0
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Both files are read with create_charmap_dictionary().  The totals
    of removed, changed and added keys are printed; the individual
    characters are listed as well when the corresponding ARGS option
    (show_missing_characters, show_changed_characters,
    show_added_characters) is set.
    '''
    def character_name(key):
        # Name from UNICODE_ATTRIBUTES, or the text 'None' if unknown.
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[key]['name']
        return 'None'

    separator = '------------------------------------------------------------'
    print('************************************************************')
    print('Report on CHARMAP:')
    old_map = create_charmap_dictionary(original_file_name)
    new_map = create_charmap_dictionary(new_file_name)

    print(separator)
    removed_keys = set(old_map) - set(new_map)
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed_keys))
    if ARGS.show_missing_characters:
        for key in sorted(removed_keys):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                old_map[key],
                character_name(key)))

    print(separator)
    # Keys present in both files whose values differ: key -> (old, new).
    changed = {
        key: (old_map[key], new_map[key])
        for key in set(old_map) & set(new_map)
        if old_map[key] != new_map[key]}
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for key in sorted(changed):
            before, after = changed[key]
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                before,
                after,
                character_name(key)))

    print(separator)
    added_keys = set(new_map) - set(old_map)
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added_keys))
    if ARGS.show_added_characters:
        for key in sorted(added_keys):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                new_map[key],
                character_name(key)))
Ejemplo n.º 18
0
def output_transliteration(translit_file):
    '''Write transliteration rules for all “<circle>” decompositions.

    Every code point in unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with “<circle>” produces two lines: a “%”
    comment line holding the character name, and a rule mapping the
    character to the quoted string <U0028> + decomposed code points +
    <U0029>.  The section is preceded and followed by an empty line.

    translit_file is the writable text file object receiving the output.
    '''
    attributes = unicode_utils.UNICODE_ATTRIBUTES
    to_symbol = unicode_utils.ucs_symbol
    emit = translit_file.write
    emit('\n')
    for code_point in sorted(attributes):
        entry = attributes[code_point]
        name = entry['name']
        decomposition = entry['decomposition']
        if not decomposition.startswith('<circle>'):
            continue
        # Skip the 8 character tag plus the blank following it; what
        # remains is a space separated list of hexadecimal code points.
        members = [int(hex_value, 16)
                   for hex_value in decomposition[9:].split(' ')]
        emit('% {:s}\n'.format(name))
        emit('{:s} "<U0028>'.format(to_symbol(code_point)))
        for member in members:
            emit('{:s}'.format(to_symbol(member)))
        emit('<U0029>"\n')
    emit('\n')
Ejemplo n.º 19
0
def check_charmap(original_file_name, new_file_name):
    '''Print a report on how the CHARMAP section of the new file differs
    from the one in the original file.

    Both files are read with create_charmap_dictionary().  The report
    gives the number of removed, changed and added keys; the individual
    entries are only listed when the matching ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    def character_name(key):
        '''Return the name stored for “key”, or 'None' if it is unknown.'''
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[key]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    old_keys = set(ocharmap)
    new_keys = set(ncharmap)

    # Keys which exist only in the original file.
    removed_keys = old_keys - new_keys
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed_keys))
    if ARGS.show_missing_characters:
        for key in sorted(removed_keys):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                character_name(key)))

    # Keys present in both files but mapped to different values.
    changed_charmap = {
        key: (ocharmap[key], ncharmap[key])
        for key in old_keys & new_keys
        if ocharmap[key] != ncharmap[key]
    }
    print('------------------------------------------------------------')
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            old_value, new_value = changed_charmap[key]
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                old_value,
                new_value,
                character_name(key)))

    # Keys which exist only in the new file.
    added_keys = new_keys - old_keys
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added_keys))
    if ARGS.show_added_characters:
        for key in sorted(added_keys):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                character_name(key)))
Ejemplo n.º 20
0
def output_combining_remove(translit_file):
    """Write the rules which replace combining characters by nothing.

    For every code point in unicode_utils.UNICODE_ATTRIBUTES accepted by
    is_combining_remove(), a "%" comment line with the character name is
    written, followed by a rule mapping the character to the empty
    string.  The section is preceded and followed by an empty line.

    translit_file is the writable text file object receiving the output.
    """
    attributes = unicode_utils.UNICODE_ATTRIBUTES
    emit = translit_file.write
    emit("\n")
    for code_point in sorted(attributes):
        name = attributes[code_point]["name"]
        if not is_combining_remove(code_point):
            continue
        emit("% {:s}\n".format(name))
        emit('{:s} ""\n'.format(unicode_utils.ucs_symbol(code_point)))
    emit("\n")
Ejemplo n.º 21
0
def output_combining_remove(translit_file):
    '''Write the translit_combining section which maps combining
    characters to empty strings.

    All code points of unicode_utils.UNICODE_ATTRIBUTES are visited in
    ascending order.  Each one accepted by is_combining_remove() yields
    a “%” comment line with its name and a rule of the form
    “<symbol> ""”.  An empty line is written before and after the
    section.
    '''
    translit_file.write('\n')
    for code_point, name in (
            (key, unicode_utils.UNICODE_ATTRIBUTES[key]['name'])
            for key in sorted(unicode_utils.UNICODE_ATTRIBUTES)):
        if is_combining_remove(code_point):
            symbol = unicode_utils.ucs_symbol(code_point)
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} ""\n'.format(symbol))
    translit_file.write('\n')
Ejemplo n.º 22
0
def output_transliteration(translit_file):
    """Write transliteration rules for all "<font>" decompositions.

    Every code point in unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with "<font>" yields one rule line: the
    character, its decomposition (enclosed in double quotes when it
    consists of more than one code point) and a trailing "%" comment
    with the character name.  The section is preceded and followed by
    an empty line.

    translit_file is the writable text file object receiving the output.
    """
    attributes = unicode_utils.UNICODE_ATTRIBUTES
    to_symbol = unicode_utils.ucs_symbol
    emit = translit_file.write
    emit("\n")
    for code_point in sorted(attributes):
        entry = attributes[code_point]
        name = entry["name"]
        decomposition = entry["decomposition"]
        if not decomposition.startswith("<font>"):
            continue
        # Skip the 6 character tag plus the blank following it.  The
        # result is kept as a list of alternatives (only one here), each
        # alternative being a list of code points.
        alternatives = [[int(token, 16) for token in decomposition[7:].split(" ")]]
        if not alternatives[0]:
            continue
        emit("{:s} ".format(to_symbol(code_point)))
        for position, sequence in enumerate(alternatives):
            if position > 0:
                emit(";")
            needs_quotes = len(sequence) > 1
            if needs_quotes:
                emit('"')
            for member in sequence:
                emit("{:s}".format(to_symbol(member)))
            if needs_quotes:
                emit('"')
        emit(" % {:s}\n".format(name))
    emit("\n")
Ejemplo n.º 23
0
def _width_code_point_range(field):
    '''Return the range of code points described by “field”.

    “field” is the first “;” separated column of a data file line.  It
    holds either a single hexadecimal code point (“00AD”) or an
    inclusive range written as “first..last” (“0600..0605”).
    '''
    first, separator, last = field.partition('..')
    if not separator:
        last = first
    return range(int(first, 16), int(last, 16) + 1)


def process_width(outfile, ulines, elines, plines):
    '''Write the WIDTH entries computed from the Unicode data files.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    Characters from elines get width 2.  Characters from ulines with
    bidi class “NSM” or general category “Cf”, “Me” or “Mn” get
    width 0 (overriding elines).  Characters from plines are removed
    again so that they fall back to the default width 1.  A few
    hard-coded ranges are adjusted afterwards for compatibility.

    Consecutive code points of equal width are merged and written to
    outfile as “<first>...<last>\\t<width>”; isolated code points are
    written as “<symbol>\\t<width>”.
    '''
    width_dict = {}
    for line in elines:
        for key in _width_code_point_range(line.split(";")[0]):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        for key in _width_code_point_range(line.split(";")[0]):
            # pop() instead of del: a character which was never entered
            # already has the default width 1 and must not raise KeyError.
            width_dict.pop(key, None)

    # handle special cases for compatibility
    # https://www.cs.tut.fi/~jkorpela/shy.html
    width_dict.pop(0x00AD, None)  # default width is 1
    for key in range(0x1160, 0x1200):
        width_dict[key] = 0
    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Group consecutive code points which share the same width.
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        first, last = same_width_list[0], same_width_list[-1]
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                width_dict[first]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last),
                width_dict[first]))
Ejemplo n.º 24
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file.

    Two groups of rules are written, surrounded by empty lines:

    1. For every character whose decomposition starts with “<square>”,
       a “%” comment line with the character name followed by a rule
       listing the plain decomposition and every further variant
       obtained from special_decompose(), separated by “;”.  Variants
       consisting of more than one code point are enclosed in double
       quotes.
    2. For every character whose name starts with
       “CJK COMPATIBILITY IDEOGRAPH” and which has a decomposition, a
       comment line and a rule mapping it to the decomposed code point.

    Raises SystemExit with status 1 (after a message on stderr) if such
    an ideograph decomposes into more than one code point.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[code_point][
            'decomposition']
        if decomposition.startswith('<square>'):
            # Skip the 8 character tag plus the blank following it.
            decomposition = decomposition[9:]
            # List of variants, each variant is a list of code points.
            decomposed_code_points = [[
                int(x, 16) for x in decomposition.split(' ')
            ]]
            if decomposed_code_points[0]:
                while True:
                    # First try to decompose the latest variant as a whole.
                    special_decomposed_code_points = special_decompose(
                        decomposed_code_points[-1])
                    if (special_decomposed_code_points !=
                            decomposed_code_points[-1]):
                        decomposed_code_points.append(
                            special_decomposed_code_points)
                        continue
                    # Then try it code point by code point; stop as soon
                    # as that does not yield anything new either.
                    special_decomposed_code_points = []
                    for decomposed_code_point in decomposed_code_points[-1]:
                        special_decomposed_code_points += special_decompose(
                            [decomposed_code_point])
                    if (special_decomposed_code_points ==
                            decomposed_code_points[-1]):
                        break
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for index, variant in enumerate(decomposed_code_points):
                    if index > 0:
                        translit_file.write(';')
                    if len(variant) > 1:
                        translit_file.write('"')
                    for decomposed_code_point in variant:
                        translit_file.write('{:s}'.format(
                            unicode_utils.ucs_symbol(decomposed_code_point)))
                    if len(variant) > 1:
                        translit_file.write('"')
                translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[code_point][
            'decomposition']
        if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
            decomposed_code_points = [
                int(x, 16) for x in decomposition.split(' ')
            ]
            if len(decomposed_code_points) != 1:
                sys.stderr.write(
                    'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
                        code_point, name, decomposition))
                # sys.exit() instead of the builtin exit(): the latter is
                # only injected by the site module for interactive use.
                sys.exit(1)
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for decomposed_code_point in decomposed_code_points:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 25
0
def process_charmap(flines, outfile):
    '''Write the body of the CHARMAP section of the UTF-8 charmap file.

    flines is an array containing *all* lines of UnicodeData.txt; the
    generated lines are written to outfile in the form used between

    CHARMAP
    …
    END CHARMAP

    in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    # Fields of the pending “…, First>” line, empty if there is none.
    range_start_fields = []
    for line in flines:
        fields = line.split(";")
        code_point_hex = fields[0]
        name = fields[1]
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (fields[10]) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if name == "<control>" and fields[10]:
            name = fields[10]
        is_surrogate = 'Surrogate,' in name
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if not is_surrogate and name.endswith(', First>'):
            range_start_fields = fields
        elif not is_surrogate and name.endswith(', Last>'):
            # Strip “, Last>” from the name and close the bracket again.
            process_range(range_start_fields[0], code_point_hex,
                          outfile, name[:-7]+'>')
            range_start_fields = []
        else:
            range_start_fields = []
            if is_surrogate:
                # Comment out the surrogates in the UTF-8 file.
                # One could of course skip them completely but
                # the original UTF-8 file in glibc had them as
                # comments, so we keep these comment lines.
                outfile.write('%')
            code_point = int(code_point_hex, 16)
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(code_point),
                convert_to_hex(code_point),
                name))
Ejemplo n.º 26
0
def process_charmap(flines, outfile):
    '''Convert the lines of UnicodeData.txt into CHARMAP lines.

    flines must contain *all* lines of UnicodeData.txt.  The result is
    written to outfile and has the format used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    pending_first = []  # fields of an open “…, First>” line, if any
    for line in flines:
        fields = line.split(";")
        label = fields[1]
        # Characters named “<control>” get their “Unicode 1.0 Name”
        # (fields[10]) instead, provided there is one.  U+0080, U+0081,
        # U+0084 and U+0099 have none; their alternate names could be
        # taken from NameAliases.txt.
        if label == "<control>" and fields[10]:
            label = fields[10]
        surrogate = 'Surrogate,' in label

        # A “First>”/“Last>” pair such as
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # describes a whole range, unless it is a surrogate range.
        if label.endswith(', First>') and not surrogate:
            pending_first = fields
            continue
        if label.endswith(', Last>') and not surrogate:
            range_label = label[:-7] + '>'
            process_range(pending_first[0], fields[0], outfile, range_label)
            pending_first = []
            continue

        pending_first = []
        if surrogate:
            # Surrogates are kept, but only as comment lines, because
            # the original UTF-8 file in glibc had them as comments.
            outfile.write('%')
        value = int(fields[0], 16)
        symbol = unicode_utils.ucs_symbol(value)
        utf8_hex = convert_to_hex(value)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(symbol, utf8_hex, label))
Ejemplo n.º 27
0
def check_width(original_file_name, new_file_name):
    '''Print a report on how the WIDTH section of the new file differs
    from the one in the original file.

    Both files are read with create_width_dictionary().  The report
    gives the number of removed, changed and added keys.  The individual
    characters, together with their East Asian width, category, bidi
    class and name, are only listed when the matching
    ARGS.show_missing_characters, ARGS.show_changed_characters or
    ARGS.show_added_characters option is set.
    '''
    def properties(key):
        '''Return the “eaw=… category=… bidi=… name=…” part of a line.'''
        if key in unicode_utils.EAST_ASIAN_WIDTHS:
            eaw = unicode_utils.EAST_ASIAN_WIDTHS[key]
        else:
            eaw = 'None'
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            attributes = unicode_utils.UNICODE_ATTRIBUTES[key]
            category = attributes['category']
            bidi = attributes['bidi']
            name = attributes['name']
        else:
            category = bidi = name = 'None'
        return ('eaw={:s} '.format(eaw)
                + 'category={:2s} '.format(category)
                + 'bidi={:3s} '.format(bidi)
                + 'name={:s}'.format(name))

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    old_keys = set(owidth)
    new_keys = set(nwidth)

    # Keys which exist only in the original file.
    removed_keys = old_keys - new_keys
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed_keys))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(removed_keys):
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + properties(key))

    # Keys present in both files but with different widths.
    changed_width = {
        key: (owidth[key], nwidth[key])
        for key in old_keys & new_keys
        if owidth[key] != nwidth[key]
    }
    print('------------------------------------------------------------')
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            old_width, new_width = changed_width[key]
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(old_width, new_width)
                  + properties(key))

    # Keys which exist only in the new file.
    added_keys = new_keys - old_keys
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          % len(added_keys))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(added_keys):
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + properties(key))
Ejemplo n.º 28
0
def check_width(original_file_name, new_file_name):
    '''Compare the WIDTH sections of the old and the new file and print
    the differences.

    The sections are loaded with create_width_dictionary().  Three
    totals are printed (removed, changed, added).  Per-character lines
    are printed only if ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters,
    respectively, is set.
    '''
    def attribute(key, field):
        '''Return UNICODE_ATTRIBUTES[key][field], 'None' for unknown keys.'''
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[key][field]
        return 'None'

    def report_line(prefix, widths, key):
        '''Join the prefix, the width text and the properties of “key”.'''
        if key in unicode_utils.EAST_ASIAN_WIDTHS:
            east_asian_width = unicode_utils.EAST_ASIAN_WIDTHS[key]
        else:
            east_asian_width = 'None'
        return ''.join([
            prefix.format(unicode_utils.ucs_symbol(key)),
            widths,
            'eaw={:s} '.format(east_asian_width),
            'category={:2s} '.format(attribute(key, 'category')),
            'bidi={:3s} '.format(attribute(key, 'bidi')),
            'name={:s}'.format(attribute(key, 'name')),
        ])

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)

    only_old = set(owidth) - set(nwidth)
    only_new = set(nwidth) - set(owidth)

    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d' %
          len(only_old))
    print('(Characters not in WIDTH get width 1 by default, ' +
          'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(only_old):
            print(report_line('removed: {:s} ',
                              '{:d} : '.format(owidth[key]), key))

    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth) & set(nwidth):
        before, after = owidth[key], nwidth[key]
        if before != after:
            changed_width[key] = (before, after)
    print('Total changed characters in newly generated WIDTH: %d' %
          len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print(report_line('changed width: {:s} ',
                              '{:d}->{:d} : '.format(changed_width[key][0],
                                                     changed_width[key][1]),
                              key))

    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d' %
          len(only_new))
    print('(Characters not in WIDTH get width 1 by default, ' +
          'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(only_new):
            print(report_line('added: {:s} ',
                              '{:d} : '.format(nwidth[key]), key))
Ejemplo n.º 29
0
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file.

    The output starts and ends with an empty line and consists of two
    parts:

    * Characters whose decomposition starts with “<square>”: a “%”
      comment line with the name, then a rule listing the plain
      decomposition followed by all further variants which
      special_decompose() produces, separated by “;”.  A variant made
      of more than one code point is enclosed in double quotes.
    * Characters whose name starts with “CJK COMPATIBILITY IDEOGRAPH”
      and which have a decomposition: a comment line and a rule mapping
      the character to the decomposed code point.

    Raises SystemExit with status 1 (after a message on stderr) if such
    an ideograph decomposes into more than one code point.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if not decomposition.startswith('<square>'):
            continue
        # Skip the 8 character tag plus the blank following it.
        decomposition = decomposition[9:]
        # List of variants, each variant is a list of code points.
        decomposed_code_points = [[int(x, 16)
                                   for x in decomposition.split(' ')]]
        if not decomposed_code_points[0]:
            continue
        while True:
            # First try to decompose the latest variant as a whole.
            special_decomposed_code_points = special_decompose(
                decomposed_code_points[-1])
            if (special_decomposed_code_points
                    != decomposed_code_points[-1]):
                decomposed_code_points.append(
                    special_decomposed_code_points)
                continue
            # Then try it code point by code point; stop as soon as
            # that does not yield anything new either.
            special_decomposed_code_points = []
            for decomposed_code_point in decomposed_code_points[-1]:
                special_decomposed_code_points += special_decompose(
                    [decomposed_code_point])
            if (special_decomposed_code_points
                    == decomposed_code_points[-1]):
                break
            decomposed_code_points.append(
                special_decomposed_code_points)
        translit_file.write('% {:s}\n'.format(name))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        for index, variant in enumerate(decomposed_code_points):
            if index > 0:
                translit_file.write(';')
            if len(variant) > 1:
                translit_file.write('"')
            for decomposed_code_point in variant:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            if len(variant) > 1:
                translit_file.write('"')
        translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if len(decomposed_code_points) != 1:
                sys.stderr.write(
                    'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
                        code_point, name, decomposition))
                # sys.exit() instead of the builtin exit(): the latter is
                # only injected by the site module for interactive use.
                sys.exit(1)
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for decomposed_code_point in decomposed_code_points:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            translit_file.write('\n')
    translit_file.write('\n')
Ejemplo n.º 30
0
def process_width(outfile, ulines, elines, plines):
    '''Write the character width entries to outfile

    Widths are collected per code point; any code point without an
    entry is left out of the output and therefore keeps the default
    width 1. Consecutive code points sharing the same width are written
    as a single range line.

    outfile: object with a write() method receiving the output lines,
             either "<symbol>\\t<width>" or
             "<symbol>...<symbol>\\t<width>", with symbols formatted
             by unicode_utils.ucs_symbol().
    ulines:  lines from UnicodeData.txt. Field 0 is the code point in
             hex; an entry gets width 0 when field 4 is "NSM" or
             field 2 is one of "Cf", "Me", "Mn".
    elines:  lines from EastAsianWidth.txt containing characters with
             width "W" or "F"; they get width 2 unless overridden by
             ulines.
    plines:  lines from PropList.txt which contain characters with the
             property "Prepended_Concatenation_Mark"; they are reset
             to the default width 1.

    In elines and plines, field 0 is either a single code point "XXXX"
    or an inclusive range "XXXX..YYYY".
    '''
    def code_point_range(field):
        '''Return the code points described by "XXXX" or "XXXX..YYYY"'''
        first, separator, last = field.partition('..')
        if not separator:
            last = first
        return range(int(first, 16), int(last, 16) + 1)

    width_dict = {}
    for line in elines:
        for key in code_point_range(line.split(';')[0]):
            width_dict[key] = 2

    # Entries from UnicodeData.txt are applied after EastAsianWidth.txt
    # and thus override the width 2 set above:
    for line in ulines:
        fields = line.split(';')
        if fields[4] == 'NSM' or fields[2] in ('Cf', 'Me', 'Mn'):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        for key in code_point_range(line.split(';')[0]):
            # pop() instead of del: a code point which has not been
            # given a width above already has the default width 1 and
            # must not abort the generation with a KeyError.
            width_dict.pop(key, None)  # default width is 1

    # handle special cases for compatibility
    # https://www.cs.tut.fi/~jkorpela/shy.html
    width_dict.pop(0x00AD, None)  # default width is 1
    for key in range(0x1160, 0x1200):
        width_dict[key] = 0
    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Merge consecutive code points of equal width into runs, each run
    # being stored as [first, last, width]:
    runs = []
    for key in sorted(width_dict):
        width = width_dict[key]
        if runs and key == runs[-1][1] + 1 and width == runs[-1][2]:
            runs[-1][1] = key
        else:
            runs.append([key, key, width])

    for first, last, width in runs:
        if first == last:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first), width))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last),
                width))