0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, None, None, None, None, )) # Code pages 874 (TIS-620 exts) # Per alias comments in ICU's convrtrs.txt, IBM's 874 is identical to IBM's 9066. # Microsoft's 874, on the other hand, matches the layout of IBM's 1162. graphdata.rhses["1162"] = parsers.read_single_byte( "WHATWG/index-windows-874.txt") graphdata.rhses["9066"] = parsers.read_single_byte("ICU/ibm-874_P100-1995.ucm") # The two only collide at 0xA0, which IBM uses for an alternate U+0E48 and which Microsoft # uses for an NBSP. Favour the more-deployed Microsoft / ISO-8859-11 NBSP for "874". graphdata.rhses["874"] = tuple( a or b for a, b in zip(graphdata.rhses["1162"], graphdata.rhses["9066"])) graphdata.defgsets["874"] = graphdata.defgsets["1162"] = ("ir006", "ir166", "nil", "nil") # Macintosh code page (doesn't have a Mozilla file) #graphdata.rhses["10021"] = parsers.read_mozilla_ut_file("Mozilla/macthai.ut")
0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F)) # TODO: ir153 (Russian subset of ISO-8859-5), ir200 (Uralic Cyrillic), ir201 (Volgaic Cyrillic) # Other KOI-8 encodings graphdata.rhses["878"] = graphdata.rhses["20866"] = parsers.read_single_byte("WHATWG/index-koi8-r.txt") # TODO: number 21866 is in reality used for both KOI8-U and KOI8-RU. graphdata.rhses["1168"] = graphdata.rhses["21866"] = parsers.read_single_byte("WHATWG/index-koi8-u.txt") # The Windows encoding graphdata.rhses["1251"] = parsers.read_single_byte("WHATWG/index-windows-1251.txt") # 10007/1283 is the original MacCyrillic; current MacCyrillic is a Euro update of 10017. # Mappings to U+00A4 changed to U+20AC across the board, so number the current one 10017, and use # a version with that change but not the others for 10007/1283. graphdata.rhses["10017"] = parsers.read_single_byte("WHATWG/index-x-mac-cyrillic.txt") maccy = list(graphdata.rhses["10017"]) maccy[0x22] = (0x00A2,) maccy[0x36] = (0x2202,) graphdata.rhses["10007"] = graphdata.rhses["1283"] = tuple(maccy)
(0x00B2, ), (0x00B3, ), (0x0384, ), (0x0385, ), (0x0386, ), (0x00B7, ), (0x0388, ), (0x0389, ), (0x038A, ), (0x00BB, ), (0x038C, ), (0x00BD, ), (0x038E, ), (0x038F, ), (0x0390, ), (0x0391, ), (0x0392, ), (0x0393, ), (0x0394, ), (0x0395, ), (0x0396, ), (0x0397, ), (0x0398, ), (0x0399, ), (0x039A, ), (0x039B, ), (0x039C, ), (0x039D, ), (0x039E, ), (0x039F, ), (0x03A0, ), (0x03A1, ), None, (0x03A3, ), (0x03A4, ), (0x03A5, ), (0x03A6, ), (0x03A7, ), (0x03A8, ), (0x03A9, ), (0x03AA, ), (0x03AB, ), (0x03AC, ), (0x03AD, ), (0x03AE, ), (0x03AF, ), (0x03B0, ), (0x03B1, ), (0x03B2, ), (0x03B3, ), (0x03B4, ), (0x03B5, ), (0x03B6, ), (0x03B7, ), (0x03B8, ), (0x03B9, ), (0x03BA, ), (0x03BB, ), (0x03BC, ), (0x03BD, ), (0x03BE, ), (0x03BF, ), (0x03C0, ), (0x03C1, ), (0x03C2, ), (0x03C3, ), (0x03C4, ), (0x03C5, ), (0x03C6, ), (0x03C7, ), (0x03C8, ), (0x03C9, ), (0x03CA, ), (0x03CB, ), (0x03CC, ), (0x03CD, ), (0x03CE, ), None)) # Windows code page graphdata.rhses["1253"] = parsers.read_single_byte( "WHATWG/index-windows-1253.txt") # OEM code pages graphdata.rhses["737"] = parsers.read_single_byte("ICU/ibm-737_P100-1997.ucm") graphdata.rhses["869"] = parsers.read_single_byte("ICU/ibm-869_P100-1995.ucm") # Macintosh code page graphdata.rhses["10006"] = graphdata.rhses[ "1280"] = parsers.read_mozilla_ut_file("Mozilla/macgreek.ut") # TODO: ir018/19, Greek Teletext, # ir181 (Technical), ir055, ir031, ir050 (INIS Greek non-homoglyphs) # Symbol (ibm-1038) graphdata.gsets["symbolgl"] = (94, 1, parsers.read_single_byte("UTC/symbol.txt",
# SI-1311:2002 Latin/Hebrew RHS
# Registered as a 96-set; the table below is laid out eight entries per row, with None
# marking positions that have no mapping.
graphdata.gsets["ir234"] = (96, 1, (
    0x00A0, None, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
    0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
    0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, 0x20AC, 0x20AA, 0x202D, 0x202E, 0x202C, None, 0x2017,
    0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
    0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
    0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
    0x05E8, 0x05E9, 0x05EA, None, None, 0x200E, 0x200F, None,
))

# Windows code pages
# Hebrew
graphdata.rhses["1255"] = parsers.read_single_byte("WHATWG/index-windows-1255.txt")
# Arabic
graphdata.rhses["1256"] = parsers.read_single_byte("WHATWG/index-windows-1256.txt")

# OEM code pages
# Arabic
graphdata.rhses["720"] = parsers.read_single_byte("ICU/ibm-720_P100-1997.ucm")
# Hebrew
graphdata.rhses["862"] = parsers.read_single_byte("ICU/ibm-862_P100-1995.ucm")
# Arabic
graphdata.rhses["864"] = parsers.read_single_byte("ICU/ibm-864_X110-1999.ucm")

# Macintosh code pages (both seem to have only Microsoft IDs?)
graphdata.rhses["10004"] = parsers.read_mozilla_ut_file("Mozilla/macarabic.ut")
graphdata.rhses["10005"] = parsers.read_mozilla_ut_file("Mozilla/machebrew.ut")
(0x23F5, ), (0x23F6, ), (0x23F7, ), (0x2B73, 0xF87F), (0x1F782, ), (0x1F783, ), None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, (0x2714, ), (0x2713, ), (0x1FB7D, ), (0x1FB7F, ), (0x23BE, ), (0x23CC, ), (0x2B1B, ), (0x2022, ), (0x25CF, ), (0x25DC, 0xF879), (0x25DE, 0xF879), (0x25DC, 0xF87F), (0x25DE, 0xF87F), (0x2B24, ), (0x1FB9E, ), (0x1FB9E, 0xF87C), (0x2500, ), (0x1F5D9, ), (0x2753, ), (0x2BC5, ), (0x2BC6, ), (0x2B0D, 0xF87F), (0x1F780, ), (0x1FB9F, ), (0x1FB9F, 0xF87C), None, None, None, None, None)) ##################################################################### # Zapf Dingbats graphdata.gsets["zdings_g0"] = zdg0 = (94, 1, parsers.read_single_byte( "UTC/zdingbat.txt", typ="GL94")) graphdata.gsets["zdings_g1"] = zdg1 = (94, 1, parsers.read_single_byte( "UTC/zdingbat.txt", typ="GR94")) graphdata.rhses["998000"] = (tuple( (i, ) for i in range(0x2768, 0x2776)) + ((None, ) * 19) + zdg1[2] + (None, )) graphdata.defgsets["998000"] = ("zdings_g0", "zdings_g1", "nil", "nil") ##################################################################### # WordPerfect Iconic Symbols (not ECMA-35 structured) # This is the original source of U+231A and U+231B emoji (and U+2319). # 0x00–0x22 were defined in WordPerfect 5. # WordPerfect 6 changed it so 0x21–0x7E, 0xA1–0xEF and 0xF1–0xFE # ranges are Zapf Dingbats, while 0x00–0x20 are similar (not identical).
0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C, 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF, 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF)) # Windows code pages for non-Vietnamese Latin graphdata.rhses["1250"] = parsers.read_single_byte( "WHATWG/index-windows-1250.txt") # Central European graphdata.rhses["1252"] = parsers.read_single_byte( "WHATWG/index-windows-1252.txt") # ISO-8859-1 ext. graphdata.defgsets["1252"] = ("ir006", "ir100", "nil", "nil") graphdata.rhses["1254"] = parsers.read_single_byte( "WHATWG/index-windows-1254.txt") # ISO-8859-9 ext. graphdata.defgsets["1254"] = ("ir006", "ir148", "nil", "nil") graphdata.rhses["1257"] = parsers.read_single_byte( "WHATWG/index-windows-1257.txt") # Baltic # OEM pages (TODO: OEM 210 Greek and OEM 220 Spanish are both listed by DEC in the very definition # of the DECSPPCS CSI control. I do not have a source for their layout.) graphdata.rhses["437"] = parsers.read_single_byte( "ICU/ibm-437_P100-1995.ucm") # United States graphdata.defgsets["437"] = ("ir006", "nil", "nil", "nil" ) # Note: gets used as the default.
#!/usr/bin/env python3
# -*- mode: python; coding: utf-8 -*-
# By HarJIT in 2020.

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

# Vietnamese Roman (quo^'c ngu*~) encodings

from ecma35.data import graphdata
from ecma35.data.singlebyte import sbmapparsers as parsers

# Windows-1258
graphdata.rhses["1258"] = parsers.read_single_byte("WHATWG/index-windows-1258.txt")

# VPS
# The same Mozilla file is read twice: once with the parser's default typ for the
# right-hand side, and once with typ="CL33" for the entry stored in c0graphics.
graphdata.rhses["997000"] = parsers.read_mozilla_ut_file("Mozilla/vps.ut")
graphdata.c0graphics["997000"] = parsers.read_mozilla_ut_file(
    "Mozilla/vps.ut",
    typ="CL33",
)

# TCVN (TCVN 5712, VSCII; not VISCII)
# As with VPS, the right-hand side and the c0graphics entry come from one file.
graphdata.rhses["997001"] = parsers.read_mozilla_ut_file("Mozilla/tcvn5712.ut")
graphdata.c0graphics["997001"] = parsers.read_mozilla_ut_file(
    "Mozilla/tcvn5712.ut",
    typ="CL33",
)
# The typ="GR96" reading of that file is registered as the 96-set "ir180", which the
# defgsets entry for 997001 then refers to by name.
graphdata.gsets["ir180"] = (
    96,
    1,
    parsers.read_mozilla_ut_file("Mozilla/tcvn5712.ut", typ="GR96"),
)
graphdata.defgsets["997001"] = ("ir014", "ir180", "nil", "nil")