def parse_iamcal_emoji_data_json(path):
    """Parse an emoji data JSON file into short-name lookup tables.

    The file's modification time (UTC, formatted as weekday, day, month
    and year) is printed and stored in ``source_info['emoji_version']``.

    Each entry's ``unified`` code is used when it is a single code point.
    When it contains ``-`` the ``non_qualified`` code is used instead; the
    entry is skipped if that is empty or also contains ``-``.

    Returns a tuple ``(emoji_map, non_qualified)``. Both are dicts keyed by
    short name, each value being a dict with the keys ``code``,
    ``character``, ``sequence`` and ``name``.
    """
    mtime_utc = time.gmtime(os.path.getmtime(path))
    emoji_version = time.strftime('%A %d %B %Y', mtime_utc)
    print(emoji_version, mtime_utc)
    source_info['emoji_version'] = emoji_version

    entries = json_load(path)
    emoji_map = {}
    non_qualified = {}
    # short name -> character, shared by both result maps for conflict checks
    seen = {}

    for info in entries:
        hex_code = info['unified']
        unified = '-' not in hex_code
        if not unified:
            # Fall back to the non-qualified form when it is a single code point.
            hex_code = info['non_qualified']
            if not hex_code or '-' in hex_code:
                continue

        code_point = int(hex_code, 16)
        assert code_point > 0x80, info
        character = chr(code_point)
        code = 'U+' + hex_code

        name = info['name'].title().strip()
        if not name:
            name = UnicodeData.getCharacterName(character)

        short_name = info['short_name']
        short_names = info['short_names']
        assert short_name in short_names

        target = emoji_map if unified else non_qualified
        for sequence in short_names:
            if sequence not in seen:
                # First occurrence of a short name wins.
                seen[sequence] = character
                target[sequence] = {
                    'code': code,
                    'character': character,
                    'sequence': sequence,
                    'name': name
                }
                continue
            # A repeated short name must refer to the same character.
            assert character == seen[sequence], (sequence, info)

    return emoji_map, non_qualified
def GenerateUnicodeControlCharacters():
    """Print a listing of Unicode control characters.

    Used for kUnicodeControlCharacterTable in Edit.c. For every character
    in the table one line is printed with its UTF-8 bytes as ``\\xNN``
    escapes, its code point as ``U+XXXX``, its general category and its
    character name.
    """
    control_characters = (
        "\u200E",  # U+200E LRM   Left-to-right mark
        "\u200F",  # U+200F RLM   Right-to-left mark
        "\u200D",  # U+200D ZWJ   Zero width joiner
        "\u200C",  # U+200C ZWNJ  Zero width non-joiner
        "\u202A",  # U+202A LRE   Start of left-to-right embedding
        "\u202B",  # U+202B RLE   Start of right-to-left embedding
        "\u202D",  # U+202D LRO   Start of left-to-right override
        "\u202E",  # U+202E RLO   Start of right-to-left override
        "\u202C",  # U+202C PDF   Pop directional formatting
        "\u206E",  # U+206E NADS  National digit shapes substitution
        "\u206F",  # U+206F NODS  Nominal (European) digit shapes
        "\u206B",  # U+206B ASS   Activate symmetric swapping
        "\u206A",  # U+206A ISS   Inhibit symmetric swapping
        "\u206D",  # U+206D AAFS  Activate Arabic form shaping
        "\u206C",  # U+206C IAFS  Inhibit Arabic form shaping
        "\u001E",  # U+001E RS    Record Separator (Block separator)
        "\u001F",  # U+001F US    Unit Separator (Segment separator)
        "\u2028",  # U+2028 LS    Line Separator
        "\u2029",  # U+2029 PS    Paragraph Separator
        "\u200B",  # U+200B ZWSP  Zero width space
        "\u2060",  # U+2060 WJ    Word joiner
        "\u2066",  # U+2066 LRI   Left-to-right isolate
        "\u2067",  # U+2067 RLI   Right-to-left isolate
        "\u2068",  # U+2068 FSI   First strong isolate
        "\u2069",  # U+2069 PDI   Pop directional isolate
        "\u061C",  # U+061C ALM   Arabic letter mark
    )

    print('UnicodeControlCharacters:')
    for character in control_characters:
        # UTF-8 bytes rendered as \xNN escapes.
        escaped = ''.join('\\x%02x' % byte for byte in character.encode('utf-8'))
        code_point = 'U+%04X' % ord(character)
        category = unicodedata.category(character)
        name = UnicodeData.getCharacterName(character)
        print(escaped, code_point, category, name)