props += (2**53) * data.get('Punctuation_In_Word', 0) props += (2**54) * data.get('Optional_Space_After', 0) props += (2**55) * data.get('Extended_Dash', 0) props += (2**56) * data.get('Paragraph_Separator', 0) props += (2**57) * data.get('Ellipsis', 0) props += (2**58) * data.get('Semi_Colon', 0) props += (2**59) * data.get('Colon', 0) props += (2**60) * data.get('Comma', 0) props += (2**61) * data.get('Exclamation_Mark', 0) props += (2**62) * data.get('Question_Mark', 0) props += (2**63) * data.get('Full_Stop', 0) return props if __name__ == '__main__': for codepoint in ucd.CodeRange('000000..10FFFF'): try: data = unicode_chars[codepoint] except KeyError: data = {'CodePoint': codepoint} script = data.get('Script', 'Zzzz') title = data.get('TitleCase', codepoint) upper = data.get('UpperCase', codepoint) lower = data.get('LowerCase', codepoint) if title == null: title = codepoint if upper == null: upper = codepoint if lower == null: lower = codepoint print( '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %016x' % (codepoint, script, data.get('GeneralCategory', 'Cn')[0], data.get(
def _load_categories(table, records):
    """Store the GeneralCategory of each record under every code point it lists."""
    for record in records:
        for cp in record['CodePoint']:
            table[cp] = record['GeneralCategory']


# Code point -> General_Category, read from the UnicodeData file.
unicode_chars = {}
_load_categories(unicode_chars, ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'))

# Optionally layer the CSUR data files on top; entries loaded here overwrite
# any value already stored for the same code point.
if '--with-csur' in sys.argv:
    for csur_name in ('Klingon',):
        _load_categories(unicode_chars, ucd.parse_ucd_data('data/csur', csur_name))

# This map combines information from the UnicodeData and Blocks data files.
# Its purpose is to reduce the number of character tables that need to be
# generated. Each entry is (code range, category, comment); the category is
# None for the ranges commented as spanning multiple blocks.
category_sets = [
    (ucd.CodeRange(span), category, label)
    for span, category, label in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00DFFF', 'Cs', 'Surrogates'),
        ('00E000..00F7FF', 'Co', 'Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Cn', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..0EFFFF', 'Cn', 'Unassigned'),
        ('0F0000..0FFFFD', 'Co', 'Plane 15 Private Use'),
        ('0FFFFE..0FFFFF', 'Cn', 'Plane 15 Private Use'),
        ('100000..10FFFD', 'Co', 'Plane 16 Private Use'),
        ('10FFFE..10FFFF', 'Cn', 'Plane 16 Private Use'),
    )
]

# These categories have many pages consisting of just this category:
#     Cn -- Unassigned
#     Lo -- CJK Ideographs
def _load_scripts(table, records, range_field):
    """Store the Script of each record under every code point in its range_field."""
    for record in records:
        for cp in record[range_field]:
            table[cp] = record['Script']


# Code point -> Script, read from the Scripts file (keyed by its 'Range' field).
unicode_chars = {}
_load_scripts(unicode_chars, ucd.parse_ucd_data(ucd_rootdir, 'Scripts'), 'Range')

# Optionally layer the CSUR data files on top; those records are keyed by
# 'CodePoint', and entries loaded here overwrite any value already stored.
if '--with-csur' in sys.argv:
    for csur_name in ('Klingon',):
        _load_scripts(unicode_chars, ucd.parse_ucd_data('data/csur', csur_name), 'CodePoint')

# This map combines information from the UnicodeData and Blocks data files.
# Its purpose is to reduce the number of character tables that need to be
# generated. Each entry is (code range, script, comment); entries whose script
# is None are the ones the loop below starts building a table for.
script_sets = [
    (ucd.CodeRange(span), script_code, label)
    for span, script_code, label in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00F7FF', 'Zzzz', 'Surrogates / Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Zzzz', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..10FFFF', 'Zzzz', 'Unassigned'),
    )
]

# These scripts have many pages consisting of just this script:
special_scripts = []

script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None