def print_Unicode_info(char, short):
    """Print Unicode properties of ``char`` to standard output.

    When ``short`` is truthy a single line is printed: the character, a tab,
    its name and its code point as ``(U+<HEX>)``.  Otherwise one labelled
    line is printed per property (name, code points, case mappings,
    category, bidi class, mirrored flag and NFC/NFD forms).
    """
    # All lookups happen up front, before anything is printed.
    char_name = unicodedata.name(char, "UNKNOWN")
    code_point = ord(char)
    hex_text = hex(code_point)
    lowered = char.lower()
    uppered = char.upper()
    general_category = unicodedata.category(char)
    bidi_class = unicodedata.bidirectional(char)
    is_mirrored = unicodedata.mirrored(char) == 1
    composed = unicodedata.normalize("NFC", char)
    decomposed = unicodedata.normalize("NFD", char)

    if short:
        print("{}\t{} (U+{:X})".format(char, char_name, code_point))
        return

    rows = (
        ("Name", char_name),
        ("Character", char),
        ("Dec Codepoint", str(code_point)),
        ("Hex Codepoint", str(hex_text)),
        ("Lowercase", lowered),
        ("Uppercase", uppered),
        ("Category", general_category),
        ("Bidirectional", bidi_class),
        ("Mirrored", str(is_mirrored)),
        ("NFC", composed),
        ("NFD", decomposed),
    )
    for label, value in rows:
        print(label + " " + value)
def test_ucd_510(self): import unicodedata # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 self.assert_(unicodedata.mirrored(u"\u0f3a")) self.assert_(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a")) # Also, we now have two ways of representing # the upper-case mapping: as delta, or as absolute value self.assert_(u"a".upper()==u'A') self.assert_(u"\u1d79".upper()==u'\ua77d')
def test_ucd_510(self): import unicodedata # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 self.assertTrue(unicodedata.mirrored("\u0f3a")) self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a")) # Also, we now have two ways of representing # the upper-case mapping: as delta, or as absolute value self.assertTrue("a".upper() == 'A') self.assertTrue("\u1d79".upper() == '\ua77d') self.assertTrue(".".upper() == '.')
def iter_open_close_info4s():
    """Yield ``(ordinal, name, char, category)`` for opening/closing characters.

    Every code point below ``CHAR_ORD_UPPER`` is examined.  A character is
    reported when its general category is one of Ps, Pe, Pi or Pf, or when
    ``unicodedata.mirrored`` is truthy for it.  Characters without a name
    are reported with an empty name.
    """
    bracket_categories = ('Ps', 'Pe', 'Pi', 'Pf')
    for ordinal in range(CHAR_ORD_UPPER):
        ch = chr(ordinal)
        cat = unicodedata.category(ch)
        label = unicodedata.name(ch, '')
        if cat not in bracket_categories and not unicodedata.mirrored(ch):
            continue
        yield ordinal, label, ch, cat
def test_ucd_510(self): # stdlib import unicodedata # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 self.assertTrue(unicodedata.mirrored('Я╝║')) self.assertTrue(not unicodedata.ucd_3_2_0.mirrored('Я╝║')) # Also, we now have two ways of representing # the upper-case mapping: as delta, or as absolute value self.assertTrue('a'.upper() == 'A') self.assertTrue('рх╣'.upper() == 'ЖЮй') self.assertTrue('.'.upper() == '.')
def apply_mirroring(storage):
    """
    Applies L4: mirroring.

    A character is depicted by a mirrored glyph if and only if (a) the
    resolved directionality of that character is R, and (b) the
    Bidi_Mirrored property value of that character is true.

    Each entry of ``storage["chars"]`` is updated in place: its ``"ch"``
    value is replaced by the paired character from
    ``MIRRORED_CHARACTER_PAIRS`` (or left unchanged when no pair exists).

    See: http://unicode.org/reports/tr9/#L4
    """
    for entry in storage["chars"]:
        glyph = entry["ch"]
        if not mirrored(glyph):
            continue
        if embedding_direction(entry["level"]) != "R":
            continue
        entry["ch"] = MIRRORED_CHARACTER_PAIRS.get(glyph, glyph)
def _explain_char(self, ch, further): try: name = unicodedata.name(ch) except ValueError: name = f'[U+{hex(ord(ch))[2:]}]' if not further: return name + f'({ch})' infos = { 'category': unicodedata.category(ch), 'direction': unicodedata.bidirectional(ch), 'east asian width': unicodedata.east_asian_width(ch) } decomposition = unicodedata.decomposition(ch) if decomposition: infos['decomposition'] = decomposition try: infos['digit value'] = unicodedata.digit(ch) except ValueError: pass try: infos['decimal value'] = unicodedata.decimal(ch) except ValueError: pass try: infos['numeric value'] = unicodedata.numeric(ch) except ValueError: pass comb = unicodedata.combining(ch) if comb != 0: infos['combining class'] = str(comb) mirrored = unicodedata.mirrored(ch) if mirrored: infos['mirrored'] = 'yes' if hasattr(unicodedata, 'is_normalized'): forms = [] for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): if unicodedata.is_normalized(form, ch): forms.append(form) if forms: infos['normalized'] = f'yes: {", ".join(forms)}' else: infos['normalized'] = 'no' else: infos['normalized'] = 'unavailable' info = ', '.join([f'{k}: {v}' for k, v in infos.items()]) return f'{name}: {ch!r} ({info})'
def overview(tree_item):
    """Return ``TEMPLATE`` filled in with the properties of ``tree_item.obj``.

    The positional fields are, in order: name, the character itself,
    decimal, digit, numeric, category, bidirectional class, combining
    class, East Asian width, mirrored flag and decomposition.  Missing
    numeric properties are rendered as an empty string.
    """
    ch = tree_item.obj
    fields = (
        unicodedata.name(ch, '<NO NAME AVAILABLE>'),
        ch,
        unicodedata.decimal(ch, ''),
        unicodedata.digit(ch, ''),
        unicodedata.numeric(ch, ''),
        unicodedata.category(ch),
        unicodedata.bidirectional(ch),
        unicodedata.combining(ch),
        unicodedata.east_asian_width(ch),
        unicodedata.mirrored(ch),
        unicodedata.decomposition(ch),
    )
    return TEMPLATE.format(*fields)
def char2info(ch):
    """Return a dict of Unicode properties for the single character ``ch``.

    The dict also contains the character itself under ``'ch'`` and its code
    point as an int (``'unicode'``) and as a ``hex()`` string
    (``'unicode_hex'``).  Properties the character lacks (name, decimal,
    digit, numeric) are ``None``.
    """
    return {
        'ch': ch,
        'name': U.name(ch, None),
        'decimal': U.decimal(ch, None),
        'digit': U.digit(ch, None),
        'numeric': U.numeric(ch, None),
        'category': U.category(ch),
        'bidirectional': U.bidirectional(ch),
        'combining': U.combining(ch),
        'east_asian_width': U.east_asian_width(ch),
        'mirrored': U.mirrored(ch),
        'decomposition': U.decomposition(ch),
        'unicode': ord(ch),
        'unicode_hex': hex(ord(ch)),
    }
def apply_mirroring(storage, debug):
    """Applies L4: mirroring

    Each entry of ``storage['chars']`` is updated in place; when ``debug``
    is truthy the storage is passed to ``debug_storage`` afterwards.

    See: http://unicode.org/reports/tr9/#L4

    """
    # L4. A character is depicted by a mirrored glyph if and only if (a) the
    # resolved directionality of that character is R, and (b) the
    # Bidi_Mirrored property value of that character is true.
    for record in storage['chars']:
        glyph = record['ch']
        needs_swap = (
            mirrored(glyph)
            and _embedding_direction(record['level']) == 'R'
        )
        if needs_swap:
            record['ch'] = MIRRORED.get(glyph, glyph)

    if debug:
        debug_storage(storage)
def test_compare_functions(self):
    """Compare ``unicodedata`` with ``unicodedb_5_2_0`` for U+0000..U+FFFF."""
    missing = -1

    def reference_value(fun, code):
        # The reference database signals "no value" with KeyError.
        try:
            return getattr(unicodedb_5_2_0, fun)(code)
        except KeyError:
            return missing

    defaulted_props = ('digit', 'numeric', 'decimal')
    plain_props = ('category', 'bidirectional', 'decomposition',
                   'mirrored', 'combining')

    for code in range(0x10000):
        char = unichr(code)
        for prop in defaulted_props:
            assert getattr(unicodedata, prop)(char, missing) == reference_value(prop, code)
        for prop in plain_props:
            assert getattr(unicodedata, prop)(char) == getattr(unicodedb_5_2_0, prop)(code)
def main():
    """Print the Unicode properties of a character given as hex bytes.

    Every command-line argument is parsed as one byte in hexadecimal; the
    bytes are decoded as UTF-8 and must form exactly one character.  Any
    failure (bad hex, invalid UTF-8, zero or several characters) is
    reported as ``ERROR: <message>`` on standard output.
    """
    try:
        raw = bytes(int(x, 16) for x in sys.argv[1:])
        c = raw.decode('utf8')
        print('glyph: %s' % c)
        # Conventional U+XXXX notation: upper-case hex, at least four digits.
        print('codepoint: U+%04X' % ord(c))
        print('name: %s' % unicodedata.name(c, 'Unknown'))
        print('decimal: %s' % unicodedata.decimal(c, 'Unknown'))
        print('digit: %s' % unicodedata.digit(c, 'Unknown'))
        print('numeric: %s' % unicodedata.numeric(c, 'Unknown'))
        print('category: %s' % unicodedata.category(c))
        print('bidirectional: %s' % unicodedata.bidirectional(c))
        print('combining: %s' % unicodedata.combining(c))
        print('east_asian_width: %s' % unicodedata.east_asian_width(c))
        print('mirrored: %s' % unicodedata.mirrored(c))
        print('decomposition: %s' % unicodedata.decomposition(c))
    except Exception as ex:
        # Top-level boundary of a command-line tool: report, don't traceback.
        print('ERROR: %s' % ex)
def test_function_checksum(self):
    """Hash per-character property values and compare with the expected digest.

    For every code point up to ``sys.maxunicode`` the digit, numeric and
    decimal values (from ``self.db``, default -1, formatted with ``.12g``)
    and the category, bidirectional class, decomposition, mirrored flag
    and combining class (from ``unicodedata``) are fed into a SHA-1 hash.
    The hex digest must equal ``self.expectedchecksum``.
    """
    digest = hashlib.sha1()  # nosec: B303
    for code_point in range(sys.maxunicode + 1):
        ch = chr(code_point)
        # Properties
        numeric_fields = [
            format(lookup(ch, -1), ".12g")
            for lookup in (self.db.digit, self.db.numeric, self.db.decimal)
        ]
        other_fields = [
            unicodedata.category(ch),
            unicodedata.bidirectional(ch),
            unicodedata.decomposition(ch),
            str(unicodedata.mirrored(ch)),
            str(unicodedata.combining(ch)),
        ]
        record = ''.join(numeric_fields + other_fields)
        digest.update(record.encode("ascii"))
    self.assertEqual(digest.hexdigest(), self.expectedchecksum)
def __init__(self, symbol):
    """Collect the Unicode properties of ``symbol`` as instance attributes.

    Missing numeric properties are stored as -1 and a missing name as
    ``'NO_NAME_FOUND'``.  The four normalization forms are stored too;
    note that the NFKC/NFKD attributes are spelled ``normalize_nkfc`` and
    ``normalize_nkfd``.  When ``Config.debug['unicode']`` is truthy the
    instance prints itself via ``print_debug``.
    """
    self.symbol = symbol
    self.name = u.name(symbol, 'NO_NAME_FOUND')

    # Numeric properties: -1 stands for "not defined".
    for attr in ('decimal', 'digit', 'numeric'):
        setattr(self, attr, getattr(u, attr)(self.symbol, -1))

    for attr in ('category', 'bidirectional', 'combining',
                 'east_asian_width', 'mirrored', 'decomposition'):
        setattr(self, attr, getattr(u, attr)(self.symbol))

    normal_forms = (
        ('normalize_nfc', 'NFC'),
        ('normalize_nkfc', 'NFKC'),
        ('normalize_nfd', 'NFD'),
        ('normalize_nkfd', 'NFKD'),
    )
    for attr, form in normal_forms:
        setattr(self, attr, u.normalize(form, self.symbol))

    if Config.debug['unicode']:
        self.print_debug()
def test_compare_functions(self):
    """Compare CPython's ``unicodedata`` with ``unicodedb_4_1_0`` for U+0000..U+FFFF.

    Code points listed in ``self.diff_numeric`` are expected to have no
    numeric value (-1).
    """
    import unicodedata  # CPython implementation

    missing = -1

    def reference_value(fun, code):
        if fun == 'numeric' and code in self.diff_numeric:
            return missing
        # The reference database signals "no value" with KeyError.
        try:
            return getattr(unicodedb_4_1_0, fun)(code)
        except KeyError:
            return missing

    defaulted_props = ('digit', 'numeric', 'decimal')
    plain_props = ('category', 'bidirectional', 'decomposition',
                   'mirrored', 'combining')

    for code in range(0x10000):
        char = unichr(code)
        for prop in defaulted_props:
            assert getattr(unicodedata, prop)(char, missing) == reference_value(prop, code)
        for prop in plain_props:
            assert getattr(unicodedata, prop)(char) == getattr(unicodedb_4_1_0, prop)(code)
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata

    han = u'\u4e2d'

    # The reported name depends on the implementation (``is_cli``).
    if is_cli:
        expected_name = '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
    else:
        expected_name = 'CJK UNIFIED IDEOGRAPH-4E2D'
    self.assertEqual(unicodedata.name(han), expected_name)

    # No numeric value: raises without a default, returns the default otherwise.
    for numeric_fn in (unicodedata.decimal, unicodedata.digit,
                       unicodedata.numeric):
        self.assertRaises(ValueError, numeric_fn, han)
        self.assertEqual(numeric_fn(han, 0), 0)

    expectations = (
        (unicodedata.category, 'Lo'),
        (unicodedata.bidirectional, 'L'),
        (unicodedata.combining, 0),
        (unicodedata.east_asian_width, 'W'),
        (unicodedata.mirrored, 0),
        (unicodedata.decomposition, ''),
    )
    for prop, expected in expectations:
        self.assertEqual(prop(han), expected)
def to_str(s):
    """Return ``s`` with awkward characters replaced by ``{{NAME}}`` markers.

    * A string of two or more characters starting with ``$`` is returned
      unchanged ($VARIABLE HACK).
    * An empty string becomes ``'{{}}'``.
    * Otherwise every character that is a mark, control, format or
      separator character (categories Mc, Cc, Cf, Zl, Zp, Zs, Mn), is
      bidi-mirrored, or is one of ``=``, ``,``, ``;`` is replaced by
      ``{{<Unicode name>}}``.  Characters without a Unicode name (for
      example tab and newline) are written as ``{{U+XXXX}}`` rather than
      raising ``ValueError``.
    """
    # ";" WARNING! ======================================================
    if len(s) > 1 and s[0] == '$':
        return s  # [1:] # $VARIABLE HACK!

    # TODO: Make it so that combining/rtl characters
    # are referenced by name, rather than the original
    # characters codepoints etc

    if not s:
        return '{{}}'

    from unicodedata import name, category, mirrored

    escaped_categories = {'Mc', 'Cc', 'Cf', 'Zl', 'Zp', 'Zs', 'Mn'}
    parts = []
    for c in s:
        needs_escape = (
            category(c) in escaped_categories
            or mirrored(c)
            or c in '=,;'
        )
        if needs_escape:
            # Control characters have no name; fall back to the code point.
            parts.append('{{%s}}' % name(c, 'U+%04X' % ord(c)))
        else:
            parts.append(c)
    return ''.join(parts)
def test_against_unicodedata():
    '''
    Check against `unicodedata` or `unicodedata2` if available with the
    correct version of Unicode.

    For every code point present in the parsed UnicodeData table the name,
    numeric values, general category, bidi class, combining class, mirrored
    flag and decomposition reported by `unicodedata` are compared with the
    table.  The derived bidi class and East Asian width tables are then
    compared as well.  Raises `Exception` when no suitable `unicodedata`
    module is available.
    '''
    if unicodedata is None:
        raise Exception(
            'Packages unicodedata and unicodedata2 are not available with the necessary version of Unicode ({0}); many consistency tests were omitted'
            .format(mdl.UNICODE_VERSION))
    ucdf = mdl.UCDFiles()
    ud = ucdf.unicodedata
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        if cp in ud:
            # --- Name ---
            name = unicodedata.name(c, None)
            if name is None:
                # Handle missing names in unicodedata
                # Compare Table 4-13 in Unicode Standard
                # http://www.unicode.org/versions/Unicode9.0.0/ch04.pdf
                if 0x17000 <= cp <= 0x187EC:
                    assert ud[cp]['Name'] == 'TANGUT IDEOGRAPH-{0:04X}'.format(
                        cp)
                else:
                    assert ud[cp]['Name'] == ''
            else:
                assert name == ud[cp]['Name']
            # --- Numeric properties (None means "not defined") ---
            decimal, digit, numeric = (unicodedata.decimal(c, None),
                                       unicodedata.digit(c, None),
                                       unicodedata.numeric(c, None))
            if any(x is not None for x in (decimal, digit, numeric)):
                if decimal is not None:
                    assert decimal == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Decimal' and digit is not None and decimal is not None
                elif digit is not None:
                    assert digit == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Digit' and decimal is None and numeric is not None
                elif numeric is not None:
                    # The table value is either a plain number or a
                    # "numerator/denominator" fraction.
                    try:
                        num = float(ud[cp]['Numeric_Value'])
                    except ValueError:
                        if '/' in ud[cp]['Numeric_Value']:
                            numerator, denominator = ud[cp][
                                'Numeric_Value'].split('/')
                            num = float(numerator) / float(denominator)
                        else:
                            raise
                    assert numeric == num and ud[cp][
                        'Numeric_Type'] == 'Numeric' and digit is None and decimal is None
                else:
                    raise Exception
            else:
                assert ud[cp]['Numeric_Value'] == 'NaN' and ud[cp][
                    'Numeric_Type'] == 'None'
            # --- Simple one-to-one properties ---
            assert unicodedata.category(c) == ud[cp]['General_Category']
            assert unicodedata.bidirectional(c) == ud[cp]['Bidi_Class']
            assert unicodedata.combining(c) == int(
                ud[cp]['Canonical_Combining_Class'])
            assert unicodedata.mirrored(c) == ud[cp]['Bidi_Mirrored']
            # --- Decomposition ---
            if unicodedata.decomposition(c) == '':
                if ud[cp]['Name'].startswith('HANGUL SYLLABLE'):
                    # The Hangul syllables lack decomposition mapping in
                    # unicodedata, so calculate with a full decomposition
                    # followed by a partial composition (Unicode Standard,
                    # chapter 3.12)
                    decomp = unicodedata.normalize('NFD', c)
                    if len(decomp) == 3:
                        decomp = unicodedata.normalize('NFC', decomp[:2]) + decomp[-1]
                    decomp = tuple(ord(x) for x in decomp)
                    assert decomp == ud[cp]['Decomposition_Mapping']
                else:
                    # No decomposition: the table maps the code point to itself.
                    assert ud[cp]['Decomposition_Mapping'] == (cp, )
            else:
                x = unicodedata.decomposition(c)
                # Drop a leading "<tag>" so only the hex code points remain.
                if '<' in x:
                    x = x.split('>', 1)[1].strip()
                x = tuple(int(y, 16) for y in x.split('\x20'))
                assert x == ud[cp]['Decomposition_Mapping']
    dbc = ucdf.derivedbidiclass
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in dbc and cp in ud:
            assert unicodedata.bidirectional(c) == dbc[cp]['Bidi_Class']
    eaw = ucdf.eastasianwidth
    deaw = ucdf.derivedeastasianwidth
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in eaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == eaw[cp]['East_Asian_Width']
        if cp in deaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == deaw[cp]['East_Asian_Width']
async def charinfo(self, *, data: str):
    """Shows information about one or several characters.

    'data' can either be a character, a unicode escape sequence, a unicode
    character name or a string.
    If 'data' is a string only a summary of each character's info will be displayed.
    """
    # NOTE: the input itself is never lower-cased, otherwise an upper-case
    # character such as 'A' would be reported as 'a'.  Only the escape
    # prefix detection is case-insensitive.
    lowered = data.lower()
    if lowered.startswith('\\u'):
        # Let's interpret the unicode escape sequence
        hex_values = lowered.split('\\u')[1:]
        try:
            # chr() raises for code points outside the Unicode range, so it
            # has to be inside the try block as well.
            data = ''.join(chr(int(val, 16)) for val in hex_values)
        except (ValueError, OverflowError):
            await self.bot.say('Invalid unicode escape sequence.')
            return
    elif len(data) > 1:
        # Maybe we've been given the character's name ?
        try:
            data = unicodedata.lookup(data)
        except KeyError:
            pass

    # Normalise the input
    data = unicodedata.normalize('NFC', data)

    url_fmt = '<http://unicode-table.com/en/{:X}>'
    if len(data) == 1:
        # Detailed info on the character
        entries = [
            ('Character', data),
            ('Name', unicodedata.name(data, 'None')),
            ('Code point', '{:04x}'.format(ord(data)))
        ]
        decomposition = unicodedata.decomposition(data)
        if decomposition != '':
            entries.append(('Decomposition', decomposition))

        combining = unicodedata.combining(data)
        if combining:
            entries.append(('Combining class', combining))

        entries.append(('Category', unicodedata.category(data)))
        bidirectional = unicodedata.bidirectional(data)
        entries.append(('Bidirectional', bidirectional if bidirectional != '' else 'None'))
        entries.append(('Mirrored', 'True' if unicodedata.mirrored(data) == 1 else 'False'))
        entries.append(('East asian width', unicodedata.east_asian_width(data)))
        entries.append(('Url', url_fmt.format(ord(data))))

        # Create the message's content and send it
        content = utils.indented_entry_to_str(entries)
        await self.bot.say_block(content)
    else:
        # Minimal info for each character
        entries = [
            '{} | `\\u{:04x}` | {} | {}'.format(char,
                                                ord(char),
                                                unicodedata.name(char, 'None'),
                                                url_fmt.format(ord(char)))
            for char in data
        ]
        content = '\n'.join(entries)
        await self.bot.say(content)
def mirrored(self):
    """Return the ``unicodedata.mirrored`` flag of the character in ``self.c``."""
    ch = self.c
    return unicodedata.mirrored(ch)
def is_open_close_char(char):
    """Return a truthy value when ``char`` is an opening/closing character.

    The result is the (truthy) value of ``mirrored(char)`` when the
    character is mirrored, otherwise whether its category is listed in
    ``categories``.
    """
    mirror_flag = mirrored(char)
    if mirror_flag:
        return mirror_flag
    return category(char) in categories
async def unicode(self, ctx, *, arg):
    """Returns the information on a Unicode character or named character."""
    # Interpretation of ``arg``:
    #   * a single character is used as-is;
    #   * otherwise it is tried as a hexadecimal code point (optionally
    #     prefixed with "U+", "\u", "\x" or "0x");
    #   * otherwise as a Unicode character name;
    #   * otherwise every character of the text is described in turn.
    if len(arg) == 1:
        chars = [arg]
    else:
        # if starts with "U+", "\x", "\u", it's hex
        if arg.upper().startswith(("U+", "\\U", "\\X")):
            arg = "0x" + arg[2:].strip()
        try:
            if arg.lower().startswith("0x"):
                arg = arg[2:]
            chars = [chr(int(arg, 16))]
        except (ValueError, OverflowError):
            # Not hex, or outside the Unicode range (chr() raises
            # OverflowError for very large values): use name lookup.
            try:
                chars = [unicodedata.lookup(arg)]
            except KeyError:
                chars = arg

    whitespace = " \t\r\n\v\f\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
    # The footer counts the characters actually shown, not the length of
    # the typed input (a looked-up name yields exactly one character).
    total = len(chars)
    embeds = []
    for n, char in enumerate(chars, start=1):
        value = ord(char)
        dt = {
            "Character": char,
            "Name": unicodedata.name(char, None),  # str or None
            "Decimal": unicodedata.decimal(char, None),  # int or None
            "Digit": unicodedata.digit(char, None),  # int or None
            "Numeric": unicodedata.numeric(char, None),  # float or None
            "Category": unicodedata.category(char),  # str
            "Bidirectional": unicodedata.bidirectional(char),  # str
            "Combining class": unicodedata.combining(char),  # int
            "East Asian width": unicodedata.east_asian_width(char),  # str
            "Mirrored": unicodedata.mirrored(char),  # int
            "Decomposition": unicodedata.decomposition(char),  # str
        }
        embed = discord.Embed(
            title="Unicode codepoints of: {input}".format(input=arg),
            description="About Unicode U+{codepoint:04X}.".format(
                codepoint=value))
        for k, v in dt.items():
            # Skip properties that are undefined or render as nothing.
            if v is None or not str(v):
                continue
            # Quote whitespace-only values so they stay visible.
            if not str(v).strip(whitespace):
                v = '"{}"'.format(v)
            embed.add_field(name=k, value=str(v), inline=False)
        embed.set_footer(text="Character {index} of {count}".format(
            index=n, count=total))
        embeds.append(embed)

    if not embeds:
        # Nothing to describe (empty input); avoid indexing an empty list.
        return
    if len(embeds) > 1:
        await menu(ctx, embeds, DEFAULT_CONTROLS)
    else:
        await ctx.send(embed=embeds[0])
import unicodedata

# Demonstration of the unicodedata API: every value below is printed on its
# own line, in this order.
ACCENTED_WORD = 'resume\u0301'  # ends with "e" + COMBINING ACUTE ACCENT

results = [
    unicodedata.bidirectional('$'),
    unicodedata.category('$'),
    unicodedata.combining('7'),
    unicodedata.decimal('1'),
    unicodedata.decomposition('\u00fc'),
    unicodedata.digit('7'),
    unicodedata.lookup('COPYRIGHT SIGN'),
    unicodedata.mirrored('('),
    unicodedata.name('\u00fc'),
    len(unicodedata.normalize('NFC', ACCENTED_WORD)),
    len(unicodedata.normalize('NFD', ACCENTED_WORD)),
    unicodedata.normalize('NFKD', '\u2165'),
    unicodedata.numeric('\u2157'),
    unicodedata.unidata_version,
]
for result in results:
    print(result)
def printUnicodeInfo(text, description):
    """Print ``description`` followed by one diagnostic line per character.

    Each line shows the index, the code point in hex, the Unicode name, the
    character itself, and its category, bidirectional class, combining
    class and mirrored flag.  Characters without a Unicode name (control
    characters such as tab or newline) are shown as ``<unnamed>`` instead
    of raising ``ValueError``.
    """
    print("{}:".format(description))
    for j, char in enumerate(text):
        print(
            "{:2} {:04x} {} '{}'   (cat={} bid={} comb={} mirr={})".format(
                j,
                ord(char),
                unicodedata.name(char, "<unnamed>"),
                char,
                unicodedata.category(char),
                unicodedata.bidirectional(char),
                unicodedata.combining(char),
                unicodedata.mirrored(char),
            )
        )
# Spot checks of the unicodedata lookup functions (Python 2 syntax).

# numeric(): the default is returned for a non-numeric character.
assert unicodedata.numeric(u'A', None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0

# decimal(): only u'9' has a value here; the others yield the default.
assert unicodedata.decimal(u'A', None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b', None) is None
assert unicodedata.decimal(u'\u2468', None) is None

# category()
assert unicodedata.category(u'\uFFFE') == 'Cn'
assert unicodedata.category(u'a') == 'Ll'
assert unicodedata.category(u'A') == 'Lu'

# bidirectional(): empty string expected for U+FFFE.
assert unicodedata.bidirectional(u'\uFFFE') == ''
assert unicodedata.bidirectional(u' ') == 'WS'
assert unicodedata.bidirectional(u'A') == 'L'

# decomposition(): empty string expected when there is no mapping.
assert unicodedata.decomposition(u'\uFFFE') == ''
assert unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034'

# mirrored(): compared against the ints 0 and 1.
assert unicodedata.mirrored(u'\uFFFE') == 0
assert unicodedata.mirrored(u'a') == 0
assert unicodedata.mirrored(u'\u2201') == 1

# combining(): canonical combining class as an int.
assert unicodedata.combining(u'\uFFFE') == 0
assert unicodedata.combining(u'a') == 0
assert unicodedata.combining(u'\u20e1') == 230

print 'done.'
def mirrored(self):
    """Return the ``unicodedata.mirrored`` flag of ``self.char``."""
    ch = self.char
    return ud.mirrored(ch)
def printUnicodeData(self, text=None):
    """Print one diagnostic line per character of ``text``.

    ``text`` defaults to ``self.currentText``.  Each line shows the index,
    the code point in hex, the Unicode name, the ``repr`` of the character,
    and its category, bidirectional class, combining class and mirrored
    flag.  Characters without a Unicode name (control characters such as
    tab or newline) are shown as ``<unnamed>`` instead of raising
    ``ValueError``.
    """
    if text is None:
        text = self.currentText

    for j, char in enumerate(text):
        print(
            "{:2} {:04x} {} {!r}   (cat={} bid={} comb={} mirr={})".format(
                j,
                ord(char),
                unicodedata.name(char, "<unnamed>"),
                char,
                unicodedata.category(char),
                unicodedata.bidirectional(char),
                unicodedata.combining(char),
                unicodedata.mirrored(char),
            )
        )
# Spot checks of the unicodedata lookup functions (Python 2 syntax).

# numeric(): the default is returned for a non-numeric character.
verify(unicodedata.numeric(u"A", None) is None)
verify(unicodedata.numeric(u"9") == 9)
verify(unicodedata.numeric(u"\u215b") == 0.125)
verify(unicodedata.numeric(u"\u2468") == 9.0)

# decimal(): only u"9" has a value here; the others yield the default.
verify(unicodedata.decimal(u"A", None) is None)
verify(unicodedata.decimal(u"9") == 9)
verify(unicodedata.decimal(u"\u215b", None) is None)
verify(unicodedata.decimal(u"\u2468", None) is None)

# category()
verify(unicodedata.category(u"\uFFFE") == "Cn")
verify(unicodedata.category(u"a") == "Ll")
verify(unicodedata.category(u"A") == "Lu")

# bidirectional(): empty string expected for U+FFFE.
verify(unicodedata.bidirectional(u"\uFFFE") == "")
verify(unicodedata.bidirectional(u" ") == "WS")
verify(unicodedata.bidirectional(u"A") == "L")

# decomposition(): empty string expected when there is no mapping.
verify(unicodedata.decomposition(u"\uFFFE") == "")
verify(unicodedata.decomposition(u"\u00bc") == "<fraction> 0031 2044 0034")

# mirrored(): compared against the ints 0 and 1.
verify(unicodedata.mirrored(u"\uFFFE") == 0)
verify(unicodedata.mirrored(u"a") == 0)
verify(unicodedata.mirrored(u"\u2201") == 1)

# combining(): canonical combining class as an int.
verify(unicodedata.combining(u"\uFFFE") == 0)
verify(unicodedata.combining(u"a") == 0)
verify(unicodedata.combining(u"\u20e1") == 230)

print "ok"
# Spot checks of the unicodedata lookup functions (Python 2 syntax).

# numeric(): the default is returned for a non-numeric character.
verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)

# decimal(): only u'9' has a value here; the others yield the default.
verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)

# category()
verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')

# bidirectional(): empty string expected for U+FFFE.
verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')

# decomposition(): empty string expected when there is no mapping.
verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')

# mirrored(): compared against the ints 0 and 1.
verify(unicodedata.mirrored(u'\uFFFE') == 0)
verify(unicodedata.mirrored(u'a') == 0)
verify(unicodedata.mirrored(u'\u2201') == 1)

# combining(): canonical combining class as an int.
verify(unicodedata.combining(u'\uFFFE') == 0)
verify(unicodedata.combining(u'a') == 0)
verify(unicodedata.combining(u'\u20e1') == 230)

print 'ok'
def printUnicodeData( self, text:Optional[str]=None ) -> None:
    """Emit one diagnostic line per character of ``text`` via ``vPrint``.

    ``text`` defaults to ``self.currentText``.  Each line shows the index,
    the code point in hex, the Unicode name, the ``repr`` of the character,
    and its category, bidirectional class, combining class and mirrored
    flag.  Characters without a Unicode name (control characters such as
    tab or newline) are shown as ``<unnamed>`` instead of raising
    ``ValueError``.
    """
    if text is None:
        text = self.currentText

    for j, char in enumerate(text):
        vPrint(
            'Quiet',
            debuggingThisModule,
            "{:2} {:04x} {} {!r}   (cat={} bid={} comb={} mirr={})".format(
                j,
                ord(char),
                unicodedata.name(char, "<unnamed>"),
                char,
                unicodedata.category(char),
                unicodedata.bidirectional(char),
                unicodedata.combining(char),
                unicodedata.mirrored(char),
            ),
        )
import unicodedata


def main():
    """Print a sample result of several ``unicodedata`` lookups, one per line."""
    print(unicodedata.lookup('left curly bracket'))
    print(unicodedata.name('\\'))
    print(unicodedata.decimal("1"))
    print(unicodedata.digit("4"))
    print(unicodedata.numeric("9"))
    print(unicodedata.category("/"))
    print(unicodedata.bidirectional("b"))
    print(unicodedata.east_asian_width("b"))
    # mirrored() accepts exactly one character; passing "{}" (two
    # characters) raises TypeError.
    print(unicodedata.mirrored("{"))


if __name__ == "__main__":
    main()
""" Test script for the unicodedata module. Written by Marc-Andre Lemburg ([email protected]). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" from test_support import verify, verbose import sha encoding = 'utf-8' def test_methods(): h = sha.sha() for i in range(65536): char = unichr(i) data = [ # Predicates (single char) char.isalnum() and u'1' or u'0', char.isalpha() and u'1' or u'0', char.isdecimal() and u'1' or u'0', char.isdigit() and u'1' or u'0', char.islower() and u'1' or u'0', char.isnumeric() and u'1' or u'0', char.isspace() and u'1' or u'0', char.istitle() and u'1' or u'0', char.isupper() and u'1' or u'0', # Predicates (multiple chars) (char + u'abc').isalnum() and u'1' or u'0', (char + u'abc').isalpha() and u'1' or u'0', (char + u'123').isdecimal() and u'1' or u'0', (char + u'123').isdigit() and u'1' or u'0', (char + u'abc').islower() and u'1' or u'0', (char + u'123').isnumeric() and u'1' or u'0', (char + u' \t').isspace() and u'1' or u'0',
def test_mirrored(self): import unicodedata # For no reason, unicodedata.mirrored() returns an int, not a bool assert repr(unicodedata.mirrored(' ')) == '0'
def test_mirrored(self): import unicodedata # For no reason, unicodedata.mirrored() returns an int, not a bool assert repr(unicodedata.mirrored(u" ")) == "0"
def printUnicodeInfo( text, description ):
    """Print ``description`` followed by one diagnostic line per character.

    Each line shows the index, the code point in hex, the Unicode name, the
    character itself, and its category, bidirectional class, combining
    class and mirrored flag.  Characters without a Unicode name (control
    characters such as tab or newline) are shown as ``<unnamed>`` instead
    of raising ``ValueError``.
    """
    print( "{}:".format( description ) )
    for j, char in enumerate(text):
        print(
            "{:2} {:04x} {} '{}'   (cat={} bid={} comb={} mirr={})".format(
                j,
                ord(char),
                unicodedata.name(char, "<unnamed>"),
                char,
                unicodedata.category(char),
                unicodedata.bidirectional(char),
                unicodedata.combining(char),
                unicodedata.mirrored(char),
            )
        )
""" Test script for the unicodedata module. Written by Marc-Andre Lemburg ([email protected]). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" from test_support import verify, verbose import sha encoding = 'utf-8' def test_methods(): h = sha.sha() for i in range(65536): char = unichr(i) data = [ # Predicates (single char) char.isalnum() and u'1' or u'0', char.isalpha() and u'1' or u'0', char.isdecimal() and u'1' or u'0', char.isdigit() and u'1' or u'0', char.islower() and u'1' or u'0', char.isnumeric() and u'1' or u'0', char.isspace() and u'1' or u'0', char.istitle() and u'1' or u'0', char.isupper() and u'1' or u'0', # Predicates (multiple chars) (char + u'abc').isalnum() and u'1' or u'0', (char + u'abc').isalpha() and u'1' or u'0', (char + u'123').isdecimal() and u'1' or u'0', (char + u'123').isdigit() and u'1' or u'0', (char + u'abc').islower() and u'1' or u'0', (char + u'123').isnumeric() and u'1' or u'0',
print(unicodedata.normalize('NFC', s2)) # 'e'和'\u0301'被合并为一个字符é,因而返回结果为'café'(输出不带引号) print(unicodedata.normalize( 'NFD', s1)) # NFD使组合字符拆开为两个字符,这里'é'被拆为'e'和重音符,即输出结果为:'cafeˋ' print( unicodedata.normalize('NFD', s2) ) # s2最后两个字符为'e'和'\u0301',(我不知道内部机理是什么,接下来的叙述是我自己的理解,不知道正确与否),直接将'\u0301'解释为重音符'ˋ',输出为'cafeˋ' print("-------------------") print(unicodedata.east_asian_width('我')) print(unicodedata.east_asian_width('1')) print(unicodedata.east_asian_width('a')) print(unicodedata.east_asian_width('ﷺ')) # F:fullwidth,H:halfwidth,W:wide,Na:narrow,A:ambiguous(不明确),N:natural(正常) print(unicodedata.mirrored('薛')) # 不懂 print(unicodedata.decomposition('ﷺ')) # 可分解 print(unicodedata.decomposition('é')) # 可分解 print(unicodedata.decomposition('e')) # 不可分解,所以返回空值(输出就是一片空白) # 判断 Unicode 字符串 unistr 是否为正规形式 form。 form 的有效值为 'NFC', 'NFKC', 'NFD' 和 'NFKD' # 3.8 # print(unicodedata.is_normalized('NFC','a')) # true # print(unicodedata.is_normalized('NFC','ﷺ')) # true # print(unicodedata.is_normalized('NFKD','ﷺ')) # false print(unicodedata.unidata_version) print(unicodedata.ucd_3_2_0) #print('const CATEGORY_e CHAR_CATEGORIES[] = {%s};' % ', '.join(unicodedata.category(chr(codepoint)) for codepoint in range(0x110000)))
# Spot checks of the unicodedata lookup functions (Python 2 syntax).

# numeric(): the default is returned for a non-numeric character.
assert unicodedata.numeric(u'A',None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0

# decimal(): only u'9' has a value here; the others yield the default.
assert unicodedata.decimal(u'A',None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b',None) is None
assert unicodedata.decimal(u'\u2468',None) is None

# category()
assert unicodedata.category(u'\uFFFE') == 'Cn'
assert unicodedata.category(u'a') == 'Ll'
assert unicodedata.category(u'A') == 'Lu'

# bidirectional(): empty string expected for U+FFFE.
assert unicodedata.bidirectional(u'\uFFFE') == ''
assert unicodedata.bidirectional(u' ') == 'WS'
assert unicodedata.bidirectional(u'A') == 'L'

# decomposition(): empty string expected when there is no mapping.
assert unicodedata.decomposition(u'\uFFFE') == ''
assert unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034'

# mirrored(): compared against the ints 0 and 1.
assert unicodedata.mirrored(u'\uFFFE') == 0
assert unicodedata.mirrored(u'a') == 0
assert unicodedata.mirrored(u'\u2201') == 1

# combining(): canonical combining class as an int.
assert unicodedata.combining(u'\uFFFE') == 0
assert unicodedata.combining(u'a') == 0
assert unicodedata.combining(u'\u20e1') == 230

print 'done.'
def printUnicodeData( self, text=None ):
    """Print one diagnostic line per character of ``text``.

    ``text`` defaults to ``self.currentText``.  Each line shows the index,
    the code point in hex, the Unicode name, the ``repr`` of the character,
    and its category, bidirectional class, combining class and mirrored
    flag.  Characters without a Unicode name (control characters such as
    tab or newline) are shown as ``<unnamed>`` instead of raising
    ``ValueError``.
    """
    if text is None:
        text = self.currentText

    for j, char in enumerate(text):
        print(
            "{:2} {:04x} {} {!r}   (cat={} bid={} comb={} mirr={})".format(
                j,
                ord(char),
                unicodedata.name(char, "<unnamed>"),
                char,
                unicodedata.category(char),
                unicodedata.bidirectional(char),
                unicodedata.combining(char),
                unicodedata.mirrored(char),
            )
        )