def uni2tex(text):
    """Translate accented unicode characters into LaTeX accent macros.

    Combining marks and precomposed characters whose accent appears in the
    module-level ``accents`` table become the matching LaTeX macro;
    everything else passes through unchanged.
    """
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = txt[i]
        code = ord(char)
        # Elsevier bibtex dumps sometimes have a fancy dash
        if code == 8211:
            out += "-"
        # combining marks: wrap the following character in the accent macro.
        # Guard i + 1 so a trailing mark cannot raise IndexError (it did).
        elif (unicodedata.category(char) in ("Mn", "Mc") and code in accents
              and i + 1 < len(txt)):
            out += "{\\%s%s}" % (accents[code], txt[i + 1])
            i += 1
        # precomposed characters: split into base char + combining accent
        elif unicodedata.decomposition(char):
            parts = unicodedata.decomposition(char).split()
            # Only plain two-field decompositions are convertible; tagged
            # (e.g. '<compat>') or single-field ones previously crashed the
            # two-value unpack -- pass those characters through unchanged.
            if len(parts) == 2 and not parts[0].startswith("<"):
                base = int(parts[0], 16)
                acc = int(parts[1], 16)
                if acc in accents:
                    # chr() replaces the Python-2-only unichr()
                    out += "\\%s{%s}" % (accents[acc], chr(base))
                else:
                    out += char
            else:
                out += char
        else:
            out += char
        i += 1
    return out
def uni2tex(text):
    """
    Translate accented unicode characters into latex macros.

    http://tex.stackexchange.com/questions/23410/how-to-convert-characters-to-latex-code

    Relies on the module-level tables ``accents`` (combining code point ->
    macro name) and ``specials`` (char -> literal replacement) -- TODO
    confirm their definitions elsewhere in this module.
    """
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = text[i]
        code = ord(char)
        # combining marks: emit the macro wrapping the *next* character.
        # NOTE(review): txt[i + 1] raises IndexError if the mark is the last
        # character of the input -- confirm inputs cannot end with a mark.
        if unicodedata.category(char) in ("Mn", "Mc") and code in accents:
            out += "{\\%s{%s}}" % (accents[code], txt[i + 1])
            i += 1
        # precomposed characters: decomposition is "<base> <accent>" hex pair.
        # NOTE(review): the two-value unpack raises ValueError for tagged or
        # single-field decompositions (e.g. '<compat> 0066 0069') -- verify.
        elif unicodedata.decomposition(char):
            base, acc = unicodedata.decomposition(char).split()
            acc = int(acc, 16)
            base = int(base, 16)
            if acc in accents:
                out += "{\\%s{%s}}" % (accents[acc], chr(base))
            else:
                out += char
        # other special case
        elif char in specials:
            out += "{%s}" % specials[char]
        else:
            out += char
        i += 1
    return out
def uni2tex(text):
    """Translate accented unicode characters into LaTeX accent macros.

    Courtesy of https://tex.stackexchange.com/q/23410
    Characters without a convertible decomposition pass through unchanged.
    """
    # combining code point -> LaTeX accent macro name
    accents = {
        0x0300: '`', 0x0301: "'", 0x0302: '^', 0x0308: '"',
        0x030B: 'H', 0x0303: '~', 0x0327: 'c', 0x0328: 'k',
        0x0304: '=', 0x0331: 'b', 0x0307: '.', 0x0323: 'd',
        0x030A: 'r', 0x0306: 'u', 0x030C: 'v',
    }
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = txt[i]
        code = ord(char)
        # combining marks: wrap the following character in the accent macro.
        # Guard i + 1 so a trailing mark cannot raise IndexError (it did).
        if (unicodedata.category(char) in ("Mn", "Mc") and code in accents
                and i + 1 < len(txt)):
            out += "\\%s{%s}" % (accents[code], txt[i + 1])
            i += 1
        # precomposed characters: decomposition is "<base> <accent>" hex pair
        elif unicodedata.decomposition(char):
            parts = unicodedata.decomposition(char).split()
            # Only plain two-field decompositions are convertible.  Tagged
            # (e.g. '<compat> 0066 0069') or single-field (e.g. U+2126 OHM
            # SIGN -> '03A9') decompositions previously crashed the
            # two-value unpack with ValueError -- pass them through instead.
            if len(parts) == 2 and not parts[0].startswith("<"):
                base = int(parts[0], 16)
                acc = int(parts[1], 16)
                if acc in accents:
                    out += "\\%s{%s}" % (accents[acc], chr(base))
                else:
                    out += char
            else:
                out += char
        else:
            out += char
        i += 1
    return out
def mapchar(self, key):
    """Map code point *key* to a filtered replacement code point, caching
    the answer in self (a dict-like object).

    Returns an int code point (32 for tab), or None for rejected characters.
    """
    if key in self:
        # already computed on an earlier call
        return self[key]
    de = unicodedata.decomposition(unichr(key))  # Python 2 unichr
    if de:
        try:
            # first field of the decomposition string is the base code point
            ch = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # tagged field such as '<compat>' is not hex: keep the key
            ch = key
    else:
        # CHAR_REPLACEMENT: module-level table, here keyed by *character*
        # (unichr(key)) -- NOTE(review): other mapchar variants in this
        # corpus key it by the int code point; confirm which is intended.
        ch = CHAR_REPLACEMENT.get(unichr(key), key)
    if ch == 32:  # space
        pass
    elif 47 < ch < 58:  # digits
        pass
    elif 64 < ch < 91:  # uppercase
        pass
    elif 96 < ch < 123:  # lowercase
        pass
    elif 127 < ch < 165:  # upper ascii latin1
        pass
    elif ch == 9:  # map tab to space
        ch = 32
    elif ch < 128:  # reject invalid lower ascii
        ch = None
    elif ch in (152, 158) or ch < 256:
        # reject 152/158 and any remaining latin-1 code points
        ch = None
    self[key] = ch
    return ch
def getdetails(self, text):
    """Return a dict describing every character in *text*.

    Each character maps to a dict of Unicode properties (name, code point,
    numeric/decimal/digit values where defined, str'd isalpha/isdigit/
    isalnum flags, canonical decomposition).  The extra key 'Characters'
    holds list(text).
    """
    chardetails = {}
    for character in text:
        chardetails[character] = {}
        chardetails[character]['Name'] = unicodedata.name(character)
        chardetails[character]['HTML Entity'] = str(ord(character))
        chardetails[character]['Code point'] = repr(character)
        # numeric/decimal/digit are undefined for most characters; the
        # lookups raise ValueError in that case and the key is omitted.
        try:
            chardetails[character]['Numeric Value'] = \
                unicodedata.numeric(character)
        except ValueError:
            pass
        try:
            chardetails[character]['Decimal Value'] = \
                unicodedata.decimal(character)
        except ValueError:
            pass
        try:
            # Fixed: was unicodedata.digit(mychar) -- an undefined name whose
            # NameError the old bare `except` silently swallowed, so this
            # value was never recorded.  Stored under 'Digit Value' to match
            # 'Decimal Value'/'Numeric Value' ('Digit' below is the
            # str(isdigit()) flag and is preserved unchanged).
            chardetails[character]['Digit Value'] = \
                unicodedata.digit(character)
        except ValueError:
            pass
        chardetails[character]['Alphabet'] = str(character.isalpha())
        chardetails[character]['Digit'] = str(character.isdigit())
        chardetails[character]['AlphaNumeric'] = str(character.isalnum())
        chardetails[character]['Canonical Decomposition'] = \
            unicodedata.decomposition(character)
    chardetails['Characters'] = list(text)
    return chardetails
def string2filename(s):
    """Convert a string to a valid filename.

    Lowercases, strips any path component, transliterates via the
    module-level ``mapping`` table or NFKD decomposition, then keeps only
    filename-safe characters and turns spaces into dashes.
    """
    s = s.strip()
    s = s.lower()
    # remove an eventual path (normalize backslashes first)
    s = s.replace("\\", "/")
    _, s = os.path.split(s)
    res = u''
    for c in s:
        o = ord(c)
        if o in mapping:  # custom transliteration table wins
            # (was `o in mapping.keys()` -- O(n) list scan per character)
            res = res + mapping[o]
        elif decomposition(c):
            # decomposable char: NFKD keeps the base letter; the combining
            # marks are dropped by the valid_chars filter below
            res = res + normalize('NFKD', c)
        else:
            res = res + c
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    res = ''.join(c for c in res if c in valid_chars)
    res = res.replace(" ", "-")
    return res
def getDecompositionData(u, missingMarks):
    # inside so we can use umap, nmap ...
    """For codepoint *u*, return (glyph, [component glyphs], first codepoint)
    when every component of its canonical decomposition is in ``umap``;
    otherwise return 0.  Appends hex strings of unmapped marks to
    *missingMarks* as a side effect.  (umap, SKIP_MARKS_FINAL,
    MARK_GLYPH_CODEPOINT_RANGE, unicodeIntToHexstr come from the enclosing
    scope -- TODO confirm.)
    """
    udec = None
    try:
        de c = unicodedata.decomposition(unichr(u))
    except ValueError:
        return 0
    return 0
def asciify(string): ''' "ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc. This very cool function originates at http://www.physic.ut.ee/~kkannike/english/prog/python/index.html ''' # Unfortunately, I don’t really understand, how this function works. # I have a hunch, this could be done better with a decomposed representation # of the string ("NFKD"), but I don’t have time to really test a function # as sensitive as this one right now. # To work reliably the way it is, strings must consist of composed # characters. string = normalize("NFC", string) temp = u'' for char in string: decomp = decomposition(char) if decomp: # Not an empty string d = decomp.split()[0] try: temp += unichr(int(d, 16)) except ValueError: if d == "<super>": temp += unichr(int(decomp.split()[1], 16)) else: pass #raise Exception("Can't handle this: " + repr(decomp)) else: temp += char return temp
def deaccent_char(c):
    """Return *c* with its accent stripped, i.e. the first character of its
    canonical decomposition; *c* itself when it has no decomposition.

    Characters whose decomposition starts with a compatibility tag such as
    '<super>' are returned unchanged (the original code crashed with
    ValueError trying to parse the tag as hex).
    """
    decomposed = unicodedata.decomposition(c)
    if not decomposed:
        return c
    first = decomposed.split(' ')[0]
    if first.startswith('<'):
        # compatibility decomposition: there is no plain base character
        return c
    return chr(int(first, 16))
def extended_unicode_model(list):
    """
    Takes as input a list of QLC-formatted words and outputs a unigram model.
    """
    # NOTE(review): the parameter shadows the builtin `list`; kept because
    # renaming it would change the keyword-argument interface.
    # Count how often each whitespace-separated segment occurs.
    segments_hash = collections.defaultdict(int)
    segment_count = 0
    for word in list:
        word = word.strip()
        segments = word.split()
        for segment in segments:
            segment_count += 1
            segments_hash[segment] += 1
    # sort segments by descending count
    segments_sorted = sorted(segments_hash.items(), key=operator.itemgetter(1), reverse=True)
    # print("Phone"+"\t"+"Int"+"\t"+"Count"+"\t"+"Frequency")
    # +"\t"+"plog")
    print("Char"+"\t"+"int"+"\t"+"Unicode name"+"\t"+"category"+"\t"+"comb class"+"\t"+"decomposition"+"\t"+"count"+"\t"+"frequency")
    for segment in segments_sorted:
        segment, count = segment[0], segment[1]
        frequency = segments_hash[segment]/segment_count
        # decimal = unicodedata.decimal(segment)
        # NOTE(review): unicodedata.* functions expect a single character;
        # multi-character segments would raise TypeError here, and unnamed
        # characters raise ValueError from unicodedata.name -- confirm that
        # QLC segments are always single named code points.
        name = unicodedata.name(segment)
        category = unicodedata.category(segment)
        combining_class = unicodedata.combining(segment)
        decomposition = unicodedata.decomposition(segment)
        print(segment+"\t"+str(ord(segment))+"\t"+name+"\t"+category+"\t"+str(combining_class)+"\t"+decomposition+"\t"+str(count)+"\t"+str(frequency))
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    Python 2 code (unicode type, dict.has_key, str-returning filter).
    ``allowed``/``allowedid``/``mapping`` are module-level tables -- TODO
    confirm their definitions.
    """
    if text == "":
        return ""
    unicodeinput = True
    if not isinstance(text, unicode):
        text = unicode(text, 'utf-8')
        unicodeinput = False
    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                # encodable but not in the allowed set: '_' for identifiers,
                # kept as-is otherwise
                ch.encode(enc, 'strict')
                if encoding == 'identifier':
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if mapping.has_key(ordinal):
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD', ch)) > 1:
                    # keep only the non-combining part of the decomposition
                    normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
                    # normalized string may contain non-letter chars too. Remove them
                    # normalized string may result to more than one char
                    if encoding == 'identifier':
                        res += ''.join([c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        # collapse runs of '_' and force a letter as the first character
        res = res.strip('_').replace('_____', '_').replace('____', '_').replace('___', '_').replace('__', '_')
        if not res.strip('_')[0] in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
def isvalidaccelerator(accelerator, acceptlist=None):
    """returns whether the given accelerator character is valid

    @type accelerator: character
    @param accelerator: A character to be checked for accelerator validity
    @type acceptlist: String
    @param acceptlist: A list of characters that are permissible as accelerators
    @rtype: Boolean
    @return: True if the supplied character is an acceptable accelerator
    """
    # Python 2 code (unicode type); `data` is a sibling module -- TODO confirm.
    assert isinstance(accelerator, unicode)
    assert isinstance(acceptlist, unicode) or acceptlist is None
    if len(accelerator) == 0:
        return False
    if acceptlist is not None:
        # explicit whitelist provided: membership decides
        acceptlist = data.normalize(acceptlist)
        if accelerator in acceptlist:
            return True
        return False
    else:
        # Old code path - ensures that we don't get a large number of regressions
        accelerator = accelerator.replace("_", "")
        if accelerator in u"-?":
            return True
        if not accelerator.isalnum():
            return False
        # We don't want to have accelerators on characters with diacritics, so let's
        # see if the character can decompose.
        decomposition = unicodedata.decomposition(accelerator)
        # Next we strip out any extra information like <this>
        decomposition = re.sub("<[^>]+>", "", decomposition).strip()
        # more than one remaining field means the character decomposes
        return decomposition.count(" ") == 0
def remove_accents(chars):
    """Divides a given string into decomposable and undecomposable characters.

    Returns (decomposable, undecomposable): two lists of (char, replacement)
    pairs.  Python 2 code -- map/filter must be eager (list-returning) and
    unichr is used.  CHAR_REPLACEMENT is a module-level table -- TODO confirm.
    """
    decomposable = []
    undecomposable = []
    for c in chars:
        de = unicodedata.decomposition(c)
        if de:
            dechars = de.split(None)
            try:
                # Only keep characters with a decimal value < 300
                dechars = map(lambda i: int(i, 16), dechars)
                dechars = filter(lambda i: i < 300, dechars)
                dechars = map(unichr, dechars)
                de = "".join(dechars)
            except (IndexError, ValueError):
                # first field was a tag such as '<compat>' (not hex)
                if ord(c) in CHAR_REPLACEMENT:
                    de = CHAR_REPLACEMENT[ord(c)]
                else:
                    # drop the tag fields, keep the remaining code points
                    dechars = filter(lambda s: s[0] != "<", dechars)
                    dechars = map(lambda i: int(i, 16), dechars)
                    dechars = map(unichr, dechars)
                    de = "".join(dechars)
                undecomposable.append((c, de))
            else:
                decomposable.append((c, de))
        else:
            # no decomposition at all: only chars with an explicit
            # replacement are reported
            if ord(c) in CHAR_REPLACEMENT:
                de = CHAR_REPLACEMENT[ord(c)]
                undecomposable.append((c, de))
    return decomposable, undecomposable
def buildCompatChars(sfd, ttf):
    """Populate the Arabic presentation-form glyphs of the fontforge font
    *sfd* as references, shaped with HarfBuzz from the font file *ttf*.
    (HarfBuzz/GLib/TTFont/psMat/shape come from module level -- TODO confirm.)
    """
    zwj = u'\u200D'  # ZERO WIDTH JOINER: forces contextual joining
    # Unicode blocks of the Arabic presentation forms A and B
    ranges = (
        (0xfb50, 0xfbb1),
        (0xfbd3, 0xfd3d),
        (0xfd50, 0xfdf9),
        (0xfdfc, 0xfdfc),
        (0xfe70, 0xfefc),
    )
    with open(ttf, "rb") as f:
        data = f.read()
    blob = HarfBuzz.glib_blob_create(GLib.Bytes.new(data))
    face = HarfBuzz.face_create(blob, 0)
    hbfont = HarfBuzz.font_create(face)
    upem = HarfBuzz.face_get_upem(face)
    HarfBuzz.font_set_scale(hbfont, upem, upem)
    HarfBuzz.ot_font_set_funcs(hbfont)
    ttfont = TTFont(ttf)
    for r in ranges:
        for c in range(r[0], r[1] + 1):
            dec = ucd.decomposition(unichr(c)).split()  # Python 2 unichr
            if dec:
                # first field is the positional tag, the rest are code points
                keyword = dec[0]
                text = u''
                for i in dec[1:]:
                    text += unichr(int(str(i), 16))
                # surround with ZWJ to request the matching contextual form
                if keyword == '<initial>':
                    text = text + zwj
                elif keyword == '<final>':
                    text = zwj + text
                elif keyword == '<medial>':
                    text = zwj + text + zwj
                components = shape(text, hbfont)
                if components:
                    glyph = sfd.createChar(c)
                    glyph.clear()
                    glyph.color = 0xff0000  # red color
                    x = 0
                    for component in components:
                        gid = component[0]
                        name = ttfont.getGlyphName(gid)
                        x_advance = component[1]
                        x_offset = component[2]
                        y_offset = component[3]
                        matrix = psMat.translate(x + x_offset, y_offset)
                        # ignore blank glyphs, e.g. space or ZWJ
                        if sfd[name].foreground or sfd[name].references:
                            glyph.addReference(name, matrix)
                        x += x_advance
                    glyph.width = x
def string2filename(s, path = None, default=u"anonymous"):
    """convert a string to a valid filename

    If *path* is given, also make the name unique inside that directory by
    appending -1..-99 and finally a UUID; returns None if all attempts
    collide.  Python 2 code (types.UnicodeType / unicode).
    """
    from unicodedata import decomposition, normalize
    # TODO: make it a better conversion?
    if type(s) != types.UnicodeType:
        s = unicode(s)
    s = s.strip()
    s = s.lower()
    if s == "":
        s = default
    # remove an eventual path
    s = s.replace("\\", "/")
    _, s = os.path.split(s)
    res = u''
    mkeys = mapping.keys()  # NOTE(review): unused variable
    for c in s:
        o = ord(c)
        if o in mapping.keys():
            # custom transliteration table wins
            res = res + mapping[o]
            continue
        if decomposition(c):
            # combining marks survive NFKD but are removed by the
            # valid_chars filter below
            res = res + normalize('NFKD', c)
        else:
            res = res + c
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    filename = ''.join(c for c in res if c in valid_chars)
    filename = filename.replace(" ", "_")
    # if path is not None we can check if there already is a file with that name
    if path is None:
        return filename
    fullpath = os.path.join(path, filename)
    if not os.path.exists(fullpath):
        return filename
    # remove the extension
    root, ext = os.path.splitext(filename)
    for idx in range(1, 100):
        filename = "%s-%d%s" % (root, idx, ext)
        if not os.path.exists(os.path.join(path, filename)):
            return filename
    for idx in range(1, 100):
        u = unicode(uuid.uuid4())
        filename = "%s-%s%s" % (root, u, ext)
        if not os.path.exists(os.path.join(path, filename)):
            return filename
    return None # we did not get a result, TODO: further checking
def mapchar(self, key):
    """Resolve code point *key* to its decomposition base code point (or a
    CHAR_REPLACEMENT fallback) and cache the result in self."""
    cached = self.get(key)
    if cached is not None:
        return cached
    # pick the right chr() for the running interpreter
    if sys.version_info >= (3, 0):
        decomp = unicodedata.decomposition(chr(key))
    else:
        decomp = unicodedata.decomposition(unichr(key))
    if not decomp:
        # no decomposition: consult the explicit replacement table
        result = CHAR_REPLACEMENT.get(key, key)
    else:
        try:
            # the leading field of the decomposition is the base code point
            result = int(decomp.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # tagged field such as '<compat>' is not hex: keep the key
            result = key
    self[key] = result
    return result
def buildCompatChars(font, hbfont):
    """Populate the Arabic presentation-form glyphs of *font* as references,
    shaping all decompositions in one batch via the module-level runHB
    helper (one line of shaped text per code point -- TODO confirm runHB).
    """
    zwj = u'\u200D'  # ZERO WIDTH JOINER: forces contextual joining
    # Unicode blocks of the Arabic presentation forms A and B
    ranges = (
        (0xfb50, 0xfbb1),
        (0xfbd3, 0xfd3d),
        (0xfd50, 0xfdf9),
        (0xfdfc, 0xfdfc),
        (0xfe70, 0xfefc),
    )
    text = u''
    codes = []
    for r in ranges:
        for c in range(r[0], r[1] + 1):
            dec = ucd.decomposition(unichr(c)).split()  # Python 2 unichr
            if dec:
                codes.append(c)
                # first field is the positional tag, the rest are code points
                keyword = dec[0]
                new_text = u''
                for i in dec[1:]:
                    new_text += unichr(int(str(i), 16))
                # surround with ZWJ to request the matching contextual form
                if keyword == '<initial>':
                    new_text = new_text + zwj
                elif keyword == '<final>':
                    new_text = zwj + new_text
                elif keyword == '<medial>':
                    new_text = zwj + new_text + zwj
                text += new_text + '\n'
    # one shaped line per entry in codes, in the same order
    lines = runHB(text, hbfont)
    i = 0
    for c in codes:
        components = lines[i]
        i += 1
        if components:
            glyph = font.createChar(c)
            glyph.clear()
            glyph.color = 0xff0000  # red color
            x = 0
            for component in components:
                name = component[0]
                x_advance = component[1]
                y_advance = component[2]
                x_offset = component[3]
                y_offset = component[4]
                matrix = psMat.translate(x + x_offset, y_offset)
                # ignore blank glyphs, e.g. space or ZWJ
                if font[name].foreground or font[name].references:
                    glyph.addReference(name, matrix)
                x += x_advance
            glyph.width = x
def make_index_value(display_name):
    """Build an ASCII index key from *display_name*: each character is
    replaced by the base character of its decomposition (if any), and only
    lowercase a-z survive into the result."""
    buf = bytearray()
    for original in display_name:
        decomp = unicodedata.decomposition(original)
        # take the first decomposition field as the base character,
        # otherwise keep the character itself
        ch = chr(int(decomp.split()[0], 16)) if decomp else original
        if 'a' <= ch <= 'z':
            buf.append(ord(ch))
    return buf.decode("ASCII")
def normalizeRtlString(s):
    """Replace Arabic presentation form B characters with their basic
    Arabic equivalents; all other characters pass through unchanged.
    Python 2 code (unichr)."""
    l = []
    for c in s:
        #If this is an arabic presentation form b character (commonly given by Windows when converting from glyphs)
        #Decompose it to its original basic arabic (non-presentational) character.
        if 0xfe70 <= ord(c) <= 0xfeff:
            d = unicodedata.decomposition(c)
            d = d.split(' ') if d else None
            # expect exactly '<tag> XXXX' with a positional tag
            if d and len(d) == 2 and d[0] in ('<initial>', '<medial>', '<final>', '<isolated>'):
                c = unichr(int(d[1], 16))
        l.append(c)
    return u"".join(l)
def asciify(string):
    """
    gets rid of pesky things like umlauts and tildes and other accents.
    ascii all the way, baby.

    Each character is replaced by the base character of its canonical
    decomposition; characters without one, or with a tagged compatibility
    decomposition (e.g. '<super> 0031'), are kept unchanged -- the original
    crashed with ValueError on those tags, and used Python-2-only unichr.
    """
    temp = u''
    for char in string:
        decomp = decomposition(char)
        first = decomp.split()[0] if decomp else ''
        if first and not first.startswith('<'):
            temp += chr(int(first, 16))
        else:
            temp += char
    return temp
def store_contextual_form():
    """Render the HTML table cells for one contextual form of the enclosing
    scope's *char*/*equiv* into current_line (char, equiv, current_line,
    contextual_form_formats and ord_mul come from the enclosing scope --
    TODO confirm this is a nested function)."""
    # print('store_contextual_form', equiv, file=sys.stderr)
    compat_disp = equiv.compat
    if equiv.compat[0] == ' ':
        # a leading plain space would collapse in HTML; show NBSP instead
        compat_disp = '\u00A0' + compat_disp[1:]
    #nonlocal current_line
    form_cells = StringIO()
    # slice of the decomposition tag, e.g. 'isolat' from '<isolated> ...',
    # selects the display format -- presumably; verify against the formats dict
    form = decomposition(char)[1:7]
    print('<td class="ch">{}{}</td>'.format(contextual_form_formats.get(form, '{}').format(compat_disp), '<small><br/>{}</small>'.format(ord_mul(compat_disp)) if len(compat_disp) >= 2 else ''), file=form_cells)
    print('<td class="ch">{}<small><br />{:04X}</small></td>'.format(char, equiv.code_point), file=form_cells)
    #if current_line.get(form, 'not found') != 'not found': print('collision', current_line[form].rstrip(), equiv, file=stderr)
    current_line[form] = form_cells.getvalue()
    form_cells.close()
def mapchar(self, key):
    """Map code point *key* to its decomposition base code point, caching
    the result in self (a dict-like object)."""
    ch = self.get(key)
    if ch is not None:
        # cached from an earlier call
        return ch
    de = unicodedata.decomposition(unichr(key))  # Python 2 unichr
    if de:
        try:
            # first field of the decomposition is the base code point
            ch = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # tagged field such as '<compat>' is not hex: keep the key
            ch = key
    else:
        # charmap: module-level fallback table -- TODO confirm
        ch = charmap.get(key, key)
    self[key] = ch
    return ch
def mapchar(self, key):
    """Map code point *key* to its decomposed base *character*, caching the
    result in self.  Note this variant returns a character, not an int."""
    ch = self.get(key)
    if ch is not None:
        # cached from an earlier call
        return ch
    ch = unichr(key)  # Python 2 unichr
    try:
        # first field of the decomposition is the base code point
        ch = unichr(int(decomposition(ch).split()[0], 16))
    except (IndexError, ValueError):
        # no decomposition (IndexError) or tagged field (ValueError):
        # fall back to the explicit replacement table
        ch = self.CHAR_REPLACEMENT.get(key, ch)
    # uncomment the following line if you want to remove remaining
    # non-ascii characters
    # if ch >= u"\x80": return None
    self[key] = ch
    return ch
def return_an_analysis_of_a_string(src, raw_number_only=False):
    """
    This function is debug-oriented and describes what lies in the string <src>.

    * raw_number_only=False : "0x61 ( =<decomposition> ) + 0x62 + ..."
    * raw_number_only=True  : "0061 0062 ..." (bare 4-digit hex code points)

    The original mixed list and str for `res`: both the raw path and the
    except path did "".join((res, ...)) on a *list*, raising TypeError on
    the first character.  All branches now append to the list and the
    result is joined once at the end.
    """
    res = []
    for char in src:
        # normal case :
        try:
            if not raw_number_only:
                decompos = ""
                if unicodedata.decomposition(char) != "":
                    decompos = " ( =" + unicodedata.decomposition(char) + " )"
                res.append(str(hex(ord(char))) + decompos)
            else:
                res.append("{0:04X}".format(ord(char)))
        # special case :
        except ValueError:
            res.append("!UNKNOWN CHARACTER! : " + str(hex(ord(char))))
    if raw_number_only:
        # matches the docstring: space-separated bare hex values
        return " ".join(res)
    return " + ".join(res)
def make_list(lo, hi):
    # risuto is Romaji for list, since lower case l is easily confused
    # it is a list of pairs (tuples) like this: (romaji, kana)
    """Scan code points [lo, hi) and return (romaji, kana) pairs for the
    plain (undecomposed, non-SMALL) LETTER characters.  Python 2 code
    (xrange/unichr)."""
    risuto = []
    for i in xrange(lo, hi):
        kana = unichr(i)
        try:
            name = unicodedata.name(kana)
        except:
            # unassigned code point: unicodedata.name raises ValueError
            name = 'NONE'
        if '' == unicodedata.decomposition(kana) and 'LETTER' in name and 'SMALL' not in name:
            # names look like "<SYLLABARY> LETTER <ROMAJI>"
            (syllabary, letter, romaji) = name.split(' ')
            if 1 == len(romaji):
                romaji += ' ' # hack, do this with format
            risuto.append((romaji, kana))
    return(risuto)
def __missing__(self, key):
    """dict hook: compute a replacement code point for unseen *key* and
    cache it.  Python 2 code (unichr)."""
    ch = self.get(key)
    if ch is not None:
        return ch
    try:
        de = unicodedata.decomposition(unichr(key))
        # rpartition: p1 = everything before the last field, p2 = last field
        p1, _, p2 = de.rpartition(' ')
        if int(p2, 16) == 0x308:
            # U+0308 COMBINING DIAERESIS: keep any explicitly stored mapping
            ch = self.get(key)
        else:
            # NOTE(review): p1 may itself contain several fields (or be ''
            # for single-field decompositions), in which case int() raises
            # ValueError and the fallback below applies.
            ch = int(p1, 16)
    except (IndexError, ValueError):
        ch = self.get(key, key)
    self[key] = ch
    return ch
def __missing__(self, k):
    """dict hook: resolve unseen code point *k* via STATIC_MAP or its
    Unicode decomposition, caching the result.  Python 2 code (unichr)."""
    if k in self:
        return self[k]
    v = k
    if k in self.STATIC_MAP:
        # explicit override table wins
        v = self.STATIC_MAP[k]
    else:
        de = unicodedata.decomposition(unichr(k))
        if de:
            try:
                # first decomposition field is the base code point
                v = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # tagged field such as '<compat>' is not hex: keep k
                pass
    self[k] = v
    return v
def overview(tree_item):
    """ Returns an overview of the character """
    # tree_item.obj holds the character; TEMPLATE is a module-level format
    # string taking the fields below in this exact order -- TODO confirm.
    char = tree_item.obj
    # the second argument of decimal/digit/numeric is the default returned
    # for characters where the property is undefined
    return TEMPLATE.format(unicodedata.name(char, '<NO NAME AVAILABLE>'),
                           char,
                           unicodedata.decimal(char, ''),
                           unicodedata.digit(char, ''),
                           unicodedata.numeric(char, ''),
                           unicodedata.category(char),
                           unicodedata.bidirectional(char),
                           unicodedata.combining(char),
                           unicodedata.east_asian_width(char),
                           unicodedata.mirrored(char),
                           unicodedata.decomposition(char))
def asciified(text):
    """
    Similar to ``text`` but with none ASCII letters replaced by their
    decomposed ASCII equivalent.

    Python 2 code (unicode type, unichr); raises ValueError for non-unicode
    input.
    """
    assert text is not None
    if not isinstance(text, unicode):
        raise ValueError(u"text must be unicode instead of %s" % type(text))
    result = u""
    for ch in text:
        decomp = unicodedata.decomposition(ch)
        if decomp:
            # first field of the decomposition is the base character.
            # NOTE(review): raises ValueError for tagged decompositions such
            # as '<super> 0031' -- confirm inputs avoid those.
            result += unichr(int(decomp.split()[0], 16))
        else:
            result += ch
    return result
def __missing__(self, key):
    """dict hook: map unseen code point *key* to its decomposition base
    code point (or a CHAR_REPLACEMENT entry), caching the result.
    Runs on Python 2 and 3 via six."""
    ch = self.get(key)
    if ch is not None:
        return ch
    try:
        de = unicodedata.decomposition(six.PY3 and chr(key) or unichr(key))
        # split(None, 1) yields exactly two fields or this unpack raises;
        # tagged fields such as '<compat>' fail int() with ValueError
        p1, p2 = [int(x, 16) for x in de.split(None, 1)]
        if p2 == 0x308:
            # U+0308 COMBINING DIAERESIS: prefer the explicit table
            ch = self.CHAR_REPLACEMENT.get(key)
        else:
            ch = int(p1)  # p1 is already an int; int() is a no-op here
    except (IndexError, ValueError):
        ch = self.CHAR_REPLACEMENT.get(key, key)
    self[key] = ch
    return ch
# Smoke tests for the unicodedata module (Python 2 script: note the print
# statement).  Each assert checks a documented property value.
assert unicodedata.numeric(u'A', None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0
assert unicodedata.decimal(u'A', None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b', None) is None
assert unicodedata.decimal(u'\u2468', None) is None
assert unicodedata.category(u'\uFFFE') == 'Cn'
assert unicodedata.category(u'a') == 'Ll'
assert unicodedata.category(u'A') == 'Lu'
assert unicodedata.bidirectional(u'\uFFFE') == ''
assert unicodedata.bidirectional(u' ') == 'WS'
assert unicodedata.bidirectional(u'A') == 'L'
assert unicodedata.decomposition(u'\uFFFE') == ''
assert unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034'
assert unicodedata.mirrored(u'\uFFFE') == 0
assert unicodedata.mirrored(u'a') == 0
assert unicodedata.mirrored(u'\u2201') == 1
assert unicodedata.combining(u'\uFFFE') == 0
assert unicodedata.combining(u'a') == 0
assert unicodedata.combining(u'\u20e1') == 230
print 'done.'
import unicodedata

# Demo (Python 2 print statements): show category, decomposition and
# numeric values for a few sample characters; expected output below.
for char in [u"A", u"-", u"1", u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"]:
    print repr(char),
    print unicodedata.category(char),
    print repr(unicodedata.decomposition(char)),
    print unicodedata.decimal(char, None),
    print unicodedata.numeric(char, None)

## u'A' Lu '' None None
## u'-' Pd '' None None
## u'1' Nd '' 1 1.0
## u'Ö' Lu '004F 0308' None None
def decomposition(self):
    """Return unicodedata.decomposition for the wrapped character self.c."""
    wrapped_char = self.c
    return unicodedata.decomposition(wrapped_char)
# Regression checks for the unicodedata module using the old
# test_support.verify helper (Python 2 script: note the print statement).
verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)
verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)
verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')
verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')
verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')
verify(unicodedata.mirrored(u'\uFFFE') == 0)
verify(unicodedata.mirrored(u'a') == 0)
verify(unicodedata.mirrored(u'\u2201') == 1)
verify(unicodedata.combining(u'\uFFFE') == 0)
verify(unicodedata.combining(u'a') == 0)
verify(unicodedata.combining(u'\u20e1') == 230)
print 'ok'
def baseNormalize(text):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    >>> baseNormalize(123)
    '123'

    >>> baseNormalize(u'a\u0fff')
    'afff'

    >>> baseNormalize(u"foo\N{LATIN CAPITAL LETTER I WITH CARON}")
    'fooI'

    >>> baseNormalize(u"\u5317\u4EB0")
    '53174eb0'
    """
    # Python 2 code (basestring).  `allowed`, `UNIDECODE_LIMIT`, `CHAR`
    # (lazy per-page cache) and `NULLMAP` are module globals -- TODO confirm.
    if not isinstance(text, basestring):
        # This most surely ends up in something the user does not expect
        # to see. But at least it does not break.
        return repr(text)
    text = text.strip()
    res = []
    for ch in text:
        if ch in allowed:
            # ASCII chars, digits etc. stay untouched
            res.append(ch)
        else:
            ordinal = ord(ch)
            if ordinal < UNIDECODE_LIMIT:
                h = ordinal >> 8    # unidecode table page (high byte)
                l = ordinal & 0xff  # row within the page (low byte)
                c = CHAR.get(h, None)
                if c == None:
                    # page not loaded yet: import unidecode.xNN lazily
                    try:
                        mod = __import__('unidecode.x%02x' % (h), [], [], ['data'])
                    except ImportError:
                        CHAR[h] = NULLMAP
                        res.append('')
                        continue
                    CHAR[h] = mod.data
                    try:
                        res.append(mod.data[l])
                    except IndexError:
                        res.append('')
                else:
                    try:
                        res.append(c[l])
                    except IndexError:
                        res.append('')
            elif decomposition(ch):
                normalized = normalize('NFKD', ch).strip()
                # string may contain non-letter chars too. Remove them
                # string may result to more than one char
                res.append(''.join([c for c in normalized if c in allowed]))
            else:
                # hex string instead of unknown char
                res.append("%x" % ordinal)
    return ''.join(res).encode('ascii')
def setUpModule():
    """Generate a CSV dump of unicodedata properties for every assigned,
    non-control code point and load it into an EXASOL test schema via the
    external EXAplus console (depends on udf.opts, utf8encoder and the
    EXAPLUS environment -- not runnable standalone)."""
    log = logging.getLogger('unicodedata')
    log.info('generating unicodedata CSV')
    with tempfile.NamedTemporaryFile(prefix='unicode-', suffix='.csv') as csvfile:
        c = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        for i in xrange(sys.maxunicode + 1):  # Python 2 (xrange/unichr)
            if i >= 5024 and i <= 5119:
                continue # the Unicode Cherokee-Block is broken in Python 2.7 and Python 3.4 (maybe also 3.5)
            u = unichr(i)
            if unicodedata.category(u).startswith('C'):
                # [Cc]Other, Control
                # [Cf]Other, Format
                # [Cn]Other, Not Assigned
                # [Co]Other, Private Use
                # [Cs]Other, Surrogate
                continue
            row = (
                i, # INT 0-1114111
                unicodedata.name(u, 'UNICODE U+%08X' % i), # VARCHAR(100) ASCII
                u, # VARCHAR(1) UNICODE
                u.upper(), # VARCHAR(1) UNICODE
                u.lower(), # VARCHAR(1) UNICODE
                unicodedata.decimal(u, None), # INT
                unicodedata.numeric(u, None), # DOUBLE
                unicodedata.category(u), # VARCHAR(3) ASCII
                unicodedata.bidirectional(u), # VARCHAR(3) ASCII
                unicodedata.combining(u), # VARCHAR(3) ASCII
                unicodedata.east_asian_width(u), # VARCHAR(1) ASCII
                # NOTE(review): this tests the *function object*, which is
                # always truthy -- surely meant unicodedata.mirrored(u)
                bool(unicodedata.mirrored), # BOOLEAN
                unicodedata.decomposition(u), # VARCHAR(10) ASCII
                unicodedata.normalize('NFC', u), # VARCHAR(3) UNICODE
                unicodedata.normalize('NFD', u), # VARCHAR(3) UNICODE
                unicodedata.normalize('NFKC', u), # VARCHAR(3) UNICODE
                unicodedata.normalize('NFKD', u), # VARCHAR(3) UNICODE
            )
            c.writerow(utf8encoder(row))
        csvfile.flush()
        log.info('loading CSV')
        sql = ''' DROP SCHEMA utest CASCADE; CREATE SCHEMA utest; CREATE TABLE unicodedata ( codepoint INT NOT NULL, name VARCHAR(100) ASCII, uchar VARCHAR(1) UTF8, to_upper VARCHAR(1) UTF8, to_lower VARCHAR(1) UTF8, decimal_value INT, numeric_value INT, category VARCHAR(3) ASCII, bidirectional VARCHAR(3) ASCII, combining VARCHAR(10) ASCII, east_asian_width VARCHAR(2) ASCII, mirrored BOOLEAN, decomposition VARCHAR(100) ASCII, NFC VARCHAR(10) UTF8, NFD VARCHAR(10) UTF8, NFKC VARCHAR(20) UTF8, NFKD VARCHAR(20) UTF8 ); IMPORT INTO unicodedata FROM LOCAL CSV FILE '%s' ROW SEPARATOR = 'CRLF'; ''' % os.path.join(os.getcwd(), csvfile.name)
        cmd = '''%(exaplus)s -c %(conn)s -u sys -P exasol -no-config -autocommit ON -L -pipe''' % {
            'exaplus': os.environ.get(
                'EXAPLUS',
                '/usr/opt/EXASuite-4/EXASolution-4.2.9/bin/Console/exaplus'),
            'conn': udf.opts.server
        }
        env = os.environ.copy()
        env['PATH'] = '/usr/opt/jdk1.8.0_latest/bin:' + env['PATH']
        exaplus = subprocess.Popen(cmd.split(), env=env, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _err = exaplus.communicate(sql)
        if exaplus.returncode != 0:
            log.critical('EXAplus error: %d', exaplus.returncode)
            log.error(out)
        else:
            log.debug(out)
def get_ascii_char(c):
    """Return the base character of c's canonical decomposition, or *c*
    itself when it has none.

    Characters whose decomposition starts with a compatibility tag such as
    '<compat>' are returned unchanged -- the original tried
    int('0x<compat>', 0) and crashed with ValueError.
    """
    s = ud.decomposition(c)
    if s == '':  # for an indecomposable character, it returns ''
        return c
    first = s.split()[0]
    if first.startswith('<'):
        # tagged compatibility decomposition: no plain base character
        return c
    code = int('0x' + first, 0)
    return chr(code)
def filter_accents(text):
    """Return the set of characters in text.lower() that have a non-empty
    Unicode decomposition (i.e. accented/precomposed characters)."""
    found = set()
    for ch in text.lower():
        if ud.decomposition(ch) != '':
            found.add(ch)
    return found
def decompose(c):
    """Apply NFKD normalization to *c* only when its decomposition carries a
    compat/wide/narrow/noBreak tag; every other character is returned
    unchanged (canonical decompositions are deliberately left composed)."""
    decomp = unicodedata.decomposition(c)
    if not decomp:
        return c
    tag = decomp.split(None, 1)[0]
    if tag in ('<compat>', '<wide>', '<narrow>', '<noBreak>'):
        return unicodedata.normalize('NFKD', c)
    return c
def simplechar(ch):
    """Return the base character of ch's canonical decomposition, or *ch*
    itself when it has none.

    Characters whose decomposition starts with a compatibility tag such as
    '<compat>' are returned unchanged -- the original crashed with
    ValueError trying to parse the tag as hex.
    """
    dec = unicodedata.decomposition(ch)
    if len(dec) == 0:
        return ch
    first = dec.split(' ')[0]
    if first.startswith('<'):
        # tagged compatibility decomposition: no plain base character
        return ch
    return chr(int(first, 16))
# Data-driven unicodedata checks (chunk of a larger script): `tests` maps
# expected value -> code point hex string for each property -- TODO confirm
# it is loaded earlier in the file.
print("categories ok")
for comb, cp in tests["combinings"].items():
    assert int(comb) == unicodedata.combining(chr(int(cp, 16)))
print("combining ok")
for decimal, cp in tests["decimals"].items():
    if decimal:
        # expected value is stored as a Python literal, hence eval
        assert eval(decimal) == unicodedata.decimal(chr(int(cp, 16)))
print("decimals ok")
for decomp, cp in tests["decompositions"].items():
    assert decomp == unicodedata.decomposition(chr(int(cp, 16)))
print("decomposition ok")
for digit, cp in tests["digits"].items():
    if digit:
        assert eval(digit) == unicodedata.digit(chr(int(cp, 16)))
print("digits ok")
for name, cp in tests["names"].items():
    assert name == unicodedata.name(chr(int(cp, 16)))
print("names ok")
for numeric, cp in tests["numerics"].items():
""" import sys from unicodedata import decomposition from string import ascii_uppercase ocorrencias = {} for linha in file(sys.argv[1]): for car_uni in linha.decode('utf-8'): # converter linha para unicode if not car_uni.strip(): continue # ignorar brancos try: # primeiro tentamos converter para ASCII car = car_uni.encode('ascii') except UnicodeEncodeError: # se não dá certo, apelamos partes = decomposition(car_uni) if partes: # se o caractere pode ser decomposto... ascii = partes.split()[ 0] # a primeira parte é o código ASCII... car = chr(int(ascii, 16)) # converter o ASCII hexadecimal else: # se o caractere não pode ser decomposto... continue # então não tem correspondente na tabela ASCII car = car.upper() # converter para maiúsculas if car in ascii_uppercase: # finalmente, podemos computar a ocorrência if car in ocorrencias: ocorrencias[car] += 1 else: ocorrencias[car] = 1
def decompose(s):
    """Return the decomposition of character *s* as a string of characters,
    or '' when *s* has no decomposition.

    Compatibility tags such as '<compat>' are skipped -- the original
    passed them to int(x, 16) and crashed with ValueError.
    """
    return "".join(
        chr(int(token, 16))
        for token in unicodedata.decomposition(s).split()
        if not token.startswith("<"))
async def unicode(self, ctx, *, arg):
    """Returns the information on a Unicode character or named character."""
    # Resolve *arg* to a list of characters: single char, hex code point
    # (U+xxxx / \uxxxx / \xxx / 0x...), official Unicode name, or -- as a
    # fallback -- each character of the raw input.
    if len(arg) == 1:
        chars = [arg]
    else:
        #if " " in arg[1:-1] or "," in arg[1:-1] or ";" in arg[1:-1]:
        #    arg = arg[:0] + arg[1:-1].replace(",", " ").replace(";", " ") + arg[-1:]
        # try to find what character is meant
        # if starts with "U+", "\x", "\u", it"s hex
        if arg.upper().startswith("U+") or arg.upper().startswith(
                "\\U") or arg.upper().startswith("\\X"):
            arg = "0x" + arg[2:].strip()
        try:
            if arg.lower().startswith("0x"):
                arg = arg[2:]
            chars = [chr(int(arg, 16))]
        except ValueError:
            # otherwise, use name lookup
            try:
                chars = [unicodedata.lookup(arg)]
            except KeyError:
                chars = arg
                #await ctx.send(error("Character not found: `{}`".format(arg)))
                #return
    embeds = []
    n = 0
    for char in chars:
        n += 1
        value = ord(char)
        name = unicodedata.name(char, None)
        #name_url = name.lower().replace(" ", "-")
        # field label -> property value (None / '' values are skipped below)
        dt = {}
        dt["Character"] = char
        dt["Name"] = name  # str or None
        dt["Decimal"] = unicodedata.decimal(char, None)  # int or None
        dt["Digit"] = unicodedata.digit(char, None)  # int or None
        dt["Numeric"] = unicodedata.numeric(char, None)  # float or None
        dt["Category"] = unicodedata.category(char)  # str
        dt["Bidirectional"] = unicodedata.bidirectional(char)  # str
        dt["Combining class"] = unicodedata.combining(char)  # str
        dt["East Asian width"] = unicodedata.east_asian_width(char)  # str
        dt["Mirrored"] = unicodedata.mirrored(char)  # int
        dt["Decomposition"] = unicodedata.decomposition(char)  # str
        embed = discord.Embed(
            title="Unicode codepoints of: {input}".format(input=arg),
            #url="https://emojipedia.org/{}/".format(name_url),
            description="About Unicode U+{codepoint:04X}.".format(
                codepoint=value))
        for k, v in dt.items():
            if not v is None and len(str(v)):
                # values that are pure (unicode) whitespace would render as
                # an empty embed field; quote them so they stay visible
                if len(
                        str(v).strip(
                            " \t\r\n\v\f\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
                        )) == 0:
                    v = '"{}"'.format(v)
                embed.add_field(name=k, value=str(v), inline=False)
        embed.set_footer(text="Character {index} of {count}".format(
            index=n, count=len(arg)))
        embeds.append(embed)
    # paginate when more than one character was resolved
    if len(embeds) > 1:
        await menu(ctx, embeds, DEFAULT_CONTROLS)
    else:
        await ctx.send(embed=embeds[0])
def _info_on_char(self, reply, c): try: name = unicodedata.name(c) except ValueError: name = "(no name in database)" cat = unicodedata.category(c) replytxt = u"U+%04X" % (ord(c), ) if not cat.startswith("C"): replytxt += " (%s)" % c replytxt += ": %s" % name cats = { "Cc": "Other, Control", "Cf": "Other, Format", "Cn": "Other, Not Assigned", "Co": "Other, Private Use", "Cs": "Other, Surrogate", "LC": "Letter, Cased", "Ll": "Letter, Lowercase", "Lm": "Letter, Modifier", "Lo": "Letter, Other", "Lt": "Letter, Titlecase", "Lu": "Letter, Uppercase", "Mc": "Mark, Spacing Combining", "Me": "Mark, Enclosing", "Mn": "Mark, Nonspacing", "Nd": "Number, Decimal Digit", "Nl": "Number, Letter", "No": "Number, Other", "Pc": "Punctuation, Connector", "Pd": "Punctuation, Dash", "Pe": "Punctuation, Close", "Pf": "Punctuation, Final quote", "Pi": "Punctuation, Initial quote", "Po": "Punctuation, Other", "Ps": "Punctuation, Open", "Sc": "Symbol, Currency", "Sk": "Symbol, Modifier", "Sm": "Symbol, Math", "So": "Symbol, Other", "Zl": "Separator, Line", "Zp": "Separator, Paragraph", "Zs": "Separator, Space", } try: replytxt += ", category: %s" % cats[cat] except KeyError: log.err("No category found for %s" % cat) try: replytxt += ", numeric value %s" % unicodedata.numeric(c) except ValueError: pass decomp = unicodedata.decomposition(c) if decomp: replytxt += ", decomposition: " + decomp reply(replytxt)
""" Test script for the unicodedata module. Written by Marc-Andre Lemburg ([email protected]). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" from test_support import verify, verbose import sha encoding = 'utf-8' def test_methods(): h = sha.sha() for i in range(65536): char = unichr(i) data = [ # Predicates (single char) char.isalnum() and u'1' or u'0', char.isalpha() and u'1' or u'0', char.isdecimal() and u'1' or u'0', char.isdigit() and u'1' or u'0', char.islower() and u'1' or u'0', char.isnumeric() and u'1' or u'0', char.isspace() and u'1' or u'0', char.istitle() and u'1' or u'0', char.isupper() and u'1' or u'0', # Predicates (multiple chars) (char + u'abc').isalnum() and u'1' or u'0', (char + u'abc').isalpha() and u'1' or u'0', (char + u'123').isdecimal() and u'1' or u'0', (char + u'123').isdigit() and u'1' or u'0', (char + u'abc').islower() and u'1' or u'0', (char + u'123').isnumeric() and u'1' or u'0',
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base
    ASCII letters. Output is ASCII encoded string (or char) with only ASCII
    letters, digits, punctuation and whitespace characters. Case is preserved.

    ``encoding='humanascii'`` keeps characters from the module-level
    ``allowed`` set; ``encoding='identifier'`` keeps only ``allowedid``
    characters and collapses everything else into underscores. For any other
    value, characters encodable in that codec pass through unchanged.
    Returns ``str`` when given ``str``; otherwise the input is decoded as
    UTF-8 and the result is re-encoded to UTF-8 ``bytes``.
    """
    if text == "":
        return ""
    unicodeinput = True
    if not isinstance(text, str):
        text = str(text, 'utf-8')
        unicodeinput = False

    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        try:
            ch.encode(enc, 'strict')
            if encoding == 'identifier':
                res += '_'
            else:
                res += ch
        except UnicodeEncodeError:
            ordinal = ord(ch)
            if ordinal in mapping:
                # try to apply custom mappings
                res += mapping.get(ordinal)
            elif decomposition(ch) or len(normalize('NFKD', ch)) > 1:
                # BUG FIX: the old code did str(filter(...)), which under
                # Python 3 yields "<filter object at 0x...>" (a Python 2
                # relic, where filter() on a string returned a string).
                # Join the non-combining characters of the NFKD form instead.
                normalized = ''.join(
                    c for c in normalize('NFKD', ch)
                    if not combining(c)).strip()
                # normalized string may contain non-letter chars too. Remove them
                # normalized string may result to more than one char
                if encoding == 'identifier':
                    res += ''.join(
                        [c for c in normalized if c in allowedid])
                else:
                    res += ''.join([c for c in normalized if c in allowed])
            else:
                # hex string instead of unknown char
                res += "%x" % ordinal
    if encoding == 'identifier':
        res = res.strip('_').replace('_____', '_').replace('____', '_').replace(
            '___', '_').replace('__', '_')
        # Guard against an empty result before indexing the first character
        # (the old code raised IndexError when everything was stripped).
        stripped = res.strip('_')
        if not stripped or stripped[0] not in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
def handle_unicode_characters(s):
    """Handle unicode characters appearing in string.

    Some do actually contain valuable information for NLP applications. But
    there is also a lot of "unnecessary" unicode in scientific texts (at least
    from an Software-NER perspective). It can either be dropped, or different
    codes can be summarized by one characters.

    Args:
        s (string): string to transform

    Returns:
        string: unicode 'normalized' string
        list: [index, char] pairs of characters that were dropped, where
              index is the character's position in the *original* sequence
              (running_count is advanced even for dropped characters so the
              indices stay aligned)
    """
    dropped_char_indices = []
    running_count = 0  # position in the input; advanced once per consumed char
    out_s = ''
    for char in s:
        if re.match(r"[A-Za-z0-9\s]",
                    char) is not None or char in string.punctuation:
            # keep "normal" chars
            out_s += char
            running_count += 1
        else:
            # here we will deal with unicode
            if char in ['©', '™', '®']:
                # 'TradeMarks' are tricky but often used to indicate external equipment in studies
                out_s += '™'
                running_count += 1
                continue
            if char == '°':
                # Temperatures are almost always indicated by °
                out_s += char
                running_count += 1
                continue
            # some unicodes are combined and based on 'normal' characters -> we want to keep the base characters, e.g. á -> a
            unicode_matched = False
            #char_
            u_map = unicodedata.decomposition(char)
            if u_map and len(u_map) > 1:
                # keep only hex codepoint fields, dropping tags like "<compat>"
                split_codes = [
                    code for code in u_map.split()
                    if not re.match(r'<.*>', code)
                ]
                for code in split_codes:
                    code_char = chr(int(code, 16))
                    # emit the first ASCII-letter component of the decomposition
                    if re.match(r'[a-zA-Z]', code_char):
                        out_s += code_char  # TODO
                        unicode_matched = True
                        running_count += 1
                        break
            if unicode_matched:
                continue
            # normalized unicode for everything else just to be save..
            char = unicodedata.normalize('NFC', char)
            if len(char) > 1:
                # NOTE(review): prints a RuntimeWarning *object*; warnings.warn
                # would be the conventional mechanism — confirm intent.
                # NOTE(review): running_count is NOT advanced here, so later
                # dropped-char indices shift relative to the raw input.
                print(
                    RuntimeWarning(
                        "Unkown unicode character with length > 1: {} -- ignored"
                        .format(char)))
                continue
            # we want to keep basic greek letters no matter what
            if char == 'µ':
                # yes, they are actually different: this is the 'micro sign'
                char = 'μ'  # this the greek letter..
            # Greek letter ranges: lowercase U+03B1..U+03CA, uppercase U+0391..U+03AA
            if (ord(char) >= 945 and ord(char) <= 970) or (ord(char) >= 913
                                                           and
                                                           ord(char) <= 938):
                out_s += char
                running_count += 1
                continue
            # the rest is based on unicode categories some of which are considered important and others are not
            category = unicodedata.category(char)
            if category == 'Pi':
                # initial quotes: keep single-quote variants, fold the rest to '“'
                if ord(char) == 8216 or ord(char) == 8219:
                    out_s += char
                else:
                    out_s += '“'
                running_count += 1
            elif category == 'Pf':
                # final quotes: fold to '’' or '”'
                if ord(char) == 8217:
                    out_s += '’'
                else:
                    out_s += '”'
                running_count += 1
            elif category == 'Pd':
                # all dash variants become ASCII hyphen
                char = '-'
                out_s += char
                running_count += 1
            elif category == 'Sc':
                # currency symbols are kept
                out_s += char
                running_count += 1
            elif category in ['Pe', 'Cf', 'Ps', 'So', 'Sk', 'No']:
                # considered noise: record position and drop
                dropped_char_indices.append([running_count, char])
                running_count += 1
            elif category == 'Lm':
                # modifier letters: fold the apostrophe-like range to "'"
                # NOTE(review): reconstructed indentation — assumed every Lm
                # char is emitted (converted or not); verify against original.
                if ord(char) >= 697 and ord(char) <= 719:
                    char = "'"
                running_count += 1
                out_s += char
            elif category in ['Lu', 'Ll', 'Po']:
                # keep
                out_s += char
                running_count += 1
            elif category == 'Sm':
                # Mathsymbols, TODO: handle them better?
                out_s += char
                running_count += 1
                # NOTE(review): unicode_in_sent is set but never read in this
                # function — looks vestigial.
                unicode_in_sent = True
            else:
                #print("Encountered an unhandled unicode character: {} - DROPPED".format(char))
                dropped_char_indices.append([running_count, char])
                running_count += 1
    return out_s, dropped_char_indices
with codecs.open(filename_in, encoding='utf-8') as fin: with open(filename_out, 'w') as fout: with codecs.open(filename_err, encoding='utf-8', mode='w') as ferr: ascii = '' err = False ln = 1 badnames = 0 goodnames = 0 line = fin.readline() while line: for c in line: d = unicodedata.decomposition(c) if d: # Character is a unicode composition. ascii = ascii + decode(unicodedata.name(c)) else: # Character is not a unicode decomposition. (replacement, handled, reason) = repl(c) if handled: # We want to keep this char.
def _escape(s):
    """Return *s* with non-ASCII characters rendered as backslash escapes."""
    return s.encode('unicode-escape').decode('ascii')


def _idempotent_ignoring_space(profile, value):
    """True if enforcing the profile twice yields the same stripped result."""
    once = profile.enforce(value)
    twice = profile.enforce(once)
    return once.strip() == twice.strip()


# Scan the whole codepoint space for characters on which profile enforcement
# is not idempotent; tally compatibility decompositions, print the rest.
results = Counter()
profile = precis.get_profile('NicknameCaseMapped:ToLower')
for codepoint in range(0x0110000):
    candidate = chr(codepoint)
    try:
        if _idempotent_ignoring_space(profile, candidate):
            continue
        decomp = unicodedata.decomposition(candidate)
        kind = decomp.split()[0]
        if kind.startswith('<'):
            # compatibility decomposition: just count by tag
            results[kind] += 1
        else:
            print(_escape(candidate), unicodedata.name(candidate))
    except UnicodeEncodeError:
        # surrogates and other unencodable codepoints are skipped
        pass
print(results)
def _char_translate(c): base = unicodedata.decomposition(c).split(" ")[0].strip('0') return bytes.fromhex(base).decode("utf-8")
def getDecomposition(cls, char):
    u"""
    <doc>
    Decomposition.

    Return the canonical decomposition of *char* as a list of characters,
    or None when the character does not decompose. A small override table
    swaps the combining mark reported by the Unicode database for the one
    this project expects (e.g. comma-accent letters, dotted-l forms).
    </doc>
    """
    charDec = ord(char)
    decompString = unicodedata.decomposition(char)
    if decompString:
        decompHex = decompString.split(' ')
        decomp = [TX.hex2dec(i) for i in decompHex]
        # {base codepoint: {mark from UCD: replacement mark}}
        # BUG FIX: key 291 (u'ģ') appeared twice in this literal; the first
        # entry ({807: 806}) was silently shadowed by the later gcommaccent
        # entry ({807: 786}), so only the surviving mapping is kept here.
        overrides = {
            290: {807: 806},  # u'Ģ'
            325: {807: 806},  # u'Ņ'
            311: {807: 806},  # u'ķ'
            310: {807: 806},  # u'Ķ'
            342: {807: 806},  # u'Ŗ'
            343: {807: 806},  # u'ŗ'
            536: {807: 806},  # u'Ș'
            537: {807: 806},  # u'ș'
            538: {807: 806},  # u'Ț'
            539: {807: 806},  # u'ț'
            316: {807: 806},  # u'ļ'
            315: {807: 806},  # u'Ļ'
            291: {807: 786},  # gcommaccent (u'ģ')
            319: {183: 775},  # u'Ŀ'
            320: {183: 775},  # u'ŀ'
        }
        for x, u in enumerate(decomp):
            if charDec in overrides and u in overrides[charDec]:
                decomp[x] = overrides[charDec][u]
        # only numeric codepoints become characters; tags etc. are dropped
        charList = []
        for d in decomp:
            if isinstance(d, int):
                charList.append(unichr(d))
        return charList
    return None
def add_accentless_fallbacks(pattern):
    r"""Modifies a regexp pattern to also match accentless text.

    >>> add_accentless_fallbacks(r'Arrêté')
    'Arr[êe]t[ée]'
    >>> add_accentless_fallbacks(r'foo|bar')
    'foo|bar'
    >>> add_accentless_fallbacks(r'm[êè]me')
    'm[êèe]me'
    >>> add_accentless_fallbacks(r'm[êèe]me')
    'm[êèe]me'
    >>> add_accentless_fallbacks(r'\[Décret')
    '\\[D[ée]cret'
    >>> add_accentless_fallbacks(r'\[(?P<blé>Décret[ée])?(?(blé) à | a )(?P=blé)')
    '\\[(?P<blé>D[ée]cret[ée])?(?(blé) [àa] | a )(?P=blé)'
    >>> add_accentless_fallbacks(r'(?# commenté )')
    '(?# commenté )'
    >>> add_accentless_fallbacks(r'[\]é]')
    '[\\]ée]'
    """

    def remove_accent(c):
        # first field of the canonical decomposition is the base codepoint
        return chr(int(decomposition(c).split(' ', 1)[0], 16))

    r = []
    # NOTE(review): sre_parse is an undocumented internal of the re module
    # (renamed re._parser in Python 3.11+); Tokenizer drives this scan.
    source = sre_parse.Tokenizer(pattern)
    sourceget = source.get
    while True:
        this = source.next
        if this is None:
            break  # end of pattern
        sourceget()
        if this[0] == '\\':
            # escaped literal: copy through untouched
            r.append(this)
        elif this == '[':
            # character class: collect members, then append accentless
            # variants of any accented (decomposable) members
            elements = []
            accented = set()
            while True:
                this = sourceget()
                if this in (None, ']'):
                    break
                elements.append(this)
                if this[0] == '\\':
                    continue
                if decomposition(this):
                    accented.add(this)
            if accented:
                elements_set = set(elements)
                for c in sorted(accented):
                    accentless = remove_accent(c)
                    if accentless not in elements_set:
                        elements.append(accentless)
                        elements_set.add(accentless)
            r.append('[')
            r.extend(elements)
            if this:
                r.append(']')
        elif this == '(' and source.match('?'):
            # (?...) extension group: copy its prefix verbatim so group
            # names, comments and conditionals are never rewritten
            this = sourceget()
            if this is None:
                this = ''
            elif this == 'P':
                if source.next == '<':
                    # named group
                    this += source.getuntil('>') + '>'
                elif source.next == '=':
                    # named backreference
                    this += source.getuntil(')') + ')'
            elif this == '#':
                # comment
                this += source.getuntil(')') + ')'
            elif this == '(':
                # conditional backreference group
                this += source.getuntil(')') + ')'
            r.append('(?' + this)
        else:
            # bare literal: if it is accented, widen it to a class that also
            # accepts its accentless base character
            if decomposition(this):
                this = '[%s%s]' % (this, remove_accent(this))
            r.append(this)
    return ''.join(r)
import unicodedata

# Demo of the main unicodedata queries on a handful of sample characters.
u_umlaut = '\u00fc'  # ü (precomposed)
accented = 'resume\u0301'  # 'resume' + combining acute

print(unicodedata.bidirectional('$'))
print(unicodedata.category('$'))
print(unicodedata.combining('7'))
print(unicodedata.decimal('1'))
print(unicodedata.decomposition(u_umlaut))
print(unicodedata.digit('7'))
print(unicodedata.lookup('COPYRIGHT SIGN'))
print(unicodedata.mirrored('('))
print(unicodedata.name(u_umlaut))
# NFC composes 'e' + accent into one char; NFD keeps them separate.
nfc_form = unicodedata.normalize('NFC', accented)
nfd_form = unicodedata.normalize('NFD', accented)
print(len(nfc_form))
print(len(nfd_form))
print(unicodedata.normalize('NFKD', '\u2165'))
print(unicodedata.numeric('\u2157'))
print(unicodedata.unidata_version)
def remove_accent(c):
    """Map an accented character to its bare base character (e.g. 'é' -> 'e')."""
    base_hex, _, _rest = decomposition(c).partition(' ')
    return chr(int(base_hex, 16))
print(unicodedata.normalize( 'NFD', s1)) # NFD使组合字符拆开为两个字符,这里'é'被拆为'e'和重音符,即输出结果为:'cafeˋ' print( unicodedata.normalize('NFD', s2) ) # s2最后两个字符为'e'和'\u0301',(我不知道内部机理是什么,接下来的叙述是我自己的理解,不知道正确与否),直接将'\u0301'解释为重音符'ˋ',输出为'cafeˋ' print("-------------------") print(unicodedata.east_asian_width('我')) print(unicodedata.east_asian_width('1')) print(unicodedata.east_asian_width('a')) print(unicodedata.east_asian_width('ﷺ')) # F:fullwidth,H:halfwidth,W:wide,Na:narrow,A:ambiguous(不明确),N:natural(正常) print(unicodedata.mirrored('薛')) # 不懂 print(unicodedata.decomposition('ﷺ')) # 可分解 print(unicodedata.decomposition('é')) # 可分解 print(unicodedata.decomposition('e')) # 不可分解,所以返回空值(输出就是一片空白) # 判断 Unicode 字符串 unistr 是否为正规形式 form。 form 的有效值为 'NFC', 'NFKC', 'NFD' 和 'NFKD' # 3.8 # print(unicodedata.is_normalized('NFC','a')) # true # print(unicodedata.is_normalized('NFC','ﷺ')) # true # print(unicodedata.is_normalized('NFKD','ﷺ')) # false print(unicodedata.unidata_version) print(unicodedata.ucd_3_2_0) #print('const CATEGORY_e CHAR_CATEGORIES[] = {%s};' % ', '.join(unicodedata.category(chr(codepoint)) for codepoint in range(0x110000))) print(u'\ua62c') # 因为没有定义 print(unicodedata.name(u"\ua62c"))
async def charinfo(self, ctx, *, data: str):
    """Shows information about one or several characters.

    'data' can either be a character, a unicode escape sequence, a unicode
    character name or a string.

    If 'data' is a string only a summary of each character's info will be
    displayed.
    """
    # NOTE(review): lowercasing the whole input means uppercase characters
    # (e.g. 'A') are reported as their lowercase form — confirm intended.
    data = data.lower()
    if data.startswith('\\u'):
        # Let's interpret the unicode escape sequence
        hex_values = data.split('\\u')[1:]
        try:
            code_points = [int(val, 16) for val in hex_values]
        except ValueError:
            raise commands.BadArgument('Invalid unicode escape sequence.')
        else:
            data = ''.join(chr(cp) for cp in code_points)
    elif len(data) > 1:
        # Maybe we've been given the character's name ?
        try:
            data = unicodedata.lookup(data)
        except KeyError:
            pass
    # Normalise the input
    data = unicodedata.normalize('NFC', data)
    url_fmt = '<http://unicode-table.com/en/{:X}>'
    if len(data) == 1:
        # Detailed info on the character
        entries = [('Character', data),
                   ('Name', unicodedata.name(data, 'None')),
                   ('Code point', f'{ord(data):04x}')]
        decomposition = unicodedata.decomposition(data)
        if decomposition != '':
            entries.append(('Decomposition', decomposition))
        combining = unicodedata.combining(data)
        if combining:
            entries.append(('Combining class', combining))
        entries.append(('Category', unicodedata.category(data)))
        bidirectional = unicodedata.bidirectional(data)
        entries.append(('Bidirectional',
                        bidirectional if bidirectional != '' else 'None'))
        entries.append(
            ('Mirrored',
             'True' if unicodedata.mirrored(data) == 1 else 'False'))
        entries.append(
            ('East asian width', unicodedata.east_asian_width(data)))
        entries.append(('Url', url_fmt.format(ord(data))))
        # Create the message's content and send it
        content = utils.indented_entry_to_str(entries)
        await ctx.send(utils.format_block(content))
    else:
        # Minimal info for each character; zero-width spaces keep the
        # backticks rendering even for combining characters
        entries = [
            f'`\N{ZERO WIDTH SPACE}{c}\N{ZERO WIDTH SPACE}` | `\\u{ord(c):04x}` | `{unicodedata.name(c, "None")}` | {url_fmt.format(ord(c))}'
            for c in data
        ]
        content = '\n'.join(entries)
        await ctx.send(content)