def print_unicode_entry(n): u = get_unicode_using_unicode_escape(n) try: print unicodedata.digit(u), except: return False print '{:4d} 0x{:3x}'.format(n, n), u.encode('utf8'), unicodedata.category(u), unicodedata.name(u) return True
def digit(unichr, default_value=None):
    """Return the integer digit value of the Unicode character *unichr*.

    When *default_value* is supplied it is returned for characters without
    a digit value; otherwise unicodedata.digit raises ValueError.
    """
    unichr = unicode(unichr)  # force to a unicode object (Python 2)
    if default_value is None:
        return unicodedata.digit(unichr)
    return unicodedata.digit(unichr, default_value)
def digit(uni_char, default_value=None):
    """Returns the digit value assigned to the Unicode character uni_char as
    integer. If no such value is defined, default is returned, or, if not
    given, ValueError is raised."""
    # BUG FIX: was `unicod(uni_char)` — an undefined name that raised
    # NameError on every call; `unicode` (the py2 builtin) was intended.
    uni_char = unicode(uni_char)  # Force to Unicode.
    if default_value is not None:
        return unicodedata.digit(uni_char, default_value)
    else:
        return unicodedata.digit(uni_char)
def print_unicode_entry(n): u = get_unicode_using_unicode_escape(n) print '{:8d} {:8x}'.format(n, n), print u.encode('utf8'), unicodedata.category(u), try: print unicodedata.name(u), except: print 'unicodedata has no name defined', try: print unicodedata.digit(u) except: print 'unicodedata has no numeric value'
def is_number(s):
    """Return True if *s* parses as a float or is a single character with a
    Unicode digit value; False otherwise."""
    try:
        float(s)
        return True
    except ValueError:
        pass
    # unicodedata.digit maps a single digit character to its integer value;
    # TypeError covers non-character input, ValueError covers non-digits.
    try:
        unicodedata.digit(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def is_int(x):
    """Return True if *x* converts with long() or is a single character
    with a Unicode digit value (Python 2)."""
    try:
        long(x)
        return True
    except ValueError:
        pass
    try:
        unicodedata.digit(x)
        return True
    except (ValueError, TypeError):
        return False
def is_int(x):
    """Return True if *x* converts with int() or is a single character
    with a Unicode digit value."""
    try:
        int(x)
        return True
    except ValueError:
        pass
    try:
        unicodedata.digit(x)
        return True
    except (ValueError, TypeError):
        return False
def _parse_number(token): try: number = int(token) except ValueError: try: if len(token) > 1: number = "".join( [str(unicodedata.digit(ch)) for ch in token]) else: number = unicodedata.digit(token) except ValueError: return None return int(number)
def is_int(x):
    """Return True if *x* is an integral float (via is_integer), converts
    with long(), or is a single character with a Unicode digit value."""
    try:
        return x.is_integer()
    except AttributeError:
        pass
    try:
        long(x)  # Python 2 builtin
        return True
    except ValueError:
        pass
    try:
        unicodedata.digit(x)
        return True
    except (ValueError, TypeError):
        return False
def normalize(text, *, map_cmb=True, map_digits=True, map_whitespace=True, form="NFKD") -> str:  # pragma: no-cov
    """Unicode-normalize *text* with *form*, then optionally strip combining
    marks, map decimal digits to ASCII, and simplify whitespace."""
    text = unicodedata.normalize(form, text)
    if map_cmb:
        # Every combining code point maps to None (deleted by translate).
        strip_marks = {cp: None for cp in range(sys.maxunicode)
                       if unicodedata.combining(chr(cp))}
        text = text.translate(strip_marks)
    if map_digits:
        # Every decimal-digit code point (category Nd) maps to its ASCII digit.
        ascii_digits = {cp: ord("0") + unicodedata.digit(chr(cp))
                        for cp in range(sys.maxunicode)
                        if unicodedata.category(chr(cp)) == "Nd"}
        text = text.translate(ascii_digits)
    if map_whitespace:
        text = text.translate({ord("\t"): " ", ord("\f"): " ", ord("\r"): None})
    return text
def more():
    """Print the size of the Nd→ASCII digit table and translate three
    Arabic-Indic digits through it."""
    nd_to_ascii = {cp: ord('0') + unicodedata.digit(chr(cp))
                   for cp in range(sys.maxunicode)
                   if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(nd_to_ascii))
    sample = '\u0661\u0662\u0663'
    print(sample.translate(nd_to_ascii))
def try_to_read_signed_integer(iterable, val):
    """
    If the given string ends with +/-, attempt to return a signed int.
    Otherwise, return the string as-is.
    """
    if not val.endswith(('+', '-')):
        yield val
        return
    next_element = next(iterable, None)
    if next_element is None:
        # val was the last element; return it unchanged.
        yield val
        return
    # The next value in the sequence must have "isnum == True"; it is either
    # a unicode digit character or an ASCII number string.
    _, next_val, next_isuni = next_element
    if next_isuni:
        # Unicode digit: the sign is not applied; convert the character.
        yield val
        yield unicodedata.digit(next_val)
    elif val in ('-', '+'):
        # val is *only* the sign: pair it with the number.
        yield [val, next_val]
    else:
        # Strip the trailing sign from val and attach it to the number.
        yield val[:-1]
        yield [val[-1], next_val]
def translate_fun():
    """Demonstrate str.translate: whitespace remapping, combining-mark
    removal after NFD, digit mapping, and encode/decode cleanup."""
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)
    # ord() yields the code-point key translate expects.
    remap = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}
    cleaned = s.translate(remap)
    print(cleaned)
    # Collect all combining (accent) characters, mapped to None for deletion.
    cmb_chrs = dict.fromkeys(cp for cp in range(sys.maxunicode)
                             if unicodedata.combining(chr(cp)))
    decomposed = unicodedata.normalize('NFD', cleaned)
    print(decomposed.translate(cmb_chrs))
    # unicodedata.digit converts one digit character to its integer value;
    # category 'Nd' selects the decimal-digit characters.
    digitmap = {cp: ord('0') + unicodedata.digit(chr(cp))
                for cp in range(sys.maxunicode)
                if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(digitmap))
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))
    # I/O encode/decode round-trips after normalization of the original string.
    b = unicodedata.normalize('NFD', s)
    print(b.encode('ascii', 'ignore').decode('ascii'))
    print(b.encode('utf-8', 'ignore').decode('utf-8'))
def translate_str():
    """translate() demo: whitespace cleanup, accent removal, Unicode-digit
    to ASCII mapping, then an ASCII round-trip."""
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)
    remap = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}  # \r is deleted
    a = s.translate(remap)
    print(a)
    # Remove combining marks after NFD decomposition.
    marks = dict.fromkeys(cp for cp in range(sys.maxunicode)
                          if unicodedata.combining(chr(cp)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(marks))
    # Map every Unicode decimal digit (category Nd) to its ASCII equivalent.
    digitmap = {cp: ord('0') + unicodedata.digit(chr(cp))
                for cp in range(sys.maxunicode)
                if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(digitmap))
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))
    # Normalize, then drop non-ASCII via encode/decode.
    b = unicodedata.normalize('NFD', a)
    print(type(b))
    print(b.encode('ascii', 'ignore').decode('ascii'))
def getdetails(self, text):
    """Build a per-character dict of Unicode properties for *text*.

    Returns a dict keyed by character (plus a 'Characters' entry listing
    all characters); each per-character value maps property names to
    strings or numbers.
    """
    chardetails = {}
    for character in text:
        info = {}
        chardetails[character] = info
        info['Name'] = unicodedata.name(character)
        info['HTML Entity'] = str(ord(character))
        info['Code point'] = repr(character)
        try:
            info['Numeric Value'] = unicodedata.numeric(character)
        except (TypeError, ValueError):  # narrowed from bare except
            pass
        try:
            info['Decimal Value'] = unicodedata.decimal(character)
        except (TypeError, ValueError):
            pass
        # BUG FIX: was unicodedata.digit(mychar) — `mychar` is undefined, so a
        # NameError was silently swallowed by the bare except on every call.
        # NOTE(review): as in the original, this value is overwritten by the
        # str(character.isdigit()) assignment below — confirm intended key.
        try:
            info['Digit'] = unicodedata.digit(character)
        except (TypeError, ValueError):
            pass
        info['Alphabet'] = str(character.isalpha())
        info['Digit'] = str(character.isdigit())
        info['AlphaNumeric'] = str(character.isalnum())
        info['Canonical Decomposition'] = unicodedata.decomposition(character)
    chardetails['Characters'] = list(text)
    return chardetails
def translate_str():
    """Build translation tables and clean a sample string: whitespace remap,
    combining-mark removal, Unicode-digit mapping, ASCII round-trip."""
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)
    # Translation table: code point -> replacement (None deletes).
    remap = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}
    a = s.translate(remap)
    print(a)
    # Combining (accent) characters, all mapped to None for deletion.
    cmb_chrs = dict.fromkeys(cp for cp in range(sys.maxunicode)
                             if unicodedata.combining(chr(cp)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))
    # Unicode decimal digits (category 'Nd') mapped to ASCII digits;
    # unicodedata.digit gives the integer value of a digit character.
    digitmap = {cp: ord('0') + unicodedata.digit(chr(cp))
                for cp in range(sys.maxunicode)
                if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(digitmap))
    # Arabic-Indic digits as the demo input.
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))
    # Normalize first, then encode/decode to drop the combining marks —
    # only useful when an ASCII representation is the goal.
    b = unicodedata.normalize('NFD', a)
    print(b.encode('ascii', 'ignore').decode('ascii'))
def test_translate():
    """Python 2 demo of str.translate: whitespace remap, combining-character
    removal, and Unicode-digit-to-ASCII mapping.

    NOTE(review): as written this looks broken under Python 2: in a byte-string
    literal '\u0661' is NOT an escape (it stays a literal backslash-u), and
    py2 str.translate does not accept a dict table — presumably this was
    written for unicode literals / Python 3 semantics. Confirm before use.
    """
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print s
    # Translation table: code point -> replacement (None deletes).
    remap = {
        ord('\t'): ' ',
        ord('\f'): ' ',
        ord('\r'): None
    }
    a = s.translate(remap)
    print a
    # remove all combining characters
    import unicodedata
    import sys
    cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
    b = unicodedata.normalize('NFD', a)
    print b
    print b.translate(cmb_chrs)
    # maps all Unicode decimal digit to ASCII
    digitmap = {c: ord('0')+unicodedata.digit(chr(c)) for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd'}
    print len(digitmap)
    # Arabic digits
    x = '\u0661\u0662\u0663'
    print x.translate(digitmap)
def translate_str():
    """Clean a sample string via translate(): remap whitespace, strip
    combining marks, map Unicode digits to ASCII, ASCII round-trip."""
    sample = 'pýtĥöñ\fis\tawesome\r\n'
    print(sample)
    whitespace_map = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}  # \r deleted
    a = sample.translate(whitespace_map)
    print(a)
    # Delete combining (accent) characters after NFD decomposition.
    combining_map = dict.fromkeys(cp for cp in range(sys.maxunicode)
                                  if unicodedata.combining(chr(cp)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(combining_map))
    # Map every decimal digit character (category Nd) to its ASCII digit.
    digit_map = {cp: ord('0') + unicodedata.digit(chr(cp))
                 for cp in range(sys.maxunicode)
                 if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(digit_map))
    x = '\u0661\u0662\u0663'
    print(x.translate(digit_map))
    # Normalize, then use encode/decode to drop non-ASCII.
    b = unicodedata.normalize('NFD', a)
    print(type(b))
    print(b.encode('ascii', 'ignore').decode('ascii'))
def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters():
    """Every code point with a numeric/digit/decimal value must appear in the
    corresponding precomputed character sets, and the no-decimal subsets
    must be consistent."""
    set_numeric_hex = set(numeric_hex)
    set_numeric_chars = set(numeric_chars)
    set_digit_chars = set(digit_chars)
    set_decimal_chars = set(decimal_chars)
    # PERF FIX: this set was previously rebuilt on every loop iteration.
    ascii_digits = set('0123456789')
    for i in py23_range(0X110000):
        try:
            a = py23_unichr(i)
        except ValueError:
            # Narrow Unicode build: no more code points available.
            break
        if a in ascii_digits:
            continue
        if unicodedata.numeric(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_numeric_chars
        if unicodedata.digit(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_digit_chars
        if unicodedata.decimal(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_decimal_chars
    assert set_decimal_chars.isdisjoint(digits_no_decimals)
    assert set_digit_chars.issuperset(digits_no_decimals)
    assert set_decimal_chars.isdisjoint(numeric_no_decimals)
    assert set_numeric_chars.issuperset(numeric_no_decimals)
def CleanRubbishStr(needCleanStr='pýtĥöñ\fis\tawesome\r\n'):
    """Build whitespace, combining-mark and digit translation tables, merge
    them via tool.MergeTwoDicts, and print the cleaned, NFD-normalized
    string along with each table's size."""
    remap = {
        ord('\t'): ' ',
        ord('\f'): ' ',
        ord('\n'): None,  # deleted
        ord('\r'): None,  # deleted
    }
    print("remap Count is " + str(len(remap)))
    # All combining (accent) code points, mapped to None for deletion.
    remap2 = dict.fromkeys(cp for cp in range(sys.maxunicode)
                           if unicodedata.combining(chr(cp)))
    print("remap2 Count is " + str(len(remap2)))
    # Every decimal-digit code point mapped to its ASCII digit.
    digitmap = {cp: ord('0') + unicodedata.digit(chr(cp))
                for cp in range(sys.maxunicode)
                if unicodedata.category(chr(cp)) == 'Nd'}
    print("digitmap Count is " + str(len(digitmap)))
    allList = tool.MergeTwoDicts(remap, remap2)
    allList = tool.MergeTwoDicts(allList, digitmap)
    print("allList Count is " + str(len(allList)))
    a = unicodedata.normalize('NFD', needCleanStr)
    b = a.translate(allList)
    print(b)
def tr(c):
    """Translate character *c* via the surrounding `table` mapping, falling
    back to its Unicode digit value, else return it unchanged."""
    try:
        return table[c]
    except KeyError:
        # digit(c, None) returns None instead of raising for non-digits.
        value = unicodedata.digit(c, None)
        return c if value is None else str(value)
def more():
    """Show how many decimal-digit characters exist and translate a sample
    of Arabic-Indic digits to ASCII."""
    table = {}
    for cp in range(sys.maxunicode):
        ch = chr(cp)
        if unicodedata.category(ch) == 'Nd':
            table[cp] = ord('0') + unicodedata.digit(ch)
    print(len(table))
    x = '\u0661\u0662\u0663'
    print(x.translate(table))
def decode_digits(string):
    """Concatenate the digit value of each character in *string*; characters
    without a digit value are skipped (with a warning printed)."""
    new_digit = ''
    for character in string:
        try:
            new_digit += str(unicodedata.digit(character))
        except ValueError:
            # BUG FIX: was `pritn(...)` — a NameError raised on the first
            # non-digit character instead of printing the warning.
            print('have some problem')
    return new_digit
def print_unicode_entry(n):
    """Print code point *n*'s digit value, decimal/hex form, character,
    category and name; return True, or False when *n* has no digit value."""
    u = chr(n)
    try:
        print(unicodedata.digit(u), end=' ')
    except ValueError:  # narrowed from bare except: only "no digit value" should skip
        return False
    print('{:4d} 0x{:3x}'.format(n, n), u, unicodedata.category(u), unicodedata.name(u))
    return True
def convert_to_int(self, string):
    """Accumulate *string* into a base-10 int from the Unicode digit values
    of its characters, ignoring characters without a digit value.

    The self.convert(string) call's return value was never used (it was
    assigned to an unused local); the call is kept for any side effects.
    """
    self.convert(string)  # result intentionally discarded
    result = 0
    # Removed the dead `digit = -1` initializer: the loop assigns it first.
    for letter in string:
        digit = unicodedata.digit(letter, -1)
        if digit >= 0:
            result = result * 10 + digit
    return result
def test_digit_chars_contains_all_valid_unicode_digit_characters():
    """Every non-ASCII code point with a Unicode digit value must be present
    in the precomputed digit_chars collection."""
    # PERF FIX: this set was previously rebuilt on every loop iteration.
    ascii_digits = set('0123456789')
    for i in py23_range(0X10FFFF):
        try:
            a = py23_unichr(i)
        except ValueError:
            # Narrow Unicode build: no more code points available.
            break
        if a in ascii_digits:
            continue
        if unicodedata.digit(a, None) is not None:
            assert a in digit_chars
def example_3():
    """Map every Unicode decimal digit to ASCII and demonstrate the table on
    Arabic-Indic digits."""
    import sys
    import unicodedata
    table = {cp: ord('0') + unicodedata.digit(chr(cp))
             for cp in range(sys.maxunicode)
             if unicodedata.category(chr(cp)) == 'Nd'}
    print(len(table))
    sample = '\u0661\u0662\u0663'
    print(sample.translate(table))
def uninum2en(string):
    """Convert non-ascii unicode digits to equivalent English one (0-9).

    Example:
    >>> uninum2en('٤۴৪౪')
    '4444'
    """
    # \d in a py3 str pattern matches any Unicode decimal digit; the set()
    # dedupes so each distinct digit is replaced once.
    for ch in set(re.findall(r'\d', string)):
        string = string.replace(ch, str(unicodedata.digit(ch)))
    return string
def int_splitter_iter(iterable, signed):
    """Split the input into integers and strings."""
    for isnum, val, isuni in iterable:
        if isuni:
            # Single unicode digit character -> its integer value.
            yield unicodedata.digit(val)
            continue
        if isnum:
            yield int(val)
            continue
        if not signed:
            yield val
            continue
        # Signed mode: a trailing +/- may bind to the next number.
        for piece in try_to_read_signed_integer(iterable, val):
            yield int(''.join(piece)) if isinstance(piece, list) else piece
def int_splitter(x, signed, safe, sep):
    """Alternate (slow) method to split a string into numbers.

    Args:
        x: input string (empty input returns []).
        signed: when true, a +/- immediately before an ASCII digit starts
            a new signed number.
        safe: when true, run the result through sep_inserter so no two
            numbers are adjacent.
        sep: separator element used by sep_inserter / prepended when the
            result starts with a number.
    Returns:
        List of interleaved strings and ints.

    NOTE: uses the Python 2 `long` builtin and the sibling sep_inserter().
    """
    if not x:
        return []
    all_digits = set('0123456789')
    # full_list is the output; strings/nums accumulate the current run.
    full_list, strings, nums = [], [], []
    input_len = len(x)
    for i, char in enumerate(x):
        # If this character is a sign and the next is a number,
        # start a new number.
        if (i + 1 < input_len and signed and (char in '-+') and (x[i + 1] in all_digits)):
            # Reset any current string or number.
            if strings:
                full_list.append(''.join(strings))
            if nums:
                full_list.append(int(''.join(nums)))
            strings = []
            nums = [char]
        # If this is a number, add to the number list.
        elif char in all_digits:
            nums.append(char)
            # Reset any string.
            if strings:
                full_list.append(''.join(strings))
            strings = []
        # If this is a unicode digit, append directly to the full list.
        elif char.isdigit():
            # Reset any string or number.
            if strings:
                full_list.append(''.join(strings))
            if nums:
                full_list.append(int(''.join(nums)))
            strings = []
            nums = []
            full_list.append(unicodedata.digit(char))
        # Otherwise add to the string.
        else:
            strings.append(char)
            # Reset any number.
            if nums:
                full_list.append(int(''.join(nums)))
            nums = []
    # Flush whichever run is still open (a number run wins over a string run;
    # both cannot be non-empty at once given the resets above).
    if nums:
        full_list.append(int(''.join(nums)))
    elif strings:
        full_list.append(''.join(strings))
    if safe:
        full_list = sep_inserter(full_list, (int, long), sep)
    # Lists that start with a number get a leading separator.
    if type(full_list[0]) in (int, long):
        return [sep] + full_list
    else:
        return full_list
def int_splitter(x, signed, safe, sep):
    """Alternate (slow) method to split a string into numbers.

    Scans *x* character by character, accumulating runs of ASCII digits
    (emitted as ints) and runs of other characters (emitted as strings);
    single non-ASCII unicode digits are emitted as their digit value.

    NOTE: uses the Python 2 `long` builtin and the sibling sep_inserter().
    """
    if not x:
        return []
    all_digits = set('0123456789')
    # full_list is the output; strings/nums hold the in-progress runs.
    full_list, strings, nums = [], [], []
    input_len = len(x)
    for i, char in enumerate(x):
        # If this character is a sign and the next is a number,
        # start a new number.
        if (i+1 < input_len and signed and (char in '-+') and (x[i+1] in all_digits)):
            # Reset any current string or number.
            if strings:
                full_list.append(''.join(strings))
            if nums:
                full_list.append(int(''.join(nums)))
            strings = []
            nums = [char]
        # If this is a number, add to the number list.
        elif char in all_digits:
            nums.append(char)
            # Reset any string.
            if strings:
                full_list.append(''.join(strings))
            strings = []
        # If this is a unicode digit, append directly to the full list.
        elif char.isdigit():
            # Reset any string or number.
            if strings:
                full_list.append(''.join(strings))
            if nums:
                full_list.append(int(''.join(nums)))
            strings = []
            nums = []
            full_list.append(unicodedata.digit(char))
        # Otherwise add to the string.
        else:
            strings.append(char)
            # Reset any number.
            if nums:
                full_list.append(int(''.join(nums)))
            nums = []
    # Flush the still-open run (number wins; both cannot be non-empty).
    if nums:
        full_list.append(int(''.join(nums)))
    elif strings:
        full_list.append(''.join(strings))
    if safe:
        full_list = sep_inserter(full_list, (int, long), sep)
    # Lists that start with a number get a leading separator.
    if type(full_list[0]) in (int, long):
        return [sep] + full_list
    else:
        return full_list
def print_unicode_entry(n):
    """Print code point *n*'s decimal/hex form, character, category, name
    and digit value, with fallbacks when name/digit are undefined."""
    u = chr(n)
    print('{:8d} {:8x}'.format(n, n), end=' ')
    print(u, unicodedata.category(u), end=' ')
    try:
        print(unicodedata.name(u), end=' ')
    except ValueError:  # narrowed from bare except: name() raises ValueError when unnamed
        print('unicodedata has no name defined', end=' ')
    try:
        print(unicodedata.digit(u))
    except ValueError:  # narrowed from bare except
        print('unicodedata has no numeric value')
def test_digit_chars_contains_all_valid_unicode_digit_characters():
    """Every non-ASCII code point with a digit value must appear in the
    precomputed numeric_hex / numeric_chars collections."""
    set_numeric_hex = set(numeric_hex)
    set_numeric_chars = set(numeric_chars)
    # PERF FIX: this set was previously rebuilt on every loop iteration.
    ascii_digits = set('0123456789')
    for i in py23_range(0X110000):
        try:
            a = py23_unichr(i)
        except ValueError:
            # Narrow Unicode build: no more code points available.
            break
        if a in ascii_digits:
            continue
        if unicodedata.digit(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_numeric_chars
def overview(tree_item):
    """ Returns an overview of the character """
    ch = tree_item.obj
    # Collect every Unicode property in TEMPLATE's positional order;
    # missing decimal/digit/numeric values fall back to ''.
    fields = (
        unicodedata.name(ch, '<NO NAME AVAILABLE>'),
        ch,
        unicodedata.decimal(ch, ''),
        unicodedata.digit(ch, ''),
        unicodedata.numeric(ch, ''),
        unicodedata.category(ch),
        unicodedata.bidirectional(ch),
        unicodedata.combining(ch),
        unicodedata.east_asian_width(ch),
        unicodedata.mirrored(ch),
        unicodedata.decomposition(ch),
    )
    return TEMPLATE.format(*fields)
def _explain_char(self, ch, further): try: name = unicodedata.name(ch) except ValueError: name = f'[U+{hex(ord(ch))[2:]}]' if not further: return name + f'({ch})' infos = { 'category': unicodedata.category(ch), 'direction': unicodedata.bidirectional(ch), 'east asian width': unicodedata.east_asian_width(ch) } decomposition = unicodedata.decomposition(ch) if decomposition: infos['decomposition'] = decomposition try: infos['digit value'] = unicodedata.digit(ch) except ValueError: pass try: infos['decimal value'] = unicodedata.decimal(ch) except ValueError: pass try: infos['numeric value'] = unicodedata.numeric(ch) except ValueError: pass comb = unicodedata.combining(ch) if comb != 0: infos['combining class'] = str(comb) mirrored = unicodedata.mirrored(ch) if mirrored: infos['mirrored'] = 'yes' if hasattr(unicodedata, 'is_normalized'): forms = [] for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): if unicodedata.is_normalized(form, ch): forms.append(form) if forms: infos['normalized'] = f'yes: {", ".join(forms)}' else: infos['normalized'] = 'no' else: infos['normalized'] = 'unavailable' info = ', '.join([f'{k}: {v}' for k, v in infos.items()]) return f'{name}: {ch!r} ({info})'
def latex_char(char: Character, prefix: str) -> str:
    """Map *char* to the LaTeX command suffix matching *prefix*.

    NOTE(review): for digit-category characters (category 'Xd') this returns
    the int from ud.digit(), not a str, despite the -> str annotation —
    confirm whether callers rely on the int or str() was intended.
    """
    if char.category[1] == 'd':
        return ud.digit(char.character)
    ipc = char.ipython_cmd
    # TODO: special-case upgreek in bf
    # Strip the prefix from the IPython command to get the bare name.
    suffix = ipc[len(prefix):]
    if len(suffix) > 1:
        # Upright greek gets an 'up' prefix; italic variants get 'var'.
        if prefix in ('\\', r'\bf') and suffix not in upgreek_blacklist:
            suffix = 'up' + suffix
        elif (prefix in (r'\it', r'\bi') and suffix in upgreek_blacklist and 'var' not in suffix):
            suffix = 'var' + suffix
        suffix = '\\' + suffix
    return suffix
def convert_string(string):
    """Return *string* with every character that has a Unicode digit value
    replaced by the corresponding ASCII digit; other characters unchanged."""
    # BUG FIX: the original concatenated str(...).encode('utf-8') — a bytes
    # object — onto a str, which raises TypeError on Python 3; it also built
    # the result with quadratic += concatenation.
    pieces = []
    for ch in string:
        value = unicodedata.digit(ch, None)
        pieces.append(ch if value is None else str(value))
    return ''.join(pieces)
def char2info(ch):
    """Collect Unicode properties of character *ch* into a dict.

    The return value is dict(locals()), so the LOCAL VARIABLE NAMES are the
    dict keys (including the input 'ch') — renaming any local here changes
    the returned mapping. Missing name/decimal/digit/numeric values come
    back as None via the lookup defaults.
    """
    name = U.name(ch, None)
    decimal = U.decimal(ch, None)
    digit = U.digit(ch, None)
    numeric = U.numeric(ch, None)
    category = U.category(ch)
    bidirectional = U.bidirectional(ch)
    combining = U.combining(ch)
    east_asian_width = U.east_asian_width(ch)
    mirrored = U.mirrored(ch)
    decomposition = U.decomposition(ch)
    # NOTE: `unicode` shadows the Python 2 builtin; kept because the name
    # becomes the 'unicode' key of the returned dict.
    unicode = ord(ch)
    unicode_hex = hex(unicode)
    return dict(locals())
def checkEntry(*event):
    """Read the year entry widget; show the Edo period for Western years,
    otherwise split the entry into a period name and digit string."""
    entry = yearentry.get()
    if isWesternYear(entry):
        westernyear = int(entry)
        period, year = edoFromWestern(westernyear)
        showPeriod(period, year)
    else:
        # year starts at "0" so int(year) below is safe even with no digits.
        period, year = "", "0"
        for c in entry:
            # FIX(review): the original membership string was
            # "01234567890123456789" — ASCII digits twice, apparently a
            # garbled copy of ASCII + full-width digits. Accept any
            # character with a Unicode digit value instead (backward
            # compatible for ASCII digits, and consistent with the
            # unicodedata.digit() conversion below).
            if unicodedata.digit(c, None) is not None:
                year += str(unicodedata.digit(c))
            else:
                period += c
        showPeriod(period, int(year))
def test_compare_functions(self):
    """Cross-check CPython's unicodedata against the bundled
    unicodedb_5_2_0 tables for every BMP code point (Python 2: unichr)."""
    def getX(fun, code):
        # Mirror unicodedata's default-value behaviour: the db raises
        # KeyError where unicodedata would return the -1 default.
        try:
            return getattr(unicodedb_5_2_0, fun)(code)
        except KeyError:
            return -1
    for code in range(0x10000):
        char = unichr(code)
        assert unicodedata.digit(char, -1) == getX('digit', code)
        assert unicodedata.numeric(char, -1) == getX('numeric', code)
        assert unicodedata.decimal(char, -1) == getX('decimal', code)
        assert unicodedata.category(char) == unicodedb_5_2_0.category(code)
        assert unicodedata.bidirectional(char) == unicodedb_5_2_0.bidirectional(code)
        assert unicodedata.decomposition(char) == unicodedb_5_2_0.decomposition(code)
        assert unicodedata.mirrored(char) == unicodedb_5_2_0.mirrored(code)
        assert unicodedata.combining(char) == unicodedb_5_2_0.combining(code)
def startPattern(self, input):
    """Do the normalization that is possible before splitting the string and
    that is needed to split the string.
    Args:
        input: date subfield
    Returns:
        (norminput, pattern)
        norminput is the normalized date subfield.
        pattern is the preliminary date pattern. The pattern will continue
        to be refined after the date is split.
    Sets:
        self.hijri: if the date subfield is hijri
        self.datetype: if the date was flourished
    """
    # NFKD-normalize and lowercase everything after the first character
    # (Python 2 unicode()).
    pattern = unicodedata.normalize('NFKD', unicode(input[1:]).lower())
    # Replace each digit character with its digit value; characters without
    # a digit value pass through unchanged via the default argument.
    pattern = ''.join([unicode(unicodedata.digit(d, d)) for d in pattern])
    pattern = re.sub(',', ' ', pattern)
    # convert various dashes to dash
    pattern = re.sub(u'\u2212|\u2013|\u2014|\u05be|\u2010|\u2015|\u30fb', '-', pattern)
    pattern = pattern.replace('bzw.', '-')  ## from DNB records
    pattern = re.sub(u'\u061f', '?', pattern)  # arabic question mark
    # Strip filler/noise tokens and bracketing punctuation.
    pattern = re.sub('----|-t\.|\[.*h\]| reg\..*$| age .*$', '', pattern)
    pattern = re.sub('\(|\)|;|<|>|\]|\[', '', pattern)
    ## moved these to overrides
    ##pattern = re.sub('av\. ?j\.?-\.?c', 'av jc', pattern)
    ##pattern = re.sub('-talet', ' talet', pattern)
    pattern = re.sub('\[|\]', '', pattern)
    pattern = pattern.replace('xxxx', '').replace('gegenwart', '')
    pattern = re.sub('\.{4,10}', '', pattern)
    pattern = pattern.strip(' ')
    pattern = re.sub(' +', ' ', pattern)
    # Pick the flourished regex based on the fIsFlourished flag override.
    flourishedpattern = isFlourished if self.flags.find('fIsFlourished') == -1 else altisFlourished
    if flourishedpattern.search(pattern):
        pattern = flourishedpattern.sub('', pattern)
        self.datetype = 'flourished'
    norminput = pattern
    # Mask month names before detecting hijri markers.
    pattern = monthMasker7.sub(r'\1month\4', pattern)
    self.hijri = isHijri.search(pattern)
    if self.hijri:
        pattern = isHijri.sub('', pattern).strip()
        norminput = isHijri.sub('', norminput).strip()
    # Collapse every remaining digit into the placeholder 'N'.
    pattern = re.sub('\d', 'N', pattern)
    return norminput, pattern
def main():
    """Interpret argv entries as hex byte values, decode them as UTF-8, and
    dump the resulting character's Unicode properties; failures print an
    ERROR line instead of raising."""
    try:
        raw = bytes(int(arg, 16) for arg in sys.argv[1:])
        ch = raw.decode('utf8')
        print('gryph: %s' % ch)
        print('codepoint: U+%x' % ord(ch))
        print('name: %s' % unicodedata.name(ch, 'Unknown'))
        print('decimal: %s' % unicodedata.decimal(ch, 'Unknown'))
        print('digit: %s' % unicodedata.digit(ch, 'Unknown'))
        print('numeric: %s' % unicodedata.numeric(ch, 'Unknown'))
        print('category: %s' % unicodedata.category(ch))
        print('bidirectional: %s' % unicodedata.bidirectional(ch))
        print('combining: %s' % unicodedata.combining(ch))
        print('east_asian_width: %s' % unicodedata.east_asian_width(ch))
        print('mirrored: %s' % unicodedata.mirrored(ch))
        print('decomposition: %s' % unicodedata.decomposition(ch))
    except Exception as ex:
        # Top-level CLI boundary: report any failure as a single line.
        print('ERROR: %s' % ex)
def test_compare_functions(self): import unicodedata # CPython implementation def getX(fun, code): if fun == 'numeric' and code in self.diff_numeric: return -1 try: return getattr(unicodedb_4_1_0, fun)(code) except KeyError: return -1 for code in range(0x10000): char = unichr(code) assert unicodedata.digit(char, -1) == getX('digit', code) assert unicodedata.numeric(char, -1) == getX('numeric', code) assert unicodedata.decimal(char, -1) == getX('decimal', code) assert unicodedata.category(char) == unicodedb_4_1_0.category(code) assert unicodedata.bidirectional(char) == unicodedb_4_1_0.bidirectional(code) assert unicodedata.decomposition(char) == unicodedb_4_1_0.decomposition(code) assert unicodedata.mirrored(char) == unicodedb_4_1_0.mirrored(code) assert unicodedata.combining(char) == unicodedb_4_1_0.combining(code)
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357

    Check the Unicode properties of U+4E2D (中); the `is_cli` branch keeps
    IronPython's range-label name() result distinct from CPython's
    per-codepoint name.
    """
    import unicodedata
    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')
    # decimal/digit/numeric raise ValueError without a default and return
    # the default when one is given.
    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
def test_digit_chars_contains_only_valid_unicode_digit_characters():
    """Every entry of digit_chars must have a Unicode digit value."""
    for ch in digit_chars:
        assert unicodedata.digit(ch, None) is not None
} a = s.translate(remap) print(a) import unicodedata import sys cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c))) b = unicodedata.normalize('NFD', a) print(b) result = b.translate(cmb_chrs) print(result) digitmap = { c : ord('0') + unicodedata.digit(chr(c)) for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd' } print(len(digitmap)) # Arabic digits x = '\u0661\u0662\u0663' x.translate(digitmap) print(a) b = unicodedata.normalize('NFD', a) b.encode('ascii', 'ignore').decode('ascii') """
0X1D7FB, 0X1D7FC, 0X1D7FD, 0X1D7FE, 0X1D7FF, 0X1E8C7, 0X1E8C8, 0X1E8C9, 0X1E8CA, 0X1E8CB, 0X1E8CC, 0X1E8CD, 0X1E8CE, 0X1E8CF, 0X1F100, 0X1F101, 0X1F102, 0X1F103, 0X1F104, 0X1F105, 0X1F106, 0X1F107, 0X1F108, 0X1F109, 0X1F10A, 0X1F10B, 0X1F10C, 0X20001, 0X20064, 0X200E2, 0X20121, 0X2092A, 0X20983, 0X2098C, 0X2099C, 0X20AEA, 0X20AFD, 0X20B19, 0X22390, 0X22998, 0X23B1B, 0X2626D, 0X2F890 ] # Convert each hex into the literal Unicode character. # Stop if a ValueError is raised in case of a narrow Unicode build. # The extra check with unicodedata is in case this Python version # does not support some characters. numeric_chars = [] for a in numeric_hex: try: l = py23_unichr(a) except ValueError: # pragma: no cover break if unicodedata.numeric(l, None) is None: continue numeric_chars.append(l) # The digit characters are a subset of the numerals. digit_chars = [a for a in numeric_chars if unicodedata.digit(a, None) is not None] # Create a single string with the above data. digits = ''.join(digit_chars) numeric = ''.join(numeric_chars)
print "Testing Unicode Database..."
print "Methods:",
print test_methods()
# In case unicodedata is not available, this will raise an ImportError,
# but still test the above cases...
import unicodedata
print "Functions:",
print test_unicodedata()
# Some additional checks of the API:
# digit() / decimal() / numeric() raise without a default and return the
# default otherwise; the four sample characters exercise each combination.
print "API:",
verify(unicodedata.digit(u"A", None) is None)
verify(unicodedata.digit(u"9") == 9)
verify(unicodedata.digit(u"\u215b", None) is None)  # 1/8 fraction: numeric only
verify(unicodedata.digit(u"\u2468") == 9)           # circled nine: digit, not decimal
verify(unicodedata.numeric(u"A", None) is None)
verify(unicodedata.numeric(u"9") == 9)
verify(unicodedata.numeric(u"\u215b") == 0.125)
verify(unicodedata.numeric(u"\u2468") == 9.0)
verify(unicodedata.decimal(u"A", None) is None)
verify(unicodedata.decimal(u"9") == 9)
verify(unicodedata.decimal(u"\u215b", None) is None)
verify(unicodedata.decimal(u"\u2468", None) is None)
verify(unicodedata.category(u"\uFFFE") == "Cn")     # noncharacter -> unassigned
审查清理字符串:
str.upper()、str.lower()、str.replace()、re.sub()、unicodedata.normalize()等实现不同功能的基础处理
str.translate()通过构造替换字典进行处理
'''

# NOTE(review): the lines above are the tail of a module docstring whose
# opening quotes are earlier in the file; left unchanged.

if __name__ == '__main__':
    # Demo: case mapping, replacement, translate-based cleanup.
    s = 'pýtĥöñ\fis\taWesome\r\n'
    print(s)
    print(s.upper())
    print(s.lower())
    print(s.replace('W', 'w'))
    # Translation table: code point -> replacement (None deletes).
    remap = {
        ord('\t') : ' ',
        ord('\f') : ' ',
        ord('\r') : None
    }
    a = s.translate(remap)
    print(a)
    import unicodedata, sys
    # Strip combining marks after NFD decomposition.
    cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))
    x = '\u0661\u0662\u0663'
    # Map every decimal-digit character (category Nd) to its ASCII digit.
    digitmap = {c : ord('0') + unicodedata.digit(chr(c)) for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd'}
    print(len(digitmap))
    print(x.translate(digitmap))
    print(b.encode('ascii', 'ignore').decode('ascii'))
def quote_norm(line):
    """Normalize one line of (MT-style) text.

    Strips control characters, undoes PTB bracket escapes, decodes HTML/XML
    entities, repairs common Windows-1252/UTF-8 mojibake, regularizes quotes,
    dashes, ligatures and foreign numerals, and collapses whitespace.

    NOTE(review): relies on helpers html_hex_entity / html_entity defined
    elsewhere in this module for numeric-entity decoding.
    The ORDER of the replacements below is significant — do not reorder.
    """
    line = ' %s ' % line
    # Delete control characters:
    line = re.sub(r'[\x00-\x1F]+', ' ', line)
    # PTB --> normal
    line = line.replace(r'-LRB-', '(')
    line = line.replace(r'-RRB-', ')')
    line = line.replace(r'-LSB-', '[')
    line = line.replace(r'-RSB-', ']')
    line = line.replace(r'-LCB-', '{')
    line = line.replace(r'-RCB-', '}')
    line = line.replace(r' gon na ', ' gonna ')
    # Regularize named HTML/XML escapes (tolerating stray spaces inside):
    line = re.sub(r'&\s*lt\s*;', '<', line, flags=re.IGNORECASE)  # HTML opening angle bracket
    line = re.sub(r'&\s*gt\s*;', '>', line, flags=re.IGNORECASE)  # HTML closing angle bracket
    line = re.sub(r'&\s*squot\s*;', '\'', line, flags=re.IGNORECASE)  # HTML single quote
    line = re.sub(r'&\s*quot\s*;', '"', line, flags=re.IGNORECASE)  # HTML double quote
    line = re.sub(r'&\s*nbsp\s*;', ' ', line, flags=re.IGNORECASE)  # HTML non-breaking space
    line = re.sub(r'&\s*apos\s*;', '\'', line, flags=re.IGNORECASE)  # HTML apostrophe
    line = re.sub(r'&\s*amp\s*;', '&', line, flags=re.IGNORECASE)  # HTML ampersand (last)
    # Regularize known HTML numeric codes:
    line = re.sub(r'&\s*#\s*160\s*;', ' ', line)
    line = re.sub(r'&\s*#45\s*;\s*&\s*#45\s*;', '--', line)
    line = re.sub(r'&\s*#45\s*;', '--', line)
    # Convert arbitrary hex or decimal HTML entities to actual characters:
    line = re.sub(r'&\#x([0-9A-Fa-f]+);', html_hex_entity, line)
    line = re.sub(r'&\#([0-9]+);', html_entity, line)
    # Regularize spaces:
    zero_width_spaces = [u'\u00ad',  # soft hyphen
                         u'\u200C']  # zero-width non-joiner
    line = re.sub('|'.join(zero_width_spaces), '', line)
    spaces = [u'\u00a0',  # non-breaking space
              u'\u2009',  # thin space
              u'\u2028',  # "line separator"
              u'\u2029',  # "paragraph separator"
              u'\u202a',  # "left-to-right embedding"
              u'\u202b',  # "right-to-left embedding"
              u'\u202c',  # "pop directional formatting"
              u'\u202d',  # "left-to-right override"
              u'\u202e',  # "right-to-left override"
              u'\u0085',  # "next line"
              u'\ufffd',  # "replacement character"
              u'\ufeff',  # byte-order mark
              u'\ufdd3']  # "unicode non-character"
    line = re.sub('|'.join(spaces), ' ', line)
    # Convert other Windows 1252 characters to UTF-8
    line = line.replace(u'\u0080', u'\u20ac')  # euro sign
    line = line.replace(u'\u0095', u'\u2022')  # bullet
    line = line.replace(u'\u0099', u'\u2122')  # trademark sign
    # Currency and measure conversions:
    line = re.sub(r' (\d\d): (\d\d)', r' \1:\2', line)
    line = line.replace(u'\u20a0', ' EUR ')
    line = line.replace(u'\u20ac', ' EUR ')
    line = line.replace(u'\u00a3', ' GBP ')
    line = re.sub(r'(\W)([A-Z]+\$?)(\d*\.\d+|\d+)', r'\1\2 \3', line)  # AU$12.34
    line = re.sub(r'(\W)(euro?)(\d*\.\d+|\d+)', r'\1EUR \3', line, flags=re.IGNORECASE)  # EUR12.34
    # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8:
    # NOTE(review): this pattern contains U+FFFD — the source itself looks
    # mojibake-garbled; confirm what the original byte sequence was.
    line = line.replace(u'�c', '--')  # long dash
    line = line.replace(u'\u00e2\u20acoe', '"')  # opening double quote
    line = line.replace(u'\u00e2\u20ac\u009c', '"')  # opening double quote
    line = line.replace(u'\u00e2\u20ac\u009d', '"')  # closing double quote
    line = line.replace(u'\u00e2\u20ac\u2122', '\'')  # apostrophe
    line = line.replace(u'\u00e2\u20ac\u201c', ' -- ')  # en dash?
    line = line.replace(u'\u00e2\u20ac\u201d', ' -- ')  # em dash?
    # NOTE(review): r'\'' is the TWO characters backslash+quote, so the next
    # four replacements insert a literal backslash — confirm intended.
    line = line.replace(u'\u00e2\u0080\u0098', r'\'')  # single quote?
    line = line.replace(u'\u00e2\u0080\u0099', r'\'')  # single quote?
    line = line.replace(u'\u00e2\u0080\u009c', r'"')  # double quote?
    line = line.replace(u'\u00e2\u0080\u009d', r'"')  # double quote?
    line = line.replace(u'\u00c3\u009f', u'\u00df')  # esset
    line = line.replace(u'\u00c3\u0178', u'\u00df')  # esset
    line = line.replace(u'\u00c3\u00a4', u'\u00e4')  # a umlaut
    line = line.replace(u'\u00c3\u00b6', u'\u00f6')  # o umlaut
    line = line.replace(u'\u00c3\u00bc', u'\u00fc')  # u umlaut
    line = line.replace(u'\u00c3\u0084', u'\u00c4')  # A umlaut: create no C4s after this
    line = line.replace(u'\u00c3\u201e', u'\u00c4')  # A umlaut: create no C4s after this
    line = line.replace(u'\u00c3\u0096', u'\u00d6')  # O umlaut
    line = line.replace(u'\u00c3\u2013', u'\u00d6')  # O umlaut
    # NOTE(review): pattern below duplicates the lowercase-ü line above, so it
    # can never fire; capital Ü mojibake is likely u'\u00c3\u0153' — confirm.
    line = line.replace(u'\u00c3\u00bc', u'\u00dc')  # U umlaut
    line = line.replace(u'\u0080', u'\u20ac')  # euro sign
    line = line.replace(u'\u0095', u'\u2022')  # bullet
    line = line.replace(u'\u0099', u'\u2122')  # trademark sign
    # Regularize quotes:
    line = line.replace(u'ˇ', '\'')  # caron
    line = line.replace(u'´', '\'')  # acute accent
    line = line.replace(u'`', '\'')  # grave accent
    line = line.replace(u'ˉ', '\'')  # modified letter macron
    line = line.replace(u' ,,', '"')  # ghetto low-99 quote
    line = line.replace(u'``', '"')  # latex-style left quote
    line = line.replace(u'\'\'', '"')  # latex-style right quote
    line = line.replace(u'\u300c', '"')  # left corner bracket
    line = line.replace(u'\u300d', '"')  # right corner bracket
    line = line.replace(u'\u3003', '"')  # ditto mark
    line = line.replace(u'\u00a8', '"')  # diaeresis
    line = line.replace(u'\u0092', '\'')  # curly apostrophe
    line = line.replace(u'\u2019', '\'')  # curly apostrophe
    line = line.replace(u'\uf03d', '\'')  # curly apostrophe
    line = line.replace(u'\u00b4', '\'')  # curly apostrophe
    line = line.replace(u'\u2018', '\'')  # curly single open quote
    line = line.replace(u'\u201a', '\'')  # low-9 quote
    line = line.replace(u'\u0093', '"')  # curly left quote
    line = line.replace(u'\u201c', '"')  # curly left quote
    line = line.replace(u'\u0094', '"')  # curly right quote
    line = line.replace(u'\u201d', '"')  # curly right quote
    line = line.replace(u'\u2033', '"')  # curly right quote (U+2033 is DOUBLE PRIME)
    line = line.replace(u'\u201e', '"')  # low-99 quote
    line = line.replace(u'\u0084', '"')  # low-99 quote (bad enc)
    line = line.replace(u'\u201f', '"')  # high-rev-99 quote
    line = line.replace(u'\u00ab', '"')  # opening guillemet
    line = line.replace(u'\u00bb', '"')  # closing guillemet
    line = line.replace(u'\u0301', '\'')  # combining acute accent
    line = line.replace(u'\u203a', '"')  # angle quotation mark
    line = line.replace(u'\u2039', '"')  # angle quotation mark
    # Space inverted punctuation:
    line = line.replace(u'¡', u' ¡ ')
    line = line.replace(u'¿', u' ¿ ')
    # Russian abbreviations:
    line = line.replace(u' п. п. ', u' п.п. ')
    line = line.replace(u' ст. л. ', u' ст.л. ')
    line = line.replace(u' т. е. ', u' т.е. ')
    line = line.replace(u' т. к. ', u' т.к. ')
    line = line.replace(u' т. ч. ', u' т.ч. ')
    line = line.replace(u' т. д. ', u' т.д. ')
    line = line.replace(u' т. п. ', u' т.п. ')
    line = line.replace(u' и. о. ', u' и.о. ')
    line = line.replace(u' с. г. ', u' с.г. ')
    line = line.replace(u' г. р. ', u' г.р. ')
    line = line.replace(u' т. н. ', u' т.н. ')
    # NOTE(review): duplicate of the ' т. ч. ' replacement above — a no-op.
    line = line.replace(u' т. ч. ', u' т.ч. ')
    line = line.replace(u' н. э. ', u' н.э. ')
    # Convert foreign numerals into Arabic numerals
    line = ''.join([str(unicodedata.digit(c)) if c.isdigit() else c for c in line])
    # Random punctuation:
    line = line.replace(u'!', '!')
    line = line.replace(u'-', '-')
    line = line.replace(u'~', '~')
    line = line.replace(u'、', ',')
    #line = line.replace(u'。', '.')
    line = line.replace(u'\u0085', '...')
    line = line.replace(u'…', '...')
    line = line.replace(u'―', '--')
    line = line.replace(u'–', '--')
    line = line.replace(u'─', '--')
    line = line.replace(u'—', '--')
    line = line.replace(u'\u0097', '--')
    line = line.replace(u'•', ' * ')
    # NOTE(review): u'\*' is backslash+asterisk (str.replace is not regex) —
    # confirm a literal '*' was not intended here.
    line = line.replace(u'\*', ' * ')
    line = line.replace(u'،', ',')
    line = line.replace(u'؟', '?')
    line = line.replace(u'ـ', ' ')
    line = line.replace(u'à ̄', 'i')
    line = line.replace(u'’', '\'')
    line = line.replace(u'â€"', '"')
    line = line.replace(u'؛', ';')
    # Regularize ligatures:
    line = line.replace(u'\u009c', 'oe')  # "oe" ligature
    line = line.replace(u'\u0153', 'oe')  # "oe" ligature
    line = line.replace(u'\u008c', 'Oe')  # "OE" ligature
    line = line.replace(u'\u0152', 'Oe')  # "OE" ligature
    line = line.replace(u'\ufb00', 'ff')  # "ff" ligature
    line = line.replace(u'\ufb01', 'fi')  # "fi" ligature
    line = line.replace(u'\ufb02', 'fl')  # "fl" ligature
    line = line.replace(u'\ufb03', 'ffi')  # "ffi" ligature
    line = line.replace(u'\ufb04', 'ffl')  # "ffl" ligature
    line = line.replace(u'\u0132', 'Ij')  # "Ij" ligature
    line = line.replace(u'\u0133', 'ij')  # "ij" ligature
    line = line.replace(u'\ufb06', 'st')  # "st" ligature
    line = line.replace(u'\u00c6', 'Ae')  # "Ae" ligature
    line = line.replace(u'\u00e6', 'ae')  # "ae" ligature
    line = line.replace(u'\ufb05', 'st')  # "st" ligature
    line = line.replace(u'β', u'ß')  # WMT 2010 error
    # Strip extra spaces:
    line = re.sub(r'\s+', ' ', line)
    line = line.strip()
    return line
""" Test script for the unicodedata module. Written by Marc-Andre Lemburg ([email protected]). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" from test_support import verbose import sys # Test Unicode database APIs import unicodedata print 'Testing unicodedata module...', assert unicodedata.digit(u'A',None) is None assert unicodedata.digit(u'9') == 9 assert unicodedata.digit(u'\u215b',None) is None assert unicodedata.digit(u'\u2468') == 9 assert unicodedata.numeric(u'A',None) is None assert unicodedata.numeric(u'9') == 9 assert unicodedata.numeric(u'\u215b') == 0.125 assert unicodedata.numeric(u'\u2468') == 9.0 assert unicodedata.decimal(u'A',None) is None assert unicodedata.decimal(u'9') == 9 assert unicodedata.decimal(u'\u215b',None) is None assert unicodedata.decimal(u'\u2468',None) is None assert unicodedata.category(u'\uFFFE') == 'Cn'
def digit(self, default=None):
    """Return the decimal digit value of this character.

    Returns *default* when the character has no digit value.

    NOTE(review): because *default* is always forwarded, calling this with
    no argument yields None instead of the ValueError that bare
    unicodedata.digit(ch) would raise — confirm that is the intended API.
    """
    value = ud.digit(self.char, default)
    return value
### Run tests print 'Testing Unicode Database...' print 'Methods:', print test_methods() # In case unicodedata is not available, this will raise an ImportError, # but still test the above cases... import unicodedata print 'Functions:', print test_unicodedata() # Some additional checks of the API: print 'API:', verify(unicodedata.digit(u'A',None) is None) verify(unicodedata.digit(u'9') == 9) verify(unicodedata.digit(u'\u215b',None) is None) verify(unicodedata.digit(u'\u2468') == 9) verify(unicodedata.numeric(u'A',None) is None) verify(unicodedata.numeric(u'9') == 9) verify(unicodedata.numeric(u'\u215b') == 0.125) verify(unicodedata.numeric(u'\u2468') == 9.0) verify(unicodedata.decimal(u'A',None) is None) verify(unicodedata.decimal(u'9') == 9) verify(unicodedata.decimal(u'\u215b',None) is None) verify(unicodedata.decimal(u'\u2468',None) is None) verify(unicodedata.category(u'\uFFFE') == 'Cn')
""" Test script for the unicodedata module. Written by Marc-Andre Lemburg ([email protected]). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" from test_support import verify, verbose import sha encoding = 'utf-8' def test_methods(): h = sha.sha() for i in range(65536): char = unichr(i) data = [ # Predicates (single char) char.isalnum() and u'1' or u'0', char.isalpha() and u'1' or u'0', char.isdecimal() and u'1' or u'0', char.isdigit() and u'1' or u'0', char.islower() and u'1' or u'0', char.isnumeric() and u'1' or u'0', char.isspace() and u'1' or u'0', char.istitle() and u'1' or u'0', char.isupper() and u'1' or u'0', # Predicates (multiple chars) (char + u'abc').isalnum() and u'1' or u'0', (char + u'abc').isalpha() and u'1' or u'0', (char + u'123').isdecimal() and u'1' or u'0', (char + u'123').isdigit() and u'1' or u'0', (char + u'abc').islower() and u'1' or u'0', (char + u'123').isnumeric() and u'1' or u'0', (char + u' \t').isspace() and u'1' or u'0',