def unescape_unicode(raw_str: str) -> str:
    r"""Replace literal unicode escapes in *raw_str* with real characters.

    The text is pushed through the ``raw_unicode_escape`` codec in both
    directions: it is first encoded to bytes, and those bytes are then
    decoded again. A sequence such as ``\u0301`` that is spelled out as six
    separate characters in the input therefore comes back as the single
    code point U+0301.

    :param raw_str: a string that may contain literal unicode escapes
    :return: the string with those escapes turned into characters
    """
    # Only the payload of each codec result is needed; the length is ignored.
    escaped_bytes = codecs.raw_unicode_escape_encode(raw_str)[0]
    return codecs.raw_unicode_escape_decode(escaped_bytes)[0]
def test_codecs_builtins(self): s = "abc" encoded = codecs.utf_8_encode(s) self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0]) encoded = codecs.utf_7_encode(s) self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0]) encoded = codecs.utf_16_encode(s) self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0]) encoded = codecs.utf_16_le_encode(s) self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0]) encoded = codecs.utf_16_be_encode(s) self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0]) encoded = codecs.utf_32_encode(s) self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0]) encoded = codecs.utf_32_le_encode(s) self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.raw_unicode_escape_encode(s) self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0]) encoded = codecs.unicode_escape_encode(s) self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0]) encoded = codecs.latin_1_encode(s) self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0]) encoded = codecs.ascii_encode(s) self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
def encode(self, input, final=False):
    """Encode *input* with the raw-unicode-escape codec.

    ``self.errors`` is handed to the codec as its error handler. The codec
    returns the encoded data together with a length; only the encoded data
    is returned from here. *final* is accepted but not used by this method.
    """
    encoded, _consumed = codecs.raw_unicode_escape_encode(input, self.errors)
    return encoded
def test_raw_unicode_escape_encode(self): #sanity new_str, size = codecs.raw_unicode_escape_encode("abc") self.assertEqual(new_str, 'abc') self.assertEqual(size, 3)
def test_raw_unicode_escape_encode(self): #sanity new_str, num_processed = codecs.raw_unicode_escape_encode("abc") self.assertEqual(new_str, b'abc') self.assertEqual(num_processed, 3)
def raw_unistr(self, unistr):
    """Re-interpret *unistr* as GBK text.

    The string is encoded with the raw-unicode-escape codec and the
    resulting bytes are decoded as GBK.

    NOTE(review): presumably this repairs text whose GBK bytes were read as
    Latin-1 code points -- confirm against the callers.
    """
    raw_bytes, _length = codecs.raw_unicode_escape_encode(unistr)
    return raw_bytes.decode('gbk')
def encode(self, etype=None, emethod=None):
    '''
    Encode ``self.code`` and return the result(s) as a list.

    @params:
        etype: encoding type, case-insensitive, default "url". Recognised
               values: md5, sha, base64, base32, hex, decimal, unicode,
               unicode-all, url, url-all, html, html-all, php-chr, utf7,
               utf7-all.
        emethod: character encoding applied to the text before the
                 byte-oriented types are computed, e.g.
                 utf8/utf16/gbk/gb2312/big5; defaults to the encoding of
                 STDOUT.
    @return: a list of encoded strings; some types return several
             renderings of the same data.
    @raise EncodeError: if etype is not one of the recognised types.
    '''
    etype = etype.lower() if etype else "url"
    # sys.stdout.encoding can be None (for example Python 2 with piped
    # output); fall back to the interpreter default instead of passing
    # None to decode()/encode().
    stdout_encoding = sys.stdout.encoding or sys.getdefaultencoding()
    emethod = emethod.lower() if emethod else stdout_encoding
    text = self.code.decode(stdout_encoding)
    ecode = text.encode(emethod)

    if etype == 'md5':
        return [hashlib.md5(ecode).hexdigest()]
    if etype == 'sha':
        return [
            "{0}: {1}\n".format(name, getattr(hashlib, name)(ecode).hexdigest())
            for name in ("sha1", "sha224", "sha256", "sha384", "sha512")
        ]
    if etype == 'base64':
        return [base64.b64encode(ecode)]
    if etype == 'base32':
        return [base64.b32encode(ecode)]
    if etype == 'hex':
        # Iterating a bytearray yields integers on Python 2 and Python 3.
        # Two hex digits are always emitted so bytes below 0x10 do not
        # collapse to a single digit.
        tmp1 = ['\\x%02x' % b for b in bytearray(ecode)]
        tmp2 = ['0x%02x' % b for b in bytearray(ecode)]
        return ["".join(tmp1), "".join(tmp2), ",".join(tmp2)]
    if etype == 'decimal':
        tmp = [str(b) for b in bytearray(ecode)]
        return [" ".join(tmp), ",".join(tmp)]
    if etype == 'unicode':
        return [codecs.raw_unicode_escape_encode(text)[0]]
    if etype == 'unicode-all':
        # Escape each code point directly instead of re-parsing the output
        # of raw_unicode_escape: that output cannot be split reliably when
        # it contains 10-character \U escapes or a literal backslash-u.
        escaped = []
        for ch in text:
            code_point = ord(ch)
            template = '\\U%08x' if code_point > 0xFFFF else '\\u%04x'
            escaped.append(template % code_point)
        return ["".join(escaped)]
    if etype == 'url':
        return [urllib.quote(ecode)]
    if etype == 'url-all':
        # Percent-encoding requires exactly two hex digits per byte.
        tmp = ['%%%02X' % b for b in bytearray(ecode)]
        return ["".join(tmp)]
    if etype == 'html':
        return [cgi.escape(self.code, quote=True)]
    if etype == 'html-all':
        # Numeric character references need no padding, so this branch is
        # left operating on self.code exactly as before.
        hexstr = ["&#" + hex(ord(x))[1:] + ";" for x in self.code]
        decstr = ["&#" + str(ord(x)) + ";" for x in self.code]
        return ["".join(hexstr), "".join(decstr)]
    if etype == 'php-chr':
        tmp = ["chr({0})".format(b) for b in bytearray(ecode)]
        return [".".join(tmp)]
    if etype == 'utf7':
        return [text.encode('utf7')]
    if etype == 'utf7-all':
        return [self._utf7EncodeAll(text)]
    raise EncodeError("unrecognized type, should be {0}".format(
        self.encodeTypes))
def test_unescape_unicode(self):
    """Check that a literal escape is turned into one combining character.

    The raw input spells out the escape for U+0301 after the word
    ``Sample``; the result must be that word followed by exactly one extra
    character, and that character must encode back to the same escape.
    """
    word = 'Sample'
    s = utils.unescape_unicode(r'Sample\u0301')
    assert s == word + '\u0301'
    assert len(s) == len(word) + 1
    # Index 6 is the first position after the six letters of the word.
    encoded, _count = codecs.raw_unicode_escape_encode(s[6])
    assert encoded == rb'\u0301'
def encode(self, etype=None, emethod=None):
    '''
    Encode ``self.code`` in the requested format.

    @params:
        etype: encoding type (case-insensitive), default "url"; one of
               md5, sha, base64, base32, hex, decimal, unicode,
               unicode-all, url, url-all, html, html-all, php-chr, utf7,
               utf7-all.
        emethod: character encoding used to turn the text into bytes for
                 the byte-oriented types; defaults to the encoding of
                 STDOUT (utf8/utf16/gbk/gb2312/big5 and so on).
    @return: list of result strings (one or more renderings per type).
    @raise EncodeError: when etype is not a recognised type.
    '''
    etype = etype.lower() if etype else "url"
    # Guard against sys.stdout.encoding being None (seen on Python 2 when
    # output is piped), which would otherwise make decode() fail.
    console_encoding = sys.stdout.encoding or sys.getdefaultencoding()
    emethod = emethod.lower() if emethod else console_encoding
    decoded = self.code.decode(console_encoding)
    ecode = decoded.encode(emethod)
    # A bytearray iterates as integers on both Python 2 and Python 3.
    byte_values = bytearray(ecode)

    if etype == 'md5':
        return [hashlib.md5(ecode).hexdigest()]
    if etype == 'sha':
        result = []
        result.append("sha1: " + hashlib.sha1(ecode).hexdigest() + "\n")
        result.append("sha224: " + hashlib.sha224(ecode).hexdigest() + "\n")
        result.append("sha256: " + hashlib.sha256(ecode).hexdigest() + "\n")
        result.append("sha384: " + hashlib.sha384(ecode).hexdigest() + "\n")
        result.append("sha512: " + hashlib.sha512(ecode).hexdigest() + "\n")
        return result
    if etype == 'base64':
        return [base64.b64encode(ecode)]
    if etype == 'base32':
        return [base64.b32encode(ecode)]
    if etype == 'hex':
        # Pad to two digits: a bare single digit for bytes below 0x10 is
        # not a well-formed two-digit hex escape.
        tmp1 = ['\\x%02x' % value for value in byte_values]
        tmp2 = ['0x%02x' % value for value in byte_values]
        return ["".join(tmp1), "".join(tmp2), ",".join(tmp2)]
    if etype == 'decimal':
        tmp = [str(value) for value in byte_values]
        return [" ".join(tmp), ",".join(tmp)]
    if etype == 'unicode':
        return [codecs.raw_unicode_escape_encode(decoded)[0]]
    if etype == 'unicode-all':
        # Build the escapes from the code points themselves. Scanning the
        # raw_unicode_escape output for backslash-u is ambiguous: it breaks
        # on 10-character \U escapes and on literal backslash-u in the text.
        pieces = [
            ('\\U%08x' if ord(ch) > 0xFFFF else '\\u%04x') % ord(ch)
            for ch in decoded
        ]
        return ["".join(pieces)]
    if etype == 'url':
        return [urllib.quote(ecode)]
    if etype == 'url-all':
        # Each byte must become '%' followed by exactly two hex digits.
        tmp = ['%%%02X' % value for value in byte_values]
        return ["".join(tmp)]
    if etype == 'html':
        return [cgi.escape(self.code, quote=True)]
    if etype == 'html-all':
        # Character references accept any number of digits; unchanged.
        hexstr = ["&#"+hex(ord(x))[1:]+";" for x in self.code]
        decstr = ["&#"+str(ord(x))+";" for x in self.code]
        return ["".join(hexstr), "".join(decstr)]
    if etype == 'php-chr':
        tmp = ["chr({0})".format(value) for value in byte_values]
        return [".".join(tmp)]
    if etype == 'utf7':
        return [decoded.encode('utf7')]
    if etype == 'utf7-all':
        return [self._utf7EncodeAll(decoded)]
    raise EncodeError("unrecognized type, should be {0}".format(self.encodeTypes))
def update_event(self, inp=-1):
    """Run ``codecs.raw_unicode_escape_encode`` on the node inputs.

    Input 0 and input 1 are read, in that order, and passed as the first
    and second argument of the codec call. Whatever the call returns is
    written unchanged to output 0. *inp* is not used by this method.
    """
    first_arg = self.input(0)
    second_arg = self.input(1)
    outcome = codecs.raw_unicode_escape_encode(first_arg, second_arg)
    self.set_output_val(0, outcome)
def encode(self, etype=None, emethod=None):
    '''
    Encode ``self.code`` and return the result(s) as a list.

    @params:
        etype: encoding type (case-insensitive), default "url"; one of
               md5, sha, base64, base32, hex, decimal, unicode,
               unicode-all, url, url-all, html, html-all.
        emethod: character encoding used to turn the text into bytes for
                 the byte-oriented types; defaults to the encoding of
                 STDOUT.
    @return: list of result strings (one or more renderings per type).
    @raise EncodeError: when etype is not a recognised type.
    '''
    etype = etype.lower() if etype else "url"
    # sys.stdout.encoding may be None (Python 2 with piped output); use the
    # interpreter default then, rather than handing None to decode().
    out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
    emethod = emethod.lower() if emethod else out_encoding
    unicode_text = self.code.decode(out_encoding)
    ecode = unicode_text.encode(emethod)

    if etype == 'md5':
        return [hashlib.md5(ecode).hexdigest()]
    if etype == 'sha':
        result = []
        result.append("sha1: " + hashlib.sha1(ecode).hexdigest() + "\n")
        result.append("sha224: " + hashlib.sha224(ecode).hexdigest() + "\n")
        result.append("sha256: " + hashlib.sha256(ecode).hexdigest() + "\n")
        result.append("sha384: " + hashlib.sha384(ecode).hexdigest() + "\n")
        result.append("sha512: " + hashlib.sha512(ecode).hexdigest() + "\n")
        return result
    if etype == 'base64':
        return [base64.b64encode(ecode)]
    if etype == 'base32':
        return [base64.b32encode(ecode)]
    if etype == 'hex':
        # bytearray iterates as integers on Python 2 and 3; always emit two
        # hex digits so bytes below 0x10 are not shortened to one digit.
        tmp1 = ['\\x%02x' % n for n in bytearray(ecode)]
        tmp2 = ['0x%02x' % n for n in bytearray(ecode)]
        return ["".join(tmp1), "".join(tmp2), ",".join(tmp2)]
    if etype == 'decimal':
        tmp = [str(n) for n in bytearray(ecode)]
        return [" ".join(tmp), ",".join(tmp)]
    if etype == 'unicode':
        return [codecs.raw_unicode_escape_encode(unicode_text)[0]]
    if etype == 'unicode-all':
        # Escape every code point directly; re-parsing raw_unicode_escape
        # output mis-splits 10-character \U escapes and literal backslash-u.
        chunks = []
        for ch in unicode_text:
            cp = ord(ch)
            chunks.append(('\\U%08x' if cp > 0xFFFF else '\\u%04x') % cp)
        return ["".join(chunks)]
    if etype == 'url':
        return [urllib.quote(ecode)]
    if etype == 'url-all':
        # Two digits per byte are mandatory in percent-encoding; the
        # lowercase digits of the existing output are kept.
        tmp = ['%%%02x' % n for n in bytearray(ecode)]
        return ["".join(tmp)]
    if etype == 'html':
        return [cgi.escape(self.code, quote=True)]
    if etype == 'html-all':
        # Character references accept any number of digits; unchanged.
        hexstr = ["&#"+hex(ord(x))[1:]+";" for x in self.code]
        decstr = ["&#"+str(ord(x))+";" for x in self.code]
        return ["".join(hexstr), "".join(decstr)]
    raise EncodeError("unrecognized type, should be {0}".format(self.encodeTypes))