def unescape_unicode(raw_str: str):
    r"""
    Turn literal ``\uXXXX`` escape sequences in a string into real characters.

    The text is encoded with the ``raw_unicode_escape`` codec and the
    resulting bytes are decoded again with the same codec.

    >>> unescape_unicode(r'Sample\u0301') == 'Sample' + '\u0301'
    True

    :param raw_str: a raw string with unicode escapes
    :return: an unescaped unicode string
    """
    # Each codec function returns a (data, length) pair; only the data is used.
    escaped_bytes = codecs.raw_unicode_escape_encode(raw_str)[0]
    return codecs.raw_unicode_escape_decode(escaped_bytes)[0]
Example #2
0
    def test_codecs_builtins(self):
        s = "abc"

        encoded = codecs.utf_8_encode(s)
        self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0])

        encoded = codecs.utf_7_encode(s)
        self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0])

        encoded = codecs.utf_16_encode(s)
        self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0])

        encoded = codecs.utf_16_le_encode(s)
        self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0])

        encoded = codecs.utf_16_be_encode(s)
        self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_encode(s)
        self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0])

        encoded = codecs.utf_32_le_encode(s)
        self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.raw_unicode_escape_encode(s)
        self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0])

        encoded = codecs.unicode_escape_encode(s)
        self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0])

        encoded = codecs.latin_1_encode(s)
        self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0])

        encoded = codecs.ascii_encode(s)
        self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
Example #3
0
 def encode(self, input, final=False):
     """Encode *input* with the raw-unicode-escape codec and return the data.

     ``self.errors`` is handed to the codec as its error-handling argument.
     ``final`` is accepted but not used by this implementation.
     """
     # The codec returns (encoded_data, length); callers only get the data.
     encoded, _consumed = codecs.raw_unicode_escape_encode(input, self.errors)
     return encoded
Example #4
0
 def test_raw_unicode_escape_encode(self):
     #sanity
     new_str, size = codecs.raw_unicode_escape_encode("abc")
     self.assertEqual(new_str, 'abc')
     self.assertEqual(size, 3)
Example #5
0
 def test_raw_unicode_escape_encode(self):
     #sanity
     new_str, num_processed = codecs.raw_unicode_escape_encode("abc")
     self.assertEqual(new_str, b'abc')
     self.assertEqual(num_processed, 3)
Example #6
0
 def raw_unistr(self, unistr):
     """Return *unistr* raw-unicode-escape encoded, as text.

     The escaped bytes produced by the codec are decoded with GBK to turn
     them back into a string.
     """
     escaped_bytes, _length = codecs.raw_unicode_escape_encode(unistr)
     return escaped_bytes.decode('gbk')
Example #7
0
    def encode(self, etype=None, emethod=None):
        '''
        Encode ``self.code`` with the requested scheme.

        @params:
            etype: encoding scheme, case-insensitive, defaults to "url".
                Handled here: md5, sha, base64, base32, hex, decimal,
                unicode, unicode-all, url, url-all, html, html-all,
                php-chr, utf7, utf7-all.
            emethod: character encoding applied to the text before the
                byte-oriented schemes run, case-insensitive; defaults to
                the encoding of sys.stdout.
        @return:
            a list of results; schemes with several notations (sha, hex,
            decimal, html-all) return one entry per notation.
        @raise:
            EncodeError if etype is not one of the handled schemes.
        '''
        # sys.stdout.encoding can be None (e.g. redirected output); fall back
        # so decode()/encode() below are never handed None.
        stdout_encoding = sys.stdout.encoding or sys.getdefaultencoding()

        etype = etype.lower() if etype else "url"
        emethod = emethod.lower() if emethod else stdout_encoding

        text = self.code.decode(stdout_encoding)
        ecode = text.encode(emethod)
        # bytearray yields integer byte values regardless of how the
        # interpreter iterates a byte string.
        byte_values = bytearray(ecode)

        if etype == 'md5':
            return [hashlib.md5(ecode).hexdigest()]
        if etype == 'sha':
            result = []
            for name in ('sha1', 'sha224', 'sha256', 'sha384', 'sha512'):
                digest = getattr(hashlib, name)(ecode).hexdigest()
                result.append(name + ": " + digest + "\n")
            return result
        if etype == 'base64':
            return [base64.b64encode(ecode)]
        if etype == 'base32':
            return [base64.b32encode(ecode)]

        if etype == 'hex':
            # "\xNN" escapes need exactly two hex digits, so pad them.
            escaped = ['\\x%02x' % b for b in byte_values]
            literals = [hex(b) for b in byte_values]
            return ["".join(escaped), "".join(literals), ",".join(literals)]
        if etype == 'decimal':
            tmp = [str(b) for b in byte_values]
            return [" ".join(tmp), ",".join(tmp)]

        if etype == 'unicode':
            return [codecs.raw_unicode_escape_encode(text)[0]]
        if etype == 'unicode-all':
            # NOTE(review): this branch indexes the escaped data character by
            # character (ord(tmp[i])), i.e. it expects a Python 2 style str.
            tmp = codecs.raw_unicode_escape_encode(text)[0]
            pieces = []
            i = 0
            while i < len(tmp):
                marker = tmp[i:i + 2]
                if marker == '\\u' or marker == '\\U':
                    # Existing escape: "\uXXXX" spans 6 characters and
                    # "\UXXXXXXXX" spans 10; copy it through untouched.
                    width = 6 if marker == '\\u' else 10
                    pieces.append(tmp[i:i + width])
                    i += width
                else:
                    # Zero-pad so every escape has four hex digits.
                    pieces.append('\\u%04x' % ord(tmp[i]))
                    i += 1
            return ["".join(pieces)]

        if etype == 'url':
            return [urllib.quote(ecode)]
        if etype == 'url-all':
            # Percent-encoding requires two hex digits per byte.
            tmp = ['%%%02X' % b for b in byte_values]
            return ["".join(tmp)]

        if etype == 'html':
            return [cgi.escape(self.code, quote=True)]
        if etype == 'html-all':
            hexstr = ["&#" + hex(ord(x))[1:] + ";" for x in self.code]
            decstr = ["&#" + str(ord(x)) + ";" for x in self.code]
            return ["".join(hexstr), "".join(decstr)]

        if etype == 'php-chr':
            tmp = ["chr({0})".format(b) for b in byte_values]
            return [".".join(tmp)]

        if etype == 'utf7':
            return [text.encode('utf7')]

        if etype == 'utf7-all':
            return [self._utf7EncodeAll(text)]

        raise EncodeError("unrecognized type, should be {0}".format(
            self.encodeTypes))
Example #8
0
 def test_unescape_unicode(self):
     """Check that utils.unescape_unicode turns a literal escape into one character.

     The escaped combining-accent sequence after "Sample" must collapse to a
     single code point, and that code point must re-encode to the same escape.
     """
     base = 'Sample'
     unescaped = utils.unescape_unicode(r'Sample\u0301')
     assert unescaped == base + '\u0301'
     assert len(unescaped) == len(base) + 1
     # Index 6 is the character right after the six letters of "Sample".
     reencoded, _length = codecs.raw_unicode_escape_encode(unescaped[6])
     assert reencoded == rb'\u0301'
Example #9
0
    def encode(self, etype=None, emethod=None):
        '''
        Encode ``self.code`` with the requested scheme.

        @params:
            etype: encoding scheme, case-insensitive, defaults to "url".
                Handled here: md5, sha, base64, base32, hex, decimal,
                unicode, unicode-all, url, url-all, html, html-all,
                php-chr, utf7, utf7-all.
            emethod: character encoding applied to the text before the
                byte-oriented schemes run, case-insensitive; defaults to
                the encoding of sys.stdout.
        @return:
            a list of results; schemes with several notations (sha, hex,
            decimal, html-all) return one entry per notation.
        @raise:
            EncodeError if etype is not one of the handled schemes.
        '''
        # sys.stdout.encoding can be None (e.g. redirected output); fall back
        # so decode()/encode() below are never handed None.
        stdout_encoding = sys.stdout.encoding or sys.getdefaultencoding()

        etype = etype.lower() if etype else "url"
        emethod = emethod.lower() if emethod else stdout_encoding

        text = self.code.decode(stdout_encoding)
        ecode = text.encode(emethod)
        # bytearray yields integer byte values regardless of how the
        # interpreter iterates a byte string.
        byte_values = bytearray(ecode)

        if etype == 'md5':
            return [hashlib.md5(ecode).hexdigest()]
        if etype == 'sha':
            result = []
            for name in ('sha1', 'sha224', 'sha256', 'sha384', 'sha512'):
                digest = getattr(hashlib, name)(ecode).hexdigest()
                result.append(name + ": " + digest + "\n")
            return result
        if etype == 'base64':
            return [base64.b64encode(ecode)]
        if etype == 'base32':
            return [base64.b32encode(ecode)]

        if etype == 'hex':
            # "\xNN" escapes need exactly two hex digits, so pad them.
            escaped = ['\\x%02x' % b for b in byte_values]
            literals = [hex(b) for b in byte_values]
            return ["".join(escaped), "".join(literals), ",".join(literals)]
        if etype == 'decimal':
            tmp = [str(b) for b in byte_values]
            return [" ".join(tmp), ",".join(tmp)]

        if etype == 'unicode':
            return [codecs.raw_unicode_escape_encode(text)[0]]
        if etype == 'unicode-all':
            # NOTE(review): this branch indexes the escaped data character by
            # character (ord(tmp[i])), i.e. it expects a Python 2 style str.
            tmp = codecs.raw_unicode_escape_encode(text)[0]
            pieces = []
            i = 0
            while i < len(tmp):
                marker = tmp[i:i+2]
                if marker == '\\u' or marker == '\\U':
                    # Existing escape: "\uXXXX" spans 6 characters and
                    # "\UXXXXXXXX" spans 10; copy it through untouched.
                    width = 6 if marker == '\\u' else 10
                    pieces.append(tmp[i:i+width])
                    i += width
                else:
                    # Zero-pad so every escape has four hex digits.
                    pieces.append('\\u%04x' % ord(tmp[i]))
                    i += 1
            return ["".join(pieces)]

        if etype == 'url':
            return [urllib.quote(ecode)]
        if etype == 'url-all':
            # Percent-encoding requires two hex digits per byte.
            tmp = ['%%%02X' % b for b in byte_values]
            return ["".join(tmp)]

        if etype == 'html':
            return [cgi.escape(self.code, quote=True)]
        if etype == 'html-all':
            hexstr = ["&#"+hex(ord(x))[1:]+";" for x in self.code]
            decstr = ["&#"+str(ord(x))+";" for x in self.code]
            return ["".join(hexstr), "".join(decstr)]

        if etype == 'php-chr':
            tmp = ["chr({0})".format(b) for b in byte_values]
            return [".".join(tmp)]

        if etype == 'utf7':
            return [text.encode('utf7')]

        if etype == 'utf7-all':
            return [self._utf7EncodeAll(text)]

        raise EncodeError("unrecognized type, should be {0}".format(self.encodeTypes))
Example #10
0
 def test_raw_unicode_escape_encode(self):
     #sanity
     new_str, size = codecs.raw_unicode_escape_encode("abc")
     self.assertEqual(new_str, 'abc')
     self.assertEqual(size, 3)
Example #11
0
 def update_event(self, inp=-1):
     """Raw-unicode-escape encode the node inputs and publish the result.

     Input 0 is passed as the data to encode and input 1 as the codec's
     second argument; the value returned by the codec is stored unchanged
     on output 0. ``inp`` is not used.
     """
     data = self.input(0)
     errors = self.input(1)
     encoded = codecs.raw_unicode_escape_encode(data, errors)
     self.set_output_val(0, encoded)
Example #12
0
    def encode(self, etype=None, emethod=None):
        '''
        Encode ``self.code`` with the requested scheme.

        @params:
            etype: encoding scheme, case-insensitive, defaults to "url".
                Handled here: md5, sha, base64, base32, hex, decimal,
                unicode, unicode-all, url, url-all, html, html-all.
            emethod: character encoding applied to the text before the
                byte-oriented schemes run, case-insensitive; defaults to
                the encoding of sys.stdout.
        @return:
            a list of results; schemes with several notations (sha, hex,
            decimal, html-all) return one entry per notation.
        @raise:
            EncodeError if etype is not one of the handled schemes.
        '''
        # sys.stdout.encoding can be None (e.g. redirected output); fall back
        # so decode()/encode() below are never handed None.
        stdout_encoding = sys.stdout.encoding or sys.getdefaultencoding()

        etype = etype.lower() if etype else "url"
        emethod = emethod.lower() if emethod else stdout_encoding

        text = self.code.decode(stdout_encoding)
        ecode = text.encode(emethod)
        # bytearray yields integer byte values regardless of how the
        # interpreter iterates a byte string.
        byte_values = bytearray(ecode)

        if etype == 'md5':
            return [hashlib.md5(ecode).hexdigest()]
        if etype == 'sha':
            result = []
            for name in ('sha1', 'sha224', 'sha256', 'sha384', 'sha512'):
                digest = getattr(hashlib, name)(ecode).hexdigest()
                result.append(name + ": " + digest + "\n")
            return result
        if etype == 'base64':
            return [base64.b64encode(ecode)]
        if etype == 'base32':
            return [base64.b32encode(ecode)]

        if etype == 'hex':
            # "\xNN" escapes need exactly two hex digits, so pad them.
            escaped = ['\\x%02x' % b for b in byte_values]
            literals = [hex(b) for b in byte_values]
            return ["".join(escaped), "".join(literals), ",".join(literals)]
        if etype == 'decimal':
            tmp = [str(b) for b in byte_values]
            return [" ".join(tmp), ",".join(tmp)]

        if etype == 'unicode':
            return [codecs.raw_unicode_escape_encode(text)[0]]
        if etype == 'unicode-all':
            # NOTE(review): this branch indexes the escaped data character by
            # character (ord(tmp[i])), i.e. it expects a Python 2 style str.
            tmp = codecs.raw_unicode_escape_encode(text)[0]
            pieces = []
            i = 0
            while i < len(tmp):
                marker = tmp[i:i+2]
                if marker == '\\u' or marker == '\\U':
                    # Existing escape: "\uXXXX" spans 6 characters and
                    # "\UXXXXXXXX" spans 10; copy it through untouched.
                    width = 6 if marker == '\\u' else 10
                    pieces.append(tmp[i:i+width])
                    i += width
                else:
                    # Zero-pad so every escape has four hex digits.
                    pieces.append('\\u%04x' % ord(tmp[i]))
                    i += 1
            return ["".join(pieces)]

        if etype == 'url':
            return [urllib.quote(ecode)]
        if etype == 'url-all':
            # Percent-encoding requires two hex digits per byte; this variant
            # emits lowercase digits.
            tmp = ['%%%02x' % b for b in byte_values]
            return ["".join(tmp)]

        if etype == 'html':
            return [cgi.escape(self.code, quote=True)]
        if etype == 'html-all':
            hexstr = ["&#"+hex(ord(x))[1:]+";" for x in self.code]
            decstr = ["&#"+str(ord(x))+";" for x in self.code]
            return ["".join(hexstr), "".join(decstr)]

        raise EncodeError("unrecognized type, should be {0}".format(self.encodeTypes))