def test_unicode_decode(self): for i in range(0, 0xd7ff): u = unichr(i) #s = '"\\u{0:04x}"'.format(i) s = '"\\u%04x"' % (i, ) self.assertEqual(json.loads(s), u)
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError("Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: if not isinstance(content, text_type): content = text_type(content, encoding) _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: msg = "Invalid control character %r at" % (terminator, ) #msg = "Invalid control character {0!r} at".format(terminator) raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError("Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\escape: " + repr(esc) raise JSONDecodeError(msg, s, end) end += 1 else: # Unicode escape sequence esc = s[end + 1:end + 5] next_end = end + 5 if len(esc) != 4: msg = "Invalid \\uXXXX escape" raise JSONDecodeError(msg, s, end) uni = int(esc, 16) # Check for surrogate pair on UCS-4 systems if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: msg = "Invalid \\uXXXX\\uXXXX surrogate pair" if not s[end + 5:end + 7] == '\\u': raise JSONDecodeError(msg, s, end) esc2 = s[end + 7:end + 11] if len(esc2) != 4: raise JSONDecodeError(msg, s, end) uni2 = int(esc2, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) next_end += 6 char = unichr(uni) end = next_end # Append the unescaped character _append(char) return u('').join(chunks), end
def test_unicode_decode(self): for i in range(0, 0xd7ff): u = unichr(i) #s = '"\\u{0:04x}"'.format(i) s = '"\\u%04x"' % (i,) self.assertEqual(json.loads(s), u)
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError( "Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: if not isinstance(content, text_type): content = text_type(content, encoding) _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: msg = "Invalid control character %r at" % (terminator,) #msg = "Invalid control character {0!r} at".format(terminator) raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError( "Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\escape: " + repr(esc) raise JSONDecodeError(msg, s, end) end += 1 else: # Unicode escape sequence esc = s[end + 1:end + 5] next_end = end + 5 if len(esc) != 4: msg = "Invalid \\uXXXX escape" raise JSONDecodeError(msg, s, end) uni = int(esc, 16) # Check for surrogate pair on UCS-4 systems if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: msg = "Invalid \\uXXXX\\uXXXX surrogate pair" if not s[end + 5:end + 7] == '\\u': raise JSONDecodeError(msg, s, end) esc2 = s[end + 7:end + 11] if len(esc2) != 4: raise JSONDecodeError(msg, s, end) uni2 = int(esc2, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) next_end += 6 char = unichr(uni) end = next_end # Append the unescaped character _append(char) return u('').join(chunks), end