def parse_array(tokens: List[Token], index: int) -> Tuple[List[Any], int]:
    """Parse a JSON array whose open bracket is at `index` in `tokens`.

    Args:
        tokens: The full token sequence being parsed.
        index: Position of the array's OPEN_BRACKET token.

    Returns:
        A pair of the parsed list and the index of the token that follows
        the array's close bracket.

    Raises:
        JSONDecodeError: If the array is not closed before the tokens run
            out, if an element is not followed by a comma or close bracket,
            or if an element itself fails to parse.
    """
    assert tokens[index].token_type == TokenType.OPEN_BRACKET

    index += 1
    if index >= len(tokens):
        raise JSONDecodeError("unterminated array")

    result_list: List[Any] = []
    if tokens[index].token_type == TokenType.CLOSE_BRACKET:
        # Empty list.
        return result_list, index + 1

    expecting_value = True
    while index < len(tokens):
        if expecting_value:
            value, index = parse_value(tokens, index)
            result_list.append(value)
            # A comma or close bracket should come next, not another value.
            expecting_value = False
            continue

        token_type = tokens[index].token_type
        if token_type == TokenType.CLOSE_BRACKET:
            # Found the end of the list.
            return result_list, index + 1
        if token_type != TokenType.COMMA:
            raise JSONDecodeError("expecting comma or close bracket in list")

        # Another value *must* come next. We can't have two commas in a row
        # or a comma immediately before a close bracket.
        expecting_value = True
        index += 1

    # The tokens ran out before a close bracket was seen. Returning here
    # would hand the caller a partial list and an out-of-range index.
    raise JSONDecodeError("unterminated array")
def _parse_integer(
    literal: str,
    start_index: int = 0,
    allow_plus: bool = False,
    allow_leading_zeros: bool = False,
) -> Tuple[int, int]:
    """Parse an optionally signed run of decimal digits out of `literal`.

    Args:
        literal: The text to read from.
        start_index: Position in `literal` where the integer begins.
        allow_plus: Whether a leading "+" sign is accepted (a leading "-"
            is always accepted).
        allow_leading_zeros: Whether a multi-digit run may start with "0".

    Returns:
        A pair of the parsed integer and the index of the first character
        after its last digit.

    Raises:
        JSONDecodeError: If there is no digit at the expected position, or
            if leading zeros are present and not allowed.
    """
    if start_index >= len(literal):
        raise JSONDecodeError("expected number")

    sign_char = literal[start_index]
    is_negative = sign_char == "-"
    if is_negative or (allow_plus and sign_char == "+"):
        start_index += 1

    index = start_index
    integer = 0
    # We could just use the int(str) function, but that would be cheating.
    # Only ASCII 0-9 are JSON digits. str.isdigit() would also accept
    # characters such as superscripts or non-Latin digits, so compare
    # against the ASCII range explicitly.
    while index < len(literal) and "0" <= literal[index] <= "9":
        integer = integer * 10 + _parse_digit(literal[index])
        index += 1

    digit_count = index - start_index
    if digit_count == 0:
        raise JSONDecodeError("expected number")
    if not allow_leading_zeros and digit_count > 1 and literal[start_index] == "0":
        raise JSONDecodeError("leading zeros in number")

    return (-integer if is_negative else integer), index
def parse_number(token: Token) -> Union[int, float]:
    """Convert a number token's lexeme into an int or a float.

    A lexeme with neither a fraction nor an exponent yields an int; anything
    else yields a float.

    The float is built from an integer mantissa and one combined power of
    ten, so the sign is applied exactly once (the fraction is never added to
    an already-negated integer part) and only a single rounding step occurs.
    Magnitudes too large for a float become infinity and magnitudes too
    small become zero, instead of raising OverflowError.

    Args:
        token: The number token; only its `lexeme` is read.

    Returns:
        The parsed number.

    Raises:
        JSONDecodeError: If the lexeme is not a well-formed number (no
            digits after the period, malformed exponent, trailing
            characters, or an error reported by `_parse_integer`).
    """
    literal = token.lexeme
    length = len(literal)

    integer_part, index = _parse_integer(literal)
    if index == length:
        # Plain integer: no fraction, no exponent.
        return integer_part

    is_negative = literal.startswith("-")
    # Work on the magnitude only; the sign is re-applied at the very end.
    mantissa = abs(integer_part)

    decimal_places = 0
    if literal[index] == ".":
        index += 1
        # Fold fraction digits into the mantissa; only ASCII 0-9 count.
        while index < length and "0" <= literal[index] <= "9":
            mantissa = mantissa * 10 + _parse_digit(literal[index])
            decimal_places += 1
            index += 1
        if decimal_places == 0:
            raise JSONDecodeError("no decimal places after period")

    exponent = 0
    if index < length and literal[index].lower() == "e":
        exponent, index = _parse_integer(
            literal, index + 1, allow_plus=True, allow_leading_zeros=True
        )

    if index < length:
        raise JSONDecodeError("unexpected characters after number")

    # The value is mantissa * 10 ** scale.
    scale = exponent - decimal_places
    magnitude: float
    if mantissa == 0:
        magnitude = 0.0
    elif scale > 308:
        # mantissa >= 1, so the value exceeds the largest finite float.
        # Checking first also avoids building an enormous integer power.
        magnitude = float("inf")
    elif scale + length < -324:
        # mantissa has fewer than `length` digits, so the value is below
        # half of the smallest positive float and rounds to zero.
        magnitude = 0.0
    else:
        try:
            if scale >= 0:
                magnitude = float(mantissa * 10 ** scale)
            else:
                # True division of two ints rounds only once.
                magnitude = mantissa / 10 ** -scale
        except OverflowError:
            magnitude = float("inf")

    return -magnitude if is_negative else magnitude
def parse_value(tokens: List[Token], index: int) -> Tuple[Any, int]:
    """Parse the JSON value that starts at `tokens[index]`.

    Scalar tokens (null, boolean, number, Infinity, NaN, string) consume a
    single token. Arrays and objects are delegated to `parse_array` and
    `parse_object`, which report where they end.

    Args:
        tokens: The full token sequence being parsed.
        index: Position of the first token of the value.

    Returns:
        A pair of the parsed value and the index of the token after it.

    Raises:
        JSONDecodeError: If `index` is past the end of `tokens` or the token
            there cannot start a value.
        AssertionError: If an INFINITY token carries an unexpected lexeme.
    """
    if index >= len(tokens):
        raise JSONDecodeError(f"expecting value at index {index}")

    token = tokens[index]
    token_type = token.token_type

    # Containers span several tokens and return their own end index.
    if token_type == TokenType.OPEN_BRACKET:
        return parse_array(tokens, index)
    if token_type == TokenType.OPEN_BRACE:
        return parse_object(tokens, index)

    value: Any
    if token_type == TokenType.NULL:
        assert token.lexeme == "null", "null token lexeme must be 'null'"
        value = None
    elif token_type == TokenType.BOOLEAN:
        value = parse_boolean(token)
    elif token_type == TokenType.NUMBER:
        value = parse_number(token)
    elif token_type == TokenType.INFINITY:
        if token.lexeme == "Infinity":
            value = math.inf
        elif token.lexeme == "-Infinity":
            value = -math.inf
        else:
            # Raise explicitly: an `assert False` is stripped under -O and
            # the function would then fall through and return None.
            raise AssertionError("invalid infinity lexeme")
    elif token_type == TokenType.NAN:
        assert token.lexeme == "NaN", "NaN token lexeme must be 'NaN'"
        value = math.nan
    elif token_type == TokenType.STRING:
        value = parse_string(token)
    else:
        raise JSONDecodeError(f"expecting value at index {index}")

    return value, index + 1
def parse_string(token: Token) -> str:
    """Decode the quoted lexeme of a string token into the text it denotes.

    The surrounding double quotes are dropped, the single-character
    backslash escapes (b, f, n, r, t, double quote, slash, backslash) are
    translated, and backslash-u escapes with four hex digits are turned into
    the corresponding character. A high surrogate escape immediately
    followed by a low surrogate escape is combined into one character; a
    lone surrogate is kept as is.

    Args:
        token: The string token; only its `lexeme` is read.

    Returns:
        The decoded string.

    Raises:
        JSONDecodeError: On an unknown escape, a backslash at the very end,
            or a backslash-u escape that is too short or contains non-hex
            characters.
    """
    lexeme = token.lexeme
    has_quotes = lexeme.startswith('"') and lexeme.endswith('"')
    assert has_quotes, "string lexeme not quoted"

    simple_escapes = {
        "b": "\b",
        "f": "\f",
        "n": "\n",
        "r": "\r",
        "t": "\t",
        '"': '"',
        "/": "/",
        "\\": "\\",
    }
    # Checked explicitly because int(text, 16) also tolerates signs,
    # underscores, whitespace and a "0x" prefix, none of which are valid
    # inside a unicode escape.
    hex_digits = "0123456789abcdefABCDEF"

    body = lexeme[1:-1]
    end = len(body)
    chars: List[str] = []
    # Code point of a high surrogate that was the most recent output and
    # came from a unicode escape; None otherwise.
    pending_high: Optional[int] = None

    position = 0
    while position < end:
        char = body[position]
        position += 1

        if char != "\\":
            # Regular old character.
            chars.append(char)
            pending_high = None
            continue

        # Backslash: the next character selects the escape.
        if position >= end:
            raise JSONDecodeError("unterminated escape sequence")
        escape = body[position]
        position += 1

        if escape != "u":
            if escape not in simple_escapes:
                raise JSONDecodeError("invalid \\escape")
            chars.append(simple_escapes[escape])
            pending_high = None
            continue

        literal = body[position:position + 4]
        if len(literal) < 4:
            raise JSONDecodeError("unterminated unicode literal")
        if any(digit not in hex_digits for digit in literal):
            raise JSONDecodeError("invalid hex in unicode literal")
        code_point = int(literal, 16)
        position += 4

        if pending_high is not None and 0xDC00 <= code_point <= 0xDFFF:
            # Low surrogate completing a pair: replace the high surrogate
            # already emitted with the combined character.
            combined = 0x10000 + ((pending_high - 0xD800) << 10) + (code_point - 0xDC00)
            chars[-1] = chr(combined)
            pending_high = None
        else:
            chars.append(chr(code_point))
            pending_high = code_point if 0xD800 <= code_point <= 0xDBFF else None

    return "".join(chars)
def lex(json_string: str) -> List[Token]:
    """Split `json_string` into a list of tokens.

    Whitespace between tokens is skipped. At every other position each
    lexing function is tried in turn and the first match wins.

    Raises:
        JSONDecodeError: If no lexing function matches at some position.
    """
    # Ordering of lex functions is based roughly on efficiency of the function
    # (faster functions first) and how common the corresponding token type is
    # expected to be (more common first) to avoid expensive negative checks.
    lex_funcs: List[LexFunc] = [
        lex_delimiter,
        lex_null,
        lex_bool,
        lex_nan,
        lex_infinity,  # Must come before lex_number, as both check for negative sign.
        lex_number,
        lex_string,
    ]

    tokens: List[Token] = []
    length = len(json_string)
    index = 0
    while index < length:
        if json_string[index] in JSON_WHITESPACE:
            index += 1
            continue

        for lex_func in lex_funcs:
            match = lex_func(json_string, index)
            if match:
                break
        else:
            # Every lexing function declined this position.
            raise JSONDecodeError(f"invalid character at index {index}")

        tokens.append(match.token)
        index = match.next_index

    return tokens
def parse_object(tokens: List[Token], index: int) -> Tuple[Dict[str, Any], int]:
    """Parse a JSON object whose open brace is at `index` in `tokens`.

    Each member is read as: string key, colon, value, then either a comma
    (another member follows) or a close brace (end of the object). When a
    key repeats, the later value replaces the earlier one.

    Args:
        tokens: The full token sequence being parsed.
        index: Position of the object's OPEN_BRACE token.

    Returns:
        A pair of the parsed dict and the index of the token that follows
        the object's close brace.

    Raises:
        JSONDecodeError: If a key is not a string, a colon or delimiter is
            missing, a value fails to parse, or the tokens run out before
            the closing brace.
    """
    assert tokens[index].token_type == TokenType.OPEN_BRACE

    index += 1
    if index >= len(tokens):
        raise JSONDecodeError("unterminated object")

    result_dict: Dict[str, Any] = {}
    if tokens[index].token_type == TokenType.CLOSE_BRACE:
        # Empty dict.
        return result_dict, index + 1

    while index < len(tokens):
        # Key.
        key_token = tokens[index]
        if key_token.token_type != TokenType.STRING:
            raise JSONDecodeError("object keys must be strings")
        current_key = parse_string(key_token)
        index += 1

        # Colon. It must be consumed here, before the value is parsed;
        # otherwise the colon token itself gets handed to parse_value.
        if index >= len(tokens):
            break
        if tokens[index].token_type != TokenType.COLON:
            raise JSONDecodeError(f"expected colon at index {index}")
        index += 1

        # Value. parse_value raises if the tokens have run out.
        value, index = parse_value(tokens, index)
        result_dict[current_key] = value

        # We reached the end of a key/value pair, so we expect either a
        # comma, or a close brace to end the object.
        if index >= len(tokens):
            break
        token_type = tokens[index].token_type
        if token_type == TokenType.CLOSE_BRACE:
            # Found the end of the object.
            return result_dict, index + 1
        if token_type != TokenType.COMMA:
            raise JSONDecodeError(f"expected comma or close brace at index {index}")
        index += 1

    # The tokens ran out before a close brace was seen. Returning here
    # would hand the caller a partial dict and an out-of-range index.
    raise JSONDecodeError("unterminated object")
def lex_string(json_string: str, start_index: int) -> Optional[TokenMatch]:
    """Try to lex a double-quoted string starting at `start_index`.

    Returns None when the character at `start_index` is not a double quote.
    Otherwise scans for the first unescaped closing quote and returns a
    match whose token lexeme includes both quotes.

    Raises:
        JSONDecodeError: If the input ends before a closing quote is found.
    """
    if json_string[start_index] != '"':
        return None

    # True when the previous character was a backslash that escapes the
    # current one (i.e. it was itself not escaped).
    skip_next = False
    position = start_index + 1
    end = len(json_string)
    while position < end:
        current = json_string[position]
        if skip_next:
            # This character is escaped, whatever it is.
            skip_next = False
        elif current == "\\":
            skip_next = True
        elif current == '"':
            after_quote = position + 1
            lexeme = json_string[start_index:after_quote]
            token = Token(TokenType.STRING, lexeme, start_index)
            return TokenMatch(token=token, next_index=after_quote)
        position += 1

    # Got to the end of the input string without finding a closing quote.
    raise JSONDecodeError(
        f"unterminated string starting at index {start_index}")
def parse(tokens: List[Token]) -> Union[None, bool, float, str, list, dict]:
    """Parse a complete token sequence into a single JSON value.

    Raises:
        JSONDecodeError: If the first value does not end exactly at the end
            of `tokens`, or if parsing that value fails.
    """
    value, next_index = parse_value(tokens, 0)
    if next_index == len(tokens):
        return value
    raise JSONDecodeError("more than one value at top level of json")