class _Lexer(events.EventSource):
    """Turn tokenizer events into JSON structural and literal events.

    Every event produced by the internal ``_Tokenizer`` is forwarded to a
    state machine.  After each state change the lexer fires:

    * ``'LITERAL'`` with ``(JSONLiteralType, value)`` for strings, numbers,
      booleans and null;
    * an event named after the new state when that state is one of
      ``doc_start``, ``doc_end``, ``object_start``, ``object_end``,
      ``array_start`` or ``array_end``.
    """

    # Literal recognition.
    _number_pattern = re.compile('^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$')
    _true = 'true'
    _false = 'false'
    _null = 'null'

    # Name of the event fired for every scalar value.
    _literal = 'LITERAL'

    # State names.
    _s_new = 'new'
    _s_doc_start = 'doc_start'
    _s_doc_end = 'doc_end'
    _s_o_start = 'object_start'
    _s_o_end = 'object_end'
    _s_a_start = 'array_start'
    _s_a_end = 'array_end'
    _s_s_start = 'string_start'
    _s_s_end = 'string_end'
    _s_literal = 'literal'
    _s_more = 'more'
    _s_escaping = 'string_escaping'
    _k_escaping = 'key_escaping'

    # Event names consumed by the state machine.
    _e_start = 'start'
    _e_end = 'end'
    _e_reset = 'reset'
    _e_lbrace = 'lbrace'
    _e_rbrace = 'rbrace'
    _e_lsquare = 'lsquare'
    _e_rsquare = 'rsquare'
    _e_char = 'char'
    _e_comma = 'comma'
    _e_colon = 'colon'
    _e_dblquote = 'dblquote'
    _e_whitespace = 'whitespace'
    _e_newline = 'newline'
    _e_backslash = 'backslash'

    def __init__(self):
        """Create the tokenizer, the text accumulator and the state machine."""
        super(_Lexer, self).__init__()
        self._started = False
        self._tokenizer = _Tokenizer()
        self._tokenizer.add_catch_all_listener(self._catch_all)
        self._text_accumulator = _TextAccumulator()
        self._text_accumulator.bind(self._tokenizer)
        self._setup_state_machine()

    def _setup_state_machine(self):
        """Build the lexer's transition table and attach its listeners.

        Each state declares the events it transitions on (``on``), stays in
        (``loops``), skips (``ignores``) or rejects (``faulty``).  An event is
        declared at most once per state so that an explicit transition is
        never also listed as faulty.
        """
        new = State(_Lexer._s_new)
        doc_start = State(_Lexer._s_doc_start)
        doc_end = State(_Lexer._s_doc_end)
        object_start = State(_Lexer._s_o_start)
        object_end = State(_Lexer._s_o_end)
        array_start = State(_Lexer._s_a_start)
        array_end = State(_Lexer._s_a_end)
        key_escaping = State(_Lexer._k_escaping)
        string_start = State(_Lexer._s_s_start)
        string_end = State(_Lexer._s_s_end)
        literal = State(_Lexer._s_literal)
        more = State(_Lexer._s_more)
        string_escaping = State(_Lexer._s_escaping)

        e_start = Event(_Lexer._e_start)
        e_end = Event(_Lexer._e_end)
        e_reset = Event(_Lexer._e_reset)
        e_lbrace = Event(_Lexer._e_lbrace)
        e_rbrace = Event(_Lexer._e_rbrace)
        e_lsquare = Event(_Lexer._e_lsquare)
        e_rsquare = Event(_Lexer._e_rsquare)
        e_char = Event(_Lexer._e_char)
        e_comma = Event(_Lexer._e_comma)
        e_colon = Event(_Lexer._e_colon)
        e_dblquote = Event(_Lexer._e_dblquote)
        e_whitespace = Event(_Lexer._e_whitespace)
        e_newline = Event(_Lexer._e_newline)
        e_backslash = Event(_Lexer._e_backslash)

        # Document level states
        new.on(e_start, doc_start)
        new.on(e_end, doc_end)
        new.ignores(e_newline, e_whitespace)
        new.faulty(e_lbrace, e_lsquare, e_rbrace, e_rsquare, e_char,
                   e_comma, e_colon, e_dblquote, e_backslash)

        doc_start.on(e_lbrace, object_start)
        doc_start.on(e_lsquare, array_start)
        doc_start.ignores(e_newline, e_whitespace)
        doc_start.faulty(e_start, e_end, e_rbrace, e_rsquare, e_char,
                         e_comma, e_colon, e_dblquote, e_backslash)

        doc_end.on(e_reset, new)
        doc_end.faulty(e_start, e_end, e_lbrace, e_rbrace, e_lsquare,
                       e_rsquare, e_char, e_comma, e_colon, e_dblquote,
                       e_backslash)

        # JSON Object related states
        object_start.loops(e_lbrace)
        object_start.on(e_rbrace, object_end)
        object_start.on(e_dblquote, string_start)
        object_start.ignores(e_newline, e_whitespace)
        object_start.faulty(e_start, e_end, e_lsquare, e_rsquare, e_comma,
                            e_colon, e_backslash)

        object_end.on(e_end, doc_end)
        object_end.loops(e_lbrace, e_rbrace)
        object_end.on(e_rsquare, array_end)
        object_end.on(e_comma, more)
        object_end.ignores(e_whitespace, e_newline)
        object_end.faulty(e_start, e_reset, e_lsquare, e_char, e_colon,
                          e_dblquote, e_backslash)

        # JSON Array related states
        array_start.on(e_lbrace, object_start)
        array_start.on(e_rsquare, array_end)
        array_start.loops(e_lsquare)
        array_start.on(e_char, literal)
        array_start.on(e_dblquote, string_start)
        array_start.ignores(e_newline, e_whitespace)
        # e_rsquare is handled above (empty array), so it is not faulty here.
        array_start.faulty(e_start, e_end, e_reset, e_rbrace, e_comma,
                           e_colon, e_backslash)

        array_end.on(e_end, doc_end)
        array_end.on(e_comma, more)
        array_end.on(e_rbrace, object_end)
        array_end.loops(e_rsquare)
        array_end.ignores(e_whitespace, e_newline)
        # e_rsquare loops above (closing nested arrays), so it is not faulty.
        array_end.faulty(e_start, e_reset, e_lbrace, e_lsquare, e_char,
                         e_colon, e_dblquote, e_backslash)

        # String related states
        string_start.loops(e_char, e_comma, e_colon, e_whitespace, e_newline,
                           e_lbrace, e_rbrace, e_lsquare, e_rsquare)
        string_start.on(e_backslash, string_escaping)
        string_start.on(e_dblquote, string_end)
        string_start.faulty(e_start, e_end, e_reset)

        string_escaping.on(e_char, string_start)
        string_escaping.on(e_dblquote, string_start)
        string_escaping.on(e_backslash, string_start)
        string_escaping.faulty(e_start, e_end, e_reset, e_lbrace, e_rbrace,
                               e_lsquare, e_rsquare, e_comma, e_colon,
                               e_whitespace, e_newline)

        string_end.on(e_rbrace, object_end)
        string_end.on(e_rsquare, array_end)
        string_end.on(e_comma, more)
        string_end.on(e_colon, more)
        string_end.ignores(e_whitespace, e_newline)
        # e_colon is handled above (key/value separator), so it is not faulty.
        string_end.faulty(e_start, e_end, e_reset, e_lbrace, e_lsquare,
                          e_char, e_dblquote, e_backslash)

        # Bare literals (numbers, true, false, null)
        literal.loops(e_char)
        literal.on(e_rbrace, object_end)
        literal.on(e_rsquare, array_end)
        literal.on(e_comma, more)
        literal.ignores(e_newline, e_whitespace)
        literal.faulty(e_start, e_end, e_reset, e_lbrace, e_lsquare, e_colon,
                       e_dblquote, e_backslash)

        # After a comma or a colon: a new value is expected
        more.on(e_lbrace, object_start)
        more.on(e_lsquare, array_start)
        more.on(e_char, literal)
        more.on(e_dblquote, string_start)
        more.ignores(e_whitespace, e_newline)
        more.faulty(e_start, e_end, e_reset, e_rbrace, e_rsquare, e_comma,
                    e_colon, e_backslash)

        self._state_machine = StateMachine(new)
        # string_escaping is reachable from string_start, so it is registered
        # with the other states.  key_escaping has no transitions but stays
        # registered to keep the previously registered set intact.
        self._state_machine.add_states(
            new, doc_start, doc_end, object_start, object_end, array_start,
            array_end, key_escaping, literal, string_start, string_end, more,
            string_escaping)

        self._state_machine.add_listener('before_state_change',
                                         self._on_before_state_change)
        self._state_machine.add_listener('after_state_change',
                                         self._on_after_state_change)
        self._state_machine.add_listener('error', self._on_error)

    def _on_error(self, current_state, event):
        """Raise ``RuntimeError`` naming the rejected event and the state."""
        raise RuntimeError(
            "{} event cannot be processed in current state: {}".format(
                event.name, current_state.name))

    def _on_before_state_change(self, current_state, pending_event):
        """Listener hook invoked before a state change; intentionally a no-op."""
        pass

    def _on_after_state_change(self, previous_state, event, new_state):
        """Fire lexer events that correspond to the completed state change.

        Args:
            previous_state: state that was just left.
            event: event that caused the change (unused here).
            new_state: state that was just entered.

        Raises:
            RuntimeError: if a bare literal is not a number, ``true``,
                ``false`` or ``null``.
        """
        # TODO reorder new_state by probability for perf
        if previous_state.equals(_Lexer._s_s_end):
            text = self._text_accumulator.pop().strip()
            text = text[1:-1]  # remove surrounding double quotes
            self.fire(_Lexer._literal, JSONLiteralType.STRING, text)

        # A bare literal is complete once the machine leaves the literal state.
        if (previous_state.equals(_Lexer._s_literal)
                and not new_state.equals(_Lexer._s_literal)):
            self._fire_bare_literal(self._text_accumulator.pop().strip())

        if new_state.equals(_Lexer._s_doc_start, _Lexer._s_doc_end,
                            _Lexer._s_o_start, _Lexer._s_o_end,
                            _Lexer._s_a_start, _Lexer._s_a_end):
            self.fire(new_state.name)

        if new_state.equals(_Lexer._s_doc_end):
            # Return to the initial state so another document can follow.
            self._state_machine.consume(Event(_Lexer._e_reset))
            self._started = False

    def _fire_bare_literal(self, literal):
        """Classify an unquoted literal and fire the matching LITERAL event.

        Raises:
            RuntimeError: if ``literal`` is not a number, ``true``, ``false``
                or ``null``.
        """
        if _Lexer._number_pattern.fullmatch(literal):
            # Prefer int; fall back to float for fractions and exponents.
            try:
                number = int(literal)
            except ValueError:
                number = float(literal)
            self.fire(_Lexer._literal, JSONLiteralType.NUMBER, number)
        elif literal == _Lexer._true:
            self.fire(_Lexer._literal, JSONLiteralType.BOOLEAN, True)
        elif literal == _Lexer._false:
            self.fire(_Lexer._literal, JSONLiteralType.BOOLEAN, False)
        elif literal == _Lexer._null:
            self.fire(_Lexer._literal, JSONLiteralType.NULL, None)
        else:
            raise RuntimeError("Invalid Literal {}".format(literal))

    def _catch_all(self, event_name, payload):
        """Forward a tokenizer event to the state machine."""
        e = Event(event_name, payload)
        self._state_machine.consume(e)

    def consume(self, data):
        """Feed ``data`` to the tokenizer, starting a document if needed."""
        if not self._started:
            self._started = True
            self._state_machine.consume(Event(_Lexer._e_start))
        self._tokenizer.consume(data)

    def close(self):
        """Send the end event if a document is open, then drop collaborators.

        After this call the tokenizer, text accumulator and state machine
        references are ``None``.
        """
        if self._started:
            self._state_machine.consume(Event(_Lexer._e_end))
        self._started = False
        self._tokenizer = None
        self._text_accumulator = None
        self._state_machine = None
def _setup_state_machine(self):
    """Build the lexer's transition table and attach its listeners.

    Each state declares the events it transitions on (``on``), stays in
    (``loops``), skips (``ignores``) or rejects (``faulty``).  An event is
    declared at most once per state so that an explicit transition is never
    also listed as faulty.
    """
    new = State(_Lexer._s_new)
    doc_start = State(_Lexer._s_doc_start)
    doc_end = State(_Lexer._s_doc_end)
    object_start = State(_Lexer._s_o_start)
    object_end = State(_Lexer._s_o_end)
    array_start = State(_Lexer._s_a_start)
    array_end = State(_Lexer._s_a_end)
    key_escaping = State(_Lexer._k_escaping)
    string_start = State(_Lexer._s_s_start)
    string_end = State(_Lexer._s_s_end)
    literal = State(_Lexer._s_literal)
    more = State(_Lexer._s_more)
    string_escaping = State(_Lexer._s_escaping)

    e_start = Event(_Lexer._e_start)
    e_end = Event(_Lexer._e_end)
    e_reset = Event(_Lexer._e_reset)
    e_lbrace = Event(_Lexer._e_lbrace)
    e_rbrace = Event(_Lexer._e_rbrace)
    e_lsquare = Event(_Lexer._e_lsquare)
    e_rsquare = Event(_Lexer._e_rsquare)
    e_char = Event(_Lexer._e_char)
    e_comma = Event(_Lexer._e_comma)
    e_colon = Event(_Lexer._e_colon)
    e_dblquote = Event(_Lexer._e_dblquote)
    e_whitespace = Event(_Lexer._e_whitespace)
    e_newline = Event(_Lexer._e_newline)
    e_backslash = Event(_Lexer._e_backslash)

    # Document level states
    new.on(e_start, doc_start)
    new.on(e_end, doc_end)
    new.ignores(e_newline, e_whitespace)
    new.faulty(e_lbrace, e_lsquare, e_rbrace, e_rsquare, e_char,
               e_comma, e_colon, e_dblquote, e_backslash)

    doc_start.on(e_lbrace, object_start)
    doc_start.on(e_lsquare, array_start)
    doc_start.ignores(e_newline, e_whitespace)
    doc_start.faulty(e_start, e_end, e_rbrace, e_rsquare, e_char,
                     e_comma, e_colon, e_dblquote, e_backslash)

    doc_end.on(e_reset, new)
    doc_end.faulty(e_start, e_end, e_lbrace, e_rbrace, e_lsquare,
                   e_rsquare, e_char, e_comma, e_colon, e_dblquote,
                   e_backslash)

    # JSON Object related states
    object_start.loops(e_lbrace)
    object_start.on(e_rbrace, object_end)
    object_start.on(e_dblquote, string_start)
    object_start.ignores(e_newline, e_whitespace)
    object_start.faulty(e_start, e_end, e_lsquare, e_rsquare, e_comma,
                        e_colon, e_backslash)

    object_end.on(e_end, doc_end)
    object_end.loops(e_lbrace, e_rbrace)
    object_end.on(e_rsquare, array_end)
    object_end.on(e_comma, more)
    object_end.ignores(e_whitespace, e_newline)
    object_end.faulty(e_start, e_reset, e_lsquare, e_char, e_colon,
                      e_dblquote, e_backslash)

    # JSON Array related states
    array_start.on(e_lbrace, object_start)
    array_start.on(e_rsquare, array_end)
    array_start.loops(e_lsquare)
    array_start.on(e_char, literal)
    array_start.on(e_dblquote, string_start)
    array_start.ignores(e_newline, e_whitespace)
    # e_rsquare is handled above (empty array), so it is not faulty here.
    array_start.faulty(e_start, e_end, e_reset, e_rbrace, e_comma,
                       e_colon, e_backslash)

    array_end.on(e_end, doc_end)
    array_end.on(e_comma, more)
    array_end.on(e_rbrace, object_end)
    array_end.loops(e_rsquare)
    array_end.ignores(e_whitespace, e_newline)
    # e_rsquare loops above (closing nested arrays), so it is not faulty.
    array_end.faulty(e_start, e_reset, e_lbrace, e_lsquare, e_char,
                     e_colon, e_dblquote, e_backslash)

    # String related states
    string_start.loops(e_char, e_comma, e_colon, e_whitespace, e_newline,
                       e_lbrace, e_rbrace, e_lsquare, e_rsquare)
    string_start.on(e_backslash, string_escaping)
    string_start.on(e_dblquote, string_end)
    string_start.faulty(e_start, e_end, e_reset)

    string_escaping.on(e_char, string_start)
    string_escaping.on(e_dblquote, string_start)
    string_escaping.on(e_backslash, string_start)
    string_escaping.faulty(e_start, e_end, e_reset, e_lbrace, e_rbrace,
                           e_lsquare, e_rsquare, e_comma, e_colon,
                           e_whitespace, e_newline)

    string_end.on(e_rbrace, object_end)
    string_end.on(e_rsquare, array_end)
    string_end.on(e_comma, more)
    string_end.on(e_colon, more)
    string_end.ignores(e_whitespace, e_newline)
    # e_colon is handled above (key/value separator), so it is not faulty.
    string_end.faulty(e_start, e_end, e_reset, e_lbrace, e_lsquare,
                      e_char, e_dblquote, e_backslash)

    # Bare literals (numbers, true, false, null)
    literal.loops(e_char)
    literal.on(e_rbrace, object_end)
    literal.on(e_rsquare, array_end)
    literal.on(e_comma, more)
    literal.ignores(e_newline, e_whitespace)
    literal.faulty(e_start, e_end, e_reset, e_lbrace, e_lsquare, e_colon,
                   e_dblquote, e_backslash)

    # After a comma or a colon: a new value is expected
    more.on(e_lbrace, object_start)
    more.on(e_lsquare, array_start)
    more.on(e_char, literal)
    more.on(e_dblquote, string_start)
    more.ignores(e_whitespace, e_newline)
    more.faulty(e_start, e_end, e_reset, e_rbrace, e_rsquare, e_comma,
                e_colon, e_backslash)

    self._state_machine = StateMachine(new)
    # string_escaping is reachable from string_start, so it is registered
    # with the other states.  key_escaping has no transitions but stays
    # registered to keep the previously registered set intact.
    self._state_machine.add_states(
        new, doc_start, doc_end, object_start, object_end, array_start,
        array_end, key_escaping, literal, string_start, string_end, more,
        string_escaping)

    self._state_machine.add_listener('before_state_change',
                                     self._on_before_state_change)
    self._state_machine.add_listener('after_state_change',
                                     self._on_after_state_change)
    self._state_machine.add_listener('error', self._on_error)