def parse(self, tokenizer, state=None):
    from rply.errors import ParsingError
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", "$end")]
    current_state = 0
    while True:
        # States with a single possible reduction are reduced eagerly,
        # without consuming a lookahead token.
        if self.lr_table.default_reductions[current_state]:
            t = self.lr_table.default_reductions[current_state]
            current_state = self._reduce_production(
                t, symstack, statestack, state
            )
            continue

        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                if tokenizer.idx < len(tokenizer.s):
                    lookahead = next(tokenizer)
                else:
                    lookahead = None
            if lookahead is None:
                lookahead = Token("$end", "$end")

        ltype = lookahead.gettokentype()
        if ltype in self.lr_table.lr_action[current_state]:
            t = self.lr_table.lr_action[current_state][ltype]
            if t > 0:
                # Positive entries are shifts.
                statestack.append(t)
                current_state = t
                symstack.append(lookahead)
                lookahead = None
                continue
            elif t < 0:
                # Negative entries are reductions.
                current_state = self._reduce_production(
                    t, symstack, statestack, state
                )
                continue
            else:
                # t == 0: accept; the result is on top of the symbol stack.
                n = symstack[-1]
                return n
        else:
            # TODO: actual error handling here
            if self.error_handler is not None:
                if state is None:
                    self.error_handler(lookahead)
                else:
                    self.error_handler(state, lookahead)
                raise AssertionError("For now, error_handler must raise.")
            else:
                raise ParsingError(None, lookahead.getsourcepos())
def parse(self, tokenizer, state=None):
    from rply.errors import ParsingError
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", "$end")]
    current_state = 0
    while True:
        if self.lr_table.default_reductions[current_state]:
            t = self.lr_table.default_reductions[current_state]
            current_state = self._reduce_production(
                t, symstack, statestack, state)
            continue

        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                try:
                    lookahead = next(tokenizer)
                except StopIteration:
                    lookahead = None
            if lookahead is None:
                lookahead = Token("$end", "$end")

        ltype = lookahead.gettokentype()
        if ltype in self.lr_table.lr_action[current_state]:
            t = self.lr_table.lr_action[current_state][ltype]
            if t > 0:
                # Shift: push the token and move to state t.
                statestack.append(t)
                current_state = t
                symstack.append(lookahead)
                lookahead = None
                continue
            elif t < 0:
                # Reduce by the production numbered -t.
                current_state = self._reduce_production(
                    t, symstack, statestack, state)
                continue
            else:
                # Accept: the parse result is on top of the symbol stack.
                n = symstack[-1]
                return n
        elif self.error_handler is not None:
            # Unlike the variant above, this one lets the error handler
            # recover: the offending token is dropped and parsing resumes.
            if state is None:
                self.error_handler(lookahead)
            else:
                self.error_handler(state, lookahead)
            lookahead = None
            continue
        else:
            raise ParsingError(None, lookahead.getsourcepos())
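# Hedged usage sketch (not part of the parser source above): how a
# table-driven parse() like the two variants above is normally reached
# through rply's public API. The grammar and token names here are
# illustrative, not taken from the original project.
from rply import LexerGenerator, ParserGenerator

lg = LexerGenerator()
lg.add("NUMBER", r"\d+")
lg.add("PLUS", r"\+")
lg.ignore(r"\s+")

pg = ParserGenerator(["NUMBER", "PLUS"],
                     precedence=[("left", ["PLUS"])])

@pg.production("expr : expr PLUS expr")
def expr_plus(p):
    return p[0] + p[2]

@pg.production("expr : NUMBER")
def expr_number(p):
    return int(p[0].getstr())

parser = pg.build()
assert parser.parse(lg.build().lex("1 + 2 + 3")) == 6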
def __init__(self, stream):
    self.stream = []
    self.idx = 0
    TAB_WIDTH = 4
    indent = 0
    current_indent = 0
    indent_token = None
    indent_start_pos = 0
    while True:
        try:
            token = next(stream)
        except StopIteration:
            if current_indent > 0:
                # print(current_indent)
                # Close any indentation still open at end of input.
                dedents = [Token('DEDENT', '')] * (current_indent // TAB_WIDTH)
                self.stream.extend(dedents)
            break
        token_type = token.gettokentype()
        if token_type == 'WHITESPACE':
            indent_token = token
            # WHITESPACE is tab only now.
            indent = len(token.getstr()) * TAB_WIDTH
        elif token_type == 'NEWLINE':
            # print("%d <=> %d" % (current_indent, indent))
            if current_indent < indent:
                indent_token.name = 'INDENT'
                current_indent = indent
            elif current_indent > indent:
                dedent_num = (current_indent - indent) // TAB_WIDTH
                for i in range(0, dedent_num):
                    if not indent_token:
                        indent_token = Token('', '', token.getsourcepos())
                        self.stream.insert(indent_start_pos, indent_token)
                    indent_token.name = 'DEDENT'
                    indent_token = None
                current_indent = indent
            else:
                if indent_token:
                    self.stream.remove(indent_token)
            indent = 0
            indent_token = None
            indent_start_pos = len(self.stream) + 1
        self.stream.append(token)
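# Standalone sketch of the INDENT/DEDENT technique used by __init__ above,
# assuming the same tab-only indentation with TAB_WIDTH = 4. The helper
# name and list-of-widths interface are hypothetical; given the indent
# width of each line, it yields the token names a parser would see.
def indent_events(widths, tab_width=4):
    events = []
    current = 0
    for width in widths:
        if width > current:
            # One INDENT per increase, regardless of size, as above.
            events.append("INDENT")
        else:
            events.extend(["DEDENT"] * ((current - width) // tab_width))
        current = width
    # Close whatever is still open at end of input.
    events.extend(["DEDENT"] * (current // tab_width))
    return events

assert indent_events([0, 4, 8, 0]) == ["INDENT", "INDENT", "DEDENT", "DEDENT"]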
def token(self):
    """
    Return the next token (a Token object) found in the input buffer.
    None is returned if the end of the buffer was reached.
    In case of a lexing error (the current chunk of the buffer matches no
    rule), a LexerError is raised with the position of the error.
    """
    if self.pos >= len(self.buf):
        if len(self.context_stack) != 1:
            raise LexerError("contexts are not closed", -1)
        return None
    else:
        if self.pos >= self.heredoc_finish and self.heredoc_finish != -1:
            start = self.pos
            end = self.pos + self.heredoc_lgt
            assert start >= 0
            assert end >= 0
            tok = Token('T_END_HEREDOC', self.buf[start:end], self.lineno)
            self.pos = self.heredoc_finish + self.heredoc_lgt
            self.heredoc_finish = -1
            self.heredoc_lgt = 0
            self.context_stack.pop()
            return tok
        tmp_buf = self._gettmpbuf(self.pos)
        ctx = self.context_stack[-1]
        rules = self.rules[ctx]
        for token_regex, token_type in rules:
            pos = self.pos
            assert pos >= 0
            m = self.match(token_regex, tmp_buf, pos)
            if m:
                start, end = self._getstartend(m)
                value = self.buf[start:end]
                if token_type == 'H_NEW_LINE':
                    self.lineno += 1
                elif token_type == 'T_COMMENT':
                    self.lineno += value.count('\n')
                elif token_type == 'T_CONSTANT_ENCAPSED_STRING':
                    self.lineno += value.count("\n")
                # tokens changing the context
                tok = Token(token_type, value, self.lineno)
                tok = self.maybe_change_context(ctx, tok, token_type, end)
                self.last_token = token_type
                return tok
        # if we're here, no rule matched
        raise LexerError("unknown token", self.lineno)
def next(self):
    while True:
        if self.idx >= len(self.s):
            raise StopIteration
        for rule in self._get_current_state().ignore_rules:
            match = rule.matches(self.s, self.idx)
            if match:
                self._update_pos(match)
                self._make_transition(rule)
                break
        else:
            break

    for rule in self._get_current_state().rules:
        match = rule.matches(self.s, self.idx)
        if match:
            lineno = self._lineno
            colno = self._update_pos(match)
            source_pos = SourcePosition(match.start, lineno, colno)
            token = Token(rule.name,
                          self.s[match.start:match.end], source_pos)
            self._make_transition(rule)
            return token
    else:
        raise LexingError(None, SourcePosition(self.idx, -1, -1))
def next(self):
    while True:
        if self.idx >= len(self.s):
            raise StopIteration
        for rule in self.lexer.ignore_rules:
            match = rule.matches(self.s, self.idx)
            if match:
                self._update_pos(match)
                break
        else:
            break

    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            lineno = self._lineno
            self._colno = self._update_pos(match)
            source_pos = SourcePosition(match.start, lineno, self._colno)
            if rule.name == "MISMATCH":
                # A catch-all rule: anything it matches is a lexing error.
                raise LexingError(
                    "%r unexpected" % self.s[match.start:match.end],
                    SourcePosition(self.idx, self._lineno, self._colno))
            token = Token(rule.name,
                          self.s[match.start:match.end], source_pos)
            return token
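# Sketch of the MISMATCH idea used above, under assumed setup: a catch-all
# rule registered last, so any character no earlier rule matched surfaces
# as an error instead of silently ending the stream. The rule table and
# classify() helper are hypothetical.
import re

rules = [("NUMBER", re.compile(r"\d+")),
         ("MISMATCH", re.compile(r"."))]

def classify(s, idx):
    for name, pattern in rules:
        m = pattern.match(s, idx)
        if m:
            if name == "MISMATCH":
                raise ValueError("%r unexpected at %d" % (m.group(), idx))
            return name, m.group()

assert classify("42", 0) == ("NUMBER", "42")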
def test_parse_error(self):
    errors = []
    cycy = interpreter.CyCy(handle_error=errors.append)
    cycy.interpret(["asdf"])
    self.assertEqual(
        errors, [
            ParseError(token=Token("IDENTIFIER", "asdf"), source="asdf"),
        ],
    )
def next(self):
    if self.idx >= len(self.s):
        raise StopIteration
    for rule in self.lexer.ignore_rules:
        match = rule.matches(self.s, self.idx)
        if match:
            self.idx = match.end
            return self.next()
    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            source_pos = self.__get_position__(match.start)
            token = Token(rule.name,
                          self.s[match.start:match.end], source_pos)
            self.idx = match.end
            return token
    else:
        raise LexingError(None, SourcePosition(self.idx, -1, -1))
def next(self):
    if self.idx >= len(self.s):
        return None
    for rule in self.lexer.ignore_rules:
        match = rule.matches(self.s, self.idx)
        if match:
            self.idx = match.end
            return self.next()
    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            # TODO: lineno and colno
            source_pos = SourcePosition(match.start, -1, -1)
            token = Token(rule.name,
                          self.s[match.start:match.end], source_pos)
            self.idx = match.end
            return token
    else:
        raise LexingError(None, SourcePosition(self.idx, -1, -1))
def next(self):
    if self.idx >= len(self.s):
        raise StopIteration
    for rule in self.lexer.ignore_rules:
        match = rule.matches(self.s, self.idx)
        if match:
            self._update_pos(match)
            return self.next()
    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            lineno = self._lineno
            colno = self._update_pos(match)
            source_pos = SourcePosition(match.start, lineno, colno)
            token = Token(rule.name,
                          self.s[match.start:match.end], source_pos)
            return token
    else:
        raise LexingError(None, SourcePosition(self.idx, -1, -1))
def _scan_double_quote(self, tok):
    p = 1
    v = tok.value
    if v[0] == "b":
        p += 1
    backslash = False
    while p < len(v):
        c = v[p]
        if not backslash:
            if c == '"':
                # not encountered anything funny, this is just T_STRING
                return tok
            if ((c == '$' and p < len(v) - 1 and v[p + 1].isalpha()) or
                    (c == "{" and p < len(v) - 1 and v[p + 1] == "$") or
                    (c == "$" and p < len(v) - 1 and v[p + 1] == "{")):
                p += 1
                self.context_stack.append(CONTEXT_DOUBLEQUOTE)
                return Token('"', '"', self.lineno)
            elif c == '\\':
                backslash = True
        else:
            backslash = False
        p += 1
    assert False
def next(self):
    while True:
        if self.idx >= len(self.s):
            raise StopIteration
        for rule in self.lexer.ignore_rules:
            match = rule.matches(self.s, self.idx)
            if match:
                self._update_pos(match)
                break
        else:
            break

    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            lineno = self._lineno
            colno = self._update_pos(match)
            source_pos = SourcePosition(match.start, lineno, colno)
            source_str = self.s[match.start:match.end]
            name = self.lexer.reserved_dict.get(source_str, rule.name)
            token = Token(name, source_str, source_pos)
            return token
    else:
        raise LexingError(None, SourcePosition(self.idx, -1, -1))
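# Minimal sketch of the reserved-word remapping in next() above: a single
# catch-all identifier rule plus a dict lookup that promotes keywords to
# their own token names. The dict contents and helper name are illustrative.
reserved_dict = {"if": "IF", "while": "WHILE", "return": "RETURN"}

def token_name(default_name, lexeme):
    return reserved_dict.get(lexeme, default_name)

assert token_name("IDENTIFIER", "while") == "WHILE"
assert token_name("IDENTIFIER", "whiley") == "IDENTIFIER"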
def maybe_change_context(self, ctx, tok, token_type, endpos):
    # print self.context_stack, tok.name, tok.value
    if ctx == CONTEXT_OBJECT_ACCESS:
        self.context_stack.pop()
    elif (ctx == CONTEXT_NORMAL and
          token_type == "T_CONSTANT_ENCAPSED_STRING" and
          (tok.value[0] == '"' or tok.value[:2] == 'b"')):
        newtok = self._scan_double_quote(tok)
        if newtok.name == '"':
            # we have to rewind a little
            ofs = 1
            if tok.value[0] == 'b':
                ofs += 1
            self.pos = endpos - len(tok.value) + ofs
        else:
            self.pos = endpos
        return newtok
    elif ctx == CONTEXT_BACKTICK and tok.value[0] == '`':
        self.context_stack.pop()
    elif ctx == CONTEXT_NORMAL and token_type == '`':
        self.context_stack.append(CONTEXT_BACKTICK)
    elif ctx == CONTEXT_BACKTICK and token_type == '"':
        self.context_stack.append(CONTEXT_DOUBLEQUOTE)
    elif ctx == CONTEXT_BACKTICK and token_type == '`':
        self.context_stack.pop()
    elif ctx == CONTEXT_NORMAL and token_type == "T_START_HEREDOC":
        lgt = 3
        if tok.value.startswith("b"):
            lgt += 1
        start = lgt
        end = len(tok.value) - 1
        while tok.value[start] in (' ', '\t'):
            start += 1
        while tok.value[end] in (' ', '\t'):
            end -= 1
        assert end >= 0
        marker = tok.value[start:end]
        if marker.startswith('"'):
            if not marker.endswith('"'):
                raise LexerError("wrong marker", self.lineno)
            end = len(marker) - 1
            assert end >= 0
            marker = marker[1:end]
        heredoc_marker = "\n" + marker + ";"
        start = self.pos + len(tok.value) - 1
        assert start >= 0
        self.heredoc_finish = self.buf.find(heredoc_marker, start)
        self.heredoc_lgt = len(heredoc_marker) - 1
        if self.heredoc_finish == -1:
            # XXX case where the heredoc does not end with [;]:
            # the heredoc is then an argument and ends like ... HEND );
            heredoc_marker = "\n" + marker
            self.heredoc_finish = self.buf.find(heredoc_marker, start)
            if self.heredoc_finish == -1:
                raise LexerError("unfinished heredoc", self.lineno)
            self.heredoc_lgt = len(heredoc_marker)
        self.context_stack.append(CONTEXT_HEREDOC)
    elif ctx == CONTEXT_DOUBLEQUOTE and token_type == '"':
        self.context_stack.pop()
    elif ctx == CONTEXT_BACKTICK and token_type == '"':
        self.context_stack.pop()
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC or
           ctx == CONTEXT_BACKTICK) and
          token_type == "T_DOLLAR_OPEN_CURLY_BRACES"):
        self.pos = endpos - 1
        self.context_stack.append(CONTEXT_CURLY_BRACES)
        return tok
    elif (ctx == CONTEXT_CURLY_BRACES and token_type == "{" and
          self.last_token == "T_DOLLAR_OPEN_CURLY_BRACES"):
        # instead, we recognize it as a variable
        tmp_buf = self._gettmpbuf(self.pos)
        m = self.match(self.var_re, tmp_buf, self.pos)
        assert m is not None
        start, end = self._getstartend(m)
        tok = Token("T_VARIABLE", self.buf[start:end], tok.lineno)
        self.pos = end
        return tok
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC) and
          token_type == "T_VARIABLE"):
        # only if the next one is [
        if self.buf[endpos] == "[":
            self.context_stack.append(CONTEXT_BRACKETS)
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC) and
          token_type == "T_OBJECT_OPERATOR"):
        if (self.last_token != "T_VARIABLE" or
                not self.buf[self.pos + 2].isalpha()):
            tok = Token("T_ENCAPSED_AND_WHITESPACE", tok.value, tok.lineno)
        else:
            self.context_stack.append(CONTEXT_OBJECT_ACCESS)
    elif token_type == "T_OBJECT_OPERATOR":
        self.context_stack.append(CONTEXT_OBJECT_ACCESS)
    elif ctx == CONTEXT_BRACKETS and token_type == "]":
        self.context_stack.pop()
    elif ctx == CONTEXT_CURLY_BRACES and token_type == "}":
        # XXX this is incorrect but we don't care at the moment;
        # if someone inserts } inside ] we have to do something else,
        # like scan the grammar until we hit it
        self.context_stack.pop()
    self.pos = endpos
    return tok
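# Illustration of the heredoc end search performed in maybe_change_context
# above, on a hypothetical input buffer: the lexer looks for "\n<marker>;"
# first and falls back to "\n<marker>" when the heredoc is used as an
# argument.
buf = "$x = <<<EOT\nhello world\nEOT;\n"
marker = "EOT"
start = buf.index("<<<" + marker) + len("<<<" + marker)
finish = buf.find("\n" + marker + ";", start)
if finish == -1:
    finish = buf.find("\n" + marker, start)
assert buf[start + 1:finish] == "hello world"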
def test_repr(self):
    t = Token("VALUE", "3")
    assert repr(t) == "Token('VALUE', '3')"
def test_source_pos(self):
    t = Token("VALUE", "3", SourcePosition(5, 2, 1))
    assert t.getsourcepos().lineno == 2
def parse(self, tokenizer, state=None):
    from rply.errors import ParsingError
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", "$end")]
    current_state = 0
    while True:
        if self.lr_table.default_reductions[current_state]:
            t = self.lr_table.default_reductions[current_state]
            current_state = self._reduce_production(
                t, symstack, statestack, state)
            continue

        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                try:
                    # Get the next token.
                    lookahead = next(tokenizer)
                except StopIteration:
                    lookahead = None

            if lookahead is None:
                # Check if the only possible action from here is to end.
                could_only_end = len(
                    self.lr_table.lr_action[current_state]) == 1
                lookahead = Token("$end", "$end")

        ltype = lookahead.gettokentype()
        # Check if the next token is a valid next step, given our current
        # state.
        if ltype in self.lr_table.lr_action[current_state]:
            # Get the next action.
            t = self.lr_table.lr_action[current_state][ltype]
            # Shift.
            if t > 0:
                statestack.append(t)
                current_state = t
                symstack.append(lookahead)
                lookahead = None
                continue
            # Reduce.
            elif t < 0:
                current_state = self._reduce_production(
                    t, symstack, statestack, state)
                continue
            # t == 0 means (maybe among other things) we got the 'end'
            # token. We are done, so we should return the token we made.
            else:
                # This is the output token.
                n = symstack[-1]
                # Annotate the output token with whether or not the only
                # next step, when we got to the end, was in fact to end.
                n._could_only_end = could_only_end
                return n
        else:
            self.sym_stack = symstack
            self.state_stack = statestack
            self.look_ahead = lookahead
            self.look_ahead_stack = lookaheadstack
            # TODO: actual error handling here
            if self.error_handler is not None:
                if state is None:
                    self.error_handler(lookahead)
                else:
                    self.error_handler(state, lookahead)
                raise AssertionError("For now, error_handler must raise.")
            else:
                raise ParsingError(None, lookahead.getsourcepos())
def test_eq(self):
    t = Token("VALUE", "3", SourcePosition(-1, -1, -1))
    assert not (t == 3)
    assert t != 3
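# Hypothetical companion check (not from the original suite): rply's Token
# compares by name and value, and returns NotImplemented against non-Token
# objects, which is why `t == 3` above is simply False.
def test_eq_same_name_and_value(self):
    assert Token("VALUE", "3") == Token("VALUE", "3")
    assert Token("VALUE", "3") != Token("VALUE", "4")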