def parse(self, tokenizer, state=None):
    """Run the LR parse loop over *tokenizer* and return the final symbol.

    ``state`` is an optional user object threaded through reductions and
    the error handler.  On a syntax error the configured error handler is
    invoked and is required to raise; with no handler a ParsingError is
    raised carrying the offending token's source position.
    """
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", "$end")]
    current_state = 0
    while True:
        # States whose only possible action is a reduction are reduced
        # immediately, without consuming a lookahead token.
        default = self.lr_table.default_reductions[current_state]
        if default:
            current_state = self._reduce_production(
                default, symstack, statestack, state
            )
            continue

        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                try:
                    lookahead = next(tokenizer)
                except StopIteration:
                    lookahead = None
            if lookahead is None:
                # End of input: synthesize the end-marker token.
                lookahead = Token("$end", "$end")

        token_type = lookahead.gettokentype()
        actions = self.lr_table.lr_action[current_state]
        if token_type not in actions:
            # No entry in the action table: syntax error.
            # TODO: actual error handling here
            if self.error_handler is None:
                raise ParsingError(None, lookahead.getsourcepos())
            if state is None:
                self.error_handler(lookahead)
            else:
                self.error_handler(state, lookahead)
            raise AssertionError("For now, error_handler must raise.")

        action = actions[token_type]
        if action > 0:
            # Shift: push the token and move to the new state.
            statestack.append(action)
            current_state = action
            symstack.append(lookahead)
            lookahead = None
        elif action < 0:
            # Reduce by production number -action.
            current_state = self._reduce_production(
                action, symstack, statestack, state
            )
        else:
            # Accept: the finished value sits on top of the symbol stack.
            return symstack[-1]
def parse(self, tokenizer, state=None):
    """LR parse loop with error recovery.

    Like the non-recovering variant, but when no action exists for the
    current token and an error handler is configured, the handler is
    called and the offending token is discarded so parsing continues.
    """
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", "$end")]
    current_state = 0
    while True:
        # Single-action states reduce without looking at the next token.
        reduction = self.lr_table.default_reductions[current_state]
        if reduction:
            current_state = self._reduce_production(
                reduction, symstack, statestack, state)
            continue

        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                try:
                    lookahead = next(tokenizer)
                except StopIteration:
                    lookahead = None
            if lookahead is None:
                # Input exhausted: use the synthetic end marker.
                lookahead = Token("$end", "$end")

        ttype = lookahead.gettokentype()
        table = self.lr_table.lr_action[current_state]
        if ttype in table:
            action = table[ttype]
            if action > 0:
                # Shift the token onto the stacks.
                statestack.append(action)
                current_state = action
                symstack.append(lookahead)
                lookahead = None
            elif action < 0:
                # Reduce by production number -action.
                current_state = self._reduce_production(
                    action, symstack, statestack, state)
            else:
                # Accept.
                return symstack[-1]
        elif self.error_handler is not None:
            if state is None:
                self.error_handler(lookahead)
            else:
                self.error_handler(state, lookahead)
            # Drop the bad token and keep going (error recovery).
            lookahead = None
        else:
            raise ParsingError(None, lookahead.getsourcepos())
def __init__(self, stream):
    """Eagerly drain *stream*, rewriting leading-whitespace bookkeeping
    into INDENT/DEDENT tokens (Python-style layout).

    The wrapped token list is stored in ``self.stream``; ``self.idx`` is
    the read cursor used by the accompanying iteration method.

    Fixes over the previous version: the leftover debug ``print`` is
    removed, and ``//`` replaces ``/`` so the DEDENT counts stay integers
    under Python 3 (a float count made ``[Token(...)] * n`` and
    ``range(n)`` blow up).
    """
    self.stream = []
    self.idx = 0
    TAB_WIDTH = 4
    indent = 0            # indentation width of the line being scanned
    current_indent = 0    # indentation level currently "open"
    indent_token = None   # the WHITESPACE token that may become INDENT/DEDENT
    indent_start_pos = 0  # where in self.stream the current line started
    while True:
        try:
            token = stream.next()
        except StopIteration:
            if current_indent > 0:
                # Close every indentation level still open at EOF.
                dedents = [Token('DEDENT', '')] * (current_indent // TAB_WIDTH)
                self.stream.extend(dedents)
            break
        token_type = token.gettokentype()
        if token_type == 'WHITESPACE':
            indent_token = token
            # WHITESPACE is tab only now.
            indent = len(token.getstr()) * TAB_WIDTH
        elif token_type == 'NEWLINE':
            if current_indent < indent:
                # Deeper than before: the whitespace token becomes INDENT.
                indent_token.name = 'INDENT'
                current_indent = indent
            elif current_indent > indent:
                # Shallower: emit one DEDENT per closed level.
                dedent_num = (current_indent - indent) // TAB_WIDTH
                for i in range(0, dedent_num):
                    if not indent_token:
                        # No whitespace token to reuse; synthesize one at
                        # the start of the current line.
                        indent_token = Token('', '', token.getsourcepos())
                        self.stream.insert(indent_start_pos, indent_token)
                    indent_token.name = 'DEDENT'
                    indent_token = None
                current_indent = indent
            else:
                # Same level: plain whitespace carries no layout meaning.
                if indent_token:
                    self.stream.remove(indent_token)
            indent = 0
            indent_token = None
            indent_start_pos = len(self.stream) + 1
        self.stream.append(token)
def token(self):
    """
    Return the next token (a Token object) found in the input buffer.
    None is returned if the end of the buffer was reached. In case of a
    lexing error (the current chunk of the buffer matches no rule), a
    LexerError is raised with the position of the error.
    """
    if self.pos >= len(self.buf):
        # End of input: every context opened during lexing must have been
        # closed again, otherwise the source is malformed.
        if len(self.context_stack) != 1:
            raise LexerError("contexts are not closed", -1)
        return None
    else:
        # Pending heredoc terminator: once the scan position reaches the
        # recorded finish offset, emit T_END_HEREDOC and leave the
        # heredoc context.
        if self.pos >= self.heredoc_finish and self.heredoc_finish != -1:
            start = self.pos
            end = self.pos + self.heredoc_lgt
            # NOTE(review): the non-negative asserts look like RPython
            # translation guards for the slice below — confirm.
            assert start >= 0
            assert end >= 0
            tok = Token('T_END_HEREDOC', self.buf[start:end], self.lineno)
            self.pos = self.heredoc_finish + self.heredoc_lgt
            self.heredoc_finish = -1
            self.heredoc_lgt = 0
            self.context_stack.pop()
            return tok
        tmp_buf = self._gettmpbuf(self.pos)
        # Rule set depends on the current lexing context (top of stack).
        ctx = self.context_stack[-1]
        rules = self.rules[ctx]
        for token_regex, token_type in rules:
            pos = self.pos
            assert pos >= 0
            m = self.match(token_regex, tmp_buf, pos)
            if m:
                start, end = self._getstartend(m)
                value = self.buf[start:end]
                # Keep the line counter in sync with newlines consumed
                # by multi-line token kinds.
                if token_type == 'H_NEW_LINE':
                    self.lineno += 1
                elif token_type == 'T_COMMENT':
                    self.lineno += value.count('\n')
                elif token_type == 'T_CONSTANT_ENCAPSED_STRING':
                    self.lineno += value.count("\n")
                # tokens changing the context
                tok = Token(token_type, value, self.lineno)
                tok = self.maybe_change_context(ctx, tok, token_type, end)
                self.last_token = token_type
                return tok
        # if we're here, no rule matched
        raise LexerError("unknown token", self.lineno)
def test_parse_error(self):
    """Interpreting garbage input reports a ParseError via handle_error."""
    seen = []
    cycy = interpreter.CyCy(handle_error=seen.append)
    cycy.interpret(["asdf"])
    expected = ParseError(token=Token("IDENTIFIER", "asdf"), source="asdf")
    self.assertEqual(seen, [expected])
def next(self):
    """Return the next Token scanned from the input string.

    Ignored rules are skipped; StopIteration is raised at end of input
    and LexingError when no rule matches at the current position.

    Fix: ignored tokens are now skipped with a loop instead of tail
    recursion, so a long run of ignored input can no longer exhaust the
    call stack (RecursionError).
    """
    # Skip any prefix matched by ignore rules, restarting the scan after
    # each skip (exactly what the former self-recursion did).
    while True:
        if self.idx >= len(self.s):
            raise StopIteration
        for rule in self.lexer.ignore_rules:
            match = rule.matches(self.s, self.idx)
            if match:
                self.idx = match.end
                break
        else:
            # Nothing ignorable here; scan for a real token.
            break
    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            source_pos = self.__get_position__(match.start)
            token = Token(rule.name, self.s[match.start:match.end], source_pos)
            self.idx = match.end
            return token
    # No rule matched the remaining input.
    raise LexingError(None, SourcePosition(self.idx, -1, -1))
def next(self):
    """Return the next Token, or None once the input is exhausted.

    Raises LexingError when no rule matches at the current position.

    Fix: ignored tokens are skipped iteratively rather than via tail
    recursion, avoiding RecursionError on long runs of ignored input.
    """
    # Skip ignored prefixes, restarting the scan after each skip (this
    # mirrors what the former self-recursive call did).
    while True:
        if self.idx >= len(self.s):
            return None
        for rule in self.lexer.ignore_rules:
            match = rule.matches(self.s, self.idx)
            if match:
                self.idx = match.end
                break
        else:
            break
    for rule in self.lexer.rules:
        match = rule.matches(self.s, self.idx)
        if match:
            # TODO: lineno and colno
            source_pos = SourcePosition(match.start, -1, -1)
            token = Token(rule.name, self.s[match.start:match.end], source_pos)
            self.idx = match.end
            return token
    # No rule matched the remaining input.
    raise LexingError(None, SourcePosition(self.idx, -1, -1))
def _scan_double_quote(self, tok): p = 1 v = tok.value if v[0] == "b": p += 1 backslash = False while p < len(v): c = v[p] if not backslash: if c == '"': # not encountered anything funny, this is just T_STRING return tok if (((c == '$' and p < len(v) - 1 and v[p + 1].isalpha()) or (c == "{" and p < len(v) - 1 and v[p + 1] == "$") or (c == "$" and p < len(v) - 1 and v[p + 1] == "{"))): p += 1 self.context_stack.append(CONTEXT_DOUBLEQUOTE) return Token('"', '"', self.lineno) elif c == '\\': backslash = True else: backslash = False p += 1 assert False
def parse(self, tokenizer, state=None):
    """Run the LR parse loop, performing reductions inline.

    Pulls tokens from ``tokenizer.next()`` (None signals end of input),
    shifts/reduces per the action table, and returns the value left on
    the symbol stack when the accept action (0) is reached.  ``state``
    is an optional user object passed to production functions and the
    error handler.
    """
    from rply.token import Token

    lookahead = None
    lookaheadstack = []
    statestack = [0]
    symstack = [Token("$end", None)]
    current_state = 0
    while True:
        if lookahead is None:
            if lookaheadstack:
                lookahead = lookaheadstack.pop()
            else:
                lookahead = tokenizer.next()
            if lookahead is None:
                # Tokenizer exhausted: synthesize the end marker.
                lookahead = Token("$end", None)
        ltype = lookahead.gettokentype()
        if ltype in self.lr_table.lr_action[current_state]:
            t = self.lr_table.lr_action[current_state][ltype]
            if t > 0:
                # Shift: push token and enter state t.
                statestack.append(t)
                current_state = t
                symstack.append(lookahead)
                lookahead = None
                continue
            elif t < 0:
                # reduce a symbol on the stack and emit a production
                p = self.lr_table.grammar.productions[-t]
                pname = p.name
                plen = p.getlength()
                # Collect the production's right-hand-side symbols.
                # NOTE(review): the `assert start >= 0` guards look like
                # RPython translation constraints on slicing — confirm.
                start = len(symstack) + (-plen - 1)
                assert start >= 0
                targ = symstack[start:]
                del targ[0]
                # Pop the reduced symbols/states off both stacks.
                start = len(symstack) + (-plen)
                assert start >= 0
                del symstack[start:]
                del statestack[start:]
                if state is None:
                    value = p.func(targ)
                else:
                    value = p.func(state, targ)
                symstack.append(value)
                # Goto: new state from the exposed stack top + nonterminal.
                current_state = self.lr_table.lr_goto[statestack[-1]][pname]
                statestack.append(current_state)
                continue
            else:
                # Accept (action 0): parsing is complete.
                n = symstack[-1]
                return n
        else:
            # TODO: actual error handling here
            if self.error_handler is not None:
                if state is None:
                    self.error_handler(lookahead)
                else:
                    self.error_handler(state, lookahead)
                raise AssertionError("For now, error_handler must raise.")
            else:
                raise ParsingError(lookahead.getsourcepos())
def test_source_pos(self):
    """getsourcepos() exposes the SourcePosition the token was built with."""
    pos = SourcePosition(5, 2, 1)
    token = Token("VALUE", "3", pos)
    assert token.getsourcepos().lineno == 2
def __init__(self, left, opt: Token, right):
    """Binary-expression AST node: *left* <operator> *right*.

    The operator token is stored as its source string.
    """
    self.left = left
    self.right = right
    self.opt = opt.getstr()
def __init__(self, value: Token):
    """Integer-literal AST node: stores the token's text as an int."""
    text = value.getstr()
    self.value = int(text)
def test_eq(self):
    """A token never compares equal to a non-token object."""
    token = Token("VALUE", "3", SourcePosition(-1, -1, -1))
    assert not (token == 3)
    assert token != 3
def test_repr(self):
    """repr() shows the token's type and value."""
    token = Token("VALUE", "3")
    assert repr(token) == "Token('VALUE', '3')"
def maybe_change_context(self, ctx, tok, token_type, endpos):
    """Adjust the lexer's context stack (and possibly rewrite *tok*)
    based on the token just matched in context *ctx*.

    Returns the token to emit — usually *tok*, sometimes a replacement —
    and advances ``self.pos`` to *endpos* unless a branch set it itself.
    """
    # print self.context_stack, tok.name, tok.value
    if ctx == CONTEXT_OBJECT_ACCESS:
        # Object-access context lasts exactly one token.
        self.context_stack.pop()
    elif (ctx == CONTEXT_NORMAL and
          token_type == "T_CONSTANT_ENCAPSED_STRING" and
          (tok.value[0] == '"' or tok.value[:2] == 'b"')):
        # Double-quoted string: re-scan for interpolation markers.
        newtok = self._scan_double_quote(tok)
        if newtok.name == '"':
            # we have to rewind a little
            ofs = 1
            if tok.value[0] == 'b':
                ofs += 1
            self.pos = endpos - len(tok.value) + ofs
        else:
            self.pos = endpos
        return newtok
    elif ctx == CONTEXT_BACKTICK and tok.value[0] == '`':
        self.context_stack.pop()
    elif ctx == CONTEXT_NORMAL and token_type == '`':
        self.context_stack.append(CONTEXT_BACKTICK)
    elif ctx == CONTEXT_BACKTICK and token_type == '"':
        self.context_stack.append(CONTEXT_DOUBLEQUOTE)
    elif ctx == CONTEXT_BACKTICK and token_type == '`':
        self.context_stack.pop()
    elif ctx == CONTEXT_NORMAL and token_type == "T_START_HEREDOC":
        # Extract the heredoc marker from the "<<<MARKER" token text,
        # trimming surrounding spaces/tabs (and an optional 'b' prefix).
        lgt = 3
        if tok.value.startswith("b"):
            lgt += 1
        start = lgt
        end = len(tok.value) - 1
        while tok.value[start] in (' ', '\t'):
            start += 1
        while tok.value[end] in (' ', '\t'):
            end -= 1
        assert end >= 0
        marker = tok.value[start:end]
        if marker.startswith('"'):
            # Quoted marker form: <<<"MARKER" — strip the quotes.
            if not marker.endswith('"'):
                raise LexerError("wrong marker", self.lineno)
            end = len(marker) - 1
            assert end >= 0
            marker = marker[1:end]
        # Locate the terminator line ("\nMARKER;") in the buffer and
        # remember where the heredoc body ends.
        heredoc_marker = "\n" + marker + ";"
        start = self.pos + len(tok.value) - 1
        assert start >= 0
        self.heredoc_finish = self.buf.find(heredoc_marker, start)
        self.heredoc_lgt = len(heredoc_marker) - 1
        if self.heredoc_finish == -1:
            # XXX case where heredoc does not end with [;]
            # its then heredoc is an argument and end like ...HEND
            # );
            heredoc_marker = "\n" + marker
            self.heredoc_finish = self.buf.find(heredoc_marker, start)
            if self.heredoc_finish == -1:
                raise LexerError("unfinished heredoc", self.lineno)
            self.heredoc_lgt = len(heredoc_marker)
        self.context_stack.append(CONTEXT_HEREDOC)
    elif ctx == CONTEXT_DOUBLEQUOTE and token_type == '"':
        self.context_stack.pop()
    elif ctx == CONTEXT_BACKTICK and token_type == '"':
        self.context_stack.pop()
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC or
           ctx == CONTEXT_BACKTICK) and
          token_type == "T_DOLLAR_OPEN_CURLY_BRACES"):
        # "${" inside an interpolating context: back up one character so
        # the "{" is re-lexed inside the curly-braces context.
        self.pos = endpos - 1
        self.context_stack.append(CONTEXT_CURLY_BRACES)
        return tok
    elif (ctx == CONTEXT_CURLY_BRACES and token_type == "{" and
          self.last_token == "T_DOLLAR_OPEN_CURLY_BRACES"):
        # instead, we recognize it as a variable
        tmp_buf = self._gettmpbuf(self.pos)
        m = self.match(self.var_re, tmp_buf, self.pos)
        assert m is not None
        start, end = self._getstartend(m)
        tok = Token("T_VARIABLE", self.buf[start:end], tok.lineno)
        self.pos = end
        return tok
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC) and
          token_type == "T_VARIABLE"):
        # only if the next one is [
        if self.buf[endpos] == "[":
            self.context_stack.append(CONTEXT_BRACKETS)
    elif ((ctx == CONTEXT_DOUBLEQUOTE or ctx == CONTEXT_HEREDOC) and
          token_type == "T_OBJECT_OPERATOR"):
        # "->" interpolates only directly after a variable and before an
        # identifier; otherwise it is plain string content.
        if (self.last_token != "T_VARIABLE" or
                not self.buf[self.pos + 2].isalpha()):
            tok = Token("T_ENCAPSED_AND_WHITESPACE", tok.value, tok.lineno)
        else:
            self.context_stack.append(CONTEXT_OBJECT_ACCESS)
    elif token_type == "T_OBJECT_OPERATOR":
        self.context_stack.append(CONTEXT_OBJECT_ACCESS)
    elif ctx == CONTEXT_BRACKETS and token_type == "]":
        self.context_stack.pop()
    elif ctx == CONTEXT_CURLY_BRACES and token_type == "}":
        # XXX this is incorrect but we don't care at the moment
        # if someone inserts } inside ] we have to do something else
        # like scan grammar until we hit it
        self.context_stack.pop()
    self.pos = endpos
    return tok