def read_sexp(sexp):
    """Parse the s-expression string *sexp* and return the first parsed form.

    A TokenProcessor accumulates Python-literal fragments while the Scanner
    walks the input; the joined fragments are then evaluated.

    NOTE(review): ``eval`` on the accumulated text is unsafe for untrusted
    input -- this relies on TokenProcessor emitting only literal fragments.
    """
    processor = TokenProcessor(sexp)
    scanner = Scanner([
        (r"\s+", processor("skip_whitespaces")),
        (r";[^\n]*\n", processor("skip")),              # ;-comments to end of line
        (r""""(?:[^"])*"|(\]|\[|\)|\(|[^\(\)\s]+)""", processor("atom")),
        (r".*", processor("error")),                    # anything else is a lexical error
    ], re.M)
    scanner.scan(processor.string)
    if processor.paren_stack:
        processor.raise_error("missing closing parenthesis.")
    result = eval("".join(processor.result).lstrip(","))
    # A tuple means several top-level forms were read; return only the first.
    # (Replaces the fragile ``cond and (a, 0) or (b, 0)`` idiom.)
    return result[0] if isinstance(result, tuple) else result
def test_scanner(self): def s_ident(scanner, token): return token def s_operator(scanner, token): return "op%s" % token def s_float(scanner, token): return float(token) def s_int(scanner, token): return int(token) scanner = Scanner([ (r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", None), ]) self.assertNotEqual(scanner.scanner.scanner("").pattern, None) self.assertEqual( scanner.scan("sum = 3*foo + 312.50 + bar"), (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], ''))
def test_scanner(self): def s_ident(scanner, token): return token def s_operator(scanner, token): return "op%s" % token def s_float(scanner, token): return float(token) def s_int(scanner, token): return int(token) scanner = Scanner( [ (r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", None), ] ) self.assertNotEqual(scanner.scanner.scanner("").pattern, None) self.assertEqual( scanner.scan("sum = 3*foo + 312.50 + bar"), (["sum", "op=", 3, "op*", "foo", "op+", 312.5, "op+", "bar"], ""), )
def scan(self, input):
    """Tokenize *input* and return the token list.

    On a lexical error, prints the current line number (self.line,
    presumably maintained by the self.newline handler -- TODO confirm)
    and the offending character, then exits with status 1.

    Returns:
        list: (type, value) pairs terminated by an ('EOF', 'EOF') marker.

    NOTE: Python 2 code (print statement below).
    """
    scanner = Scanner([
        (r"[\n]", self.newline),                              # newline handler
        (r"\"[^\"\n]*\"", self.string),                       # double-quoted string
        (r"\'[^\'\n]*\'", self.string),                       # single-quoted string
        (r"\b(if|fi|else|do|od|fa|af|to|proc)\b", self.key),
        (r"\b(end|return|forward|var|type|break)\b", self.key),
        (r"\b(exit|true|false|writes|write|read)\b", self.key),
        (r"[A-Za-z][A-Za-z0-9_]*", self.identifier),
        (r"\-\>|\(|\)|\[\]|\[|\]|;|:\=|:|\,", self.symbol),   # punctuation / symbols
        (r"\+|\-|\/|\*|\=|\%|!\=|\>=|\<=|\>|\<|\?", self.operator),
        (r"[0-9]+", self.integer),
        (r"#.*(?=\n?)", self.ignore),                         # '#' comment to EOL
        (r"[\t ]+", self.ignore),                             # skip blanks/tabs
    ])
    tokens, remainder = scanner.scan(input)
    tokens.append(('EOF', 'EOF'))
    if remainder:
        # Any unscanned residue means the next character is illegal.
        print "line %s: illegal character (%s)" % (
            self.line, remainder[:1])
        sys.exit(1)
    else:
        return tokens
class Parser:
    """Logs matches found by a small two-pattern re.Scanner lexicon."""

    def __init__(self):
        # BUG FIX: re.Scanner invokes each action as action(scanner, text);
        # passing the bound method self.got directly meant got() received an
        # extra positional argument and raised TypeError on the first match.
        # Wrap in lambdas that drop the scanner argument so got() keeps its
        # original one-argument signature.
        self.s1 = Scanner((
            (r'^@@', lambda scanner, text: self.got(text)),
            (r'aa', lambda scanner, text: self.got(text)),
        ))

    def write_all(self, text):
        # Scan the chunk and log the (tokens, remainder) pair.
        D("scan %r", self.s1.scan(text))

    def got(self, text):
        # Callback for every lexicon hit; just log what matched.
        D("GOT %r", text)
def scan(self, string):
    """Tokenize *string* into (type, token) pairs using re.Scanner.

    Raises LexicalException when any input cannot be consumed; the error
    message quotes at most the first ten offending characters.
    """
    lexicon = [
        (self.constant_signs, lambda _, tok: (self.constant_type, tok)),
        (self.numerical_variables, lambda _, tok: (self.numerical_type, tok)),
        (self.sentntial_variables, lambda _, tok: (self.sentntial_type, tok)),
        (self.predicate_variables, lambda _, tok: (self.predicate_type, tok)),
    ]
    tokens, leftover = Scanner(lexicon).scan(string)
    if leftover:
        raise LexicalException(
            "Error lexing input near {0}...".format(leftover[:10]))
    return tokens
def scan(self, string):
    """Run the scanner over *string*; return a list of (type, token) pairs.

    Raises LexicalException (quoting up to ten leftover characters) when
    part of the input cannot be tokenized.
    """
    scanner = Scanner([
        (self.constant_signs, lambda _, t: (self.constant_type, t)),
        (self.numerical_variables, lambda _, t: (self.numerical_type, t)),
        (self.sentntial_variables, lambda _, t: (self.sentntial_type, t)),
        (self.predicate_variables, lambda _, t: (self.predicate_type, t)),
    ])
    tokens, rest = scanner.scan(string)
    if not rest:
        return tokens
    raise LexicalException("error lexing {0} ..".format(rest[:10]))
def vt_parse(str):
    """Parse a brace-delimited variable tree, e.g. "{a, {b, c}}".  (Python 2)

    Results are memoised in vt_parse.memory.  On any lexical or parse
    error the input is echoed and the process exits.

    NOTE(review): the write-back into vt_parse.memory and the return of
    *tree* are not visible in this chunk -- presumably they follow below.
    NOTE(review): the parameter shadows the builtin ``str``.
    """
    # We'll memoise this function so several calls on the same input don't
    # require re-parsing.
    if (str in vt_parse.memory):
        return vt_parse.memory[str]
    # Use the built in re.Scanner to tokenise the input string.
    def s_lbrace(scanner, token): return ("LBRACE", token)
    def s_rbrace(scanner, token): return ("RBRACE", token)
    def s_comma(scanner, token): return ("COMMA", token)
    def s_varname(scanner, token): return ("VAR", token)
    scanner = Scanner([(r'{', s_lbrace),
                       (r'}', s_rbrace),
                       (r',', s_comma),
                       (r'[a-zA-Z_]\w*', s_varname),
                       (r'\s+', None)])
    tokens = scanner.scan(str)
    # tokens is a pair of the tokenised string and any "uneaten" part.
    # check the entire string was eaten.
    if (tokens[1] != ''):
        print "Could not read the variable tree given:"
        print str
        #print "could not lex: " + tokens[1].__str__()
        exit()
    tokens = tokens[0]  # Just the list of tokens.
    p = Parser()
    try:
        tree = p.parse(tokens)
    except p.ParseErrors, e:
        print "Could not read the variable tree given:"
        print str
        exit()
def vt_parse(str):
    """Tokenise and parse a "{a, {b, c}}"-style variable tree.  (Python 2)

    Memoised via vt_parse.memory; exits the process after echoing the
    input when lexing or parsing fails.

    NOTE(review): the memo store and return of *tree* are not visible in
    this chunk -- verify they follow.  The parameter shadows builtin str.
    """
    # We'll memoise this function so several calls on the same input don't
    # require re-parsing.
    if(str in vt_parse.memory):
        return vt_parse.memory[str]
    # Use the built in re.Scanner to tokenise the input string.
    def s_lbrace(scanner, token): return ("LBRACE", token)
    def s_rbrace(scanner, token): return ("RBRACE", token)
    def s_comma(scanner, token): return ("COMMA", token)
    def s_varname(scanner, token): return ("VAR", token)
    scanner = Scanner([
        (r'{', s_lbrace),
        (r'}', s_rbrace),
        (r',', s_comma),
        (r'[a-zA-Z_]\w*', s_varname),
        (r'\s+', None)
    ])
    tokens = scanner.scan(str)
    # tokens is a pair of the tokenised string and any "uneaten" part.
    # check the entire string was eaten.
    if(tokens[1] != ''):
        print "Could not read the variable tree given:"
        print str
        #print "could not lex: " + tokens[1].__str__()
        exit()
    tokens = tokens[0]  # Just the list of tokens.
    p = Parser()
    try:
        tree = p.parse(tokens)
    except p.ParseErrors, e:
        print "Could not read the variable tree given:"
        print str
        exit()
def parse_code(self):
    """Rewrite self._func so free variable names become indexed slots a[i].

    Side effects: sets self.code (rewritten source), self._keys (names in
    first-seen order), self._count, and self._parameters (one
    FittingParameter per discovered name, initialised to 1.0).
    Raises Exception when part of the source cannot be tokenized.
    """
    echo = lambda _scanner, tok: tok  # pass the lexeme through unchanged

    def var_found(scanner, name: str):
        # Reserved names survive verbatim; every other identifier maps to
        # a stable a[i] slot in order of first appearance.
        if name in ['caller', 'e', 'pi']:
            return name
        if name in self._keys:
            return 'a[%d]' % self._keys.index(name)
        self._keys.append(name)
        slot = 'a[%d]' % self._count
        self._count += 1
        return slot

    lexicon = [
        (r"x", echo),                # bare 'x' passes through untouched -- presumably the independent variable; TODO confirm
        (r"[a-zA-Z]+\.", echo),      # dotted prefix (e.g. module access)
        (r"[a-z]+\(", echo),         # function-call opener
        (r"[a-zA-Z_]\w*", var_found),
        (r"\d+\.\d*", echo),
        (r"\d+", echo),
        (r"\+|-|\*|/", echo),
        (r"\s+", None),              # whitespace is dropped from the output
        (r"\)+", echo),
        (r"\(+", echo),
        (r",", echo),
    ]
    self._count = 0
    self._keys = []
    pieces, rubbish = Scanner(lexicon).scan(self._func)
    parsed = ''.join(pieces)
    if rubbish != '':
        raise Exception('parsed: %s, rubbish %s' % (parsed, rubbish))
    self.code = parsed
    # Define parameters: one fitting parameter per discovered name.
    self._parameters = [FittingParameter(name=key, value=1.0)
                        for key in self._keys]
def test_scanner(self): def s_ident(scanner, token): return token def s_operator(scanner, token): return "op%s" % token def s_float(scanner, token): return float(token) def s_int(scanner, token): return int(token) from re import Scanner scanner = Scanner([ (r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", None), ]) self.assertNotEqual(scanner.scanner.scanner("").pattern, None) self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], ''))
def lex_source(source):
    """Tokenize ice9 *source* into (token type, token string) pairs.

    The result is bracketed by ('SOF', 'start of file') and
    ('EOF', 'end of file') markers.  Raises Ice9LexicalError at the first
    character no TOKENS pattern accepts, reporting the line reached.
    """
    lexicon = [(regex, make_token(typ)) for typ, regex in TOKENS]
    # use python's scanner class to tokenize the input
    tokenized, unused = Scanner(lexicon).scan(source)
    if unused != '':
        # unexpected character broke the flow! the line number is one more
        # than the newline tokens successfully consumed so far.
        lineno = 1 + sum(1 for typ, tok in tokenized if typ == 'newline')
        raise Ice9LexicalError(lineno, 'illegal character (%s)' % unused[0])
    # mark the start and end of the file
    return ([('SOF', 'start of file')]
            + tokenized
            + [('EOF', 'end of file')])
def _scan_int(self, string, const):
    """Scan *string* as an integer token; sets self.type to 'INT' and
    self.value to the concatenated matched lexemes.

    NOTE(review): sign characters come from CHAR_MAP['space']/['tab'],
    which suggests a whitespace-encoded language -- TODO confirm.
    NOTE(review): only const == 'SIGNED_INT' adds the sign pattern; the
    ``except IndexError`` below looks unreachable for 2-tuples -- verify.
    """
    # TODO: Add better invalid integer handling
    # Check for integer sign, possibly treat unsigned integer
    # as POSITIVE
    patterns = []
    INT_SIGN = (r"^[{}{}]".format(CHAR_MAP['space'], CHAR_MAP['tab']),
                lambda scanner, token: ("INT_SIGN", token))
    INT_VAL = (r".[{}{}]*".format(CHAR_MAP['space'], CHAR_MAP['tab']),
               lambda scanner, token: ("INT_VAL", token))
    if const == 'SIGNED_INT':
        patterns.append(INT_SIGN)
    patterns.append(INT_VAL)
    scanner = Scanner(patterns)
    found, remainder = scanner.scan(string)
    self.type = 'INT'
    try:
        # Join the matched lexemes into the token's value.
        self.value = ''.join([f[1] for f in found])
    except IndexError:
        print("Hit IndexError, string trying to check is: {}".format(
            dbg(string)))
def parse_title(title: str, *, scanner: re.Scanner = _scanner) -> Title:
    """Split a markdown-style heading into leading "[tag]" groups and title text.

    Leading '#' characters and surrounding whitespace are stripped; the
    scanner must consume the whole heading (asserted).  Each leading
    "[", <tag>, "]" token triple is recorded (HTML-unescaped, stripped)
    in ``tags``; everything else becomes the title.
    """
    r, rest = scanner.scan(title.strip().lstrip("#"))
    assert not rest
    # Drop whitespace-only tokens before tag parsing.
    itr = (line for line in r if line.strip())
    tags = []
    buf = []
    for tk in itr:
        if tk != "[":
            # First non-"[" token belongs to the title; stop tag parsing.
            buf.append(tk)
            break
        tag = next(itr)
        buf.append(tag)  # speculative: stays in the title if "]" never arrives
        tk = next(itr)
        if tk != "]":
            # Not a well-formed tag after all: keep the pieces as title text.
            # NOTE(review): *tag* was already appended above, so it lands in
            # the title twice here, and the opening "[" is dropped entirely --
            # this looks buggy; verify against the scanner's token stream.
            buf.append(tag)
            buf.append(tk)
            break
        buf.pop()  # discard the speculative append; this was a real tag
        tags.append(html.unescape(tag.strip()))
    return Title(tags=tags, title="".join(itertools.chain(buf, itr)).strip())
class Reader(object):
    """S-expression reader built on re.Scanner.  (Python 2 code: u-literals,
    xrange, unicode, list-returning map.)

    read() tokenizes the source in a single Scanner pass; the per-token
    callbacks build a nested list of Token objects in self.result, which
    parse() then converts into Python values (lists, dicts, Pair, Symbol,
    Ident).
    """

    # Maps each closing bracket to the opening bracket it must balance.
    PAREN = {"]": "[", ")": "("}

    def __init__(self, binding=None, symbol_marker="'", use_dict=True):
        # binding: mapping consulted by ident() to substitute known names.
        self.binding = binding or default_binding
        # symbol_marker: prefix that turns a token into a Symbol.
        self.symbol_marker = symbol_marker
        # use_dict: convert (alist->hash-table ...) forms into Python dicts.
        self.use_dict = use_dict

    def read(self, value):
        """Tokenize and parse *value*; return the parsed top-level forms."""
        self.result = []        # top-level forms accumulate here
        self.paren_stack = []   # open brackets / pending quote lists
        self.source = value
        self.pos = 0            # absolute offset, used for error reporting
        self.quoted = False     # a ' was seen and awaits its datum
        self.scanner = Scanner(
            [
                (r"\s+", self("skip")),
                (r";[^\n]*\n", self("skip")),   # ;-comment through end of line
                # String literal allowing backslash-escaped double quotes.
                (r""""(((?<=\\)")|[^"])*((?<!\\)")""", self("str")),
                (r"(\(|\[)", self("open")),
                (r"(\)|\])", self("close")),
                # Floats, with or without an exponent part.
                (r"(([\d]+|(((\d+)?\.[\d]+)|([\d]+\.)))e[\+\-]?[\d]+)|(((\d+)?\.[\d]+)|([\d]+\.))", self("number")),
                # Integers: hex, octal, decimal; optional trailing 'l'.
                (r"\-?((0x[\da-f]+)|(0[0-7]+)|([1-9][\d]*)|0)[l]?", self("number")),
                (r"""%s([^\(\[\)\]\s"]+)""" % self.symbol_marker, self("symbol")),
                (r"'", self("quote")),
                (r"""([^\(\[\)\]\s"]+)""", self("ident")),
                (r"""".*""", self("unterm_str")),   # stray " : unterminated string
                (r".*", self("unknown_token")),     # catch-all: lexical error
            ],
            re.M | re.S | re.I,
        )
        self.scanner.scan(self.source)
        if self.paren_stack:
            self.raise_error("missing closing parenthesis.")
        return self.parse(self.result)

    def append(self, v):
        # Append a Token to the current target: the pending quote list
        # (consuming the quote) or the innermost open list / top level.
        if self.quoted:
            quote_lst = self.paren_stack.pop()[1]
            quote_lst.append(Token(v, self.pos))
            self.quoted = False
        else:
            self.last().append(Token(v, self.pos))

    def __call__(self, name):
        # Build a Scanner callback that advances self.pos past the lexeme
        # and dispatches to the handler method called *name*.
        def _(scanner, s):
            self.pos += len(s)
            return getattr(self, name)(s)

        return _

    def unknown_token(self, s):
        self.raise_error("unknown token: %s" % s)

    def skip(self, _):
        # Whitespace and comments produce nothing.
        pass

    def quote(self, _):
        # ' introduces (quote <datum>): open a synthetic list on the stack.
        new_lst = []
        self.last().append(new_lst)
        self.paren_stack.append(["quote", new_lst])
        self.append(Ident("quote"))
        self.quoted = True

    def open(self, s):
        # ( or [ : start a new nested list, honouring a pending quote.
        new_lst = []
        if self.quoted:
            quote_lst = self.paren_stack.pop()[1]
            quote_lst.append(new_lst)
            self.quoted = False
        else:
            self.last().append(new_lst)
        self.paren_stack.append([s, new_lst])

    def close(self, s):
        # ) or ] : the innermost opener must be the matching bracket.
        if not self.paren_stack:
            self.raise_error("missing opening parenthesis.")
        if self.PAREN[s] != self.paren_stack.pop()[0]:
            self.raise_error("missing closing parenthesis.")

    def str(self, s):
        # s includes its quotes; adjacent-literal concatenation inside eval
        # yields a unicode string (Python 2 u"" prefix trick).
        self.append(eval('u""' + s + '""'))

    def unterm_str(self, s):
        self.raise_error("unterminated string literal.")

    def number(self, s):
        # Numeric lexemes are valid Python literals; eval converts them.
        self.append(eval(s))

    def symbol(self, s):
        # Drop the marker character; wrap the remainder as a Symbol.
        self.append(Symbol(s[1:]))

    def ident(self, s):
        # Known identifiers are substituted from the binding table.
        if s in self.binding:
            self.append(self.binding[s])
        else:
            self.append(Ident(s))

    def last(self):
        # Current insertion target: innermost open list, else top level.
        if self.paren_stack:
            return self.paren_stack[-1][1]
        else:
            return self.result

    def parse(self, rs):
        """Convert the nested Token lists into Python values."""

        def is_ident(value, expected):
            return getattr(value, "value", None) == Ident(expected)

        def is_pair(rs):
            # A dotted pair reads as [car, ".", cdr].
            return getattr(rs, "__len__", lambda: 0)() == 3 and is_ident(rs[1], u".")

        if isinstance(rs, list):
            if not len(rs):
                return []
            elif self.use_dict and is_ident(rs[0], u"alist->hash-table"):
                # (alist->hash-table ((k . v) ...)) becomes a Python dict.
                if len(rs) != 2:
                    self.raise_error("alist->hash-table: expected 1 arguments, got %d." % (len(rs) - 1), rs[0].pos)
                if not all(is_pair(a) for a in rs[1]):
                    self.raise_error("alist->hash-table: aruguments must be alist", rs[0].pos)
                return dict((self.parse(i[0]), self.parse(i[2])) for i in rs[1])
            elif len(rs) != 3 and any(is_ident(t, u".") for t in rs):
                # "." is only legal as the middle element of a 3-item form.
                self.raise_error('illegal use of "."', rs[0].pos)
            elif is_pair(rs):
                parsed = self.parse(rs[2])
                if not isinstance(rs[2], list):
                    return Pair([rs[0].value, parsed])
                if isinstance(parsed, Pair):
                    return Pair([rs[0].value, parsed])
                elif isinstance(parsed, list):
                    # (a . (b c)) flattens to (a b c).
                    return [rs[0].value] + parsed
                else:
                    return [rs[0].value, parsed]
            else:
                return map(self.parse, rs)
        else:
            return rs.value

    def raise_error(self, msg="parse error", pos=None, range=3):
        """Raise ParseError showing up to *range* preceding source lines.

        NOTE(review): *range* shadows the builtin; the ~ gutter width
        counts East-Asian wide characters as two columns.
        """
        pos = pos or self.pos
        lines = self.source.split("\n")
        curline = self.source[:pos].count("\n")
        linepos = pos - len("\n".join(lines[:curline]))
        buf = ["\n"]
        for i in xrange(max(0, curline - range), curline + 1):
            buf.append("% 5d: %s" % (i + 1, lines[i]))
            width = 7 + sum(east_asian_width(c) == "W" and 2 or 1 for c in unicode(lines[i]))
            buf.append("%s~" % (" " * width))
        buf.append("line %d, %d: %s" % (curline + 1, linepos, msg))
        raise ParseError(("\n".join(buf)).encode(sys.stderr.encoding))
class Reader(object):
    """Reads s-expressions via a single re.Scanner pass.  (Python 2 code:
    u-literals, xrange, unicode, list-returning map.)

    Token callbacks assemble nested Token lists in self.result; parse()
    turns those into Python values (lists, dicts, Pair, Symbol, Ident).
    """

    # Closing bracket -> the opening bracket it must balance.
    PAREN = {"]": "[", ")": "("}

    def __init__(self, binding=None, symbol_marker="'", use_dict=True):
        # binding: identifier -> value substitutions used by ident().
        self.binding = binding or default_binding
        # symbol_marker: prefix character that produces a Symbol token.
        self.symbol_marker = symbol_marker
        # use_dict: translate (alist->hash-table ...) forms into dicts.
        self.use_dict = use_dict

    def read(self, value):
        """Tokenize and parse *value*; return the parsed result."""
        self.result = []        # top-level forms
        self.paren_stack = []   # open brackets / pending quote lists
        self.source = value
        self.pos = 0            # absolute offset for diagnostics
        self.quoted = False     # ' seen, datum pending
        self.scanner = Scanner([
            (r"\s+", self("skip")),
            (r";[^\n]*\n", self("skip")),   # comment to end of line
            # String literal with backslash-escaped quotes allowed.
            (r""""(((?<=\\)")|[^"])*((?<!\\)")""", self("str")),
            (r"(\(|\[)", self("open")),
            (r"(\)|\])", self("close")),
            # Float forms, with or without exponent.
            (r"(([\d]+|(((\d+)?\.[\d]+)|([\d]+\.)))e[\+\-]?[\d]+)|(((\d+)?\.[\d]+)|([\d]+\.))", self("number")),
            # Hex / octal / decimal integers, optional trailing 'l'.
            (r"\-?((0x[\da-f]+)|(0[0-7]+)|([1-9][\d]*)|0)[l]?", self("number")),
            (r"""%s([^\(\[\)\]\s"]+)""" % self.symbol_marker, self("symbol")),
            (r"'", self("quote")),
            (r"""([^\(\[\)\]\s"]+)""", self("ident")),
            (r"""".*""", self("unterm_str")),   # dangling quote
            (r".*", self("unknown_token"))      # catch-all lexical error
        ], re.M | re.S | re.I)
        self.scanner.scan(self.source)
        if self.paren_stack:
            self.raise_error("missing closing parenthesis.")
        return self.parse(self.result)

    def append(self, v):
        # Attach a Token to the pending quote list or the innermost list.
        if self.quoted:
            quote_lst = self.paren_stack.pop()[1]
            quote_lst.append(Token(v, self.pos))
            self.quoted = False
        else:
            self.last().append(Token(v, self.pos))

    def __call__(self, name):
        # Scanner callback factory: advance self.pos, then dispatch to
        # the handler method *name*.
        def _(scanner, s):
            self.pos += len(s)
            return getattr(self, name)(s)
        return _

    def unknown_token(self, s):
        self.raise_error("unknown token: %s" % s)

    def skip(self, _):
        # Whitespace / comments are discarded.
        pass

    def quote(self, _):
        # ' expands to a synthetic (quote <datum>) list on the stack.
        new_lst = []
        self.last().append(new_lst)
        self.paren_stack.append(['quote', new_lst])
        self.append(Ident('quote'))
        self.quoted = True

    def open(self, s):
        # Start a nested list for ( or [, honouring a pending quote.
        new_lst = []
        if self.quoted:
            quote_lst = self.paren_stack.pop()[1]
            quote_lst.append(new_lst)
            self.quoted = False
        else:
            self.last().append(new_lst)
        self.paren_stack.append([s, new_lst])

    def close(self, s):
        # Closing bracket must match the innermost opener.
        if not self.paren_stack:
            self.raise_error("missing opening parenthesis.")
        if self.PAREN[s] != self.paren_stack.pop()[0]:
            self.raise_error("missing closing parenthesis.")

    def str(self, s):
        # s carries its own quotes; eval of adjacent literals produces a
        # unicode string (Python 2 u"" prefix trick).
        self.append(eval('u""' + s + '""'))

    def unterm_str(self, s):
        self.raise_error("unterminated string literal.")

    def number(self, s):
        # The lexeme is a valid Python numeric literal.
        self.append(eval(s))

    def symbol(self, s):
        # Strip the marker; wrap the rest as a Symbol.
        self.append(Symbol(s[1:]))

    def ident(self, s):
        # Substitute known identifiers from the binding table.
        if s in self.binding:
            self.append(self.binding[s])
        else:
            self.append(Ident(s))

    def last(self):
        # Innermost open list, or the top-level result list.
        if self.paren_stack:
            return self.paren_stack[-1][1]
        else:
            return self.result

    def parse(self, rs):
        """Turn nested Token lists into Python values."""

        def is_ident(value, expected):
            return getattr(value, "value", None) == Ident(expected)

        def is_pair(rs):
            # A dotted pair reads as [car, ".", cdr].
            return getattr(rs, "__len__", lambda: 0)() == 3 and is_ident(
                rs[1], u".")

        if isinstance(rs, list):
            if not len(rs):
                return []
            elif self.use_dict and is_ident(rs[0], u"alist->hash-table"):
                # (alist->hash-table ((k . v) ...)) -> Python dict.
                if len(rs) != 2:
                    self.raise_error(
                        "alist->hash-table: expected 1 arguments, got %d."
                        % (len(rs) - 1), rs[0].pos)
                if not all(is_pair(a) for a in rs[1]):
                    self.raise_error(
                        "alist->hash-table: aruguments must be alist",
                        rs[0].pos)
                return dict(
                    (self.parse(i[0]), self.parse(i[2])) for i in rs[1])
            elif len(rs) != 3 and any(is_ident(t, u".") for t in rs):
                # "." is only legal as the middle of a three-element form.
                self.raise_error('illegal use of "."', rs[0].pos)
            elif is_pair(rs):
                parsed = self.parse(rs[2])
                if not isinstance(rs[2], list):
                    return Pair([rs[0].value, parsed])
                if isinstance(parsed, Pair):
                    return Pair([rs[0].value, parsed])
                elif isinstance(parsed, list):
                    # (a . (b c)) flattens to (a b c).
                    return [rs[0].value] + parsed
                else:
                    return [rs[0].value, parsed]
            else:
                return map(self.parse, rs)
        else:
            return rs.value

    def raise_error(self, msg="parse error", pos=None, range=3):
        """Raise ParseError with up to *range* preceding lines of context.

        NOTE(review): *range* shadows the builtin; gutter width counts
        East-Asian wide characters as two columns.
        """
        pos = pos or self.pos
        lines = self.source.split("\n")
        curline = self.source[:pos].count("\n")
        linepos = pos - len("\n".join(lines[:curline]))
        buf = ["\n"]
        for i in xrange(max(0, curline - range), curline + 1):
            buf.append("% 5d: %s" % (i + 1, lines[i]))
            width = 7 + sum(
                east_asian_width(c) == 'W' and 2 or 1
                for c in unicode(lines[i]))
            buf.append("%s~" % (" " * width))
        buf.append("line %d, %d: %s" % (curline + 1, linepos, msg))
        raise ParseError(("\n".join(buf)).encode(sys.stderr.encoding))
from pprint import pformat
import logging
import re

log = logging.getLogger()
D = log.debug
logging.basicConfig(level=logging.DEBUG)


def callback(scanner, text):
    """Log every lexicon hit (returns None, so no token is kept)."""
    D("CALL %r", text)


def ignore(scanner, text):
    """Log and drop whitespace runs."""
    D("IGNORE %r", text)


# BUG FIX: only `re` is imported in this script, so the bare name
# `Scanner` raised NameError; qualify it as re.Scanner.
s = re.Scanner((
    (r'{{{', callback),
    (r'##', callback),
    (r'\s+', ignore),
    (r'(.+)(?=##)', callback),
))

text = "## {{{ aa##"
while text:
    D("%r", text)
    # Re-scan from the unconsumed remainder until everything is eaten.
    text = s.scan(text)[1]
def _scan_command(self, line, pos, const): patterns = [(r"^{}".format(i[0]), i[1]) for i in const] scanner = Scanner(patterns) found, remainder = scanner.scan(line[pos:]) self.type = found[0] self.value = [i[0] for i in const if i[1] == self.type][0]
def _scan_file(self):
    """Tokenize the file's contents; return only the token list.

    Any unscanned remainder is silently discarded.
    """
    tokens, _remainder = Scanner(token_patterns, FLAGS['s']).scan(
        self._read_file())
    return tokens