def test_basic_lexer(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    l = lg.build()

    def f(n):
        tokens = l.lex("%d+%d+%d" % (n, n, n))
        i = 0
        s = 0
        while i < 5:
            t = tokens.next()
            if i % 2 == 0:
                if t.name != "NUMBER":
                    return -1
                s += int(t.value)
            else:
                if t.name != "PLUS":
                    return -2
                if t.value != "+":
                    return -3
            i += 1
        if tokens.next() is not None:
            return -4
        return s

    assert self.run(f, [14]) == 42
def test_position(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    l = lg.build()

    stream = l.lex("2 + 3")
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 1
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 3
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 5
    with raises(StopIteration):
        stream.next()

    stream = l.lex("2 +\n 37")
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 1
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 3
    t = stream.next()
    assert t.source_pos.lineno == 2
    assert t.source_pos.colno == 5
    with raises(StopIteration):
        stream.next()
def test_regex_flags_ignore(self):
    lg = LexerGenerator()
    lg.add("ALL", r".*", re.DOTALL)
    lg.ignore(r".*", re.DOTALL)
    l = lg.build()

    stream = l.lex("test\ndotall")
    with raises(StopIteration):
        stream.next()
def test_error(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    l = lg.build()

    stream = l.lex('fail')
    with raises(LexingError) as excinfo:
        stream.next()

    assert 'SourcePosition(' in repr(excinfo.value)
def test_regex_flags(self):
    lg = LexerGenerator()
    lg.add("ALL", r".*", re.DOTALL)
    l = lg.build()

    stream = l.lex("test\ndotall")
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 1
    assert t.getstr() == "test\ndotall"
    with raises(StopIteration):
        stream.next()
def test_repr(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    l = lg.build()

    stream = l.lex("2 + 3")
    assert str(stream) is not None
    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "2"
    assert str(stream) is not None
    t = stream.next()
    assert t.name == "PLUS"
def test_newline_position(self):
    lg = LexerGenerator()
    lg.add("NEWLINE", r"\n")
    lg.add("SPACE", r" ")
    l = lg.build()

    stream = l.lex(" \n ")
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 1
    t = stream.next()
    assert t.source_pos.lineno == 1
    assert t.source_pos.colno == 2
    t = stream.next()
    assert t.source_pos.lineno == 2
    assert t.source_pos.colno == 1
def test_simple(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    l = lg.build()

    stream = l.lex("2+3")
    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "2"
    t = stream.next()
    assert t.name == "PLUS"
    assert t.value == "+"
    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "3"
    assert t.source_pos.idx == 2
    t = stream.next()
    assert t is None
def test_ignore(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    l = lg.build()

    stream = l.lex("2 + 3")
    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "2"
    t = stream.next()
    assert t.name == "PLUS"
    assert t.value == "+"
    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "3"
    assert t.source_pos.idx == 4
    with raises(StopIteration):
        stream.next()
def test_arithmetic(self):
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.add("TIMES", r"\*")

    pg = ParserGenerator(["NUMBER", "PLUS", "TIMES"], precedence=[
        ("left", ["PLUS"]),
        ("left", ["TIMES"]),
    ])

    @pg.production("main : expr")
    def main(p):
        return p[0]

    @pg.production("expr : expr PLUS expr")
    @pg.production("expr : expr TIMES expr")
    def expr_binop(p):
        return BoxInt({
            "+": operator.add,
            "*": operator.mul
        }[p[1].getstr()](p[0].getint(), p[2].getint()))

    @pg.production("expr : NUMBER")
    def expr_num(p):
        return BoxInt(int(p[0].getstr()))

    lexer = lg.build()
    parser = pg.build()

    assert parser.parse(lexer.lex("3*4+5"))
def __init__(self):
    _lg = LexerGenerator()
    for r in grammar:
        _lg.add(r[0], r[1])
    _lg.ignore(r'\s+')
    self._scanner = _lg.build()
def lexer_from_mapping(mapping):
    lg = LexerGenerator()
    # Escape data with forward slashes
    lg.add("DATA", r'/.+?/')
    # Add the special characters
    for char in mapping.keys():
        lg.add(char, r"\\" + char)
    # Normal tokens
    lg.add("TYPE", r':')
    lg.add("AND", r'\&')
    lg.add("OR", r'\|')
    lg.add("L_PAREN", r'\(')
    lg.add("R_PAREN", r'\)')
    lg.add("EQUAL", r'=')
    lg.add("CHILD", r'>')
    lg.add("PARENT", r'<')
    lg.add("NOT", r'!')
    # Everything else is data
    excluded_chars = r'^<>=&|():!'
    for char in mapping.keys():
        excluded_chars += r"\\" + char
    lg.add("DATA", "[{excluded}]+".format(excluded=excluded_chars))
    lg.ignore(r'\s+')
    lexer = lg.build()
    return lexer
def add_infix_1_macro_name(macro_name): infix_1_macro_names.append(macro_name) def add_infix_2_macro_name(macro_name): infix_2_macro_names.append(macro_name) def add_user_defined_keyword(keyword): user_defined_keywords.append(keyword) lg = LexerGenerator() lg.add( 'TQUOTE_STR', r'(?x)"""(?:|[^\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"""' ) lg.add( 'SQUOTE_STR', r"(?x)'(?:|[^'\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*'" ) lg.add( 'DQUOTE_STR', r'(?x)"(?:|[^"\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"' ) lg.add( 'TQUOTE_RAW_STR', r'(?x)r"""(?:|[^\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"""' ) lg.add(
class Lexer: def __init__(self): self.lexer = LexerGenerator() self.__add_tokens() def __add_tokens(self): # Constant self.lexer.add('E', r'-?__E__') self.lexer.add('PI', r'-?__PI__') self.lexer.add('FLOAT', r'-?\d+\.\d+') self.lexer.add('INTEGER', r'-?\d+') self.lexer.add('STRING', r'(""".*""")|(".*")|(\'.*\')') self.lexer.add( 'BOOLEAN', r'true(?!\w)|false(?!\w)|True(?!\w)|False(?!\w)|TRUE(?!\w)|FALSE(?!\w)' ) # Mathematical Operators self.lexer.add('SUM', r'\+') self.lexer.add('SUB', r'\-') self.lexer.add('MUL', r'\*') self.lexer.add('DIV', r'\/') # Binary Operator self.lexer.add('AND', r'and(?!\w)') self.lexer.add('OR', r'or(?!\w)') self.lexer.add('==', r'\=\=') self.lexer.add('!=', r'\!\=') self.lexer.add('>=', r'\>\=') self.lexer.add('<=', r'\<\=') self.lexer.add('>', r'\>') self.lexer.add('<', r'\<') self.lexer.add('=', r'\=') # Statement self.lexer.add('IF', r'if(?!\w)') self.lexer.add('ELSE', r'else(?!\w)') self.lexer.add('NOT', r'not(?!\w)') # Semi Colon self.lexer.add(';', r'\;') self.lexer.add(',', r'\,') # Parenthesis self.lexer.add('(', r'\(') self.lexer.add(')', r'\)') self.lexer.add('{', r'\{') self.lexer.add('}', r'\}') # Function self.lexer.add('CONSOLE_INPUT', r'input') self.lexer.add('FUNCTION', r'function') self.lexer.add('PRINT', r'print') self.lexer.add('ABSOLUTE', r'abs') self.lexer.add('SIN', r'sin') self.lexer.add('COS', r'cos') self.lexer.add('TAN', r'tan') self.lexer.add('POWER', r'pow') # Assignment self.lexer.add('LET', r'let(?!\w)') self.lexer.add('IDENTIFIER', "[a-zA-Z_][a-zA-Z0-9_]*") # Ignore spaces self.lexer.ignore('\s+') # self.lexer.add('OPT_LINE', r'\n*') def build(self): return self.lexer.build()
from rply import LexerGenerator

lg = LexerGenerator()
lg.add("PLUS", r"\+")
lg.add("MINUS", r"-")
lg.add("MUL", r"\*")
lg.add("DIV", r"/")
lg.add("NUMBER", r"\d+")
lg.ignore(r"\s+")

lexer = lg.build()
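# A minimal usage sketch, not part of the original snippet: iterating the
# stream returned by lexer.lex() yields rply Token objects in match order.
if __name__ == "__main__":
    for token in lexer.lex("1 + 2 * 3"):
        print(token.name, token.value)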
from rply import ParserGenerator, LexerGenerator import box lg = LexerGenerator() lg.add("LPAREN", r"\(") lg.add("RPAREN", r"\)") lg.add("QUOTE", r"'") lg.add("ATOM", r"[^\s()]+") lg.ignore(r"\s+") pg = ParserGenerator(["QUOTE", "LPAREN", "RPAREN", "ATOM"], precedence=[], cache_id="wasp") @pg.error def error_handler(token): type = token.gettokentype() pos = token.getsourcepos() if pos is None: raise ValueError("unexpected %s" % type) else: raise ValueError("unexpected %s at (%s, %s)" % (type, pos.lineno, pos.colno)) @pg.production("main : sexpr") def main(p): return p[0]
from rply import ParserGenerator, LexerGenerator from rply.token import BaseBox class BoxString(BaseBox): def __init__(self, value): self.value = value def getstr(self): return self.value lg = LexerGenerator() lg.add('GT', r'\bgt\b') lg.add('GE', r'\bge\b') lg.add('LT', r'\blt\b') lg.add('LE', r'\ble\b') lg.add('EQ', r'\beq\b') lg.add('NE', r'\bne\b') lg.add('IS', r'\bis\b') lg.add('LIKE', r'\blike\b') lg.add('AND', r'\band\b') lg.add('OR', r'\bor\b') lg.add('NOT', r'\bnot\b') lg.add('NONE', r'\bnull\b') lg.add('OPEN_PARENS', r'\(') lg.add('CLOSE_PARENS', r'\)') lg.add('NUMBER', r'[\d]{1,99}([.]\d{1,99})?')
] operators = OrderedDict([ ("COMMA", ","), ("PAREN_L", r"\("), ("PAREN_R", r"\)"), ("ASSIGN", "<-"), ("MULTIPLY", r"\*"), ("DIVIDE", r"/"), ("PLUS", r"\+"), ("MINUS", r"-"), ]) lg = LexerGenerator() lg.add("NUM", r"\d+") lg.add("ID", r"[a-zA-Z][a-zA-Z0-9]*") for key, value in operators.items(): lg.add(key, value) def id_reserved(token): if token.value.lower() in reserved: return Token(token.value.upper(), token.value) return token callbacks = { "ID": [id_reserved], } # type: Dict[str, List[Callable[[Token], Token]]]
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): # Print self.lexer.add('PRINT', r'print') # Parenthesis self.lexer.add('OPEN_PAREN', r'\(') self.lexer.add('CLOSE_PAREN', r'\)') self.lexer.add('WQ', r'\"') # Operators self.lexer.add('SUM', r'\+') self.lexer.add('SUB', r'\-') self.lexer.add('MUP', r'\*') self.lexer.add('DIV', r'\/') self.lexer.add('EQUAL', r'=') self.lexer.add('GREATER', r'\>') self.lexer.add('SMALLER', r'\<') self.lexer.add('NOT', r'\!') # semi colon self.lexer.add('SEMI_COLON', r'\;') #logical self.lexer.add('AND', r'and') self.lexer.add('OR', r'or') # colon self.lexer.add('COLON', r'\:') # comma self.lexer.add('COMMA', ',') # Number self.lexer.add('NUMBER', r'\d+') #String self.lexer.add('STRING', r'\w*') # Variables # right now all the alphabets written here <a n d o r i f e l s :> will not be accecpted #AND if i replace round brackets with square then if else will not work self.lexer.add('VAR', r'[^(and)|(or)|(if)|(else)|(:) ]\w*') # Ignore spaces self.lexer.ignore('\s+') #if else self.lexer.add('IF', r'if') self.lexer.add('ELSE', r'else') def get_lexer(self): self._add_tokens() return self.lexer.build()
from rply import LexerGenerator lg = LexerGenerator() lg.add("BEGIN_STATEMENT", r"begin") lg.add("END_STATEMENT", r"end") lg.add("FLOAT_LITERAL", r"\d+\.\d+") lg.add("INTEGER_LITERAL", r"[0-9]+") lg.add("STRING_LITERAL", r"\"[^\"]*\"") lg.add("CHARACTER_LITERAL", r"\'.\'") lg.add("BOOLEAN_LITERAL", r"(true)|(false)") lg.add("FLOAT_TYPENAME", r"Float") lg.add("INTEGER_TYPENAME", r"Integer") lg.add("STRING_TYPENAME", r"String") lg.add("CHARACTER_TYPENAME", r"Character") lg.add("BOOLEAN_TYPENAME", r"Boolean") lg.add("PLUS", r"\+") lg.add("MINUS", r"\-") lg.add("MULTIPLICATION", r"\*") lg.add("CONCATENATION", r"&") lg.add("DIVISION", r"/") lg.add("MODULO", r"mod") lg.add("ASSIGNMENT", r":=") lg.add("TYPE_DECLARATION", r":") lg.add("EQUALS", r"=") lg.add("LPAREN", r"\(") lg.add("RPAREN", r"\)") lg.add("LBRACE", r"\{") lg.add("RBRACE", r"\}") lg.add("IF_CONDITIONAL", r"if") lg.add("THEN_CONDITIONAL", r"then")
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. from rply import LexerGenerator lg = LexerGenerator() # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly end_quote = r'(?![\s\)\]\}])' lg.add('LPAREN', r'\(') lg.add('RPAREN', r'\)') lg.add('LBRACKET', r'\[') lg.add('RBRACKET', r'\]') lg.add('LCURLY', r'\{') lg.add('RCURLY', r'\}') lg.add('QUOTE', r'\'%s' % end_quote) lg.add('QUASIQUOTE', r'`%s' % end_quote) lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) lg.add('UNQUOTE', r'~%s' % end_quote) lg.add('HASHBANG', r'#!.*[^\r\n]') lg.add('HASHREADER', r'#.') lg.add( 'STRING', r'''(?x) (?:u|r|ur|ru)? # prefix
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): # Parentheses self.lexer.add('OPEN_PAREN', r'\(') self.lexer.add('CLOSE_PAREN', r'\)') # definitions self.lexer.add('DEF_NOT', r'def \~') self.lexer.add('DEF_IMPLIE', r'def \->') self.lexer.add('DEF_AND', r'def \&') self.lexer.add('DEF_OR', r'def \|') self.lexer.add('DEF_IFF', r'def \<->') self.lexer.add('DEF_BASE', r'def A') # conectives self.lexer.add('NOT', r'\~') self.lexer.add('IMPLIE', r'\->') self.lexer.add('AND', r'\&') self.lexer.add('OR', r'\|') self.lexer.add('IFF', r'\<->') #hifen self.lexer.add('HYPHEN', r'\-') #dot self.lexer.add('DOT', r'\.') self.lexer.add('COMMA', r'\,') # Number self.lexer.add('NUMBER', r'\d+') # Atomo self.lexer.add('ATHOM', r'[a-zA-Z][a-zA-Z0-9]*') # Ignore spaces self.lexer.ignore('\s+') def get_lexer(self): self._add_tokens() return self.lexer.build()
from __future__ import print_function import re import ast import collections from rply import ParserGenerator, LexerGenerator from ytypes import * lg = LexerGenerator() SYMBOL_RE = r"[\.\*\+\!\-\_\?\$%&=a-zA-Z][\.\*\+\!\-\_\?\$%&=a-zA-Z0-9:#]*" NS_SYMBOL = SYMBOL_RE + "/" + SYMBOL_RE lg.add("boolean", r"(true|false)") lg.add("nil", r"nil") lg.add("float", r"\d+\.\d+") lg.add("number", r"[-+]?\d+") lg.add("olist", r"\(") lg.add("clist", r"\)") lg.add("omap", r"{") lg.add("cmap", r"}") lg.add("ovec", r"\[") lg.add("cvec", r"\]") lg.add("oset", r"#{") lg.add("colon", r":") lg.add("char_nl", r"\\newline") lg.add("char_tab", r"\\tab") lg.add("char_return", r"\\return") lg.add("char_space", r"\\space") lg.add("char", r"\\.")
'MINUS': r'-', 'MUL': r'\*', 'NUMBER_SEP': r'/', 'EXPR_OPEN': r'\(', 'EXPR_CLOSE': r'\)', 'AND': r'&', 'OR': r'\|', 'NOT': r'!', 'EQ': r'\?\s*=', 'GT': r'>', 'LT': r'<', 'BOWL': r':', 'BOWL_OPEN': r'{', 'BOWL_CLOSE': r'}', 'NOODLE_OPEN': r'\[', 'NOODLE_SEP': r';', 'NOODLE_CLOSE': r'\]', 'ASSIGN': r'=', 'DENO': r'\^', 'MEM': r'@', } lg = LexerGenerator() for name, regex in op_map.items(): lg.add(name, regex) lg.ignore('\s+') lg.ignore('~\s*#((?!#~).)*#\s*~') lexer = lg.build()
def test_states(self): lg = LexerGenerator(initial_state="scalar") lg.add("NUMBER", r"\d+") lg.add("PLUS", r"\+") lg.ignore(r"\s+") lg.add("OPEN_BRACKET", r"\[", to_state="vector") lg.add("PLUS", r"\+", state="vector") lg.add("NUMBER", r"\d+", state="vector") lg.add("NEW_LINE", r"\n+", state="vector") lg.add("CLOSE_BRACKET", r"\]", state="vector", to_state="scalar") lg.ignore(r" +", state="vector") l = lg.build() stream = l.lex("2 + [ 3 + 4 \n\n 5 + 6 ] + 7") tokens = [ ("NUMBER", "2", "scalar"), ("PLUS", "+", "scalar"), ("OPEN_BRACKET", "[", "scalar"), ("NUMBER", "3", "vector"), ("PLUS", "+", "vector"), ("NUMBER", "4", "vector"), ("NEW_LINE", "\n\n", "vector"), ("NUMBER", "5", "vector"), ("PLUS", "+", "vector"), ("NUMBER", "6", "vector"), ("CLOSE_BRACKET", "]", "vector"), ("PLUS", "+", "scalar"), ("NUMBER", "7", "scalar"), ] for compare_token, token in zip(tokens, stream): name, value, state = compare_token assert token.name == name assert token.value == value assert token.state == state
class Lexer():
    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        # Print
        self.lexer.add('IN_RA', r'in_ra')
        # Parenthesis
        self.lexer.add('MO_NGOAC_TRON', r'\(')
        self.lexer.add('DONG_NGOAC_TRON', r'\)')
        # End of statement
        # self.lexer.add('HET_DONG', r'\;')
        self.lexer.add('HET_DONG', r'(\n)|(\r\n)')
        # Operators
        self.lexer.add('CONG', r'\+')
        self.lexer.add('TRU', r'\-')
        self.lexer.add('NHAN', r'\*')
        self.lexer.add('CHIA', r'\/')
        # Comparisons
        self.lexer.add('BANG', r'\=\=')
        self.lexer.add('LON_HON', r'\>')
        self.lexer.add('NHO_HON', r'\<')
        self.lexer.add('KHAC', r'\!\=')
        # Number
        self.lexer.add('SO_NGUYEN', r'\d+')
        # Ignore spaces
        self.lexer.ignore(r'(^\s+)|( )+|\t+')

    def get_lexer(self):
        self._add_tokens()
        return self.lexer.build()
def __init__(self): lg = LexerGenerator() tokens = [ ("PROTO", r"[a-zA-Z]+://[^ ]+"), ("INT", r"\d+"), ("STRING", r"'[^']+'|\"[^\"]+\""), ("NAME", r"--colors=always"), ("PATH", r"([a-zA-Z0-9/._-]|\\ )+"), ("PATH", r"~([a-zA-Z0-9/._-]|\\ )*"), ("NAME", r"([a-zA-Z0-9_-]|\\ )+"), ("SEMICOLON", r";"), ("ENDL", r"\r?\n"), ] for token in tokens: lg.add(*token) lg.ignore(r"[ ]+") pg = ParserGenerator([x[0] for x in tokens]) @pg.production("main : statements") def main(args): return args[0] @pg.production("statements : statement") def statements_one(args): expression, = args return { "type": "statement", "content": expression, } @pg.production("statements : statement separator statements") def statements_many(args): statement, separtor, statements = args return { "type": "statement_infix_operator", "content": { "left": { "type": "statement", "content": statement, }, "right": statements, "operator": separtor, } } @pg.production("separator : SEMICOLON") @pg.production("separator : ENDL") def separator(args): # don't care return args[0].value @pg.production("statement : atom") def expression_one(args): atom, = args return [atom] @pg.production("statement : atom atoms") def expression_many(args): atom, atoms = args return [atom] + atoms @pg.production("atoms : atom") def atoms_one(args): atom, = args return [atom] @pg.production("atoms : atom atoms") def atoms_many(args): atom, atoms = args return [atom] + atoms @pg.production("atom : NAME") @pg.production("atom : INT") @pg.production("atom : STRING") @pg.production("atom : PATH") @pg.production("atom : PROTO") def atom(args): name, = args return name.value self.pg = pg self.lg = lg self.lexer = self.lg.build() self.parser = self.pg.build()
from rply import LexerGenerator

lg = LexerGenerator()
lg.add('LSLASHANGLE', r'</')
lg.add('RSLASHANGLE', r'/>')
lg.add('LANGLE', r'<')
lg.add('RANGLE', r'>')
lg.add('LSQUARE', r'\[')
lg.add('RSQUARE', r'\]')
lg.add('EQUAL', r'=')
lg.add('STRING', r'''(?x)
    (r)?
    "
    [^"]*
    "
    ''')
lg.add('IDENTIFIER', r'[^<>\[\]{}=/\s"]+')

lg.ignore(r'<!--(.|\s)*-->')
lg.ignore(r'\s+')

lexer = lg.build()
def __eq__(self, other): if type(other) is not Keyword: return False if self.name == other.name: return True else: return False def __hash__(self): return self.name.__hash__() lg = LexerGenerator() lg.add('SQUOTE_STR', r"(?x)'(?:|[^'\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*'") lg.add('DQUOTE_STR', r'(?x)"(?:|[^"\\]|\\.|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"') lg.add('UNTERMINATED_STRING', r"[\"\'].*") lg.add('NUMBER', r'-?[0-9]+(?:\.[0-9]+)?') lg.add('NAME', r'\&?[_a-zA-Z$][-_a-zA-Z0-9]*') lg.add('PIPELINE_FIRST_BIND', r'\|>1\?') lg.add('PIPELINE_FIRST', r'\|>1') lg.add('PIPELINE_BIND', r'\|>\?') lg.add('PIPELINE', r'\|>') lg.add('PIPELINE_SEND', r'!>') lg.add('PIPELINE_MULTI_SEND', r'!&>') lg.add('BAR', r'\|') lg.add('LBRACK', r'\[') lg.add('RBRACK', r'\]') lg.add('LBRACE', r'\{')
"!=", "<=", ">=", "<", ">", "=", ",", "+", "-", ";", "*", "/", "%", ] lg = LexerGenerator() lg.add("INCLUDE", "#include") lg.add("ASM", "__asm__") lg.add("ASM", "asm") lg.add("FLOAT_LITERAL", "\d+\.\d+") lg.add("INTEGER_LITERAL", "\d+") lg.add("CHAR_LITERAL", "'\\\\?.'") lg.add("STRING_LITERAL", "\".*\"") lg.add("CHAR", "char") lg.add("SHORT", "short") lg.add("INT", "int") lg.add("LONG", "long") lg.add("FLOAT", "float") lg.add("DOUBLE", "double") lg.add("null", "NULL") lg.add("CONST", "const") lg.add("UNSIGNED", "unsigned")
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): self.lexer.ignore('/\*((.|[\\r\\n])*?)\*/') self.lexer.ignore('//.*\\n') self.lexer.ignore('\s+') # Ignore spaces self.lexer.add('NUMBER', r'\d+') self.lexer.add('STRING_CONST', r'"(.*)"') self.lexer.add('OPEN_PAREN', r'\(') self.lexer.add('CLOSE_PAREN', r'\)') self.lexer.add('OPEN_CURLY_PAREN', r'\{') self.lexer.add('CLOSE_CURLY_PAREN', r'\}') self.lexer.add('OPEN_INDEX_PAREN', r'\[') self.lexer.add('CLOSE_INDEX_PAREN', r'\]') self.lexer.add('DOT', r'\.') self.lexer.add('COMMA', r'\,') self.lexer.add('SEMI_COLON', r'\;') self.lexer.add('SUM', r'\+') self.lexer.add('SUB', r'\-') self.lexer.add('MUL', r'\*') self.lexer.add('DIV', r'/') self.lexer.add('AND', r'&') self.lexer.add('OR', r'\|') self.lexer.add('LT', r'<') self.lexer.add('GT', r'>') self.lexer.add('EQUAL', r'=') self.lexer.add('NOT', '~') self.lexer.add('CLASS', r'class') self.lexer.add('CONSTRUCTOR', r'constructor') self.lexer.add('FUNCTION', r'function') self.lexer.add('METHOD', r'method') self.lexer.add('FIELD', r'field') self.lexer.add('STATIC', r'static') self.lexer.add('VAR', r'var') self.lexer.add('INT', r'int') self.lexer.add('CHAR', r'char') self.lexer.add('BOOLEAN', r'boolean') self.lexer.add('VOID', r'void') self.lexer.add('TRUE', r'true') self.lexer.add('FALSE', r'false') self.lexer.add('NULL', r'null') self.lexer.add('THIS', r'this') self.lexer.add('LET', r'let') self.lexer.add('DO', r'do ') self.lexer.add('IF', r'if') self.lexer.add('ELSE', r'else') self.lexer.add('WHILE', r'while') self.lexer.add('RETURN', r'return') self.lexer.add('IDENTIFIER', r'[A-Za-z_](\w*)') def get_lexer(self): self._add_tokens() return self.lexer.build()
'OP_NOT_LIKE', 'OP_IN', 'OP_NOT_IN', 'OP_LSHIFT', 'OP_RSHIFT', ] SINGLE_OPERATORS = [ 'OP_NOT', 'OP_BITWISE_NOT', 'OP_ABSOLUTE', 'OP_ADD', 'OP_SUB', ] lg.add('SELECT', r'SELECT\b', flags=re.IGNORECASE) lg.add('FROM', r'FROM\b', flags=re.IGNORECASE) lg.add('AS', r'AS\b', flags=re.IGNORECASE) lg.add('WHERE', r'WHERE\b', flags=re.IGNORECASE) lg.add('LIMIT', r'LIMIT\b', flags=re.IGNORECASE) lg.add('OFFSET', r'OFFSET\b', flags=re.IGNORECASE) lg.add('GROUP_BY', r'GROUP\s+BY\b', flags=re.IGNORECASE) lg.add('FLOAT', r'[+-]*(\d*\.\d+|\d+\.)') lg.add('INTEGER', r'[+-]*\d+') lg.add('STRING', r"'(\\'|[^'])+'") lg.add('BOOL', r'TRUE\b|YES\b|NO\b|FALSE\b', flags=re.IGNORECASE) lg.add('NULL', r'NULL\b', flags=re.IGNORECASE) lg.add('PAREN_LEFT', r'\(') lg.add('PAREN_RIGHT', r'\)') lg.add('BRACKET_LEFT', r'\[') lg.add('BRACKET_RIGHT', r'\]')
class Lexer():
    def __init__(self, input=None):
        # Initialize the lexer
        self.lexer = LexerGenerator()
        self._initialize_tokens()
        self.built_lexer = self.lexer.build()
        self.tokens = None
        self.valid_tokens = []
        self.char = 0
        self.line = 0
        self.token_pos = 0
        # Try to parse the input, if there is any
        if input:
            self.input(input)

    # Add all tokens to the lexer
    def _initialize_tokens(self):
        self.lexer.add('KW_ARRAY', r'array')
        self.lexer.add('OP_DOTDOT', r'\.\.')
        self.lexer.add('LBRAK', r'\[')
        self.lexer.add('RBRAK', r'\]')
        self.lexer.add('SEMI', r'\;')
        self.lexer.add('KW_TUPLE', r'tuple')
        self.lexer.add('KW_LOCAL', r'local')
        self.lexer.add('KW_GLOBAL', r'global')
        self.lexer.add('KW_DEFUN', r'defun')
        self.lexer.add('LPAR', r'\(')
        self.lexer.add('RPAR', r'\)')
        self.lexer.add('OP_COMMA', r'\,')
        self.lexer.add('KW_END', r'end')
        self.lexer.add('KW_WHILE', r'while')
        self.lexer.add('KW_DO', r'do')
        self.lexer.add('KW_IF', r'if')
        self.lexer.add('KW_THEN', r'then')
        self.lexer.add('KW_ELSIF', r'elsif')
        self.lexer.add('KW_ELSE', r'else')
        self.lexer.add('KW_FOREACH', r'foreach')
        self.lexer.add('KW_FOR', r'for')
        self.lexer.add('KW_IN', r'in')
        self.lexer.add('OP_DOT', r'\.')
        self.lexer.add('INT_LIT', r'\d+')
        self.lexer.add('RETURN', r'return')
        self.lexer.add('PRINT', r'print')
        self.lexer.add('EXCHANGE', r'\<\-\>')
        self.lexer.add('OP_LESSEQUAL', r'\<\=')
        self.lexer.add('OP_GREATEREQUAL', r'\>\=')
        self.lexer.add('OP_LESS', r'\<')
        self.lexer.add('OP_GREATER', r'\>')
        self.lexer.add('OP_EQUAL', r'\=\=')
        self.lexer.add('OP_NOTEQUA', r'\!\=')
        self.lexer.add('ASSIGN', r'\=')
        self.lexer.add('OP_PLUS', r'\+')
        self.lexer.add('OP_MINUS', r'\-')
        # self.lexer.add('COMMENT', r'\*\*\*.*[^\r\n]')
        self.lexer.add('OP_MULT', r'\*')
        self.lexer.add('OP_DIV', r'\/')
        self.lexer.add('ID', r'[A-Za-z_]+')
        # self.lexer.add('END-OF-LINE', r'\r\n|\n\r|\r|\n')
        # self.lexer.add('WS', r'\s+')
        self.lexer.add('UNKNOWN', r'.')
        # Ignore comments for now
        self.lexer.ignore(r'\*\*\*.*[^\r\n]')
        self.lexer.ignore(r'\r\n|\n\r|\r|\n')
        self.lexer.ignore(r'\s+')

    # Make the lexer lex an input
    def input(self, input):
        self.char = 0
        self.line = 0
        self.token_pos = 0
        self.tokens = [i for i in self.built_lexer.lex(input)]
        self.valid_tokens = []
        # Iteratively lex the input
        token = self._next()
        while token:
            # When the token is an ID and it is too long, truncate it
            if token.name == "ID" and len(token.value) > 80:
                truncated = token.value[:80]
                print("ERROR: ID " + token.value + " is too long, truncated to " + truncated)
                token.value = truncated
            if token.name == "INT_LIT" and (int(token.value) > 2147483647
                                            or int(token.value) < -2147483648):
                print("ERROR: " + token.value + " does not fit in INT_LIT. "
                      + "The proper range is [-2147483648, 2147483647]")
                token.value = "0"
            self.valid_tokens.append(token)
            token = self._next()
        # Reset the value of this counter for further use
        self.token_pos = 0
        return self.valid_tokens

    # Recursively return the next valid token in the token list
    def _next(self):
        if self.token_pos < len(self.tokens):
            token = self.tokens[self.token_pos]
            if token.name != "WS" and token.name != "END-OF-LINE"\
                    and token.name != "UNKNOWN":
                char_pos = self.char
                self.char += len(token.value)
                self.token_pos += 1
                return Token(token.name, token.value, self.line, char_pos)
            elif token.name == "WS":
                self.char += len(token.value)
                self.token_pos += 1
                return self._next()
            elif token.name == "END-OF-LINE":
                self.line += 1
                self.char = 0
                self.token_pos += 1
                return self._next()
            elif token.name == "UNKNOWN":
                print("ERROR: " + token.value + " is not a valid token")
                self.char += len(token.value)
                self.token_pos += 1
                return self._next()
        else:
            return None

    # Advance the counter and return the next token
    def next(self):
        pos = self.token_pos
        if pos < len(self.valid_tokens):
            self.token_pos += 1
            return self.valid_tokens[pos]
        else:
            return None

    # Return the next token without advancing the counter
    def peek(self):
        pos = self.token_pos
        if pos < len(self.valid_tokens):
            return self.valid_tokens[pos]
        else:
            return None
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): # Print self.lexer.add('PRINT', r'[Ww]ypisz na ekranie') self.lexer.add("FORMAT", r'\w puste miejsce wpisz ') self.lexer.add("SEPARATOR", r'oraz') # Parenthesis self.lexer.add('OPEN_PAREN', r'\(') self.lexer.add('CLOSE_PAREN', r'\)') ### #if ### self.lexer.add('IF', r'[Jj]eżeli') self.lexer.add('ELSE', r'W przeciwnym razie') self.lexer.add('START_BLOCK', r'to') self.lexer.add('END_BLOCK', r'Tyle') ### #for ### # Semi Colon self.lexer.add('DOT', r'\.') # Operators +,- self.lexer.add('SUM', r'\+') self.lexer.add('SUB', r'\-') # Number self.lexer.add('NUMBER', r'\d+') # self.lexer.add('INCREMENT', r'Zwiększ') # self.lexer.add('DECREMENT', r'Zmniejsz') # +,- operations helper self.lexer.add('ADDSUB_HELPER', r'o') # *,/ operations helper self.lexer.add('DIVMUL_HELPER', r'przez') # assignment self.lexer.add('ASSIGN', r'jest równe') #bigger > self.lexer.add('BIGGER', r'jest wieksze od') #smaller < self.lexer.add('SMALLER', r'jest mniejsze od') #equal == self.lexer.add('EQUAL', r'równa się') #!= differ self.lexer.add('DIFFER', r'jest różne od') #variable name, ex. response_time or latencySegID self.lexer.add('VARIABLE', r'(_|[a-zA-Z])(_|[a-zA-Z]|[0-9])*') self.lexer.add("STRING", r'\"[^\"]*\"') self.lexer.add("COMMA", r'\,') # Ignore spaces self.lexer.ignore('\s') def get_lexer(self): self._add_tokens() return self.lexer.build()
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. from rply import LexerGenerator lg = LexerGenerator() # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly end_quote = r'(?![\s\)\]\}])' lg.add('LPAREN', r'\(') lg.add('RPAREN', r'\)') lg.add('LBRACKET', r'\[') lg.add('RBRACKET', r'\]') lg.add('LCURLY', r'\{') lg.add('RCURLY', r'\}') lg.add('QUOTE', r'\'%s' % end_quote) lg.add('QUASIQUOTE', r'`%s' % end_quote) lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) lg.add('UNQUOTE', r'~%s' % end_quote) lg.add('HASHBANG', r'#!.*[^\r\n]') lg.add('STRING', r'''(?x) (?:u|r|ur|ru)? # prefix " # start string
from rply import LexerGenerator try: import rpython.rlib.rsre.rsre_re as re except: import re lg = LexerGenerator() # build up a set of token names and regexes they match lg.add('FLOAT', '-?\d+\.\d+') lg.add('INTEGER', '-?\d+') lg.add('STRING', '(""".*?""")|(".*?")|(\'.*?\')') # lg.add('PRINT', 'print(?!\w)') # put this before variable which would otherwise match lg.add('BOOLEAN', "true(?!\w)|false(?!\w)") lg.add('IF', 'if(?!\w)') lg.add('ELSE', 'else(?!\w)') lg.add('END', 'end(?!\w)') lg.add('AND', "and(?!\w)") lg.add('OR', "or(?!\w)") lg.add('NOT', "not(?!\w)") lg.add('LET', 'let(?!\w)') lg.add('FOR', 'for(?!\w)') lg.add('WHILE', 'while(?!\w)') lg.add('BREAK', 'break(?!\w)') lg.add('CONTINUE', 'continue(?!\w)') lg.add('MATCH', 'match(?!\w)') lg.add('ENUM', 'enum(?!\w)') lg.add('NEW', 'new(?!\w)') lg.add('RETURN', 'return(?!\w)') lg.add('TYPE', 'type(?!\w)')
("GT", r">"), # Punctuation ("LPAREN", r"\("), ("RPAREN", r"\)"), ("LBRACE", r"{"), ("RBRACE", r"}"), ("COMMA", r","), ("LBRACK", r"\["), ("RBRACK", r"\]"), # Literals ("TRUE", r"true\b"), ("FALSE", r"false\b"), ("FLOAT", r"(((0|[1-9][0-9]*)(\.[0-9]*)+)|(\.[0-9]+))([eE][\+\-]?[0-9]*)?"), ("INTEGER", r"-?(0|[1-9][0-9]*)"), ("STRING", r"\"([^\"\\]|\\.)*\""), ("IDENTIFIER", r"[a-zA-Z_$][a-zA-Z_0-9]*"), # Others ("EQUAL", r"="), ] tokens = get_tokens() for token in tokens: lexer_gen.add(token[0], token[1]) LEXER = lexer_gen.build() def get_lexer(): return LEXER
class Lexer():
    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        self.lexer.add('PRINT', r'print')
        self.lexer.add('OPEN_PAREN', r'\(')
        self.lexer.add('CLOSE_PAREN', r'\)')
        self.lexer.add('SEMI_COLON', r'\;')
        self.lexer.add('SUM', r'\+')
        self.lexer.add('SUB', r'\-')
        self.lexer.add('MUL', r'\*')
        self.lexer.add('DIV', r'\/')
        self.lexer.add('NUMBER', r'\d+')
        self.lexer.ignore(r'\s+')

    def get_lexer(self):
        self._add_tokens()
        return self.lexer.build()
from rply import ParserGenerator, LexerGenerator lg = LexerGenerator() lg.add("INTEGER", r"-?0|([1-9][0-9]*)") lg.add("DECIMAL", r"\.[0-9]+") _id = r"[A-Za-z][A-Za-z0-9_]*" lg.add("KEYWORD", _id + r":") lg.add("IDENTIFIER", _id) lg.add("SYMBOL", r":" + _id) _comment = r"[ \t]*#[^\n]*" lg.add("COMMENT", _comment) lg.add("LPAREN", r"\([ \t\n]*") lg.add("RPAREN", r"[ \t\n]*\)") # TODO: Maybe clear this up to be prettier. lg.add("PREFACE", r"<[A-Za-z0-9_:@, \t\n]+>[ \t\n]*") lg.add("LBRACK", r"{[ \t\n]*") lg.add("RBRACK", r"[ \t\n]*}") lg.add("VERT", r"\|[ \t\n]*") lg.add("LSQ", r"\[[ \t\n]*") lg.add("RSQ", r"[ \t\n]*\]") lg.add("CONT", r"[ \t]+\\(" + _comment + r")?\n[ \t]*") lg.add("SWS", r"[ \t]+") lg.add("COMMA", ",[ \t\n]*") lg.add("TERMINAL", r"[ \t]*\n[ \t\n]*") # TODO: Make strings parsing not suck dick. lg.add("STRING", r"\"[^\"]*\"") # TODO: Finalize operators lg.add("OPERATOR", r"[+\-=*/\^]")
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): # Number self.lexer.add('NUMBER', r'\d+') # Image self.lexer.add('IMAGE', r'[^\s]+(\.(?i)(jpg|png|gif|bmp|jpeg))') # Position self.lexer.add('POSITION', r'position') # Scale self.lexer.add('SCALE', r'scale') # Move self.lexer.add('MOVE', r'move') # Dimensions self.lexer.add('DIMENSIONS', r'dimensions') # Total self.lexer.add('TOTAL', r'total') # Print self.lexer.add('PRINT', r'print') # Parenthesis self.lexer.add('OPEN_PAREN', r'\(') self.lexer.add('CLOSE_PAREN', r'\)') # Comma separator self.lexer.add('COMMA', r'\,') # Ignore spaces self.lexer.ignore('\s+') def get_lexer(self): self._add_tokens() return self.lexer.build()
from rply import LexerGenerator

lg = LexerGenerator()
lg.add("STEP", r"s")
lg.add("TURN_LEFT", r"l")
lg.add("TURN_RIGHT", r"r")
lg.add("FUNC", r"a|b|c|d|e|f|g|h|i|j|k|m|n|o|p|q|t|u|v|w|x|y|z")
lg.add("COLON", r"\:")
lg.add("NEWLINE", r"\n+ *\n*")
lg.add("NAME", r"[A-Z]")
lg.add("NUMBER", r"\d+")
lg.add("PLUS", r"\+")
lg.add("MINUS", r"\-")
lg.add("(", r"\(")
lg.add(")", r"\)")
lg.add(",", r"\,")
lg.ignore(r" +")
lg.ignore(r"\#.*")

TOKENS = [r.name for r in lg.rules]

lexer = lg.build()
def build_lexer():
    lg = LexerGenerator()
    commands = sorted(
        itertools.chain(common_commands, lmao_commands, rofl_commands))
    for command in reversed(commands):
        lg.add(command, command)
    lg.add('NEWLINE', r'\n')
    lg.add('SCALAR_VAR', r's\d+')
    lg.add('ARRAY_VAR', r'a\d+')
    lg.add('REGISTER', r'reg[A-H]')
    lg.add('LABEL', r'[a-zA-Z_][a-zA-Z_0-9]*')
    lg.add('NUM_LITERAL', r'-?((\d+)(\.\d+)?)|(\.\d+)')
    lg.add('CHAR_LITERAL', r"'([^\\']|\\n|\\t|\\'|\\\\)'")
    lg.add('COLON', r':')
    lg.ignore(r'[ \t]')
    lg.ignore(r'\#.*')
    lg.add('ERROR', r'.')
    return lg.build()
from rply import LexerGenerator

lg = LexerGenerator()
lg.add('NUMBER', r'\d+(\.\d+)?')
lg.add('PLUS', r'\+')
lg.add('MINUS', r'-')
lg.add('MUL', r'\*')
lg.add('DIV', r'/')
lg.add('OPEN_PARENS', r'\(')
lg.add('CLOSE_PARENS', r'\)')
lg.add('EQUALS', r'=')
lg.add('SYMBOL', r'[^\s0-9][^\s]*')
lg.ignore(r'\s+')

lexer = lg.build()
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. from rply import LexerGenerator lg = LexerGenerator() # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly end_quote = r'(?![\s\)\]\}])' lg.add('LPAREN', r'\(') lg.add('RPAREN', r'\)') lg.add('LBRACKET', r'\[') lg.add('RBRACKET', r'\]') lg.add('LCURLY', r'\{') lg.add('RCURLY', r'\}') lg.add('HLCURLY', r'#\{') lg.add('QUOTE', r'\'%s' % end_quote) lg.add('QUASIQUOTE', r'`%s' % end_quote) lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) lg.add('UNQUOTE', r'~%s' % end_quote) lg.add('HASHBANG', r'#!.*[^\r\n]') lg.add('HASHREADER', r'#[^{]') # A regexp which matches incomplete strings, used to support # multi-line strings in the interpreter
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# >>
# LTPyB, 2016
# <<

from rply import LexerGenerator

lg = LexerGenerator()

# rply tries rules in the order they are added and takes the first match, so
# the longer or more specific patterns must come first: FLOAT before INTEGER,
# "==" before "=".
lg.add('FLOAT', r'\-?\d+\.\d+')
lg.add('INTEGER', r'\-?\d+')
lg.add('OP_EQUAL', r'==')
lg.add('OP_ASSIGNMENT', r'=')

lg.ignore(r'\s+')    # ignore whitespace
lg.ignore(r'#.*\n')  # ignore comments

lexer = lg.build()
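# Illustrative check, not from the original source: with the rule ordering
# above, "==" lexes as a single OP_EQUAL token and "3.14" lexes as FLOAT
# rather than INTEGER followed by a stray dot.
if __name__ == "__main__":
    assert [t.name for t in lexer.lex("== 3.14")] == ['OP_EQUAL', 'FLOAT']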
from rply import LexerGenerator lg = LexerGenerator() end_quote = r"(?![\s\)\]\}])" identifier = r'[^()\[\]{}\'"\s;]+' lg.add("LPAREN", r"\(") lg.add("RPAREN", r"\)") lg.add("LBRACKET", r"\[") lg.add("RBRACKET", r"\]") lg.add("LCURLY", r"\{") lg.add("RCURLY", r"\}") lg.add("HLCURLY", r"#\{") lg.add("QUOTE", r"\'%s" % end_quote) lg.add("QUASIQUOTE", r"`%s" % end_quote) lg.add("UNQUOTESPLICE", r"~@%s" % end_quote) lg.add("UNQUOTE", r"~%s" % end_quote) lg.add("DISCARD", r"#_") lg.add("HASHSTARS", r"#\*+") lg.add( "BRACKETSTRING", r"""(?x) \# \[ ( [^\[\]]* ) \[ \n? ((?:\n|.)*?) \] \1 \] """, ) lg.add("HASHOTHER", r"#%s" % identifier)
# -*- coding:utf-8 -*- from rply import LexerGenerator from rply.token import BaseBox lg = LexerGenerator() # Add takes a rule name, and a regular expression that defines the rule. #lg.add("COMMENT", r"\s*\*[^\n]*") # ([0-9]+)|([0-9]*\.[0-9]+)|(0x[0-9A-Fa-f]+) lg.add("DATE", r"0d[0-9]{8}") lg.add("NUMBER", r"(0x[0-9A-Fa-f]+)|([0-9]*\.[0-9]+)|([0-9]+)") #if lg.add("IF",r"if|IF") #lg.add("THEN",r"then|THEN") lg.add("ELSE",r"ELSE|else") lg.add("ELSEIF","ELSEIF|elseif") lg.add("ENDIF","endif|ENDIF") # do lg.add("DO", "do|DO") # do while lg.add("WHILE",r"while|WHILE") # end do lg.add("ENDDO",r"ENDDO|enddo") # do case lg.add("CASE",r"case|CASE") lg.add("ENDCASE", r"ENDCASE|endcase") # otherwise lg.add("OTHERWISE",r"otherwise|OTHERWISE") # exit lg.add("EXIT",r"exit|EXIT") # for, for each lg.add("FOR",r"for|FOR") lg.add("TO", r"to|TO")
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. from rply import LexerGenerator lg = LexerGenerator() # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly end_quote = r'(?![\s\)\]\}])' lg.add('LPAREN', r'\(') lg.add('RPAREN', r'\)') lg.add('LBRACKET', r'\[') lg.add('RBRACKET', r'\]') lg.add('LCURLY', r'\{') lg.add('RCURLY', r'\}') lg.add('QUOTE', r'\'%s' % end_quote) lg.add('QUASIQUOTE', r'`%s' % end_quote) lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) lg.add('UNQUOTE', r'~%s' % end_quote) lg.add('HASHBANG', r'#!.*[^\r\n]') lg.add('HASHREADER', r'#.') # A regexp which matches incomplete strings, used to support # multi-line strings in the interpreter partial_string = r'''(?x)
class BoxInt(BaseBox):
    def __init__(self, value):
        self.value = value

    def getint(self):
        return self.value
'''

from rply import ParserGenerator, LexerGenerator
from rply.token import BaseBox

lexgen = LexerGenerator()
lexgen.add('AND', r"(and)")
lexgen.add('WITHOUT', r"(without)")
lexgen.add('DIVIDE', r"(divide)")
lexgen.add('MULTIPLY', r"(multiply)")  # pattern assumed by analogy with the rules above

keywords = {
    "return": Keyword("RETURN", "RETURN", EXPR_MID),
    "if": Keyword("IF", "IF_MOD", EXPR_BEG),
    "unless": Keyword("UNLESS", "UNLESS_MOD", EXPR_BEG),
    "then": Keyword("THEN", "THEN", EXPR_BEG),
    "elsif": Keyword("ELSIF", "ELSIF", EXPR_BEG),
    "else": Keyword("ELSE", "ELSE", EXPR_BEG),
    "while": Keyword("WHILE", "WHILE_MOD", EXPR_BEG),
    "until": Keyword("UNTIL", "UNTIL_MOD", EXPR_BEG),
import re import itertools from collections import deque from rply import ParserGenerator, LexerGenerator from graphextractor.rfc3987 import UrlPattern from graphextractor.flattened import flattened __all__ = ['TweetLexer', 'TweetParser'] lex = LexerGenerator() lex.ignore(ur'(?:[,;\s]+|\band\b|\bor\b)+') lex.add(u'URL', UrlPattern) lex.add(u'BTHASH', ur'#betterthan') lex.add(u'IBTHASH', ur'#isbetterthan') lex.add(u'HASHTAG', ur'#[a-zA-Z0-9_]+') lex.add(u'MENTION', ur'@[a-zA-Z0-9_]+') lex.add(u'FOR', ur'(for|FOR|For)') lex.add(u'WORD', ur'[\w]+') pg = ParserGenerator([u'URL', u'BTHASH', u'IBTHASH', u'HASHTAG', u'MENTION', u'FOR', u'WORD' ], cache_id=u'graphextractor.tweetparser') @pg.production("betterthan : words URL bthash URL topics words") def betterthan(p):
def build_lexer():
    lexer = LexerGenerator()

    # Lexer Analysis Rules
    lexer.ignore(' ')
    lexer.add("WHATEVR", r"WHATEVR")
    lexer.add("VISIBLE", r"VISIBLE")
    lexer.add("KTHXBAI", r"KTHXBAI")
    lexer.add("GIMME", r"GIMME")
    lexer.add("MKAY", r"MKAY")
    lexer.add("HAS", r"HAS")
    lexer.add("HAI", r"HAI")
    lexer.add("ITZ", r"ITZ")
    lexer.add("OF", r"OF")
    lexer.add("BANG", r"!")
    lexer.add("BY", r"BY")
    lexer.add("AN", r"AN")
    lexer.add("A", r"A")
    lexer.add("R", r"R")
    lexer.add("I", r"I")
    lexer.add("MULTI_COMMENT", r"OBTW [.*|\n]TDLR")  # Not working at all!
    lexer.add("NEWLINE", "\n")
    lexer.add("PRIMITIVE_TYPE", r"NUMBR|NUMBAR|LETTR|TROOF")
    lexer.add("NUMBAR_LITERAL", r"-?\d+\.\d+")  # literal decimal point
    lexer.add("NUMBR_LITERAL", r"-?\d+")
    lexer.add("TROOF_LITERAL", r"WIN|FAIL")  # matches the words WIN or FAIL
    lexer.add("YARN_LITERAL", r"[\"|\'].*[\"|\']")
    lexer.add("MATH_BINARY_OPERATOR", r"SUM|DIFF|PRODUKT|QUOSHUNT|BIGGR|SMALLR")
    lexer.add("MATH_UNARY_OPERATOR", r"FLIP|SQUAR")
    lexer.add("LOGICAL_BINARY_OPERATOR", r"BOTH|EIHER|WON")
    lexer.add("LOGICAL_UNARY_OPERATOR", r"NOT")
    lexer.add("LOGICAL_VARIABLE_OPERATOR", r"ALL|ANY")
    lexer.add("COMPARISON_BINARY_OPERATOR", r"SAEM|DIFFRINT|FURSTSMALLR|FURSTBIGGR")
    lexer.add("ASSIGNMENT_OPERATOR", r"CORRECT_THIS")
    lexer.add(
        "SINGLE_COMMENT",
        r"BTW.*\n")  # New line required to be added to tokens list prior!
    lexer.add("IDENTIFIER", r"[a-zA-Z][a-zA-Z_]*")
    lexer.add("LETTR_LITERAL", r".")
    lexer.add("ERROR", r"^[.]*")
    return lexer.build()
from rply import Token, LexerGenerator, ParserGenerator from rply.token import BaseBox from objects import Atom, Compound, Variable, known_atoms, atom, as_list from objects import parse_integer leg = LexerGenerator() leg.ignore(r'#.*\n') leg.ignore(r'\s+') leg.add('ATOM', r'[a-z][a-zA-Z0-9_]*') leg.add('VARIABLE', r'[A-Z_][a-zA-Z0-9_]*') leg.add('INTEGER', r'[0-9]+') leg.add('IMPLICATION', r"<-") leg.add('LEFTPAREN', r"\(") leg.add('RIGHTPAREN', r"\)") leg.add('LEFTBRACKET', r"\[") leg.add('RIGHTBRACKET', r"\]") leg.add('COMMA', r",") leg.add('AT', r"@") leg.add('VBAR', r"\|") leg.add('SIMP', r"<=>") leg.add('PROP', r"==>") leg.add('UNIFY', r"=") leg.add('COLON', r":") leg.add('SEMICOLON', r";") lexer = leg.build() pg = ParserGenerator( ['ATOM', 'VARIABLE', 'IMPLICATION', 'UNIFY', 'LEFTPAREN', 'RIGHTPAREN', 'LEFTBRACKET', 'RIGHTBRACKET', 'COLON', 'INTEGER',
from rply import LexerGenerator

lg = LexerGenerator()
lg.ignore(r"\s+")
lg.add("NUMBER", r"\d+")
lg.add("BOOLEAN", r"True|False")
lg.add("ADD", r"\+")
lg.add("SUB", r"\-")
lg.add("MULT", r"\*")
lg.add("DIV", r"\/")
lg.add("SEMICOLON", r";")
lg.add("PRINT", r"print")
lg.add("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*")
lg.add("EQUALS", r"==")
lg.add("ASSIGN", r"=")

lexer = lg.build()
# Copyright 2017 the authors. # This file is part of Hy, which is free software licensed under the Expat # license. See the LICENSE. from rply import LexerGenerator lg = LexerGenerator() # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly end_quote = r'(?![\s\)\]\}])' identifier = r'[^()\[\]{}\'"\s;]+' lg.add('LPAREN', r'\(') lg.add('RPAREN', r'\)') lg.add('LBRACKET', r'\[') lg.add('RBRACKET', r'\]') lg.add('LCURLY', r'\{') lg.add('RCURLY', r'\}') lg.add('HLCURLY', r'#\{') lg.add('QUOTE', r'\'%s' % end_quote) lg.add('QUASIQUOTE', r'`%s' % end_quote) lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) lg.add('UNQUOTE', r'~%s' % end_quote) lg.add('DISCARD', r'#_') lg.add('HASHSTARS', r'#\*+') lg.add('HASHOTHER', r'#%s' % identifier) # A regexp which matches incomplete strings, used to support # multi-line strings in the interpreter
""":mod:`stencil_lang.matrix.lexer` -- Matrix scanner """ from rply import LexerGenerator from stencil_lang.matrix.tokens import TOKENS, IGNORES lg = LexerGenerator() for rule_name, regex in TOKENS.iteritems(): lg.add(rule_name, regex) for regex in IGNORES: lg.ignore(regex) # This has to be called outside a function because the parser must be generated # in Python during translation, not in RPython during runtime. _lexer = lg.build() """This intepreter's lexer instance.""" def lex(text): """Scan text using the generated lexer. :param text: text to lex :type text: :class:`str` :return: parsed stream :rtype: :class:`rply.lexer.LexerStream` """ return _lexer.lex(text)
class Lexer():
    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        # print
        self.lexer.add('PRINT', r'print')
        # parentheses
        self.lexer.add('OPEN_PAREN', r'\(')
        self.lexer.add('CLOSE_PAREN', r'\)')
        # semicolon
        self.lexer.add('SEMI_COLON', r'\;')
        # addition and subtraction operators
        self.lexer.add('SUM', r'\+')
        self.lexer.add('SUB', r'\-')
        # number
        self.lexer.add('NUMBER', r'\d+')
        # ignore spaces
        self.lexer.ignore(r'\s+')

    def get_lexer(self):
        self._add_tokens()
        return self.lexer.build()
import collections from transit.transit_types import Keyword, Symbol, TaggedValue, List, Vector import transit.transit_types transit_true = transit.transit_types.true transit_false = transit.transit_types.false from rply import ParserGenerator, LexerGenerator lg = LexerGenerator() SYMBOL_RE = r"[\.\*\+\!\-\_\?\$%&=a-zA-Z][\.\*\+\!\-\_\?\$%&=a-zA-Z0-9:#]*" NS_SYMBOL = SYMBOL_RE + "/" + SYMBOL_RE lg.add("boolean", r"(true|false)") lg.add("nil", r"nil") lg.add("float", r"\d+\.\d+") lg.add("number", r"[-+]?\d+") lg.add("olist", r"\(") lg.add("clist", r"\)") lg.add("omap", r"{") lg.add("cmap", r"}") lg.add("ovec", r"\[") lg.add("cvec", r"\]") lg.add("oset", r"#{") lg.add("colon", r":") lg.add("char_nl", r"\\newline") lg.add("char_tab", r"\\tab") lg.add("char_return", r"\\return") lg.add("char_space", r"\\space")
class Lexer(): def __init__(self): self.lexer = LexerGenerator() def _add_tokens(self): # LODD self.lexer.add('LODD', r'(?<!\w)LODD(?!\w)') # STOD self.lexer.add('STOD', r'(?<!\w)STOD(?!\w)') # ADDD self.lexer.add('ADDD', r'(?<!\w)ADDD(?!\w)') # SUBD self.lexer.add('SUBD', r'(?<!\w)SUBD(?!\w)') # JPOS self.lexer.add('JPOS', r'(?<!\w)JPOS(?!\w)') # JZER self.lexer.add('JZER', r'(?<!\w)JZER(?!\w)') # JUMP self.lexer.add('JUMP', r'(?<!\w)JUMP(?!\w)') # LOCO self.lexer.add('LOCO', r'(?<!\w)LOCO(?!\w)') # LODL self.lexer.add('LODL', r'(?<!\w)LODL(?!\w)') # STOL self.lexer.add('STOL', r'(?<!\w)STOL(?!\w)') # ADDL self.lexer.add('ADDL', r'(?<!\w)ADDL(?!\w)') # SUBL self.lexer.add('SUBL', r'(?<!\w)SUBL(?!\w)') # JNEG self.lexer.add('JNEG', r'(?<!\w)JNEG(?!\w)') # JNZE self.lexer.add('JNZE', r'(?<!\w)JNZE(?!\w)') # CALL self.lexer.add('CALL', r'(?<!\w)CALL(?!\w)') # PUSHI self.lexer.add('PUSHI', r'(?<!\w)PUSHI(?!\w)') # POPI self.lexer.add('POPI', r'(?<!\w)POPI(?!\w)') # PUSH self.lexer.add('PUSH', r'(?<!\w)PUSH(?!\w)') # POP self.lexer.add('POP', r'(?<!\w)POP(?!\w)') # RETN self.lexer.add('RETN', r'(?<!\w)RETN(?!\w)') # SWAP self.lexer.add('SWAP', r'(?<!\w)SWAP(?!\w)') # INSP self.lexer.add('INSP', r'(?<!\w)INSP(?!\w)') # DESP self.lexer.add('DESP', r'(?<!\w)DESP(?!\w)') # INPAC self.lexer.add('INPAC', r'(?<!\w)INPAC(?!\w)') # OUTAC self.lexer.add('OUTAC', r'(?<!\w)OUTAC(?!\w)') # HALT self.lexer.add('HALT', r'(?<!\w)HALT(?!\w)') # ETIQUETA self.lexer.add('ETIQUETA', r'\@[A-Za-z]\w*') # VARIABLE self.lexer.add('VARIABLE', r'(?<!\w)[A-Za-z]\w*') # DIRECCION self.lexer.add('DIRECCION', r'(?<!\w)0x[A-F0-9][A-F0-9](?!\w)') # Numero self.lexer.add('NUMERO', r'(?<![A-Za-z])\d+(?![A-Za-z])') # Ignore spaces self.lexer.ignore('\s+') self.lexer.ignore('\%.*') def get_lexer(self): self._add_tokens() return self.lexer.build()
from rply import LexerGenerator

lg = LexerGenerator()
lg.add("LPAREN", r"\(")
lg.add("RPAREN", r"\)")
# lg.add('LBRACKET', r'\[')
# lg.add('RBRACKET', r'\]')
lg.add("IDENTIFIER", r"[^()\[\]{}\s#]+")

lg.ignore(r"#.*(?=\r|\n|$)")
lg.ignore(r"\s+")

lexer = lg.build()
class Lexer():
    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        # Print
        self.lexer.add('PRINT', r'out')
        # Parenthesis
        self.lexer.add('OPEN_PAREN', r'\(')
        self.lexer.add('CLOSE_PAREN', r'\)')
        # Semi Colon
        self.lexer.add('SEMI_COLON', r'\;')
        # Operators
        self.lexer.add('SUM', r'\+')
        self.lexer.add('SUB', r'\-')
        self.lexer.add('MUL', r'\*')
        self.lexer.add('DIV', r'\/')
        # Number
        self.lexer.add('NUMBER', r'\d+')
        # Ignore spaces
        self.lexer.ignore(r'\s+')

    def get_lexer(self):
        self._add_tokens()
        return self.lexer.build()