def main():
    import logging
    lexer = lex.lex(debug=1, optimize=0, debuglog=logging.getLogger(__name__))
    lexer._file = '<???>'
    lex.runmain(lexer=lexer)
    if lex.lexer.current_state() != 'INITIAL':
        raise DBSyntaxError("open macro at EOF", lexer._file, -1)
def lexer_output_to_dict(self):
    with open(os.path.join("php_functions", "php_function_list.json"), "r") as f:
        php_functions = json.load(f)
    with open(self.input_file, "r") as fin:
        old_stdout = sys.stdout
        sys.stdout = open("lexer_output.txt", "w")
        lex.runmain(lexer=phplex.full_lexer, data=fin.read().rstrip())
        sys.stdout = old_stdout
    linepos = 0
    lineno = 0
    f = open("lexer_output.txt", "r")
    lexer_dict = {}
    for line in f.readlines():
        toks = line.split(",")
        linepos = int(toks[-1].rstrip(")\n"))
        lineno = int(toks[-2])
        toktype = toks[0].lstrip("(")
        tokvalue = self.find_tok_value(line)
        if toktype not in lexer_dict.keys():
            lexer_dict[toktype] = {}
            lexer_dict[toktype][tokvalue] = [(lineno, linepos)]
        else:
            if tokvalue not in lexer_dict[toktype].keys():
                lexer_dict[toktype][tokvalue] = [(lineno, linepos)]
            else:
                lexer_dict[toktype][tokvalue].append((lineno, linepos))
    self.find_all_functions(lexer_dict, php_functions)
    number_of_lines = int(lineno)
    number_of_chars = int(linepos)
    return lexer_dict, number_of_lines, number_of_chars
def getAST():
    lexer = lex.lex()
    lexer.indents = []
    lexer.indents.append(0)
    lexer.paren_stack = []
    lexer.curr_indent = 0
    lexer.token_ = lexer.token
    lexer.token = (lambda: token_override(lexer))
    lexer.begin('indent')
    yacc.yacc(debug=1)
    file = sys.argv[1]
    stream = open(file)
    contents = stream.read()
    lex.runmain(lexer)
    ast = yacc.parse(contents, lexer)
    return ast
def run_token_match(self, parser, token_data):
    from ply.lex import runmain  #@UnresolvedImport
    original_stdout = sys.stdout
    stdout_stringio = StringIO()
    sys.stdout = stdout_stringio
    try:
        runmain(lexer=parser.lexer, data=token_data)
        captured_io = stdout_stringio.getvalue()
        return captured_io
    finally:
        sys.stdout = original_stdout
import ply.lex as lex

tokens = (
    'H_EDIT_DESCRIPTOR',
)

# Tokens
t_ignore = " \t\n"

def t_H_EDIT_DESCRIPTOR(t):
    r"\d+H.*"  # This grabs all of the remaining text
    i = t.value.index('H')
    n = eval(t.value[:i])

    # Adjust the tokenizing position
    t.lexer.lexpos -= len(t.value) - (i+1+n)
    t.value = t.value[i+1:i+1+n]
    return t

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lex.lex()
lex.runmain(data="3Habc 10Habcdefghij 2Hxy")
t_TYPE = r'Int|Float|Double'
t_ARROW = r'->'
t_COMMA = r','
t_TRIPLEDOT = r'\.\.\.'

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_ID(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    t.type = reserved.get(t.value, 'ID')
    return t

def t_singleLine(t):
    r'//.*\n'

def t_multiLine(t):
    r'/\*[^(*/)]*\*/\n'

lexer = lex.lex(debug=1)

if __name__ == "__main__":
    lex.runmain()
                else:
                    # Rewrite close tag as a semicolon.
                    t.type = 'SEMI'
                break

            t = self.lexer.token()

        self.last_token = t
        return t

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next


full_lexer = lex.lex()
lexer = FilteredLexer(full_lexer)

full_tokens = tokens
tokens = filter(lambda token: token not in unparsed, tokens)

if __name__ == "__main__":
    lex.runmain(full_lexer)
def t_DOCSTRINGOPEN(t):
    r'/\*\*[ ]+'
    return t

#t_COMMENTOPEN = r'/\*'
t_COMMENTCLOSE = r'\*/'

# Preprocessor directive (ignored)
def t_preprocessor(t):
    r'\#(.)*?\n'
    t.lexer.lineno += 1

def t_error(t):
    print("Illegal character %s" % repr(t.value[0]))
    t.lexer.skip(1)

lexer = lex.lex(debug=False)

if __name__ == "__main__":
    lex.runmain(lexer)
t_INTERVAL = 'interval'

# Delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACE = r'\['
t_RBRACE = r'\]'
t_COMMA = r','
t_SEMICOLON = r';'

# Non-emitting
t_ignore = (' \t\n\r')

def t_comment(t):
    r'\#[^\n]*'
    pass

def t_error(t):
    print("Illegal character '{}'".format(t), file=sys.stderr)
    sys.exit(-1)

# Create lexer on call and import
function_lexer = lex.lex()  #, optimize=1) #used when stable

# On call run as a util, taking in text and printing the lexed version
if __name__ == "__main__":
    lex.runmain(function_lexer)
t_EQUALS = r'='
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

def t_NUMBER(t):
    r'\d+'
    try:
        t.value = int(t.value)
    except ValueError:
        print("Integer value too large %s" % t.value)
        t.value = 0
    return t

t_ignore = " \t"

def t_newline(t):
    r'\n+'
    t.lineno += t.value.count("\n")

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lex.lex(optimize=1, lextab="opt2tab")
lex.runmain(data="3+4")
t_ignore = " \t"

# Comments
def t_comment(t):
    r'/\*'
    t.lexer.begin('comment')
    print("Entering comment state")

def t_comment_body_part(t):
    r'(.|\n)*\*/'
    print("comment body", t)
    t.lexer.begin('INITIAL')

def t_error(t):
    pass

t_comment_error = t_error
t_comment_ignore = t_ignore

import sys

lex.lex()

data = "3 + 4 /* This is a comment */ + 10"
lex.runmain(data=data)
    @_lex.TOKEN(num + r'%')
    def t_PERCENTAGE(self, t):
        return t

    t_NUMBER = num

    @_lex.TOKEN(U + R + L + r'\(' + w + r_or(string, url) + w + r'\)')
    def t_URI(self, t):
        return t

    @_lex.TOKEN(ident + r'\(')
    def t_FUNCTION(self, t):
        return t

    def t_error(self, t):
        print("Illegal token '%s'" % t.value[0])
        t.lexer.skip(1)


def lex(**kw):
    if 'object' in kw:
        del kw['object']
    kw['module'] = csslexer()
    if 'reflags' not in kw:
        kw['reflags'] = 0
    kw['reflags'] |= re.UNICODE | re.IGNORECASE
    return _lex.lex(**kw)


if '__main__' == __name__:
    _lex.runmain(lexer=lex())
    def __init__(self, filename):
        self.lex = lex.lex(module=self)
        self.input = self.lex.input
        self.token = self.lex.token

        # For tracking current file/line position
        self.filename = filename
        self.line_offset = 0

        self.filenames = []
        self._filenames_set = set()
        if self.filename:
            self.filenames.append(filename)
            self._filenames_set.add(filename)

        # Doxygen comments
        self.doxygenCommentCache = ""

    def current_location(self):
        return self.filename, self.lex.lineno - self.line_offset

    def get_doxygen(self):
        doxygen = self.doxygenCommentCache
        self.doxygenCommentCache = ""
        return doxygen


if __name__ == "__main__":
    lex.runmain(lexer=Lexer())
    return t

t_OP_GE = r'>='
t_OP_LE = r'<='
t_OP_EQ = r'=='
t_OP_NE = r'!='

def t_ID(t):
    r'[a-zA-Z_]+[\da-zA-Z_]*'
    t.type = reserved.get(t.value, 'ID')
    return t

t_ignore = ' \t\v\f'

def t_error(t):
    raise SyntaxError('Error at line %d, position %d' % (t.lineno, t.lexpos))

lexer = lex.lex()

#
# Scripting part
#
if __name__ == '__main__':
    lex.runmain()  # pragma: no cover
    t_EQ = r'=='

    def t_error(self, t):
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)

    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    # Test its output
    def test(self, data):
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            print(tok)


# Build the lexer and try it out
cpp_scanner = MyLexer()
cpp_scanner.build()

if __name__ == "__main__":
    if len(sys.argv) == 2:
        filename = sys.argv[1]
        a = open(filename)
        data = a.read()
        cpp_scanner.test(data)
    else:
        lex.runmain(cpp_scanner.lexer)
            if tok.type not in self._discard_types:
                tok.location = (self.filename, tok.lineno - self.line_offset)
                break

        return tok

    def token_if(self, *types):
        tok = self.token(eof_ok=True)
        if tok is None:
            return None
        if tok.type not in types:
            # put it back on the left in case it was retrieved
            # from the lookahead buffer
            self.lookahead.appendleft(tok)
            return None
        return tok

    def return_token(self, tok):
        self.lookahead.appendleft(tok)

    def return_tokens(self, toks):
        self.lookahead.extendleft(reversed(toks))


if __name__ == "__main__":
    try:
        lex.runmain(lexer=Lexer(None))
    except EOFError:
        pass
import ply.lex as lex

tokens = (
    'H_EDIT_DESCRIPTOR',
)

# Tokens
t_ignore = " \t\n"

def t_H_EDIT_DESCRIPTOR(t):
    r"\d+H.*"  # This grabs all of the remaining text
    i = t.value.index('H')
    n = eval(t.value[:i])

    # Adjust the tokenizing position
    t.lexer.lexpos -= len(t.value) - (i+1+n)
    t.value = t.value[i+1:i+1+n]
    return t

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lex.lex()
lex.runmain(data="3Habc 10Habcdefghij 2Hxy")
def autotest(self):
    lex.runmain()
t_COMMA = r','
t_SEMICOLON = r';'

# Non-emitting
t_ignore = (' \t\n\r')

def t_comment(t):
    r'\#[^\n]*'
    pass

def t_error(t):
    print("Illegal character '{}'".format(t), file=sys.stderr)
    sys.exit(-1)

# Create lexer on call and import
function_lexer = lex.lex()  #, optimize=1) #used when stable

# On call run as a util, taking in text and printing the lexed version
if __name__ == "__main__":
    lex.runmain(function_lexer)
    return t

# Newlines
def t_NEWLINE(t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")

# Comments
def t_comment(t):
    r'/\*(.|\n)*?\*/'
    t.lexer.lineno += t.value.count('\n')

# Preprocessor directive (ignored)
def t_preprocessor(t):
    r'\#(.)*?\n'
    t.lexer.lineno += 1

# Error handling
def t_error(t):
    print("Illegal character %s" % repr(t.value[0]))
    t.lexer.skip(1)

lexical_analyzer = lex.lex()

if __name__ == "__main__":
    lex.runmain(lexical_analyzer)
        self.data = data
        return self._lexer.input(data)

    def token(self):
        tok = self._lexer.token()
        if tok is not None:
            # Tokens without a processing function don't set this themselves
            tok.lexer = self._lexer
            # Wrap the token up to present useful data when in the parsing stage
            tok.value = RdlToken(tok)
        return tok

    def __init__(self):
        # hw and sw are properties, but they are lexed as a precedence
        self.keywords.update({prop.name: 'PROPNAME' for prop in properties
                              if prop.name not in self.keywords})
        self.tokens = ['VNUM', 'NUM', 'STRING', 'ID', 'DEREF', 'INC', 'MOD',
                       'LSQ', 'RSQ', 'RBRACE', 'LBRACE', 'COLON', 'COMMA', 'DOT',
                       #'OR',
                       'AT', 'SEMI', 'EQ']
        self.tokens += list(OrderedDict.fromkeys(self.keywords.values()))
        self._lexer = lex.lex(object=self)
        self.lex_errors = 0
        self.data = None


if __name__ == "__main__":
    lex.runmain(RdlLexer())
# lex_many_tokens.py
#
# Test lex's ability to handle a large number of tokens (beyond the
# 100-group limit of the re module)

import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import ply.lex as lex

tokens = ["TOK%d" % i for i in range(1000)]

for tok in tokens:
    if sys.version_info[0] < 3:
        exec("t_%s = '%s:'" % (tok, tok))
    else:
        exec("t_%s = '%s:'" % (tok, tok), globals())

t_ignore = " \t"

def t_error(t):
    pass

lex.lex(optimize=1, lextab="manytab")
lex.runmain(data="TOK34: TOK143: TOK269: TOK372: TOK452: TOK561: TOK999:")
import sys if "../.." not in sys.path: sys.path.insert(0,"../..") from gSLLexer import * import ply.lex as lex code = """var a:numerico """ gLexer = gSLLexer() lex.runmain(data = code)
# lex_many_tokens.py
#
# Test lex's ability to handle a large number of tokens (beyond the
# 100-group limit of the re module)

import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import ply.lex as lex

tokens = ["TOK%d" % i for i in range(1000)]

for tok in tokens:
    if sys.version_info[0] < 3:
        exec("t_%s = '%s:'" % (tok, tok))
    else:
        exec("t_%s = '%s:'" % (tok, tok), globals())

t_ignore = " \t"

def t_error(t):
    pass

lex.lex()
lex.runmain(data="TOK34: TOK143: TOK269: TOK372: TOK452: TOK561: TOK999:")
def run_on_argv1():
    lex.runmain(full_lexer)
    t.type = rdict.get(t.value, "IDENTIFIER")
    return t

def t_NEWLINE(t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")

def t_comment(t):
    r'/\*(.|\n)*?\*/ | //(.)*?\n'
    t.lexer.lineno += t.value.count('\n')

def t_preprocessor(t):
    r'\#(.)*?\n'
    t.lexer.lineno += 1

def t_error(t):
    print("Error : %s" % str(t.value[0]))
    t.lexer.skip(1)

lexer = lex.lex()

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("{token type, token name, line number, index relative to start of input}")
        lex.runmain(lexer)
    else:
        fo = open(str(sys.argv[1]), "r+")
        data = fo.read()
        fo.close()
        print("{token type, token name, line number, index relative to start of input}")
        lex.runmain(lexer, data)
# lex_module.py
#
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import ply.lex as lex
import lex_module_import

lex.lex(module=lex_module_import)
lex.runmain(data="3+4")
def t_SIMB(t):
    r'[></a-zA-Z_+=\*\-][></a-zA-Z0-9_+\*\-=]*'
    #print ('In t_SIMB',t)
    t.type = reserved.get(t.value, 'SIMB')  # Check for reserved words
    return t

def t_TEXT(t):
    r'\'[a-zA-Z0-9_+\*\- :,\.\\[\];=()\"$]*\''
    #print ('In t_Text',t)
    t.type = reserved.get(t.value, 'TEXT')  # Check for reserved words
    return t

# Define a rule so we can track line numbers
def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

# A string containing ignored characters (spaces and tabs)
t_ignore = ' \t'

# Error handling rule
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lex.lex()

if __name__ == '__main__':
    lex.runmain()
t_PLUS = r'\+'
t_MINUS = r'-'
t_NUMBER = r'\d+'

t_ignore = " \t"

# Comments
def t_comment(t):
    r'/\*'
    t.lexer.begin('comment')
    print("Entering comment state")

def t_comment_body_part(t):
    r'(.|\n)*\*/'
    print("comment body", t)
    t.lexer.begin('INITIAL')

def t_error(t):
    pass

t_comment_error = t_error
t_comment_ignore = t_ignore

import sys

lex.lex()

data = "3 + 4 /* This is a comment */ + 10"
lex.runmain(data=data)
    t_ignore_COMMENT = r'//.*'
    # t_MODIFIERBACK = r'%'
    # t_MODIFIERDEBUG = r'\#'
    # t_MODIFIERROOT = r'!'
    # t_MODIFIERDISABLE = r'\*'

    t_ignore = " \t"

    def t_comment(self, t):
        r'/\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')

    def t_ID(self, t):
        r'[$]?[a-zA-Z_][a-zA-Z_0-9]*'
        t.type = reserved.get(t.value, 'ID')  # Check for reserved words
        return t

    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    def t_error(self, t):
        error("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)


#lexer = lex.lex()

if __name__ == "__main__":
    lex.runmain(lexer)