def regex_tokenizer(text, G, skip_whitespaces=True):
    tokens = []
    # Assumed completion of the original placeholder: the regex operators map to
    # the module-level terminals (pipe, star, opar, cpar, epsilon), mirroring the
    # fixed_tokens dicts used by the other regex tokenizers in this collection.
    fixed_tokens = {
        '|': Token('|', pipe),
        '*': Token('*', star),
        '(': Token('(', opar),
        ')': Token(')', cpar),
        'ε': Token('ε', epsilon),
    }
    skip = False
    for i, char in enumerate(text):
        if skip:
            skip = False
            continue
        if skip_whitespaces and char.isspace():
            continue
        # A backslash escapes the next character, which is emitted as a plain symbol.
        if char == '\\':
            try:
                tokens.append(Token(text[i + 1], symbol))
            except IndexError:
                tokens.append(Token('\\', symbol))
            skip = True
            continue
        try:
            tokens.append(fixed_tokens[char])
        except KeyError:
            tokens.append(Token(char, symbol))
    tokens.append(Token('$', G.EOF))
    return tokens
def __test_lexer_2(lexer):
    text = '5465 for 45foreach fore'
    tokens, errors = lexer(text)
    assert errors == []
    assert [t.token_type for t in tokens] == [
        'num', 'space', 'for', 'space', 'num', 'foreach', 'space', 'id', 'eof'
    ]
    assert [t.lex for t in tokens] == [
        '5465', ' ', 'for', ' ', '45', 'foreach', ' ', 'fore', '$'
    ]

    text = '4forense forforeach for4foreach foreach 4for'
    tokens, errors = lexer(text)
    assert errors == []
    assert [t.token_type for t in tokens] == [
        'num', 'id', 'space', 'id', 'space', 'id', 'space', 'foreach',
        'space', 'num', 'for', 'eof'
    ]
    assert [t.lex for t in tokens] == [
        '4', 'forense', ' ', 'forforeach', ' ', 'for4foreach', ' ',
        'foreach', ' ', '4', 'for', '$'
    ]

    text = "LAexer"
    tokens, errors = lexer(text)
    assert errors == ["L", "A"]
    assert tokens == [Token("exer", "id"), Token("$", "eof")]
def tokenize(input_program: str):
    tokenizer = lex.lex()
    tokenizer.input(input_program)
    toks = []
    for tok in tokenizer:
        toks.append(Token(tok.value, token_type[tok.type]))
    toks.append(Token('$', G.EOF))
    return toks
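The tokenize function above relies on PLY's lex module picking up token rules defined in the calling module. Below is a minimal, self-contained sketch of such a rules module; the token names (NUM, PLUS, TIMES) and the sample input are hypothetical, chosen only to show the shape of the API, not this project's actual rules.

import ply.lex as lex

tokens = ('NUM', 'PLUS', 'TIMES')   # hypothetical token set, for illustration only

t_PLUS = r'\+'
t_TIMES = r'\*'
t_ignore = ' \t'

def t_NUM(t):
    r'\d+'
    return t

def t_error(t):
    t.lexer.skip(1)   # skip characters no rule covers

lexer = lex.lex()
lexer.input('3 + 4 * 5')
print([(tok.type, tok.value) for tok in lexer])
# [('NUM', '3'), ('PLUS', '+'), ('NUM', '4'), ('TIMES', '*'), ('NUM', '5')]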
def tokenize_text(text):
    tokens = []
    for item in text.split():
        try:
            float(item)
            token = Token(item, G['num'])
        except ValueError:
            try:
                token = fixed_tokens[item]
            except KeyError:
                token = UnknownToken(item)
        tokens.append(token)
    eof = Token('$', G.EOF)
    tokens.append(eof)
    return tokens
def _regex_tokenizer(G, symbol, text, skip_whitespaces=False):
    tokens = []
    if len(text) == 1:
        tokens.append(Token(text[0], G[symbol]))
    else:
        for char in text:
            if skip_whitespaces and char.isspace():
                continue
            temp = G[char]
            if temp is not None:
                tokens.append(Token(char, temp))
            else:
                tokens.append(Token(char, G[symbol]))
    tokens.append(Token('$', G.EOF))
    return tokens
def regex_tokenizer(text, G, skip_whitespaces=True):
    tokens = []
    jump = False
    # print(text)
    for char in text:
        if skip_whitespaces and char.isspace():
            continue
        elif char == '\\' and not jump:
            jump = True
        elif char == '*' and not jump:
            tokens.append(Token('*', star))
        elif char == '(' and not jump:
            tokens.append(Token('(', opar))
        elif char == ')' and not jump:
            tokens.append(Token(')', cpar))
        elif char == '|' and not jump:
            tokens.append(Token('|', pipe))
        elif char == 'ε' and not jump:
            tokens.append(Token('ε', epsilon))
        else:
            tokens.append(Token(char, symbol))
            jump = False
    tokens.append(Token('$', G.EOF))
    return tokens
def tokenize_text(text):
    tokens = []
    for item in text.split():
        try:
            float(item)
            token = Token(item, num)
        except ValueError:
            try:
                token = fixed_tokens[item]
            except KeyError:
                raise Exception('Undefined token')
        tokens.append(token)
    eof = Token('$', G.EOF)
    tokens.append(eof)
    return tokens
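The split-and-classify pattern used by tokenize_text can be exercised on its own. The sketch below uses a namedtuple stand-in for cmp.utils.Token and hypothetical terminal names ('num', 'plus', 'star', 'unknown', 'eof'); it only illustrates how the float test and the fixed_tokens fallback interact.

from collections import namedtuple

Token = namedtuple('Token', ['lex', 'token_type'])   # stand-in for cmp.utils.Token

fixed_tokens = {'+': Token('+', 'plus'), '*': Token('*', 'star')}

def classify(text):
    tokens = []
    for item in text.split():
        try:
            float(item)                             # numeric literals become 'num'
            tokens.append(Token(item, 'num'))
        except ValueError:                          # otherwise fall back to the fixed table
            tokens.append(fixed_tokens.get(item, Token(item, 'unknown')))
    tokens.append(Token('$', 'eof'))
    return tokens

print(classify('5.9 + 4 * x'))
# [Token(lex='5.9', token_type='num'), Token(lex='+', token_type='plus'),
#  Token(lex='4', token_type='num'), Token(lex='*', token_type='star'),
#  Token(lex='x', token_type='unknown'), Token(lex='$', token_type='eof')]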
def __init__(self, idx, args, obj=None, at_type=None, token=Token("", "", (-1, -1))):
    self.obj = obj
    self.id = idx
    self.args = args
    self.at_type = at_type
    if token.location[0] == -1:
        self.token = idx
    else:
        self.token = token
def _fix_tokens(self, tokens, errors):
    """
    Discard tokens whose token_type is 'space' and transform the
    remaining lexer tokens into grammar tokens.
    """
    fix_tokens = []
    for x in tokens:
        if x.token_type != 'space':
            try:
                if x.token_type in self.token_parse:
                    tok = Token(self.token_parse[x.token_type](x.lex), x.token_type)
                else:
                    tok = Token(x.lex, x.token_type)
                fix_tokens.append(tok)
            except KeyError:
                errors.append(f'The grammar does not recognize the token {x}')
    return fix_tokens
def get_grammar_tokens(gram_def, errors: list):
    tokens = []
    for x in gram_lexer(gram_def):
        if x.token_type != 'space':
            try:
                tok = Token(x.lex, symbols[x.token_type])
                tokens.append(tok)
            except KeyError:
                errors.append(f'Unknown Token({x.lex},{x.token_type}) in gram_def')
    return tokens
def regex_tokenizer(text, G, skip_whitespaces=True):
    tokens = []
    fixed_tokens = {lex: Token(lex, G[lex]) for lex in '| * ( ) ε [ ] ? + -'.split()}
    open_pos = 0
    inside_squares = False
    set_literal = False
    for i, char in enumerate(text):
        if skip_whitespaces and char.isspace():
            continue
        if not set_literal and char == '\\':
            set_literal = True
            continue
        if set_literal:
            tokens.append(Token(char, G['symbol']))
            set_literal = False
            continue
        if not inside_squares:
            if char in (']', '-') or char not in fixed_tokens:
                tokens.append(Token(char, G['symbol']))
            else:
                tokens.append(fixed_tokens[char])
                open_pos = i
                inside_squares = char == '['
        else:
            if char == ']':
                if i - open_pos == 1:
                    tokens.append(Token(char, G['symbol']))
                else:
                    inside_squares = False
                    tokens.append(fixed_tokens[char])
            elif char == '-':
                if is_minus_a_symbol(G, text, tokens, i, open_pos):
                    tokens.append(Token(char, G['symbol']))
                else:
                    tokens.append(fixed_tokens[char])
            else:
                tokens.append(Token(char, G['symbol']))
    if inside_squares:
        raise Exception(f'Unterminated character set at position {open_pos}')
    tokens.append(Token('$', G.EOF))
    return tokens
def tokenize_cool_text(G, text, idx, num, print_tokens=False):
    fixed_tokens = {
        t.Name: Token(t.Name, t)
        for t in G.terminals if t not in {idx, num}
    }

    @tokenizer(G, fixed_tokens)
    def tokenize_text(token):
        lex = token.lex
        try:
            float(lex)
            return token.transform_to(num)
        except ValueError:
            # TODO: check for string literals
            # (do something like lex[0] == '"' and lex[-1] == '"')
            return token.transform_to(idx)

    tokens = tokenize_text(text)
    if print_tokens:
        pprint_tokens(tokens)
    return tokens
def regex_tokenizer(text, G, skip_whitespaces=True):
    tokens = []
    fixed_tokens = {
        '|': Token('|', pipe),
        '*': Token('*', star),
        '(': Token('(', opar),
        ')': Token(')', cpar),
        'ε': Token('ε', epsilon)
    }
    for char in text:
        if skip_whitespaces and char.isspace():
            continue
        try:
            char_token = fixed_tokens[char]
        except KeyError:
            char_token = Token(char, symbol)
        tokens.append(char_token)
    tokens.append(Token('$', G.EOF))
    return tokens
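For reference, here is how the operator-versus-symbol split behaves on a small input. Plain strings stand in for the grammar terminals (pipe, star, opar, cpar, epsilon, symbol), which in the original are module-level grammar objects; this is only an illustrative toy, not the project's tokenizer.

from collections import namedtuple

Token = namedtuple('Token', ['lex', 'token_type'])

# String stand-ins for the grammar terminals used above.
pipe, star, opar, cpar, epsilon, symbol, eof = '| * ( ) ε symbol $'.split()

fixed_tokens = {
    '|': Token('|', pipe), '*': Token('*', star),
    '(': Token('(', opar), ')': Token(')', cpar), 'ε': Token('ε', epsilon),
}

def toy_regex_tokenizer(text):
    tokens = [fixed_tokens.get(c, Token(c, symbol)) for c in text if not c.isspace()]
    tokens.append(Token('$', eof))
    return tokens

print([t.token_type for t in toy_regex_tokenizer('a*(a|b)')])
# ['symbol', '*', '(', 'symbol', '|', 'symbol', ')', '$']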
def manual_input_app():
    ################
    # Declarations #
    ################
    G = Grammar()
    parsers = {
        'LL(1)': LL1Parser,
        'SLR(1)': SLR1Parser,
        'LR(1)': LR1Parser,
        'LALR(1)': LALR1Parser
    }

    #################
    # Input options #
    #################
    options = ('terminal id', 'terminal id + value', 'terminal id + value + regex')
    option = st.sidebar.selectbox('Entrada de los terminales', options, index=2)

    ###################
    # Parser selector #
    ###################
    parser_type = st.sidebar.selectbox('Seleccione el algoritmo de Parsing',
                                       ('LL(1)', 'SLR(1)', 'LR(1)', 'LALR(1)'),
                                       index=1)

    #################################################
    # Start symbol, non-terminals & terminals input #
    #################################################
    start_symbol = st.sidebar.text_input('Simbolo inicial: ', value=AritmethicStartSymbol)
    input_nonterminals = st.sidebar.text_input('No Terminales :', value=AritmethicNonTerminalsLR)
    input_terminals = st.sidebar.text_input('Terminales :', value=AritmethicTerminals)
    terminals_id, terminals_regex = terminals_input_control(
        option, options, input_terminals)

    ###################
    # Get productions #
    ###################
    input_productions = st.text_area('Producciones :', value=AritmethicProductionsLR)
    nonterminals_variables = ', '.join(input_nonterminals.split())
    terminal_variables = ', '.join(terminals_id[term] for term in input_terminals.split())

    ###################################################
    # Build the instructions to be run through exec() #
    ###################################################
    inst1 = f'{start_symbol} = G.NonTerminal("{start_symbol}", True)'
    if len(input_nonterminals) == 1:
        inst2 = f'{nonterminals_variables} = G.NonTerminal("{input_nonterminals}")'
    else:
        inst2 = f'{nonterminals_variables} = G.NonTerminals("{input_nonterminals}")'
    inst3 = f'{terminal_variables} = G.Terminals("{input_terminals}")'

    ##########
    # exec() #
    ##########
    exec_instructions(G, inst1, inst2, inst3, input_productions)

    ###############
    # Lexer setup #
    ###############
    if terminals_regex:
        table = [(G[t], re) for t, re in terminals_regex.items()] + [
            ('space', ' *'),
        ]
        lexer = Lexer(table, G.EOF)
    else:
        lexer = tokenizer(G, {t.Name: Token(t.Name, t) for t in G.terminals})

    ################
    # Parser setup #
    ################
    ParserClass = parsers[parser_type]
    parser = ParserClass(G)

    ########
    # Save #
    ########
    gName = st.sidebar.text_input("Nombre del archivo")
    if st.sidebar.button("Salvar"):
        try:
            f = open(gName + '.json', 'x')
            s = G.to_json
            json.dump(s, f, indent=4)
            st.sidebar.success(f'Salvado {gName}.json')
        except FileExistsError:
            st.sidebar.error('Ya existe un archivo con ese nombre')

    ############################
    # Regular grammar checking #
    ############################
    re_grammar = RegularGrammar(G)
    if re_grammar.valid:
        st.sidebar.success("Esta Gramatica es Regular")
        dfa = re_grammar.dfa
        regex = re_grammar.regex
        if st.checkbox('Mostrar DFA de la Gramatica'):
            st.graphviz_chart(str(dfa.graph()))
        if st.checkbox('Mostrar la expresion regular'):
            st.latex(regex)

    #######################
    # Display the grammar #
    #######################
    if st.checkbox('Mostrar Gramatica'):
        show_grammar(G)

    ####################
    # Firsts & Follows #
    ####################
    if st.checkbox('Mostrar Firsts & Follows'):
        st.subheader("Firsts :")
        st.dataframe(set_to_dataframe(parser.G, parser.firsts))
        st.subheader("Follows :")
        st.dataframe(set_to_dataframe(parser.G, parser.follows))

    #################
    # Parsing table #
    #################
    if st.checkbox('Mostrar Tabla de Parsing'):
        if parser_type == 'LL(1)':
            st.subheader("Table :")
            st.dataframe(lltable_to_dataframe(parser.table))
        else:
            st.subheader("Action :")
            st.dataframe(lrtable_to_dataframe(parser.action))
            st.subheader("Goto :")
            st.dataframe(lrtable_to_dataframe(parser.goto))

    ################
    # LR automaton #
    ################
    if parser_type != 'LL(1)':
        if st.checkbox('Mostrar Automata LR'):
            st.graphviz_chart(str(parser.automaton.graph()))
        dtree = LRDerivationTree
    else:
        dtree = LLDerivationTree

    ########################
    # Grammar modification #
    ########################
    modify_grammar(G)

    ####################
    # Parsing conflict #
    ####################
    if parser.conflict is not None:
        deal_with_conflict(parser, parser_type)
    else:
        ###################
        # Analyze strings #
        ###################
        text = st.text_input('Introduzca una cadena para analizar', value='')
        if st.button('Analyze'):
            tokens = [t for t in lexer(text) if t.token_type != 'space']
            derivation = parser(tokens)
            st.graphviz_chart(str(dtree(derivation).graph()))
from cmp.utils import Token, tokenizer
from Grammar import get_grammar

G, idx, num, string, ocur, ccur, semi = get_grammar()

fixed_tokens = {
    t.Name: Token(t.Name, t)
    for t in G.terminals if t not in {idx, num, string}
}

@tokenizer(G, fixed_tokens)
def tokenize_text(token):
    lex = token.lex
    try:
        float(lex)
        return token.transform_to(num)
    except ValueError:
        return token.transform_to(idx)

def pprint_tokens(tokens):
    indent = 0
    pending = []
    ret_text = ''
    for token in tokens:
        pending.append(token)
        if token.token_type in {ocur, ccur, semi}:
            if token.token_type == ccur:
                indent -= 1
            # Assumed completion from here on (the original snippet was cut off):
            # flush the pending tokens as one indented line per '{', '}' or ';'.
            line = ' ' * indent + ' '.join(str(t.token_type) for t in pending)
            print(line)
            ret_text += line + '\n'
            pending.clear()
            if token.token_type == ocur:
                indent += 1
    return ret_text
def __init__(self, G):
    self.G = G
    self.fixed_tokens = {
        lex: Token(lex, G[lex]) for lex in '+ - * / ( )'.split()
    }
class PlusNode(UnaryNode):
    @staticmethod
    def operate(value):
        # A+  ==  A concatenated with A*
        return f'({value[0]})+', automata_concatenation(value[1], automata_closure(value[1]))


class QuestionNode(UnaryNode):
    @staticmethod
    def operate(value):
        # A?  ==  A | ε  (the one-state NFA below accepts only ε)
        epsilon = NFA(1, {0}, {}, 0)
        return f'({value[0]})?', automata_union(value[1], epsilon)


fixed_tokens = {
    '*': Token('*', star),
    '(': Token('(', opar),
    ')': Token(')', cpar),
    '|': Token('|', pipe),
    '?': Token('?', qtn),
    '+': Token('+', plus),
    '[': Token('[', obra),
    ']': Token(']', cbra),
    EPSILON: Token(EPSILON, epsilon),
}


def regex_tokenizer(text, G, skip_whitespaces=True):
    tokens = []
    # Assumed completion of the original placeholder: look each character up in
    # the module-level fixed_tokens above, emitting anything else as a plain
    # `symbol` terminal, as the other tokenizers in this collection do.
    for char in text:
        if skip_whitespaces and char.isspace():
            continue
        try:
            tokens.append(fixed_tokens[char])
        except KeyError:
            tokens.append(Token(char, symbol))
    tokens.append(Token('$', G.EOF))
    return tokens
def __call__(self, text):
    errors = []
    return [
        Token(lex, ttype) for lex, ttype in self._tokenize(text, errors)
    ], errors
def __unit_testing_regex_tokenizer():
    G, symbol = grammar_for_regex()

    tokens = _regex_tokenizer(G, symbol, "a*(a|b)*cd|ε")
    assert tokens == [
        Token("a", G[symbol]),
        Token("*", G["*"]),
        Token("(", G["("]),
        Token("a", G[symbol]),
        Token("|", G["|"]),
        Token("b", G[symbol]),
        Token(")", G[")"]),
        Token("*", G["*"]),
        Token("c", G[symbol]),
        Token("d", G[symbol]),
        Token("|", G["|"]),
        Token("ε", G["ε"]),
        Token("$", G.EOF)
    ], "regex tokenizer error in 'a*(a|b)*cd|ε'"

    tokens = _regex_tokenizer(G, symbol, "*")
    assert tokens == [Token("*", G[symbol]), Token("$", G.EOF)], \
        "regex tokenizer error in '*'"
def __call__(self, tokens, errors, finding_conflict=False):
    stack = [0]
    cursor = 0
    output = []
    tokens = [x for x in tokens]

    while cursor < len(tokens):
        state = stack[-1]
        lookahead = tokens[cursor]
        if self.verbose:
            print(stack, '<---||--->', tokens[cursor:])

        # Error detection: there must be an entry for (state, lookahead)
        try:
            action = self.action[state, lookahead.token_type]
            if isinstance(action, tuple):
                action, tag = action
            else:
                return None if not finding_conflict else (state, lookahead, output)
        except KeyError:
            # errors.append(f'Invalid transition ({state},{lookahead}) doesnt exist expected {[x[1] for x in self.action if x[0] == state]}')
            posibles = [x for x in self.action if x[0] == state]
            arg = f"{lookahead.lex[0]}" if lookahead.is_eof else lookahead.lex[0]
            errors.append(SyntacticCoolError(SYNTACTIC_ERROR, arg, token=lookahead))
            # errors.append(f"Invalid transition near '{lookahead.lex[0]}'. Expected: {', '.join([str(x[1]) for x in posibles])}. Line:{lookahead.lex[1] + 1} Column:{lookahead.lex[2] + 1}")
            if len(posibles) == 1 and not lookahead.is_eof:
                # Error recovery: insert the single expected token and keep parsing.
                tokens.insert(
                    cursor + 1,
                    Token((str(posibles[0][1]), lookahead.lex[1], lookahead.lex[2]),
                          posibles[0][1]))
                cursor += 1
                continue
            return None if not finding_conflict else (state, lookahead, output)

        if action == self.SHIFT:
            # Shift case: push the symbol and the destination state.
            stack.append(lookahead.token_type)
            stack.append(tag)
            cursor += 1
        elif action == self.REDUCE:
            # Reduce case: pop the production's right-hand side off the stack.
            for i in range(len(tag.Right)):
                stack.pop()
                top = stack.pop()
                if top != tag.Right[-(i + 1)]:
                    errors.append(
                        f"Productions reduce doesnt match: {top} != {tag.Right[-(i+1)]}")
            index = self.goto[stack[-1], tag.Left]
            stack.append(tag.Left)
            stack.append(index)
            output.append(tag)
        elif action == self.OK:
            # OK case: the input was accepted.
            return output if not finding_conflict else (state, lookahead, output)
        else:
            # Invalid case.
            errors.append(f"Invalid case: {action}")
            return None if not finding_conflict else (state, lookahead, output)

    if cursor == len(tokens):
        errors.append('EOF token missing')
    else:
        errors.append('No valid derivation tree can be built with the given tokens')
def unit_testing():
    G = Grammar()
    E = G.NonTerminal('E', True)
    T, F, X, Y = G.NonTerminals('T F X Y')
    plus, minus, star, div, opar, cpar, num = G.Terminals('+ - * / ( ) num')

    E %= T + X, lambda h, s: s[2], None, lambda h, s: s[1]
    X %= plus + T + X, lambda h, s: s[3], None, None, lambda h, s: s[2] + h[0]
    X %= minus + T + X, lambda h, s: s[3], None, None, lambda h, s: h[0] - s[2]
    X %= G.Epsilon, lambda h, s: h[0]
    T %= F + Y, lambda h, s: s[2], None, lambda h, s: s[1]
    Y %= star + F + Y, lambda h, s: s[3], None, None, lambda h, s: h[0] * s[2]
    Y %= div + F + Y, lambda h, s: s[3], None, None, lambda h, s: h[0] / s[2]
    Y %= G.Epsilon, lambda h, s: h[0]
    F %= num, lambda h, s: float(s[1]), None
    F %= opar + E + cpar, lambda h, s: s[2], None, None, None

    xcool = BasicXCool(G)
    tokens = [num, star, num, star, num, plus, num, star, num, plus, num, plus, num, G.EOF]

    M = _build_parsing_table(G, xcool.firsts, xcool.follows)
    assert M == xcool.table, "Test Error in build_parsing_table"
    print(" - buider table ;) ")

    ####################################################################
    parser = _buid_parsing_func(G, M)
    left_parse, error = parser(tokens)
    assert error == []
    assert left_parse == [
        Production(E, Sentence(T, X)),
        Production(T, Sentence(F, Y)),
        Production(F, Sentence(num)),
        Production(Y, Sentence(star, F, Y)),
        Production(F, Sentence(num)),
        Production(Y, Sentence(star, F, Y)),
        Production(F, Sentence(num)),
        Production(Y, G.Epsilon),
        Production(X, Sentence(plus, T, X)),
        Production(T, Sentence(F, Y)),
        Production(F, Sentence(num)),
        Production(Y, Sentence(star, F, Y)),
        Production(F, Sentence(num)),
        Production(Y, G.Epsilon),
        Production(X, Sentence(plus, T, X)),
        Production(T, Sentence(F, Y)),
        Production(F, Sentence(num)),
        Production(Y, G.Epsilon),
        Production(X, Sentence(plus, T, X)),
        Production(T, Sentence(F, Y)),
        Production(F, Sentence(num)),
        Production(Y, G.Epsilon),
        Production(X, G.Epsilon),
    ], "Test Error in parser_library.LL1.parser"
    print(" - buider func ;) ")

    ###################################################################
    fixed_tokens = {
        '+': Token('+', plus),
        '-': Token('-', minus),
        '*': Token('*', star),
        '/': Token('/', div),
        '(': Token('(', opar),
        ')': Token(')', cpar),
    }

    def tokenize_text(text):
        tokens = []
        for item in text.split():
            try:
                float(item)
                token = Token(item, num)
            except ValueError:
                try:
                    token = fixed_tokens[item]
                except KeyError:
                    raise Exception('Undefined token')
            tokens.append(token)
        eof = Token('$', G.EOF)
        tokens.append(eof)
        return tokens

    text = '5.9 + 4'
    tokens = [Token('5.9', num), Token('+', plus), Token('4', num), Token('$', G.EOF)]
    left_parse, error = parser(tokens)
    assert len(left_parse) == 9 and len(error) == 0, "Test Error in parser func"
    result = _evaluate_parse(left_parse, tokens)
    assert result == 9.9, "Test Error in eval parser"

    text = '1 - 1 - 1'
    tokens = tokenize_text(text)
    left_parse, error = parser(tokens)
    assert len(left_parse) == 13 and len(error) == 0, "Test Error in parser func"
    result = _evaluate_parse(left_parse, tokens)
    assert result == -1, "Test Error in eval parser"

    text = '1 - ( 1 - 1 )'
    tokens = tokenize_text(text)
    left_parse, error = parser(tokens)
    assert len(left_parse) == 18 and len(error) == 0, "Test Error in parser func"
    result = _evaluate_parse(left_parse, tokens)
    assert result == 1, "Test Error in eval parser"
    print(" - method eval ;) ")

    #############################################################
    return "LL1"
def __init__(self, declarations, context=None):
    super().__init__(Token("", "", (0, 0)))  # symbolic initial token
    self.declarations = declarations
    self.context = context
def __init__(self, idx, typex, init_exp=None, token=Token("", "", (0, 0))):
    self.id = idx
    self.type = typex
    self.init_exp = init_exp
    self.token = token
def __call__(self, text):
    return [Token(lex, ttype) for lex, ttype in self._tokenize(text)]
def tokenize_cool_text(grammar, idx, type_id, string, num, data, errors, printing=False):
    # The lexer starts with: lexpos = 0, lineno = 1, last_new_line = 0
    # lexpos: within token rule functions, this points to the first character
    # after the matched text.
    lexer = lex.lex(module=tokens_rules)
    lexer.last_new_line_pos = 0
    lexer.errors = errors

    # Give the lexer some input
    lexer.input(data)

    lessequal = grammar["<="]
    rarrow = grammar["=>"]
    larrow = grammar["<-"]
    fixed_tokens_names = {
        t.Name: (t.Name, t)
        for t in grammar.terminals
        if t not in {idx, type_id, string, num, lessequal, rarrow, larrow}
    }
    fixed_tokens_names["larrow"] = ("<-", larrow)
    fixed_tokens_names["rarrow"] = ("=>", rarrow)
    fixed_tokens_names["lessequal"] = ("<=", lessequal)

    tokens = []
    pos_data = []

    # Tokenize
    while True:
        tok = lexer.token()
        if not tok:
            # Append EOF
            if len(pos_data) > 0:
                last_lineno, last_col = pos_data[-1]
                col = last_col + len(tokens[-1].lex)
            else:
                # Empty program
                last_lineno = 0
                col = -1
            tokens.append(
                Token("$", grammar.EOF, (last_lineno, find_column(data, col))))
            break  # No more input
        else:
            try:
                tval, ttype = fixed_tokens_names[tok.type]
            except KeyError:
                tval = tok.value
                if tok.type == "string":
                    ttype = string
                elif tok.type == "id":
                    ttype = idx
                elif tok.type == "type_id":
                    ttype = type_id
                else:
                    ttype = num
            tokens.append(
                Token(tval, ttype, (tok.lineno, find_column(data, tok.lexpos))))

    if printing:
        pprint_tokens(tokens)
    return tokens
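find_column is called above but not defined in this snippet. A sketch in the spirit of the column-tracking recipe from the PLY documentation is shown below; the 1-based convention is an assumption and may need adjusting to match the project's error reporting.

def find_column(data, lexpos):
    # Column = offset from the last newline before lexpos (1-based here; assumed convention).
    line_start = data.rfind('\n', 0, lexpos) + 1
    return lexpos - line_start + 1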