def test_multiple_for(self):
    line = "530 NEXTI"
    results = tokenize_line(line)
    self.assertTrue(isinstance(results, ProgramLine))
    self.assertEqual(530, results.line)
    self.assertEqual(1, len(results.stmts))
    result = results.stmts[0]
    self.assertEqual(Keywords.NEXT, result.keyword)
    self.assertEqual('I', result.loop_var)

    line = "530 FORI=1TO9:C(I,1)=0:C(I,2)=37:NEXTI"
    results = tokenize_line(line)
    self.assertTrue(isinstance(results, ProgramLine))
    self.assertEqual(530, results.line)
    self.assertEqual(4, len(results.stmts))
    result = results.stmts[1]
    self.assertEqual(Keywords.LET, result.keyword)
    self.assertEqual([lexer_token(0, "num")], result._tokens)
    self.assertEqual("C(I,1)", result._variable)
    result = results.stmts[2]
    self.assertEqual(Keywords.LET, result.keyword)
    self.assertEqual([lexer_token(37, "num")], result._tokens)
    self.assertEqual("C(I,2)", result._variable)
    result = results.stmts[3]
    self.assertEqual(Keywords.NEXT, result.keyword)
    self.assertEqual('I', result.loop_var)
def test_def(self):
    executor = self.runit(['100 DEF FNA(X)=X^2+1'])
    self.assertEqual(1, executor.get_symbol_count())
    # self.assert_value(executor, "FNA", "X^2+1")
    value = executor.get_symbol("FNA", symbol_type=SymbolType.FUNCTION)
    expected = [
        lexer_token(token='X', type='id'),
        lexer_token(token='^', type='op'),
        lexer_token(token=2.0, type='num'),
        lexer_token(token='+', type='op'),
        lexer_token(token=1.0, type='num')
    ]
    self.assertEqual(expected, value)
    AT = executor.get_symbol_type("FNA", SymbolType.FUNCTION)
    self.assertEqual(SymbolType.FUNCTION, AT)
def test_spaces(self):
    stack = []
    tokens = self._lexer.lex(' 10 + 7 ')
    self.assertEqual(3, len(tokens))
    stack.append(tokens[0])
    stack.append(tokens[2])
    binop = get_op(lexer_token("+", "op"))
    answer = binop.eval(stack, op=None)
    self.assertEqual(17, answer.token)
def test_minus(self):
    stack = []
    tokens = self._lexer.lex('10-7')
    self.assertEqual(3, len(tokens))
    stack.append(tokens[0])
    stack.append(tokens[2])
    binop = get_op(lexer_token("-", "op"))
    # op is not needed for this test. It is only used for DEF FNx.
    answer = binop.eval(stack, op=None)
    self.assertEqual(3, answer.token)
def test_token_exp(self):
    multi_exp = "T=INT(RND(1)*20+20)*100:T0=T:T9=25+INT(RND(1)*10):D0=0:E=3000:E0=E"
    line = f"370 {multi_exp}"
    results = tokenize_line(line)
    self.assertTrue(isinstance(results, ProgramLine))
    self.assertEqual(370, results.line)
    self.assertEqual(6, len(results.stmts))
    expect = multi_exp.split(":")
    self.assertEqual(6, len(expect))
    for i in range(len(expect)):
        self.assertEqual(Keywords.LET, results.stmts[i].keyword)
        # self.assertEqual(expect[i], results.stmts[i].args)
    self.assertEqual([lexer_token(3000, "num")], results.stmts[4]._tokens)
def test_lex_vars(self):
    # Check that we can handle variable names that run into keywords,
    # e.g. "YANDQ1" must lex as Y, AND, Q1.
    tokens = self._lexer.lex("X<>YANDQ1<7")
    expected = [
        lexer_token(token='X', type='id'),
        lexer_token(token='<>', type='op'),
        lexer_token(token='Y', type='id'),
        lexer_token(token='AND', type='op'),
        lexer_token(token='Q1', type='id'),
        lexer_token(token='<', type='op'),
        lexer_token(token=7.0, type='num')
    ]
    self.assertEqual(expected, tokens)
def eval(self, stack, *, op):
    """Evaluate a unary operator: pop one operand and apply eval1 to it."""
    self.check_args(stack)
    first = stack.pop()
    answer = self.eval1(first.token, op=op)
    # An operator may declare a fixed return type; otherwise the result
    # keeps the operand's type.
    return_type = self._return_type if self._return_type is not None else first.type
    return lexer_token(answer, return_type)
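# A usage sketch for the unary path (hypothetical values; assumes UNARY_MINUS
# is registered with get_op the same way the binary operators are):
#
#     stack = [lexer_token(7.0, "num")]
#     unary = get_op(lexer_token(UNARY_MINUS, "op"))
#     unary.eval(stack, op=None)   # -> lexer_token(-7.0, "num")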
def eval(self, stack, *, op):
    """Evaluate a binary operator: pop two operands and apply eval2."""
    self.check_args(stack)
    # The second operand is on top of the stack, so it pops first.
    second = stack.pop()
    first = stack.pop()
    answer = self.eval2(first.token, second.token)
    return lexer_token(answer, first.type)
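# A usage sketch, mirroring test_minus above. The pop order matters for
# non-commutative operators, since the top of the stack is the right-hand
# operand:
#
#     stack = [lexer_token(10.0, "num"), lexer_token(7.0, "num")]
#     binop = get_op(lexer_token("-", "op"))
#     binop.eval(stack, op=None)   # -> lexer_token(3.0, "num"), i.e. 10 - 7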
def lex2(self, text):
    token = ""
    index = 0

    def cur():
        assert text is not None
        if index == len(text):
            return None
        return text[index]

    def peek():
        if index + 1 == len(text):
            return None
        return text[index + 1]

    def consume():
        # Get and consume in one operation.
        nonlocal index
        current = text[index]
        index += 1
        return current

    while (c := cur()) is not None:
        if c in LETTERS:
            token = ""
            # Only consume if on the identifier path (letter-digit or letter-$).
            if (peek() is not None and peek() in NUMBERS) or peek() == '$':
                token += consume()
                if cur() is not None and cur() in NUMBERS:
                    token += consume()
                if cur() == '$':
                    token += consume()
                yield lexer_token(token, "id")
                continue
            if peek() is None or peek() not in LETTERS:
                yield lexer_token(consume(), "id")
                continue
            # At this point, we know it's not a variable.
            found = self.scan_for_keyword(TEXT_OPERATORS, text[index:])
            if not found:
                # Can't make an operator from it, so it must be an ID.
                yield lexer_token(consume(), "id")
                continue
            for _ in found:
                consume()
            if found in BOOLEAN_OPERATORS:
                yield lexer_token(found, "op")
            else:
                yield lexer_token(found, "id")
        elif c in OPERATORS:
            first = consume()
            if first == "<" and cur() == ">":
                consume()
                yield lexer_token("<>", "op")
            elif first == "<" and cur() == "=":
                consume()
                yield lexer_token("<=", "op")
            elif first == ">" and cur() == "=":
                consume()
                yield lexer_token(">=", "op")
            else:
                yield lexer_token(first, "op")
        elif c in NUMBERS or c == '.':
            token = ""
            while (c := cur()) is not None and (c in NUMBERS or c == '.'):
                token += consume()
            yield lexer_token(float(token), "num")
        elif c == '"':
            consume()
            token = ""
            while (c := cur()) is not None and c != '"':
                token += consume()
            if cur() != '"':
                raise BasicSyntaxError("No closing quote char.")
            consume()
            yield lexer_token(token, "str")
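# A worked example of the keyword scan above (a sketch; it assumes
# TEXT_OPERATORS contains "AND"). Lexing "YANDQ1<7": the scan at "Y" finds
# no keyword, so "Y" is emitted as an id, and the next scan succeeds:
#
#     self.scan_for_keyword(TEXT_OPERATORS, "ANDQ1<7")   # -> "AND"
#
# so "YANDQ1" lexes as Y, AND, Q1 rather than one long identifier, which is
# exactly what test_lex_vars checks.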
def eval(self, tokens: list[lexer_token], *, symbols=None) -> lexer_token:
    """
    Evaluates an expression, like "2+3*5-A+RND()"

    :param symbols: Symbols (BASIC variables) to use when evaluating the expression
    :param tokens: the incoming list[lexer_token]
    :return: the value of the expression (the token field of the final lexer_token)
    """
    # Import in two places, so the IDE knows it's there.
    from basic_operators import get_op, get_precedence

    # "-" is ambiguous. It can mean subtraction or unary minus.
    # If "-" follows a data item, it's subtraction.
    # If "-" follows an operator, it's unary minus, unless the operator is ")".
    # Why ")"? A ")" closes a complete subexpression, so it acts like a data
    # item: in "(A+B)-C" the "-" is subtraction.
    is_unary_context = True

    assert not isinstance(symbols, dict)
    if symbols is None:
        # Happens during testing. TODO Fix this. No "if test" allowed.
        symbols = SymbolTable()
    if len(tokens) == 0:
        raise BasicSyntaxError("No expression.")

    data_stack = []
    op_stack: list[OP_TOKEN] = []
    token_index = 0
    while token_index < len(tokens):
        current = tokens[token_index]
        if current.type == "op":
            if current.token == "-" and is_unary_context:
                current = lexer_token(UNARY_MINUS, current.type)
            # Apply anything on the stack that has higher precedence.
            # This makes everything left associative. I think that's ok.
            # Might be wrong for exponentiation.
            # This says Visual Basic is left associative for everything:
            # https://docs.microsoft.com/en-us/dotnet/visual-basic/language-reference/operators/operator-precedence
            # This shows left associative exponentiation (they use **, not ^):
            # http://www.quitebasic.com/
            while len(op_stack):
                top = op_stack[-1]
                if top.token != "(" and get_precedence(top) >= get_precedence(current):
                    self.one_op(op_stack, data_stack)
                else:
                    break
            if current.token != ")":
                op_stack.append(OP_TOKEN(current.token, current.type, None, None, symbols=None))
            else:
                assert_syntax(len(op_stack) > 0 and op_stack[-1].token == "(", "Unbalanced parens.")
                op_stack.pop()
            # After ")" we have a complete subexpression, so "-" means subtraction.
            is_unary_context = current.token != ")"
        else:
            if current.type == "id":
                # TODO Problem: We now need to know the SymbolType of a variable
                # to retrieve it, but we don't know it here. Maybe we can defer
                # referencing it until it is used? At that point, we would know
                # array vs. function. I think.
                # I think this works:
                symbol_type = self.get_type_from_name(current, tokens, token_index)
                if not symbols.is_symbol_defined(current.token, symbol_type):
                    raise UndefinedSymbol(F"Undefined variable: '{current.token}'")
                symbol_value = symbols.get_symbol(current.token, symbol_type)
                symbol_type2 = symbols.get_symbol_type(current.token, symbol_type)
                # Changed the way symbol tables work. Check that we are still consistent.
                assert symbol_type == symbol_type2
                if symbol_type == SymbolType.VARIABLE:
                    if current.token.endswith("$"):
                        data_stack.append(lexer_token(symbol_value, "str"))
                    else:
                        data_stack.append(lexer_token(symbol_value, "num"))
                elif symbol_type == SymbolType.FUNCTION:
                    # Handle functions as operators, with lower priority than "("
                    # but higher than everything else. So don't append this to the
                    # data stack; append it to the op stack as a function.
                    arg = symbols.get_symbol_arg(current.token, SymbolType.FUNCTION)
                    op_stack.append(OP_TOKEN(current.token, SymbolType.FUNCTION, arg, symbol_value, symbols=symbols))
                else:
                    # Array access.
                    arg = current.token
                    op_stack.append(OP_TOKEN(ARRAY_ACCESS, "array_access", arg, None, symbols=symbols))
            else:
                data_stack.append(current)
            is_unary_context = False
        token_index += 1

    # Apply anything left on the stack.
    while len(op_stack):
        self.one_op(op_stack, data_stack)
    assert_syntax(len(op_stack) == 0, "Expression not completed.")
    assert_syntax(len(data_stack) == 1, "Data not consumed.")

    return data_stack[0].token
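# A worked trace of the precedence loop above for "2+3*5" (a sketch):
#
#     token   data_stack      op_stack    action
#     2       [2]             []          operand: push
#     +       [2]             [+]         op_stack empty, just push
#     3       [2, 3]          [+]         operand: push
#     *       [2, 3]          [+, *]      "+" outranked by "*", so push
#     5       [2, 3, 5]       [+, *]      operand: push
#     (end)   [2, 15] -> [17] []          drain: one_op applies "*", then "+"
#
# Left associativity comes from the ">=" in the precedence comparison:
# an equal-precedence operator already on the stack is applied before the
# new one is pushed.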
class Lexer:
    def __init__(self):
        pass

    def scan_for_keyword(self, array, text):
        """
        Find an element of array that is a prefix of text.

        :param array: The candidate keywords.
        :param text: The text to scan.
        :return: The matching keyword, or None.
        """
        match = ""
        for i, c in enumerate(text):
            match += c
            potentials = [op for op in array if i < len(op) and op[i] == match[i]]
            if not potentials:
                return None
            for p in potentials:
                if i + 1 == len(p):
                    return p
            array = potentials
        return None

    def lex(self, text):
        tokens = [token for token in self.lex2(text)]
        return tokens

    def lex2(self, text):
        token = ""
        index = 0

        def cur():
            assert text is not None
            if index == len(text):
                return None
            return text[index]

        def peek():
            if index + 1 == len(text):
                return None
            return text[index + 1]

        def consume():
            # Get and consume in one operation.
            nonlocal index
            current = text[index]
            index += 1
            return current

        ST_ANY = 1
        ST_REF = 2      # VAR, FUNC, ...
        ST_INT = 3
        ST_FLOAT = 4
        ST_STRING = 5

        state = ST_ANY
        while (c := cur()) is not None:
            assert state
            if state == ST_ANY:
                if c == ' ' or c == '\t':
                    consume()   # Ignore unquoted whitespace.
                elif c in LETTERS:
                    token = consume()
                    state = ST_REF
                elif c in NUMBERS:
                    token = consume()
                    state = ST_INT
                elif c == '.':
                    # Number starts with ".", like ".5"
                    token = "0" + consume()
                    state = ST_FLOAT
                elif c in OP_FIRST:
                    p = peek()
                    if c not in OP_TWO_FIRST:
                        consume()
                        yield lexer_token(c, "op")
                    elif p is not None and c + p in OP_TWO:
                        consume()
                        consume()
                        yield lexer_token(c + p, "op")
                    else:
                        consume()
                        yield lexer_token(c, "op")
                    token = ""
                elif c == '"':
                    consume()
                    state = ST_STRING
                    token = ""
                else:
                    raise BasicSyntaxError(F"Unexpected char {c} in state {state}")
            elif state == ST_REF:
                if c in NUMBERS:
                    # Need to check for A1$
                    token += consume()
                    yield lexer_token(token, "id")
                    token = ""
                    state = ST_ANY
                elif c in LETTERS:
                    token += consume()
                else:
                    if token in BOOLEAN_OPERATORS:
                        yield lexer_token(token, "op")
                    else:
                        yield lexer_token(token, "id")
                    token = ""
                    state = ST_ANY
            elif state == ST_INT:
                if c in NUMBERS:
                    token += consume()
                elif c == '.':
                    token += consume()
                    state = ST_FLOAT
                else:
                    yield lexer_token(float(token), "num")
                    token = ""
                    state = ST_ANY
            elif state == ST_FLOAT:
                if c in NUMBERS:
                    token += consume()
                else:
                    yield lexer_token(float(token), "num")
                    token = ""
                    state = ST_ANY
            elif state == ST_STRING:
                if c == '"':
                    consume()
                    yield lexer_token(token, "str")
                    token = ""
                    state = ST_ANY
                else:
                    token += consume()
                    if len(token) > 65536:
                        raise BasicSyntaxError("String too long (> 65536).")

        # Check for tokens in progress at end of line.
        if state == ST_REF:
            if token in BOOLEAN_OPERATORS:
                yield lexer_token(token, "op")
            else:
                yield lexer_token(token, "id")
        elif state == ST_INT:
            # float, for consistency with the mid-line case above.
            yield lexer_token(float(token), "num")
        elif state == ST_FLOAT:
            yield lexer_token(float(token), "num")
        elif state == ST_STRING:
            raise BasicSyntaxError("End of line in string.")
        return
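# A usage sketch (mirrors test_minus above):
#
#     lexer = Lexer()
#     lexer.lex('10-7')
#     # -> [lexer_token(10.0, 'num'), lexer_token('-', 'op'),
#     #     lexer_token(7.0, 'num')]
#
# Note that this state-machine lex2 keeps "YANDQ1" as one id, since ST_REF
# accumulates the whole letter run; the keyword splitting that test_lex_vars
# checks relies on the scan_for_keyword variant of lex2 above.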