def test_throws_if_next_invoked_while_status_is_FINISHED(self):
    mock_func = Mock()
    tokenizer = Tokenizer('')
    tokenizer._generator = mock_func
    tokenizer._status = Tokenizer.EStatus.FINISHED
    with self.assertRaisesRegex(TokenizerError, '(f|F)inished'):
        tokenizer.next()
    # the underlying generator must never be invoked once the tokenizer
    # has reached the FINISHED status
    self.assertFalse(mock_func.called)

def test_throws_if_string_constant_has_unescaped_double_quote_within(self):
    filename = TEST_FILES / 'UnescapedDoubleQuotesInString.jack'
    tokenizer = Tokenizer(filename)
    line = 5
    character = 15
    # consume the 16 valid tokens that precede the malformed string constant
    for _ in range(16):
        tokenizer.next()
    with self.assertRaisesRegex(
            TokenizerError,
            f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
        tokenizer.next()

def test_throws_if_string_constant_has_missing_end_double_quote(self):
    filename = TEST_FILES / 'MalformedString.jack'
    tokenizer = Tokenizer(filename)
    line = 5
    character = 15
    # consume the 16 valid tokens that precede the unterminated string
    for _ in range(16):
        tokenizer.next()
    with self.assertRaisesRegex(
            TokenizerError,
            f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
        tokenizer.next()

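# The two position tests above share a skip-then-assert shape; a hypothetical
# helper method like this sketch (not part of the original suite) could
# express it once:
def _assert_error_position(self, filename, valid_tokens, line, character):
    """Consume the valid tokens, then assert that the next call fails with
    an error message naming the expected line and character."""
    tokenizer = Tokenizer(filename)
    for _ in range(valid_tokens):
        tokenizer.next()
    with self.assertRaisesRegex(
            TokenizerError,
            f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
        tokenizer.next()
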
def test_returns_tokens_until_exhausted(self):
    expected = [
        Token('class', 'keyword', 1, 1),
        Token('CorrectSyntax', 'identifier', 1, 7),
        Token('{', 'symbol', 1, 21),
        Token('field', 'keyword', 2, 3),
        Token('String', 'identifier', 2, 9),
        Token('bar', 'identifier', 2, 16),
        Token(';', 'symbol', 2, 19),
        Token('constructor', 'keyword', 4, 3),
        Token('CorrectSyntax', 'identifier', 4, 15),
        Token('new', 'identifier', 4, 29),
        Token('(', 'symbol', 4, 32),
        Token(')', 'symbol', 4, 33),
        Token('{', 'symbol', 4, 35),
        Token('let', 'keyword', 5, 5),
        Token('bar', 'identifier', 5, 9),
        Token('=', 'symbol', 5, 13),
        Token('Hello world!', 'stringConst', 5, 15),
        Token(';', 'symbol', 5, 29),
        Token('return', 'keyword', 6, 5),
        Token('this', 'keyword', 6, 12),
        Token(';', 'symbol', 6, 16),
        Token('}', 'symbol', 7, 3),
        Token('method', 'keyword', 9, 3),
        Token('void', 'keyword', 9, 10),
        Token('greetings', 'identifier', 9, 15),
        Token('(', 'symbol', 9, 24),
        Token(')', 'symbol', 9, 25),
        Token('{', 'symbol', 9, 27),
        Token('do', 'keyword', 10, 5),
        Token('Output', 'identifier', 10, 8),
        Token('.', 'symbol', 10, 14),
        Token('printString', 'identifier', 10, 15),
        Token('(', 'symbol', 10, 26),
        Token('bar', 'identifier', 10, 27),
        Token(')', 'symbol', 10, 30),
        Token(';', 'symbol', 10, 31),
        Token('return', 'keyword', 11, 5),
        Token(';', 'symbol', 11, 11),
        Token('}', 'symbol', 12, 3),
        Token('}', 'symbol', 13, 1)
    ]
    filename = TEST_FILES / 'CorrectSyntax.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)

def test_skips_everything_inbetween_multiline_comment(self):
    expected = [
        Token('let', 'keyword', 5, 5),
        Token('foo', 'identifier', 5, 9),
        Token('=', 'symbol', 5, 13),
        Token('5', 'intConst', 5, 15),
        Token(';', 'symbol', 5, 16)
    ]
    filename = TEST_FILES / 'IgnoreMultilineComment.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)

def test_skips_rest_of_line_after_line_comment(self):
    expected = [
        Token('let', 'keyword', 1, 1),
        Token('foo', 'identifier', 1, 5),
        Token('=', 'symbol', 1, 9),
        Token('5', 'intConst', 1, 11),
        Token(';', 'symbol', 1, 12)
    ]
    filename = TEST_FILES / 'IgnoreLineComment.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)

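# The three tests above drain the tokenizer with the same loop; a hypothetical
# helper method like this sketch (not part of the original suite) could factor
# it out:
def _collect_tokens(self, tokenizer):
    """Advance the tokenizer until finished(), collecting the tokens it
    yields along the way."""
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    return tokens
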
# Token classes, AST node classes, the OPERATORS/PRECEDENCE tables, and the
# error types used below come from the project's own modules (those imports
# are omitted in this excerpt); OrderedDict is from the standard library.
from collections import OrderedDict


class Parser:
    def __init__(self, text, source_file=None):
        self.tokenizer = Tokenizer(text, source_file)

    def parse(self, native_function_declarations=None):
        expression = self.expression()
        if native_function_declarations:
            transform_lvalues(expression, native_function_declarations)
        return expression

    # recursive descent parse methods (organized alphabetically)

    def arguments(self):
        self.__expect(SymbolToken('('))
        args = []
        token = self.__peek()
        if not token.equals(SymbolToken(')')):
            exp = self.expression()
            args.append(exp)
            while self.__accept_and_consume(SymbolToken(',')):
                exp = self.expression()
                args.append(exp)
        self.__expect(SymbolToken(')'))
        return args

    def array(self):
        lvalue = self.lvalue()
        return self.array_from_lvalue(lvalue)

    def array_from_lvalue(self, lvalue):
        assert isinstance(lvalue, LValue)
        type_name = lvalue.name
        next_lvalue = lvalue.next
        assert isinstance(next_lvalue, ArrayLValue)
        exp1 = next_lvalue.expression
        exp2 = self.expression()
        return ArrayCreation(TypeId(type_name), exp1, exp2)

    def array_lvalue(self):
        exp = self.expression()
        self.__expect(SymbolToken(']'))
        next_lvalue = self.lvalue_next()
        return ArrayLValue(exp, next_lvalue)

    def declaration(self):
        if self.__accept_type(KeywordToken):
            token = self.__peek()
            if token.value == 'type':
                return self.type_declaration()
            elif token.value == 'var':
                return self.variable_declaration()
            elif token.value == 'function':
                return self.function_declaration()
            elif token.value == 'import':
                return self.import_declaration()
            else:
                raise ExpectationError(
                    'keyword in {type, var, function, import}', token)
        else:
            return None

    def declarations(self):
        declarations = []
        while self.is_declaration():
            declaration = self.declaration()
            declarations.append(declaration)
        return declarations

    def expression(self):
        exp = self.expression_without_precedence()
        if exp is None:
            raise ParseError('Unable to parse', self.__peek())
        return self.expression_with_precedence(exp)

    def expressions(self):
        # note that though Dr. Appel's specification admits empty lists of
        # expressions, I restrict this to at least one expression to avoid
        # exception handling
        expressions = [self.expression()]
        while self.__accept_and_consume(SymbolToken(';')):
            expressions.append(self.expression())
        return expressions

    def expression_with_precedence(self, left, precedence=0):
        """See https://en.wikipedia.org/wiki/Operator-precedence_parser"""
        token = self.__peek()
        while self.is_operator(token) and self.precedence(token) >= precedence:
            self.__next()  # consume operator
            operation = token.value
            inner_precedence = PRECEDENCE[token.value]
            right = self.expression_without_precedence()
            token = self.__peek()
            # NB: the >= comparison means operators of equal precedence
            # group to the right
            while (self.is_operator(token)
                   and self.precedence(token) >= inner_precedence):
                right = self.expression_with_precedence(
                    right, PRECEDENCE[token.value])
                token = self.__peek()
            left = self.operation(operation, left, right)
        return left

    def expression_without_precedence(self):
        if self.__accept_and_consume(KeywordToken('nil')):
            return NilValue()
        elif self.__accept_type(NumberToken):
            token = self.__next()
            return IntegerValue.from_string(token.value)
        elif self.__accept_and_consume(SymbolToken('-')):
            token = self.__next()
            return IntegerValue.from_string('-' + token.value)
        elif self.__accept_type(StringToken):
            token = self.__next()
            return StringValue(token.value)
        elif self.__accept(SymbolToken('(')):
            return self.sequence()
        elif self.__accept_type(IdentifierToken):
            return self.id_started()
        elif self.__accept(KeywordToken('if')):
            return self.if_then()
        elif self.__accept(KeywordToken('while')):
            return self.while_do()
        elif self.__accept(KeywordToken('for')):
            return self.for_do()
        elif self.__accept_and_consume(KeywordToken('break')):
            return Break()
        elif self.__accept(KeywordToken('let')):
            return self.let()
        elif self.__accept(KeywordToken('type')):
            return self.type_declaration()
        elif self.__accept(KeywordToken('var')):
            return self.variable_declaration()
        elif self.__accept(KeywordToken('function')):
            return self.function_declaration()
        else:
            return None

    def for_do(self):
        self.__expect(KeywordToken('for'))
        var = self.__expect_type(IdentifierToken)
        self.__expect(SymbolToken(':='))
        start = self.expression()
        self.__expect(KeywordToken('to'))
        end = self.expression()
        self.__expect(KeywordToken('do'))
        body = self.expression()
        return For(var.value, start, end, body)

    def function_call(self):
        function_id = self.id()
        args = self.arguments()
        return FunctionCall(function_id, args)

    def function_declaration(self):
        self.__expect(KeywordToken('function'))
        function_name = self.id()
        self.__expect(SymbolToken('('))
        parameters = self.parameters()
        self.__expect(SymbolToken(')'))
        return_type = None
        if self.__accept_and_consume(SymbolToken(':')):
            return_type = self.type()
        self.__expect(SymbolToken('='))
        body = self.expression()
        return FunctionDeclaration(function_name, parameters, return_type,
                                   body)

    def id(self):
        token = self.__expect_type(IdentifierToken)
        return token.value

    def id_field(self):
        field_name = self.id()
        self.__expect(SymbolToken('='))
        exp = self.expression()
        return field_name, exp

    def id_started(self):
        """An ID has been peeked above, peek further..."""
        if self.__accept(SymbolToken('{'), self.__peek(1)):
            return self.record()
        elif self.__accept(SymbolToken('('), self.__peek(1)):
            return self.function_call()
        else:
            lvalue = self.lvalue()
            return self.lvalue_started(lvalue)

    def if_then(self):
        self.__expect(KeywordToken('if'))
        condition = self.expression()
        self.__expect(KeywordToken('then'))
        exp1 = self.expression()
        exp2 = None
        if self.__accept_and_consume(KeywordToken('else')):
            exp2 = self.expression()
        return If(condition, exp1, exp2)

    def import_declaration(self):
        raise NotImplementedError

    def is_declaration(self):
        token = self.__peek()
        return isinstance(token, KeywordToken) and token.value in [
            'type', 'var', 'function', 'import'
        ]

    def is_operator(self, token):
        return isinstance(token, SymbolToken) and token.value in PRECEDENCE

    def let(self):
        self.__expect(KeywordToken('let'))
        decs = self.declarations()
        self.__expect(KeywordToken('in'))
        if not self.__accept(KeywordToken('end')):
            exps = self.expressions()
        else:
            exps = []
        self.__expect(KeywordToken('end'))
        return Let(decs, exps)

    def lvalue(self):
        lvalue_name = self.id()
        next_lvalue = self.lvalue_next()
        return LValue(lvalue_name, next_lvalue)

    def lvalue_next(self):
        next_lvalue = None
        if self.__accept_and_consume(SymbolToken('.')):
            next_lvalue = self.record_lvalue()
        elif self.__accept_and_consume(SymbolToken('[')):
            next_lvalue = self.array_lvalue()
        return next_lvalue

    def lvalue_started(self, lvalue):
        if self.__accept_and_consume(SymbolToken(':=')):
            exp = self.expression()
            return Assign(lvalue, exp)
        elif self.__accept_and_consume(KeywordToken('of')):
            return self.array_from_lvalue(lvalue)
        else:
            return lvalue

    def operation(self, operation, left, right):
        operator_class = OPERATORS[operation]
        return operator_class(left, right)

    def parameters(self):
        if self.__accept_type(IdentifierToken):
            parameters = []
            name, type_id = self.type_field()
            parameters.append(FunctionParameter(name, type_id))
            while self.__accept_and_consume(SymbolToken(',')):
                name, type_id = self.type_field()
                parameters.append(FunctionParameter(name, type_id))
            return parameters
        else:
            return []

    def precedence(self, token):
        return PRECEDENCE[token.value]

    def record(self):
        type_token = self.__expect_type(IdentifierToken)
        self.__expect(SymbolToken('{'))
        fields = OrderedDict()
        while self.__accept_type(IdentifierToken):
            field_name, exp = self.id_field()
            fields[field_name] = exp
            token = self.__next()
            if self.__accept(SymbolToken(','), token):
                pass
            elif self.__accept(SymbolToken('}'), token):
                break
            else:
                raise ParseError('Expected either , or }', token)
        # TODO possibility for garbage after ', ...'
        return RecordCreation(TypeId(type_token.value), fields)

    def record_lvalue(self):
        lvalue_name = self.__expect_type(IdentifierToken)
        next_lvalue = self.lvalue_next()
        return RecordLValue(lvalue_name.value, next_lvalue)

    def sequence(self):
        exps = []
        self.__expect(SymbolToken('('))
        if not self.__peek().equals(SymbolToken(')')):
            exp = self.expression()
            exps.append(exp)
            while self.__accept_and_consume(SymbolToken(';')):
                exp = self.expression()
                exps.append(exp)
        self.__expect(SymbolToken(')'))
        return Sequence(exps)

    def type(self):
        token = self.__next()
        if self.__accept_type(IdentifierToken, token):
            return TypeId(token.value)
        elif self.__accept(SymbolToken('{'), token):
            type_fields = self.type_fields()
            self.__expect(SymbolToken('}'))
            return RecordType(type_fields)
        elif self.__accept(KeywordToken('array'), token):
            self.__expect(KeywordToken('of'))
            type_name = self.__expect_type(IdentifierToken)
            return ArrayType(type_name.value)
        else:
            raise ExpectationError('a type definition', token)

    def type_declaration(self):
        self.__expect(KeywordToken('type'))
        type_name = self.id()
        self.__expect(SymbolToken('='))
        ty = self.type()
        return TypeDeclaration(type_name, ty)

    def type_fields(self):
        type_fields = OrderedDict()
        if self.__accept_type(IdentifierToken):
            name, type_id = self.type_field()
            type_fields[name] = type_id
            while self.__accept_and_consume(SymbolToken(',')):
                name, type_id = self.type_field()
                type_fields[name] = type_id
        return type_fields

    def type_field(self):
        field_name = self.id()
        self.__expect(SymbolToken(':'))
        type_id = self.type_id()
        return field_name, type_id

    def type_id(self):
        type_id = self.__expect_type(IdentifierToken)
        return TypeId(type_id.value)

    def variable_declaration(self):
        self.__expect(KeywordToken('var'))
        name = self.id()
        type_id = None
        if self.__accept_and_consume(SymbolToken(':')):
            type_id = self.type()
        self.__expect(SymbolToken(':='))
        exp = self.expression()
        return VariableDeclaration(name, type_id, exp)

    def while_do(self):
        self.__expect(KeywordToken('while'))
        condition = self.expression()
        self.__expect(KeywordToken('do'))
        body = self.expression()
        return While(condition, body)

    # navigation methods

    def __peek(self, index=0):
        """Peek at the next token"""
        return self.tokenizer.peek(index)

    def __next(self):
        """Consume and return the next token"""
        return self.tokenizer.next()

    def __accept(self, expected, token=None):
        """Check whether the given token (or the next peeked token, if none
        is passed) equals the expected token"""
        token = token or self.tokenizer.peek()
        return expected.equals(token)

    def __accept_type(self, expected_type, token=None):
        """Check whether the given token (or the next peeked token, if none
        is passed) is an instance of the expected token type"""
        # assert isinstance(expected_type, type)
        token = token or self.tokenizer.peek()
        return isinstance(token, expected_type)

    def __accept_and_consume(self, expected):
        """Check whether the next token equals the expected token; if it
        does, consume it"""
        accepted = self.__accept(expected)
        if accepted:
            self.__next()
        return accepted

    def __expect(self, expected, token=None):
        """Demand that the given token (or the next consumed token, if none
        is passed) equals the expected token and raise an error otherwise"""
        token = token or self.__next()
        if expected.equals(token):
            return token
        else:
            raise ExpectationError(expected.to_string(), token)

    def __expect_type(self, expected_type, token=None):
        """Demand that the given token (or the next consumed token, if none
        is passed) is an instance of the expected type and raise an error
        otherwise"""
        # assert isinstance(expected_type, type)
        token = token or self.__next()
        if isinstance(token, expected_type):
            return token
        else:
            raise ExpectationError(expected_type.__name__, token)

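# A minimal usage sketch, not part of the class above. It assumes the AST
# node classes print readably and that the PRECEDENCE table gives '*' higher
# precedence than '+', as in standard Tiger.
if __name__ == '__main__':
    # '1 + 2 * 3' parses with the multiplication nested under the addition,
    # because expression_with_precedence climbs by operator precedence
    print(Parser('1 + 2 * 3').parse())

    # declarations and control flow go through the same entry point
    print(Parser('let var x := 5 in x + 1 end').parse())
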
def test_next_method_returns_an_identifier_token(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter(['fooBar'])
    token = tokenizer.next()
    expected = Token('fooBar', 'identifier', 1, 1)
    self.assertEqual(token, expected)

def test_next_method_returns_a_string_token(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter([r'"\"hello world,\" said the man"'])
    token = tokenizer.next()
    # the surrounding quotes are stripped and the escaped quotes resolved
    expected = Token('"hello world," said the man', 'stringConst', 1, 1)
    self.assertEqual(token, expected)

def test_next_method_returns_an_integer_token(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter(['700'])
    token = tokenizer.next()
    expected = Token('700', 'intConst', 1, 1)
    self.assertEqual(token, expected)

def test_next_method_returns_a_keyword_token(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter(['class'])
    token = tokenizer.next()
    expected = Token('class', 'keyword', 1, 1)
    self.assertEqual(token, expected)

def test_next_method_returns_a_symbol_token(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter([';'])
    token = tokenizer.next()
    expected = Token(';', 'symbol', 1, 1)
    self.assertEqual(token, expected)

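# The classification tests above share one shape; a table-driven variant using
# unittest's subTest (a sketch, not part of the original suite) could cover
# the simple categories in one method. The string case is left out because
# its expected value differs from the raw lexeme.
def test_next_method_classifies_tokens(self):
    cases = [
        ('fooBar', 'identifier'),
        ('700', 'intConst'),
        ('class', 'keyword'),
        (';', 'symbol'),
    ]
    for lexeme, category in cases:
        with self.subTest(lexeme=lexeme):
            tokenizer = Tokenizer('')
            tokenizer._generator = iter([lexeme])
            self.assertEqual(tokenizer.next(),
                             Token(lexeme, category, 1, 1))
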
def test_next_method_throws_if_token_does_not_fit_any_category(self):
    tokenizer = Tokenizer('')
    tokenizer._generator = iter(['bad value'])
    with self.assertRaisesRegex(TokenizerError,
                                'Jack file.+Token not recognized'):
        tokenizer.next()

def test_throws_if_file_does_not_exist(self):
    filename = TEST_FILES / 'does_not_exist.jack'
    tokenizer = Tokenizer(filename)
    # the file is opened lazily, so the error surfaces on next(), not in
    # the constructor
    with self.assertRaises(FileNotFoundError):
        tokenizer.next()