from unittest import TestCase

# NOTE: SymbolTable, TokenFactory, Position, Lexer and Source are
# project-local classes; their import paths depend on the package layout
# and are therefore not spelled out here.


class LexerTests(TestCase):

    def setUp(self):
        self._symbols = SymbolTable.default()
        self._tokens = TokenFactory(self._symbols)
        self._text = None

    def test_recognises_a_single_character(self):
        self._text = "b"
        self._verify_tokens(self._tokens.character(Position(1, 1), "b"))

    def test_recognises_a_word(self):
        self._text = "hello"
        self._verify_tokens(
            self._tokens.character(Position(1, 1), "h"),
            self._tokens.character(Position(1, 2), "e"),
            self._tokens.character(Position(1, 3), "l"),
            self._tokens.character(Position(1, 4), "l"),
            self._tokens.character(Position(1, 5), "o"))

    def test_recognises_a_single_command(self):
        self._text = r"\myMacro"
        self._verify_tokens(self._tokens.command(Position(1, 1), r"\myMacro"))

    def test_recognises_a_single_special_character_command(self):
        self._text = r"\%"
        self._verify_tokens(self._tokens.command(Position(1, 1), r"\%"))

    def test_recognises_sequences_of_single_character_command(self):
        self._text = r"\%\$\\"
        self._verify_tokens(
            self._tokens.command(Position(1, 1), r"\%"),
            self._tokens.command(Position(1, 3), r"\$"),
            self._tokens.command(Position(1, 5), r"\\"))

    def test_recognises_two_commands(self):
        self._text = r"\def\foo"
        self._verify_tokens(
            self._tokens.command(Position(1, 1), r"\def"),
            self._tokens.command(Position(1, 5), r"\foo"))

    def test_recognises_two_commands_separated_by_white_spaces(self):
        self._text = "\\def \t \\foo"
        self._verify_tokens(
            self._tokens.command(Position(1, 1), r"\def"),
            self._tokens.white_space(Position(1, 5), " \t "),
            self._tokens.command(Position(1, 8), r"\foo"))

    def test_recognises_a_comment(self):
        self._text = "%This is a comment\n\\def\\foo"
        self._verify_tokens(
            self._tokens.comment(Position(1, 1), "%This is a comment"),
            self._tokens.new_line(Position(1, 1)),
            self._tokens.command(Position(2, 1), r"\def"),
            self._tokens.command(Position(2, 5), r"\foo"))

    def test_recognises_an_opening_group(self):
        self._text = "{"
        self._verify_tokens(self._tokens.begin_group(Position(1, 1)))

    def test_recognises_an_ending_group(self):
        self._text = "}"
        self._verify_tokens(self._tokens.end_group(Position(1, 1)))

    def test_recognises_a_parameter(self):
        self._text = r"\def#1"
        self._verify_tokens(
            self._tokens.command(Position(1, 1), r"\def"),
            self._tokens.parameter(Position(1, 5), "#1"))

    def test_recognises_a_complete_macro_definition(self):
        self._text = "\\def\\point#1#2{(#2,#1)}"
        self._verify_tokens(
            self._tokens.command(Position(1, 1), r"\def"),
            self._tokens.command(Position(1, 5), r"\point"),
            self._tokens.parameter(Position(1, 11), "#1"),
            self._tokens.parameter(Position(1, 13), "#2"),
            self._tokens.begin_group(Position(1, 15), "{"),
            self._tokens.others(Position(1, 16), "("),
            self._tokens.parameter(Position(1, 17), "#2"),
            self._tokens.others(Position(1, 19), ","),
            self._tokens.parameter(Position(1, 20), "#1"),
            self._tokens.others(Position(1, 22), ")"),
            self._tokens.end_group(Position(1, 23), "}"))

    def test_recognises_math_mode(self):
        self._text = "$"
        self._verify_tokens(self._tokens.math(Position(1, 1)))

    def test_recognises_superscript(self):
        self._text = "^"
        self._verify_tokens(self._tokens.superscript(Position(1, 1)))

    def test_recognises_subscript(self):
        self._text = "_"
        self._verify_tokens(self._tokens.subscript(Position(1, 1)))

    def test_recognises_non_breaking_space(self):
        self._text = "~"
        self._verify_tokens(self._tokens.non_breaking_space(Position(1, 1)))

    def _verify_tokens(self, *expected_tokens):
        self.assertListEqual(
            list(expected_tokens),
            list(Lexer(self._symbols, Source(self._text))))
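# Both the tests above and the Lexer below rely on a Position value type:
# the tests build positions directly as Position(line, column), while the
# lexer advances them with next_line() and next_character(). That class is
# not part of this listing, so what follows is only a minimal sketch of the
# interface actually used. It assumes that next_line() moves to column 0 of
# the next line (so that the first character taken there lands in column 1)
# and that the third constructor argument, seen in Lexer._reset as
# Position(1, 0, self._source.name), is an optional source name.

class Position:

    def __init__(self, line, column, source=None):
        self._line = line
        self._column = column
        self._source = source  # e.g., a file name; optional

    def next_line(self):
        # Column 0, so the first character taken on the new line is column 1
        return Position(self._line + 1, 0, self._source)

    def next_character(self):
        return Position(self._line, self._column + 1, self._source)

    def __eq__(self, other):
        return isinstance(other, Position) \
            and self._line == other._line \
            and self._column == other._column

    def __repr__(self):
        return "(%d, %d)" % (self._line, self._column)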
class Lexer:
    """
    Scan a stream of characters and yield a stream of tokens.

    The lexer defines a handler for each category of symbol. Handlers are
    selected by reflection: the handler for a category must be named
    "_read_<category>" (lower case).
    """

    def __init__(self, symbols, source):
        self._source = source
        self._symbols = symbols
        self._tokens = TokenFactory(self._symbols)
        self._reset()

    def _reset(self):
        self._position = Position(1, 0, self._source.name)
        self._input = Stream(iter(self._source.content), self._on_take)

    def _on_take(self, character):
        if character in self._symbols.NEW_LINE:
            self._position = self._position.next_line()
        else:
            self._position = self._position.next_character()

    @property
    def position(self):
        return self._position

    def _take(self):
        return self._input.take()

    @property
    def _next(self):
        return self._input.look_ahead()

    def __iter__(self):
        return self

    def __next__(self):
        if self._next is None:
            raise StopIteration()
        return self._one_token()

    def _one_token(self):
        handler = self._handler_for(self._symbols.category_of(self._next))
        return handler()

    def _handler_for(self, category):
        handler_name = "_read_" + category.name.lower()
        handler = getattr(self, handler_name, None)
        assert handler, "Lexer has no handler for '%s' symbols" % category.name
        return handler

    def _read_character(self):
        character = self._take()
        return self._tokens.character(self._position, character)

    def _read_control(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.CONTROL
        if self._next not in self._symbols.CHARACTER:
            # Single-character commands such as \% or \\
            name = self._take()
        else:
            name = self._take_while(lambda c: c in self._symbols.CHARACTER)
        return self._tokens.command(location, marker + name)

    def _take_while(self, predicate):
        return "".join(self._input.take_while(predicate))

    def _read_comment(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.COMMENT
        text = self._take_while(lambda c: c not in self._symbols.NEW_LINE)
        return self._tokens.comment(location, marker + text)

    def _read_white_spaces(self):
        marker = self._take()
        location = self._position
        spaces = self._take_while(lambda c: c in self._symbols.WHITE_SPACES)
        return self._tokens.white_space(location, marker + spaces)

    def _read_new_line(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.NEW_LINE
        return self._tokens.new_line(location, marker)

    def _read_begin_group(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.BEGIN_GROUP
        return self._tokens.begin_group(location, marker)

    def _read_end_group(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.END_GROUP
        return self._tokens.end_group(location, marker)

    def _read_parameter(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.PARAMETER
        text = marker + self._take_while(lambda c: c.isdigit())
        return self._tokens.parameter(location, text)

    def _read_math(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.MATH
        return self._tokens.math(location)

    def _read_superscript(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.SUPERSCRIPT
        return self._tokens.superscript(location, marker)

    def _read_subscript(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.SUBSCRIPT
        return self._tokens.subscript(location, marker)

    def _read_non_breaking_space(self):
        marker = self._take()
        location = self._position
        assert marker in self._symbols.NON_BREAKING_SPACE
        return self._tokens.non_breaking_space(location, marker)

    def _read_others(self):
        marker = self._take()
        location = self._position
        # assert marker in self._symbols.OTHERS
        return self._tokens.others(location, marker)
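# The lexer delegates buffering to a Stream that offers one-character
# look-ahead (look_ahead, take, take_while) and notifies an observer each
# time a character is consumed, which is how _on_take keeps the position up
# to date. The Stream class is not shown in this listing either; the sketch
# below is an assumption covering only the interface used above.

class Stream:

    def __init__(self, characters, on_take=lambda character: None):
        self._characters = iter(characters)
        self._on_take = on_take
        self._look_ahead = None  # the single character of look-ahead

    def look_ahead(self):
        # Peek at the next character without consuming it; None at the end
        if self._look_ahead is None:
            self._look_ahead = next(self._characters, None)
        return self._look_ahead

    def take(self):
        # Consume one character and notify the observer
        character = self.look_ahead()
        self._look_ahead = None
        if character is not None:
            self._on_take(character)
        return character

    def take_while(self, predicate):
        # Consume characters as long as the predicate holds
        while self.look_ahead() is not None and predicate(self.look_ahead()):
            yield self.take()


# Putting it together, iterating over a Lexer yields the token stream, just
# as _verify_tokens does in the tests above:
#
#   symbols = SymbolTable.default()
#   for token in Lexer(symbols, Source(r"\def\foo")):
#       print(token)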