def _tokenize_line(line, line_number, file_name):
  """Tokenizes a single line of input.

  Arguments:
    line: The line of text to tokenize.
    line_number: The line number (used when constructing token objects).
    file_name: The name of a file to use in errors.

  Returns:
    A tuple of:
      A list of token objects or None.
      A possibly-empty list of errors.
  """
  tokens = []
  offset = 0
  while offset < len(line):
    best_candidate = ""
    best_candidate_symbol = None
    # Find the longest match.  Ties go to the first match.  This way, keywords
    # ("struct") are matched as themselves, but words that only happen to start
    # with keywords ("structure") are matched as words.
    #
    # There is never a reason to try to match a literal after a regex that
    # could also match that literal, so check literals first.
    for literal in LITERAL_TOKEN_PATTERNS:
      if line[offset:].startswith(literal) and len(literal) > len(
          best_candidate):
        best_candidate = literal
        # For Emboss, the name of a literal token is just the literal in
        # quotes, so that the grammar can read a little more naturally, e.g.:
        #
        #     expression -> expression "+" expression
        #
        # instead of
        #
        #     expression -> expression Plus expression
        best_candidate_symbol = '"' + literal + '"'
    for pattern in REGEX_TOKEN_PATTERNS:
      match_result = pattern.regex.match(line[offset:])
      if match_result and len(match_result.group(0)) > len(best_candidate):
        best_candidate = match_result.group(0)
        best_candidate_symbol = pattern.symbol
    if not best_candidate:
      return None, [[error.error(
          file_name,
          parser_types.make_location((line_number, offset + 1),
                                     (line_number, offset + 2)),
          "Unrecognized token")]]
    if best_candidate_symbol:
      tokens.append(parser_types.Token(
          best_candidate_symbol, best_candidate,
          parser_types.make_location(
              (line_number, offset + 1),
              (line_number, offset + len(best_candidate) + 1))))
    offset += len(best_candidate)
  return tokens, None
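For reference, a minimal self-contained sketch of the longest-match rule described in the comments above. The pattern tables here (_LITERALS, _REGEXES) are toy stand-ins for LITERAL_TOKEN_PATTERNS and REGEX_TOKEN_PATTERNS, which are not shown in this section; the names and patterns are illustrative assumptions, not the Emboss definitions.

import collections
import re

_Pattern = collections.namedtuple("_Pattern", ["regex", "symbol"])

# Toy stand-ins for LITERAL_TOKEN_PATTERNS / REGEX_TOKEN_PATTERNS.
_LITERALS = ["struct", "+"]
_REGEXES = [_Pattern(re.compile(r"[a-z]+"), "Word")]

def _longest_match(text, offset):
  """Returns (token_text, symbol) for the longest match at text[offset:]."""
  best, best_symbol = "", None
  # Literals are checked first, so a tie goes to the literal.
  for literal in _LITERALS:
    if text[offset:].startswith(literal) and len(literal) > len(best):
      best, best_symbol = literal, '"' + literal + '"'
  for pattern in _REGEXES:
    match = pattern.regex.match(text[offset:])
    if match and len(match.group(0)) > len(best):
      best, best_symbol = match.group(0), pattern.symbol
  return best, best_symbol

# "struct" ties with the Word regex, so the keyword literal wins; "structure"
# is a strictly longer Word match, so it is matched as a word.
assert _longest_match("struct x", 0) == ("struct", '"struct"')
assert _longest_match("structure", 0) == ("structure", "Word")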
def test_prelude_as_file_name(self):
  error_message = error.error("", parser_types.make_location((3, 4), (3, 6)),
                              "Bad thing")
  self.assertEqual("", error_message.source_file)
  self.assertEqual(error.ERROR, error_message.severity)
  self.assertEqual(parser_types.make_location((3, 4), (3, 6)),
                   error_message.location)
  self.assertEqual("Bad thing", error_message.message)
  sourceless_format = error_message.format({})
  sourced_format = error_message.format({"": "\n\nabcdefghijklm"})
  self.assertEqual("[prelude]:3:4: error: Bad thing",
                   "".join([x[1] for x in sourceless_format]))
  self.assertEqual(
      [(error.BOLD, "[prelude]:3:4: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing")],  # Message
      sourceless_format)
  self.assertEqual("[prelude]:3:4: error: Bad thing\n"
                   "abcdefghijklm\n"
                   "   ^^",
                   "".join([x[1] for x in sourced_format]))
  self.assertEqual(
      [(error.BOLD, "[prelude]:3:4: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing\n"),  # Message
       (error.WHITE, "abcdefghijklm\n"),  # Source snippet
       (error.BRIGHT_GREEN, "   ^^")],  # Error column indicator
      sourced_format)
def test_make_location(self):
  self.assertEqual(
      ir_pb2.Location(start=ir_pb2.Position(line=1, column=2),
                      end=ir_pb2.Position(line=3, column=4),
                      is_synthetic=False),
      parser_types.make_location((1, 2), (3, 4)))
  self.assertEqual(
      ir_pb2.Location(start=ir_pb2.Position(line=1, column=2),
                      end=ir_pb2.Position(line=3, column=4),
                      is_synthetic=False),
      parser_types.make_location(ir_pb2.Position(line=1, column=2),
                                 ir_pb2.Position(line=3, column=4)))
def test_multiline_error(self):
  error_message = error.error(
      "foo.emb", parser_types.make_location((3, 4), (3, 6)),
      "Bad thing\nSome explanation\nMore explanation")
  self.assertEqual("foo.emb", error_message.source_file)
  self.assertEqual(error.ERROR, error_message.severity)
  self.assertEqual(parser_types.make_location((3, 4), (3, 6)),
                   error_message.location)
  self.assertEqual("Bad thing\nSome explanation\nMore explanation",
                   error_message.message)
  sourceless_format = error_message.format({})
  sourced_format = error_message.format(
      {"foo.emb": "\n\nabcdefghijklm\nnopqrstuv"})
  self.assertEqual("foo.emb:3:4: error: Bad thing\n"
                   "foo.emb:3:4: note: Some explanation\n"
                   "foo.emb:3:4: note: More explanation",
                   "".join([x[1] for x in sourceless_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:3:4: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing\n"),  # Message
       (error.BOLD, "foo.emb:3:4: "),  # Location, line 2
       (error.BRIGHT_BLACK, "note: "),  # "Note" severity, line 2
       (error.WHITE, "Some explanation\n"),  # Message, line 2
       (error.BOLD, "foo.emb:3:4: "),  # Location, line 3
       (error.BRIGHT_BLACK, "note: "),  # "Note" severity, line 3
       (error.WHITE, "More explanation")],  # Message, line 3
      sourceless_format)
  self.assertEqual("foo.emb:3:4: error: Bad thing\n"
                   "foo.emb:3:4: note: Some explanation\n"
                   "foo.emb:3:4: note: More explanation\n"
                   "abcdefghijklm\n"
                   "   ^^",
                   "".join([x[1] for x in sourced_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:3:4: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing\n"),  # Message
       (error.BOLD, "foo.emb:3:4: "),  # Location, line 2
       (error.BRIGHT_BLACK, "note: "),  # "Note" severity, line 2
       (error.WHITE, "Some explanation\n"),  # Message, line 2
       (error.BOLD, "foo.emb:3:4: "),  # Location, line 3
       (error.BRIGHT_BLACK, "note: "),  # "Note" severity, line 3
       (error.WHITE, "More explanation\n"),  # Message, line 3
       (error.WHITE, "abcdefghijklm\n"),  # Source snippet
       (error.BRIGHT_GREEN, "   ^^")],  # Column indicator
      sourced_format)
def test_str(self):
  self.assertEqual(
      "FOO 'bar' 1:2-3:4",
      str(parser_types.Token("FOO", "bar",
                             parser_types.make_location((1, 2), (3, 4)))))
def test_bad_indent_matches_closed_indent(self):
  tokens, errors = tokenizer.tokenize(" a\nb\n  c\n d", "file")
  self.assertFalse(tokens)
  self.assertEqual(
      [[error.error("file",
                    parser_types.make_location((4, 1), (4, 2)),
                    "Bad indentation")]],
      errors)
def test_bad_string_after_string_with_escaped_backslash_at_end(self):
  tokens, errors = tokenizer.tokenize(r'"\\""', "name")
  self.assertFalse(tokens)
  self.assertEqual(
      [[error.error("name",
                    parser_types.make_location((1, 5), (1, 6)),
                    "Unrecognized token")]],
      errors)
def test_case(self):
  # `string` is supplied by the enclosing test-case generator, one case per
  # unrecognized input.
  tokens, errors = tokenizer.tokenize(string, "name")
  self.assertFalse(tokens)
  self.assertEqual(
      [[error.error("name",
                    parser_types.make_location((1, 1), (1, 2)),
                    "Unrecognized token")]],
      errors)
def parse_module(file_name, file_reader):
  """Parses a module, returning a module-level IR.

  Arguments:
    file_name: The name of the module's source file.
    file_reader: A callable that returns either:
        (file_contents, None) or
        (None, list_of_error_detail_strings)

  Returns:
    (ir, debug_info, errors), where ir is a module-level intermediate
    representation (IR), debug_info is a ModuleDebugInfo containing the
    tokenization, parse tree, and original source text of the module, and
    errors is a list of tokenization or parse errors.  If errors is not an
    empty list, ir will be None.

  Raises:
    FrontEndFailure: An error occurred while reading or parsing the module.
        str(error) will give a human-readable error message.
  """
  source_code, errors = file_reader(file_name)
  if errors:
    location = parser_types.make_location((1, 1), (1, 1))
    return None, None, [
        [error.error(file_name, location, "Unable to read file.")] +
        [error.note(file_name, location, e) for e in errors]
    ]
  return parse_module_text(source_code, file_name)
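A sketch of a file_reader callable that satisfies the (file_contents, None) / (None, list_of_error_detail_strings) contract described in the docstring. The helper name _read_file is hypothetical, not part of the original source:

def _read_file(file_name):
  """Returns (contents, None) on success or (None, [error_string]) on failure."""
  try:
    with open(file_name, "r") as f:
      return f.read(), None
  except IOError as e:
    return None, [str(e)]

# Illustrative call:
#   ir, debug_info, errors = parse_module("example.emb", _read_file)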
def _tokenize(text):
  """"Tokenizes" text by making each character into a token."""
  result = []
  for i in range(len(text)):
    result.append(
        Token(text[i], parser_types.make_location((1, i + 1), (1, i + 2))))
  return result
def test_bad_indent_two_spaces_versus_one_space(self):
  tokens, errors = tokenizer.tokenize("  a\n b", "file")
  self.assertFalse(tokens)
  self.assertEqual(
      [[error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")]],
      errors)
def test_bad_indent_tab_versus_space(self):
  # A bad indent is one that doesn't match a previous unmatched indent.
  tokens, errors = tokenizer.tokenize(" a\n\tb", "file")
  self.assertFalse(tokens)
  self.assertEqual(
      [[error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")]],
      errors)
def test_note(self):
  note_message = error.note("foo.emb",
                            parser_types.make_location((3, 4), (3, 6)),
                            "OK thing")
  self.assertEqual("foo.emb", note_message.source_file)
  self.assertEqual(error.NOTE, note_message.severity)
  self.assertEqual(parser_types.make_location((3, 4), (3, 6)),
                   note_message.location)
  self.assertEqual("OK thing", note_message.message)
  sourced_format = note_message.format({"foo.emb": "\n\nabcdefghijklm"})
  self.assertEqual("foo.emb:3:4: note: OK thing\n"
                   "abcdefghijklm\n"
                   "   ^^",
                   "".join([x[1] for x in sourced_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:3:4: "),  # Location
       (error.BRIGHT_BLACK, "note: "),  # Severity
       (error.WHITE, "OK thing\n"),  # Message
       (error.WHITE, "abcdefghijklm\n"),  # Source snippet
       (error.BRIGHT_GREEN, "   ^^")],  # Column indicator
      sourced_format)
def test_warn(self):
  warning_message = error.warn("foo.emb",
                               parser_types.make_location((3, 4), (3, 6)),
                               "Not good thing")
  self.assertEqual("foo.emb", warning_message.source_file)
  self.assertEqual(error.WARNING, warning_message.severity)
  self.assertEqual(parser_types.make_location((3, 4), (3, 6)),
                   warning_message.location)
  self.assertEqual("Not good thing", warning_message.message)
  sourced_format = warning_message.format({"foo.emb": "\n\nabcdefghijklm"})
  self.assertEqual("foo.emb:3:4: warning: Not good thing\n"
                   "abcdefghijklm\n"
                   "   ^^",
                   "".join([x[1] for x in sourced_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:3:4: "),  # Location
       (error.BRIGHT_MAGENTA, "warning: "),  # Severity
       (error.BOLD, "Not good thing\n"),  # Message
       (error.WHITE, "abcdefghijklm\n"),  # Source snippet
       (error.BRIGHT_GREEN, "   ^^")],  # Column indicator
      sourced_format)
def test_format_errors(self):
  errors = [[error.note("foo.emb",
                        parser_types.make_location((3, 4), (3, 6)),
                        "note")]]
  sources = {"foo.emb": "x\ny\nz bcd\nq\n"}
  self.assertEqual("foo.emb:3:4: note: note\n"
                   "z bcd\n"
                   "   ^^",
                   error.format_errors(errors, sources))
  bold = error.BOLD
  reset = error.RESET
  white = error.WHITE
  bright_black = error.BRIGHT_BLACK
  bright_green = error.BRIGHT_GREEN
  self.assertEqual(bold + "foo.emb:3:4: " + reset +
                   bright_black + "note: " + reset +
                   white + "note\n" + reset +
                   white + "z bcd\n" + reset +
                   bright_green + "   ^^" + reset,
                   error.format_errors(errors, sources, use_color=True))
def test_split_errors(self):
  user_error = [
      error.error("foo.emb", parser_types.make_location((1, 2), (3, 4)),
                  "Bad thing"),
      error.note("foo.emb", parser_types.make_location((3, 4), (5, 6)),
                 "Note: bad thing referent")
  ]
  user_error_2 = [
      error.error("foo.emb", parser_types.make_location((8, 9), (10, 11)),
                  "Bad thing"),
      error.note("foo.emb", parser_types.make_location((10, 11), (12, 13)),
                 "Note: bad thing referent")
  ]
  synthetic_error = [
      error.error("foo.emb", parser_types.make_location((1, 2), (3, 4)),
                  "Bad thing"),
      error.note("foo.emb", parser_types.make_location((3, 4), (5, 6), True),
                 "Note: bad thing referent")
  ]
  synthetic_error_2 = [
      error.error("foo.emb",
                  parser_types.make_location((8, 9), (10, 11), True),
                  "Bad thing"),
      error.note("foo.emb", parser_types.make_location((10, 11), (12, 13)),
                 "Note: bad thing referent")
  ]
  user_errors, synthetic_errors = error.split_errors(
      [user_error, synthetic_error])
  self.assertEqual([user_error], user_errors)
  self.assertEqual([synthetic_error], synthetic_errors)
  user_errors, synthetic_errors = error.split_errors(
      [synthetic_error, user_error])
  self.assertEqual([user_error], user_errors)
  self.assertEqual([synthetic_error], synthetic_errors)
  user_errors, synthetic_errors = error.split_errors(
      [synthetic_error, user_error, synthetic_error_2, user_error_2])
  self.assertEqual([user_error, user_error_2], user_errors)
  self.assertEqual([synthetic_error, synthetic_error_2], synthetic_errors)
def test_synthetic_error(self):
  error_message = error.error(
      "foo.emb", parser_types.make_location((3, 4), (3, 6), True),
      "Bad thing")
  sourceless_format = error_message.format({})
  sourced_format = error_message.format({"foo.emb": "\n\nabcdefghijklm"})
  self.assertEqual("foo.emb:[compiler bug]: error: Bad thing",
                   "".join([x[1] for x in sourceless_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:[compiler bug]: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing")],  # Message
      sourceless_format)
  self.assertEqual("foo.emb:[compiler bug]: error: Bad thing",
                   "".join([x[1] for x in sourced_format]))
  self.assertEqual(
      [(error.BOLD, "foo.emb:[compiler bug]: "),  # Location
       (error.BRIGHT_RED, "error: "),  # Severity
       (error.BOLD, "Bad thing")],  # Message
      sourced_format)
def test_equality(self):
  note_message = error.note("foo.emb",
                            parser_types.make_location((3, 4), (3, 6)),
                            "thing")
  self.assertEqual(
      note_message,
      error.note("foo.emb", parser_types.make_location((3, 4), (3, 6)),
                 "thing"))
  self.assertNotEqual(
      note_message,
      error.warn("foo.emb", parser_types.make_location((3, 4), (3, 6)),
                 "thing"))
  self.assertNotEqual(
      note_message,
      error.note("foo2.emb", parser_types.make_location((3, 4), (3, 6)),
                 "thing"))
  self.assertNotEqual(
      note_message,
      error.note("foo.emb", parser_types.make_location((2, 4), (3, 6)),
                 "thing"))
  self.assertNotEqual(
      note_message,
      error.note("foo.emb", parser_types.make_location((3, 4), (3, 6)),
                 "thing2"))
def test_parse_location(self):
  self.assertEqual(parser_types.make_location((1, 2), (3, 4)),
                   parser_types.parse_location("1:2-3:4"))
  self.assertEqual(parser_types.make_location((1, 2), (3, 4)),
                   parser_types.parse_location(" 1 : 2 - 3 : 4 "))
def test_format_location(self):
  self.assertEqual("1:2-3:4",
                   parser_types.format_location(
                       parser_types.make_location((1, 2), (3, 4))))
def test_make_location_logic_checks(self):
  self.assertRaises(ValueError, parser_types.make_location, (3, 4), (1, 2))
  self.assertRaises(ValueError, parser_types.make_location, (1, 3), (1, 2))
  self.assertTrue(parser_types.make_location((1, 2), (1, 2)))
def _parse(self, tokens):
  """_parse implements the shift-reduce parsing algorithm.

  _parse implements the standard shift-reduce algorithm outlined in ALSU
  pp. 236-237.

  Arguments:
    tokens: the list of token objects to parse.

  Returns:
    A ParseResult.
  """
  # The END_OF_INPUT token is explicitly added to avoid explicit "cursor <
  # len(tokens)" checks.
  tokens = list(tokens) + [Symbol(END_OF_INPUT)]

  # Each element of stack is a parse state and a (possibly partial) parse
  # tree.  The state at the top of the stack encodes which productions are
  # "active" (that is, which ones the parser has seen partial input which
  # matches some prefix of the production, in a place where that production
  # might be valid), and, for each active production, how much of the
  # production has been completed.
  stack = [(0, None)]

  def state():
    return stack[-1][0]

  cursor = 0

  # On each iteration, look at the next symbol and the current state, and
  # perform the corresponding action.
  while True:
    if (state(), tokens[cursor].symbol) not in self.action:
      # Most state/symbol entries would be Errors, so rather than
      # exhaustively adding error entries, we just check here.
      if state() in self.default_errors:
        next_action = Error(self.default_errors[state()])
      else:
        next_action = Error(None)
    else:
      next_action = self.action[state(), tokens[cursor].symbol]

    if isinstance(next_action, Shift):
      # Shift means that there are no "complete" productions on the stack,
      # and so the current token should be shifted onto the stack, with a new
      # state indicating the new set of "active" productions.
      stack.append((next_action.state, tokens[cursor]))
      cursor += 1
    elif isinstance(next_action, Accept):
      # Accept means that parsing is over, successfully.
      assert len(stack) == 2, "Accepted incompletely-reduced input."
      assert tokens[cursor].symbol == END_OF_INPUT, (
          "Accepted parse before end of input.")
      return ParseResult(stack[-1][1], None)
    elif isinstance(next_action, Reduce):
      # Reduce means that there is a complete production on the stack, and
      # that the next symbol implies that the completed production is the
      # correct production.
      #
      # Per ALSU, we would simply pop an element off the state stack for each
      # symbol on the rhs of the production, and then push a new state by
      # looking up the (post-pop) current state and the lhs of the production
      # in GOTO.  The GOTO table, in some sense, is equivalent to shift
      # actions for nonterminal symbols.
      #
      # Here, we attach a new partial parse tree, with the production lhs as
      # the "name" of the tree, and the popped trees as the "children" of the
      # new tree.
      children = [
          item[1] for item in stack[len(stack) - len(next_action.rule.rhs):]
      ]
      # Attach source_location, if known.  The source location will not be
      # known if the reduction consumes no symbols (empty rhs) or if the
      # client did not specify source_locations for tokens.
      #
      # It is necessary to loop in order to handle cases like:
      #
      #     C -> c D
      #     D ->
      #
      # The D child of the C reduction will not have a source location
      # (because it is not produced from any source), so it is necessary to
      # scan backwards through C's children to find the end position.  The
      # opposite is required in the case where initial children have no
      # source.
      #
      # These loops implicitly handle the case where the reduction has no
      # children, setting the source_location to None in that case.
      start_position = None
      end_position = None
      for child in children:
        if (hasattr(child, "source_location") and
            child.source_location is not None):
          start_position = child.source_location.start
          break
      for child in reversed(children):
        if (hasattr(child, "source_location") and
            child.source_location is not None):
          end_position = child.source_location.end
          break
      if start_position is None:
        source_location = None
      else:
        source_location = parser_types.make_location(start_position,
                                                     end_position)
      reduction = Reduction(next_action.rule.lhs, children, next_action.rule,
                            source_location)
      del stack[len(stack) - len(next_action.rule.rhs):]
      stack.append((self.goto[state(), next_action.rule.lhs], reduction))
    elif isinstance(next_action, Error):
      # Error means that the parse is impossible.  For typical grammars and
      # texts, this usually happens within a few tokens after the mistake in
      # the input stream, which is convenient (though imperfect) for error
      # reporting.
      return ParseResult(
          None,
          ParseError(next_action.code, cursor, tokens[cursor], state(),
                     self.expected[state()]))
    else:
      assert False, "Shouldn't be here."
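To make the shift/reduce/accept mechanics concrete, here is a self-contained toy version of the same loop, with a hand-built SLR table for the grammar E -> E '+' 'n' | 'n'. The table, the toy_parse name, and the (lhs, children) tree representation are illustrative assumptions, not the Emboss parser's actual tables or types:

# ACTION maps (state, terminal) to shift/reduce/accept; GOTO maps
# (state, nonterminal) to the state pushed after a reduction.
ACTION = {
    (0, "n"): ("shift", 2),
    (1, "+"): ("shift", 3),
    (1, "$"): ("accept", None),
    (2, "+"): ("reduce", ("E", 1)),  # E -> n
    (2, "$"): ("reduce", ("E", 1)),
    (3, "n"): ("shift", 4),
    (4, "+"): ("reduce", ("E", 3)),  # E -> E + n
    (4, "$"): ("reduce", ("E", 3)),
}
GOTO = {(0, "E"): 1}

def toy_parse(tokens):
  stack = [(0, None)]  # (state, partial parse tree), as in _parse above.
  cursor = 0
  while True:
    kind, arg = ACTION.get((stack[-1][0], tokens[cursor]), ("error", None))
    if kind == "shift":
      stack.append((arg, tokens[cursor]))
      cursor += 1
    elif kind == "reduce":
      lhs, rhs_len = arg
      # Pop one stack entry per rhs symbol; the popped trees become children.
      children = [item[1] for item in stack[len(stack) - rhs_len:]]
      del stack[len(stack) - rhs_len:]
      stack.append((GOTO[stack[-1][0], lhs], (lhs, children)))
    elif kind == "accept":
      return stack[-1][1]
    else:
      raise ValueError("parse error at token %d" % cursor)

# "n + n" parses as E(E(n) + n); "$" plays the role of END_OF_INPUT.
assert toy_parse(["n", "+", "n", "$"]) == ("E", [("E", ["n"]), "+", "n"])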
def tokenize(text, file_name):
  # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
  # matched delimiters ([], (), and {}).
  """Tokenizes its argument.

  Arguments:
    text: The raw text of a .emb file.
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      a list of parser_types.Tokens or None
      a possibly-empty list of errors.
  """
  tokens = []
  indent_stack = [""]
  line_number = 0
  for line in text.splitlines():
    line_number += 1

    # _tokenize_line splits the actual text into tokens.
    line_tokens, errors = _tokenize_line(line, line_number, file_name)
    if errors:
      return None, errors

    # Lines with only whitespace and comments are not used for Indent/Dedent
    # calculation, and do not produce end-of-line tokens.
    for token in line_tokens:
      if token.symbol != "Comment":
        break
    else:
      tokens.extend(line_tokens)
      tokens.append(parser_types.Token(
          '"\\n"', "\n",
          parser_types.make_location((line_number, len(line) + 1),
                                     (line_number, len(line) + 1))))
      continue

    # Leading whitespace is whatever .lstrip() removes.
    leading_whitespace = line[0:len(line) - len(line.lstrip())]
    if leading_whitespace == indent_stack[-1]:
      # If the current leading whitespace is equal to the last leading
      # whitespace, do not emit an Indent or Dedent token.
      pass
    elif leading_whitespace.startswith(indent_stack[-1]):
      # If the current leading whitespace is longer than the last leading
      # whitespace, emit an Indent token.  For the token text, take the new
      # part of the whitespace.
      tokens.append(parser_types.Token(
          "Indent", leading_whitespace[len(indent_stack[-1]):],
          parser_types.make_location(
              (line_number, len(indent_stack[-1]) + 1),
              (line_number, len(leading_whitespace) + 1))))
      indent_stack.append(leading_whitespace)
    else:
      # Otherwise, search for the unclosed indentation level that matches
      # the current indentation level.  Emit a Dedent token for each
      # newly-closed indentation level.
      for i in range(len(indent_stack) - 1, -1, -1):
        if leading_whitespace == indent_stack[i]:
          break
        tokens.append(parser_types.Token(
            "Dedent", "",
            parser_types.make_location(
                (line_number, len(leading_whitespace) + 1),
                (line_number, len(leading_whitespace) + 1))))
        del indent_stack[i]
      else:
        return None, [[error.error(
            file_name,
            parser_types.make_location(
                (line_number, 1), (line_number, len(leading_whitespace) + 1)),
            "Bad indentation")]]

    tokens.extend(line_tokens)

    # Append an end-of-line token (for non-whitespace lines).
    tokens.append(parser_types.Token(
        '"\\n"', "\n",
        parser_types.make_location((line_number, len(line) + 1),
                                   (line_number, len(line) + 1))))
  for i in range(len(indent_stack) - 1):
    tokens.append(parser_types.Token(
        "Dedent", "",
        parser_types.make_location((line_number + 1, 1),
                                   (line_number + 1, 1))))
  return tokens, []
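The indentation bookkeeping above can be isolated into a standalone sketch. indent_events is a hypothetical helper (not part of the original source) that mirrors the Indent/Dedent logic, matching levels on an indent stack without producing real tokens or locations:

def indent_events(lines):
  """Yields ('indent', ws), ('dedent',), or ('bad-indent',) events per line."""
  indent_stack = [""]
  for line in lines:
    ws = line[:len(line) - len(line.lstrip())]
    if ws == indent_stack[-1]:
      continue  # Same level: no event, like the `pass` branch above.
    if ws.startswith(indent_stack[-1]):
      indent_stack.append(ws)  # Deeper: open a new level.
      yield ("indent", ws)
      continue
    # Shallower or mismatched: close levels until one matches.
    while indent_stack and ws != indent_stack[-1]:
      indent_stack.pop()
      yield ("dedent",)
    if not indent_stack:
      yield ("bad-indent",)  # No open level matches, as in "Bad indentation".
      return

# One indent for "  b", one dedent returning to the top level for "c".
assert list(indent_events(["a", "  b", "c"])) == [("indent", "  "),
                                                  ("dedent",)]
# A tab never matches a space-built level, so the last event is a bad indent.
assert list(indent_events([" a", "\tb"]))[-1] == ("bad-indent",)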