def sanity_check_format_result(formatted_text, original_text):
    """Checks that formatted_text is a reformatting of original_text.

    The texts are considered equivalent if they tokenize to the same token
    stream, except that:

    Multiple consecutive newline tokens are equivalent to a single newline
    token.

    Extra newline tokens at the start of the stream should be ignored.

    Whitespace at the start or end of a token should be ignored.  This matters
    for documentation and comment tokens, which may have had trailing whitespace
    in the original text, and for indent tokens, which may contain a different
    number of space and/or tab characters.

    Arguments:
      formatted_text: The output of the formatter, to be checked.
      original_text: The input that was given to the formatter.

    Returns:
      A list of error message strings, or an empty list if the texts are
      equivalent.
    """
    original_tokens, errors = tokenizer.tokenize(original_text, '')
    if errors:
        return ['BUG: original text is not tokenizable: {!r}'.format(errors)]

    formatted_tokens, errors = tokenizer.tokenize(formatted_text, '')
    if errors:
        return ['BUG: formatted text is not tokenizable: {!r}'.format(errors)]

    o_tokens = _collapse_newline_tokens(original_tokens)
    f_tokens = _collapse_newline_tokens(formatted_tokens)

    # Without this check, a formatted stream that is shorter than the original
    # would raise IndexError below, and a formatted stream with extra trailing
    # tokens would incorrectly pass.
    if len(o_tokens) != len(f_tokens):
        return ['BUG: Token stream lengths differ: {} vs {}'.format(
            len(o_tokens), len(f_tokens))]

    for i, (o_token, f_token) in enumerate(zip(o_tokens, f_tokens)):
        if (o_token.symbol != f_token.symbol or
                o_token.text.strip() != f_token.text.strip()):
            return ['BUG: Symbol {} differs: {!r} vs {!r}'.format(
                i, o_token, f_token)]
    return []
def test_ok_parse(self):
    """A minimal valid module should parse with a tree and no error."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt foo\n")
    tokens = tokenizer.tokenize(source, "")[0]
    result = parser.parse_module(tokens)
    self.assertTrue(result.parse_tree)
    self.assertEqual(None, result.error)
def test_bad_indent_matches_closed_indent(self):
    """An indent matching an already-closed indent level is rejected."""
    tokens, errors = tokenizer.tokenize(" a\nb\n c\n d", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((4, 1), (4, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def test_bad_string_after_string_with_escaped_backslash_at_end(self):
    """A stray quote after a string ending in an escaped backslash fails."""
    tokens, errors = tokenizer.tokenize(r'"\\""', "name")
    self.assertFalse(tokens)
    expected = [[
        error.error("name",
                    parser_types.make_location((1, 5), (1, 6)),
                    "Unrecognized token")
    ]]
    self.assertEqual(expected, errors)
def test_bad_indent_two_spaces_versus_one_space(self):
    """A one-space indent does not match an open two-space indent."""
    tokens, errors = tokenizer.tokenize("  a\n b", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def test_case(self):
    """Tokenizing `string` (supplied by the test generator) must fail."""
    # NOTE(review): `string` is not defined in this function — presumably it is
    # bound by a closure or test-generation helper; verify against the caller.
    tokens, errors = tokenizer.tokenize(string, "name")
    self.assertFalse(tokens)
    expected = [[
        error.error("name",
                    parser_types.make_location((1, 1), (1, 2)),
                    "Unrecognized token")
    ]]
    self.assertEqual(expected, errors)
def parse(text):
    """Parses text as an Expression.

    This parses text using the expression subset of the Emboss grammar, and
    returns an ir_pb2.Expression.  The expression only needs to be
    syntactically valid; it will not go through symbol resolution or type
    checking.  This function is not intended to be called on arbitrary input;
    it asserts that the text successfully parses, but does not return errors.

    Arguments:
      text: The text of an Emboss expression, like "4 + 5" or "$max(1, a, b)".

    Returns:
      An ir_pb2.Expression corresponding to the textual form.

    Raises:
      AssertionError if text is not a well-formed Emboss expression, and
      assertions are enabled.
    """
    tokens, errors = tokenizer.tokenize(text, "")
    assert not errors, "{!r}".format(errors)
    # The tokenizer unconditionally appends a newline token, which the
    # expression grammar cannot consume; strip it before parsing.
    result = parser.parse_expression(tokens[:-1])
    assert not result.error, "{!r}".format(result.error)
    return module_ir.build_ir(result.parse_tree)
def test_bad_indent_tab_versus_space(self):
    """A tab indent does not match a previously opened space indent."""
    # A bad indent is one that doesn't match a previous unmatched indent.
    tokens, errors = tokenizer.tokenize(" a\n\tb", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def _normalize_reserved_word_list(reserved_words):
    """Returns words that would be allowed as names if they were not reserved."""
    name_like_symbols = ("SnakeWord", "CamelWord", "ShoutyWord")
    name_like_words = []
    for reserved in reserved_words:
        tokens, errors = tokenizer.tokenize(reserved, "")
        assert tokens and not errors, "Failed to tokenize " + reserved
        # Only words that tokenize as a name-shaped word are interesting;
        # punctuation-like reserved tokens could never collide with a name.
        if tokens[0].symbol in name_like_symbols:
            name_like_words.append(reserved)
    return sorted(name_like_words)
def test_error_reporting_by_example(self):
    """A parse error with a registered example yields its canned message."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt\n")
    result = parser.parse_module(tokenizer.tokenize(source, "")[0])
    self.assertEqual(None, result.parse_tree)
    self.assertEqual("A name is required for a struct field.",
                     result.error.code)
    self.assertEqual('"\\n"', result.error.token.symbol)
    self.assertEqual({'"["', "SnakeWord", '"."', '":"', '"("'},
                     result.error.expected_tokens)
def test_error_reporting_without_example(self):
    """A parse error with no registered example yields no canned message."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt foo +\n")
    result = parser.parse_module(tokenizer.tokenize(source, "")[0])
    self.assertEqual(None, result.parse_tree)
    self.assertEqual(None, result.error.code)
    self.assertEqual('"+"', result.error.token.symbol)
    self.assertEqual({'"("', '"\\n"', '"["', "Documentation", "Comment"},
                     result.error.expected_tokens)
def test_parse_good_wildcard_example(self):
    """An example ending in $ANY parses to a token list ending in ANY_TOKEN."""
    example_text = (
        _EXAMPLE_DIVIDER +  # ======...
        ' \n struct must be followed by ":" \n\n' +  # Second message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct Foo $ERR $ANY")
    errors = parser.parse_error_examples(example_text)
    expected_tokens = tokenizer.tokenize("struct Foo ", "")[0]
    # The $ANY token should come just before the end-of-line token in the
    # parsed result.
    expected_tokens.insert(-1, lr1.ANY_TOKEN)
    self.assertEqual(expected_tokens, errors[0][0])
    self.assertEqual('struct must be followed by ":"', errors[0][2])
def test_parse_good_error_examples(self):
    """Multiple messages and multiple examples per message all parse."""
    example_text = (
        _EXAMPLE_DIVIDER +  # ======...
        "structure names must be Camel" +  # Message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct $ERR FOO" +  # First example.
        _ERROR_DIVIDER +  # ---
        "struct $ERR foo" +  # Second example.
        _EXAMPLE_DIVIDER +  # ======...
        ' \n struct must be followed by ":" \n\n' +  # Second message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct Foo $ERR")  # Example for second message.
    errors = parser.parse_error_examples(example_text)
    # Both examples of the first message carry its text.
    self.assertEqual(tokenizer.tokenize("struct FOO", "")[0], errors[0][0])
    self.assertEqual("structure names must be Camel", errors[0][2])
    self.assertEqual(tokenizer.tokenize("struct foo", "")[0], errors[1][0])
    self.assertEqual("structure names must be Camel", errors[1][2])
    # The second message's single example carries the second text.
    self.assertEqual(tokenizer.tokenize("struct Foo ", "")[0], errors[2][0])
    self.assertEqual('struct must be followed by ":"', errors[2][2])
def main(argv=()):
    """Command-line entry point: formats each input file, returns exit status.

    Arguments:
      argv: Command-line arguments, passed to _parse_command_line.

    Returns:
      0 on success; 1 on invalid flag combinations.  Per-file errors are
      printed and the file is skipped, without changing the exit status.
    """
    flags = _parse_command_line(argv)
    # Formatting multiple files to stdout would concatenate them
    # indistinguishably, so more than one input requires edit-in-place.
    if not flags.edit_in_place and len(flags.input_file) > 1:
        print('Multiple files may only be formatted without --no-edit-in-place.',
              file=sys.stderr)
        return 1
    # Line-type debug output is not valid Emboss source, so it must never be
    # written back over the input file.
    if flags.edit_in_place and flags.debug_show_line_types:
        print('The flag --debug-show-line-types requires --no-edit-in-place.',
              file=sys.stderr)
        return 1
    for file_name in flags.input_file:
        with open(file_name) as f:
            source_code = f.read()
        tokens, errors = tokenizer.tokenize(source_code, file_name)
        if errors:
            # Report tokenization errors and move on to the next file.
            _print_errors(errors, {file_name: source_code}, flags)
            continue
        parse_result = parser.parse_module(tokens)
        if parse_result.error:
            _print_errors(
                [error.make_error_from_parse_error(file_name,
                                                   parse_result.error)],
                {file_name: source_code}, flags)
            continue
        formatted_text = format_emb.format_emboss_parse_tree(
            parse_result.parse_tree,
            format_emb.Config(show_line_types=flags.debug_show_line_types,
                              indent_width=flags.indent))
        # Optionally verify the formatter did not change the token stream;
        # skipped for debug output, which is intentionally not real source.
        if flags.check_result and not flags.debug_show_line_types:
            errors = format_emb.sanity_check_format_result(formatted_text,
                                                           source_code)
            if errors:
                for e in errors:
                    print(e, file=sys.stderr)
                # Do not write a result that failed the sanity check.
                continue
        if flags.edit_in_place:
            with open(file_name, 'w') as f:
                f.write(formatted_text)
        else:
            sys.stdout.write(formatted_text)
    return 0
def parse_error_examples(error_example_text):
    """Parses error examples from error_example_text.

    Arguments:
      error_example_text: The text of an error example file.

    Returns:
      A list of tuples, suitable for passing into generate_parser.

    Raises:
      ParserGenerationError: There is a problem parsing the error examples.
    """
    # Examples are separated by a full line of exactly 80 "=" characters.
    error_examples = error_example_text.split("\n" + "=" * 80 + "\n")
    result = []
    # Everything before the first "======" line is explanatory text: ignore it.
    for error_example in error_examples[1:]:
        # Within an example, a full line of 80 "-" characters separates the
        # human-readable message from the example source snippets.
        message_and_examples = error_example.split("\n" + "-" * 80 + "\n")
        if len(message_and_examples) != 2:
            raise ParserGenerationError(
                "Expected one error message and one example section in:\n" +
                error_example)
        message, example_text = message_and_examples
        # Multiple snippets for the same message are separated by "---" lines.
        examples = example_text.split("\n---\n")
        for example in examples:
            # TODO(bolms): feed a line number into tokenize, so that
            # tokenization failures refer to the correct line within
            # error_example_text.
            tokens, errors = tokenizer.tokenize(example, "")
            if errors:
                raise ParserGenerationError(str(errors))

            # "$ANY" tokenizes as a BadWord; replace each occurrence with the
            # parser's wildcard token.
            for i in range(len(tokens)):
                if tokens[i].symbol == "BadWord" and tokens[i].text == "$ANY":
                    tokens[i] = lr1.ANY_TOKEN

            # "$ERR" marks the error position: the token *after* the marker is
            # recorded as the error token, and the marker itself is removed
            # from the stream.  Only the first marker is honored.
            error_token = None
            for i in range(len(tokens)):
                if tokens[i].symbol == "BadWord" and tokens[i].text == "$ERR":
                    error_token = tokens[i + 1]
                    del tokens[i]
                    break
            else:
                # for/else: no break above means no "$ERR" marker was found.
                raise ParserGenerationError(
                    "No error token marker '$ERR' in:\n" + error_example)

            result.append((tokens, error_token, message.strip(), example))

    return result
def parse_module_text(source_code, file_name): """Parses the text of a module, returning a module-level IR. Arguments: source_code: The text of the module to parse. file_name: The name of the module's source file (will be included in the resulting IR). Returns: A module-level intermediate representation (IR), prior to import and symbol resolution, and a corresponding ModuleDebugInfo, for debugging the parser. Raises: FrontEndFailure: An error occurred while parsing the module. str(error) will give a human-readable error message. """ # This is strictly an optimization to speed up tests, mostly by avoiding the # need to re-parse the prelude for every test .emb. if (source_code, file_name) in _cached_modules: debug_info = _cached_modules[source_code, file_name] ir = ir_pb2.Module() ir.CopyFrom(debug_info.ir) else: debug_info = ModuleDebugInfo(file_name) debug_info.source_code = source_code tokens, errors = tokenizer.tokenize(source_code, file_name) if errors: return _IrDebugInfo(None, debug_info, errors) debug_info.tokens = tokens parse_result = parser.parse_module(tokens) if parse_result.error: return _IrDebugInfo(None, debug_info, [ error.make_error_from_parse_error(file_name, parse_result.error) ]) debug_info.parse_tree = parse_result.parse_tree used_productions = set() ir = module_ir.build_ir(parse_result.parse_tree, used_productions) debug_info.used_productions = used_productions debug_info.ir = ir_pb2.Module() debug_info.ir.CopyFrom(ir) _cached_modules[source_code, file_name] = debug_info ir.source_file_name = file_name return _IrDebugInfo(ir, debug_info, [])