def sanity_check_format_result(formatted_text, original_text):
    """Checks that formatted_text is a reformatting of original_text.

    The texts are considered equivalent if they tokenize to the same token
    stream, except that:

    Multiple consecutive newline tokens are equivalent to a single newline
    token.

    Extra newline tokens at the start of the stream should be ignored.

    Whitespace at the start or end of a token should be ignored.  This matters
    for documentation and comment tokens, which may have had trailing whitespace
    in the original text, and for indent tokens, which may contain a different
    number of space and/or tab characters.

    Arguments:
      formatted_text: The output of the formatter, to be checked.
      original_text: The input that was given to the formatter.

    Returns:
      A list of error message strings, or an empty list if the texts are
      equivalent.
    """
    original_tokens, errors = tokenizer.tokenize(original_text, '')
    if errors:
        return ['BUG: original text is not tokenizable: {!r}'.format(errors)]

    formatted_tokens, errors = tokenizer.tokenize(formatted_text, '')
    if errors:
        return ['BUG: formatted text is not tokenizable: {!r}'.format(errors)]

    o_tokens = _collapse_newline_tokens(original_tokens)
    f_tokens = _collapse_newline_tokens(formatted_tokens)

    # Without this check, a formatted stream that is shorter than the original
    # would raise IndexError below, and a formatted stream with extra trailing
    # tokens would incorrectly pass.
    if len(o_tokens) != len(f_tokens):
        return ['BUG: Token stream lengths differ: {} vs {}'.format(
            len(o_tokens), len(f_tokens))]

    for i, (o_token, f_token) in enumerate(zip(o_tokens, f_tokens)):
        if (o_token.symbol != f_token.symbol or
                o_token.text.strip() != f_token.text.strip()):
            return ['BUG: Symbol {} differs: {!r} vs {!r}'.format(
                i, o_token, f_token)]
    return []
def test_ok_parse(self):
    """A minimal valid module should parse with a tree and no error."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt foo\n")
    tokens = tokenizer.tokenize(source, "")[0]
    result = parser.parse_module(tokens)
    self.assertTrue(result.parse_tree)
    self.assertEqual(None, result.error)
def test_bad_indent_matches_closed_indent(self):
    """An indent matching an already-closed indent level is rejected."""
    tokens, errors = tokenizer.tokenize(" a\nb\n c\n d", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((4, 1), (4, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def test_bad_string_after_string_with_escaped_backslash_at_end(self):
    """A stray quote after a string ending in an escaped backslash fails."""
    tokens, errors = tokenizer.tokenize(r'"\\""', "name")
    self.assertFalse(tokens)
    expected = [[
        error.error("name",
                    parser_types.make_location((1, 5), (1, 6)),
                    "Unrecognized token")
    ]]
    self.assertEqual(expected, errors)
def test_bad_indent_two_spaces_versus_one_space(self):
    """A one-space indent does not match an open two-space indent."""
    tokens, errors = tokenizer.tokenize("  a\n b", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def test_case(self):
    """Tokenizing `string` (supplied by the test generator) must fail."""
    # NOTE(review): `string` is not defined in this function — presumably it is
    # bound by a closure or test-generation helper; verify against the caller.
    tokens, errors = tokenizer.tokenize(string, "name")
    self.assertFalse(tokens)
    expected = [[
        error.error("name",
                    parser_types.make_location((1, 1), (1, 2)),
                    "Unrecognized token")
    ]]
    self.assertEqual(expected, errors)
def parse(text):
    """Parses text as an Expression.

    This parses text using the expression subset of the Emboss grammar, and
    returns an ir_pb2.Expression.  The expression only needs to be
    syntactically valid; it will not go through symbol resolution or type
    checking.  This function is not intended to be called on arbitrary input;
    it asserts that the text successfully parses, but does not return errors.

    Arguments:
      text: The text of an Emboss expression, like "4 + 5" or "$max(1, a, b)".

    Returns:
      An ir_pb2.Expression corresponding to the textual form.

    Raises:
      AssertionError if text is not a well-formed Emboss expression, and
      assertions are enabled.
    """
    tokens, errors = tokenizer.tokenize(text, "")
    assert not errors, "{!r}".format(errors)
    # The tokenizer unconditionally appends a newline token, which the
    # expression grammar cannot consume; strip it before parsing.
    result = parser.parse_expression(tokens[:-1])
    assert not result.error, "{!r}".format(result.error)
    return module_ir.build_ir(result.parse_tree)
def test_bad_indent_tab_versus_space(self):
    """A tab indent does not match a previously opened space indent."""
    # A bad indent is one that doesn't match a previous unmatched indent.
    tokens, errors = tokenizer.tokenize(" a\n\tb", "file")
    self.assertFalse(tokens)
    expected = [[
        error.error("file",
                    parser_types.make_location((2, 1), (2, 2)),
                    "Bad indentation")
    ]]
    self.assertEqual(expected, errors)
def _normalize_reserved_word_list(reserved_words):
    """Returns words that would be allowed as names if they were not reserved."""
    name_like_symbols = ("SnakeWord", "CamelWord", "ShoutyWord")
    name_like_words = []
    for reserved in reserved_words:
        tokens, errors = tokenizer.tokenize(reserved, "")
        assert tokens and not errors, "Failed to tokenize " + reserved
        # Only words that tokenize as a name-shaped word are interesting;
        # punctuation-like reserved tokens could never collide with a name.
        if tokens[0].symbol in name_like_symbols:
            name_like_words.append(reserved)
    return sorted(name_like_words)
def test_error_reporting_by_example(self):
    """A parse error with a registered example yields its canned message."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt\n")
    result = parser.parse_module(tokenizer.tokenize(source, "")[0])
    self.assertEqual(None, result.parse_tree)
    self.assertEqual("A name is required for a struct field.",
                     result.error.code)
    self.assertEqual('"\\n"', result.error.token.symbol)
    self.assertEqual({'"["', "SnakeWord", '"."', '":"', '"("'},
                     result.error.expected_tokens)
def test_error_reporting_without_example(self):
    """A parse error with no registered example yields no canned message."""
    source = ("struct LogFileStatus:\n"
              " 0 [+4] UInt foo +\n")
    result = parser.parse_module(tokenizer.tokenize(source, "")[0])
    self.assertEqual(None, result.parse_tree)
    self.assertEqual(None, result.error.code)
    self.assertEqual('"+"', result.error.token.symbol)
    self.assertEqual({'"("', '"\\n"', '"["', "Documentation", "Comment"},
                     result.error.expected_tokens)
def test_parse_good_wildcard_example(self):
    """An example ending in $ANY parses to a token list ending in ANY_TOKEN."""
    example_text = (
        _EXAMPLE_DIVIDER +  # ======...
        ' \n struct must be followed by ":" \n\n' +  # Second message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct Foo $ERR $ANY")
    errors = parser.parse_error_examples(example_text)
    expected_tokens = tokenizer.tokenize("struct Foo ", "")[0]
    # The $ANY token should come just before the end-of-line token in the
    # parsed result.
    expected_tokens.insert(-1, lr1.ANY_TOKEN)
    self.assertEqual(expected_tokens, errors[0][0])
    self.assertEqual('struct must be followed by ":"', errors[0][2])
def test_parse_good_error_examples(self):
    """Multiple messages and multiple examples per message all parse."""
    example_text = (
        _EXAMPLE_DIVIDER +  # ======...
        "structure names must be Camel" +  # Message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct $ERR FOO" +  # First example.
        _ERROR_DIVIDER +  # ---
        "struct $ERR foo" +  # Second example.
        _EXAMPLE_DIVIDER +  # ======...
        ' \n struct must be followed by ":" \n\n' +  # Second message.
        _MESSAGE_ERROR_DIVIDER +  # ------...
        "struct Foo $ERR")  # Example for second message.
    errors = parser.parse_error_examples(example_text)
    # Both examples of the first message carry its text.
    self.assertEqual(tokenizer.tokenize("struct FOO", "")[0], errors[0][0])
    self.assertEqual("structure names must be Camel", errors[0][2])
    self.assertEqual(tokenizer.tokenize("struct foo", "")[0], errors[1][0])
    self.assertEqual("structure names must be Camel", errors[1][2])
    # The second message's single example carries the second text.
    self.assertEqual(tokenizer.tokenize("struct Foo ", "")[0], errors[2][0])
    self.assertEqual('struct must be followed by ":"', errors[2][2])
def main(argv=()):
    """Command-line entry point: formats each input file, returns exit status.

    Arguments:
      argv: Command-line arguments, passed to _parse_command_line.

    Returns:
      0 on success; 1 on invalid flag combinations.  Per-file errors are
      printed and the file is skipped, without changing the exit status.
    """
    flags = _parse_command_line(argv)
    # Formatting multiple files to stdout would concatenate them
    # indistinguishably, so more than one input requires edit-in-place.
    if not flags.edit_in_place and len(flags.input_file) > 1:
        print('Multiple files may only be formatted without --no-edit-in-place.',
              file=sys.stderr)
        return 1
    # Line-type debug output is not valid Emboss source, so it must never be
    # written back over the input file.
    if flags.edit_in_place and flags.debug_show_line_types:
        print('The flag --debug-show-line-types requires --no-edit-in-place.',
              file=sys.stderr)
        return 1
    for file_name in flags.input_file:
        with open(file_name) as f:
            source_code = f.read()
        tokens, errors = tokenizer.tokenize(source_code, file_name)
        if errors:
            # Report tokenization errors and move on to the next file.
            _print_errors(errors, {file_name: source_code}, flags)
            continue
        parse_result = parser.parse_module(tokens)
        if parse_result.error:
            _print_errors(
                [error.make_error_from_parse_error(file_name,
                                                   parse_result.error)],
                {file_name: source_code}, flags)
            continue
        formatted_text = format_emb.format_emboss_parse_tree(
            parse_result.parse_tree,
            format_emb.Config(show_line_types=flags.debug_show_line_types,
                              indent_width=flags.indent))
        # Optionally verify the formatter did not change the token stream;
        # skipped for debug output, which is intentionally not real source.
        if flags.check_result and not flags.debug_show_line_types:
            errors = format_emb.sanity_check_format_result(formatted_text,
                                                           source_code)
            if errors:
                for e in errors:
                    print(e, file=sys.stderr)
                # Do not write a result that failed the sanity check.
                continue
        if flags.edit_in_place:
            with open(file_name, 'w') as f:
                f.write(formatted_text)
        else:
            sys.stdout.write(formatted_text)
    return 0
def parse_error_examples(error_example_text):
    """Parses error examples from error_example_text.

    Arguments:
      error_example_text: The text of an error example file.

    Returns:
      A list of tuples, suitable for passing into generate_parser.

    Raises:
      ParserGenerationError: There is a problem parsing the error examples.
    """
    # Examples are separated by a full line of exactly 80 "=" characters.
    error_examples = error_example_text.split("\n" + "=" * 80 + "\n")
    result = []
    # Everything before the first "======" line is explanatory text: ignore it.
    for error_example in error_examples[1:]:
        # Within an example, a full line of 80 "-" characters separates the
        # human-readable message from the example source snippets.
        message_and_examples = error_example.split("\n" + "-" * 80 + "\n")
        if len(message_and_examples) != 2:
            raise ParserGenerationError(
                "Expected one error message and one example section in:\n" +
                error_example)
        message, example_text = message_and_examples
        # Multiple snippets for the same message are separated by "---" lines.
        examples = example_text.split("\n---\n")
        for example in examples:
            # TODO(bolms): feed a line number into tokenize, so that
            # tokenization failures refer to the correct line within
            # error_example_text.
            tokens, errors = tokenizer.tokenize(example, "")
            if errors:
                raise ParserGenerationError(str(errors))

            # "$ANY" tokenizes as a BadWord; replace each occurrence with the
            # parser's wildcard token.
            for i in range(len(tokens)):
                if tokens[i].symbol == "BadWord" and tokens[i].text == "$ANY":
                    tokens[i] = lr1.ANY_TOKEN

            # "$ERR" marks the error position: the token *after* the marker is
            # recorded as the error token, and the marker itself is removed
            # from the stream.  Only the first marker is honored.
            error_token = None
            for i in range(len(tokens)):
                if tokens[i].symbol == "BadWord" and tokens[i].text == "$ERR":
                    error_token = tokens[i + 1]
                    del tokens[i]
                    break
            else:
                # for/else: no break above means no "$ERR" marker was found.
                raise ParserGenerationError(
                    "No error token marker '$ERR' in:\n" + error_example)

            result.append((tokens, error_token, message.strip(), example))

    return result
def parse_module_text(source_code, file_name): """Parses the text of a module, returning a module-level IR. Arguments: source_code: The text of the module to parse. file_name: The name of the module's source file (will be included in the resulting IR). Returns: A module-level intermediate representation (IR), prior to import and symbol resolution, and a corresponding ModuleDebugInfo, for debugging the parser. Raises: FrontEndFailure: An error occurred while parsing the module. str(error) will give a human-readable error message. """ # This is strictly an optimization to speed up tests, mostly by avoiding the # need to re-parse the prelude for every test .emb. if (source_code, file_name) in _cached_modules: debug_info = _cached_modules[source_code, file_name] ir = ir_pb2.Module() ir.CopyFrom(debug_info.ir) else: debug_info = ModuleDebugInfo(file_name) debug_info.source_code = source_code tokens, errors = tokenizer.tokenize(source_code, file_name) if errors: return _IrDebugInfo(None, debug_info, errors) debug_info.tokens = tokens parse_result = parser.parse_module(tokens) if parse_result.error: return _IrDebugInfo(None, debug_info, [ error.make_error_from_parse_error(file_name, parse_result.error) ]) debug_info.parse_tree = parse_result.parse_tree used_productions = set() ir = module_ir.build_ir(parse_result.parse_tree, used_productions) debug_info.used_productions = used_productions debug_info.ir = ir_pb2.Module() debug_info.ir.CopyFrom(ir) _cached_modules[source_code, file_name] = debug_info ir.source_file_name = file_name return _IrDebugInfo(ir, debug_info, [])