def parse_source(self, bytessrc, compile_info): """Main entry point for parsing Python source. Everything from decoding the source to tokenizing to building the parse tree is handled here. """ # Detect source encoding. explicit_encoding = False enc = None if compile_info.flags & consts.PyCF_SOURCE_IS_UTF8: enc = 'utf-8' if compile_info.flags & consts.PyCF_IGNORE_COOKIE: textsrc = bytessrc elif bytessrc.startswith("\xEF\xBB\xBF"): bytessrc = bytessrc[3:] enc = 'utf-8' # If an encoding is explicitly given check that it is utf-8. decl_enc = _check_for_encoding(bytessrc) explicit_encoding = (decl_enc is not None) if decl_enc and decl_enc != "utf-8": raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc, filename=compile_info.filename) textsrc = bytessrc else: enc = _normalize_encoding(_check_for_encoding(bytessrc)) explicit_encoding = (enc is not None) if enc is None: enc = 'utf-8' try: textsrc = recode_to_utf8(self.space, bytessrc, enc) except OperationError as e: # if the codec is not found, LookupError is raised. we # check using 'is_w' not to mask potential IndexError or # KeyError space = self.space if e.match(space, space.w_LookupError): raise error.SyntaxError("Unknown encoding: %s" % enc, filename=compile_info.filename) # Transform unicode errors into SyntaxError if e.match(space, space.w_UnicodeDecodeError): e.normalize_exception(space) w_message = space.str(e.get_w_value(space)) raise error.SyntaxError(space.text_w(w_message)) raise if enc is not None: compile_info.encoding = enc if explicit_encoding: compile_info.flags |= consts.PyCF_FOUND_ENCODING return self._parse(textsrc, compile_info)
def recode_to_utf8(space, bytes, encoding):
    """Transcode *bytes* from *encoding* to UTF-8 using app-level codecs.

    Decodes through the interpreter-level object space so arbitrary
    registered codecs work; raises SyntaxError if the codec fails to
    produce a unicode object.
    """
    w_source = space.newbytes(bytes)
    w_decoded = space.call_method(w_source, "decode", space.newtext(encoding))
    if not space.isinstance_w(w_decoded, space.w_unicode):
        raise error.SyntaxError("codec did not return a unicode object")
    w_utf8 = space.call_method(w_decoded, "encode", space.newtext("utf-8"))
    return space.bytes_w(w_utf8)
def parse_source(self, textsrc, compile_info):
    """Main entry point for parsing Python source.

    Everything from decoding the source to tokenizing to building the
    parse tree is handled here.

    Detects the source encoding (BOM, PyCF_SOURCE_IS_UTF8 flag, or
    coding cookie) and recodes *textsrc* to UTF-8 when needed.
    """
    # Detect source encoding.
    enc = None
    if textsrc.startswith("\xEF\xBB\xBF"):
        # UTF-8 BOM: strip it and pin the encoding to utf-8.
        textsrc = textsrc[3:]
        enc = 'utf-8'
        # If an encoding is explicitly given check that it is utf-8.
        decl_enc = _check_for_encoding(textsrc)
        if decl_enc and decl_enc != "utf-8":
            raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                    filename=compile_info.filename)
    elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
        enc = 'utf-8'
        # A coding cookie makes no sense for an already-decoded string.
        if _check_for_encoding(textsrc) is not None:
            raise error.SyntaxError("coding declaration in unicode string",
                                    filename=compile_info.filename)
    else:
        enc = _normalize_encoding(_check_for_encoding(textsrc))
    if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
        try:
            textsrc = recode_to_utf8(self.space, textsrc, enc)
        # Fixed: was the Python-2-only "except OperationError, e" comma
        # syntax; use "as", consistent with the rest of this file.
        except OperationError as e:
            # if the codec is not found, LookupError is raised. we
            # check using 'is_w' not to mask potential IndexError or
            # KeyError
            space = self.space
            if e.match(space, space.w_LookupError):
                raise error.SyntaxError("Unknown encoding: %s" % enc,
                                        filename=compile_info.filename)
            # Transform unicode errors into SyntaxError
            if e.match(space, space.w_UnicodeDecodeError):
                e.normalize_exception(space)
                w_message = space.str(e.get_w_value(space))
                raise error.SyntaxError(space.str_w(w_message))
            raise
    # NOTE(review): this variant ends after the recode step with no
    # return; tokenization presumably follows elsewhere -- confirm.
def parse_source(self, textsrc, compile_info):
    """Main entry point for parsing Python source.

    Everything from decoding the source to tokenizing to building the
    parse tree is handled here: encoding detection, recoding to UTF-8,
    future-import flag handling, tokenization, and parse-tree building.

    Returns the root parse-tree node; raises error.SyntaxError or
    error.IndentationError (with filename filled in) on bad input.
    """
    # Detect source encoding.
    enc = None
    if textsrc.startswith("\xEF\xBB\xBF"):
        # UTF-8 BOM: strip it and pin the encoding to utf-8.
        textsrc = textsrc[3:]
        enc = 'utf-8'
        # If an encoding is explicitly given check that it is utf-8.
        decl_enc = _check_for_encoding(textsrc)
        if decl_enc and decl_enc != "utf-8":
            raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                    filename=compile_info.filename)
    elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
        enc = 'utf-8'
        # A coding cookie makes no sense for an already-decoded string.
        if _check_for_encoding(textsrc) is not None:
            raise error.SyntaxError("coding declaration in unicode string",
                                    filename=compile_info.filename)
    else:
        enc = _normalize_encoding(_check_for_encoding(textsrc))
    if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
        try:
            textsrc = recode_to_utf8(self.space, textsrc, enc)
        except OperationError as e:
            # if the codec is not found, LookupError is raised. we
            # check using 'is_w' not to mask potential IndexError or
            # KeyError
            space = self.space
            if e.match(space, space.w_LookupError):
                # Fixed: message had been translated to Spanish
                # ("Codificación desconocida"); restore the English text
                # used by the sibling parse_source variants in this file.
                raise error.SyntaxError("Unknown encoding: %s" % enc,
                                        filename=compile_info.filename)
            # Transform unicode errors into SyntaxError
            if e.match(space, space.w_UnicodeDecodeError):
                e.normalize_exception(space)
                w_message = space.str(e.get_w_value(space))
                raise error.SyntaxError(space.text_w(w_message))
            raise

    flags = compile_info.flags

    # The tokenizer is very picky about how it wants its input.
    source_lines = textsrc.splitlines(True)
    if source_lines and not source_lines[-1].endswith("\n"):
        source_lines[-1] += '\n'
    if textsrc and textsrc[-1] == "\n":
        flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

    self.prepare(_targets[compile_info.mode])
    tp = 0
    try:
        try:
            # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
            # which is expected to work independently of them. It's
            # certainly the case for all futures in Python <= 2.7.
            tokens = pytokenizer.generate_tokens(source_lines, flags)

            newflags, last_future_import = (
                future.add_future_flags(self.future_flags, tokens))
            compile_info.last_future_import = last_future_import
            compile_info.flags |= newflags

            # Select the grammar depending on print-function semantics.
            if compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION:
                self.grammar = pygram.python_grammar_no_print
            else:
                self.grammar = pygram.python_grammar

            for tp, value, lineno, column, line in tokens:
                if self.add_token(tp, value, lineno, column, line):
                    break
        except error.TokenError as e:
            e.filename = compile_info.filename
            raise
        except error.TokenIndentationError as e:
            e.filename = compile_info.filename
            raise
        except parser.ParseError as e:
            # Catch parse errors, pretty them up and reraise them as a
            # SyntaxError.
            # Fixed: the three messages below had been translated to
            # Spanish; restore the canonical English parser messages.
            new_err = error.IndentationError
            if tp == pygram.tokens.INDENT:
                msg = "unexpected indent"
            elif e.expected == pygram.tokens.INDENT:
                msg = "expected an indented block"
            else:
                new_err = error.SyntaxError
                msg = "invalid syntax"
            raise new_err(msg, e.lineno, e.column, e.line,
                          compile_info.filename)
        else:
            tree = self.root
    finally:
        # Avoid hanging onto the tree.
        self.root = None
    if enc is not None:
        compile_info.encoding = enc
    return tree