def FileEncoding(filename):
    """Return the file's encoding.

    Arguments:
      filename: (unicode) The name of the file.

    Returns:
      The encoding name (e.g. 'utf-8') determined from the PEP 263 coding
      cookie or BOM; defaults to 'utf-8'.

    Raises:
      IOError: if the file cannot be opened or read.
    """
    # The original wrapped this in `try: ... except IOError: raise`, which is
    # a no-op — the exception propagates either way — so the handler is gone.
    with open(filename, 'rb') as fd:
        return tokenize.detect_encoding(fd.readline)[0]
def ReadFile(filename, logger=None):
    """Read the contents of the file.

    An optional logger can be specified to emit messages to your favorite
    logging stream. If specified, then no exception is raised. This is
    external so that it can be used by third-party applications.

    Arguments:
      filename: (unicode) The name of the file.
      logger: (function) A function or lambda that takes a string and emits it.

    Returns:
      The contents of filename.

    Raises:
      IOError: raised if there was an error reading the file.
    """
    def _report(err):
        # Forward the error to the caller-supplied logger, if any.
        if logger:
            logger(err)

    try:
        with open(filename, 'rb') as f:
            encoding = tokenize.detect_encoding(f.readline)[0]
    except IOError as err:
        _report(err)
        raise

    try:
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding) as f:
            return f.read(), encoding
    except IOError as err:
        _report(err)
        raise
def IsFortranOrHeaderFile(filename, headers_too=True):
    """Return True if filename is a Fortran (or, optionally, header) file.

    The decision is made purely on the file extension.

    Arguments:
      filename: (unicode) The name of the file.
      headers_too: (bool) Also accept '.h' files. TODO: This can be
        dangerous — esp. when it's a C header.

    Returns:
      True if the extension matches, False otherwise.
    """
    ext = os.path.splitext(filename)[1]
    if headers_too:
        if ext in ('.F', '.F90', '.f', '.f90', '.h'):
            return True
    elif ext in ('.F', '.F90', '.f', '.f90'):
        return True
    # BUG FIX: the previous version went on to detect the file's encoding and
    # read its first line, but never inspected that line and unconditionally
    # returned False afterwards — that dead code has been removed; the net
    # behavior (False for any non-matching extension) is unchanged.
    return False
def IsPythonFile(filename):
    """Return True if filename is a Python file."""
    if os.path.splitext(filename)[1] == '.py':
        return True

    try:
        with open(filename, 'rb') as fd:
            encoding = tokenize.detect_encoding(fd.readline)[0]

        # Check for correctness of encoding.
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding) as fd:
            fd.read()
    except UnicodeDecodeError:
        encoding = 'latin-1'
    except (IOError, SyntaxError):
        # If we fail to detect encoding (or the encoding cookie is incorrect -
        # which will make detect_encoding raise SyntaxError), assume it's not a
        # Python file.
        return False

    try:
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding) as fd:
            first_line = fd.readlines()[0]
    except (IOError, IndexError):
        return False

    # BUG FIX: the docstring promises a boolean, but re.match returns a Match
    # object or None; normalize so every return path yields a bool.
    return re.match(r'^#!.*\bpython[23]?\b', first_line) is not None
def IsPythonFile(filename):
    """Return True if filename is a Python file."""
    if os.path.splitext(filename)[1] == '.py':
        return True

    # No '.py' extension: sniff the declared encoding, then look for a
    # python shebang on the first line.
    try:
        with open(filename, 'rb') as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        # Verify the detected encoding actually decodes the whole file.
        with py3compat.open_with_encoding(filename, encoding=encoding) as fp:
            fp.read()
    except UnicodeDecodeError:
        # The cookie/BOM lied; latin-1 can decode any byte sequence.
        encoding = 'latin-1'
    except (IOError, SyntaxError):
        # Unreadable file, or a malformed encoding cookie (detect_encoding
        # raises SyntaxError for those): treat as not-Python.
        return False

    try:
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding) as fp:
            first_line = fp.readlines()[0]
    except (IOError, IndexError):
        return False
    return re.match(r'^#!.*\bpython[23]?\b', first_line)
def IsPythonFile(filename):
    """Return True if filename is a Python file."""
    if os.path.splitext(filename)[1] == '.py':
        return True

    try:
        with open(filename, 'rb') as fd:
            encoding = tokenize.detect_encoding(fd.readline)[0]

        # Check for correctness of encoding.
        with py3compat.open_with_encoding(filename, encoding=encoding) as fd:
            fd.read()
    except UnicodeDecodeError:
        encoding = 'latin-1'
    except (IOError, SyntaxError):
        # BUG FIX: detect_encoding raises SyntaxError when the encoding cookie
        # is malformed; previously that propagated to the caller (the sibling
        # IsPythonFile variants catch it). Treat it like an unreadable file.
        return False

    try:
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding) as fd:
            first_line = fd.readlines()[0]
    except (IOError, IndexError):
        return False
    return re.match(r'^#!.*\bpython[23]?\b', first_line)
def _detect_encoding(readline): """Return file encoding.""" try: from lib2to3.pgen2 import tokenize as lib2to3_tokenize encoding = lib2to3_tokenize.detect_encoding(readline)[0] return encoding except (LookupError, SyntaxError, UnicodeDecodeError): return 'latin-1'
def openpy(filename):
    """Open a Python source file, decoding it with its declared encoding.

    Returns a TextIOWrapper positioned at the start of the file, with its
    ``mode`` attribute set to 'r' so it behaves like a text-mode file object.
    """
    from lib2to3.pgen2.tokenize import detect_encoding
    import io

    # The following is adapted from tokenize.py in Python 3.2,
    # Copyright (c) 2001-2014 Python Software Foundation; All Rights Reserved
    buffer = io.open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = io.TextIOWrapper(buffer, encoding, line_buffering=True)
    except BaseException:
        # BUG FIX: previously a failure in detect_encoding (e.g. a bad coding
        # cookie) leaked the open binary file handle.
        buffer.close()
        raise
    text.mode = 'r'
    return text
def test_all_project_files(self):
    """Round-trip every project file through the parser (idempotency check).

    For each file: detect its encoding, parse it, re-serialize the tree, and
    fail if the output differs from the input.
    """
    for filepath in support.all_project_files():
        with open(filepath, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertTrue(encoding is not None,
                        "can't detect encoding for %s" % filepath)
        with open(filepath, "r") as fp:
            source = fp.read()
        # Python 2 idiom: text-mode read yields a byte string, decoded
        # explicitly with the detected encoding.
        source = source.decode(encoding)
        tree = driver.parse_string(source)
        new = unicode(tree)
        if diff(filepath, new, encoding):
            self.fail("Idempotency failed: %s" % filepath)
def get_module_import_alias(import_name, text):
    """Return the alias under which `import_name` is imported in `text`.

    Arguments:
      import_name: the module name to look for (e.g. 'numpy').
      text: the source code to scan.

    Returns:
      The alias as the AST records it (e.g. 'np' for 'import numpy as np';
      None for a plain un-aliased import), or `import_name` itself when the
      text cannot be parsed or no matching import is found.
    """
    try:
        text = text.encode(detect_encoding(BytesIO(text.encode()).readline)[0])
    except UnicodeEncodeError:
        # Script contains unicode symbols. Cannot run detect_encoding as it
        # requires ascii; fall back to utf-8.
        text = text.encode('utf-8')

    try:
        # BUG FIX: parse once and reuse the tree (the previous version parsed
        # the text twice — once to validate, once again to walk).
        tree = ast.parse(text)
    except SyntaxError:
        # Script contains syntax errors so cannot parse text.
        return import_name

    for node in ast.walk(tree):
        if isinstance(node, ast.alias) and node.name == import_name:
            return node.asname
    return import_name
def detect_encoding(filename: str) -> str:
    """Return the file's encoding, falling back to 'latin-1' on failure."""
    from lib2to3.pgen2 import tokenize as lib2to3_tokenize

    read_mode = "rb"
    try:
        with open(filename, mode=read_mode) as source_file:
            encoding: str = lib2to3_tokenize.detect_encoding(source_file.readline)[0]  # type: ignore

        # Round-trip the file once to confirm the detected encoding
        # really decodes it.
        with open_with_encoding(filename, encoding, mode=read_mode) as source_file:
            source_file.read()
        return encoding
    except (SyntaxError, LookupError, UnicodeDecodeError):
        # Bad cookie, unknown codec, or undecodable bytes: latin-1 decodes
        # any byte sequence.
        return "latin-1"
def detect_encoding(filename):
    """Return the file's encoding, or 'latin-1' when detection fails."""
    try:
        with open(filename, 'rb') as source_file:
            from lib2to3.pgen2 import tokenize as lib2to3_tokenize
            encoding = lib2to3_tokenize.detect_encoding(source_file.readline)[0]

        # Confirm the detected encoding by decoding the whole file once.
        with open_with_encoding(filename, encoding) as source_file:
            source_file.read()
        return encoding
    except (SyntaxError, LookupError, UnicodeDecodeError):
        # Bad cookie, unknown codec, or undecodable bytes.
        return 'latin-1'
def open_with_encoding_check(filename):  # type: ignore
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    fp = io.open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(fp.readline)
        fp.seek(0)
        text = io.TextIOWrapper(fp, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except BaseException:
        # BUG FIX: was a bare `except:`. Close the raw handle on *any*
        # failure (including KeyboardInterrupt) before re-raising;
        # `except BaseException:` keeps that behavior but is explicit and
        # lint-clean.
        fp.close()
        raise
def test_all_project_files(self):
    """Each project file must survive a parse/serialize round trip."""
    for path in support.all_project_files():
        with open(path, "rb") as stream:
            encoding = tokenize.detect_encoding(stream.readline)[0]
            stream.seek(0)
            source = stream.read()
        if encoding:
            source = source.decode(encoding)
        rendered = str(driver.parse_string(source))
        if encoding:
            rendered = rendered.encode(encoding)
        if diff(path, rendered):
            self.fail("Idempotency failed: %s" % path)
def test_all_project_files(self):
    """Round-trip every project file, printing progress as we go."""
    for path in support.all_project_files():
        print("Parsing %s..." % path)
        with open(path, "rb") as stream:
            encoding = tokenize.detect_encoding(stream.readline)[0]
            stream.seek(0)
            contents = stream.read()
        if encoding:
            contents = contents.decode(encoding)
        serialized = str(driver.parse_string(contents))
        if encoding:
            serialized = serialized.encode(encoding)
        if diff(path, serialized):
            self.fail("Idempotency failed: %s" % path)
def test_all_project_files(self):
    """Idempotency check: parse and re-serialize every project file.

    Skipped on Windows, where newline handling differs.
    """
    if sys.platform.startswith("win"):
        # XXX something with newlines goes wrong on Windows.
        return
    for filepath in support.all_project_files():
        with open(filepath, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertTrue(encoding is not None,
                        "can't detect encoding for %s" % filepath)
        with io.open(filepath, "r", encoding=encoding) as fp:
            source = fp.read()
        tree = driver.parse_string(source)
        # `unicode(...)` marks this as Python 2 code.
        new = unicode(tree)
        if diff(filepath, new, encoding):
            self.fail("Idempotency failed: %s" % filepath)
def _read_python_source(self, filename):
    """Do our best to decode a Python source file correctly.

    Returns a (source, encoding) pair, or (None, None) when the file cannot
    be opened (the error is logged rather than raised).
    """
    try:
        raw = open(filename, "rb")
    except OSError as err:
        self.log_error("Can't open %s: %s", filename, err)
        return None, None
    # The with-statement closes the raw handle whether or not
    # detect_encoding succeeds (the original used try/finally).
    with raw:
        encoding = tokenize.detect_encoding(raw.readline)[0]
    with _open_with_encoding(filename, "r", encoding=encoding) as decoded:
        return _from_system_newlines(decoded.read()), encoding
def test_all_project_files(self):
    """Idempotency check over all project files (Python 2 flavor).

    Skipped on Windows, where newline handling differs.
    """
    if sys.platform.startswith("win"):
        # XXX something with newlines goes wrong on Windows.
        return
    for filepath in support.all_project_files():
        with open(filepath, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertIsNotNone(encoding,
                             "can't detect encoding for %s" % filepath)
        with open(filepath, "r") as fp:
            source = fp.read()
        # Python 2: text-mode read yields bytes; decode explicitly.
        source = source.decode(encoding)
        tree = driver.parse_string(source)
        new = unicode(tree)
        if diff(filepath, new, encoding):
            self.fail("Idempotency failed: %s" % filepath)
def test_all_project_files(self):
    """Parse each project file and verify str(tree) round-trips it."""
    for path in support.all_project_files():
        with open(path, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertTrue(encoding is not None,
                        "can't detect encoding for %s" % path)
        with open(path, "r", encoding=encoding) as fp:
            source = fp.read()
        try:
            tree = driver.parse_string(source)
        except ParseError as err:
            # Unparseable files are reported and skipped.
            print('ParseError on file', path, err)
            continue
        if diff(path, str(tree)):
            self.fail("Idempotency failed: %s" % path)
def test_all_project_files(self):
    """Idempotency check that reports the actual diff on failure."""
    for filepath in support.all_project_files():
        with open(filepath, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertIsNotNone(encoding,
                             "can't detect encoding for %s" % filepath)
        with open(filepath, "r") as fp:
            source = fp.read()
        # Python 2: decode the byte string with the detected encoding.
        source = source.decode(encoding)
        tree = driver.parse_string(source)
        new = unicode(tree)
        diffResult = diff(filepath, new, encoding)
        if diffResult:
            self.fail(
                "Idempotency failed: {} using {} encoding\n{}".format(
                    filepath, encoding, diffResult))
def test_all_project_files(self):
    """Every project file must serialize back to its original text."""
    for path in support.all_project_files():
        with open(path, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertIsNotNone(encoding, "can't detect encoding for %s" % path)
        with open(path, "r", encoding=encoding) as fp:
            text = fp.read()
        try:
            tree = driver.parse_string(text)
        except ParseError as err:
            # Unparseable files are reported (when verbose) and skipped.
            if verbose > 0:
                warnings.warn("ParseError on file %s (%s)" % (path, err))
            continue
        if diff(path, str(tree)):
            self.fail("Idempotency failed: %s" % path)
def parse_string(self, code_str):
    """Parse a program string and remove unwanted outer levels in AST."""
    # see lib2to3.tests.support.parse_string -- but we don't do the dedent
    # (support.reformat)
    if not isinstance(code_str, str):
        # Byte input: decode using the declared source encoding first.
        encoding, _ = tokenize2to3.detect_encoding(io.BytesIO(code_str).readline)
        code_str = str(code_str, encoding)
    features = refactor._detect_future_features(code_str)  # pylint: disable=protected-access
    driver_key = ("no_print_statement" if "print_function" in features
                  else "print_statement")
    code_ast = self._drivers[driver_key].parse_string(code_str + "\n\n",
                                                      debug=False)
    if code_ast:
        code_ast.parent = None
    return code_ast
def test_all_project_files(self):
    """Check parse/serialize idempotency for each project file."""
    for path in support.all_project_files():
        with open(path, "rb") as src:
            encoding = tokenize.detect_encoding(src.readline)[0]
        self.assertIsNotNone(encoding, "can't detect encoding for %s" % path)
        with open(path, "r", encoding=encoding) as src:
            source_text = src.read()
        try:
            tree = driver.parse_string(source_text)
        except ParseError as err:
            # Unparseable files are reported (when verbose) and skipped.
            if verbose > 0:
                warnings.warn('ParseError on file %s (%s)' % (path, err))
            continue
        if diff(path, str(tree)):
            self.fail("Idempotency failed: %s" % path)
def test_all_project_files(self):
    """Round-trip each file, retrying with the no-print-statement grammar."""
    for path in support.all_project_files():
        with open(path, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertIsNotNone(encoding, "can't detect encoding for %s" % path)
        with open(path, "r", encoding=encoding) as fp:
            source = fp.read()
        try:
            tree = driver.parse_string(source)
        except ParseError:
            # Fall back to the grammar without the print statement.
            try:
                tree = driver_no_print_statement.parse_string(source)
            except ParseError as err:
                self.fail('ParseError on file %s (%s)' % (path, err))
        rendered = str(tree)
        if rendered != source:
            print(diff_texts(source, rendered, path))
            self.fail("Idempotency failed: %s" % path)
def test_all_project_files(self):
    """Idempotency check (Python 2), with an IronPython bug escape hatch.

    Skipped entirely on Windows, where newline handling differs.
    """
    if sys.platform.startswith("win"):
        # XXX something with newlines goes wrong on Windows.
        return
    for filepath in support.all_project_files():
        with open(filepath, "rb") as fp:
            encoding = tokenize.detect_encoding(fp.readline)[0]
        self.assertTrue(encoding is not None,
                        "can't detect encoding for %s" % filepath)
        with open(filepath, "r") as fp:
            source = fp.read()
        # Python 2: decode the byte string with the detected encoding.
        source = source.decode(encoding)
        tree = driver.parse_string(source)
        # Imported inside the loop in the original; kept as-is.
        from test import test_support
        if test_support.due_to_ironpython_bug(
                "http://ironpython.codeplex.com/workitem/28171"):
            continue
        new = unicode(tree)
        if diff(filepath, new, encoding):
            self.fail("Idempotency failed: %s" % filepath)
def _get_imported_from_future(code_str): """ Parse the given code and return a list of names that are imported from __future__. :param code_str: The code to parse :return list: List of names that are imported from __future__ """ future_imports = [] try: code_str = code_str.encode( detect_encoding(BytesIO(code_str.encode()).readline)[0]) except UnicodeEncodeError: # Script contains unicode symbol. Cannot run detect_encoding as it requires ascii. code_str = code_str.encode('utf-8') for node in ast.walk(ast.parse(code_str)): if isinstance(node, ast.ImportFrom): if node.module == '__future__': future_imports.extend( [import_alias.name for import_alias in node.names]) break return future_imports
def ReadFile(filename, logger=None):
    """Read the contents of the file.

    An optional logger can be specified to emit messages to your favorite
    logging stream. If specified, then no exception is raised. This is
    external so that it can be used by third-party applications.

    Arguments:
      filename: (unicode) The name of the file.
      logger: (function) A function or lambda that takes a string and emits it.

    Returns:
      The contents of filename.

    Raises:
      IOError: raised if there was an error reading the file.
    """
    def _log(err):
        # Hand the error to the caller's logger, when one was supplied.
        if logger:
            logger(err)

    try:
        with open(filename, 'rb') as f:
            encoding = tokenize.detect_encoding(f.readline)[0]
    except IOError as err:
        _log(err)
        raise

    try:
        # newline='' preserves the file's own line endings on read.
        with py3compat.open_with_encoding(filename, mode='r',
                                          encoding=encoding, newline='') as f:
            lines = f.readlines()
        line_ending = file_resources.LineEnding(lines)
        source = '\n'.join(line.rstrip('\r\n') for line in lines) + '\n'
        return source, line_ending, encoding
    except IOError as err:  # pragma: no cover
        _log(err)
        raise
def detect_encoding(filename):
    """Return file encoding, with sensible fallbacks when detection fails."""
    try:
        fobj = open(filename, 'rb')
    except (IOError, OSError):
        # A missing/unreadable file gets the same answer detect_encoding
        # gives for an empty file: utf-8.
        return 'utf-8'
    try:
        with fobj:
            from lib2to3.pgen2 import tokenize as lib2to3_tokenize
            encoding = lib2to3_tokenize.detect_encoding(fobj.readline)[0]
        # Prove the encoding by decoding the entire file once.
        with open_with_encoding(filename, encoding) as checked:
            checked.read()
        return encoding
    except (SyntaxError, LookupError, UnicodeDecodeError):
        # Bad cookie, unknown codec, or undecodable bytes.
        return 'latin-1'
def ReadFile(filename, logger=None):
    """Read the contents of the file.

    An optional logger can be specified to emit messages to your favorite
    logging stream. If specified, then no exception is raised. This is
    external so that it can be used by third-party applications.

    Arguments:
      filename: (unicode) The name of the file.
      logger: (function) A function or lambda that takes a string and emits it.

    Returns:
      The contents of filename.

    Raises:
      IOError: raised if there was an error reading the file.
    """
    try:
        with open(filename, 'rb') as fd:
            encoding = tokenize.detect_encoding(fd.readline)[0]
    except IOError as err:
        if logger:
            logger(err)
        raise

    try:
        # Preserves line endings.
        with py3compat.open_with_encoding(
                filename, mode='r', encoding=encoding, newline='') as fd:
            lines = fd.readlines()

        # Normalize the source to '\n' endings, remembering the file's
        # dominant line ending so callers can restore it on write.
        line_ending = file_resources.LineEnding(lines)
        source = '\n'.join(line.rstrip('\r\n') for line in lines) + '\n'
        return source, line_ending, encoding
    except IOError as err:  # pragma: no cover
        if logger:
            logger(err)
        raise
def read_file_using_source_encoding(filename):
    """Read a text file, decoding it with its PEP 263 source encoding."""
    with open(filename, 'rb') as raw:
        source_encoding = tokenize.detect_encoding(raw.readline)[0]
    # Re-open in text mode now that the encoding is known.
    with io.open(filename, 'r', encoding=source_encoding) as decoded:
        return decoded.read()
def decode_string_using_source_encoding(b):
    """Decode the byte string `b` using the encoding it declares (PEP 263)."""
    source_encoding = tokenize.detect_encoding(io.BytesIO(b).readline)[0]
    return b.decode(source_encoding)
def FileEncoding(filename):
    """Return the file's encoding."""
    # detect_encoding reads at most two lines to find a BOM or coding cookie.
    with open(filename, 'rb') as source:
        return tokenize.detect_encoding(source.readline)[0]
def detect_encoding(pyFile):
    """Return the encoding of the given Python source file.

    Arguments:
      pyFile: path to the Python file.

    Returns:
      The encoding name detected from the PEP 263 cookie/BOM
      ('utf-8' by default).

    Raises:
      IOError: if the file cannot be opened.
    """
    from lib2to3.pgen2 import tokenize
    # BUG FIX: the file handle was previously never closed (resource leak);
    # a `with` block guarantees it is.
    with open(pyFile, 'rb') as f:
        return tokenize.detect_encoding(f.readline)[0]
def make_file_from_contents(path: str, contents_bytes: bytes) -> File:
    """Wrapper for File constructor.

    Computes the line offsets and creates a `File` object from
    `contents_bytes`. (`path` and `encoding` are passed through to the `File`
    object.)

    The function builds three lookup tables by decoding the bytes one at a
    time with an incremental decoder: char offset -> byte offset, its inverse,
    and line number -> char offset.
    """
    # pylint: disable=too-many-locals
    with io.BytesIO(contents_bytes) as src_f:
        try:
            encoding, _ = tokenize.detect_encoding(src_f.readline)  # type: ignore
        except LookupError as exc:
            # TODO: first arg of UnicodeDecodError is encoding, but we
            # don't know that, so this is an inappropriate error
            # to raise.
            raise UnicodeDecodeError('???', contents_bytes, 0, 1, str(exc))
    if encoding == 'utf8-sig':
        # TODO: see https://bugs.python.org/issue39155
        encoding = 'utf-8-sig'
    decoder = codecs.getincrementaldecoder(encoding)()
    # Maps the index of each decoded character to the byte offset where its
    # encoded form starts.
    chr_to_byte_offset: Dict[int, int] = {}
    chr_offset = 0
    last_byte_offset = 0
    contents_list = []
    for byte_offset, by in enumerate(contents_bytes):
        # TODO: benchmark other methods of converting an int to a byte:
        #   by.to_bytes(1, sys.byteorder, signed=False))
        #   struct.unpack('1c', by))[0]
        #   chr(by).encode('latin1')
        # (probably these are all dwarfed by the time used
        # to process the AST)
        # Feed one byte at a time; the decoder emits a character only once a
        # complete multi-byte sequence has been consumed.
        ch = decoder.decode(bytes([by]))  # Can raise UnicodeDecodeError
        if ch:
            contents_list.append(ch)
            assert chr_offset not in chr_to_byte_offset
            chr_to_byte_offset[chr_offset] = last_byte_offset
            chr_offset += 1
            last_byte_offset = byte_offset + 1
    final_by = decoder.decode(b'', True)  # flush
    assert final_by == '', final_by
    contents_str = ''.join(contents_list)
    # Ast uses [start,end), so need to also have the last+1 offset:
    assert chr_offset not in chr_to_byte_offset
    chr_to_byte_offset[chr_offset] = last_byte_offset
    # Line numbers are 1-based; line 1 starts at char offset 0.
    lineno_to_chr_offset = {1: 0}
    lineno = 1
    for offset, char in enumerate(contents_str):
        # TODO: make this work with Windows '\r\n', Mac '\r'
        # e.g., use contents_str.splitlines(keepends=True)
        # (see code in ast_color.ColorFile._color_whitespace).
        if char == '\n':
            lineno += 1
            lineno_to_chr_offset[lineno] = offset + 1
    byte_to_chr_offset = {v: k for k, v in chr_to_byte_offset.items()}
    assert len(byte_to_chr_offset) == len(chr_to_byte_offset)  # no dup k,v / v,k
    return File(path=path,
                contents_bytes=contents_bytes,
                contents_str=contents_str,
                encoding=encoding,
                lineno_to_chr_offset=lineno_to_chr_offset,
                chr_to_byte_offset=chr_to_byte_offset,
                byte_to_chr_offset=byte_to_chr_offset,
                chr_offsets_for_linenos=sorted(
                    (offset, lineno)
                    for lineno, offset in lineno_to_chr_offset.items()),
                numlines=lineno - 1)
def read_file_using_source_encoding(filename):
    """Read `filename` as text, honoring its declared source encoding."""
    with open(filename, 'rb') as byte_stream:
        encoding = tokenize.detect_encoding(byte_stream.readline)[0]
    # Re-open in text mode now that the encoding is known.
    with io.open(filename, 'r', encoding=encoding) as text_stream:
        return text_stream.read()