def detect_encoding(file_path):
    """Guess the text encoding of the file at *file_path*.

    The file is read in binary mode and fed line by line to a
    ``UniversalDetector``; reading stops as soon as the detector reports
    that it is done.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under ``'encoding'`` in the detector's result
        (may be ``None`` when the detector could not make a guess).
    """
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, 'rb') as file:
        # Iterate the handle lazily: readlines() would pull the whole file
        # into memory first, which defeats the early exit below.
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    # close() must run before the result is read so the guess is finalised.
    detector.close()
    return detector.result['encoding']
def __detectEncoding_file(self, filename):
    """Guess the text encoding of *filename*.

    The file is read in binary mode and fed line by line to a
    ``UniversalDetector``; reading stops as soon as the detector reports
    that it is done.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The value stored under ``'encoding'`` in the detector's result
        (may be ``None`` when the detector could not make a guess).
    """
    detector = UniversalDetector()
    detector.reset()
    with open(filename, 'rb') as fReader:
        # Stream the file instead of materialising every line up front, so
        # the early break actually avoids reading the rest of the file.
        for line in fReader:
            detector.feed(line)
            if detector.done:
                break
    # close() must run before the result is read so the guess is finalised.
    detector.close()
    return detector.result['encoding']
def __detectEncoding(self):
    """Guess the text encoding of this object's source file.

    The file location is ``self.sourcePath`` joined with
    ``self.fileNameWithExt``. The file is read in binary mode and fed line
    by line to a ``UniversalDetector``; reading stops as soon as the
    detector reports that it is done.

    Returns:
        The value stored under ``'encoding'`` in the detector's result
        (may be ``None`` when the detector could not make a guess).
    """
    source_file = os.path.join(self.sourcePath, self.fileNameWithExt)
    detector = UniversalDetector()
    detector.reset()
    with open(source_file, 'rb') as fReader:
        # Stream the file instead of materialising every line up front, so
        # the early break actually avoids reading the rest of the file.
        for line in fReader:
            detector.feed(line)
            if detector.done:
                break
    # close() must run before the result is read so the guess is finalised.
    detector.close()
    return detector.result['encoding']
def validate_utf8(file):
    """Check that *file* is UTF-8 encoded and unchanged by ``Text``.

    Up to roughly the first thousand lines are fed to a
    ``UniversalDetector``. If the detected encoding is not ``"utf-8"`` with
    a confidence of at least 0.99, a warning is emitted and the process
    exits with status 1. Otherwise the file content is compared with
    ``Text(content)`` and a warning is emitted when they differ.

    Args:
        file: Path of the file to validate.

    Raises:
        SystemExit: When the file is not detected as UTF-8.
    """
    base_name = basename(file)

    detector = UniversalDetector()
    detector.reset()
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            detector.feed(line)
            # Cap the amount of data sampled so huge files stay cheap.
            if detector.done or i > 1000:
                break
    detector.close()

    result = detector.result
    if not (result["encoding"] == "utf-8" and result["confidence"] >= 0.99):
        warn(message=f"File {file} should encoding with UTF-8", level=1)
        sys.exit(1)

    # The file has just been established as UTF-8, so decode it as such
    # rather than with the platform default encoding, which would
    # mis-decode (or fail on) non-ASCII content on non-UTF-8 locales.
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()

    # NOTE(review): Text() presumably returns the NFC-normalised form of the
    # content - confirm against its definition.
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(message=f"File {base_name} should normalized to NFC",
             error_type="Format nfc-normalized-failed",
             file=base_name,
             level=1)
class SmartCollector(object):
    """Decide which collected pytest items must run, based on git changes.

    The collector inspects the git repository that contains ``rootdir``,
    works out which Python files (and which top-level members of those
    files) changed, and marks every test that does not depend on a changed
    member with a ``skip`` marker.
    """

    # Unified-diff hunk header: "@@ -start[,count] +start[,count] @@".
    _HUNK_HEADER = re.compile(
        r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', re.MULTILINE)

    def __init__(self, rootdir: str, lastfailed: ListOfString, ignore_source: ListOfString, commit_range: int,
                 diff_current_head_with_branch: str, allow_preemptive_failures: bool, logger: logging.Logger):
        """Store the configuration and create the shared encoding detector.

        Args:
            rootdir: Directory from which the git repository root is searched.
            lastfailed: Node ids of tests that failed on the previous run.
            ignore_source: Paths whose source files are ignored, or ``None``.
            commit_range: Number of commits to go back when diffing.
            diff_current_head_with_branch: Revision name the diff starts from.
            allow_preemptive_failures: Stored as-is; not read by this class.
            logger: Logger used for progress messages.
        """
        self.rootdir = rootdir
        self.lastfailed = lastfailed
        self.ignore_source = ignore_source
        self.commit_range = commit_range
        self.diff_current_head_with_branch = diff_current_head_with_branch
        self.allow_preemptive_failures = allow_preemptive_failures
        self.logger = logger
        self.packages = []
        self.encoding_detector = UniversalDetector()

    def read_file(self, fpath):
        """Read a Python source file using its detected encoding.

        Args:
            fpath: Path of the file to read.

        Returns:
            A ``(contents, linecount)`` tuple.

        Raises:
            Exception: When the contents cannot be parsed by ``ast.parse``.
        """
        self.encoding_detector.reset()
        with open(fpath, "rb") as raw:
            for line in raw:
                self.encoding_detector.feed(line)
                if self.encoding_detector.done:
                    break
        # The detector only finalises its guess in close(); reading the
        # result without it can leave the encoding unset.
        self.encoding_detector.close()

        detected = self.encoding_detector.result['encoding']
        enc = 'utf-8' if detected is None else detected.lower()

        with open(fpath, encoding=enc) as f:
            lines = f.readlines()
        contents = ''.join(lines)
        linecount = len(lines)

        try:
            ast.parse(contents)
        except Exception as e:
            raise Exception("Couldn't read file '%s' -- %s" % (fpath, str(e))) from e

        return contents, linecount

    def find_git_repo_root(self, dir: str) -> str:
        """Walk upwards from *dir* until a directory containing ``.git`` is found.

        Raises:
            Exception: When the filesystem root is reached without a match.
        """
        current = dir
        while ".git" not in os.listdir(current):
            parent = os.path.dirname(current)
            if parent == current:
                raise Exception("No git repo found relative to the pytest rootdir")
            current = parent
        return current

    @staticmethod
    def find_packages(dir: str) -> ListOfString:
        """Return the absolute path of every directory under *dir* that holds an ``__init__.py``."""
        return [
            os.path.abspath(root)
            for root, _, files in os.walk(dir)
            if '__init__.py' in files
        ]

    def find_all_files(self, repo_path: str) -> DictOfChangedFile:
        """Treat every non-ignored ``.py`` file under *repo_path* as newly added."""
        all_files = {}
        for root, _, files in os.walk(repo_path):
            for f in files:
                fpath = os.path.join(root, f)
                if os.path.splitext(f)[-1] != ".py" or self.should_ignore_source_file(fpath):
                    continue
                _, linecount = self.read_file(fpath)
                all_files[fpath] = ChangedFile(
                    change_type='A',
                    old_filepath=None,
                    current_filepath=fpath,
                    # Line numbers are 1-based and the last line is included.
                    changed_lines=[range(1, linecount + 1)]
                )
        return all_files

    @classmethod
    def _parse_hunk_ranges(cls, diff_text: str) -> list:
        """Return the pre- and post-image line ranges of every hunk in *diff_text*."""
        ranges = []
        for match in cls._HUNK_HEADER.finditer(diff_text):
            pre_start, pre_count, post_start, post_count = match.groups()
            # An omitted count in a unified-diff header means one line.
            pre_count = 1 if pre_count is None else int(pre_count)
            post_count = 1 if post_count is None else int(post_count)
            ranges.append(range(int(pre_start), int(pre_start) + pre_count))
            ranges.append(range(int(post_start), int(post_start) + post_count))
        return ranges

    def find_changed_files(self, repo: Repo, repo_path: str) -> (DictOfChangedFile, DictOfChangedFile, DictOfChangedFile, DictOfChangedFile, DictOfChangedFile):
        """Collect the Python files changed between the base revision and HEAD.

        The base revision is ``<diff_current_head_with_branch>~<commit_range>``.

        Returns:
            Five dicts keyed by file path, for added, modified, deleted,
            renamed and type-changed files, in that order.

        Raises:
            Exception: When a diff reports a change type other than
                ``A``, ``M``, ``D``, ``R`` or ``T``.
        """
        changed_files = {'A': {}, 'M': {}, 'D': {}, 'R': {}, 'T': {}}

        current_head = repo.head.commit
        previous_commits = repo.commit("%s~%d" % (self.diff_current_head_with_branch, self.commit_range))
        diffs = previous_commits.diff(current_head)
        diffs_with_patch = previous_commits.diff(current_head, create_patch=True)

        for idx, d in enumerate(diffs):
            diff_text = diffs_with_patch[idx].diff.decode('utf-8').replace('\r', '')
            if re.match('^Binary files.*', diff_text) or len(diff_text) == 0:
                # TODO: figure out if there are any other special cases where the diff information is non-standard
                continue

            if d.change_type not in changed_files:
                # something is seriously wrong...
                raise Exception("Unknown change type '%s'" % d.change_type)

            # Renames and type changes are described by their new path; the
            # decoded *_path attributes are used because *_rawpath is bytes.
            if d.change_type in ('R', 'T'):
                filepath = os.path.join(repo_path, d.b_path)
                old_filepath = os.path.join(repo_path, d.a_path)
            else:
                filepath = os.path.join(repo_path, d.a_path)
                old_filepath = None

            # we only care about python files here
            if os.path.splitext(filepath)[-1] != '.py':
                continue

            if d.change_type == 'M':
                changed_lines = self._parse_hunk_ranges(diff_text)
            elif d.change_type == 'D':
                changed_lines = None
            else:
                # Added, renamed and type-changed files count as fully changed.
                _, linecount = self.read_file(filepath)
                changed_lines = [range(1, linecount + 1)]

            if os.sep == "\\":
                filepath = filepath.replace('/', os.sep)
                if old_filepath is not None:
                    old_filepath = old_filepath.replace('/', os.sep)

            changed_files[d.change_type][filepath] = ChangedFile(
                change_type=d.change_type,
                old_filepath=old_filepath,
                current_filepath=filepath,
                changed_lines=changed_lines
            )

        return changed_files['A'], changed_files['M'], changed_files['D'], changed_files['R'], changed_files['T']

    def should_ignore_source_file(self, source_file: str) -> bool:
        """Return True when *source_file* lies under one of the ``ignore_source`` paths."""
        if self.ignore_source is None:
            return False
        return any(
            os.path.commonpath([source_file, ign]) == ign
            for ign in self.ignore_source
        )

    def find_changed_members(self, changed_module: ChangedFile, repo_path: str) -> ListOfString:
        """List the top-level assignments, functions and classes touched by a change.

        A member is considered touched when any changed line falls between
        its first line and the line before the next top-level node (or the
        end of the file for the last node).
        """
        changed_members = []
        name_extractor = ObjectNameExtractor()
        contents, total_lines = self.read_file(os.path.join(repo_path, changed_module.current_filepath))
        module_ast = ast.parse(contents)
        direct_children = list(ast.iter_child_nodes(module_ast))

        # get a set of all changed lines in changed_module
        changed_lines = set()
        for ch in changed_module.changed_lines:
            changed_lines.update(ch)

        # the direct children of the module correspond to the imported names in test files
        for idx, node in enumerate(direct_children):
            if not isinstance(node, (ast.Assign, ast.FunctionDef, ast.ClassDef)):
                continue

            if idx + 1 < len(direct_children):
                end = direct_children[idx + 1].lineno
            else:
                # The last member extends through the final line of the file.
                end = total_lines + 1

            if changed_lines.intersection(range(node.lineno, end)):
                if isinstance(node, ast.Assign):
                    changed_members.extend(name_extractor.extract(node))
                else:
                    changed_members.append(node.name)

        return changed_members

    @staticmethod
    def find_fully_qualified_module_name(path: str) -> str:
        """Build the dotted module name of *path* by climbing through package directories."""
        parts = [os.path.splitext(os.path.basename(path))[0]]
        while "__init__.py" in os.listdir(os.path.dirname(path)):
            path = os.path.dirname(path)
            parts.insert(0, os.path.basename(path))
        return ".".join(parts)

    @staticmethod
    def file_in_project(dir, f):
        """Return True when *dir* is contained in the common path of *dir* and *f*."""
        return dir in os.path.commonpath([dir, f])

    def dependencies_changed(self, path: str, object_name: str, change_map: DictOfListOfString, chain: ListOfString) -> bool:
        """Report whether *object_name* in *path* depends on changed code.

        Args:
            path: File that defines the object.
            object_name: Name of the definition to inspect.
            change_map: Maps file paths to names known to have changed;
                updated in place when a changed dependency is discovered.
            chain: Receives ``"path::name"`` for the changed object found.

        Returns:
            True when the object itself, one of its base classes, or a name
            it uses is known to have changed.
        """
        git_repo_root = self.find_git_repo_root(self.rootdir)

        if path in change_map and object_name in change_map[path]:
            # if we've seen this file before and already know it to be changed, just return True
            chain.insert(0, "%s::%s" % (path, object_name))
            return True

        if not self.file_in_project(git_repo_root, path):
            # if the file is outside of the project, don't bother checking it or any of its dependencies
            return False

        # otherwise, recursively check the dependencies of this file for other known changes
        contents, _ = self.read_file(path)
        module_ast = ast.parse(contents)

        # find locally changed members
        locally_changed = change_map.get(path, [])

        # find the object of interest in the ast
        obj = None
        for child in DefinitionNodeExtractor().extract(module_ast):
            if child.name == object_name:
                obj = child
                break

        if obj is None:
            # if the object wasn't a definition and is unchanged, assume that there are no further dependencies in the chain
            return False

        # extract imports
        imported_names_and_modules = {}

        def remember(name, candidate):
            # Record an in-project file for an imported name, creating the
            # entry on demand so a missing module file cannot cause a KeyError.
            if candidate is None or not self.file_in_project(git_repo_root, candidate):
                return
            known = imported_names_and_modules.setdefault(name, [])
            if candidate not in known:
                known.append(candidate)

        for (module_name, imported_names, import_level) in ImportModuleNameExtractor().extract(module_ast):
            if module_name in sys.builtin_module_names:
                # we can safely assume that builtin module changes aren't relevant
                continue

            if module_name is None:
                # here we need to find the fully qualified module name for a package relative import
                assert import_level > 0
                module_name = self.find_fully_qualified_module_name(os.path.dirname(path))
            elif import_level > 0:
                # another package relative import situation
                name_parts = [module_name]
                src_path = path
                while import_level > 0:
                    src_path = os.path.dirname(src_path)
                    name_parts.insert(0, os.path.basename(src_path))
                    import_level -= 1
                module_name = '.'.join(name_parts)

            module = import_module(module_name)
            if len(imported_names) == 0 or '*' in imported_names:
                imported_names = dir(module)

            # Modules without a file on disk report a missing or None __file__.
            module_file = getattr(module, '__file__', None)

            for imported_name in imported_names:
                o = getattr(module, imported_name)
                owner = getattr(o, '__module__', None)
                if owner is not None and owner not in sys.builtin_module_names:
                    f = getattr(import_module(owner), '__file__', None)
                else:
                    f = None

                remember(imported_name, module_file)
                remember(imported_name, f)

        # check base classes recursively
        if isinstance(obj, ast.ClassDef):
            for base_name in BaseClassNameExtractor().extract(obj):
                # A distinct loop variable keeps the `path` argument intact
                # for the checks that follow.
                for base_path in imported_names_and_modules.get(base_name, ()):
                    if self.dependencies_changed(base_path, base_name, change_map, chain):
                        change_map.setdefault(base_path, []).append(base_name)
                        return True

        # extract call objects from obj
        for name in ObjectNameExtractor().extract(obj):
            if name == object_name:
                # to avoid infinite recursion when a class invokes it's own class methods or if a recursive function calls itself
                continue

            if name in locally_changed:
                if path in change_map and name in change_map[path]:
                    return True

            for module_path in imported_names_and_modules.get(name, ()):
                if self.dependencies_changed(module_path, name, change_map, chain):
                    change_map.setdefault(module_path, []).append(name)
                    return True

        return False

    @staticmethod
    def _has_skip_marker(test):
        """Return the ``skip`` marker of *test* (or a falsy value if absent)."""
        # get_marker() is gone from current pytest; older versions lack
        # get_closest_marker(), so fall back to the legacy call there.
        lookup = getattr(test, 'get_closest_marker', None)
        if lookup is None:
            lookup = test.get_marker
        return lookup('skip')

    def run(self, items):
        """Mark every test in *items* that is unaffected by the git changes as skipped.

        A test is kept when its file is newly added, when it failed on the
        last run, when it uses a changed fixture, or when its dependency
        chain reaches changed code. The decisions are written to
        ``results.csv`` in the current working directory.

        Raises:
            Exception: Any failure is re-raised as a plain ``Exception``
                after ``sys.path`` has been restored.
        """
        log_records = []
        git_repo_root = self.find_git_repo_root(self.rootdir)
        self.packages = self.find_packages(git_repo_root)
        for p in self.packages:
            sys.path.insert(0, p)

        try:
            repo = Repo(git_repo_root)
            total_commits_on_head = len(list(repo.iter_commits("HEAD")))

            if self.diff_current_head_with_branch == repo.active_branch.name and total_commits_on_head < 2:
                added_files = self.find_all_files(git_repo_root)
                modified_files = {}
                renamed_files = {}
                changed_filetype_files = {}
            else:
                # inspect the diff
                added_files, modified_files, _, renamed_files, changed_filetype_files = \
                    self.find_changed_files(repo, git_repo_root)

            # splitext() returns a (root, ext) pair; compare the extension only.
            changed_to_py = {
                changed_filetype.current_filepath: changed_filetype
                for changed_filetype in changed_filetype_files.values()
                if os.path.splitext(changed_filetype.current_filepath)[-1] == ".py"
            }

            changed_files = {}
            changed_files.update(changed_to_py)
            changed_files.update(modified_files)
            changed_files.update(renamed_files)
            changed_files.update(added_files)

            # ignore anything explicitly set in --ignore-source flags
            changed_files = {k: v for k, v in changed_files.items() if not self.should_ignore_source_file(k)}

            # determine all changed members of each of the changed files (if applicable)
            changed_members_and_modules = {
                path: self.find_changed_members(ch, git_repo_root)
                for path, ch in changed_files.items()
            }

            test_count = 0
            fixture_map = {}
            ast_map = {}
            for test in items:
                test_path = str(test.fspath)
                # TODO: figure out a better way to handle test names of parameterized tests
                test_name = test.name.split('[')[0]

                # if the test is new, run it anyway
                if test_path in changed_files and changed_files[test_path].change_type == 'A':
                    log_records.append(('RUN', test.nodeid, "New test"))
                    self.logger.info("Test '%s' is new, so will be run regardless of changes to the code it tests" % test.nodeid)
                    test_count += 1
                    continue

                # if the test failed in the last run, run it anyway
                if test.nodeid in self.lastfailed:
                    log_records.append(('RUN', test.nodeid, "Failed on last run"))
                    self.logger.info(
                        "Test '%s' failed on the last run, so will be run regardless of changes" % test.nodeid)
                    test_count += 1
                    continue

                # if the test is already skipped, just ignore it
                if self._has_skip_marker(test):
                    log_records.append(('SKIP', test.nodeid, "Found skip marker"))
                    self.logger.info("Found skip marker on test '%s' -- ignoring" % test.nodeid)
                    continue

                # check dependencies within any defined fixtures
                if test_path in ast_map:
                    test_file_ast = ast_map[test_path]
                else:
                    contents, _ = self.read_file(test_path)
                    test_file_ast = ast.parse(contents)
                    ast_map[test_path] = test_file_ast

                test_node = None
                for child in ast.iter_child_nodes(test_file_ast):
                    if isinstance(child, ast.ClassDef):
                        for subchild in ast.iter_child_nodes(child):
                            if isinstance(subchild, ast.FunctionDef) and subchild.name == test_name:
                                test_node = subchild
                                break
                        if test_node is not None:
                            break
                    elif isinstance(child, ast.FunctionDef) and child.name == test_name:
                        test_node = child
                        break

                assert test_node is not None

                if test_path not in fixture_map:
                    fixture_map[test_path] = FixtureExtractor().extract(test_file_ast)

                found_changed_fixture = False
                for fixture in fixture_map[test_path]:
                    for arg in test_node.args.args:
                        if arg.arg == fixture.name and self.dependencies_changed(test_path, fixture.name, changed_members_and_modules, []):
                            log_records.append(('RUN', test.nodeid, "Uses changed fixture"))
                            self.logger.info("Test '%s' will run because it uses a changed fixture (%s)" % (
                                test.nodeid, fixture.name))
                            test_count += 1
                            found_changed_fixture = True
                            break
                    if found_changed_fixture:
                        break

                if found_changed_fixture:
                    continue

                # otherwise, check the dependency chain from inside the test function
                chain = []
                if self.dependencies_changed(test_path, test_name, changed_members_and_modules, chain):
                    log_records.append(('RUN', test.nodeid, "Dependency changed: " + ' -> '.join(chain)))
                    self.logger.info(
                        "Test '%s' will run because one of it's dependencies changed (%s)" % (
                            test.nodeid, ' -> '.join(chain)))
                    test_count += 1
                    continue

                log_records.append(('SKIP', test.nodeid, "Unchanged"))
                self.logger.info("Test '%s' doesn't touch new or modified code -- SKIPPING" % test.nodeid)
                skip = pytest.mark.skip(reason="This test doesn't touch new or modified code")
                test.add_marker(skip)

            # TODO: add option to write to csv
            import csv
            # newline='' lets the csv module control line endings itself.
            with open("results.csv", "w", newline='') as csvfile:
                csvwriter = csv.writer(csvfile)
                for row in log_records:
                    csvwriter.writerow(list(row))

            self.logger.warning("Total tests selected to run: " + str(test_count))
            self._revert_syspath()

        except Exception as e:
            self._handle_exception(str(e))

    def _handle_exception(self, msg):
        """Restore ``sys.path`` and raise *msg* as a plain ``Exception``."""
        self._revert_syspath()
        raise Exception(msg)

    def _revert_syspath(self):
        """Remove the package directories that ``run`` put on ``sys.path``."""
        # Remove by value rather than popping index 0: code imported while
        # running may have prepended its own entries in the meantime.
        for package in self.packages:
            try:
                sys.path.remove(package)
            except ValueError:
                pass
parser = argparse.ArgumentParser() parser.add_argument("-l", help="list all encoding changes in file", action='store_true') parser.add_argument("-d", help="try to decode all Lines", action='store_true') parser.add_argument('filename') args = parser.parse_args() with open(args.filename, 'rb') as infile: det = UniversalDetector() if args.l: print("listing encodings of file \"{}\"".format(args.filename)) encoding = None for nl, line in enumerate(infile.readlines()): det.reset() det.feed(line) det.close() res = det.result if encoding != res["encoding"]: encoding = res["encoding"] if args.d: print("{}#{}#{}({})".format(nl, line.decode(res["encoding"]), res["encoding"], res["confidence"])) else: print("{}#{}#{}({})".format(nl, line, res["encoding"], res["confidence"])) else: i = 1000