def filtered_files(self, lang: Language) -> FrozenSet[Path]:
    """
    Return the set of files to scan for language LANG.

    Directories in TARGET are expanded to their descendant files with an
    extension matching LANG. Those files are then passed through the
    INCLUDES filter, the EXCLUDES filter (with `.git` always appended to
    the exclude patterns) and the size filter driven by max_target_bytes.

    Files named explicitly in TARGET bypass those filters:
      * an explicit file whose extension matches LANG is always kept;
      * an explicit file whose extension is not in ALL_EXTENSIONS is kept
        only when skip_unknown_extensions is off;
      * an explicit file with a known extension of another language is
        dropped.

    Explicit targets that are neither a directory nor an existing file
    are reported to the output handler as a FilesNotFoundError.

    The result is cached per language in self._filtered_targets.
    """
    if lang in self._filtered_targets:
        return self._filtered_targets[lang]

    def _is_not_directory(path: Path) -> bool:
        return not path.is_dir()

    def _is_existing_file(path: Path) -> bool:
        return path.is_file()

    def _has_any_extension(path: Path, extensions) -> bool:
        # True when the file name ends with at least one of the extensions.
        return any(path.match(f"*{ext}") for ext in extensions)

    resolved = self.resolve_targets(self.targets)
    files, directories = partition_set(_is_not_directory, resolved)

    # Error on non-existent files
    explicit_files, nonexistent_files = partition_set(_is_existing_file, files)
    if nonexistent_files:
        self.output_handler.handle_semgrep_error(
            FilesNotFoundError(tuple(nonexistent_files))
        )

    # Files discovered by walking directories go through every filter.
    discovered = self.expand_targets(directories, lang, self.respect_git_ignore)
    discovered = self.filter_includes(discovered, self.includes)
    discovered = self.filter_excludes(discovered, self.excludes + [".git"])
    discovered = self.filter_by_size(discovered, self.max_target_bytes)

    # Explicit files with an extension of LANG are added unconditionally.
    explicit_for_lang = frozenset(
        candidate
        for candidate in explicit_files
        if _has_any_extension(candidate, lang_to_exts(lang))
    )
    selected = discovered.union(explicit_for_lang)

    # Explicit files with an unrecognised extension are added on request only.
    if not self.skip_unknown_extensions:
        explicit_unknown = frozenset(
            candidate
            for candidate in explicit_files
            if not _has_any_extension(candidate, ALL_EXTENSIONS)
        )
        selected = selected.union(explicit_unknown)

    self._filtered_targets[lang] = selected
    return self._filtered_targets[lang]
def _expand_dir(curr_dir: Path, language: Language,
                respect_git_ignore: bool) -> FrozenSet[Path]:
    """
    Return the set of files under curr_dir that carry one of the file
    extensions of language.

    When respect_git_ignore is true, git is asked (per extension) for the
    tracked files and for the untracked-but-not-ignored files, and the
    files git reports as deleted are removed from the result. If a git
    call fails or the git executable cannot be found, that extension is
    collected with a recursive glob instead.

    When respect_git_ignore is false, a recursive glob is always used.

    Every candidate path is checked with TargetManager._is_valid.
    """

    def _paths_from_listing(listing: str, base_dir: Path) -> FrozenSet[Path]:
        """
        Turn a newline delimited list of file names into a set of paths,
        each prefixed with base_dir. An empty listing gives an empty set.
        """
        if not listing:
            return frozenset()
        joined = (Path(base_dir) / name for name in listing.strip().split("\n"))
        return frozenset(path for path in joined if TargetManager._is_valid(path))

    def _glob_extension(base_dir: Path,
                        extension: FileExtension) -> FrozenSet[Path]:
        """
        Return the set of all valid regular files below base_dir whose name
        ends with extension.
        """
        return frozenset(
            path
            for path in base_dir.rglob(f"*{extension}")
            if TargetManager._is_valid(path) and path.is_file()
        )

    def _git_ls_files(extension: FileExtension, *flags: str) -> str:
        """
        Run `git ls-files [flags] *<extension>` from curr_dir and return
        its output as text.
        """
        return sub_check_output(
            ["git", "ls-files", *flags, f"*{extension}"],
            cwd=curr_dir.resolve(),
            encoding="utf-8",
            stderr=subprocess.DEVNULL,
        )

    collected: FrozenSet[Path] = frozenset()
    for ext in lang_to_exts(language):
        if not respect_git_ignore:
            collected = collected | _glob_extension(curr_dir, ext)
            continue

        try:
            # Tracked files
            tracked_listing = _git_ls_files(ext)
            # Untracked but not ignored files
            untracked_listing = _git_ls_files(
                ext, "--other", "--exclude-standard")
            deleted_listing = _git_ls_files(ext, "--deleted")
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Not a git directory or git not installed.
            # Fallback to using rglob
            collected = collected | _glob_extension(curr_dir, ext)
            continue

        tracked_paths = _paths_from_listing(tracked_listing, curr_dir)
        untracked_paths = _paths_from_listing(untracked_listing, curr_dir)
        deleted_paths = _paths_from_listing(deleted_listing, curr_dir)
        collected = (collected | tracked_paths | untracked_paths) - deleted_paths

    return collected