Example #1
0
    def filtered_files(self, lang: Language) -> FrozenSet[Path]:
        """
        Return the set of files to consider for LANG, computing it at most once
        per language and caching it in self._filtered_targets.

        Directories in TARGET are expanded to their descendants that have an
        extension matching LANG; that expansion is then narrowed by INCLUDES,
        by EXCLUDES (plus `.git`) and by filter_by_size with max_target_bytes.

        Files named explicitly in TARGET bypass those filters. An explicit file
        is added when its name ends with an extension of LANG and, unless
        skip_unknown_extensions is set, also when its name ends with none of
        the extensions in ALL_EXTENSIONS. An explicit file whose known
        extension belongs to a different language is therefore left out.

        Targets that are neither a directory nor an existing file are reported
        to the output handler as a FilesNotFoundError.
        """
        if lang in self._filtered_targets:
            return self._filtered_targets[lang]

        resolved = self.resolve_targets(self.targets)
        non_dirs, dirs = partition_set(lambda p: not p.is_dir(), resolved)

        # Anything that is not a directory must be an existing regular file
        existing, missing = partition_set(lambda p: p.is_file(), non_dirs)
        if missing:
            not_found = FilesNotFoundError(tuple(missing))
            self.output_handler.handle_semgrep_error(not_found)

        # Only files discovered through directories go through the filters
        discovered = self.expand_targets(
            dirs, lang, self.respect_git_ignore)
        discovered = self.filter_includes(discovered, self.includes)
        discovered = self.filter_excludes(
            discovered, self.excludes + [".git"])
        discovered = self.filter_by_size(discovered, self.max_target_bytes)

        # Explicit files whose name ends with an extension of LANG
        explicit_additions = [
            frozenset(
                path for path in existing
                if any(path.match(f"*{ext}") for ext in lang_to_exts(lang)))
        ]

        if not self.skip_unknown_extensions:
            # Explicit files whose name ends with no known extension at all
            explicit_additions.append(
                frozenset(
                    path for path in existing
                    if not any(
                        path.match(f"*{ext}") for ext in ALL_EXTENSIONS)))

        self._filtered_targets[lang] = discovered.union(*explicit_additions)
        return self._filtered_targets[lang]
Example #2
0
    def _expand_dir(curr_dir: Path, language: Language,
                    respect_git_ignore: bool) -> FrozenSet[Path]:
        """
        Recursively go through CURR_DIR and return the set of all files whose
        name ends with one of the default file extensions of LANGUAGE.

        When RESPECT_GIT_IGNORE is true the candidates come from
        `git ls-files` run inside CURR_DIR: tracked files plus untracked files
        that are not ignored, minus the files git reports as deleted. If a git
        command fails, git is not installed, or the output of git cannot be
        decoded as UTF-8, the extension being processed falls back to a plain
        recursive glob (which does not take gitignore rules into account).
        When RESPECT_GIT_IGNORE is false the recursive glob is always used.

        Every returned path has passed TargetManager._is_valid.
        """
        def _parse_output(output: str, curr_dir: Path) -> FrozenSet[Path]:
            """
            Convert the NUL delimited output of `git ls-files -z` to a set of
            path objects, prepending curr_dir to every entry.

            Entries are used verbatim (no whitespace stripping). Empty entries,
            such as the one following the final NUL, are skipped so that
            curr_dir itself is never returned. Empty output gives an empty set.
            """
            candidates = (Path(curr_dir) / entry
                          for entry in output.split("\0") if entry)
            return frozenset(p for p in candidates
                             if TargetManager._is_valid(p))

        def _find_files_with_extension(
                curr_dir: Path, extension: FileExtension) -> FrozenSet[Path]:
            """
            Return set of all files below curr_dir (searched recursively) whose
            name ends with the given extension
            """
            return frozenset(p for p in curr_dir.rglob(f"*{extension}")
                             if TargetManager._is_valid(p) and p.is_file())

        def _git_ls_files(cwd: Path, pattern: str, *flags: str) -> str:
            """
            Run `git ls-files -z [flags] pattern` in cwd and return its output.

            -z makes git terminate every entry with NUL and print path names
            verbatim. Without it git quotes names that contain unusual
            characters (by default this includes all non-ASCII bytes), and the
            quoted text is not the real name of the file.
            """
            return sub_check_output(
                ["git", "ls-files", "-z", *flags, pattern],
                cwd=cwd,
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )

        expanded: FrozenSet[Path] = frozenset()

        for ext in lang_to_exts(language):
            if not respect_git_ignore:
                expanded = expanded.union(
                    _find_files_with_extension(curr_dir, ext))
                continue

            pattern = f"*{ext}"
            try:
                git_cwd = curr_dir.resolve()

                # Tracked files
                tracked_output = _git_ls_files(git_cwd, pattern)

                # Untracked but not ignored files
                untracked_output = _git_ls_files(
                    git_cwd, pattern, "--other", "--exclude-standard")

                # Files git reports as deleted
                deleted_output = _git_ls_files(git_cwd, pattern, "--deleted")
            except (subprocess.CalledProcessError, FileNotFoundError,
                    UnicodeDecodeError):
                # Not a git directory, git not installed, or a path name that
                # is not valid UTF-8. Fallback to using rglob
                expanded = expanded.union(
                    _find_files_with_extension(curr_dir, ext))
            else:
                expanded = expanded.union(
                    _parse_output(tracked_output, curr_dir))
                expanded = expanded.union(
                    _parse_output(untracked_output, curr_dir))
                # Deleted files are removed from everything collected so far
                expanded = expanded.difference(
                    _parse_output(deleted_output, curr_dir))

        return expanded