Example #1
    def _run_rules(
        self, rules: List[Rule], target_manager: TargetManager
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Dict[str, Any]]],
        List[SemgrepError],
        int,
    ]:
        findings_by_rule: Dict[Rule, List[RuleMatch]] = {}
        debugging_steps_by_rule: Dict[Rule, List[Dict[str, Any]]] = {}
        all_errors: List[SemgrepError] = []
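        # Track how many rules have timed out on each file; files that reach
        # the threshold are collected in max_timeout_files and passed to each
        # subsequent _run_rule call.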
        file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
        max_timeout_files: List[Path] = []
        all_targets: Set[Path] = set()

        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                rule_matches, debugging_steps, errors, rule_targets = self._run_rule(
                    rule, target_manager, semgrep_core_ast_cache_dir,
                    max_timeout_files)
                all_targets = all_targets.union(rule_targets)
                findings_by_rule[rule] = rule_matches
                debugging_steps_by_rule[rule] = debugging_steps
                all_errors.extend(errors)
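                # Count per-file match timeouts and record files that reach
                # the configured threshold.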
                for err in errors:
                    if isinstance(err, MatchTimeoutError):
                        file_timeouts[err.path] += 1
                        if (self._timeout_threshold != 0
                                and file_timeouts[err.path] >=
                                self._timeout_threshold):
                            max_timeout_files.append(err.path)

        all_errors = dedup_errors(all_errors)
        return (
            findings_by_rule,
            debugging_steps_by_rule,
            all_errors,
            len(all_targets),
        )
Example #2
    def _run_rules(
        self, rules: List[Rule], target_manager: TargetManager
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Dict[str, Any]]],
        List[CoreException],
    ]:
        findings_by_rule: Dict[Rule, List[RuleMatch]] = {}
        debugging_steps_by_rule: Dict[Rule, List[Dict[str, Any]]] = {}
        all_errors: List[CoreException] = []

        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}")
                rule_matches, debugging_steps, errors = self._run_rule(
                    rule, target_manager, semgrep_core_ast_cache_dir)
                findings_by_rule[rule] = rule_matches
                debugging_steps_by_rule[rule] = debugging_steps
                all_errors.extend(errors)

        all_errors = dedup_errors(all_errors)
        return findings_by_rule, debugging_steps_by_rule, all_errors
Example #3
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
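        # Per-file timeout bookkeeping, mirroring _run_rules above.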
        file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
        max_timeout_files: Set[Path] = set()

        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                for language in rule.languages:
                    debug_tqdm_write(f"Running rule {rule.id}...")
                    with tempfile.NamedTemporaryFile(
                            "w", suffix=".yaml"
                    ) as rule_file, tempfile.NamedTemporaryFile(
                            "w") as target_file, tempfile.NamedTemporaryFile(
                                "w") as equiv_file:
                        targets = self.get_files_for_language(
                            language, rule, target_manager)

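                        # Drop targets that have already hit the timeout
                        # threshold on earlier rules.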
                        targets = [
                            target for target in targets
                            if target not in max_timeout_files
                        ]

                        # opti: no need to call semgrep-core if no target files
                        if not targets:
                            continue
                        all_targets = all_targets.union(targets)

                        target_file.write("\n".join(
                            map(lambda p: str(p), targets)))
                        target_file.flush()
                        yaml = YAML()
                        yaml.dump({"rules": [rule._raw]}, rule_file)
                        rule_file.flush()

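                        # Build the semgrep-core command line for this
                        # rule/language pair.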
                        cmd = [SEMGREP_PATH] + [
                            "-lang",
                            language.value,
                            "-json",
                            "-config",
                            rule_file.name,
                            "-j",
                            str(self._jobs),
                            "-target_file",
                            target_file.name,
                            "-use_parsing_cache",
                            semgrep_core_ast_cache_dir,
                            "-timeout",
                            str(self._timeout),
                            "-max_memory",
                            str(self._max_memory),
                            "-json_time",
                        ]

                        if self._optimizations != "none":
                            cmd.append("-fast")

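                        # In debug mode, let semgrep-core write directly to our
                        # stderr instead of capturing it.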
                        stderr: Optional[int] = subprocess.PIPE
                        if is_debug():
                            cmd += ["-debug"]
                            stderr = None

                        core_run = sub_run(cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=stderr)
                        output_json = self._extract_core_output(rule, core_run)

                        if "time" in output_json:
                            self._add_match_times(rule, profiling_data,
                                                  output_json["time"])

                    # end with tempfile.NamedTemporaryFile(...) ...
                    pattern_matches = [
                        PatternMatch(match) for match in output_json["matches"]
                    ]
                    findings = create_output(rule, pattern_matches)

                    findings = dedup_output(findings)
                    outputs[rule].extend(findings)
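                    # Convert semgrep-core errors into SemgrepErrors and apply
                    # the same per-file timeout threshold as in _run_rules.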
                    parsed_errors = [
                        CoreException.from_json(e, language.value,
                                                rule.id).into_semgrep_error()
                        for e in output_json["errors"]
                    ]
                    for err in parsed_errors:
                        if isinstance(err, MatchTimeoutError):
                            file_timeouts[err.path] += 1
                            if (self._timeout_threshold != 0
                                    and file_timeouts[err.path] >=
                                    self._timeout_threshold):
                                max_timeout_files.add(err.path)
                    errors.extend(parsed_errors)
            # end for language ...
        # end for rule ...

        return outputs, {}, errors, all_targets, profiling_data