Example #1
0
    def _run_rules(
        self, rules: List[Rule], target_manager: TargetManager
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Dict[str, Any]]],
               List[SemgrepError], int, ]:
        """Run every rule over the targets, collecting matches and errors.

        Files that raise MatchTimeoutError at least ``self._timeout_threshold``
        times (when the threshold is non-zero) are added to
        ``max_timeout_files`` so subsequent rules can skip them.

        Returns:
            A tuple of (findings keyed by rule, debugging steps keyed by rule,
            deduplicated errors, number of distinct files scanned).
        """
        findings_by_rule: Dict[Rule, List[RuleMatch]] = {}
        debugging_steps_by_rule: Dict[Rule, List[Dict[str, Any]]] = {}
        all_errors: List[SemgrepError] = []
        # How many MatchTimeoutErrors each file has produced so far.
        file_timeouts: Dict[Path, int] = collections.defaultdict(int)
        # Files past the timeout threshold; passed to _run_rule for skipping.
        max_timeout_files: List[Path] = []
        all_targets: Set[Path] = set()

        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            bar = progress_bar(
                rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}")
            for rule in bar:
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                (matches, steps, errors, rule_targets) = self._run_rule(
                    rule, target_manager, semgrep_core_ast_cache_dir,
                    max_timeout_files)
                all_targets.update(rule_targets)
                findings_by_rule[rule] = matches
                debugging_steps_by_rule[rule] = steps
                all_errors.extend(errors)
                for err in errors:
                    if not isinstance(err, MatchTimeoutError):
                        continue
                    file_timeouts[err.path] += 1
                    if (self._timeout_threshold != 0
                            and file_timeouts[err.path]
                            >= self._timeout_threshold):
                        max_timeout_files.append(err.path)

        return (findings_by_rule, debugging_steps_by_rule,
                dedup_errors(all_errors), len(all_targets))
Example #2
0
    def _run_rules(
        self, rules: List[Rule], target_manager: TargetManager
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Dict[str, Any]]],
               List[CoreException], ]:
        """Run each rule over the targets and gather its results.

        Returns:
            A tuple of (findings keyed by rule, debugging steps keyed by rule,
            deduplicated errors).
        """
        findings_by_rule: Dict[Rule, List[RuleMatch]] = {}
        debugging_steps_by_rule: Dict[Rule, List[Dict[str, Any]]] = {}
        all_errors: List[CoreException] = []

        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            bar = progress_bar(
                rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}")
            for rule in bar:
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}")
                matches, steps, errors = self._run_rule(
                    rule, target_manager, semgrep_core_ast_cache_dir)
                findings_by_rule[rule] = matches
                debugging_steps_by_rule[rule] = steps
                all_errors.extend(errors)

        return (findings_by_rule, debugging_steps_by_rule,
                dedup_errors(all_errors))
Example #3
0
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        Dict[Any, Any],
    ]:
        """Invoke semgrep-core once per (rule, language) pair and collect results.

        Each rule is dumped to a temporary YAML file and its target paths to a
        temporary newline-separated list; semgrep-core is then run as a
        subprocess and its JSON output parsed into RuleMatch findings and
        SemgrepError entries.

        Returns:
            A tuple of (findings keyed by rule, empty debugging-steps dict,
            errors, the set of all paths known to the target manager, empty
            profiling dict).

        NOTE(review): ``profiler`` is accepted but never used in this body.
        """
        from itertools import chain
        from collections import defaultdict

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        # Scratch directory handed to semgrep-core via -use_parsing_cache.
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            # Flatten rules x rule.languages into one sequence of pairs.
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    # flush so the subprocess sees the full file contents
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    # Command line for one semgrep-core invocation.
                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                    output_json = self._parse_core_output(
                        out_bytes, err_bytes, returncode
                    )

                    # Non-zero exit: surface a structured error if the JSON
                    # carries one, otherwise fail with a generic message.
                    if returncode != 0:
                        if "error" in output_json:
                            self._raise_semgrep_error_from_json(output_json, [], rule)
                        else:
                            raise SemgrepError(
                                f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                            )

                # end with tempfile.NamedTemporaryFile(...) ...
                # output_json is still bound here: the `continue` for empty
                # targets guarantees the subprocess ran this iteration.
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
Example #4
0
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Any]],
               List[SemgrepError], Set[Path], ProfilingData, ]:
        """Invoke semgrep-core once per (rule, language) pair and collect results.

        Tracks per-file MatchTimeoutError counts; once a file hits
        ``self._timeout_threshold`` timeouts (threshold 0 disables this) it is
        filtered out of the target list for subsequent invocations. Timing
        information reported by semgrep-core (-json_time) is accumulated into
        ``profiling_data``.

        Returns:
            A tuple of (findings keyed by rule, empty debugging-steps dict,
            errors, all scanned target paths, profiling data).

        NOTE(review): ``profiler`` is accepted but never used in this body.
        """
        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        # How many MatchTimeoutErrors each file has produced so far.
        file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
        # Files past the timeout threshold; excluded from later runs.
        max_timeout_files: Set[Path] = set()

        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                for language in rule.languages:
                    debug_tqdm_write(f"Running rule {rule.id}...")
                    # NOTE(review): equiv_file is opened but never written to
                    # or passed to semgrep-core in this body.
                    with tempfile.NamedTemporaryFile(
                            "w", suffix=".yaml"
                    ) as rule_file, tempfile.NamedTemporaryFile(
                            "w") as target_file, tempfile.NamedTemporaryFile(
                                "w") as equiv_file:
                        targets = self.get_files_for_language(
                            language, rule, target_manager)

                        # Drop files that already exceeded the timeout budget.
                        targets = [
                            target for target in targets
                            if target not in max_timeout_files
                        ]

                        # opti: no need to call semgrep-core if no target files
                        if not targets:
                            continue
                        all_targets = all_targets.union(targets)

                        target_file.write("\n".join(
                            map(lambda p: str(p), targets)))
                        # flush so the subprocess sees the full file contents
                        target_file.flush()
                        yaml = YAML()
                        yaml.dump({"rules": [rule._raw]}, rule_file)
                        rule_file.flush()

                        # Command line for one semgrep-core invocation.
                        cmd = [SEMGREP_PATH] + [
                            "-lang",
                            language.value,
                            "-json",
                            "-config",
                            rule_file.name,
                            "-j",
                            str(self._jobs),
                            "-target_file",
                            target_file.name,
                            "-use_parsing_cache",
                            semgrep_core_ast_cache_dir,
                            "-timeout",
                            str(self._timeout),
                            "-max_memory",
                            str(self._max_memory),
                            "-json_time",
                        ]

                        if self._optimizations != "none":
                            cmd.append("-fast")

                        # In debug mode, let semgrep-core's stderr pass
                        # through to ours instead of capturing it.
                        stderr: Optional[int] = subprocess.PIPE
                        if is_debug():
                            cmd += ["-debug"]
                            stderr = None

                        core_run = sub_run(cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=stderr)
                        output_json = self._extract_core_output(rule, core_run)

                        if "time" in output_json:
                            self._add_match_times(rule, profiling_data,
                                                  output_json["time"])

                    # end with tempfile.NamedTemporaryFile(...) ...
                    # output_json is still bound here: the `continue` for
                    # empty targets guarantees the subprocess ran.
                    pattern_matches = [
                        PatternMatch(match) for match in output_json["matches"]
                    ]
                    findings = create_output(rule, pattern_matches)

                    findings = dedup_output(findings)
                    outputs[rule].extend(findings)
                    parsed_errors = [
                        CoreException.from_json(e, language.value,
                                                rule.id).into_semgrep_error()
                        for e in output_json["errors"]
                    ]
                    # Update timeout bookkeeping from this run's errors.
                    for err in parsed_errors:
                        if isinstance(err, MatchTimeoutError):
                            file_timeouts[err.path] += 1
                            if (self._timeout_threshold != 0
                                    and file_timeouts[err.path] >=
                                    self._timeout_threshold):
                                max_timeout_files.add(err.path)
                    errors.extend(parsed_errors)
            # end for language ...
        # end for rule ...

        return outputs, {}, errors, all_targets, profiling_data
Example #5
0
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        """Invoke semgrep-core once per (rule, language) pair and collect results.

        Each rule is dumped to a temporary YAML file and its target paths to a
        temporary list file; semgrep-core is run as a subprocess and its JSON
        output parsed into RuleMatch findings and SemgrepError entries. Timing
        output (when ``self._report_time`` enables -json_time) is folded into
        ``profiling_data``.

        Returns:
            A tuple of (findings keyed by rule, empty debugging-steps dict,
            errors, all scanned target paths, profiling data).

        NOTE(review): ``profiler`` is accepted but never used in this body.
        """
        from itertools import chain
        from collections import defaultdict

        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        profiling_data: ProfilingData = ProfilingData()
        # Scratch directory handed to semgrep-core via -use_parsing_cache.
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            # Flatten rules x rule.languages into one sequence of pairs.
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    # flush so the subprocess sees the full file contents
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    # Command line for one semgrep-core invocation.
                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    if self._report_time:
                        cmd += ["-json_time"]

                    if self._output_settings.debug:
                        cmd += ["-debug"]

                    core_run = sub_run(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    output_json = self._extract_core_output(rule, [], core_run)

                    if "time" in output_json:
                        self._add_match_times(rule, profiling_data, output_json["time"])

                # end with tempfile.NamedTemporaryFile(...) ...
                # output_json is still bound here: the `continue` for empty
                # targets guarantees the subprocess ran this iteration.
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, all_targets, profiling_data