Example #1
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
        profiler: ProfileManager,
        match_time_matrix: Dict[Tuple[str, str], float],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
        """
        Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()

        for language, all_patterns_for_language in self._group_patterns_by_language(
            rule
        ).items():

            targets = self.get_files_for_language(language, rule, target_manager)
            targets = [target for target in targets if target not in max_timeout_files]
            all_targets = all_targets.union(targets)
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX
                    or p.expression.operator == OPERATORS.NOT_REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # Regex-only languages support only OPERATORS.REGEX;
                # skip passing this rule to semgrep-core.
                if language in REGEX_ONLY_LANGUAGE_KEYS:
                    continue

                # semgrep-core doesn't know about the following operators -
                # they are strictly semgrep Python features:
                #   - OPERATORS.METAVARIABLE_REGEX
                #   - OPERATORS.METAVARIABLE_COMPARISON
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator
                    not in [
                        OPERATORS.METAVARIABLE_REGEX,
                        OPERATORS.METAVARIABLE_COMPARISON,
                    ]
                ]

                patterns_json = [p.to_json() for p in patterns]

                if language == GENERIC_LANGUAGE:
                    output_json = profiler.track(
                        rule.id,
                        run_spacegrep,
                        rule.id,
                        patterns,
                        targets,
                        timeout=self._timeout,
                        report_time=self._report_time,
                    )
                else:  # Run semgrep-core
                    output_json = profiler.track(
                        rule.id,
                        self._run_core_command,
                        patterns_json,
                        patterns,
                        targets,
                        language,
                        rule,
                        "-rules_file",
                        cache_dir,
                        report_time=self._report_time,
                    )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
            if "time" in output_json:
                self._add_match_times(rule, match_time_matrix, output_json["time"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[
            Rule, Dict[Path, List[PatternMatch]]
        ] = collections.defaultdict(lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"--> rule ({rule.id}) has findings on filepath: {filepath}"
                )

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec
                )
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)
        logger.debug(f"...ran on {len(all_targets)} files")

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors, all_targets
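
All of these snippets lean on a partition helper to split a sequence by a
predicate (regex patterns vs. core patterns, configs with vs. without tests).
A minimal sketch of what such a helper might look like; the real semgrep
utility may differ in details:

from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")


def partition(
    predicate: Callable[[T], bool], iterable: Iterable[T]
) -> Tuple[List[T], List[T]]:
    # Split items into (matching, non-matching) lists, preserving order;
    # callers unpack the two lists directly.
    matching: List[T] = []
    rest: List[T] = []
    for item in iterable:
        (matching if predicate(item) else rest).append(item)
    return matching, rest

The unpacking order matches every call site above, e.g.
patterns_regex, patterns = partition(lambda p: ..., all_patterns_for_language).
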
Example #2
def generate_file_pairs(
    target: Path,
    config: Path,
    ignore_todo: bool,
    strict: bool,
    unsafe: bool,
    json_output: bool,
    save_test_output_tar: bool = True,
) -> None:
    configs = list(config.rglob("*"))
    targets = list(target.rglob("*"))
    config_filenames = [
        config_filename for config_filename in configs
        if config_filename.suffix in YML_EXTENSIONS
        and not config_filename.name.startswith(".")
        and not config_filename.parent.name.startswith(".")
    ]
    config_test_filenames = {
        config_filename: [
            target_filename for target_filename in targets
            if relatively_eq(target, target_filename, config, config_filename)
            and target_filename.is_file()
            and target_filename.suffix not in YML_EXTENSIONS
        ]
        for config_filename in config_filenames
    }
    config_with_tests, config_without_tests = partition(
        lambda c: c[1], config_test_filenames.items())
    config_missing_tests_output = [str(c[0]) for c in config_without_tests]

    invoke_semgrep_fn = functools.partial(
        invoke_semgrep_multi,
        no_git_ignore=True,
        no_rewrite_rule_ids=True,
        strict=strict,
        dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
    )
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.starmap(invoke_semgrep_fn, config_with_tests)

    config_with_errors, config_without_errors = partition(
        lambda r: r[1], results)
    config_with_errors_output = [{
        "filename": str(filename),
        "error": str(error),
        "output": output
    } for filename, error, output in config_with_errors]

    tested = {
        filename: score_output_json(output, config_test_filenames[filename],
                                    ignore_todo)
        for filename, _, output in config_without_errors
    }

    results_output: Mapping[str, Mapping[str, Any]] = {
        str(filename): {
            "todo": todo,
            "checks": {
                check_id: {
                    "tp": tp,
                    "tn": tn,
                    "fp": fp,
                    "fn": fn,
                    "passed": (fp == 0) and (fn == 0),
                    "matches": matches[check_id],
                }
                for check_id, (tp, tn, fp, fn) in output.items()
            },
        }
        for filename, (output, matches, todo) in tested.items()
    }

    output = {
        "config_missing_tests": config_missing_tests_output,
        "config_with_errors": config_with_errors_output,
        "results": results_output,
    }

    strict_error = bool(config_with_errors_output) and strict
    any_failures = any(not check_results["passed"]
                       for file_results in results_output.values()
                       for check_results in file_results["checks"].values())
    exit_code = int(strict_error or any_failures)

    if json_output:
        print(json.dumps(output, indent=4, separators=(",", ": ")))
        sys.exit(exit_code)

    # Save the results to a JSON file and tar it for upload as a GitHub artifact.
    if save_test_output_tar:
        list_to_output = []
        with open(SAVE_TEST_OUTPUT_JSON, "w") as f:
            for tup in results:
                # each result row is (config_filename, error, output); keep the output
                true_result = tup[2]
                list_to_output.append(true_result)
            f.write(json.dumps(list_to_output, indent=4,
                               separators=(",", ":")))

        with tarfile.open(SAVE_TEST_OUTPUT_TAR, "w:gz") as tar:
            tar.add(SAVE_TEST_OUTPUT_JSON)

    if config_missing_tests_output:
        print("The following config files are missing tests:")
        print("\t" + "\n\t".join(config_missing_tests_output))

    if config_with_errors_output:
        print("The following config files produced errors:")
        print("\t" + "\n\t".join(f"{c['filename']}: {c['error']}"
                                 for c in config_with_errors_output))

    # Place failed and TODO tests at the bottom for higher visibility
    passed_results_first = collections.OrderedDict(
        sorted(
            results_output.items(),
            key=lambda t: any(not c["passed"] or t[1]["todo"]
                              for c in t[1]["checks"].values()),
        ))

    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print("=" * 80)

    totals: Dict[str, Any] = collections.defaultdict(int)

    for filename, rr in passed_results_first.items():
        print(f"(TODO: {rr['todo']}) {filename}")
        for check_id, check_results in rr["checks"].items():
            print(generate_check_output_line(check_id, check_results))
            if not check_results["passed"]:
                print(generate_matches_line(check_results))
            for confusion in ["tp", "tn", "fp", "fn"]:
                totals[confusion] += check_results[confusion]

    print("=" * 80)
    print(f"final confusion matrix: {generate_confusion_string(totals)}")
    print("=" * 80)

    sys.exit(exit_code)
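
The pairing between rule configs and their test files above goes through
relatively_eq. Its implementation isn't shown here; judging from the call
site, it checks that the target's path relative to the target root matches
the config's path relative to the config root, extensions aside. A
hypothetical sketch under that assumption:

from pathlib import Path


def relatively_eq(parent1: Path, child1: Path, parent2: Path, child2: Path) -> bool:
    # Assumed pairing rule: two files correspond when their paths, taken
    # relative to their respective roots and with extensions stripped,
    # are identical (e.g. targets/foo.py <-> configs/foo.yaml).
    try:
        rel1 = child1.relative_to(parent1).with_suffix("")
        rel2 = child2.relative_to(parent2).with_suffix("")
    except ValueError:
        # a child does not live under its claimed root
        return False
    return rel1 == rel2
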
Example #3
def generate_file_pairs(location: Path, ignore_todo: bool, strict: bool,
                        unsafe: bool, json_output: bool) -> None:
    output = {}
    filenames = list(location.rglob("*"))
    config_filenames = [
        filename for filename in filenames
        if filename.suffix in YML_EXTENSIONS and not filename.name.startswith(
            ".") and not filename.parent.name.startswith(".")
    ]
    config_test_filenames = {
        config_filename: [
            inner_filename for inner_filename in filenames
            if inner_filename.with_suffix("") == config_filename.with_suffix(
                "") and inner_filename.is_file()
            and inner_filename.suffix not in YML_EXTENSIONS
        ]
        for config_filename in config_filenames
    }
    config_with_tests, config_without_tests = partition(
        lambda c: c[1], config_test_filenames.items())
    output["config_missing_tests"] = [str(c[0]) for c in config_without_tests]

    invoke_semgrep_fn = functools.partial(
        invoke_semgrep_multi,
        no_git_ignore=True,
        no_rewrite_rule_ids=True,
        strict=strict,
        dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
        testing=True,
    )
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.starmap(invoke_semgrep_fn, config_with_tests)

    config_with_errors, config_without_errors = partition(
        lambda r: r[1], results)
    output["config_with_errors"] = [{
        "filename": str(filename),
        "error": str(error),
        "output": output
    } for filename, error, output in config_with_errors]

    tested = {
        filename: score_output_json(output, config_test_filenames[filename],
                                    ignore_todo)
        for filename, _, output in config_without_errors
    }

    output["results"] = {
        str(filename): {
            "todo": todo,
            "checks": {
                check_id: {
                    "tp": tp,
                    "tn": tn,
                    "fp": fp,
                    "fn": fn,
                    "passed": (fp == 0) and (fn == 0),
                    "matches": matches[check_id],
                }
                for check_id, (tp, tn, fp, fn) in output.items()
            },
        }
        for filename, (output, matches, todo) in tested.items()
    }

    strict_error = bool(output["config_with_errors"]) and strict
    any_failures = any(not check_results["passed"]
                       for file_results in output["results"].values()
                       for check_results in file_results["checks"].values())
    exit_code = int(strict_error or any_failures)

    if json_output:
        print(json.dumps(output, indent=4, separators=(",", ": ")))
        sys.exit(exit_code)

    if output["config_missing_tests"]:
        print("The following config files are missing tests:")
        print("\t" + "\n\t".join(output["config_missing_tests"]))

    if output["config_with_errors"]:
        print("The following config files produced errors:")
        print("\t" + "\n\t".join(f"{c['filename']}: {c['error']}"
                                 for c in output["config_with_errors"]))

    # Place failed tests at the bottom for higher visibility
    passed_results_first = collections.OrderedDict(
        sorted(
            output["results"].items(),
            key=lambda t: any(not c["passed"]
                              for c in t[1]["checks"].values()),
        ))

    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print("=" * 80)

    totals = collections.defaultdict(int)

    for filename, results in passed_results_first.items():
        print(f"(TODO: {results['todo']}) {filename}")
        for check_id, check_results in results["checks"].items():
            print(generate_check_output_line(check_id, check_results))
            if not check_results["passed"]:
                print(generate_matches_line(check_results))
            for confusion in ["tp", "tn", "fp", "fn"]:
                totals[confusion] += check_results[confusion]

    print("=" * 80)
    print(f"final confusion matrix: {generate_confusion_string(totals)}")
    print("=" * 80)

    sys.exit(exit_code)
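
In this older version the pairing is inlined: a config matches any non-YAML
file whose path minus extension is identical. A tiny illustration with
hypothetical paths:

from pathlib import Path

config = Path("rules/eqeq.yaml")
target = Path("rules/eqeq.py")

# Both reduce to rules/eqeq once the suffix is dropped, so they pair up.
assert config.with_suffix("") == target.with_suffix("")
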
Example #4
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []

        for language, all_patterns_for_language in self._group_patterns_by_language(
                rule).items():

            targets = self.get_files_for_language(language, rule,
                                                  target_manager)
            targets = [
                target for target in targets if target not in max_timeout_files
            ]
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(0, rule.expression, rule.severity, language,
                                  rule._yaml.span)

                output_json = self._run_core_command(
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex,
                                               targets)

                # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
                # this is strictly a semgrep Python feature. Metavariable regex
                # filtering is performed purely in Python code then compared
                # against semgrep-core's results for other patterns.
                patterns = [
                    pattern for pattern in patterns if
                    pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
                ]

                patterns_json = [p.to_json() for p in patterns]

                output_json = self._run_core_command(
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                )

            errors.extend(
                CoreException.from_json(e, language,
                                        rule.id).into_semgrep_error()
                for e in output_json["errors"])
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
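
Every version of _run_rule ends with dedup_output before returning findings.
Its body isn't shown; a plausible sketch, assuming findings hash and compare
on their identifying fields (rule id, path, match range):

from typing import Hashable, List, TypeVar

H = TypeVar("H", bound=Hashable)


def dedup_output(outputs: List[H]) -> List[H]:
    # Drop duplicate findings while keeping first-seen order.
    seen = set()
    unique: List[H] = []
    for item in outputs:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
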
Example #5
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    is_sarif = output_handler.settings.output_format == OutputFormat.SARIF

    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)

    if join_rules:
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets])
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)

    profiler.save("total_time", start_time)

    output_handler.handle_semgrep_errors(semgrep_errors)

    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(
                attr.evolve(rule_match, is_ignored=ignored))
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches

    output_handler.handle_semgrep_errors(nosem_errors)

    num_findings_nosem = 0
    if not disable_nosem:
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        # SARIF output includes ignored findings, but labels them as suppressed.
        # https://docs.oasis-open.org/sarif/sarif/v2.1.0/csprd01/sarif-v2.1.0-csprd01.html#_Toc10541099
        if not is_sarif:
            rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatibility)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, list(all_targets),
                                       filtered_rules)

    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
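
The rule_match_nosem call above decides whether a finding is suppressed by a
nosem comment on the matched line and may return errors (e.g. for malformed
rule ids). A deliberately simplified, hypothetical version of just the
suppression check; the regex and names here are illustrative, not semgrep's
actual implementation:

import re

NOSEM_RE = re.compile(
    r"\bnosem\b(?::\s*(?P<ids>[\w.\-]+(?:\s*,\s*[\w.\-]+)*))?", re.IGNORECASE
)


def line_is_ignored(line: str, rule_id: str) -> bool:
    # Bare "nosem" suppresses any finding on the line; "nosem: id1, id2"
    # suppresses only the listed rule ids.
    match = NOSEM_RE.search(line)
    if match is None:
        return False
    ids = match.group("ids")
    if ids is None:
        return True
    return rule_id in {i.strip() for i in ids.split(",")}
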
Example #6
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
            try:
                targets = target_manager.get_files(language, rule.includes,
                                                   rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex

            if targets == []:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]

                try:
                    patterns_re = [(pattern["id"],
                                    re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")

                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                if core_run.returncode != 0:
                    # see if semgrep-core emitted a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                output_json = json.loads(
                    (core_run.stdout.decode("utf-8", "replace")))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
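
The regex path in this version partially applies get_re_matches to the
compiled patterns and maps it over targets in a process pool. The helper
itself isn't shown; a rough sketch of the shape it could take (illustrative
only; the real helper builds full PatternMatch payloads):

import re
from pathlib import Path
from typing import List, Tuple


def get_re_matches(patterns_re: List[Tuple[str, re.Pattern]], target: Path) -> List[dict]:
    # Scan one file against every compiled pattern and record each hit
    # with its pattern id and character offsets.
    try:
        contents = target.read_text(errors="replace")
    except OSError:
        return []
    return [
        {"check_id": pattern_id, "path": str(target),
         "start": m.start(), "end": m.end()}
        for pattern_id, pattern in patterns_re
        for m in pattern.finditer(contents)
    ]
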