def invoke_semgrep(ctx: click.Context) -> FindingSets:
    """Run semgrep over the work tree and collect current/baseline findings.

    Scans the current files in chunks, and — only when current issues were
    found — scans the baseline versions of the affected files so that
    pre-existing issues can be distinguished. When the INPUT_GENERATESARIF
    environment variable is set, semgrep is re-run to write semgrep.sarif.

    :param ctx: click context carrying run metadata (base commit, scan config)
    :return: FindingSets with current and baseline findings populated
    """
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=ctx.obj.meta.base_commit_ref,
        paths=[workdir],
        ignore_rules_file=get_semgrepignore(ctx.obj.sapp.scan),
    )

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths, get_semgrep_config(
            ctx) as config_args:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"))
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--json", *config_args]
            for path in chunk:
                args.extend(["--include", path])
            findings.current.update(
                Finding.from_semgrep_result(result, ctx)
                for result in json.loads(str(semgrep(*args)))["results"])
        # Report once after all chunks: findings.current accumulates across
        # chunks, so echoing inside the loop printed misleading running
        # totals (and disagreed with the baseline branch below).
        click.echo(
            f"| {unit_len(findings.current, 'current issue')} found")

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues"
        )
    else:
        with targets.baseline_paths() as paths, get_semgrep_config(
                ctx) as config_args:
            if paths:
                # Only baseline-scan files that produced current findings.
                paths_with_findings = {
                    finding.path
                    for finding in findings.current
                }
                paths_to_check = set(str(path)
                                     for path in paths) & paths_with_findings
                click.echo("=== looking for pre-existing issues in " +
                           unit_len(paths_to_check, "file"))
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--json", *config_args]
                    for path in chunk:
                        args.extend(["--include", path])
                    findings.baseline.update(
                        Finding.from_semgrep_result(result, ctx)
                        for result in json.loads(str(semgrep(
                            *args)))["results"])
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found"
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report")
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file, get_semgrep_config(ctx) as config_args:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
Example #2
0
def get_findings(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    head_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    """Scan the work tree with semgrep and partition results into finding sets.

    Runs semgrep at most twice: once over the current (HEAD) files, and —
    only when current issues exist — once over the baseline versions of the
    files that produced findings, so pre-existing issues can be identified.
    When the INPUT_GENERATESARIF environment variable is set, semgrep is
    re-run to write a SARIF report to semgrep.sarif.

    :param config_specifier: Value passed to semgrep's --config flag
    :param committed_datetime: Commit timestamp attached to each Finding
    :param base_commit_ref: Git ref of the baseline commit, if any
    :param head_ref: Git ref of the head commit, if any
    :param semgrep_ignore: Open file with semgrepignore rules
    :param uses_managed_policy: When True, pass --no-rewrite-rule-ids
    :return: FindingSets holding current, ignored, and baseline findings
    """
    debug_echo("=== adding semgrep configuration")

    with fix_head_for_github(base_commit_ref, head_ref) as base_ref:
        workdir = Path.cwd()
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            paths=[workdir],
            ignore_rules_file=semgrep_ignore,
        )

        config_args = ["--config", config_specifier]
        rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

        debug_echo("=== seeing if there are any findings")
        findings = FindingSets()

        with targets.current_paths() as paths:
            click.echo("=== looking for current issues in " +
                       unit_len(paths, "file"),
                       err=True)

            args = [
                "--skip-unknown-extensions",
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            semgrep_results = invoke_semgrep(args, [str(p)
                                                    for p in paths])["results"]

            # Results muted with a nosemgrep comment (is_ignored) are tracked
            # separately from actionable current findings.
            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored"))
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored"))
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found",
                err=True)
            click.echo(
                f"| {unit_len(findings.ignored, 'ignored issue')} found",
                err=True,
            )

    # NOTE(review): the baseline scan below runs after the fix_head_for_github
    # context has exited — confirm targets.baseline_paths() does not depend on
    # the head fix still being in effect.
    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            # Only baseline-scan files that produced current findings.
            paths_with_findings = {
                finding.path
                for finding in findings.current
            }
            paths_to_check = list(
                set(str(path) for path in paths) & paths_with_findings)
            if not paths_to_check:
                click.echo(
                    "=== not looking at pre-existing issues since all files with current issues are newly created",
                    err=True,
                )
            else:
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )

                args = [
                    "--skip-unknown-extensions",
                    "--json",
                    *rewrite_args,
                    *config_args,
                ]
                semgrep_results = invoke_semgrep(args,
                                                 paths_to_check)["results"]
                findings.baseline.update_findings(
                    Finding.from_semgrep_result(result, committed_datetime)
                    for result in semgrep_results)
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *rewrite_args, *config_args]
            for path in paths:
                args.extend(["--include", str(path)])
            semgrep_exec(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
def _get_head_findings(
        context: RunContext, extra_args: Sequence[str],
        targets: TargetFileManager) -> Tuple[FindingSets, RunStats]:
    """
    Collects Semgrep findings for the project's HEAD git commit

    :param context: The Semgrep run context object
    :param extra_args: Extra arguments to pass to Semgrep
    :param targets: This run's target manager
    :return: A findings object with existing head findings and empty baseline findings
    """
    with targets.current_paths() as head_paths:
        click.echo("=== looking for current issues in " +
                   unit_len(head_paths, "file"),
                   err=True)

        for target in head_paths:
            debug_echo(f"searching {str(target)}")

        semgrep_args = [
            "--skip-unknown-extensions",
            "--disable-nosem",
            "--json",
            "--autofix",
            "--dryrun",
            "--time",
            "--timeout-threshold",
            "3",
            *extra_args,
        ]
        exit_code, output = invoke_semgrep(
            semgrep_args,
            [str(target) for target in head_paths],
            timeout=context.timeout,
            explicit_semgrepignore_path=context.action_ignores_path,
        )

        findings = FindingSets(
            exit_code,
            searched_paths=set(targets.searched_paths),
            errors=output.errors,
        )
        stats = RunStats(
            rule_list=output.timing.rules,
            target_data=output.timing.targets,
        )

        # Split the raw results on the is_ignored flag: muted results are
        # tracked separately and never counted as current issues.
        kept = [r for r in output.results if not r["extra"].get("is_ignored")]
        muted = [r for r in output.results if r["extra"].get("is_ignored")]
        findings.current.update_findings(
            Finding.from_semgrep_result(r, context.committed_datetime)
            for r in kept)
        findings.ignored.update_findings(
            Finding.from_semgrep_result(r, context.committed_datetime)
            for r in muted)

        if findings.errors:
            click.echo(
                f"| Semgrep exited with {unit_len(findings.errors, 'error')}:",
                err=True,
            )
            for error in findings.errors:
                for line in render_error(error):
                    click.echo(f"|    {line}", err=True)

        # Findings flagged by is_cai_finding() are excluded from the count.
        cai_total = sum(1 for f in findings.current if f.is_cai_finding())
        click.echo(
            f"| {unit_len(range(len(findings.current) - cai_total), 'current issue')} found",
            err=True,
        )
        if len(findings.ignored) > 0:
            click.echo(
                f"| {unit_len(findings.ignored, 'issue')} muted with nosemgrep comment (not counted as current)",
                err=True,
            )
    return findings, stats
def _update_baseline_findings(
    context: RunContext,
    findings: FindingSets,
    local_configs: Set[str],
    extra_args: Sequence[str],
    targets: TargetFileManager,
) -> None:
    """
    Updates findings.baseline with findings from the baseline git commit

    :param context: Semgrep run context
    :param findings: Findings structure from running on the head git commit
    :param local_configs: Any local semgrep.yml configs
    :param extra_args: Extra Semgrep arguments
    :param targets: File targets from head commit
    """
    if not findings.current and not findings.ignored:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
        return

    with targets.baseline_paths() as baseline_paths:
        # Only baseline-scan files that produced (or muted) head findings.
        flagged = {
            f.path
            for f in findings.current.union(findings.ignored)
        }
        paths_to_check = list({str(p) for p in baseline_paths} & flagged)
        if not paths_to_check:
            click.echo(
                "=== not looking at pre-existing issues since all files with current issues are newly created",
                err=True,
            )
            return

        config_args = []
        for conf in context.config_specifier:
            # A local config that existed for the initial scan but is absent
            # in the baseline yields no baseline issues for that config, so
            # it is skipped rather than scanned.
            if conf in local_configs and not Path(conf).exists():
                click.echo(
                    f"=== {conf} file not found in baseline, skipping scanning for baseline",
                    err=True,
                )
                continue
            config_args.extend(["--config", conf])

        if not config_args:
            click.echo(
                "=== not looking at pre-exiting issues since after filtering out local files that don't exist in baseline, no configs left to run",
                err=True,
            )
            return

        click.echo(
            "=== looking for pre-existing issues in " +
            unit_len(paths_to_check, "file"),
            err=True,
        )

        semgrep_args = [
            "--skip-unknown-extensions",
            "--disable-nosem",
            "--json",
            *extra_args,
            *config_args,
        ]

        # Metrics are meant to be sent only once per semgrep-action run, so
        # disable them here — except under config auto, which requires them.
        if "auto" not in config_args:
            semgrep_args.extend(["--metrics", "off"])

        _, semgrep_output = invoke_semgrep(
            semgrep_args,
            paths_to_check,
            timeout=context.timeout,
            baseline=True,
            explicit_semgrepignore_path=context.action_ignores_path,
        )
        findings.baseline.update_findings(
            Finding.from_semgrep_result(result, context.committed_datetime)
            for result in semgrep_output.results)
        # Findings flagged by is_cai_finding() are excluded from the count.
        cai_total = sum(
            1 for finding in findings.baseline if finding.is_cai_finding())
        click.echo(
            f"| {unit_len(range(len(findings.baseline) - cai_total), 'current issue')} removed by diffing logic",
            err=True,
        )
Example #5
0
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    """Run semgrep over the work tree and collect current/baseline findings.

    Scans current files in chunks, then — only when current issues exist —
    scans the baseline versions of the files that produced findings. When
    the INPUT_GENERATESARIF environment variable is set, semgrep is re-run
    to write a SARIF report to semgrep.sarif.

    :param config_specifier: Value for semgrep's --config flag
    :param committed_datetime: Commit timestamp attached to each finding
    :param base_commit_ref: Git ref of the baseline commit, if any
    :param semgrep_ignore: Open file with semgrepignore rules
    :return: The populated FindingSets
    """
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"),
                   err=True)
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(
                f"| {count} {cardinalize('current issue', count)} found",
                err=True)

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                # Only baseline-scan files that produced current findings.
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path)
                                     for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )
                # BUG FIX: `count` was previously initialized inside the
                # chunk loop but echoed after it — a NameError when
                # paths_to_check was empty, and only the last chunk's count
                # when there were multiple chunks. Accumulate across chunks.
                count = 0
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions", "--json", *config_args
                    ]
                    for path in chunk:
                        args.append(path)
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                click.echo(
                    f"| {count} {cardinalize('pre-existing issue', count)} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set