def invoke_semgrep(ctx: click.Context) -> FindingSets:
    """Run semgrep over the working tree and return its findings.

    First scans the current files for issues; only if any are found, the
    baseline versions of the affected files are re-scanned so pre-existing
    issues can be diffed away.  When the ``INPUT_GENERATESARIF`` environment
    variable is set, one more scan emits a ``semgrep.sarif`` report.

    :param ctx: click context carrying run metadata (base commit ref, scan config)
    :return: FindingSets with ``current`` and ``baseline`` populated
    """
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=ctx.obj.meta.base_commit_ref,
        paths=[workdir],
        ignore_rules_file=get_semgrepignore(ctx.obj.sapp.scan),
    )

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths, get_semgrep_config(ctx) as config_args:
        click.echo("=== looking for current issues in " + unit_len(paths, "file"))
        # Chunk the file list to keep each command line under the OS limit.
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--json", *config_args]
            for path in chunk:
                # str() for consistency with get_findings(); paths yielded by
                # TargetFileManager are Path objects.
                args.extend(["--include", str(path)])
            findings.current.update(
                Finding.from_semgrep_result(result, ctx)
                for result in json.loads(str(semgrep(*args)))["results"])
        click.echo(f"| {unit_len(findings.current, 'current issue')} found")

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues"
        )
    else:
        with targets.baseline_paths() as paths, get_semgrep_config(ctx) as config_args:
            if paths:
                # Only baseline-scan files that currently have findings;
                # anything else cannot affect the diff.
                paths_with_findings = {
                    finding.path
                    for finding in findings.current
                }
                paths_to_check = set(str(path) for path in paths) & paths_with_findings
                click.echo("=== looking for pre-existing issues in " +
                           unit_len(paths_to_check, "file"))
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--json", *config_args]
                    for path in chunk:
                        args.extend(["--include", str(path)])
                    findings.baseline.update(
                        Finding.from_semgrep_result(result, ctx)
                        for result in json.loads(str(semgrep(*args)))["results"])
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found")

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report")
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file, get_semgrep_config(ctx) as config_args:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", str(path)])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
def get_findings(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    head_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    """Scan the working tree with semgrep, collecting current, ignored,
    and baseline (pre-existing) findings.

    :param config_specifier: value passed to semgrep's ``--config`` flag
    :param committed_datetime: commit timestamp attached to each Finding
    :param base_commit_ref: git ref to diff against, or None
    :param head_ref: head git ref, used to repair GitHub's checkout state
    :param semgrep_ignore: open semgrep-ignore file handed to the target manager
    :param uses_managed_policy: when True, pass ``--no-rewrite-rule-ids`` so
        managed-policy rule IDs are preserved verbatim
    :return: populated FindingSets (current / ignored / baseline)
    """
    debug_echo("=== adding semgrep configuration")
    # NOTE(review): the scan body runs inside fix_head_for_github, which
    # presumably restores the original HEAD on exit — confirm against its
    # definition.
    with fix_head_for_github(base_commit_ref, head_ref) as base_ref:
        workdir = Path.cwd()
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            paths=[workdir],
            ignore_rules_file=semgrep_ignore,
        )

        config_args = ["--config", config_specifier]
        rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

        debug_echo("=== seeing if there are any findings")
        findings = FindingSets()

        with targets.current_paths() as paths:
            click.echo("=== looking for current issues in " +
                       unit_len(paths, "file"),
                       err=True)
            args = [
                "--skip-unknown-extensions",
                # --disable-nosem makes semgrep report muted results too;
                # they are split out into findings.ignored below.
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            semgrep_results = invoke_semgrep(args, [str(p) for p in paths])["results"]
            # Partition results on the nosemgrep "is_ignored" flag.
            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored"))
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored"))
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found",
                err=True)
            click.echo(
                f"| {unit_len(findings.ignored, 'ignored issue')} found",
                err=True,
            )

        if not findings.current:
            click.echo(
                "=== not looking at pre-existing issues since there are no current issues",
                err=True,
            )
        else:
            with targets.baseline_paths() as paths:
                # Only baseline-scan files that have current findings.
                paths_with_findings = {
                    finding.path
                    for finding in findings.current
                }
                paths_to_check = list(
                    set(str(path) for path in paths) & paths_with_findings)
                if not paths_to_check:
                    click.echo(
                        "=== not looking at pre-existing issues since all files with current issues are newly created",
                        err=True,
                    )
                else:
                    click.echo(
                        "=== looking for pre-existing issues in " +
                        unit_len(paths_to_check, "file"),
                        err=True,
                    )
                    args = [
                        "--skip-unknown-extensions",
                        "--json",
                        *rewrite_args,
                        *config_args,
                    ]
                    semgrep_results = invoke_semgrep(args, paths_to_check)["results"]
                    findings.baseline.update_findings(
                        Finding.from_semgrep_result(result, committed_datetime)
                        for result in semgrep_results)
                    click.echo(
                        f"| {unit_len(findings.baseline, 'pre-existing issue')} found",
                        err=True,
                    )

        if os.getenv("INPUT_GENERATESARIF"):
            # FIXME: This will crash when running on thousands of files due to command length limit
            click.echo("=== re-running scan to generate a SARIF report",
                       err=True)
            sarif_path = Path("semgrep.sarif")
            with targets.current_paths() as paths, sarif_path.open(
                    "w") as sarif_file:
                args = ["--sarif", *rewrite_args, *config_args]
                for path in paths:
                    args.extend(["--include", str(path)])
                semgrep_exec(*args, _out=sarif_file)
            rewrite_sarif_file(sarif_path)

    return findings
def _get_head_findings(
        context: RunContext, extra_args: Sequence[str],
        targets: TargetFileManager) -> Tuple[FindingSets, RunStats]:
    """
    Gets findings for the project's HEAD git commit

    :param context: The Semgrep run context object
    :param extra_args: Extra arguments to pass to Semgrep
    :param targets: This run's target manager
    :return: A findings object with existing head findings and empty baseline findings
    """
    with targets.current_paths() as head_paths:
        click.echo(
            "=== looking for current issues in " + unit_len(head_paths, "file"),
            err=True)
        for target in head_paths:
            debug_echo(f"searching {str(target)}")

        semgrep_args = [
            "--skip-unknown-extensions",
            "--disable-nosem",
            "--json",
            "--autofix",
            "--dryrun",
            "--time",
            "--timeout-threshold",
            "3",
            *extra_args,
        ]
        exit_code, output = invoke_semgrep(
            semgrep_args,
            [str(target) for target in head_paths],
            timeout=context.timeout,
            explicit_semgrepignore_path=context.action_ignores_path,
        )

        findings = FindingSets(
            exit_code,
            searched_paths=set(targets.searched_paths),
            errors=output.errors,
        )
        stats = RunStats(
            rule_list=output.timing.rules,
            target_data=output.timing.targets,
        )

        # Split results on the nosemgrep flag: muted results go to `ignored`.
        findings.current.update_findings(
            Finding.from_semgrep_result(res, context.committed_datetime)
            for res in output.results if not res["extra"].get("is_ignored"))
        findings.ignored.update_findings(
            Finding.from_semgrep_result(res, context.committed_datetime)
            for res in output.results if res["extra"].get("is_ignored"))

        if findings.errors:
            click.echo(
                f"| Semgrep exited with {unit_len(findings.errors, 'error')}:",
                err=True,
            )
            for error in findings.errors:
                for rendered in render_error(error):
                    click.echo(f"| {rendered}", err=True)

        # Inventory (CAI) findings are excluded from the reported issue count.
        inventory_count = sum(
            1 for finding in findings.current if finding.is_cai_finding())
        click.echo(
            f"| {unit_len(range(len(findings.current) - inventory_count), 'current issue')} found",
            err=True,
        )
        if findings.ignored:
            click.echo(
                f"| {unit_len(findings.ignored, 'issue')} muted with nosemgrep comment (not counted as current)",
                err=True,
            )
    return findings, stats
def _update_baseline_findings(
    context: RunContext,
    findings: FindingSets,
    local_configs: Set[str],
    extra_args: Sequence[str],
    targets: TargetFileManager,
) -> None:
    """
    Updates findings.baseline with findings from the baseline git commit

    :param context: Semgrep run context
    :param findings: Findings structure from running on the head git commit
    :param local_configs: Any local semgrep.yml configs
    :param extra_args: Extra Semgrep arguments
    :param targets: File targets from head commit
    """
    if not findings.current and not findings.ignored:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            # Only baseline-scan files that have head findings (current or
            # nosemgrep-ignored); others cannot affect the diff.
            paths_with_findings = {
                finding.path
                for finding in findings.current.union(findings.ignored)
            }
            paths_to_check = list(
                set(str(path) for path in paths) & paths_with_findings)
            if not paths_to_check:
                click.echo(
                    "=== not looking at pre-existing issues since all files with current issues are newly created",
                    err=True,
                )
            else:
                config_args = []
                for conf in context.config_specifier:
                    # If a local config existed with initial scan but doesn't exist
                    # in baseline, treat as if no issues found in baseline with that config
                    if conf in local_configs and not Path(conf).exists():
                        click.echo(
                            f"=== {conf} file not found in baseline, skipping scanning for baseline",
                            err=True,
                        )
                        continue
                    config_args.extend(["--config", conf])

                if not config_args:
                    # Fixed typo in user-facing message: "pre-exiting" -> "pre-existing".
                    click.echo(
                        "=== not looking at pre-existing issues since after filtering out local files that don't exist in baseline, no configs left to run",
                        err=True,
                    )
                else:
                    click.echo(
                        "=== looking for pre-existing issues in " +
                        unit_len(paths_to_check, "file"),
                        err=True,
                    )
                    args = [
                        "--skip-unknown-extensions",
                        "--disable-nosem",
                        "--json",
                        *extra_args,
                        *config_args,
                    ]
                    # If possible, disable metrics so that we get metrics only once per semgrep-action run
                    # However, if run with config auto we must allow metrics to be sent
                    if "auto" not in config_args:
                        args.extend(["--metrics", "off"])
                    _, semgrep_output = invoke_semgrep(
                        args,
                        paths_to_check,
                        timeout=context.timeout,
                        baseline=True,
                        explicit_semgrepignore_path=context.action_ignores_path,
                    )
                    findings.baseline.update_findings(
                        Finding.from_semgrep_result(result, context.committed_datetime)
                        for result in semgrep_output.results)

                    # Inventory (CAI) findings are excluded from the count.
                    inventory_findings_len = sum(
                        1 for finding in findings.baseline
                        if finding.is_cai_finding())
                    baseline_findings_count = (len(findings.baseline) -
                                               inventory_findings_len)
                    click.echo(
                        f"| {unit_len(range(baseline_findings_count), 'current issue')} removed by diffing logic",
                        err=True,
                    )
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    """Run semgrep over the working tree and return current plus baseline findings.

    :param config_specifier: value passed to semgrep's ``--config`` flag
    :param committed_datetime: commit timestamp recorded with each finding
    :param base_commit_ref: git ref to diff against, or None
    :param semgrep_ignore: open semgrep-ignore file handed to the target manager
    :return: populated FindingSets
    """
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"),
                   err=True)
        # Chunk the file list to keep each command line under the OS limit;
        # the per-chunk count is echoed after each semgrep invocation.
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(
                f"| {count} {cardinalize('current issue', count)} found",
                err=True)

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                # Only baseline-scan files that currently have findings.
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path) for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions", "--json", *config_args
                    ]
                    for path in chunk:
                        args.append(path)
                    count = 0
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                    click.echo(
                        f"| {count} {cardinalize('pre-existing issue', count)} found",
                        err=True,
                    )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set