def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
    profiler: ProfileManager,
    match_time_matrix: Dict[Tuple[str, str], float],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
    """
    Run a single rule against all eligible targets and collect its findings.

    For each language the rule declares, targets are gathered (skipping any in
    ``max_timeout_files``), the appropriate backend is invoked (taint mode,
    regex-only handling in Python, spacegrep for the generic language, or
    semgrep-core otherwise), and per-pattern timings are recorded into
    ``match_time_matrix``.

    Returns a tuple of (deduplicated findings, debugging steps for the last
    evaluated file, accumulated SemgrepErrors, the set of all targets scanned).
    """
    outputs: List[PatternMatch] = []  # accumulated across multiple invocations, one per language
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Files that previously exceeded the timeout are excluded up front.
        targets = [target for target in targets if target not in max_timeout_files]
        # Note: targets are recorded as "scanned" even when the list ends up empty.
        all_targets = all_targets.union(targets)
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules are passed to semgrep-core as the raw rule JSON,
            # minus the "mode" key, via -tainting_rules_file.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )

            output_json = profiler.track(
                rule.id,
                self._run_core_command,
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
                report_time=self._report_time,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX
                or p.expression.operator == OPERATORS.NOT_REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # regex-only rules only support OPERATORS.REGEX.
            # Skip passing this rule to semgrep-core.
            if language in REGEX_ONLY_LANGUAGE_KEYS:
                continue

            # semgrep-core doesn't know about the following operators -
            # they are strictly semgrep Python features:
            #   - OPERATORS.METAVARIABLE_REGEX
            #   - OPERATORS.METAVARIABLE_COMPARISON
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator
                not in [
                    OPERATORS.METAVARIABLE_REGEX,
                    OPERATORS.METAVARIABLE_COMPARISON,
                ]
            ]

            patterns_json = [p.to_json() for p in patterns]

            if language == GENERIC_LANGUAGE:
                # The "generic" language is handled by spacegrep, not semgrep-core.
                output_json = profiler.track(
                    rule.id,
                    run_spacegrep,
                    rule.id,
                    patterns,
                    targets,
                    timeout=self._timeout,
                    report_time=self._report_time,
                )
            else:  # Run semgrep-core
                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])
        if "time" in output_json:
            # Timing info is only present when the backend was asked to report it.
            self._add_match_times(rule, match_time_matrix, output_json["time"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)
    logger.debug(f"...ran on {len(all_targets)} files")

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors, all_targets
def generate_file_pairs(
    target: Path,
    config: Path,
    ignore_todo: bool,
    strict: bool,
    unsafe: bool,
    json_output: bool,
    save_test_output_tar: bool = True,
) -> None:
    """
    Run semgrep rule tests: pair each YAML config under ``config`` with its
    test files under ``target``, run semgrep over each pair in parallel,
    score the results against inline test annotations, and report.

    Exits the process (sys.exit) with 0 on success, 1 when any check failed
    or (with ``strict``) when any config produced errors. With ``json_output``
    the report is printed as JSON instead of the human-readable summary.
    When ``save_test_output_tar`` is set, raw results are also written to a
    gzipped tar (for upload as a CI artifact).
    """
    configs = list(config.rglob("*"))
    targets = list(target.rglob("*"))

    # Candidate rule configs: YAML files, excluding hidden files/directories.
    config_filenames = [
        config_filename
        for config_filename in configs
        if config_filename.suffix in YML_EXTENSIONS
        and not config_filename.name.startswith(".")
        and not config_filename.parent.name.startswith(".")
    ]
    # Map each config to its test files: non-YAML files at the mirrored
    # relative location under `target`.
    config_test_filenames = {
        config_filename: [
            target_filename
            for target_filename in targets
            if relatively_eq(target, target_filename, config, config_filename)
            and target_filename.is_file()
            and target_filename.suffix not in YML_EXTENSIONS
        ]
        for config_filename in config_filenames
    }
    config_with_tests, config_without_tests = partition(
        lambda c: c[1], config_test_filenames.items()
    )
    config_missing_tests_output = [str(c[0]) for c in config_without_tests]

    invoke_semgrep_fn = functools.partial(
        invoke_semgrep_multi,
        no_git_ignore=True,
        no_rewrite_rule_ids=True,
        strict=strict,
        dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
    )
    # Run semgrep once per (config, tests) pair, in parallel across all cores.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.starmap(invoke_semgrep_fn, config_with_tests)

    # Each result tuple is (filename, error, output); a truthy error means failure.
    config_with_errors, config_without_errors = partition(lambda r: r[1], results)
    config_with_errors_output = [
        {"filename": str(filename), "error": str(error), "output": output}
        for filename, error, output in config_with_errors
    ]

    # Score each successful run against its test files' annotations.
    tested = {
        filename: score_output_json(
            output, config_test_filenames[filename], ignore_todo
        )
        for filename, _, output in config_without_errors
    }

    # Per-config report: confusion-matrix counts per check id; a check
    # "passed" iff it produced no false positives and no false negatives.
    results_output: Mapping[str, Mapping[str, Any]] = {
        str(filename): {
            "todo": todo,
            "checks": {
                check_id: {
                    "tp": tp,
                    "tn": tn,
                    "fp": fp,
                    "fn": fn,
                    "passed": (fp == 0) and (fn == 0),
                    "matches": matches[check_id],
                }
                for check_id, (tp, tn, fp, fn) in output.items()
            },
        }
        for filename, (output, matches, todo) in tested.items()
    }

    output = {
        "config_missing_tests": config_missing_tests_output,
        "config_with_errors": config_with_errors_output,
        "results": results_output,
    }

    strict_error = bool(config_with_errors_output) and strict
    any_failures = any(
        not check_results["passed"]
        for file_results in results_output.values()
        for check_results in file_results["checks"].values()
    )
    exit_code = int(strict_error or any_failures)

    if json_output:
        print(json.dumps(output, indent=4, separators=(",", ": ")))
        sys.exit(exit_code)

    # save the results to json file and tar the file to upload as github artifact.
    if save_test_output_tar:
        list_to_output = []
        with open(SAVE_TEST_OUTPUT_JSON, "w") as f:
            for tup in results:
                # tup is (filename, error, output); keep only the raw output.
                true_result = tup[2]
                list_to_output.append(true_result)
            f.write(json.dumps(list_to_output, indent=4, separators=(",", ":")))
        with tarfile.open(SAVE_TEST_OUTPUT_TAR, "w:gz") as tar:
            tar.add(SAVE_TEST_OUTPUT_JSON)

    if config_missing_tests_output:
        print("The following config files are missing tests:")
        print("\t" + "\n\t".join(config_missing_tests_output))

    if config_with_errors_output:
        print("The following config files produced errors:")
        print(
            "\t"
            + "\n\t".join(
                f"{c['filename']}: {c['error']}" for c in config_with_errors_output
            )
        )

    # Place failed and TODO tests at the bottom for higher visibility
    passed_results_first = collections.OrderedDict(
        sorted(
            results_output.items(),
            key=lambda t: any(
                not c["passed"] or t[1]["todo"] for c in t[1]["checks"].values()
            ),
        )
    )

    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print("=" * 80)

    totals: Dict[str, Any] = collections.defaultdict(int)

    for filename, rr in passed_results_first.items():
        # NOTE(review): prints the literal "(unknown)" rather than `filename`
        # — confirm whether the filename was meant to appear here.
        print(f"(TODO: {rr['todo']}) (unknown)")
        for check_id, check_results in rr["checks"].items():
            print(generate_check_output_line(check_id, check_results))
            if not check_results["passed"]:
                print(generate_matches_line(check_results))
            # Accumulate the overall confusion matrix across all checks.
            for confusion in ["tp", "tn", "fp", "fn"]:
                totals[confusion] += check_results[confusion]

    print("=" * 80)
    print(f"final confusion matrix: {generate_confusion_string(totals)}")
    print("=" * 80)

    sys.exit(exit_code)
def generate_file_pairs(
    location: Path, ignore_todo: bool, strict: bool, unsafe: bool, json_output: bool
) -> None:
    """
    Run semgrep rule tests rooted at ``location``: pair each YAML config with
    test files sharing the same stem (same path minus suffix), run semgrep
    over each pair in parallel, score against inline annotations, and report.

    Exits the process (sys.exit) with 0 on success, 1 when any check failed
    or (with ``strict``) when any config produced errors. With ``json_output``
    the full report dict is printed as JSON instead of the summary.
    """
    # Top-level report dict. NOTE: the name `output` is also reused as a
    # comprehension loop variable below; Python 3 comprehension scoping keeps
    # those bindings local, but the shadowing is easy to misread.
    output = {}

    filenames = list(location.rglob("*"))
    # Candidate rule configs: YAML files, excluding hidden files/directories.
    config_filenames = [
        filename
        for filename in filenames
        if filename.suffix in YML_EXTENSIONS
        and not filename.name.startswith(".")
        and not filename.parent.name.startswith(".")
    ]
    # A test file belongs to a config when it has the same path-minus-suffix.
    config_test_filenames = {
        config_filename: [
            inner_filename
            for inner_filename in filenames
            if inner_filename.with_suffix("") == config_filename.with_suffix("")
            and inner_filename.is_file()
            and inner_filename.suffix not in YML_EXTENSIONS
        ]
        for config_filename in config_filenames
    }
    config_with_tests, config_without_tests = partition(
        lambda c: c[1], config_test_filenames.items()
    )
    output["config_missing_tests"] = [str(c[0]) for c in config_without_tests]

    invoke_semgrep_fn = functools.partial(
        invoke_semgrep_multi,
        no_git_ignore=True,
        no_rewrite_rule_ids=True,
        strict=strict,
        dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
        testing=True,
    )
    # Run semgrep once per (config, tests) pair, in parallel across all cores.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.starmap(invoke_semgrep_fn, config_with_tests)

    # Each result tuple is (filename, error, output); a truthy error means failure.
    config_with_errors, config_without_errors = partition(lambda r: r[1], results)
    output["config_with_errors"] = [
        {"filename": str(filename), "error": str(error), "output": output}
        for filename, error, output in config_with_errors
    ]

    # Score each successful run against its test files' annotations.
    tested = {
        filename: score_output_json(
            output, config_test_filenames[filename], ignore_todo
        )
        for filename, _, output in config_without_errors
    }

    # Per-config report: confusion-matrix counts per check id; a check
    # "passed" iff it produced no false positives and no false negatives.
    output["results"] = {
        str(filename): {
            "todo": todo,
            "checks": {
                check_id: {
                    "tp": tp,
                    "tn": tn,
                    "fp": fp,
                    "fn": fn,
                    "passed": (fp == 0) and (fn == 0),
                    "matches": matches[check_id],
                }
                for check_id, (tp, tn, fp, fn) in output.items()
            },
        }
        for filename, (output, matches, todo) in tested.items()
    }

    strict_error = bool(output["config_with_errors"]) and strict
    any_failures = any(
        not check_results["passed"]
        for file_results in output["results"].values()
        for check_results in file_results["checks"].values()
    )
    exit_code = int(strict_error or any_failures)

    if json_output:
        print(json.dumps(output, indent=4, separators=(",", ": ")))
        sys.exit(exit_code)

    if output["config_missing_tests"]:
        print("The following config files are missing tests:")
        print("\t" + "\n\t".join(output["config_missing_tests"]))

    if output["config_with_errors"]:
        print("The following config files produced errors:")
        print(
            "\t"
            + "\n\t".join(
                f"{c['filename']}: {c['error']}"
                for c in output["config_with_errors"]
            )
        )

    # Place failed tests at the bottom for higher visibility
    passed_results_first = collections.OrderedDict(
        sorted(
            output["results"].items(),
            key=lambda t: any(not c["passed"] for c in t[1]["checks"].values()),
        )
    )

    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print("=" * 80)

    totals: Dict[str, int] = collections.defaultdict(int)

    # NOTE: `results` here shadows the pool.starmap results from above.
    for filename, results in passed_results_first.items():
        # NOTE(review): prints the literal "(unknown)" rather than `filename`
        # — confirm whether the filename was meant to appear here.
        print(f"(TODO: {results['todo']}) (unknown)")
        for check_id, check_results in results["checks"].items():
            print(generate_check_output_line(check_id, check_results))
            if not check_results["passed"]:
                print(generate_matches_line(check_results))
            # Accumulate the overall confusion matrix across all checks.
            for confusion in ["tp", "tn", "fp", "fn"]:
                totals[confusion] += check_results[confusion]

    print("=" * 80)
    print(f"final confusion matrix: {generate_confusion_string(totals)}")
    print("=" * 80)

    sys.exit(exit_code)
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
    """
    Run a single rule against all eligible targets and collect its findings.

    For each language the rule declares, targets are gathered (skipping any in
    ``max_timeout_files``), regex-only patterns are evaluated in pure Python,
    and the remaining patterns are handed to semgrep-core (taint rules go via
    -tainting_rules_file, everything else via -rules_file).

    Returns a tuple of (deduplicated findings, debugging steps for the last
    evaluated file, accumulated SemgrepErrors).
    """
    outputs: List[PatternMatch] = []  # accumulated across multiple invocations, one per language
    errors: List[SemgrepError] = []

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Files that previously exceeded the timeout are excluded up front.
        targets = [target for target in targets if target not in max_timeout_files]
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules are passed to semgrep-core as the raw rule JSON,
            # minus the "mode" key, via -tainting_rules_file.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )

            output_json = self._run_core_command(
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
            # this is strictly a semgrep Python feature. Metavariable regex
            # filtering is performed purely in Python code then compared
            # against semgrep-core's results for other patterns.
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
            ]

            patterns_json = [p.to_json() for p in patterns]

            output_json = self._run_core_command(
                patterns_json,
                patterns,
                targets,
                language,
                rule,
                "-rules_file",
                cache_dir,
            )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
    """
    Top-level semgrep run: load configs (or an ad-hoc pattern), filter rules,
    scan targets via CoreRunner, apply nosem filtering, record metrics, and
    hand the results to ``output_handler``. Optionally applies autofixes.

    Raises SemgrepError (MISSING_CONFIG_EXIT_CODE) when configs are invalid
    under --strict or when no usable configuration is found.
    """
    # Avoid mutable default arguments by normalizing None to fresh lists.
    if include is None:
        include = []
    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Optionally keep only rules whose severity was requested.
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [rule for rule in all_rules if rule.severity in severity]

    output_handler.handle_semgrep_errors(errors)

    is_sarif = output_handler.settings.output_format == OutputFormat.SARIF

    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    # Config-based run (no ad-hoc -e/--pattern): validate config presence.
    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (
            list(configs_obj.valid.keys())[0] if len(configs_obj.valid) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Join-mode rules are handled by a separate engine after the core run.
    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)

    if join_rules:
        # Imported lazily; only needed when join-mode rules are present.
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets]
            )
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)

    profiler.save("total_time", start_time)

    output_handler.handle_semgrep_errors(semgrep_errors)

    # Mark each match as ignored/not-ignored per nosem comments in the target.
    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(
                attr.evolve(rule_match, is_ignored=ignored)
            )
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches

    output_handler.handle_semgrep_errors(nosem_errors)

    num_findings_nosem = 0
    if not disable_nosem:
        # Drop nosem-ignored matches (while counting them for metrics).
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        # SARIF output includes ignored findings, but labels them as suppressed.
        # https://docs.oasis-open.org/sarif/sarif/v2.1.0/csprd01/sarif-v2.1.0-csprd01.html#_Toc10541099
        if not is_sarif:
            rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    if metric_manager.is_enabled:
        # Best-effort project URL discovery: try git, then fall back to
        # scraping .git/config; failures only log at debug level.
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(
            profiling_data, list(all_targets), filtered_rules
        )

    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def _run_rule(
    self, rule: Rule, target_manager: TargetManager, cache_dir: str
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
    """
    Run a single rule against all eligible targets and collect its findings.

    For each language the rule declares, regex patterns are evaluated in pure
    Python (in parallel), and the remaining patterns are written to temp files
    and handed to the semgrep-core binary via a subprocess.

    Returns a tuple of (deduplicated findings, debugging steps for the last
    evaluated file, accumulated CoreExceptions).

    Raises UnknownLanguageError for an unsupported language and SemgrepError
    for invalid regexes or undecodable semgrep-core output.
    """
    outputs: List[PatternMatch] = []  # accumulated across multiple invocations, one per language
    errors: List[CoreException] = []
    equivalences = rule.equivalences

    for language, all_patterns_for_language in self._group_patterns_by_language(
        [rule]
    ).items():
        try:
            targets = target_manager.get_files(
                language, rule.includes, rule.excludes
            )
        except _UnknownLanguageError as ex:
            # Re-raise with a span pointing at the rule's `languages` field.
            raise UnknownLanguageError(
                short_msg="invalid language",
                long_msg=f"unsupported language {language}",
                spans=[rule.languages_span.with_context(before=1, after=1)],
            ) from ex

        if targets == []:
            continue

        # semgrep-core doesn't know about OPERATORS.REGEX - this is
        # strictly a semgrep Python feature. Regex filtering is
        # performed purely in Python code then compared against
        # semgrep-core's results for other patterns.
        patterns_regex, patterns = partition(
            lambda p: p.expression.operator == OPERATORS.REGEX,
            all_patterns_for_language,
        )
        if patterns_regex:
            patterns_json = [pattern.to_json() for pattern in patterns_regex]

            try:
                patterns_re = [
                    (pattern["id"], re.compile(pattern["pattern"]))
                    for pattern in patterns_json
                ]
            except re.error as err:
                raise SemgrepError(f"invalid regular expression specified: {err}")

            # Evaluate the compiled regexes against each target in parallel.
            re_fn = functools.partial(get_re_matches, patterns_re)
            with multiprocessing.Pool(self._jobs) as pool:
                matches = pool.map(re_fn, targets)

            outputs.extend(
                single_match
                for file_matches in matches
                for single_match in file_matches
            )

        patterns_json = [p.to_json() for p in patterns]

        # Write patterns/targets/equivalences to temp files for semgrep-core;
        # the files must outlive the subprocess, hence one `with` for all three.
        with tempfile.NamedTemporaryFile(
            "w"
        ) as pattern_file, tempfile.NamedTemporaryFile(
            "w"
        ) as target_file, tempfile.NamedTemporaryFile(
            "w"
        ) as equiv_file:
            yaml = YAML()
            yaml.dump({"rules": patterns_json}, pattern_file)
            pattern_file.flush()
            target_file.write("\n".join(str(t) for t in targets))
            target_file.flush()

            cmd = [SEMGREP_PATH] + [
                "-lang",
                language,
                "-rules_file",
                pattern_file.name,
                "-j",
                str(self._jobs),
                "-target_file",
                target_file.name,
                "-use_parsing_cache",
                cache_dir,
            ]

            if equivalences:
                self._write_equivalences_file(equiv_file, equivalences)
                cmd += ["-equivalences", equiv_file.name]

            core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            debug_print(core_run.stderr.decode("utf-8", "replace"))

            if core_run.returncode != 0:
                # see if semgrep output a JSON error that we can decode
                semgrep_output = core_run.stdout.decode("utf-8", "replace")
                try:
                    output_json = json.loads(semgrep_output)
                except ValueError:
                    raise SemgrepError(
                        f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

                if "error" in output_json:
                    self._raise_semgrep_error_from_json(output_json, patterns)
                else:
                    raise SemgrepError(
                        f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

            output_json = json.loads((core_run.stdout.decode("utf-8", "replace")))

            errors.extend(
                CoreException.from_json(e, language)
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            debug_print(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors