def _flatten_rule_patterns(self, rules: List[Rule]) -> Iterator[Pattern]:
    """
    Convert list of rules to format understandable by semgrep core
    """
    for rule_index, rule in enumerate(rules):
        flat_expressions = list(
            enumerate_patterns_in_boolean_expression(rule.expression)
        )
        for expr in flat_expressions:
            if not should_send_to_semgrep_core(expr):
                continue

            span = (
                rule.pattern_spans.get(expr.pattern_id)
                if expr.pattern_id is not None
                else None
            )

            for lang in rule.languages:
                yield Pattern(rule_index, expr, rule.severity, lang, span)
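# A minimal sketch of the fan-out above, assuming a hypothetical rule whose
# boolean expression flattens to two leaf expressions (expr_a and expr_b are
# illustrative names, not real identifiers from this module):
#
#   rule.languages   == ["python", "go"]
#   flat_expressions == [expr_a, expr_b]
#
# Two leaves x two languages yields four Pattern objects, all sharing the
# same rule_index, so every match semgrep-core reports can be traced back
# to the rule that produced it.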
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
    profiler: ProfileManager,
    match_time_matrix: Dict[Tuple[str, str], float],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
    """
    Run all rules on targets and return list of all places that match patterns, ... todo errors
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():

        targets = self.get_files_for_language(language, rule, target_manager)
        targets = [target for target in targets if target not in max_timeout_files]
        all_targets = all_targets.union(targets)
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )

            output_json = profiler.track(
                rule.id,
                self._run_core_command,
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
                report_time=self._report_time,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX
                or p.expression.operator == OPERATORS.NOT_REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # regex-only rules only support OPERATORS.REGEX.
            # Skip passing this rule to semgrep-core.
            if language in REGEX_ONLY_LANGUAGE_KEYS:
                continue

            # semgrep-core doesn't know about the following operators -
            # they are strictly semgrep Python features:
            # - OPERATORS.METAVARIABLE_REGEX
            # - OPERATORS.METAVARIABLE_COMPARISON
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator
                not in [
                    OPERATORS.METAVARIABLE_REGEX,
                    OPERATORS.METAVARIABLE_COMPARISON,
                ]
            ]

            patterns_json = [p.to_json() for p in patterns]

            if language == GENERIC_LANGUAGE:
                output_json = profiler.track(
                    rule.id,
                    run_spacegrep,
                    rule.id,
                    patterns,
                    targets,
                    timeout=self._timeout,
                    report_time=self._report_time,
                )
            else:  # run semgrep-core
                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])
        if "time" in output_json:
            self._add_match_times(rule, match_time_matrix, output_json["time"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)
    logger.debug(f"...ran on {len(all_targets)} files")

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors, all_targets
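# A rough sketch of the per-language flow in _run_rule above, for a single
# non-taint, non-generic rule (step names are descriptive, not real helpers):
#
#   1. resolve targets for the language and drop files already known to
#      exceed the timeout cap (max_timeout_files)
#   2. split off REGEX / NOT_REGEX patterns and evaluate them in Python
#      via handle_regex_patterns
#   3. strip METAVARIABLE_REGEX / METAVARIABLE_COMPARISON patterns, which
#      are likewise Python-side features
#   4. send the remaining patterns to semgrep-core through
#      _run_core_command, wrapped in profiler.track to record timing
#   5. group matches by (rule, path), evaluate() the rule's boolean
#      expression once per file, then dedup_output() the findings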