def get_re_matches(
    patterns_re: Sequence[Tuple[Any, TPattern[Any]]], path: Path
) -> List[PatternMatch]:
    """Run each compiled regex over the text of *path*.

    Returns one PatternMatch per regex hit, carrying the pattern id plus
    start/end offsets translated to line/column positions. Files that
    cannot be decoded as text are treated as binary and skipped.
    """
    try:
        contents = path.read_text()
    except UnicodeDecodeError:
        logger.debug(f"regex matcher skipping binary file at {path}")
        return []

    results: List[PatternMatch] = []
    for pattern_id, compiled in patterns_re:
        for hit in re.finditer(compiled, contents):
            start_offset = hit.start()
            end_offset = hit.end()
            results.append(
                PatternMatch(
                    {
                        "check_id": pattern_id,
                        "path": str(path),
                        "start": {
                            "offset": start_offset,
                            "line": _offset_to_line_no(start_offset, contents),
                            "col": _offset_to_col_no(start_offset, contents),
                        },
                        "end": {
                            "offset": end_offset,
                            "line": _offset_to_line_no(end_offset, contents),
                            "col": _offset_to_col_no(end_offset, contents),
                        },
                        "extra": {},
                    }
                )
            )
    return results
def compare_where_python(where_expression: str, pattern_match: PatternMatch) -> bool:
    """Evaluate a rule's `where-python` expression against one pattern match.

    The expression's last line is treated as the value of the whole
    expression; it must evaluate to a bool, otherwise SemgrepError is
    raised. Metavariable values of the match are exposed to the expression
    through a `vars` dict placed in the exec scope.
    """
    result = False
    return_var = "semgrep_pattern_return"
    lines = where_expression.strip().split("\n")
    # Rewrite the final line into an assignment so exec() can hand the
    # expression's value back via the shared scope dict.
    to_eval = "\n".join(lines[:-1] + [f"{return_var} = {lines[-1]}"])

    local_vars = {
        metavar: pattern_match.get_metavariable_value(metavar)
        for metavar in pattern_match.metavariables
    }
    scope = {"vars": local_vars}

    try:
        # fmt: off
        exec( to_eval, scope )   # nosem: contrib.dlint.dlint-equivalent.insecure-exec-use, python.lang.security.audit.exec-detected.exec-detected
        # fmt: on
        result = scope[return_var]  # type: ignore
    except KeyError as ex:
        # Raised when the expression references a metavariable that this
        # match does not bind; treated as "no match" (result stays False).
        logger.error(
            f"could not find metavariable {ex} while evaluating where-python expression '{where_expression}', consider case where metavariable is missing"
        )
    except Exception as ex:
        # Any other failure in user-supplied code is logged and treated as
        # "no match" rather than aborting the scan.
        logger.error(
            f"received error '{repr(ex)}' while evaluating where-python expression '{where_expression}'"
        )

    if not isinstance(result, bool):
        raise SemgrepError(
            f"where-python expression '{where_expression}' needs boolean output but got {result}"
        )
    return result
def interpolate_message_metavariables(
    rule: Rule,
    pattern_match: PatternMatch,
    all_pattern_match_metavariables: Dict[str, List[PatternMatch]],
) -> str:
    """Return rule.message with each metavariable replaced by matched text.

    The value is taken from `pattern_match` itself when it binds the
    metavariable; otherwise it falls back to the first pattern match whose
    range encloses (or equals) this match's range.
    """
    msg_text = rule.message
    # Replace longer metavariable names first to avoid name collisions:
    # e.g. $X2 must be handled before $X, otherwise the substitution for
    # "$X" would corrupt occurrences of "$X2" in the message.
    for metavar_text in sorted(all_pattern_match_metavariables, key=len, reverse=True):
        replace_text = metavar_text
        try:
            # Always prefer the pattern match metavariable first.
            replace_text = pattern_match.get_metavariable_value(metavar_text)
        except KeyError:
            # If one isn't present, retrieve the value from an enclosing match.
            enclosing_matches = [
                candidate
                for candidate in all_pattern_match_metavariables[metavar_text]
                if candidate.range.is_range_enclosing_or_eq(pattern_match.range)
            ]
            if enclosing_matches:
                replace_text = enclosing_matches[0].get_metavariable_value(
                    metavar_text
                )
        msg_text = msg_text.replace(metavar_text, replace_text)
    return msg_text
def interpolate_message_metavariables(rule: Rule, pattern_match: PatternMatch) -> str:
    """Return rule.message with every metavariable replaced by its matched text."""
    msg_text = rule.message
    # Replace longer metavariable names first to avoid name collisions:
    # $X2 must be handled before $X or "$X2" would be partially rewritten.
    for metavar in sorted(pattern_match.metavars, key=len, reverse=True):
        msg_text = msg_text.replace(
            metavar, pattern_match.get_metavariable_value(metavar)
        )
    return msg_text
def interpolate_fix_metavariables(
    rule: Rule, pattern_match: PatternMatch
) -> Optional[str]:
    """Return rule.fix with metavariables substituted, or None if no fix."""
    fix_str = rule.fix
    if fix_str is None:
        return None
    # Replace longer metavariable names first to avoid name collisions:
    # $X2 must be handled before $X or "$X2" would be partially rewritten.
    for metavar in sorted(pattern_match.metavars, key=len, reverse=True):
        fix_str = fix_str.replace(
            metavar, pattern_match.get_metavariable_value(metavar)
        )
    return fix_str
def interpolate_message_metavariables(
    rule: Rule, pattern_match: PatternMatch, propagated_metavariables: Dict[str, str]
) -> str:
    """Return rule.message with match-bound and propagated metavariables substituted."""
    msg_text = rule.message
    # Replace longer metavariable names first to avoid name collisions:
    # $X2 must be handled before $X or "$X2" would be partially rewritten.
    for metavariable in sorted(pattern_match.metavariables, key=len, reverse=True):
        msg_text = msg_text.replace(
            metavariable, pattern_match.get_metavariable_value(metavariable)
        )
    for metavariable in sorted(propagated_metavariables, key=len, reverse=True):
        msg_text = msg_text.replace(
            metavariable, propagated_metavariables[metavariable]
        )
    return msg_text
def interpolate_fix_metavariables(
    rule: Rule, pattern_match: PatternMatch, propagated_metavariables: Dict[str, str]
) -> Optional[str]:
    """Return rule.fix with match-bound and propagated metavariables substituted.

    Returns None when the rule declares no autofix.
    """
    fix_str = rule.fix
    if fix_str is None:
        return None
    # Replace longer metavariable names first to avoid name collisions:
    # $X2 must be handled before $X or "$X2" would be partially rewritten.
    for metavariable in sorted(pattern_match.metavariables, key=len, reverse=True):
        fix_str = fix_str.replace(
            metavariable, pattern_match.get_metavariable_value(metavariable)
        )
    for metavariable in sorted(propagated_metavariables, key=len, reverse=True):
        fix_str = fix_str.replace(
            metavariable, propagated_metavariables[metavariable]
        )
    return fix_str
def interpolate_string_with_metavariables(
    text: str, pattern_match: PatternMatch, propagated_metavariables: Dict[str, str]
) -> str:
    """Interpolates a string with the metavariables contained in it, returning a new string"""
    result = text
    # Longer names are substituted first so that e.g. $X2 is never
    # clobbered by the replacement for $X.
    for name in sorted(pattern_match.metavariables, key=len, reverse=True):
        result = result.replace(name, pattern_match.get_metavariable_value(name))
    for name in sorted(propagated_metavariables, key=len, reverse=True):
        result = result.replace(name, propagated_metavariables[name])
    return result
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    Dict[Any, Any],
]:
    """Invoke the semgrep-core binary once per (rule, language) pair.

    Each rule is serialized to a temporary YAML file and the matching
    target paths to a temporary list file; semgrep-core's JSON output is
    converted into RuleMatch findings grouped by rule.

    Returns (findings by rule, empty debug dict, errors, all target paths,
    empty profiling dict). NOTE(review): `profiler` is accepted but unused
    in this variant.
    """
    from itertools import chain
    from collections import defaultdict

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        # Flatten rules x languages into one sequence of invocations.
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue
                # One target path per line, as semgrep-core expects.
                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                output_json = self._parse_core_output(
                    out_bytes, err_bytes, returncode
                )

                if returncode != 0:
                    # Prefer the structured error from core's JSON if present.
                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(output_json, [], rule)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

            # end with tempfile.NamedTemporaryFile(...)

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
    # end for rule, language

    return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
    profiler: ProfileManager,
    match_time_matrix: Dict[Tuple[str, str], float],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
    """
    Run one rule against its targets, per language, and return
    (findings, debugging steps, errors, all targets considered).

    Regex-style operators are evaluated in Python; everything else is
    delegated to semgrep-core (or spacegrep for the generic language).
    Files listed in max_timeout_files are skipped.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Skip files that already exceeded the timeout threshold.
        targets = [target for target in targets if target not in max_timeout_files]
        all_targets = all_targets.union(targets)
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules pass their raw definition (minus "mode") to core.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )
            output_json = profiler.track(
                rule.id,
                self._run_core_command,
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
                report_time=self._report_time,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX
                or p.expression.operator == OPERATORS.NOT_REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # regex-only rules only support OPERATORS.REGEX.
            # Skip passing this rule to semgrep-core.
            if language in REGEX_ONLY_LANGUAGE_KEYS:
                continue

            # semgrep-core doesn't know about the following operators -
            # they are strictly semgrep Python features:
            # - OPERATORS.METAVARIABLE_REGEX
            # - OPERATORS.METAVARIABLE_COMPARISON
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator
                not in [
                    OPERATORS.METAVARIABLE_REGEX,
                    OPERATORS.METAVARIABLE_COMPARISON,
                ]
            ]

            patterns_json = [p.to_json() for p in patterns]

            if language == GENERIC_LANGUAGE:
                # Generic-language rules are handled by spacegrep.
                output_json = profiler.track(
                    rule.id,
                    run_spacegrep,
                    rule.id,
                    patterns,
                    targets,
                    timeout=self._timeout,
                    report_time=self._report_time,
                )
            else:  # Run semgrep-core
                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])
        if "time" in output_json:
            self._add_match_times(rule, match_time_matrix, output_json["time"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )
            # evaluate() applies the rule's boolean logic to the raw matches.
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)
    logger.debug(f"...ran on {len(all_targets)} files")

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors, all_targets
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    """Invoke semgrep-core once per (rule, language) with the whole rule.

    Tracks per-file timeout counts: once a file times out
    `self._timeout_threshold` times it is excluded from later invocations.
    Returns (findings by rule, empty debug dict, errors, all targets,
    profiling data). NOTE(review): `profiler` is accepted but unused here;
    timing comes from core's "-json_time" output instead.
    """
    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    # Count of timeouts observed per file so far.
    file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
    # Files that exceeded the timeout threshold and are now skipped.
    max_timeout_files: Set[Path] = set()
    profiling_data: ProfilingData = ProfilingData()

    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule in progress_bar(
            rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"
        ):
            for language in rule.languages:
                debug_tqdm_write(f"Running rule {rule.id}...")
                # NOTE(review): equiv_file is opened but not written to in
                # the visible code — presumably kept for the "-equivalences"
                # flag; confirm before removing.
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile(
                    "w"
                ) as target_file, tempfile.NamedTemporaryFile(
                    "w"
                ) as equiv_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    targets = [
                        target
                        for target in targets
                        if target not in max_timeout_files
                    ]
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language.value,
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                        "-json_time",
                    ]

                    if self._optimizations != "none":
                        cmd.append("-fast")

                    # In debug mode let core's stderr pass straight through.
                    stderr: Optional[int] = subprocess.PIPE
                    if is_debug():
                        cmd += ["-debug"]
                        stderr = None

                    core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=stderr)
                    output_json = self._extract_core_output(rule, core_run)

                    if "time" in output_json:
                        self._add_match_times(
                            rule, profiling_data, output_json["time"]
                        )

                # end with tempfile.NamedTemporaryFile(...)

                pattern_matches = [
                    PatternMatch(match) for match in output_json["matches"]
                ]
                findings = create_output(rule, pattern_matches)

                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                parsed_errors = [
                    CoreException.from_json(
                        e, language.value, rule.id
                    ).into_semgrep_error()
                    for e in output_json["errors"]
                ]
                for err in parsed_errors:
                    if isinstance(err, MatchTimeoutError):
                        file_timeouts[err.path] += 1
                        if (
                            self._timeout_threshold != 0
                            and file_timeouts[err.path] >= self._timeout_threshold
                        ):
                            max_timeout_files.add(err.path)
                errors.extend(parsed_errors)
            # end for language
        # end for rule

    return outputs, {}, errors, all_targets, profiling_data
def run_join_rule(
    join_rule: Dict[str, Any],
    targets: List[Path],
) -> Tuple[List[RuleMatch], List[SemgrepError]]:
    """
    Run a 'join' mode rule.

    Join rules are comprised of multiple Semgrep rules and a set
    of conditions which must be satisfied in order to return a result.
    These conditions are typically some comparison of metavariable contents
    from different rules.

    'join_rule' is a join rule definition in dictionary form. The required keys are
    {'id', 'mode', 'severity', 'message', 'join'}.

    'join' is dictionary with the required keys {'refs', 'on'}.

    'refs' is dictionary with the required key {'rule'}. 'rule' is identical to
    a Semgrep config string -- the same thing used on the command line. e.g.,
    `semgrep -f p/javascript.lang.security.rule` or `semgrep -f path/to/rule.yaml`.

    'refs' has optional keys {'renames', 'as'}. 'renames' is a list of objects
    with properties {'from', 'to'}. 'renames' are used to rename metavariables
    of the associated 'rule'. 'as' lets you alias the collection of rule results
    for use in the conditions, similar to a SQL alias. By default, collection names
    will be the rule ID.

    'on' is a list of strings of the form <collection>.<property> <operator>
    <collection>.<property>. These are the conditions which must be satisifed for this
    rule to report results. All conditions must be satisfied.

    See semgrep/tests/e2e/rules/join_rules/user-input-with-unescaped-extension.yaml
    for an example.
    """
    join_contents = join_rule.get("join", {})
    semgrep_config_strings = [
        ref.get("rule") for ref in join_contents.get("refs", [])
    ]
    config_map = create_config_map(semgrep_config_strings)

    join_rule_refs: List[Ref] = [
        Ref(
            id=config_map[ref.get("rule")].id,
            renames={
                rename.get("from"): rename.get("to")
                for rename in ref.get("renames", [])
            },
            alias=ref.get("as"),
        )
        for ref in join_contents.get("refs", [])
    ]
    refs_lookup = {ref.id: ref for ref in join_rule_refs}
    alias_lookup = {ref.alias: ref.id for ref in join_rule_refs}

    try:
        conditions = [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ]
    except InvalidConditionError as e:
        return [], [e]

    # Run Semgrep on all the referenced rules at once.
    with tempfile.NamedTemporaryFile() as rule_path:
        yaml.dump({"rules": [rule.raw for rule in config_map.values()]}, rule_path)
        rule_path.flush()
        output = semgrep.semgrep_main.invoke_semgrep(
            config=Path(rule_path.name),
            targets=targets,
            no_rewrite_rule_ids=True,
            optimizations="all",
        )

    assert isinstance(output, dict)  # placate mypy
    results = output.get("results", [])
    errors = output.get("errors", [])

    parsed_errors = []
    for error_dict in errors:
        try:
            # This is a hack to reconstitute errors after they've been
            # JSONified as output. Subclasses of SemgrepError define the
            # 'level' and 'code' as class properties, which means they
            # aren't accepted as arguments when instantiated. 'type' is
            # also added when errors are JSONified, and is just a string of
            # the error class name. It's not used as an argument. All of
            # these properties will be properly populated because it's
            # using the class properties of the SemgrepError inferred by
            # 'type'.
            del error_dict["code"]
            del error_dict["level"]
            errortype = error_dict.get("type")
            del error_dict["type"]
            # FIX: index ERROR_MAP by the error's type name. The previous
            # code used ERROR_MAP[error_dict.get(errortype)], which always
            # yielded None once "type" was deleted, so every error fell
            # through to the KeyError warning below.
            parsed_errors.append(ERROR_MAP[errortype].from_dict(error_dict))
        except KeyError:
            logger.warning(
                f"Could not reconstitute Semgrep error: {error_dict}.\nSkipping processing of error"
            )
            continue

    # Small optimization: if there are no results for rules that
    # are used in a condition, there's no sense in continuing.
    collection_set_unaliased = {
        alias_lookup[collection]
        for collection in create_collection_set_from_conditions(conditions)
    }
    rule_ids = set(result.get("check_id") for result in results)
    if collection_set_unaliased - rule_ids:
        logger.debug(
            f"No results for {collection_set_unaliased - rule_ids} in join rule '{join_rule.get('id')}'."
        )
        return [], parsed_errors

    # Rename metavariables with user-defined renames.
    rename_metavars_in_place(results, refs_lookup)

    # Create a model map. This allows dynamically creating DB tables based
    # on Semgrep's results. There is one table for each rule ID.
    model_map = create_model_map(results)
    db.connect()
    try:
        db.create_tables(model_map.values())

        # Populate the model tables with real data from the Semgrep results.
        load_results_into_db(results, model_map)

        # Apply the conditions and only keep combinations
        # of findings that satisfy the conditions.
        matches = []
        # Reuse the conditions parsed (and validated) above instead of
        # re-parsing the condition strings a second time.
        matched_on_conditions = match_on_conditions(
            model_map,
            alias_lookup,
            conditions,
        )
        if matched_on_conditions:  # This is ugly, but makes mypy happy
            for match in matched_on_conditions:
                matches.append(
                    json.loads(match.raw.decode("utf-8", errors="replace"))
                )

        rule_matches = [
            RuleMatch(
                id=join_rule.get("id", match.get("check_id", "[empty]")),
                pattern_match=PatternMatch({}),
                message=join_rule.get(
                    "message", match.get("extra", {}).get("message", "[empty]")
                ),
                metadata=join_rule.get(
                    "metadata", match.get("extra", {}).get("metadata", {})
                ),
                severity=join_rule.get("severity", match.get("severity", "INFO")),
                path=Path(match.get("path", "[empty]")),
                start=match.get("start", {}),
                end=match.get("end", {}),
                extra=match.get("extra", {}),
                fix=None,
                fix_regex=None,
                lines_cache={},
            )
            for match in matches
        ]
    finally:
        # Always release the in-memory DB, even if condition matching fails.
        db.close()

    return rule_matches, parsed_errors
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    """Invoke semgrep-core once per (rule, language) with the whole rule.

    Serializes the rule to a temp YAML file and the target list to a temp
    file, runs the binary, and converts its JSON output into RuleMatch
    findings. Returns (findings by rule, empty debug dict, errors, all
    targets, profiling data). NOTE(review): `profiler` is accepted but
    unused; timing comes from core's "-json_time" output.
    """
    from itertools import chain
    from collections import defaultdict

    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    profiling_data: ProfilingData = ProfilingData()

    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        # Flatten rules x languages into one sequence of invocations.
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue
                all_targets = all_targets.union(targets)

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                if self._report_time:
                    cmd += ["-json_time"]

                if self._output_settings.debug:
                    cmd += ["-debug"]

                core_run = sub_run(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
                output_json = self._extract_core_output(rule, [], core_run)

                if "time" in output_json:
                    self._add_match_times(rule, profiling_data, output_json["time"])

            # end with tempfile.NamedTemporaryFile(...)

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
    # end for rule, language

    return outputs, {}, errors, all_targets, profiling_data
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
    """
    Run one rule against its targets, per language, and return
    (findings, debugging steps, errors). Regex operators are evaluated in
    Python; other patterns are delegated to semgrep-core. Files listed in
    max_timeout_files are skipped.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Skip files that previously exceeded the timeout threshold.
        targets = [target for target in targets if target not in max_timeout_files]
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules pass their raw definition (minus "mode") to core.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )
            output_json = self._run_core_command(
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
            # this is strictly a semgrep Python feature. Metavariable regex
            # filtering is performed purely in Python code then compared
            # against semgrep-core's results for other patterns.
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
            ]

            patterns_json = [p.to_json() for p in patterns]

            output_json = self._run_core_command(
                patterns_json,
                patterns,
                targets,
                language,
                rule,
                "-rules_file",
                cache_dir,
            )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")
            # evaluate() applies the rule's boolean logic to the raw matches.
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors
def _run_rule(
    self, rule: Rule, target_manager: TargetManager, cache_dir: str
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
    """
    Run one rule against its targets, per language, and return
    (findings, debugging steps, errors). Regex patterns are matched in
    Python across a multiprocessing pool; all other patterns are written
    to temp files and handed to the semgrep-core binary.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[CoreException] = []
    equivalences = rule.equivalences

    for language, all_patterns_for_language in self._group_patterns_by_language(
        [rule]
    ).items():
        try:
            targets = target_manager.get_files(language, rule.includes, rule.excludes)
        except _UnknownLanguageError as ex:
            # Surface an unknown language as a user-facing error with the
            # span of the rule's `languages` field for context.
            raise UnknownLanguageError(
                short_msg="invalid language",
                long_msg=f"unsupported language {language}",
                spans=[rule.languages_span.with_context(before=1, after=1)],
            ) from ex

        if targets == []:
            continue

        # semgrep-core doesn't know about OPERATORS.REGEX - this is
        # strictly a semgrep Python feature. Regex filtering is
        # performed purely in Python code then compared against
        # semgrep-core's results for other patterns.
        patterns_regex, patterns = partition(
            lambda p: p.expression.operator == OPERATORS.REGEX,
            all_patterns_for_language,
        )
        if patterns_regex:
            patterns_json = [pattern.to_json() for pattern in patterns_regex]

            try:
                patterns_re = [
                    (pattern["id"], re.compile(pattern["pattern"]))
                    for pattern in patterns_json
                ]
            except re.error as err:
                raise SemgrepError(f"invalid regular expression specified: {err}")

            # Fan the regex matching out over the worker pool, one task per
            # target file.
            re_fn = functools.partial(get_re_matches, patterns_re)
            with multiprocessing.Pool(self._jobs) as pool:
                matches = pool.map(re_fn, targets)

            outputs.extend(
                single_match
                for file_matches in matches
                for single_match in file_matches
            )

        patterns_json = [p.to_json() for p in patterns]
        with tempfile.NamedTemporaryFile(
            "w"
        ) as pattern_file, tempfile.NamedTemporaryFile(
            "w"
        ) as target_file, tempfile.NamedTemporaryFile(
            "w"
        ) as equiv_file:
            yaml = YAML()
            yaml.dump({"rules": patterns_json}, pattern_file)
            pattern_file.flush()
            target_file.write("\n".join(str(t) for t in targets))
            target_file.flush()

            cmd = [SEMGREP_PATH] + [
                "-lang",
                language,
                "-rules_file",
                pattern_file.name,
                "-j",
                str(self._jobs),
                "-target_file",
                target_file.name,
                "-use_parsing_cache",
                cache_dir,
            ]

            if equivalences:
                self._write_equivalences_file(equiv_file, equivalences)
                cmd += ["-equivalences", equiv_file.name]

            core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            debug_print(core_run.stderr.decode("utf-8", "replace"))

            if core_run.returncode != 0:
                # see if semgrep output a JSON error that we can decode
                semgrep_output = core_run.stdout.decode("utf-8", "replace")
                try:
                    output_json = json.loads(semgrep_output)
                except ValueError:
                    raise SemgrepError(
                        f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

                if "error" in output_json:
                    self._raise_semgrep_error_from_json(output_json, patterns)
                else:
                    raise SemgrepError(
                        f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

            # NOTE(review): stdout is decoded and parsed again here even
            # though the error branch above may already have parsed it.
            output_json = json.loads((core_run.stdout.decode("utf-8", "replace")))
            errors.extend(
                CoreException.from_json(e, language) for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            debug_print(f"----- rule ({rule.id}) ----- filepath: {filepath}")
            # evaluate() applies the rule's boolean logic to the raw matches.
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors