Exemple #1
0
    def _flatten_rule_patterns(self, rules: List[Rule]) -> Iterator[Pattern]:
        """
        Flatten rules into the individual Pattern objects sent to semgrep core.

        For every rule, its boolean expression is expanded with
        enumerate_patterns_in_boolean_expression; each expression accepted by
        should_send_to_semgrep_core yields one Pattern per rule language,
        tagged with the rule's position in `rules`.
        """
        for position, current in enumerate(rules):
            # Materialize the flattened expressions before yielding anything
            # for this rule.
            candidates = list(
                enumerate_patterns_in_boolean_expression(current.expression)
            )
            for candidate in candidates:
                if should_send_to_semgrep_core(candidate):
                    # A span can only be looked up for expressions that carry
                    # a pattern id; the lookup itself may still return None.
                    if candidate.pattern_id is None:
                        span = None
                    else:
                        span = current.pattern_spans.get(candidate.pattern_id)

                    yield from (
                        Pattern(position, candidate, current.severity, lang, span)
                        for lang in current.languages
                    )
Exemple #2
0
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
        profiler: ProfileManager,
        match_time_matrix: Dict[Tuple[str, str], float],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
        """
        Run a single rule against its targets, language by language.

        For each language of the rule, the target files are resolved via
        `get_files_for_language`, files listed in `max_timeout_files` are
        dropped, and languages left without targets are skipped. The remaining
        work is dispatched through `profiler.track`:

        - taint-mode rules go to `_run_core_command` with the raw rule
          (minus its "mode" key) and the "-tainting_rules_file" flag;
        - generic-language rules go to `run_spacegrep`;
        - everything else goes to `_run_core_command` with "-rules_file".

        Regex / not-regex patterns are handed to `handle_regex_patterns`
        instead of the external engine, and languages in
        `REGEX_ONLY_LANGUAGE_KEYS` never reach the engine at all.

        Args:
            rule: the rule to run.
            target_manager: forwarded to `get_files_for_language`.
            cache_dir: forwarded to `_run_core_command`.
            max_timeout_files: files to exclude from this run. Only read here.
            profiler: every engine invocation is wrapped in `profiler.track`.
            match_time_matrix: forwarded to `_add_match_times` when the
                engine output contains a "time" entry.

        Returns:
            A tuple `(findings, debugging_steps, errors, all_targets)`:
            de-duplicated findings, the debugging steps of the last evaluated
            file, the errors reported by the engine, and every target file
            considered for this rule (after the timeout exclusion).
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()

        # Build the exclusion set once: membership tests against the original
        # list would cost O(len(max_timeout_files)) per target, per language.
        excluded_files = set(max_timeout_files)

        for language, all_patterns_for_language in self._group_patterns_by_language(
            rule
        ).items():

            targets = [
                target
                for target in self.get_files_for_language(
                    language, rule, target_manager
                )
                if target not in excluded_files
            ]
            all_targets.update(targets)
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                # The engine receives the raw rule without its "mode" key;
                # work on a shallow copy so the rule itself is left untouched.
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX
                    or p.expression.operator == OPERATORS.NOT_REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    # NOTE(review): `outputs` is passed in, presumably so the
                    # regex matches are appended to it - confirm in the helper.
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # regex-only rules only support OPERATORS.REGEX.
                # Skip passing this rule to semgrep-core.
                if language in REGEX_ONLY_LANGUAGE_KEYS:
                    continue

                # semgrep-core doesn't know about the following operators -
                # they are strictly semgrep Python features:
                #   - OPERATORS.METAVARIABLE_REGEX
                #   - OPERATORS.METAVARIABLE_COMPARISON
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator
                    not in (
                        OPERATORS.METAVARIABLE_REGEX,
                        OPERATORS.METAVARIABLE_COMPARISON,
                    )
                ]

                if language == GENERIC_LANGUAGE:
                    output_json = profiler.track(
                        rule.id,
                        run_spacegrep,
                        rule.id,
                        patterns,
                        targets,
                        timeout=self._timeout,
                        report_time=self._report_time,
                    )
                else:  # Run semgrep-core
                    patterns_json = [p.to_json() for p in patterns]
                    output_json = profiler.track(
                        rule.id,
                        self._run_core_command,
                        patterns_json,
                        patterns,
                        targets,
                        language,
                        rule,
                        "-rules_file",
                        cache_dir,
                        report_time=self._report_time,
                    )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
            if "time" in output_json:
                self._add_match_times(rule, match_time_matrix, output_json["time"])

        # Group output by file path: every match belongs to the single `rule`
        # being run, so the file path is the only grouping key needed.
        matches_by_path: Dict[Path, List[PatternMatch]] = collections.defaultdict(
            list
        )
        for pattern_match in outputs:
            matches_by_path[pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for filepath, pattern_matches in matches_by_path.items():
            logger.debug(f"--> rule ({rule.id}) has findings on filepath: {filepath}")

            # debugging steps are only tracked for a single file, just overwrite
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

        findings = dedup_output(findings)
        logger.debug(f"...ran on {len(all_targets)} files")

        return findings, debugging_steps, errors, all_targets
Exemple #3
0
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
        """
        Run a single rule against its targets, language by language.

        For each language of the rule, the target files are resolved via
        `get_files_for_language`, files listed in `max_timeout_files` are
        dropped, and languages left without targets are skipped. Taint-mode
        rules are sent to `_run_core_command` as the raw rule (minus its
        "mode" key) with the "-tainting_rules_file" flag; all other rules are
        sent with "-rules_file" after regex patterns have been handed to
        `handle_regex_patterns` and metavariable-regex patterns removed.

        Args:
            rule: the rule to run.
            target_manager: forwarded to `get_files_for_language`.
            cache_dir: forwarded to `_run_core_command`.
            max_timeout_files: files to exclude from this run. Only read here.

        Returns:
            A tuple `(findings, debugging_steps, errors)`: de-duplicated
            findings, the debugging steps of the last evaluated file, and the
            errors reported by the engine.
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []

        # Build the exclusion set once: membership tests against the original
        # list would cost O(len(max_timeout_files)) per target, per language.
        excluded_files = set(max_timeout_files)

        for language, all_patterns_for_language in self._group_patterns_by_language(
            rule
        ).items():

            targets = [
                target
                for target in self.get_files_for_language(
                    language, rule, target_manager
                )
                if target not in excluded_files
            ]
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                # The engine receives the raw rule without its "mode" key;
                # work on a shallow copy so the rule itself is left untouched.
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = self._run_core_command(
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    # NOTE(review): `outputs` is passed in, presumably so the
                    # regex matches are appended to it - confirm in the helper.
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
                # this is strictly a semgrep Python feature. Metavariable regex
                # filtering is performed purely in Python code then compared
                # against semgrep-core's results for other patterns.
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
                ]

                patterns_json = [p.to_json() for p in patterns]

                output_json = self._run_core_command(
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # Group output by file path: every match belongs to the single `rule`
        # being run, so the file path is the only grouping key needed.
        matches_by_path: Dict[Path, List[PatternMatch]] = collections.defaultdict(
            list
        )
        for pattern_match in outputs:
            matches_by_path[pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for filepath, pattern_matches in matches_by_path.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            # debugging steps are only tracked for a single file, just overwrite
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        return findings, debugging_steps, errors