Example 1
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns
    Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [{
        "filter": "initial",
        "pattern_id": None,
        "ranges": {
            k: list(set(vv.range for vv in v))
            for k, v in pattern_ids_to_pattern_matches.items()
        },
    }]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall within these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(rule, pattern_match)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
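For context, a hedged sketch of how this evaluate function might be driven. parse_rule and collect_pattern_matches are hypothetical stand-ins for the real rule loading and matching steps, which are not part of this excerpt:

# Hypothetical driver for evaluate(); parse_rule and
# collect_pattern_matches are illustrative placeholders.
rule = parse_rule("rules/my-rule.yaml")
pattern_matches = collect_pattern_matches(rule, "src/app.py")

rule_matches, debug_steps = evaluate(rule, pattern_matches, allow_exec=False)
for rule_match in rule_matches:
    print(rule_match.id, rule_match.message)
# Each entry in debug_steps records a filter name, the pattern id it came
# from, and the ranges that survived that evaluation step.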
Example 2
def create_output(
    rule: Rule,
    pattern_matches: List[PatternMatch],
    valid_ranges_to_output: Optional[Set[Range]] = None,
) -> List[RuleMatch]:
    output = []

    if valid_ranges_to_output is None:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }

    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range]
            message = interpolate_string_with_metavariables(
                rule.message, pattern_match, propagated_metavariables)
            fix = (interpolate_string_with_metavariables(
                rule.fix, pattern_match, propagated_metavariables)
                   if rule.fix else None)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return sorted(output,
                  key=lambda rule_match: rule_match._pattern_match.range.start)
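The propagated_metavariable_lookup comprehension above builds, for each surviving range, a map from metavariable name to its bound value. A rough sketch of the resulting shape, with hypothetical ranges and values:

# Hypothetical contents of propagated_metavariable_lookup: keys are Range
# objects, values map metavariable names to the text they were bound to.
propagated_metavariable_lookup = {
    range_a: {"$FUNC": "transfer", "$AMOUNT": "payment.total"},
    range_b: {"$FUNC": "refund"},
}
# A match whose range is range_a can then interpolate $FUNC into its
# message or fix without re-scanning every other pattern match.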
Example 3
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns
    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches: Dict[PatternId,
                                         List[PatternMatch]] = OrderedDict()
    for pm in stabilize_evaluation_ordering(pattern_matches,
                                            key=lambda pm: pm.id):
        pattern_ids_to_pattern_matches.setdefault(pm.id, []).append(pm)

    initial_ranges: DebugRanges = {
        pattern_id: set(pm.range for pm in pattern_matches)
        for pattern_id, pattern_matches in
        pattern_ids_to_pattern_matches.items()
    }
    steps_for_debugging = [DebuggingStep("initial", None, initial_ranges, {})]

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            allow_exec=allow_exec,
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall within these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug(BREAK_LINE)

    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range]
            message = interpolate_message_metavariables(
                rule, pattern_match, propagated_metavariables)
            fix = interpolate_fix_metavariables(rule, pattern_match,
                                                propagated_metavariables)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, [attr.asdict(step) for step in steps_for_debugging]
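The setdefault-into-OrderedDict idiom above is a stable group-by: buckets appear in first-seen key order and matches keep their stabilized order within each bucket. A self-contained illustration with toy data (the sorted call stands in for stabilize_evaluation_ordering):

from collections import OrderedDict
from typing import Dict, List, Tuple

# Toy stand-ins for PatternMatch: (pattern_id, range_start) pairs.
matches: List[Tuple[str, int]] = [("p1", 10), ("p2", 3), ("p1", 4)]

grouped: Dict[str, List[Tuple[str, int]]] = OrderedDict()
for m in sorted(matches, key=lambda m: m[0]):
    grouped.setdefault(m[0], []).append(m)

# grouped == OrderedDict([("p1", [("p1", 10), ("p1", 4)]),
#                         ("p2", [("p2", 3)])])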
Example 4
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        Dict[Any, Any],
    ]:
        from itertools import chain
        from collections import defaultdict

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        # cf. https://tqdm.github.io/docs/tqdm/ for bar_format
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # Optimization: skip invoking semgrep-core when there
                    # are no target files.
                    if not targets:
                        continue
                    target_file.write("\n".join(map(str, targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                    output_json = self._parse_core_output(
                        out_bytes, err_bytes, returncode
                    )

                    if returncode != 0:
                        if "error" in output_json:
                            self._raise_semgrep_error_from_json(output_json, [], rule)
                        else:
                            raise SemgrepError(
                                f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                            )

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: this deduplication should happen in
                # Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
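The temp-file handshake with semgrep-core above is a common subprocess pattern: write the inputs to named temporary files, flush so the data is on disk before the child reads it, then parse JSON from stdout. A stripped-down, hedged sketch of the same pattern (tool_path and the helper are illustrative, reusing only flags that appear above):

import json
import subprocess
import tempfile
from pathlib import Path
from typing import List

def run_tool_on_targets(tool_path: str, config_yaml: str,
                        targets: List[Path]) -> dict:
    # NamedTemporaryFile gives the child process a real filesystem path;
    # flush() is required so the contents are visible before it runs.
    with tempfile.NamedTemporaryFile("w", suffix=".yaml") as config_file, \
         tempfile.NamedTemporaryFile("w") as target_file:
        config_file.write(config_yaml)
        config_file.flush()
        target_file.write("\n".join(str(t) for t in targets))
        target_file.flush()
        result = subprocess.run(
            [tool_path, "-json", "-config", config_file.name,
             "-target_file", target_file.name],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
    return json.loads(result.stdout)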
Example 5
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns
    Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [{
        "filter": "initial",
        "pattern_id": None,
        "ranges": {
            k: list(set(vv.range for vv in v))
            for k, v in pattern_ids_to_pattern_matches.items()
        },
    }]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall within these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    # Addresses https://github.com/returntocorp/semgrep/issues/1699,
    # where metavariables from pattern-inside are not bound to messages.
    # This should handle cases with pattern + pattern-inside. This doesn't handle
    # pattern-not-inside because it is difficult to determine metavariables for
    # exclusion ranges. For example: imagine a pattern-not-inside for 'def $CLASS(): ...'
    # and a file has multiple classes inside. How do we infer which metavariable was
    # intended for interpolation? As such, this will fix the immediate issue and should
    # handle the most common case.
    # Another corner case is: what should we do with nested metavars? Imagine 'def $FUNC(): ...'
    # and code with nested functions. Did we want the top-level function? The lowest-level? What
    # about other nesting cases? ¯\_(ツ)_/¯ Right now it will prefer the largest PatternMatch range.
    all_pattern_match_metavariables: Dict[
        str, List[PatternMatch]] = defaultdict(list)
    for pattern_match in pattern_matches:
        for metavar_text in pattern_match.metavars.keys():
            all_pattern_match_metavariables[metavar_text].append(pattern_match)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(
                rule, pattern_match, all_pattern_match_metavariables)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
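To make the "prefer the largest PatternMatch range" behavior from the comment above concrete: when several overlapping matches bind the same metavariable (say, nested functions each binding $FUNC), a resolution along these lines could pick a winner. pick_largest is a hypothetical helper, not part of the real code, and it assumes Range exposes .start and .end offsets:

# Hypothetical disambiguation among overlapping bindings of $FUNC.
def pick_largest(candidates):
    # Prefer the match covering the widest range, per the comment above.
    return max(candidates, key=lambda pm: pm.range.end - pm.range.start)

winning_match = pick_largest(all_pattern_match_metavariables["$FUNC"])
value = winning_match.get_metavariable_value("$FUNC")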
Example 6
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        from itertools import chain
        from collections import defaultdict

        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        profiling_data: ProfilingData = ProfilingData()
        # cf. https://tqdm.github.io/docs/tqdm/ for bar_format
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # Optimization: skip invoking semgrep-core when there
                    # are no target files.
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(str, targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    if self._report_time:
                        cmd += ["-json_time"]

                    if self._output_settings.debug:
                        cmd += ["-debug"]

                    core_run = sub_run(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    output_json = self._extract_core_output(rule, [], core_run)

                    if "time" in output_json:
                        self._add_match_times(rule, profiling_data, output_json["time"])

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: this deduplication should happen in
                # Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, all_targets, profiling_data
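As a usage note on the chain(*(...)) construction at the top of this method: it flattens the rule-by-language pairs into a single iterable. An equivalent, arguably more direct spelling with a nested comprehension, behavior-identical for this use:

# One (rule, language) pair per language of each rule, without chain():
rule_language_pairs = [
    (rule, language)
    for rule in rules
    for language in rule.languages
]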