Example #1
def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
        Takes a Rule and list of pattern matches from a single file and
        handles the boolean expression evaluation of the Rule's patterns
        Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [
        {
            "filter": "initial",
            "pattern_id": None,
            "ranges": {
                k: list(set(vv.range for vv in v))
                for k, v in pattern_ids_to_pattern_matches.items()
            },
        }
    ]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall inside these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(rule, pattern_match)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
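A minimal usage sketch of evaluate (the call site is illustrative, not from the source; rule and pattern_matches are assumed to come from the matching pipeline):

rule_matches, debug_steps = evaluate(rule, pattern_matches, allow_exec=False)
for rule_match in rule_matches:
    logger.debug(f"{rule_match.id}: {rule_match.message}")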
Example #2
def create_output(
    rule: Rule,
    pattern_matches: List[PatternMatch],
    valid_ranges_to_output: Optional[Set[Range]] = None,
) -> List[RuleMatch]:
    output = []

    if valid_ranges_to_output is None:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }

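    # For each surviving range, gather the values of its propagated
    # metavariables from whichever pattern matches
    # compare_propagated_metavariable accepts as sources.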
    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range]
            message = interpolate_string_with_metavariables(
                rule.message, pattern_match, propagated_metavariables)
            fix = (interpolate_string_with_metavariables(
                rule.fix, pattern_match, propagated_metavariables)
                   if rule.fix else None)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return sorted(output,
                  key=lambda rule_match: rule_match._pattern_match.range.start)
Example #3
def convert_to_rule_match(match: CoreMatch, rule: Rule) -> RuleMatch:
    metavariables = read_metavariables(match)
    message = interpolate(rule.message, metavariables)
    fix = interpolate(rule.fix, metavariables) if rule.fix else None

    rule_match = RuleMatch(
        rule.id,
        message=message,
        metadata=rule.metadata,
        severity=rule.severity,
        fix=fix,
        fix_regex=rule.fix_regex,
        path=match.path,
        start=match.start,
        end=match.end,
        extra=match.extra,
        lines_cache={},
    )
    return rule_match
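interpolate is not shown in this excerpt. A minimal sketch of what it might do, assuming metavariables maps names like "$X" to their matched text (the real implementation may handle escaping and propagated values differently):

from typing import Dict

def interpolate(text: str, metavariables: Dict[str, str]) -> str:
    # Hedged sketch only. Substitute longer names first so "$FUNC" is not
    # clobbered by a shorter prefix such as "$F".
    for name in sorted(metavariables, key=len, reverse=True):
        text = text.replace(name, metavariables[name])
    return text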
Example #4
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns
    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches: Dict[PatternId,
                                         List[PatternMatch]] = OrderedDict()
    for pm in stabilize_evaluation_ordering(pattern_matches,
                                            key=lambda pm: pm.id):
        pattern_ids_to_pattern_matches.setdefault(pm.id, []).append(pm)

    initial_ranges: DebugRanges = {
        pattern_id: set(pm.range for pm in pattern_matches)
        for pattern_id, pattern_matches in
        pattern_ids_to_pattern_matches.items()
    }
    steps_for_debugging = [DebuggingStep("initial", None, initial_ranges, {})]

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            allow_exec=allow_exec,
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall inside these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug(BREAK_LINE)

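    # For each surviving range, gather the values of its propagated
    # metavariables from whichever pattern matches
    # compare_propagated_metavariable accepts as sources.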
    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range]
            message = interpolate_message_metavariables(
                rule, pattern_match, propagated_metavariables)
            fix = interpolate_fix_metavariables(rule, pattern_match,
                                                propagated_metavariables)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, [attr.asdict(step) for step in steps_for_debugging]
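compare_propagated_metavariable is not included in this excerpt. One plausible shape, purely illustrative (the attribute names metavariables and is_enclosing_or_eq are assumptions, not the source's API):

def compare_propagated_metavariable(
    _range: Range, pm: PatternMatch, metavariable: str
) -> bool:
    # Illustrative sketch: accept a pattern match as the source of a
    # propagated metavariable only when it binds that metavariable and
    # its range encloses the target range.
    return metavariable in pm.metavariables and pm.range.is_enclosing_or_eq(_range)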
Example #5
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        Dict[Any, Any],
    ]:
        from collections import defaultdict

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in (
                (rule, language)
                for rule in rules
                for language in rule.languages
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # optimization: no need to call semgrep-core if there are no target files
                    if not targets:
                        continue
                    target_file.write("\n".join(map(str, targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                    output_json = self._parse_core_output(
                        out_bytes, err_bytes, returncode
                    )

                    if returncode != 0:
                        if "error" in output_json:
                            self._raise_semgrep_error_from_json(output_json, [], rule)
                        else:
                            raise SemgrepError(
                                f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                            )

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
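dedup_output is not shown here. A minimal sketch, assuming a finding's identity is its rule ID plus location (the key fields are assumptions):

from typing import List

def dedup_output(findings: List[RuleMatch]) -> List[RuleMatch]:
    # Hedged sketch; the real implementation may compare different fields.
    seen = set()
    unique = []
    for finding in findings:
        key = (finding.id, str(finding.path), finding.start, finding.end)
        if key not in seen:
            seen.add(key)
            unique.append(finding)
    return unique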
Example #6
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns
    Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [{
        "filter": "initial",
        "pattern_id": None,
        "ranges": {
            k: list(set(vv.range for vv in v))
            for k, v in pattern_ids_to_pattern_matches.items()
        },
    }]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range
            for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # Only output matches that fall inside these ranges.
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    # Addresses https://github.com/returntocorp/semgrep/issues/1699,
    # where metavariables from pattern-inside are not bound to messages.
    # This should handle cases with pattern + pattern-inside. This doesn't handle
    # pattern-not-inside because it is difficult to determine metavariables for
    # exclusion ranges. For example: imagine a pattern-not-inside for 'def $CLASS(): ...'
    # and a file has multiple classes inside. How do we infer which metavariable was
    # intended for interpolation? As such, this will fix the immediate issue and should
    # handle the most common case.
    # Another corner case is: what should we do with nested metavars? Imagine 'def $FUNC(): ...'
    # and code with nested functions. Did we want the top-level function? The lowest-level? What
    # about other nesting cases? ¯\_(ツ)_/¯ Right now it will prefer the largest PatternMatch range.
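    # For instance, given the pattern 'def $FUNC(): ...' and a file containing
    #     def outer():
    #         def inner(): ...
    # both defs bind $FUNC; per the note above, the larger (outer) range wins.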
    all_pattern_match_metavariables: Dict[
        str, List[PatternMatch]] = defaultdict(list)
    for pattern_match in pattern_matches:
        for metavar_text in pattern_match.metavars.keys():
            all_pattern_match_metavariables[metavar_text].append(pattern_match)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(
                rule, pattern_match, all_pattern_match_metavariables)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
Example #7
def run_join_rule(
    join_rule: Dict[str, Any],
    targets: List[Path],
) -> Tuple[List[RuleMatch], List[SemgrepError]]:
    """
    Run a 'join' mode rule.

    Join rules are composed of multiple Semgrep rules and a set
    of conditions that must be satisfied in order to return a result.
    These conditions are typically comparisons of metavariable contents
    from different rules.

    'join_rule' is a join rule definition in dictionary form. The required keys are
    {'id', 'mode', 'severity', 'message', 'join'}.

    'join' is a dictionary with the required keys {'refs', 'on'}.

    'refs' is a dictionary with the required key {'rule'}. 'rule' is identical to
    a Semgrep config string -- the same thing used on the command line. e.g.,
    `semgrep -f p/javascript.lang.security.rule` or `semgrep -f path/to/rule.yaml`.

    'refs' has optional keys {'renames', 'as'}. 'renames' is a list of objects
    with properties {'from', 'to'}. 'renames' are used to rename metavariables
    of the associated 'rule'. 'as' lets you alias the collection of rule results
    for use in the conditions, similar to a SQL alias. By default, collection names
    will be the rule ID.

    'on' is a list of strings of the form <collection>.<property> <operator> <collection>.<property>.
    These are the conditions that must be satisfied for this rule to report results.
    All conditions must be satisfied.

    See semgrep/tests/e2e/rules/join_rules/user-input-with-unescaped-extension.yaml
    for an example.
    """
    join_contents = join_rule.get("join", {})
    semgrep_config_strings = [
        ref.get("rule") for ref in join_contents.get("refs", [])
    ]
    config_map = create_config_map(semgrep_config_strings)

    join_rule_refs: List[Ref] = [
        Ref(
            id=config_map[ref.get("rule")].id,
            renames={
                rename.get("from"): rename.get("to")
                for rename in ref.get("renames", [])
            },
            alias=ref.get("as"),
        ) for ref in join_contents.get("refs", [])
    ]
    refs_lookup = {ref.id: ref for ref in join_rule_refs}
    alias_lookup = {ref.alias: ref.id for ref in join_rule_refs}

    try:
        conditions = [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ]
    except InvalidConditionError as e:
        return [], [e]

    # Run Semgrep
    with tempfile.NamedTemporaryFile() as rule_path:
        yaml.dump({"rules": [rule.raw for rule in config_map.values()]},
                  rule_path)
        rule_path.flush()
        output = semgrep.semgrep_main.invoke_semgrep(
            config=Path(rule_path.name),
            targets=targets,
            no_rewrite_rule_ids=True,
            optimizations="all",
        )

    assert isinstance(output, dict)  # placate mypy

    results = output.get("results", [])
    errors = output.get("errors", [])

    parsed_errors = []
    for error_dict in errors:
        try:
            """
            This is a hack to reconstitute errors after they've been
            JSONified as output. Subclasses of SemgrepError define the 'level'
            and 'code' as class properties, which means they aren't accepted
            as arguments when instantiated. 'type' is also added when errors are
            JSONified, and is just a string of the error class name. It's not used
            as an argument.
            All of these properties will be properly populated because it's using the
            class properties of the SemgrepError inferred by 'type'.
            """
            del error_dict["code"]
            del error_dict["level"]
            errortype = error_dict.get("type")
            del error_dict["type"]
            parsed_errors.append(
                ERROR_MAP[errortype].from_dict(error_dict))
        except KeyError:
            logger.warning(
                f"Could not reconstitute Semgrep error: {error_dict}.\nSkipping processing of error"
            )
            continue

    # Small optimization: if there are no results for rules that
    # are used in a condition, there's no sense in continuing.
    collection_set_unaliased = {
        alias_lookup[collection]
        for collection in create_collection_set_from_conditions(conditions)
    }
    rule_ids = set(result.get("check_id") for result in results)
    if collection_set_unaliased - rule_ids:
        logger.debug(
            f"No results for {collection_set_unaliased - rule_ids} in join rule '{join_rule.get('id')}'."
        )
        return [], parsed_errors

    # Rename metavariables with user-defined renames.
    rename_metavars_in_place(results, refs_lookup)

    # Create a model map. This allows dynamically creating DB tables based
    # on Semgrep's results. There is one table for each rule ID.
    model_map = create_model_map(results)
    db.connect()
    db.create_tables(model_map.values())

    # Populate the model tables with real data from the Semgrep results.
    load_results_into_db(results, model_map)

    # Apply the conditions and only keep combinations
    # of findings that satisfy the conditions.
    matches = []
    matched_on_conditions = match_on_conditions(
        model_map,
        alias_lookup,
        conditions,  # reuse the conditions already parsed and validated above
    )
    if matched_on_conditions:  # This is ugly, but makes mypy happy
        for match in matched_on_conditions:
            matches.append(
                json.loads(match.raw.decode("utf-8", errors="replace")))

    rule_matches = [
        RuleMatch(
            id=join_rule.get("id", match.get("check_id", "[empty]")),
            pattern_match=PatternMatch({}),
            message=join_rule.get(
                "message",
                match.get("extra", {}).get("message", "[empty]")),
            metadata=join_rule.get("metadata",
                                   match.get("extra", {}).get("metadata", {})),
            severity=join_rule.get("severity", match.get("severity", "INFO")),
            path=Path(match.get("path", "[empty]")),
            start=match.get("start", {}),
            end=match.get("end", {}),
            extra=match.get("extra", {}),
            fix=None,
            fix_regex=None,
            lines_cache={},
        ) for match in matches
    ]

    db.close()
    return rule_matches, parsed_errors
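To make the docstring above concrete, here is an illustrative join rule definition in dictionary form (the rule paths, aliases, metavariables, and condition are hypothetical):

example_join_rule = {
    "id": "user-input-reaches-sink",  # hypothetical ID
    "mode": "join",
    "severity": "ERROR",
    "message": "User input flows into a dangerous sink.",
    "join": {
        "refs": [
            {"rule": "path/to/source-rule.yaml", "as": "sources"},
            {
                "rule": "path/to/sink-rule.yaml",
                "as": "sinks",
                "renames": [{"from": "$DATA", "to": "$INPUT"}],
            },
        ],
        "on": ["sources.$INPUT == sinks.$INPUT"],
    },
}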
Example #8
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        from collections import defaultdict

        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in (
                (rule, language)
                for rule in rules
                for language in rule.languages
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # optimization: no need to call semgrep-core if there are no target files
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(str, targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    if self._report_time:
                        cmd += ["-json_time"]

                    if self._output_settings.debug:
                        cmd += ["-debug"]

                    core_run = sub_run(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    output_json = self._extract_core_output(rule, [], core_run)

                    if "time" in output_json:
                        self._add_match_times(rule, profiling_data, output_json["time"])

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, all_targets, profiling_data
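A hypothetical call site for the method above (core_runner and its arguments are assumptions for illustration):

outputs, rule_debug_data, errors, all_targets, profiling_data = (
    core_runner._run_rules_direct_to_semgrep_core(rules, target_manager, profiler)
)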