Example 1
def get_re_matches(
    patterns_re: Sequence[Tuple[Any, TPattern[Any]]], path: Path
) -> List[PatternMatch]:
    try:
        contents = path.read_text()
    except UnicodeDecodeError:
        logger.debug(f"regex matcher skipping binary file at {path}")
        return []

    return [
        PatternMatch(
            {
                "check_id": pattern_id,
                "path": str(path),
                "start": {
                    "offset": match.start(),
                    "line": _offset_to_line_no(match.start(), contents),
                    "col": _offset_to_col_no(match.start(), contents),
                },
                "end": {
                    "offset": match.end(),
                    "line": _offset_to_line_no(match.end(), contents),
                    "col": _offset_to_col_no(match.end(), contents),
                },
                "extra": {},
            }
        )
        for pattern_id, pattern in patterns_re
        for match in re.finditer(pattern, contents)
    ]
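
The two offset helpers called above are not shown in this excerpt. A minimal sketch consistent with the call sites; the bodies and the 1-based line/column convention are assumptions, not the real implementations:

def _offset_to_line_no(offset: int, contents: str) -> int:
    # Assumed behavior: count newlines before the offset, 1-based result.
    return contents.count("\n", 0, offset) + 1


def _offset_to_col_no(offset: int, contents: str) -> int:
    # Assumed behavior: distance past the last newline before the offset, 1-based.
    return offset - contents.rfind("\n", 0, offset)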
Example 2
def compare_where_python(where_expression: str,
                         pattern_match: PatternMatch) -> bool:
    result = False
    return_var = "semgrep_pattern_return"
    lines = where_expression.strip().split("\n")
    to_eval = "\n".join(lines[:-1] + [f"{return_var} = {lines[-1]}"])

    local_vars = {
        metavar: pattern_match.get_metavariable_value(metavar)
        for metavar in pattern_match.metavariables
    }
    scope = {"vars": local_vars}

    try:
        # fmt: off
        exec(
            to_eval, scope
        )  # nosem: contrib.dlint.dlint-equivalent.insecure-exec-use, python.lang.security.audit.exec-detected.exec-detected
        # fmt: on
        result = scope[return_var]  # type: ignore
    except KeyError as ex:
        logger.error(
            f"could not find metavariable {ex} while evaluating where-python expression '{where_expression}', consider case where metavariable is missing"
        )
    except Exception as ex:
        logger.error(
            f"received error '{repr(ex)}' while evaluating where-python expression '{where_expression}'"
        )

    if not isinstance(result, bool):
        raise SemgrepError(
            f"where-python expression '{where_expression}' needs boolean output but got {result}"
        )
    return result
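
To see the last-line-to-assignment trick in isolation, here is a self-contained sketch; the metavariable scope is hard-coded in place of a real PatternMatch:

# Standalone illustration of the exec rewrite used by compare_where_python.
where_expression = "n = vars['$N']\nn > 10"
lines = where_expression.strip().split("\n")
to_eval = "\n".join(lines[:-1] + [f"semgrep_pattern_return = {lines[-1]}"])
scope = {"vars": {"$N": 42}}  # stand-in for the metavariable bindings
exec(to_eval, scope)
assert scope["semgrep_pattern_return"] is True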
Example 3
def interpolate_message_metavariables(
    rule: Rule,
    pattern_match: PatternMatch,
    all_pattern_match_metavariables: Dict[str, List[PatternMatch]],
) -> str:
    msg_text = rule.message
    for metavar_text in all_pattern_match_metavariables:
        replace_text = metavar_text
        try:  # Always prefer the pattern match metavariable first.
            replace_text = pattern_match.get_metavariable_value(metavar_text)
        except KeyError:  # If one isn't present, retrieve the value from all metavariables.
            pattern_matches_with_metavars_that_enclose_match: List[PatternMatch] = list(
                filter(
                    lambda possible_enclosing_match: possible_enclosing_match.range.is_range_enclosing_or_eq(
                        pattern_match.range
                    ),
                    all_pattern_match_metavariables[metavar_text],
                )
            )
            if len(pattern_matches_with_metavars_that_enclose_match):
                replace_text = pattern_matches_with_metavars_that_enclose_match[
                    0
                ].get_metavariable_value(metavar_text)
        msg_text = msg_text.replace(metavar_text, replace_text)
    return msg_text
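
The filter above relies on Range.is_range_enclosing_or_eq, which is not shown here. A plausible sketch, assuming a range carries character offsets:

from typing import NamedTuple

class Range(NamedTuple):
    start: int
    end: int

    def is_range_enclosing_or_eq(self, other: "Range") -> bool:
        # Assumed semantics: this range fully contains, or equals, the other.
        return self.start <= other.start and self.end >= other.end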
Example 4
def interpolate_message_metavariables(rule: Rule,
                                      pattern_match: PatternMatch) -> str:
    msg_text = rule.message
    for metavar in pattern_match.metavars:
        msg_text = msg_text.replace(
            metavar, pattern_match.get_metavariable_value(metavar))
    return msg_text
Example 5
def interpolate_fix_metavariables(
        rule: Rule, pattern_match: PatternMatch) -> Optional[str]:
    fix_str = rule.fix
    if fix_str is None:
        return None
    for metavar in pattern_match.metavars:
        fix_str = fix_str.replace(
            metavar, pattern_match.get_metavariable_value(metavar))
    return fix_str
Example 6
def interpolate_message_metavariables(
        rule: Rule, pattern_match: PatternMatch,
        propagated_metavariables: Dict[str, str]) -> str:
    msg_text = rule.message
    for metavariable in pattern_match.metavariables:
        msg_text = msg_text.replace(
            metavariable, pattern_match.get_metavariable_value(metavariable))
    for metavariable, metavariable_text in propagated_metavariables.items():
        msg_text = msg_text.replace(metavariable, metavariable_text)
    return msg_text
Example 7
def interpolate_fix_metavariables(
        rule: Rule, pattern_match: PatternMatch,
        propagated_metavariables: Dict[str, str]) -> Optional[str]:
    fix_str = rule.fix
    if fix_str is None:
        return None
    for metavariable in pattern_match.metavariables:
        fix_str = fix_str.replace(
            metavariable, pattern_match.get_metavariable_value(metavariable))
    for metavariable, metavariable_text in propagated_metavariables.items():
        fix_str = fix_str.replace(metavariable, metavariable_text)
    return fix_str
Example 8
def interpolate_string_with_metavariables(
        text: str, pattern_match: PatternMatch,
        propagated_metavariables: Dict[str, str]) -> str:
    """Interpolates a string with the metavariables contained in it, returning a new string"""

    # Sort by metavariable length to avoid name collisions (eg. $X2 must be handled before $X)
    for metavariable in sorted(pattern_match.metavariables,
                               key=len,
                               reverse=True):
        text = text.replace(metavariable,
                            pattern_match.get_metavariable_value(metavariable))

    # Sort by metavariable length to avoid name collisions (eg. $X2 must be handled before $X)
    for metavariable in sorted(propagated_metavariables.keys(),
                               key=len,
                               reverse=True):
        text = text.replace(metavariable,
                            propagated_metavariables[metavariable])

    return text
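
A quick demonstration of why the length-descending sort matters; with naive ordering, replacing $X first would also clobber the prefix of $X2 (names and values below are invented):

text = "$X flows into $X2"
values = {"$X": "user_input", "$X2": "query"}
for metavar in sorted(values, key=len, reverse=True):
    text = text.replace(metavar, values[metavar])
assert text == "user_input flows into query"
# Replacing "$X" first would instead produce "user_input flows into user_input2".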
Example 9
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        Dict[Any, Any],
    ]:
        from itertools import chain
        from collections import defaultdict

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                    output_json = self._parse_core_output(
                        out_bytes, err_bytes, returncode
                    )

                    if returncode != 0:
                        if "error" in output_json:
                            self._raise_semgrep_error_from_json(output_json, [], rule)
                        else:
                            raise SemgrepError(
                                f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                            )

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
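
dedup_output is used throughout these examples but never defined in them. One plausible sketch; the dedup key below is an assumption, the real helper may key on different fields:

from typing import List

def dedup_output(outputs: List[RuleMatch]) -> List[RuleMatch]:
    # RuleMatch is the class constructed above; drop duplicate findings
    # while preserving order, keyed on an assumed (id, path, span) tuple.
    seen = set()
    unique = []
    for match in outputs:
        key = (match.id, str(match.path), str(match.start), str(match.end))
        if key not in seen:
            seen.add(key)
            unique.append(match)
    return unique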
Example 10
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
        profiler: ProfileManager,
        match_time_matrix: Dict[Tuple[str, str], float],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
        """
        Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()

        for language, all_patterns_for_language in self._group_patterns_by_language(
            rule
        ).items():

            targets = self.get_files_for_language(language, rule, target_manager)
            targets = [target for target in targets if target not in max_timeout_files]
            all_targets = all_targets.union(targets)
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX
                    or p.expression.operator == OPERATORS.NOT_REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # regex-only rules only support OPERATORS.REGEX.
                # Skip passing this rule to semgrep-core.
                if language in REGEX_ONLY_LANGUAGE_KEYS:
                    continue

                # semgrep-core doesn't know about the following operators -
                # they are strictly semgrep Python features:
                #   - OPERATORS.METAVARIABLE_REGEX
                #   - OPERATORS.METAVARIABLE_COMPARISON
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator
                    not in [
                        OPERATORS.METAVARIABLE_REGEX,
                        OPERATORS.METAVARIABLE_COMPARISON,
                    ]
                ]

                patterns_json = [p.to_json() for p in patterns]

                if language == GENERIC_LANGUAGE:
                    output_json = profiler.track(
                        rule.id,
                        run_spacegrep,
                        rule.id,
                        patterns,
                        targets,
                        timeout=self._timeout,
                        report_time=self._report_time,
                    )
                else:  # Run semgrep-core
                    output_json = profiler.track(
                        rule.id,
                        self._run_core_command,
                        patterns_json,
                        patterns,
                        targets,
                        language,
                        rule,
                        "-rules_file",
                        cache_dir,
                        report_time=self._report_time,
                    )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
            if "time" in output_json:
                self._add_match_times(rule, match_time_matrix, output_json["time"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[
            Rule, Dict[Path, List[PatternMatch]]
        ] = collections.defaultdict(lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"--> rule ({rule.id}) has findings on filepath: {filepath}"
                )

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec
                )
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)
        logger.debug(f"...ran on {len(all_targets)} files")

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors, all_targets
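
The partition helper used above splits the patterns on a predicate. A minimal sketch with the signature the call sites imply:

from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")

def partition(pred: Callable[[T], bool],
              items: Iterable[T]) -> Tuple[List[T], List[T]]:
    # Returns (items satisfying pred, items that do not), preserving order.
    matching: List[T] = []
    rest: List[T] = []
    for item in items:
        (matching if pred(item) else rest).append(item)
    return matching, rest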
Example 11
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Any]],
               List[SemgrepError], Set[Path], ProfilingData, ]:
        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
        max_timeout_files: Set[Path] = set()

        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                for language in rule.languages:
                    debug_tqdm_write(f"Running rule {rule.id}...")
                    with tempfile.NamedTemporaryFile(
                            "w", suffix=".yaml"
                    ) as rule_file, tempfile.NamedTemporaryFile(
                            "w") as target_file, tempfile.NamedTemporaryFile(
                                "w") as equiv_file:
                        targets = self.get_files_for_language(
                            language, rule, target_manager)

                        targets = [
                            target for target in targets
                            if target not in max_timeout_files
                        ]

                        # opti: no need to call semgrep-core if no target files
                        if not targets:
                            continue
                        all_targets = all_targets.union(targets)

                        target_file.write("\n".join(
                            map(lambda p: str(p), targets)))
                        target_file.flush()
                        yaml = YAML()
                        yaml.dump({"rules": [rule._raw]}, rule_file)
                        rule_file.flush()

                        cmd = [SEMGREP_PATH] + [
                            "-lang",
                            language.value,
                            "-json",
                            "-config",
                            rule_file.name,
                            "-j",
                            str(self._jobs),
                            "-target_file",
                            target_file.name,
                            "-use_parsing_cache",
                            semgrep_core_ast_cache_dir,
                            "-timeout",
                            str(self._timeout),
                            "-max_memory",
                            str(self._max_memory),
                            "-json_time",
                        ]

                        if self._optimizations != "none":
                            cmd.append("-fast")

                        stderr: Optional[int] = subprocess.PIPE
                        if is_debug():
                            cmd += ["-debug"]
                            stderr = None

                        core_run = sub_run(cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=stderr)
                        output_json = self._extract_core_output(rule, core_run)

                        if "time" in output_json:
                            self._add_match_times(rule, profiling_data,
                                                  output_json["time"])

                    # end with tempfile.NamedTemporaryFile(...) ...
                    pattern_matches = [
                        PatternMatch(match) for match in output_json["matches"]
                    ]
                    findings = create_output(rule, pattern_matches)

                    findings = dedup_output(findings)
                    outputs[rule].extend(findings)
                    parsed_errors = [
                        CoreException.from_json(e, language.value,
                                                rule.id).into_semgrep_error()
                        for e in output_json["errors"]
                    ]
                    for err in parsed_errors:
                        if isinstance(err, MatchTimeoutError):
                            file_timeouts[err.path] += 1
                            if (self._timeout_threshold != 0
                                    and file_timeouts[err.path] >=
                                    self._timeout_threshold):
                                max_timeout_files.add(err.path)
                    errors.extend(parsed_errors)
            # end for language ...
        # end for rule ...

        return outputs, {}, errors, all_targets, profiling_data
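
sub_run, which launches semgrep-core in these examples, is presumably a thin wrapper over subprocess.run. A minimal sketch, assuming it only adds debug logging:

import logging
import subprocess
from typing import Any, List

logger = logging.getLogger(__name__)

def sub_run(cmd: List[str], **kwargs: Any) -> subprocess.CompletedProcess:
    # Assumed wrapper: log the command, then delegate to subprocess.run.
    logger.debug(f"running external command: {cmd}")
    return subprocess.run(cmd, **kwargs)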
Example 12
def run_join_rule(
    join_rule: Dict[str, Any],
    targets: List[Path],
) -> Tuple[List[RuleMatch], List[SemgrepError]]:
    """
    Run a 'join' mode rule.

    Join rules are composed of multiple Semgrep rules and a set
    of conditions which must be satisfied in order to return a result.
    These conditions are typically some comparison of metavariable contents
    from different rules.

    'join_rule' is a join rule definition in dictionary form. The required keys are
    {'id', 'mode', 'severity', 'message', 'join'}.

    'join' is a dictionary with the required keys {'refs', 'on'}.

    'refs' is a list of objects with the required key {'rule'}. 'rule' is identical to
    a Semgrep config string -- the same thing used on the command line. e.g.,
    `semgrep -f p/javascript.lang.security.rule` or `semgrep -f path/to/rule.yaml`.

    'refs' has optional keys {'renames', 'as'}. 'renames' is a list of objects
    with properties {'from', 'to'}. 'renames' are used to rename metavariables
    of the associated 'rule'. 'as' lets you alias the collection of rule results
    for use in the conditions, similar to a SQL alias. By default, collection names
    will be the rule ID.

    'on' is a list of strings of the form <collection>.<property> <operator> <collection>.<property>.
    These are the conditions which must be satisfied for this rule to report results.
    All conditions must be satisfied.

    See semgrep/tests/e2e/rules/join_rules/user-input-with-unescaped-extension.yaml
    for an example.
    """
    join_contents = join_rule.get("join", {})
    semgrep_config_strings = [
        ref.get("rule") for ref in join_contents.get("refs", [])
    ]
    config_map = create_config_map(semgrep_config_strings)

    join_rule_refs: List[Ref] = [
        Ref(
            id=config_map[ref.get("rule")].id,
            renames={
                rename.get("from"): rename.get("to")
                for rename in ref.get("renames", [])
            },
            alias=ref.get("as"),
        ) for ref in join_contents.get("refs", [])
    ]
    refs_lookup = {ref.id: ref for ref in join_rule_refs}
    alias_lookup = {ref.alias: ref.id for ref in join_rule_refs}

    try:
        conditions = [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ]
    except InvalidConditionError as e:
        return [], [e]

    # Run Semgrep
    with tempfile.NamedTemporaryFile() as rule_path:
        yaml.dump({"rules": [rule.raw for rule in config_map.values()]},
                  rule_path)
        rule_path.flush()
        output = semgrep.semgrep_main.invoke_semgrep(
            config=Path(rule_path.name),
            targets=targets,
            no_rewrite_rule_ids=True,
            optimizations="all",
        )

    assert isinstance(output, dict)  # placate mypy

    results = output.get("results", [])
    errors = output.get("errors", [])

    parsed_errors = []
    for error_dict in errors:
        try:
            """
            This is a hack to reconstitute errors after they've been
            JSONified as output. Subclasses of SemgrepError define the 'level'
            and 'code' as class properties, which means they aren't accepted
            as arguments when instantiated. 'type' is also added when errors are
            JSONified, and is just a string of the error class name. It's not used
            as an argument.
            All of these properties will be properly populated because it's using the
            class properties of the SemgrepError inferred by 'type'.
            """
            del error_dict["code"]
            del error_dict["level"]
            errortype = error_dict.get("type")
            del error_dict["type"]
            parsed_errors.append(
                ERROR_MAP[errortype].from_dict(error_dict))
        except KeyError:
            logger.warning(
                f"Could not reconstitute Semgrep error: {error_dict}.\nSkipping processing of error"
            )
            continue

    # Small optimization: if there are no results for rules that
    # are used in a condition, there's no sense in continuing.
    collection_set_unaliased = {
        alias_lookup[collection]
        for collection in create_collection_set_from_conditions(conditions)
    }
    rule_ids = set(result.get("check_id") for result in results)
    if collection_set_unaliased - rule_ids:
        logger.debug(
            f"No results for {collection_set_unaliased - rule_ids} in join rule '{join_rule.get('id')}'."
        )
        return [], parsed_errors

    # Rename metavariables with user-defined renames.
    rename_metavars_in_place(results, refs_lookup)

    # Create a model map. This allows dynamically creating DB tables based
    # on Semgrep's results. There is one table for each rule ID.
    model_map = create_model_map(results)
    db.connect()
    db.create_tables(model_map.values())

    # Populate the model tables with real data from the Semgrep results.
    load_results_into_db(results, model_map)

    # Apply the conditions and only keep combinations
    # of findings that satisfy the conditions.
    matches = []
    matched_on_conditions = match_on_conditions(
        model_map,
        alias_lookup,
        [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ],
    )
    if matched_on_conditions:  # This is ugly, but makes mypy happy
        for match in matched_on_conditions:
            matches.append(
                json.loads(match.raw.decode("utf-8", errors="replace")))

    rule_matches = [
        RuleMatch(
            id=join_rule.get("id", match.get("check_id", "[empty]")),
            pattern_match=PatternMatch({}),
            message=join_rule.get(
                "message",
                match.get("extra", {}).get("message", "[empty]")),
            metadata=join_rule.get("metadata",
                                   match.get("extra", {}).get("metadata", {})),
            severity=join_rule.get("severity", match.get("severity", "INFO")),
            path=Path(match.get("path", "[empty]")),
            start=match.get("start", {}),
            end=match.get("end", {}),
            extra=match.get("extra", {}),
            fix=None,
            fix_regex=None,
            lines_cache={},
        ) for match in matches
    ]

    db.close()
    return rule_matches, parsed_errors
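
For reference, a join rule shaped the way the docstring describes might look like this; every id, path, and metavariable name below is invented for illustration:

example_join_rule = {
    "id": "user-input-reaches-query",  # hypothetical rule id
    "mode": "join",
    "severity": "ERROR",
    "message": "User input flows into a query",
    "join": {
        "refs": [
            {"rule": "rules/find-user-input.yaml", "as": "inputs"},
            {
                "rule": "rules/find-queries.yaml",
                "renames": [{"from": "$ARG", "to": "$VAR"}],
                "as": "queries",
            },
        ],
        # Conditions take the form: <collection>.<property> <op> <collection>.<property>
        "on": ["inputs.$VAR == queries.$VAR"],
    },
}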
Example 13
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        from itertools import chain
        from collections import defaultdict

        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    if self._report_time:
                        cmd += ["-json_time"]

                    if self._output_settings.debug:
                        cmd += ["-debug"]

                    core_run = sub_run(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    output_json = self._extract_core_output(rule, [], core_run)

                    if "time" in output_json:
                        self._add_match_times(rule, profiling_data, output_json["time"])

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, all_targets, profiling_data
Example 14
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []

        for language, all_patterns_for_language in self._group_patterns_by_language(
                rule).items():

            targets = self.get_files_for_language(language, rule,
                                                  target_manager)
            targets = [
                target for target in targets if target not in max_timeout_files
            ]
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(0, rule.expression, rule.severity, language,
                                  rule._yaml.span)

                output_json = self._run_core_command(
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex,
                                               targets)

                # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
                # this is strictly a semgrep Python feature. Metavariable regex
                # filtering is performed purely in Python code then compared
                # against semgrep-core's results for other patterns.
                patterns = [
                    pattern for pattern in patterns if
                    pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
                ]

                patterns_json = [p.to_json() for p in patterns]

                output_json = self._run_core_command(
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                )

            errors.extend(
                CoreException.from_json(e, language,
                                        rule.id).into_semgrep_error()
                for e in output_json["errors"])
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
Example 15
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
            try:
                targets = target_manager.get_files(language, rule.includes,
                                                   rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex

            if targets == []:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]

                try:
                    patterns_re = [(pattern["id"],
                                    re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")

                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                output_json = json.loads(
                    (core_run.stdout.decode("utf-8", "replace")))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors