コード例 #1
0
def get_re_matches(patterns_re: List[Tuple], path: Path) -> List[PatternMatch]:
    """
    Run every regex in ``patterns_re`` over the text of ``path``.

    ``patterns_re`` holds ``(pattern_id, pattern)`` pairs. One PatternMatch is
    built per regex hit, carrying the start/end offset, line and column of the
    hit plus the matched text itself. A file that cannot be decoded as text is
    skipped and yields an empty list.
    """
    try:
        contents = path.read_text()
    except UnicodeDecodeError:
        debug_print(f"regex matcher skipping binary file at {path}")
        return []

    def _position(offset):
        # location record for one end of a match
        return {
            "offset": offset,
            "line": _offset_to_line_no(offset, contents),
            "col": _offset_to_col_no(offset, contents),
        }

    found = []
    for pattern_id, pattern in patterns_re:
        for hit in re.finditer(pattern, contents):
            begin, finish = hit.start(), hit.end()
            found.append(
                PatternMatch({
                    "check_id": pattern_id,
                    "path": str(path),
                    "start": _position(begin),
                    "end": _position(finish),
                    "extra": {
                        "lines": [contents[begin:finish]]
                    },
                }))
    return found
コード例 #2
0
ファイル: config_resolver.py プロジェクト: mtkis22/semgrep
def generate_config() -> None:
    """
    Write a starter rule config to DEFAULT_CONFIG_FILE.

    The template text is fetched from TEMPLATE_YAML_URL; if that fails for any
    reason, a small built-in template is written instead.

    Raises:
        SemgrepError: if DEFAULT_CONFIG_FILE already exists, or if writing it
            fails.
    """
    import requests  # here for faster startup times

    # never overwrite a config that is already there
    config_already_present = Path(DEFAULT_CONFIG_FILE).exists()
    if config_already_present:
        raise SemgrepError(
            f"{DEFAULT_CONFIG_FILE} already exists. Please remove and try again"
        )

    try:
        response = requests.get(TEMPLATE_YAML_URL, timeout=10)
        response.raise_for_status()
        template_str = response.text
    except Exception as download_error:
        # any download problem falls back to the built-in template below
        debug_print(str(download_error))
        print_stderr(
            f"There was a problem downloading the latest template config. Using fallback template"
        )
        template_str = """rules:
  - id: eqeq-is-bad
    pattern: $X == $X
    message: "$X == $X is a useless equality check"
    languages: [python]
    severity: ERROR"""

    try:
        with open(DEFAULT_CONFIG_FILE, "w") as config_file:
            config_file.write(template_str)
            print_stderr(
                f"Template config successfully written to {DEFAULT_CONFIG_FILE}"
            )
    except Exception as write_error:
        raise SemgrepError(str(write_error))
コード例 #3
0
def _evaluate_expression(
    expression: BooleanRuleExpression,
    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]],
    ranges_left: Set[Range],
    steps_for_debugging: List[Dict[str, Any]],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:
    """
    Narrow ``ranges_left`` to the ranges that satisfy ``expression``.

    An expression whose operator is neither AND_EITHER nor AND_ALL must have
    no children and is delegated to ``_evaluate_single_expression``.

    AND_EITHER / AND_ALL expressions recurse into each child with a copy of
    the current ranges, then shrink ``ranges_left`` in place and append a
    record of the result to ``steps_for_debugging``:
      * AND_EITHER keeps ranges returned by at least one child,
      * AND_ALL intersects with each child's result in turn.
    """
    is_either = expression.operator == OPERATORS.AND_EITHER
    if not (is_either or expression.operator == OPERATORS.AND_ALL):
        # leaf expression: exactly one pattern, no nested children
        assert (
            expression.children is None
        ), f"only `{pattern_names_for_operator(OPERATORS.AND_EITHER)}` or `{pattern_names_for_operator(OPERATORS.AND_ALL)}` expressions can have multiple subpatterns"
        return _evaluate_single_expression(
            expression,
            pattern_ids_to_pattern_matches,
            ranges_left,
            steps_for_debugging,
            flags=flags,
        )

    assert (
        expression.children is not None
    ), f"{pattern_names_for_operator(OPERATORS.AND_EITHER)} or {pattern_names_for_operator(OPERATORS.AND_ALL)} must have a list of subpatterns"

    def _recurse(child):
        # each child works on its own copy so it cannot disturb its siblings
        return _evaluate_expression(
            child,
            pattern_ids_to_pattern_matches,
            ranges_left.copy(),
            steps_for_debugging,
            flags=flags,
        )

    if is_either:
        # evaluate every alternative first, then keep what any of them kept
        per_child_ranges = []
        for child in expression.children:
            per_child_ranges.append(_recurse(child))
        ranges_left.intersection_update(flatten(per_child_ranges))
    else:
        # intersect eagerly so later children see the already-narrowed set
        for child in expression.children:
            ranges_left.intersection_update(_recurse(child))

    debug_print(f"after filter `{expression.operator}`: {ranges_left}")
    steps_for_debugging.append({
        "filter": f"{pattern_name_for_operator(expression.operator)}",
        "pattern_id": None,
        "ranges": list(ranges_left),
    })
    return ranges_left
コード例 #4
0
    def post_output(cls, output_url: str, output: str) -> None:
        """
        POST ``output`` to ``output_url`` with a 10 second timeout.

        Raises:
            SemgrepError: if the request times out.
        """
        import requests  # here for faster startup times

        print_stderr(f"posting to {output_url}...")
        try:
            response = requests.post(output_url, data=output, timeout=10)
        except requests.exceptions.Timeout:
            raise SemgrepError(f"posting output to {output_url} timed out")
        else:
            debug_print(
                f"posted to {output_url} and got status_code:{response.status_code}")
コード例 #5
0
ファイル: config_resolver.py プロジェクト: mtkis22/semgrep
def resolve_config(config_str: Optional[str]) -> Dict[str, YamlTree]:
    """
    Load the config that ``config_str`` refers to.

    ``None`` loads from the default local path, a RULES_REGISTRY key downloads
    the registered URL, a URL is downloaded directly, and anything else is
    treated as a local path.
    """
    started_at = time.time()

    if config_str is None:
        loaded = load_config_from_local_path()
    elif config_str in RULES_REGISTRY:
        registry_url = RULES_REGISTRY[config_str]
        loaded = download_config(registry_url)
    elif is_url(config_str):
        loaded = download_config(config_str)
    else:
        loaded = load_config_from_local_path(config_str)

    # only report timing when something was actually loaded
    if loaded:
        debug_print(f"loaded {len(loaded)} configs in {time.time() - started_at}")
    return loaded
コード例 #6
0
    def invoke_semgrep(
        self, target_manager: TargetManager, rules: List[Rule]
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Dict[str, Any]]],
               List[CoreException], ]:
        """
            Run ``rules`` via ``self._run_rules`` and return its three results
            (findings by rule, debug steps by rule, errors), logging how long
            the run took.
        """
        began_at = datetime.now()
        run_result = self._run_rules(rules, target_manager)
        findings_by_rule, debug_steps_by_rule, errors = run_result

        elapsed = datetime.now() - began_at
        debug_print(f"semgrep ran in {elapsed}")

        return findings_by_rule, debug_steps_by_rule, errors
コード例 #7
0
def is_running_latest(version_cache_path: Path = VERSION_CACHE_PATH) -> bool:
    """
    Report whether the running version is not older than the latest version.

    Returns False when ``_get_latest_version`` yields None or when either
    version string cannot be parsed.
    """
    latest_version_str = _get_latest_version(version_cache_path)
    if latest_version_str is None:
        return False

    try:
        latest, current = (
            Version(latest_version_str),
            Version(constants.__VERSION__),
        )
    except InvalidVersion as e:
        util.debug_print(f"Invalid version string: {e}")
        return False

    return not (current < latest)
コード例 #8
0
ファイル: config_resolver.py プロジェクト: mtkis22/semgrep
def download_config(config_url: str) -> Dict[str, YamlTree]:
    """
    Download a rule config from ``config_url`` and parse it.

    The response must have an OK status code and a Content-Type containing
    one of the accepted YAML / plain-text types. Its body is decoded as UTF-8
    and handed to ``parse_config_string``.

    Raises:
        SemgrepError: on any failure inside the download/parse step (request
            error, bad status code, unknown content type, parse failure). The
            triggering exception is attached as ``__cause__``.
    """
    import requests  # here for faster startup times

    DOWNLOADING_MESSAGE = "downloading config..."
    yaml_types = (
        "text/plain",
        "application/x-yaml",
        "text/x-yaml",
        "text/yaml",
        "text/vnd.yaml",
    )

    debug_print(f"trying to download from {config_url}")
    print_stderr(
        f"using config from {nice_semgrep_url(config_url)}. Visit https://semgrep.live/registry to see all public rules."
    )
    print_stderr(DOWNLOADING_MESSAGE, end="\r")
    headers = {"User-Agent": SEMGREP_USER_AGENT}

    try:
        r = requests.get(config_url, stream=True, headers=headers, timeout=10)
        if r.status_code != requests.codes.ok:
            raise SemgrepError(
                f"bad status code: {r.status_code} returned by config url: {config_url}"
            )

        content_type = r.headers.get("Content-Type")
        # substring match so parameters such as "; charset=utf-8" are accepted
        if not content_type or not any(ct in content_type
                                       for ct in yaml_types):
            raise SemgrepError(
                f"unknown content-type: {content_type} returned by config url: {config_url}. Can not parse"
            )

        return parse_config_string(
            "remote-url",
            r.content.decode("utf-8"),
            filename=f"{config_url[:20]}...",
        )
    except Exception as e:
        # Every failure above (including the SemgrepErrors raised in this
        # block) is reported with the same "Failed to download" prefix; chain
        # the cause so the original traceback is preserved.
        raise SemgrepError(
            f"Failed to download config from {config_url}: {str(e)}") from e
コード例 #9
0
def _fetch_latest_version(
    url: str = VERSION_CHECK_URL, timeout: int = VERSION_CHECK_TIMEOUT
) -> Optional[str]:
    """
    Ask the version-check endpoint for the latest released version.

    Best effort: every failure is logged through ``util.debug_print`` and
    reported as ``None`` rather than raised.

    Args:
        url: endpoint to query.
        timeout: timeout passed to ``requests.get``.

    Returns:
        The value of the ``"latest"`` field of the JSON response as a string,
        or ``None`` if the request fails, the status is not OK, the body is
        not valid JSON, or the JSON has no ``"latest"`` field.
    """
    try:
        import requests

        resp = requests.get(
            url,
            headers={"User-Agent": f"Semgrep/{constants.__VERSION__}"},
            timeout=timeout,
        )
    except Exception as e:
        util.debug_print(f"Fetching latest version failed to connect: {e}")
        return None

    if resp.status_code != requests.codes.OK:
        util.debug_print(
            f"Fetching latest version received HTTP error code: {resp.status_code}"
        )
        return None

    try:
        resp_json = resp.json()
    except ValueError:
        util.debug_print("Fetching latest version received invalid JSON")
        return None

    try:
        return str(resp_json["latest"])
    except (KeyError, TypeError):
        # valid JSON, but not an object carrying a "latest" entry
        util.debug_print(
            "Fetching latest version received JSON without a 'latest' field")
        return None
コード例 #10
0
def _get_version_from_cache(version_cache_path: Path) -> Optional[str]:
    now = time.time()

    if version_cache_path.is_file():
        with version_cache_path.open() as f:
            timestamp_str = f.readline().strip()
            latest_version_str = f.readline().strip()

            try:
                # Treat time as integer seconds so no need to deal with str float conversion
                timestamp = int(timestamp_str)
            except ValueError:
                util.debug_print(
                    f"Version cache invalid timestamp: {timestamp_str}")
                return None

            one_day = 86400
            if now - timestamp > one_day:
                util.debug_print(
                    f"Version cache expired: {timestamp_str}:{now}")
                return None

            return latest_version_str

    util.debug_print("Version cache does not exist")
    return None
コード例 #11
0
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
        Evaluate the boolean expression of ``rule`` against the pattern
        matches found in a single file.

        Returns the RuleMatches whose range survived the evaluation, together
        with the list of debugging steps recorded along the way.
    """
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)

    # first debugging step: the distinct ranges each pattern matched
    initial_ranges = {}
    for pattern_id, matches in pattern_ids_to_pattern_matches.items():
        initial_ranges[pattern_id] = list({m.range for m in matches})
    steps_for_debugging = [{
        "filter": "initial",
        "pattern_id": None,
        "ranges": initial_ranges,
    }]

    debug_print(str(pattern_ids_to_pattern_matches))
    valid_ranges_to_output = evaluate_expression(
        rule.expression,
        pattern_ids_to_pattern_matches,
        flags={RCE_RULE_FLAG: allow_exec},
        steps_for_debugging=steps_for_debugging,
    )
    debug_print(f"compiled result {valid_ranges_to_output}")
    debug_print("-" * 80)

    output = []
    for pattern_match in pattern_matches:
        # only matches whose range survived the expression are reported
        if pattern_match.range not in valid_ranges_to_output:
            continue
        message = interpolate_message_metavariables(rule, pattern_match)
        fix = interpolate_fix_metavariables(rule, pattern_match)
        output.append(
            RuleMatch(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            ))

    return output, steps_for_debugging
コード例 #12
0
ファイル: semgrep_main.py プロジェクト: mtkis22/semgrep
def rule_match_nosem(rule_match: RuleMatch, strict: bool) -> bool:
    """
    Decide whether a finding is suppressed by an inline 'nosem' comment.

    A 'nosem' comment without ids suppresses the finding outright. With ids,
    the finding is suppressed only if one of them equals ``rule_match.id``;
    every id that does not is logged, or raised as SemgrepError when
    ``strict`` is set.
    """
    if not rule_match.lines:
        return False

    # Only the first line of a match is inspected. Accepting the comment on
    # any line of a match would make it unclear which finding a given 'nosem'
    # refers to.
    first_line = rule_match.lines[0]
    nosem = NOSEM_INLINE_RE.search(first_line)
    if nosem is None:
        return False

    ids_str = nosem.groupdict()["ids"]
    if ids_str is None:
        debug_print(
            f"found 'nosem' comment, skipping rule '{rule_match.id}' on line {rule_match.start['line']}"
        )
        return True

    listed_ids = set()
    for raw_id in COMMA_SEPARATED_LIST_RE.split(ids_str):
        cleaned = raw_id.strip()
        if cleaned:
            listed_ids.add(cleaned)

    suppressed = False
    for listed_id in listed_ids:
        if rule_match.id == listed_id:
            debug_print(
                f"found 'nosem' comment with id '{listed_id}', skipping rule '{rule_match.id}' on line {rule_match.start['line']}"
            )
            suppressed = True
            continue

        message = f"found 'nosem' comment with id '{listed_id}', but no corresponding rule trying '{rule_match.id}'"
        if strict:
            raise SemgrepError(message)
        debug_print(message)

    return suppressed
コード例 #13
0
ファイル: test.py プロジェクト: mtkis22/semgrep
def score_output_json(
    json_out: Dict[str, Any], test_files: List[Path], ignore_todo: bool
) -> Tuple[Dict[str, List[int]], Dict[str, Dict[str, Any]], int]:
    """
    Compare the lines semgrep reported with the annotations in the test files.

    Returns a 3-tuple:
      * per check id, the element-wise running sum of the confusion matrices
        computed for each file,
      * per check id and file path, the pair (expected lines, reported lines),
      * the number of lines carrying a todo-rule annotation.
    """

    def _lines_by_file_and_rule():
        # file path -> rule id -> list of line numbers
        return collections.defaultdict(lambda: collections.defaultdict(list))

    comment_lines: Dict[str, Dict[str, List[int]]] = _lines_by_file_and_rule()
    reported_lines: Dict[str, Dict[str, List[int]]] = _lines_by_file_and_rule()
    ignore_lines: Dict[str, List[int]] = collections.defaultdict(list)
    score_by_checkid: Dict[str, List[int]] = collections.defaultdict(
        lambda: [0, 0, 0, 0])
    expected_reported_by_check_id: Dict[str, Dict[
        str, Any]] = collections.defaultdict(dict)
    num_todo = 0

    for test_file in test_files:
        test_file_resolved = str(test_file.resolve())
        with open(test_file_resolved) as fin:
            all_lines = fin.readlines()

        # start=2: +1 because enumerate is 0 based and semgrep output is not,
        # plus 1 to skip the comment line itself
        for effective_line_num, line in enumerate(all_lines, start=2):
            todo_in_line = line_has_todo_rule(line)
            todo_ok_in_line = line_has_todo_ok(line)
            if todo_in_line:
                num_todo += 1
            if (not ignore_todo and todo_in_line) or line_has_rule(line):
                rule_id = normalize_rule_id(line)
                comment_lines[test_file_resolved][rule_id].append(
                    effective_line_num)
            if ignore_todo and todo_ok_in_line:
                ignore_lines[test_file_resolved].append(effective_line_num)

    for result in json_out["results"]:
        result_path = str(Path(result["path"]).resolve())
        reported_lines[result_path][result["check_id"]].append(
            int(result["start"]["line"]))

    def join_keys(a: Dict[str, Any], b: Dict[str, Any]) -> Set[str]:
        keys_of_a = set(a.keys())
        return keys_of_a.union(set(b.keys()))

    for file_path in join_keys(comment_lines, reported_lines):
        expected_for_file = comment_lines[file_path]
        reported_for_file = reported_lines[file_path]
        for check_id in join_keys(expected_for_file, reported_for_file):
            all_reported = set(reported_for_file[check_id])
            expected = set(expected_for_file[check_id])
            ignored = set(ignore_lines[file_path])

            reported = all_reported - ignored

            new_cm = compute_confusion_matrix(reported, expected)
            debug_print(
                f"reported lines for check {check_id}: {sorted(reported)}, expected lines: {sorted(expected)} (ignored: {sorted(ignored)}, confusion matrix: {new_cm}"
            )
            expected_reported_by_check_id[check_id][file_path] = (expected,
                                                                  reported)
            # TODO: -- re-enable this
            # assert len(set(reported_lines[file_path][check_id])) == len(
            #    reported_lines[file_path][check_id]
            # ), f"for testing, please don't make rules that fire multiple times on the same line ({check_id} in {file_path} on lines {reported_lines[file_path][check_id]})"
            old_cm = score_by_checkid[check_id]
            score_by_checkid[check_id] = [
                old_cm[i] + count for i, count in enumerate(new_cm)
            ]

    return (score_by_checkid, expected_reported_by_check_id, num_todo)
コード例 #14
0
ファイル: semgrep_main.py プロジェクト: mtkis22/semgrep
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    config: str,
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
) -> None:
    """
    Load the configs, run their rules over the targets and report the results.

    Config-loading errors and semgrep-core errors are passed to
    ``output_handler``; findings are filtered through ``rule_match_nosem``
    unless ``disable_nosem`` is set, then passed to ``output_handler`` and,
    with ``autofix``, to ``apply_fixes``.

    Raises:
        SemgrepError: when ``strict`` is set and any config failed to load, or
            when no ``pattern`` was given and no valid config was found.
    """
    include = [] if include is None else include
    exclude = [] if exclude is None else exclude

    valid_configs, config_errors = get_config(pattern, lang, config)
    output_handler.handle_semgrep_errors(config_errors)

    if strict and config_errors:
        raise SemgrepError(
            f"run with --strict and there were {len(config_errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    if not pattern:
        config_count = len(valid_configs)
        error_count = len(config_errors)

        plural = "s" if config_count > 1 else ""
        config_id_if_single = ""
        if config_count == 1:
            config_id_if_single = list(valid_configs.keys())[0]
        invalid_msg = ""
        if error_count:
            invalid_msg = f"({error_count} config files were invalid)"
        debug_print(
            f"running {len(all_rules)} rules from {config_count} config{plural} {config_id_if_single} {invalid_msg}"
        )

        notify_user_of_work(all_rules, include, exclude)

        if config_count == 0:
            raise SemgrepError(
                f"no valid configuration file found ({error_count} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )

    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=not no_git_ignore,
    )

    # actually invoke semgrep
    core_runner = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
    )
    (rule_matches_by_rule, debug_steps_by_rule,
     semgrep_core_errors) = core_runner.invoke_semgrep(target_manager,
                                                       all_rules)

    output_handler.handle_semgrep_errors(
        [e.into_semgrep_error() for e in semgrep_core_errors])

    if not disable_nosem:
        # drop every finding silenced by an inline 'nosem' comment
        kept_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            kept_by_rule[rule] = [
                candidate for candidate in rule_matches
                if not rule_match_nosem(candidate, strict)
            ]
        rule_matches_by_rule = kept_by_rule

    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule)

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
コード例 #15
0
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
            Run a single rule against its targets and collect the findings.

            For every language returned by ``_group_patterns_by_language``:
              * regex patterns are matched in Python with ``get_re_matches``
                through a multiprocessing pool;
              * the remaining patterns are written to a temporary rules file
                and handed to the binary at SEMGREP_PATH (semgrep-core)
                together with a temporary file listing the targets.
            All pattern matches are then grouped by file path and passed to
            ``evaluate``.

            Returns a tuple of (deduplicated findings, debugging steps from
            the last ``evaluate`` call, errors reported by semgrep-core).

            Raises UnknownLanguageError when ``target_manager.get_files``
            rejects the language, and SemgrepError for an invalid regular
            expression or unexpected semgrep-core output on a non-zero exit.
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
            # resolve the files this rule applies to for this language
            try:
                targets = target_manager.get_files(language, rule.includes,
                                                   rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex

            # nothing to scan for this language
            if targets == []:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]

                # compile up front so a bad regex is reported before any file is read
                try:
                    patterns_re = [(pattern["id"],
                                    re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")

                # one get_re_matches call per target file, spread over self._jobs workers
                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                # flatten the per-file lists into the shared output list
                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            # the non-regex patterns, targets and equivalences are passed to
            # the subprocess through temporary files referenced by name
            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                # flush so the content is written out before the subprocess opens the file
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                # NOTE(review): after a non-zero exit this point is only
                # reached if _raise_semgrep_error_from_json returns instead of
                # raising - confirm that it always raises.
                output_json = json.loads(
                    (core_run.stdout.decode("utf-8", "replace")))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        # NOTE(review): the loop variable below shadows the `rule` parameter;
        # by_rule_index is only ever filled under that one key.
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
コード例 #16
0
ファイル: test.py プロジェクト: mtkis22/semgrep
def generate_file_pairs(location: Path, ignore_todo: bool, strict: bool,
                        semgrep_verbose: bool, unsafe: bool) -> None:
    """
    Run every YAML rule file under ``location`` against its test files, print
    a scoring report and exit the process.

    A test file is any non-YAML regular file whose path, with the suffix
    removed, equals the rule file's path with its suffix removed. Exits with
    status 0 when no check has false positives or false negatives, otherwise
    1; also exits with 1 when ``strict`` is set and any semgrep invocation
    failed.
    """
    filenames = list(location.rglob("*"))
    no_tests = []
    tested = []
    semgrep_error = []
    print("starting tests...")

    for filename in filenames:
        is_visible_rule_file = (
            filename.suffix in YML_EXTENSIONS
            and not filename.name.startswith(".")
            and not filename.parent.name.startswith("."))
        if not is_visible_rule_file:
            continue

        # same path as the yaml file once the extension is dropped, and not
        # itself a yaml file
        rule_path_without_ext = str(filename.with_suffix(""))
        test_files = [
            candidate for candidate in filenames
            if str(candidate.with_suffix("")) == rule_path_without_ext
            and candidate.suffix not in YML_EXTENSIONS and candidate.is_file()
        ]
        if not test_files:
            no_tests.append(filename)
            continue

        # invoke semgrep
        try:
            output_json = invoke_semgrep(
                filename,
                test_files,
                no_git_ignore=True,
                no_rewrite_rule_ids=True,
                strict=strict,
                dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
            )
            scored = score_output_json(output_json, test_files, ignore_todo)
            tested.append((filename, scored))
        except Exception as ex:
            print(
                f"semgrep error running with config {filename} on {test_files}:\n{ex}"
            )
            semgrep_error.append(filename)

    if strict and semgrep_error:
        print("exiting due to semgrep/config errors and strict flag")
        sys.exit(1)

    separator = "=" * 80
    print(f"{len(no_tests)} yaml files missing tests")
    debug_print(f"missing tests: {no_tests}")
    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print(separator)
    failed_tests = []
    total_confusion = [0, 0, 0, 0]

    for filename, (output, expected_reported_by_check_id, num_todo) in tested:
        print(filename)
        if not output:
            print(f"  no checks fired (TODOs: {num_todo})")
        todo_text = f"(TODOs: {num_todo})" if num_todo > 0 else ""
        for check_id, (tp, tn, fp, fn) in output.items():
            good = fp == 0 and fn == 0
            if good:
                status = "✔"
            else:
                status = "✖"
                failed_tests.append((filename, check_id,
                                     expected_reported_by_check_id[check_id]))
            confusion = [tp, tn, fp, fn]
            # add to the total confusion matrix
            total_confusion = [
                running + current
                for running, current in zip(total_confusion, confusion)
            ]
            print(
                f"  {status} - {check_id.ljust(60)}{confusion_matrix_to_string(confusion)} {todo_text}"
            )

    print(separator)
    print(
        f"final confusion matrix: {confusion_matrix_to_string(total_confusion)}"
    )
    print(separator)

    if not failed_tests:
        print("all tests passed")
        sys.exit(0)
    else:
        print("failing rule files: ")
        for filename, check_id, failed_test_files in failed_tests:
            print(f" ✖ FAILED rule file: {filename} check: {check_id}")
            for test_file_path, lines in failed_test_files.items():
                expected, reported = lines
                print(
                    f"              in test: {test_file_path}, expected lines: {sorted(expected)} != reported: {sorted(reported)}"
                )
        print(
            f"{len(failed_tests)} checks failed tests (run with verbose flag for more details)"
        )
        sys.exit(1)
コード例 #17
0
def _evaluate_single_expression(
    expression: BooleanRuleExpression,
    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]],
    ranges_left: Set[Range],
    steps_for_debugging: List[Dict[str, Any]],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:
    """
    Apply one leaf expression to ``ranges_left`` and return the survivors.

    ``ranges_left`` is not modified; a new set is returned. For every operator
    except AND the result is also logged and appended to
    ``steps_for_debugging``.

    Raises:
        SemgrepError: for WHERE_PYTHON when ``flags`` does not enable
            RCE_RULE_FLAG.
        UnknownOperatorError: for an operator not handled here.
    """
    assert expression.pattern_id, f"<internal error: expected pattern id: {expression}>"
    operator = expression.operator
    results_for_pattern = [
        match.range
        for match in pattern_ids_to_pattern_matches.get(
            expression.pattern_id, [])
    ]

    def _record(output_ranges):
        # log the filter result, remember it as a debugging step, hand it back
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        steps_for_debugging.append({
            "filter": pattern_name_for_operator(expression.operator),
            "pattern_id": expression.pattern_id,
            "ranges": list(output_ranges),
        })
        return output_ranges

    def _is_enclosed(candidate):
        # true once any range of this pattern encloses or equals the candidate
        return any(
            outer.is_enclosing_or_eq(candidate)
            for outer in results_for_pattern)

    if operator == OPERATORS.AND:
        # keep only the ranges this pattern also matched
        return ranges_left.intersection(results_for_pattern)

    if operator == OPERATORS.AND_NOT:
        # drop the ranges this pattern matched
        return _record(ranges_left.difference(results_for_pattern))

    if operator == OPERATORS.AND_INSIDE:
        # keep the ranges enclosed by (or equal to) one of this pattern's ranges
        return _record(
            {candidate for candidate in ranges_left if _is_enclosed(candidate)})

    if operator == OPERATORS.AND_NOT_INSIDE:
        # drop the ranges enclosed by (or equal to) one of this pattern's ranges
        survivors = ranges_left.copy()
        for candidate in ranges_left:
            if _is_enclosed(candidate):
                survivors.remove(candidate)
        return _record(survivors)

    if operator == OPERATORS.WHERE_PYTHON:
        if not (flags and flags[RCE_RULE_FLAG]):
            raise SemgrepError(
                f"at least one rule needs to execute arbitrary code; this is dangerous! if you want to continue, enable the flag: {RCE_RULE_FLAG}",
                code=NEED_ARBITRARY_CODE_EXEC_EXIT_CODE,
            )
        assert expression.operand, "must have operand for this operator type"

        accepted = set()
        for pattern_match in list(
                flatten(pattern_ids_to_pattern_matches.values())):
            # the where-python clause only matters for ranges not yet filtered out
            if pattern_match.range not in ranges_left:
                continue
            debug_print(
                f"WHERE is {expression.operand}, metavars: {pattern_match.metavars}"
            )
            if _where_python_statement_matches(expression.operand,
                                               pattern_match.metavars):
                accepted.add(pattern_match.range)
        return _record(accepted)

    if operator == OPERATORS.REGEX:
        # keep only the ranges this pattern also matched
        return _record(ranges_left.intersection(results_for_pattern))

    raise UnknownOperatorError(f"unknown operator {expression.operator}")