Example #1
def flatten_rule_patterns(all_rules) -> Iterator[Dict[str, Any]]:
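    """Flatten each rule into one entry per sgrep-eligible pattern.

    Each yielded entry gets a check id of the form "<rule_index>.<pattern_id>"
    so matches can later be traced back to the rule they came from.
    """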
    for rule_index, rule in enumerate(all_rules):
        flat_expressions = list(
            enumerate_patterns_in_boolean_expression(
                build_boolean_expression(rule)))
        for expr in flat_expressions:
            if not should_send_to_sgrep(expr):
                continue
            # copy mutable values like `languages`: if several entries share the
            # same list object, the YAML dumper refers to it with an anchor, and
            # the sgrep YAML parser doesn't support anchors
            new_check_id = f"{rule_index}.{expr.pattern_id}"
            yield {
                "id": new_check_id,
                "pattern": expr.operand,
                "severity": rule["severity"],
                "languages": rule["languages"].copy(),
                "message": "<internalonly>",
            }
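A minimal usage sketch (not from the original source): the rule dict below is hypothetical, since the real rule schema and the helpers build_boolean_expression, enumerate_patterns_in_boolean_expression, and should_send_to_sgrep live elsewhere in the sgrep codebase.

# Hypothetical rule; the exact rule schema is defined elsewhere in the codebase.
rules = [{
    "id": "no-eval",
    "severity": "ERROR",
    "languages": ["python"],
    "pattern": "eval(...)",
    "message": "avoid eval",
}]

# Each yielded entry carries a synthetic check id of the form
# "<rule_index>.<pattern_id>", e.g. "0.0", so matches can later be
# mapped back to the rule they came from.
for flat_pattern in flatten_rule_patterns(rules):
    print(flat_pattern["id"], flat_pattern["pattern"])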
Example #2
def validate_patterns(valid_configs: Dict[str, Any]) -> List[str]:
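    """Ask sgrep to parse every pattern in every valid config, once per
    language the rule declares.

    Returns the list of pattern operands that failed to parse; an error is
    printed for each failure.
    """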
    invalid: List[str] = []
    for config_id, config in valid_configs.items():
        rules = config.get(RULES_KEY, [])
        for rule in rules:
            expressions = enumerate_patterns_in_boolean_expression(
                build_boolean_expression(rule)
            )
            for expr in expressions:
                # avoid patterns that don't have pattern_ids, like pattern-either
                if not should_send_to_sgrep(expr):
                    continue
                for language in rule["languages"]:
                    if not validate_pattern_with_sgrep(expr.operand, language):  # type: ignore
                        invalid.append(expr.operand)  # type: ignore
                        print_error(
                            f"in {config_id}, pattern in rule {rule['id']} can't be parsed for language {language}: {expr.operand}"
                        )
    return invalid
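A hedged sketch of driving the validator. The config shape below is an assumption; RULES_KEY and validate_pattern_with_sgrep are defined elsewhere in the codebase, and RULES_KEY is assumed here to be "rules".

# Hypothetical config dict keyed by config id; the rule is illustrative.
valid_configs = {
    "my-config.yml": {
        "rules": [
            {
                "id": "no-eval",
                "languages": ["python"],
                "pattern": "eval(...)",
            },
        ],
    },
}

invalid_patterns = validate_patterns(valid_configs)
if invalid_patterns:
    print(f"{len(invalid_patterns)} patterns failed to parse")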
Example #3
def validate_single_rule(config_id: str, rule_index: int, rule: Dict[str, Any]) -> bool:
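    """Check one rule for schema validity: it must contain all required keys,
    contain only known keys, and build into a valid boolean expression.

    Returns True if the rule is valid, False otherwise (printing an error).
    """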
    rule_id_err_msg = f'(rule id: {rule.get("id", MISSING_RULE_ID)})'
    if not set(rule.keys()).issuperset(YAML_MUST_HAVE_KEYS):
        print_error(
            f"{config_id} is missing keys at rule {rule_index+1} {rule_id_err_msg}, must have: {YAML_MUST_HAVE_KEYS}"
        )
        return False
    if not set(rule.keys()).issubset(YAML_ALL_VALID_RULE_KEYS):
        print_error(
            f"{config_id} has invalid rule key at rule {rule_index+1} {rule_id_err_msg}, can only have: {YAML_ALL_VALID_RULE_KEYS}"
        )
        return False
    try:
        _ = build_boolean_expression(rule)
    except InvalidRuleSchema as ex:
        print_error(
            f"{config_id}: inside rule {rule_index+1} {rule_id_err_msg}, pattern fields can't look like this: {ex}"
        )
        return False

    return True
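A small sketch of validating one rule in isolation, with illustrative values; YAML_MUST_HAVE_KEYS, YAML_ALL_VALID_RULE_KEYS, and MISSING_RULE_ID are constants defined elsewhere in the codebase.

# Hypothetical rule dict; the real required/allowed key sets come from
# YAML_MUST_HAVE_KEYS and YAML_ALL_VALID_RULE_KEYS.
rule = {
    "id": "no-eval",
    "languages": ["python"],
    "severity": "ERROR",
    "message": "avoid eval",
    "pattern": "eval(...)",
}

# Prints an error and returns False on the first schema violation it finds.
is_valid = validate_single_rule("my-config.yml", 0, rule)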
Example #4
def main(args: argparse.Namespace):
    """ main function that parses args and runs sgrep """

    # resolve the target paths, e.g. handle the /home/repo base path when running in docker
    targets = config_resolver.resolve_targets(args.target)

    # first check if user asked to generate a config
    if args.generate_config:
        config_resolver.generate_config()

    # let's check for a pattern
    elif args.pattern:
        # and a language
        if not args.lang:
            print_error_exit(
                "language must be specified when a pattern is passed")
        lang = args.lang
        pattern = args.pattern

        # TODO for now we generate a manual config. Might want to just call sgrep -e ... -l ...
        configs = config_resolver.manual_config(pattern, lang)
    else:
        # otherwise resolve a config: a dict from config_id -> config.
        # What counts as a config_id is not well defined at this point.
        configs = config_resolver.resolve_config(args.config)

    # if we can't find a config, bail out and point the user at the default r2c rules
    if not configs:
        print_error_exit(
            "No config given. If you want to see some examples, try running with --config r2c"
        )

    # split the configs into valid and invalid ones. A config_id can appear in
    # both, because we validate each rule individually instead of hard-failing
    # the whole config when part of it is malformed
    valid_configs, errors = validate_configs(configs)

    validate = args.validate
    strict = args.strict

    if errors:
        if strict:
            print_error_exit(
                f"run with --strict and there were {len(errors)} errors loading configs"
            )
        elif validate:
            print_error_exit(
                f"run with --validate and there were {len(errors)} errors loading configs"
            )
    elif validate:  # no errors!
        print_error_exit("Config is valid", exit_code=0)

    if not args.no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # now validate all the patterns inside the configs
    if not args.skip_pattern_validation:
        start_validate_t = time.time()
        invalid_patterns = validate_patterns(valid_configs)
        if invalid_patterns:
            print_error_exit(
                f"{len(invalid_patterns)} invalid patterns found inside rules; aborting"
            )
        debug_print(f"debug: validated config in {time.time() - start_validate_t}")

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    if not args.pattern:
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (
            list(valid_configs.keys())[0] if len(valid_configs) == 1 else ""
        )
        invalid_msg = f"({len(errors)} config files were invalid)" if errors else ""
        print_msg(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
    # TODO log valid and invalid configs if verbose

    # a rule can contain multiple patterns; flatten them so we can send sgrep a single YAML list of patterns
    all_patterns = list(flatten_rule_patterns(all_rules))

    # actually invoke sgrep
    start = datetime.now()
    output_json = invoke_sgrep(all_patterns, targets, strict)
    debug_print(f"sgrep ran in {datetime.now() - start}")
    debug_print(str(output_json))

    # group the output by rule index, then by file path, so all matches for
    # the same rule on the same file end up together
    by_rule_index: Dict[int, Dict[str, List[Dict[str, Any]]]] = collections.defaultdict(
        lambda: collections.defaultdict(list)
    )

    for sgrep_error in output_json["errors"]:
        print_error(f"sgrep: {sgrep_error['path']}: {sgrep_error['check_id']}")

    if strict and output_json["errors"]:
        print_error_exit(
            f"run with --strict and {len(output_json['errors'])} errors occurred during sgrep run; exiting"
        )

    for finding in output_json["matches"]:
        # decode the rule index from the output check_id
        rule_index = int(finding["check_id"].split(".")[0])
        by_rule_index[rule_index][finding["path"]].append(finding)

    current_path = Path.cwd()
    outputs_after_booleans = []
    ignored_in_tests = 0
    for rule_index, paths in by_rule_index.items():
        expression = build_boolean_expression(all_rules[rule_index])
        debug_print(str(expression))
        for filepath, results in paths.items():
            debug_print(
                f"-------- rule (index {rule_index}) {all_rules[rule_index]['id']}------ filepath: {filepath}"
            )
            check_ids_to_ranges = parse_sgrep_output(results)
            debug_print(str(check_ids_to_ranges))
            valid_ranges_to_output = evaluate_expression(
                expression,
                check_ids_to_ranges,
                flags={
                    RCE_RULE_FLAG:
                    args.dangerously_allow_arbitrary_code_execution_from_rules
                },
            )

            # only output matches which are inside these offsets!
            debug_print(f"compiled result {valid_ranges_to_output}")
            debug_print("-" * 80)
            for result in results:
                if sgrep_finding_to_range(result).range not in valid_ranges_to_output:
                    continue
                path_object = Path(result["path"])
                if args.exclude_tests and should_exclude_this_path(path_object):
                    ignored_in_tests += 1
                    continue

                # restore the original rule ID
                result["check_id"] = all_rules[rule_index]["id"]
                # rewrite the path to be relative to the current working directory
                result["path"] = str(safe_relative_to(path_object, current_path))

                # restore the original message
                result["extra"]["message"] = rewrite_message_with_metavars(
                    all_rules[rule_index], result
                )
                result = transform_to_r2c_output(result)
                outputs_after_booleans.append(result)

    if ignored_in_tests > 0:
        print_error(
            f"warning: ignored {ignored_in_tests} results in tests due to --exclude-tests option"
        )

    # output results
    output_data = {"results": outputs_after_booleans}
    if not args.quiet:
        if args.json:
            print(build_output_json(output_data))
        else:
            print("\n".join(build_normal_output(output_data,
                                                color_output=True)))
    if args.output:
        save_output(args.output, output_data, args.json)
    if args.error and outputs_after_booleans:
        sys.exit(FINDINGS_EXIT_CODE)

    return output_data
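A hedged sketch of wiring this entry point to argparse. The flag names below are guesses derived from the attributes main() reads; the real parser (and its exact flag spellings) lives elsewhere in the sgrep codebase.

# Hypothetical parser covering every attribute main() reads; real flag
# names may differ.
parser = argparse.ArgumentParser(prog="sgrep")
parser.add_argument("target", nargs="*", default=["."])
parser.add_argument("--config")
parser.add_argument("--pattern", "-e")
parser.add_argument("--lang", "-l")
parser.add_argument("--generate-config", action="store_true")
parser.add_argument("--validate", action="store_true")
parser.add_argument("--strict", action="store_true")
parser.add_argument("--no-rewrite-rule-ids", action="store_true")
parser.add_argument("--skip-pattern-validation", action="store_true")
parser.add_argument(
    "--dangerously-allow-arbitrary-code-execution-from-rules",
    action="store_true",
)
parser.add_argument("--exclude-tests", action="store_true")
parser.add_argument("--quiet", action="store_true")
parser.add_argument("--json", action="store_true")
parser.add_argument("--output")
parser.add_argument("--error", action="store_true")

main(parser.parse_args())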