import argparse
import collections
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterator, List

# NOTE: project-internal helpers and constants (config_resolver,
# build_boolean_expression, enumerate_patterns_in_boolean_expression,
# should_send_to_sgrep, print_error, print_error_exit, debug_print,
# RULES_KEY, YAML_MUST_HAVE_KEYS, etc.) are assumed to be imported or
# defined elsewhere in this module.


def flatten_rule_patterns(all_rules: List[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    for rule_index, rule in enumerate(all_rules):
        flat_expressions = list(
            enumerate_patterns_in_boolean_expression(build_boolean_expression(rule))
        )
        for expr in flat_expressions:
            if not should_send_to_sgrep(expr):
                continue
            # if we don't copy an array (like `languages`), the yaml file will refer
            # to it by reference (with an anchor), which is nice and all but the
            # sgrep YAML parser doesn't support that
            new_check_id = f"{rule_index}.{expr.pattern_id}"
            yield {
                "id": new_check_id,
                "pattern": expr.operand,
                "severity": rule["severity"],
                "languages": rule["languages"].copy(),
                "message": "<internalonly>",
            }
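
# Illustrative sketch (an assumption, not part of the original module): with a
# hypothetical single-pattern rule, the flattened entries look roughly like
# this; the exact pattern_id numbering depends on
# enumerate_patterns_in_boolean_expression.
#
#   rule = {
#       "id": "no-eval",
#       "severity": "ERROR",
#       "languages": ["python"],
#       "pattern": "eval(...)",
#       "message": "don't use eval",
#   }
#   list(flatten_rule_patterns([rule]))
#   # => [{"id": "0.0", "pattern": "eval(...)", "severity": "ERROR",
#   #      "languages": ["python"], "message": "<internalonly>"}]
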
def validate_patterns(valid_configs: Dict[str, Any]) -> List[str]:
    invalid: List[str] = []
    for config_id, config in valid_configs.items():
        rules = config.get(RULES_KEY, [])
        for rule in rules:
            expressions = enumerate_patterns_in_boolean_expression(
                build_boolean_expression(rule)
            )
            for expr in expressions:
                for language in rule["languages"]:
                    # skip expressions that don't have pattern_ids, like pattern-either
                    if should_send_to_sgrep(expr) and not validate_pattern_with_sgrep(
                        expr.operand, language  # type: ignore
                    ):
                        invalid.append(expr.operand)  # type: ignore
                        print_error(
                            f"in {config_id}, pattern in rule {rule['id']} can't be parsed for language {language}: {expr.operand}"
                        )
    return invalid
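
# Usage sketch (illustrative only): the return value is the list of pattern
# strings that sgrep could not parse, so an empty list means every pattern in
# every valid config compiled cleanly.
#
#   invalid = validate_patterns(valid_configs)
#   if invalid:
#       ...  # caller aborts; see main() below
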
def validate_single_rule(config_id: str, rule_index: int, rule: Dict[str, Any]) -> bool:
    rule_id_err_msg = f'(rule id: {rule.get("id", MISSING_RULE_ID)})'
    if not set(rule.keys()).issuperset(YAML_MUST_HAVE_KEYS):
        print_error(
            f"{config_id} is missing keys at rule {rule_index+1} {rule_id_err_msg}, must have: {YAML_MUST_HAVE_KEYS}"
        )
        return False
    if not set(rule.keys()).issubset(YAML_ALL_VALID_RULE_KEYS):
        print_error(
            f"{config_id} has an invalid rule key at rule {rule_index+1} {rule_id_err_msg}, can only have: {YAML_ALL_VALID_RULE_KEYS}"
        )
        return False
    try:
        _ = build_boolean_expression(rule)
    except InvalidRuleSchema as ex:
        print_error(
            f"{config_id}: inside rule {rule_index+1} {rule_id_err_msg}, pattern fields can't look like this: {ex}"
        )
        return False
    return True
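
# Happy-path sketch, assuming YAML_MUST_HAVE_KEYS requires roughly
# {"id", "message", "languages", "severity"} plus a pattern field (the real
# constants are defined elsewhere in this module):
#
#   rule = {
#       "id": "no-eval",
#       "message": "don't use eval",
#       "languages": ["python"],
#       "severity": "ERROR",
#       "pattern": "eval(...)",
#   }
#   validate_single_rule("my-config.yml", 0, rule)  # -> True
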
def main(args: argparse.Namespace) -> Dict[str, Any]:
    """Main function that parses args and runs sgrep."""
    # get the proper paths for targets, i.e. handle a base path of /home/repo
    # when it exists in docker
    targets = config_resolver.resolve_targets(args.target)

    # first check if the user asked to generate a config
    if args.generate_config:
        config_resolver.generate_config()
    # let's check for a pattern
    elif args.pattern:
        # and a language
        if not args.lang:
            print_error_exit("language must be specified when a pattern is passed")
        lang = args.lang
        pattern = args.pattern
        # TODO: for now we generate a manual config; might want to just call sgrep -e ... -l ...
        configs = config_resolver.manual_config(pattern, lang)
    else:
        # else let's get a config. A config is a dict from config_id -> config.
        # Config id is not well defined at this point.
        configs = config_resolver.resolve_config(args.config)

    # if we can't find a config, exit and point the user at the default r2c rules
    if not configs:
        print_error_exit(
            "No config given. If you want to see some examples, try running with --config r2c"
        )

    # let's split our configs into valid and invalid configs.
    # It's possible that a config_id exists in both because we check valid rules
    # and invalid rules instead of just hard-failing for that config if it's malformed
    valid_configs, errors = validate_configs(configs)

    validate = args.validate
    strict = args.strict

    if errors:
        if strict:
            print_error_exit(
                f"run with --strict and there were {len(errors)} errors loading configs"
            )
        elif validate:
            print_error_exit(
                f"run with --validate and there were {len(errors)} errors loading configs"
            )
    elif validate:
        # no errors!
        print_error_exit("Config is valid", exit_code=0)

    if not args.no_rewrite_rule_ids:
        # re-write the configs to have hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # now validate all the patterns inside the configs
    if not args.skip_pattern_validation:
        start_validate_t = time.time()
        invalid_patterns = validate_patterns(valid_configs)
        if invalid_patterns:
            print_error_exit(
                f"{len(invalid_patterns)} invalid patterns found inside rules; aborting"
            )
        debug_print(f"debug: validated config in {time.time() - start_validate_t}")

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    if not args.pattern:
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (
            list(valid_configs.keys())[0] if len(valid_configs) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        print_msg(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        # TODO: log valid and invalid configs if verbose

    # a rule can have multiple patterns inside it. Flatten these so we can send
    # sgrep a single yml file list of patterns
    all_patterns = list(flatten_rule_patterns(all_rules))

    # actually invoke sgrep
    start = datetime.now()
    output_json = invoke_sgrep(all_patterns, targets, strict)
    debug_print(f"sgrep ran in {datetime.now() - start}")
    debug_print(str(output_json))

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[int, Dict[str, List[Dict[str, Any]]]] = collections.defaultdict(
        lambda: collections.defaultdict(list)
    )

    for finding in output_json["errors"]:
        print_error(f"sgrep: {finding['path']}: {finding['check_id']}")
    if strict and output_json["errors"]:
        print_error_exit(
            f"run with --strict and {len(output_json['errors'])} errors occurred during sgrep run; exiting"
        )

    for finding in output_json["matches"]:
        # decode the rule index from the output check_id
        rule_index = int(finding["check_id"].split(".")[0])
        by_rule_index[rule_index][finding["path"]].append(finding)

    current_path = Path.cwd()
    outputs_after_booleans = []
    ignored_in_tests = 0

    for rule_index, paths in by_rule_index.items():
        expression = build_boolean_expression(all_rules[rule_index])
        debug_print(str(expression))
        # expression = [(op, pattern_id) for (op, pattern_id, pattern) in expression_with_patterns]
        for filepath, results in paths.items():
            debug_print(
                f"-------- rule (index {rule_index}) {all_rules[rule_index]['id']} ------ filepath: {filepath}"
            )
            check_ids_to_ranges = parse_sgrep_output(results)
            debug_print(str(check_ids_to_ranges))
            valid_ranges_to_output = evaluate_expression(
                expression,
                check_ids_to_ranges,
                flags={
                    RCE_RULE_FLAG: args.dangerously_allow_arbitrary_code_execution_from_rules
                },
            )

            # only output matches which are inside these offsets!
            debug_print(f"compiled result {valid_ranges_to_output}")
            debug_print("-" * 80)
            for result in results:
                if sgrep_finding_to_range(result).range in valid_ranges_to_output:
                    path_object = Path(result["path"])
                    if args.exclude_tests and should_exclude_this_path(path_object):
                        ignored_in_tests += 1
                        continue

                    # restore the original rule ID
                    result["check_id"] = all_rules[rule_index]["id"]
                    # rewrite the path to be relative to the current working directory
                    result["path"] = str(safe_relative_to(path_object, current_path))
                    # restore the original message
                    result["extra"]["message"] = rewrite_message_with_metavars(
                        all_rules[rule_index], result
                    )
                    result = transform_to_r2c_output(result)
                    outputs_after_booleans.append(result)

    if ignored_in_tests > 0:
        print_error(
            f"warning: ignored {ignored_in_tests} results in tests due to --exclude-tests option"
        )

    # output results
    output_data = {"results": outputs_after_booleans}
    if not args.quiet:
        if args.json:
            print(build_output_json(output_data))
        else:
            print("\n".join(build_normal_output(output_data, color_output=True)))
    if args.output:
        save_output(args.output, output_data, args.json)
    if args.error and outputs_after_booleans:
        sys.exit(FINDINGS_EXIT_CODE)

    return output_data
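
# Minimal entry-point sketch (an assumption: the real sgrep CLI builds a much
# fuller parser elsewhere). The flag names below simply mirror the attributes
# main() reads from args; defaults are illustrative only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="sgrep CLI (sketch)")
    parser.add_argument("target", nargs="*", default=["."])
    parser.add_argument("--config")
    parser.add_argument("-e", "--pattern")
    parser.add_argument("-l", "--lang")
    parser.add_argument("--generate-config", action="store_true")
    parser.add_argument("--validate", action="store_true")
    parser.add_argument("--strict", action="store_true")
    parser.add_argument("--no-rewrite-rule-ids", action="store_true")
    parser.add_argument("--skip-pattern-validation", action="store_true")
    parser.add_argument(
        "--dangerously-allow-arbitrary-code-execution-from-rules",
        action="store_true",
    )
    parser.add_argument("--exclude-tests", action="store_true")
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--output")
    parser.add_argument("--error", action="store_true")
    main(parser.parse_args())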