def invoke_sgrep(
    all_rules: List[Dict[str, Any]], targets: List[Path], strict: bool
) -> Dict[str, Any]:
    """Returns parsed json output of sgrep.

    Invokes the sgrep binary once per language (rules are grouped by language),
    writing each language's rules to a temporary YAML file, and merges the
    per-invocation results.

    :param all_rules: flattened rule dicts to run
    :param targets: files/directories to scan
    :param strict: when True, ask sgrep to report parse/fatal errors
    :return: {"matches": [...], "errors": [...]} aggregated across invocations
    """
    outputs: List[Any] = []  # multiple invocations per language
    errors: List[Any] = []
    for language, all_rules_for_language in group_rule_by_langauges(all_rules).items():
        with tempfile.NamedTemporaryFile("w") as fout:
            # very important not to sort keys here
            yaml_as_str = yaml.safe_dump(
                {"rules": all_rules_for_language}, sort_keys=False
            )
            fout.write(yaml_as_str)
            fout.flush()
            extra_args = (
                ["-report_parse_errors", "-report_fatal_errors"] if strict else []
            )
            # fix: "-rules_file" was previously written as an f-string with no
            # placeholders; plain literal is equivalent and clearer
            cmd = [SGREP_PATH] + extra_args + [
                "-lang",
                language,
                "-rules_file",
                fout.name,
                *[str(path) for path in targets],
            ]
            try:
                # list argv + shell=False: no shell-injection surface
                output = subprocess.check_output(cmd, shell=False)
            except subprocess.CalledProcessError as ex:
                print_error(
                    f"non-zero return code while invoking sgrep with:\n\t{' '.join(cmd)}\n{ex}"
                )
                # print_error_exit terminates the process, so `output` below
                # is only read on the success path
                print_error_exit(f"\n\n{PLEASE_FILE_ISSUE_TEXT}")
            output_json = json.loads(output.decode("utf-8", "replace"))
            errors.extend(output_json["errors"])
            outputs.extend(output_json["matches"])
    return {"matches": outputs, "errors": errors}
def generate_config():
    """Write a starter sgrep config to DEFAULT_CONFIG_FILE and exit.

    Tries to download the latest template; on any failure falls back to a
    small built-in example rule. Never overwrites an existing config.
    Always terminates the process (exit 0 on success).
    """
    # defensive coding: refuse to clobber an existing config
    if Path(DEFAULT_CONFIG_FILE).exists():
        print_error_exit(
            f"{DEFAULT_CONFIG_FILE} already exists. Please remove and try again"
        )
    template_str = None
    try:
        resp = requests.get(TEMPLATE_YAML_URL, timeout=10)
        resp.raise_for_status()
        template_str = resp.text
    except Exception as err:
        debug_print(str(err))
        print_msg(
            f"There was a problem downloading the latest template config. Using fallback template"
        )
    if template_str is None:
        # NOTE(review): template reconstructed from a whitespace-mangled source;
        # confirm the YAML indentation matches the original fallback
        template_str = """rules:
  - id: eqeq-is-bad
    pattern: $X == $X
    message: "$X == $X is a useless equality check"
    languages: [python]
    severity: ERROR"""
    try:
        with open(DEFAULT_CONFIG_FILE, "w") as template:
            template.write(template_str)
        print_msg(
            f"Template config successfully written to {DEFAULT_CONFIG_FILE}"
        )
        sys.exit(0)
    except Exception as err:
        print_error_exit(err)
def download_config(config_url: str) -> Dict[str, Optional[Dict[str, Any]]]:
    """Fetch a config from a URL and parse it into {config_id: rules-or-None}.

    Handles two content types: plain text (a single YAML config) and a gzipped
    tarball (a folder of configs, GitHub-archive style). On any failure the
    function prints an error and returns {config_url: None} so callers can
    distinguish "tried and failed" from a successful parse.
    """
    debug_print(f"trying to download from {config_url}")
    try:
        r = requests.get(config_url, stream=True)
        if r.status_code == requests.codes.ok:
            content_type = r.headers.get("Content-Type")
            if content_type and "text/plain" in content_type:
                # single YAML document in the response body
                return parse_config_string("remote-url", r.content.decode("utf-8"))
            elif content_type and content_type == "application/x-gzip":
                # tarball: extract to a /tmp dir keyed by the (base64'd) URL
                fname = f"/tmp/{base64.b64encode(config_url.encode()).decode()}"
                # clear any leftover extraction from a previous run
                shutil.rmtree(fname, ignore_errors=True)
                with tarfile.open(fileobj=r.raw, mode="r:gz") as tar:
                    # NOTE(review): extractall on a remote archive is vulnerable
                    # to path traversal ("tar-slip") if the archive is
                    # malicious; consider validating member paths or using the
                    # `filter=` argument — confirm threat model for config URLs
                    tar.extractall(fname)
                extracted = Path(fname)
                for path in extracted.iterdir():
                    # get first folder in extracted folder (this is how GH does it)
                    return parse_config_folder(path, relative=True)
            else:
                print_error_exit(
                    f"unknown content-type: {content_type} returned by config url: {config_url}. Can not parse"
                )
                # unreachable: print_error_exit terminates the process
                assert False
        else:
            print_error_exit(
                f"bad status code: {r.status_code} returned by config url: {config_url}"
            )
            assert False
    except Exception as e:
        # best-effort: report and fall through to the "no config" sentinel
        print_error(e)
    return {config_url: None}
def load_config_from_local_path(
    location: Optional[str] = None,
) -> Dict[str, Optional[Dict[str, Any]]]:
    """Load config(s) from the filesystem, relative to the base path.

    With no explicit location, tries the default config file then the default
    config folder. With a location, parses it as a file or folder; a missing
    or non-file/folder path is a fatal error.
    """
    base_path = get_base_path()

    if location is None:
        # fall back through the conventional locations
        default_file = base_path.joinpath(DEFAULT_CONFIG_FILE)
        if default_file.exists():
            return parse_config_at_path(default_file)
        default_folder = base_path.joinpath(DEFAULT_CONFIG_FOLDER)
        if default_folder.exists():
            return parse_config_folder(default_folder, relative=True)
        # nothing found: sentinel keyed by the file we looked for
        return {str(default_file): None}

    loc = base_path.joinpath(location)
    if not loc.exists():
        addendum = ""
        if IN_DOCKER:
            addendum = " (since you are running in docker, you cannot specify arbitary paths on the host; they must be mounted into the container)"
        print_error_exit(
            f"unable to find a config; path `{loc}` does not exist{addendum}"
        )
        assert False  # unreachable: print_error_exit terminates
    if loc.is_file():
        return parse_config_at_path(loc)
    if loc.is_dir():
        return parse_config_folder(loc)
    print_error_exit(f"config location `{loc}` is not a file or folder!")
    assert False
def dump_parsed_ast(
    to_json: bool, language: str, pattern: Optional[str], targets: List[Path]
) -> None:
    """Shell out to sgrep to print the AST of a pattern or of one target file.

    When a pattern is given it is written to a temp file and dumped via
    -dump_pattern; otherwise exactly one target is required and dumped via
    -dump_ast. -json is prepended when requested.
    """
    with tempfile.NamedTemporaryFile("w") as fout:
        if pattern:
            # sgrep reads the pattern from a file, so stage it in the tempfile
            fout.write(pattern)
            fout.flush()
            args = ["-lang", language, "-dump_pattern", fout.name]
        else:
            if len(targets) != 1:
                print_error_exit("exactly one target file is required with this option")
            args = ["-lang", language, "-dump_ast", str(targets[0])]
        if to_json:
            args = ["-json"] + args
        cmd = [SGREP_PATH] + args
        try:
            output = subprocess.check_output(cmd, shell=False)
        except subprocess.CalledProcessError as ex:
            print_error(f"error invoking sgrep with:\n\t{' '.join(cmd)}\n{ex}")
            print_error_exit(f"\n\n{PLEASE_FILE_ISSUE_TEXT}")
        print(output.decode())
def adjust_for_docker(in_precommit: bool = False):
    """When running inside the sgrep docker image, chdir into the mounted repo.

    No-op outside docker, inside a GitHub Action, or during pre-commit.
    Exits with an error if the expected mount point is missing.
    """
    # change into this folder so that all paths are relative to it
    if not (IN_DOCKER and not IN_GH_ACTION and not in_precommit):
        return
    if not Path(REPO_HOME_DOCKER).exists():
        print_error_exit(
            f"you are running sgrep in docker, but you forgot to mount the current directory in Docker: missing: -v $(pwd):{REPO_HOME_DOCKER}"
        )
    # re-check mirrors the original defensive structure
    if Path(REPO_HOME_DOCKER).exists():
        os.chdir(REPO_HOME_DOCKER)
def test_main(args):
    """Entry point for `sgrep --test`: run rule tests under one target directory."""
    # self-test of the confusion-matrix helper before running user tests
    _test_compute_confusion_matrix()
    if len(args.target) != 1:
        print_error_exit("only one target directory allowed for tests")
    target = Path(args.target[0])
    # NOTE(review): args.verbose is passed twice (3rd and 5th positional
    # argument). Verify against main()'s signature — one of these positions
    # may have been intended for a different flag.
    main(
        target,
        args.test_ignore_todo,
        args.verbose,
        args.strict,
        args.verbose,
        args.dangerously_allow_arbitrary_code_execution_from_rules,
    )
def _where_python_statement_matches(where_expression: str, metavars: Dict[str, str]) -> bool: # TODO: filter out obvious dangerous things here global output output = None # type: ignore # HACK: we're executing arbitrary Python in the where-python, # be careful my friend vars = metavars try: exec(f"global output; output = {where_expression}") except Exception as ex: print_error( f"error evaluating a where-python expression: `{where_expression}`: {ex}" ) if type(output) != type(True): # type: ignore print_error_exit( # type: ignore f"python where expression needs boolean output but got: {output} for {where_expression}" # type: ignore ) # type: ignore return output == True # type: ignore
# logging options logging = parser.add_argument_group("logging") logging.add_argument( "-v", "--verbose", help=f"Sets the logging level to verbose. E.g. statements about which files are being processed will be printed.", action="store_true", ) ### Parse and validate args = parser.parse_args() if args.lang and not args.pattern or (args.pattern and not args.lang): parser.error("-e/--pattern and -l/--lang must both be specified") # set the flags set_flags(args.verbose, args.quiet) # change cwd if using docker config_resolver.adjust_for_docker(args.precommit) try: if args.test: test.test_main(args) else: sgrep_main.main(args) except NotImplementedError as ex: print_error_exit( f"sgrep encountered an error: {ex}; this is not your fault. {PLEASE_FILE_ISSUE_TEXT}" )
def _evaluate_single_expression(
    expression: BooleanRuleExpression,
    results: Dict[PatternId, List[SgrepRange]],
    ranges_left: Set[Range],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:
    """Apply one operator of a rule's boolean expression to the candidate ranges.

    Each operator filters `ranges_left` (the ranges that survived previous
    operators) against the ranges matched by this expression's pattern, and
    returns the surviving set.
    """
    assert expression.pattern_id, f"<internal error: expected pattern id: {expression}>"
    # all ranges this expression's pattern matched (empty list if it matched nothing)
    results_for_pattern = [x.range for x in results.get(expression.pattern_id, [])]
    if expression.operator == OPERATORS.AND:
        # remove all ranges that don't equal the ranges for this pattern
        return ranges_left.intersection(results_for_pattern)
    elif expression.operator == OPERATORS.AND_NOT:
        # remove all ranges that DO equal the ranges for this pattern
        # difference_update = Remove all elements of another set from this set.
        return ranges_left.difference(results_for_pattern)
    elif expression.operator == OPERATORS.AND_INSIDE:
        # remove all ranges (not enclosed by) or (not equal to) the inside ranges
        output_ranges = set()
        for arange in ranges_left:
            for keep_inside_this_range in results_for_pattern:
                is_enclosed = keep_inside_this_range.is_enclosing_or_eq(arange)
                # print(
                #     f'candidate range is {arange}, needs to be `{operator}` {keep_inside_this_range}; keep?: {keep}')
                if is_enclosed:
                    output_ranges.add(arange)
                    break  # found a match, no need to keep going
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges
    elif expression.operator == OPERATORS.AND_NOT_INSIDE:
        # remove all ranges enclosed by or equal to
        output_ranges = ranges_left.copy()
        for arange in ranges_left:
            for keep_inside_this_range in results_for_pattern:
                if keep_inside_this_range.is_enclosing_or_eq(arange):
                    output_ranges.remove(arange)
                    break
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges
    elif expression.operator == OPERATORS.WHERE_PYTHON:
        # arbitrary-code execution must be explicitly enabled by the caller
        # NOTE(review): flags[RCE_RULE_FLAG] raises KeyError if the key is
        # absent (only `flags` being falsy is guarded) — confirm callers
        # always populate RCE_RULE_FLAG when passing flags
        if not flags or flags[RCE_RULE_FLAG] != True:
            print_error_exit(
                f"at least one rule needs to execute arbitrary code; this is dangerous! if you want to continue, enable the flag: {RCE_RULE_FLAG}"
            )
        assert expression.operand, "must have operand for this operator type"
        output_ranges = set()
        # Look through every range that hasn't been filtered yet
        for sgrep_range in list(flatten(results.values())):
            # Only need to check where-python clause if the range hasn't already been filtered
            if sgrep_range.range in ranges_left:
                debug_print(
                    f"WHERE is {expression.operand}, metavars: {sgrep_range.metavars}"
                )
                if _where_python_statement_matches(
                    expression.operand, sgrep_range.metavars
                ):
                    output_ranges.add(sgrep_range.range)
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges
    else:
        raise NotImplementedError(f"unknown operator {expression.operator}")
def main(args: argparse.Namespace):
    """ main function that parses args and runs sgrep

    Pipeline: resolve targets -> obtain configs (generated / from pattern /
    from file or URL) -> validate configs and patterns -> invoke sgrep once
    per language -> apply each rule's boolean expression to group and filter
    raw matches -> emit results (stdout and/or file).
    Returns the output data dict; may sys.exit on --error with findings.
    """
    # get the proper paths for targets i.e. handle base path of /home/repo when it exists in docker
    targets = config_resolver.resolve_targets(args.target)

    # first check if user asked to generate a config
    if args.generate_config:
        config_resolver.generate_config()
    # let's check for a pattern
    elif args.pattern:
        # and a language
        if not args.lang:
            print_error_exit("language must be specified when a pattern is passed")
        lang = args.lang
        pattern = args.pattern
        # TODO for now we generate a manual config. Might want to just call sgrep -e ... -l ...
        configs = config_resolver.manual_config(pattern, lang)
    else:
        # else let's get a config. A config is a dict from config_id -> config. Config Id is not well defined at this point.
        configs = config_resolver.resolve_config(args.config)

    # if we can't find a config, use default r2c rules
    if not configs:
        print_error_exit(
            f"No config given. If you want to see some examples, try running with --config r2c"
        )

    # let's split our configs into valid and invalid configs.
    # It's possible that a config_id exists in both because we check valid rules and invalid rules
    # instead of just hard failing for that config if mal-formed
    valid_configs, errors = validate_configs(configs)

    validate = args.validate
    strict = args.strict

    if errors:
        if strict:
            print_error_exit(
                f"run with --strict and there were {len(errors)} errors loading configs"
            )
        elif validate:
            print_error_exit(
                f"run with --validate and there were {len(errors)} errors loading configs"
            )
    elif validate:  # no errors!
        # exit_code=0: --validate succeeding is a success, not an error
        print_error_exit("Config is valid", exit_code=0)

    if not args.no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # now validate all the patterns inside the configs
    if not args.skip_pattern_validation:
        start_validate_t = time.time()
        invalid_patterns = validate_patterns(valid_configs)
        if len(invalid_patterns):
            print_error_exit(
                f"{len(invalid_patterns)} invalid patterns found inside rules; aborting"
            )
        debug_print(f"debug: validated config in {time.time() - start_validate_t}")

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    if not args.pattern:
        # summary line: how many rules / configs are about to run
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (
            list(valid_configs.keys())[0] if len(valid_configs) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        print_msg(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
    # TODO log valid and invalid configs if verbose

    # a rule can have multiple patterns inside it. Flatten these so we can
    # send sgrep a single yml file list of patterns
    all_patterns = list(flatten_rule_patterns(all_rules))

    # actually invoke sgrep
    start = datetime.now()
    output_json = invoke_sgrep(all_patterns, targets, strict)
    debug_print(f"sgrep ran in {datetime.now() - start}")
    debug_print(str(output_json))

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        int, Dict[str, List[Dict[str, Any]]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for finding in output_json["errors"]:
        print_error(f"sgrep: {finding['path']}: {finding['check_id']}")
    if strict and len(output_json["errors"]):
        print_error_exit(
            f"run with --strict and {len(output_json['errors'])} errors occurred during sgrep run; exiting"
        )

    for finding in output_json["matches"]:
        # decode the rule index from the output check_id
        # (check_ids were prefixed with "<rule_index>." before invocation)
        rule_index = int(finding["check_id"].split(".")[0])
        by_rule_index[rule_index][finding["path"]].append(finding)

    current_path = Path.cwd()
    outputs_after_booleans = []
    ignored_in_tests = 0
    for rule_index, paths in by_rule_index.items():
        expression = build_boolean_expression(all_rules[rule_index])
        debug_print(str(expression))
        # expression = (op, pattern_id) for (op, pattern_id, pattern) in expression_with_patterns]
        for filepath, results in paths.items():
            debug_print(
                f"-------- rule (index {rule_index}) {all_rules[rule_index]['id']}------ filepath: {filepath}"
            )
            check_ids_to_ranges = parse_sgrep_output(results)
            debug_print(str(check_ids_to_ranges))
            # evaluate the rule's boolean expression over this file's matches
            valid_ranges_to_output = evaluate_expression(
                expression,
                check_ids_to_ranges,
                flags={
                    RCE_RULE_FLAG: args.dangerously_allow_arbitrary_code_execution_from_rules
                },
            )

            # only output matches which are inside these offsets!
            debug_print(f"compiled result {valid_ranges_to_output}")
            debug_print("-" * 80)
            for result in results:
                if sgrep_finding_to_range(result).range in valid_ranges_to_output:
                    path_object = Path(result["path"])
                    if args.exclude_tests and should_exclude_this_path(path_object):
                        ignored_in_tests += 1
                        continue
                    # restore the original rule ID
                    result["check_id"] = all_rules[rule_index]["id"]
                    # rewrite the path to be relative to the current working directory
                    result["path"] = str(safe_relative_to(path_object, current_path))
                    # restore the original message
                    result["extra"]["message"] = rewrite_message_with_metavars(
                        all_rules[rule_index], result
                    )
                    result = transform_to_r2c_output(result)
                    outputs_after_booleans.append(result)

    if ignored_in_tests > 0:
        print_error(
            f"warning: ignored {ignored_in_tests} results in tests due to --exclude-tests option"
        )

    # output results
    output_data = {"results": outputs_after_booleans}
    if not args.quiet:
        if args.json:
            print(build_output_json(output_data))
        else:
            print("\n".join(build_normal_output(output_data, color_output=True)))
    if args.output:
        save_output(args.output, output_data, args.json)
    if args.error and outputs_after_booleans:
        # non-zero exit so CI can fail the build on findings
        sys.exit(FINDINGS_EXIT_CODE)
    return output_data