Code Example #1
File: sgrep_main.py Project: sylvestre/sgrep
def invoke_sgrep(all_rules: List[Dict[str, Any]], targets: List[Path],
                 strict: bool) -> Dict[str, Any]:
    """Returns parsed json output of sgrep"""

    outputs: List[Any] = []  # multiple invocations per language
    errors: List[Any] = []
    for language, all_rules_for_language in group_rule_by_langauges(
            all_rules).items():
        with tempfile.NamedTemporaryFile("w") as fout:
            # very important not to sort keys here
            yaml_as_str = yaml.safe_dump({"rules": all_rules_for_language},
                                         sort_keys=False)
            fout.write(yaml_as_str)
            fout.flush()
            extra_args = (["-report_parse_errors", "-report_fatal_errors"]
                          if strict else [])
            cmd = ([SGREP_PATH] + extra_args + [
                "-lang",
                language,
                f"-rules_file",
                fout.name,
                *[str(path) for path in targets],
            ])
            try:
                output = subprocess.check_output(cmd, shell=False)
            except subprocess.CalledProcessError as ex:
                print_error(
                    f"non-zero return code while invoking sgrep with:\n\t{' '.join(cmd)}\n{ex}"
                )
                print_error_exit(f"\n\n{PLEASE_FILE_ISSUE_TEXT}")
            output_json = json.loads((output.decode("utf-8", "replace")))

            errors.extend(output_json["errors"])
            outputs.extend(output_json["matches"])
    return {"matches": outputs, "errors": errors}
Code Example #2
File: config_resolver.py Project: sylvestre/sgrep
def generate_config():
    # defensive coding
    if Path(DEFAULT_CONFIG_FILE).exists():
        print_error_exit(
            f"{DEFAULT_CONFIG_FILE} already exists. Please remove and try again"
        )
    try:
        r = requests.get(TEMPLATE_YAML_URL, timeout=10)
        r.raise_for_status()
        template_str = r.text
    except Exception as e:
        debug_print(str(e))
        print_msg(
            "There was a problem downloading the latest template config. Using fallback template"
        )
        template_str = """rules:
  - id: eqeq-is-bad
    pattern: $X == $X
    message: "$X == $X is a useless equality check"
    languages: [python]
    severity: ERROR"""
    try:
        with open(DEFAULT_CONFIG_FILE, "w") as template:
            template.write(template_str)
            print_msg(
                f"Template config successfully written to {DEFAULT_CONFIG_FILE}"
            )
            sys.exit(0)
    except Exception as e:
        print_error_exit(e)
Code Example #3
File: config_resolver.py Project: sylvestre/sgrep
def download_config(config_url: str) -> Dict[str, Optional[Dict[str, Any]]]:
    debug_print(f"trying to download from {config_url}")
    try:
        r = requests.get(config_url, stream=True)
        if r.status_code == requests.codes.ok:
            content_type = r.headers.get("Content-Type")
            if content_type and "text/plain" in content_type:
                return parse_config_string("remote-url",
                                           r.content.decode("utf-8"))
            elif content_type and content_type == "application/x-gzip":
                fname = f"/tmp/{base64.b64encode(config_url.encode()).decode()}"
                shutil.rmtree(fname, ignore_errors=True)
                with tarfile.open(fileobj=r.raw, mode="r:gz") as tar:
                    tar.extractall(fname)
                extracted = Path(fname)
                for path in extracted.iterdir():
                    # get first folder in extracted folder (this is how GH does it)
                    return parse_config_folder(path, relative=True)
            else:
                print_error_exit(
                    f"unknown content-type: {content_type} returned by config url: {config_url}. Can not parse"
                )
                assert False
        else:
            print_error_exit(
                f"bad status code: {r.status_code} returned by config url: {config_url}"
            )
            assert False
    except Exception as e:
        print_error(e)
    return {config_url: None}
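A hedged usage sketch (the URL is illustrative only, and config_resolver must be importable): each entry in the returned dict maps a config id to the parsed YAML, or to None when the download or parse failed.

# Hypothetical usage sketch; the URL is illustrative and `config_resolver`
# must be importable.
from config_resolver import download_config

configs = download_config("https://example.com/rules/python.yaml")
for config_id, parsed in configs.items():
    print(config_id, "failed to download/parse" if parsed is None else "parsed ok")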
Code Example #4
File: config_resolver.py Project: sylvestre/sgrep
def load_config_from_local_path(
    location: Optional[str] = None, ) -> Dict[str, Optional[Dict[str, Any]]]:
    base_path = get_base_path()
    if location is None:
        default_file = base_path.joinpath(DEFAULT_CONFIG_FILE)
        default_folder = base_path.joinpath(DEFAULT_CONFIG_FOLDER)
        if default_file.exists():
            return parse_config_at_path(default_file)
        elif default_folder.exists():
            return parse_config_folder(default_folder, relative=True)
        else:
            return {str(default_file): None}
    else:
        loc = base_path.joinpath(location)
        if loc.exists():
            if loc.is_file():
                return parse_config_at_path(loc)
            elif loc.is_dir():
                return parse_config_folder(loc)
            else:
                print_error_exit(
                    f"config location `{loc}` is not a file or folder!")
                assert False
        else:
            addendum = ""
            if IN_DOCKER:
                addendum = " (since you are running in docker, you cannot specify arbitary paths on the host; they must be mounted into the container)"
            print_error_exit(
                f"unable to find a config; path `{loc}` does not exist{addendum}"
            )
            assert False
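A hedged sketch of the resolution order (config_resolver must be importable; the example path is illustrative): with no argument the function falls back to DEFAULT_CONFIG_FILE and then DEFAULT_CONFIG_FOLDER under the base path; with an explicit location it accepts either a single rule file or a folder of rule files.

# Hypothetical usage sketch; `config_resolver` must be importable and the
# example path exists only for illustration.
from config_resolver import load_config_from_local_path

default_configs = load_config_from_local_path()           # DEFAULT_CONFIG_FILE, then DEFAULT_CONFIG_FOLDER
explicit_configs = load_config_from_local_path("rules/")  # a single file or a folder of rule files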
Code Example #5
File: sgrep_main.py Project: MeghaJakhotia/sgrep
def dump_parsed_ast(
    to_json: bool, language: str, pattern: Optional[str], targets: List[Path]
) -> None:
    with tempfile.NamedTemporaryFile("w") as fout:
        args = []
        if pattern:
            fout.write(pattern)
            fout.flush()
            args = ["-lang", language, "-dump_pattern", fout.name]
        else:
            if len(targets) != 1:
                print_error_exit("exactly one target file is required with this option")
            target = targets[0]
            args = ["-lang", language, "-dump_ast", str(target)]

        if to_json:
            args = ["-json"] + args

        cmd = [SGREP_PATH] + args
        try:
            output = subprocess.check_output(cmd, shell=False)
        except subprocess.CalledProcessError as ex:
            print_error(f"error invoking sgrep with:\n\t{' '.join(cmd)}\n{ex}")
            print_error_exit(f"\n\n{PLEASE_FILE_ISSUE_TEXT}")
        print(output.decode())
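A hedged usage sketch: the function is driven in one of two ways, either by a single target file (AST dump) or by a pattern (pattern dump). It assumes sgrep_main is importable and a working sgrep binary is installed.

# Hypothetical usage sketch; assumes `sgrep_main` is importable and the
# sgrep binary is installed.
from pathlib import Path

from sgrep_main import dump_parsed_ast

# Dump the AST of one target file as JSON ...
dump_parsed_ast(to_json=True, language="python", pattern=None,
                targets=[Path("src/app.py")])
# ... or dump the parsed form of a pattern (targets are then ignored).
dump_parsed_ast(to_json=False, language="python", pattern="$X == $X", targets=[])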
Code Example #6
File: config_resolver.py Project: sylvestre/sgrep
def adjust_for_docker(in_precommit: bool = False):
    # change into this folder so that all paths are relative to it
    if IN_DOCKER and not IN_GH_ACTION and not in_precommit:
        if not Path(REPO_HOME_DOCKER).exists():
            print_error_exit(
                f"you are running sgrep in docker, but you forgot to mount the current directory in Docker: missing: -v $(pwd):{REPO_HOME_DOCKER}"
            )
    if Path(REPO_HOME_DOCKER).exists():
        os.chdir(REPO_HOME_DOCKER)
Code Example #7
File: test.py Project: sylvestre/sgrep
def test_main(args):
    _test_compute_confusion_matrix()
    if len(args.target) != 1:
        print_error_exit("only one target directory allowed for tests")
    target = Path(args.target[0])
    main(
        target,
        args.test_ignore_todo,
        args.verbose,
        args.strict,
        args.verbose,
        args.dangerously_allow_arbitrary_code_execution_from_rules,
    )
Code Example #8
File: evaluation.py Project: sylvestre/sgrep
def _where_python_statement_matches(where_expression: str,
                                    metavars: Dict[str, str]) -> bool:
    # TODO: filter out obvious dangerous things here
    global output
    output = None  # type: ignore

    # HACK: we're executing arbitrary Python in the where-python,
    # be careful my friend
    vars = metavars
    try:
        exec(f"global output; output = {where_expression}")
    except Exception as ex:
        print_error(
            f"error evaluating a where-python expression: `{where_expression}`: {ex}"
        )

    if type(output) != type(True):  # type: ignore
        print_error_exit(  # type: ignore
            f"python where expression needs boolean output but got: {output} for {where_expression}"  # type: ignore
        )  # type: ignore
    return output == True  # type: ignore
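To make the mechanism concrete, here is a minimal standalone sketch of the same idea (not the project's code): the where-expression is arbitrary Python that reads captured metavariables through a local `vars` dict. The sketch uses eval instead of the original's exec-plus-global workaround and omits the boolean-type check.

# Minimal standalone sketch of the where-python idea (illustration only).
def where_matches(where_expression: str, metavars: dict) -> bool:
    vars = metavars  # the expression is written against this name
    return bool(eval(where_expression))  # deliberately executes arbitrary code

print(where_matches("int(vars['$X']) > 10", {"$X": "42"}))     # True
print(where_matches("vars['$F'] == 'open'", {"$F": "close"}))  # False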
Code Example #9
File: sgrep.py Project: sylvestre/sgrep
    # logging options
    logging = parser.add_argument_group("logging")

    logging.add_argument(
        "-v",
        "--verbose",
        help=f"Sets the logging level to verbose. E.g. statements about which files are being processed will be printed.",
        action="store_true",
    )

    ### Parse and validate
    args = parser.parse_args()
    if (args.lang and not args.pattern) or (args.pattern and not args.lang):
        parser.error("-e/--pattern and -l/--lang must both be specified")

    # set the flags
    set_flags(args.verbose, args.quiet)

    # change cwd if using docker
    config_resolver.adjust_for_docker(args.precommit)

    try:
        if args.test:
            test.test_main(args)
        else:
            sgrep_main.main(args)
    except NotImplementedError as ex:
        print_error_exit(
            f"sgrep encountered an error: {ex}; this is not your fault. {PLEASE_FILE_ISSUE_TEXT}"
        )
Code Example #10
File: evaluation.py Project: sylvestre/sgrep
def _evaluate_single_expression(
    expression: BooleanRuleExpression,
    results: Dict[PatternId, List[SgrepRange]],
    ranges_left: Set[Range],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:

    assert expression.pattern_id, f"<internal error: expected pattern id: {expression}>"
    results_for_pattern = [
        x.range for x in results.get(expression.pattern_id, [])
    ]

    if expression.operator == OPERATORS.AND:
        # remove all ranges that don't equal the ranges for this pattern
        return ranges_left.intersection(results_for_pattern)
    elif expression.operator == OPERATORS.AND_NOT:
        # remove all ranges that DO equal the ranges for this pattern
        # difference_update = Remove all elements of another set from this set.
        return ranges_left.difference(results_for_pattern)
    elif expression.operator == OPERATORS.AND_INSIDE:
        # remove all ranges (not enclosed by) or (not equal to) the inside ranges
        output_ranges = set()
        for arange in ranges_left:
            for keep_inside_this_range in results_for_pattern:
                is_enclosed = keep_inside_this_range.is_enclosing_or_eq(arange)
                # print(
                #    f'candidate range is {arange}, needs to be `{operator}` {keep_inside_this_range}; keep?: {keep}')
                if is_enclosed:
                    output_ranges.add(arange)
                    break  # found a match, no need to keep going
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges
    elif expression.operator == OPERATORS.AND_NOT_INSIDE:
        # remove all ranges enclosed by or equal to
        output_ranges = ranges_left.copy()
        for arange in ranges_left:
            for keep_inside_this_range in results_for_pattern:
                if keep_inside_this_range.is_enclosing_or_eq(arange):
                    output_ranges.remove(arange)
                    break
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges
    elif expression.operator == OPERATORS.WHERE_PYTHON:
        if not flags or flags[RCE_RULE_FLAG] != True:
            print_error_exit(
                f"at least one rule needs to execute arbitrary code; this is dangerous! if you want to continue, enable the flag: {RCE_RULE_FLAG}"
            )
        assert expression.operand, "must have operand for this operator type"

        output_ranges = set()
        # Look through every range that hasn't been filtered yet
        for sgrep_range in list(flatten(results.values())):
            # Only need to check where-python clause if the range hasn't already been filtered

            if sgrep_range.range in ranges_left:
                debug_print(
                    f"WHERE is {expression.operand}, metavars: {sgrep_range.metavars}"
                )
                if _where_python_statement_matches(expression.operand,
                                                   sgrep_range.metavars):
                    output_ranges.add(sgrep_range.range)
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        return output_ranges

    else:
        raise NotImplementedError(f"unknown operator {expression.operator}")
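The AND and AND_NOT branches are plain set algebra over ranges. A toy illustration, with tuples standing in for the project's Range objects (purely illustrative):

# Toy illustration of the AND / AND_NOT semantics, with tuples in place of
# the project's Range objects.
ranges_left = {(1, 5), (10, 20), (30, 40)}
results_for_pattern = [(10, 20), (50, 60)]

print(ranges_left.intersection(results_for_pattern))  # AND     -> {(10, 20)}
print(ranges_left.difference(results_for_pattern))    # AND_NOT -> {(1, 5), (30, 40)}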
Code Example #11
File: sgrep_main.py Project: sylvestre/sgrep
def main(args: argparse.Namespace):
    """ main function that parses args and runs sgrep """

    # get the proper paths for targets i.e. handle base path of /home/repo when it exists in docker
    targets = config_resolver.resolve_targets(args.target)

    # first check if user asked to generate a config
    if args.generate_config:
        config_resolver.generate_config()

    # let's check for a pattern
    elif args.pattern:
        # and a language
        if not args.lang:
            print_error_exit(
                "language must be specified when a pattern is passed")
        lang = args.lang
        pattern = args.pattern

        # TODO for now we generate a manual config. Might want to just call sgrep -e ... -l ...
        configs = config_resolver.manual_config(pattern, lang)
    else:
        # else let's get a config. A config is a dict from config_id -> config. Config Id is not well defined at this point.
        configs = config_resolver.resolve_config(args.config)

    # if we can't find a config, use default r2c rules
    if not configs:
        print_error_exit(
            "No config given. If you want to see some examples, try running with --config r2c"
        )

    # let's split our configs into valid and invalid configs.
    # It's possible that a config_id exists in both because we check valid rules and invalid rules
    # instead of just hard failing for that config if mal-formed
    valid_configs, errors = validate_configs(configs)

    validate = args.validate
    strict = args.strict

    if errors:
        if strict:
            print_error_exit(
                f"run with --strict and there were {len(errors)} errors loading configs"
            )
        elif validate:
            print_error_exit(
                f"run with --validate and there were {len(errors)} errors loading configs"
            )
    elif validate:  # no errors!
        print_error_exit("Config is valid", exit_code=0)

    if not args.no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # now validate all the patterns inside the configs
    if not args.skip_pattern_validation:
        start_validate_t = time.time()
        invalid_patterns = validate_patterns(valid_configs)
        if len(invalid_patterns):
            print_error_exit(
                f"{len(invalid_patterns)} invalid patterns found inside rules; aborting"
            )
        debug_print(
            f"debug: validated config in {time.time() - start_validate_t}")

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    if not args.pattern:
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (list(valid_configs.keys())[0]
                               if len(valid_configs) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        print_msg(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
    # TODO log valid and invalid configs if verbose

    # a rule can have multiple patterns inside it. Flatten these so we can send sgrep a single yml file list of patterns
    all_patterns = list(flatten_rule_patterns(all_rules))

    # actually invoke sgrep
    start = datetime.now()
    output_json = invoke_sgrep(all_patterns, targets, strict)
    debug_print(f"sgrep ran in {datetime.now() - start}")
    debug_print(str(output_json))

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[int, Dict[str, List[Dict[str, Any]]]] = collections.defaultdict(
        lambda: collections.defaultdict(list))

    for finding in output_json["errors"]:
        print_error(f"sgrep: {finding['path']}: {finding['check_id']}")

    if strict and len(output_json["errors"]):
        print_error_exit(
            f"run with --strict and {len(output_json['errors'])} errors occurred during sgrep run; exiting"
        )

    for finding in output_json["matches"]:
        # decode the rule index from the output check_id
        rule_index = int(finding["check_id"].split(".")[0])
        by_rule_index[rule_index][finding["path"]].append(finding)

    current_path = Path.cwd()
    outputs_after_booleans = []
    ignored_in_tests = 0
    for rule_index, paths in by_rule_index.items():
        expression = build_boolean_expression(all_rules[rule_index])
        debug_print(str(expression))
        # expression = (op, pattern_id) for (op, pattern_id, pattern) in expression_with_patterns]
        for filepath, results in paths.items():
            debug_print(
                f"-------- rule (index {rule_index}) {all_rules[rule_index]['id']}------ filepath: {filepath}"
            )
            check_ids_to_ranges = parse_sgrep_output(results)
            debug_print(str(check_ids_to_ranges))
            valid_ranges_to_output = evaluate_expression(
                expression,
                check_ids_to_ranges,
                flags={
                    RCE_RULE_FLAG:
                    args.dangerously_allow_arbitrary_code_execution_from_rules
                },
            )

            # only output matches which are inside these offsets!
            debug_print(f"compiled result {valid_ranges_to_output}")
            debug_print("-" * 80)
            for result in results:
                if sgrep_finding_to_range(
                        result).range in valid_ranges_to_output:
                    path_object = Path(result["path"])
                    if args.exclude_tests and should_exclude_this_path(
                            path_object):
                        ignored_in_tests += 1
                        continue

                    # restore the original rule ID
                    result["check_id"] = all_rules[rule_index]["id"]
                    # rewrite the path to be relative to the current working directory
                    result["path"] = str(
                        safe_relative_to(path_object, current_path))

                    # restore the original message
                    result["extra"]["message"] = rewrite_message_with_metavars(
                        all_rules[rule_index], result)
                    result = transform_to_r2c_output(result)
                    outputs_after_booleans.append(result)

    if ignored_in_tests > 0:
        print_error(
            f"warning: ignored {ignored_in_tests} results in tests due to --exclude-tests option"
        )

    # output results
    output_data = {"results": outputs_after_booleans}
    if not args.quiet:
        if args.json:
            print(build_output_json(output_data))
        else:
            print("\n".join(build_normal_output(output_data,
                                                color_output=True)))
    if args.output:
        save_output(args.output, output_data, args.json)
    if args.error and outputs_after_booleans:
        sys.exit(FINDINGS_EXIT_CODE)

    return output_data