Example #1
0
def test_explicit_path(tmp_path, monkeypatch):
    """Explicitly passed targets are honored regardless of extension or excludes."""
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    foo_noext = foo / "noext"
    foo_noext.touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()

    monkeypatch.chdir(tmp_path)

    # Compare against the path relative to the new working directory.
    foo_a = foo_a.relative_to(tmp_path)
    settings = OutputSettings(
        output_format=OutputFormat.TEXT,
        output_destination=None,
        error_on_findings=False,
        strict=False,
    )
    handler = OutputHandler(settings)

    python_language = Language("python")

    def files_for(includes, excludes, targets):
        # Small helper so each assertion reads as a single scenario.
        return TargetManager(includes, excludes, targets, False, handler).get_files(
            python_language, [], []
        )

    # An explicitly passed python file is included.
    assert foo_a in files_for([], [], ["foo/a.py"])

    # A file matching an exclude pattern is dropped when found via a
    # directory target, but kept when it is itself passed explicitly.
    assert foo_a not in files_for([], ["foo/a.py"], ["."])
    assert foo_a in files_for([], ["foo/a.py"], [".", "foo/a.py"])

    # An explicitly passed .go file is ignored when python is requested.
    assert files_for([], [], ["foo/a.go"]) == []

    # A file with an unknown extension is included when passed explicitly.
    assert cmp_path_sets(set(files_for([], [], ["foo/noext"])), {foo_noext})
Example #2
0
def test_filter_by_size():
    """filter_by_size keeps files no larger than the limit (0 means no limit)."""
    with NamedTemporaryFile() as fp:
        # A 10-byte file is the single candidate target.
        fp.write(b"0123456789")
        fp.flush()
        targets = frozenset({Path(fp.name)})

        cases = [
            (0, 1),  # no max size
            (20, 1),  # file is under max size
            (5, 0),  # file is over max size
        ]
        for max_size, expected in cases:
            assert len(TargetManager.filter_by_size(targets, max_size)) == expected
Example #3
0
def test_ignore_git_dir(tmp_path, monkeypatch):
    """
    Ignores all files in .git directory when scanning generic
    """
    git_dir = tmp_path / ".git"
    git_dir.mkdir()
    (git_dir / "bar").touch()

    monkeypatch.chdir(tmp_path)
    language = Language("generic")
    handler = OutputHandler(
        OutputSettings(
            output_format=OutputFormat.TEXT,
            output_destination=None,
            error_on_findings=False,
            verbose_errors=False,
            strict=False,
            json_stats=False,
            output_time=False,
            output_per_finding_max_lines_limit=None,
            output_per_line_max_chars_limit=None,
        )
    )
    # Targeting the .git directory directly must still yield nothing.
    files = TargetManager([], [], 0, [git_dir], True, handler, False).get_files(
        language, [], []
    )
    assert [] == files
def test_skip_symlink(tmp_path, monkeypatch):
    """Symlinks are skipped both during expansion and as explicit targets."""
    foo = tmp_path / "foo"
    foo.mkdir()
    real_file = foo / "a.py"
    real_file.touch()
    link = foo / "link.py"
    link.symlink_to(real_file)

    monkeypatch.chdir(tmp_path)

    python_language = Language("python")

    # Expanding the directory yields only the real file, not the symlink.
    assert cmp_path_sets(
        TargetManager.expand_targets([foo], python_language, False),
        {real_file},
    )

    # An explicitly targeted symlink is skipped entirely.
    assert cmp_path_sets(
        TargetManager.expand_targets([link], python_language, False),
        set(),
    )
Example #5
0
 def get_files_for_language(
     language: Language, rule: Rule, target_manager: TargetManager
 ) -> List[Path]:
     """
     Return the target paths the given rule should run on for *language*.

     Delegates to ``target_manager.get_files`` with the rule's include and
     exclude patterns, and re-raises an internal ``_UnknownLanguageError`` as
     a user-facing ``UnknownLanguageError`` annotated with the rule's
     ``languages`` span so the error points at the offending config line.
     """
     try:
         targets = target_manager.get_files(language, rule.includes, rule.excludes)
     except _UnknownLanguageError as ex:
         raise UnknownLanguageError(
             short_msg=f"invalid language: {language}",
             long_msg=f"unsupported language: {language}. supported languages are: {', '.join(all_supported_languages())}",
             spans=[rule.languages_span.with_context(before=1, after=1)],
         ) from ex
     return targets
Example #6
0
def test_explicit_path(tmp_path, monkeypatch):
    """Explicitly passed targets are honored regardless of extension or excludes."""
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    foo_noext = foo / "noext"
    foo_noext.touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()

    monkeypatch.chdir(tmp_path)

    # Compare against the path relative to the new working directory.
    foo_a = foo_a.relative_to(tmp_path)

    def files_for(includes, excludes, targets):
        # Small helper so each assertion reads as a single scenario.
        return TargetManager(includes, excludes, targets, False).get_files(
            "python", [], []
        )

    # An explicitly passed python file is included.
    assert foo_a in files_for([], [], ["foo/a.py"])

    # A file matching an exclude pattern is dropped when found via a
    # directory target, but kept when it is itself passed explicitly.
    assert foo_a not in files_for([], ["foo/a.py"], ["."])
    assert foo_a in files_for([], ["foo/a.py"], [".", "foo/a.py"])

    # An explicitly passed .go file is ignored when python is requested.
    assert files_for([], [], ["foo/a.go"]) == []

    # A file with an unknown extension is included when passed explicitly.
    assert cmp_path_sets(set(files_for([], [], ["foo/noext"])), {foo_noext})
Example #7
0
def test_delete_git(tmp_path, monkeypatch):
    """
    Check that deleted files are not included in expanded targets
    """
    foo = tmp_path / "foo.py"
    bar = tmp_path / "bar.py"
    foo.touch()
    bar.touch()

    monkeypatch.chdir(tmp_path)
    for cmd in (
        ["git", "init"],
        ["git", "add", foo],
        ["git", "commit", "-m", "first commit"],
    ):
        subprocess.run(cmd)

    # Delete the tracked file; only the untracked-but-present one remains.
    foo.unlink()
    subprocess.run(["git", "status"])

    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], "python", True), {bar}
    )
def test_filter_exclude():
    """filter_excludes drops every path matching any of the exclude patterns."""
    all_files = {
        Path(elem)
        for elem in [
            "foo.py",
            "foo.go",
            "foo.java",
            "foo/bar.py",
            "foo/bar.go",
            "bar/foo/baz/bar.go",
            "foo/bar.java",
            "bar/baz",
            "baz.py",
            "baz.go",
            "baz.java",
            "bar/foo/foo.py",
            "foo",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "/foo/bar/baz/a.py",
        ]
    }

    cases = [
        (["*.py"], 9),  # filter out .py files
        (["foo"], 7),  # files in a foo directory ancestor
        (["bar/baz"], 12),  # files with an ancestor named bar/baz
        (["*.go"], 14),  # go files
        (["*.go", "*.java"], 11),  # go and java files
        (["foo/*.go"], 17),  # go files with a direct ancestor named foo
    ]
    for excludes, expected_count in cases:
        assert len(TargetManager.filter_excludes(all_files, excludes)) == expected_count
def test_expand_targets_not_git(tmp_path, monkeypatch):
    """
    Check that directory expansion works with relative paths, absolute paths, paths with ..
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    (foo / "py").touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()

    bar = tmp_path / "bar"
    bar.mkdir()
    bar_a = bar / "a.py"
    bar_a.touch()
    bar_b = bar / "b.py"
    bar_b.touch()

    foo_bar = foo / "bar"
    foo_bar.mkdir()
    foo_bar_a = foo_bar / "a.py"
    foo_bar_a.touch()
    foo_bar_b = foo_bar / "b.py"
    foo_bar_b.touch()

    in_foo_bar = {foo_bar_a, foo_bar_b}
    in_foo = {foo_a, foo_b} | in_foo_bar
    in_bar = {bar_a, bar_b}
    in_all = in_foo | in_bar

    python_language = Language("python")

    def expanded(target):
        return TargetManager.expand_targets([target], python_language, False)

    # Expansion relative to the repository root.
    monkeypatch.chdir(tmp_path)
    for target, expected in [
        (Path("."), in_all),
        (Path("bar"), in_bar),
        (Path("foo"), in_foo),
        (Path("foo").resolve(), in_foo),
        (Path("foo/bar"), in_foo_bar),
        (Path("foo/bar").resolve(), in_foo_bar),
    ]:
        assert cmp_path_sets(expanded(target), expected)

    # Expansion relative to a subdirectory, including ".." traversal.
    monkeypatch.chdir(foo)
    for target, expected in [
        (Path("."), in_foo),
        (Path("./foo"), set()),
        (Path("bar"), in_foo_bar),
        (Path("bar"), in_foo_bar),
        (Path(".."), in_all),
        (Path("../bar"), in_bar),
        (Path("../foo/bar"), in_foo_bar),
    ]:
        assert cmp_path_sets(expanded(target), expected)
def test_expand_targets_git(tmp_path, monkeypatch):
    """
    Test TargetManager with visible_to_git_only flag on in a git repository
    with nested .gitignores
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    foo_a_go = foo / "a.go"
    foo_a_go.touch()
    (foo / "b.go").touch()
    (foo / "py").touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()

    bar = tmp_path / "bar"
    bar.mkdir()
    bar_a = bar / "a.py"
    bar_a.touch()
    bar_b = bar / "b.py"
    bar_b.touch()

    foo_bar = foo / "bar"
    foo_bar.mkdir()
    foo_bar_a = foo_bar / "a.py"
    foo_bar_a.touch()
    foo_bar_b = foo_bar / "b.py"
    foo_bar_b.touch()

    monkeypatch.chdir(tmp_path)
    # Track only some of the files; the rest stay untracked but visible.
    for cmd in (
        ["git", "init"],
        ["git", "add", foo_a],
        ["git", "add", foo_bar_a],
        ["git", "add", foo_bar_b],
        ["git", "add", foo_a_go],
        ["git", "commit", "-m", "first"],
    ):
        subprocess.run(cmd)

    python_language = Language("python")

    def expanded(target):
        return TargetManager.expand_targets([target], python_language, True)

    def check_visibility(in_all, in_foo, in_bar, in_foo_bar):
        # From the repository root.
        monkeypatch.chdir(tmp_path)
        for target, expected in [
            (Path("."), in_all),
            (Path("bar"), in_bar),
            (Path("foo"), in_foo),
            (Path("foo").resolve(), in_foo),
            (Path("foo/bar"), in_foo_bar),
            (Path("foo/bar").resolve(), in_foo_bar),
        ]:
            assert cmp_path_sets(expanded(target), expected)
        # From a subdirectory, including ".." traversal.
        monkeypatch.chdir(foo)
        for target, expected in [
            (Path("."), in_foo),
            (Path("./foo"), set()),
            (Path("bar"), in_foo_bar),
            (Path("bar"), in_foo_bar),
            (Path(".."), in_all),
            (Path("../bar"), in_bar),
            (Path("../foo/bar"), in_foo_bar),
        ]:
            assert cmp_path_sets(expanded(target), expected)

    # Check that all files are visible without a .gitignore
    in_foo_bar = {foo_bar_a, foo_bar_b}
    in_foo = {foo_a, foo_b} | in_foo_bar
    in_bar = {bar_a, bar_b}
    check_visibility(in_foo | in_bar, in_foo, in_bar, in_foo_bar)

    # Add bar/, foo/bar/a.py, foo/b.py to gitignores
    monkeypatch.chdir(tmp_path)
    (tmp_path / ".gitignore").write_text("bar/\nfoo/bar/a.py")
    (tmp_path / "foo" / ".gitignore").write_text("b.py")

    # Reflect what should now be visible given gitignores
    in_foo_bar = {
        foo_bar_a,
        foo_bar_b,
    }  # foo/bar/a.py is gitignored but is already tracked
    in_foo = {foo_a} | in_foo_bar  # foo/b.py is ignored via a nested .gitignore
    in_bar = set()  # bar/ is gitignored
    check_visibility(in_foo | in_bar, in_foo, in_bar, in_foo_bar)
Example #11
0
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    config: str,
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = 0,
) -> None:
    """
    Top-level scan entry point: resolve configs into rules, build the target
    set, invoke semgrep-core, and hand the results to *output_handler*.

    Raises SemgrepError when configs fail to load under --strict, or when no
    valid configuration is found in config-driven (non-pattern) mode.
    """
    # Avoid mutable default arguments: normalize None to fresh lists.
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    valid_configs, config_errors = get_config(pattern, lang, config)

    # Report config loading problems even when we continue running.
    output_handler.handle_semgrep_errors(config_errors)

    if config_errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(config_errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)

    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    # Pattern mode gets its rule synthesized by get_config; the summary and
    # the no-valid-config error below only apply to config-driven runs.
    if not pattern:
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (list(valid_configs.keys())[0]
                               if len(valid_configs) == 1 else "")
        invalid_msg = (f"({len(config_errors)} config files were invalid)"
                       if len(config_errors) else "")
        logger.debug(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        notify_user_of_work(all_rules, include, exclude)

        if len(valid_configs) == 0:
            raise SemgrepError(
                f"no valid configuration file found ({len(config_errors)} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
    )

    # actually invoke semgrep
    rule_matches_by_rule, debug_steps_by_rule, semgrep_errors = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
    ).invoke_semgrep(target_manager, all_rules)

    output_handler.handle_semgrep_errors(semgrep_errors)

    if not disable_nosem:
        # Drop matches suppressed by nosem comments unless the user opted out.
        rule_matches_by_rule = {
            rule: [
                rule_match for rule_match in rule_matches
                if not rule_match_nosem(rule_match, strict)
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }

    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule)

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
Example #12
0
def test_filter_exclude():
    """filter_excludes drops paths matching any exclude pattern; each case
    below pins the exact surviving set for one pattern shape."""
    all_file_names = [
        "/foo/bar/baz/a.py",
        "bar/baz",
        "bar/baz/foo/a.py",
        "bar/baz/foo/b.py",
        "bar/baz/foo/c.py",
        "bar/baz/qux/foo/a.py",
        "bar/foo/baz/bar.go",
        "bar/foo/foo.py",
        "baz.go",
        "baz.java",
        "baz.py",
        "baz/foo",
        "foo",
        "foo.go",
        "foo.java",
        "foo.py",
        "foo/bar.go",
        "foo/bar.java",
        "foo/bar.py",
    ]
    all_files = frozenset({Path(elem) for elem in all_file_names})

    # Filter out .py files
    assert TargetManager.filter_excludes(all_files, ["*.py"]) == {
        Path(p)
        for p in [
            "bar/baz",
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }

    # Filter out go files
    assert TargetManager.filter_excludes(all_files, ["*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }

    # Filter out go and java files
    assert TargetManager.filter_excludes(all_files, ["*.go", "*.java"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.py",
            "foo/bar.py",
        ]
    }

    # Filter out files named foo or in a foo directory ancestor
    assert TargetManager.filter_excludes(all_files, ["foo"]) == {
        Path(p)
        for p in [
            "bar/baz",
            "baz.go",
            "baz.java",
            "baz.py",
            "foo.go",
            "foo.java",
            "foo.py",
        ]
    }

    # Filter out files with an ancestor named bar/baz
    assert TargetManager.filter_excludes(all_files, ["bar/baz"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.go",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }

    # Filter out go files with a direct ancestor named foo
    assert TargetManager.filter_excludes(all_files, ["foo/*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }

    # Filter out go files with an ancestor named foo
    assert TargetManager.filter_excludes(all_files, ["foo/**/*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }

    # Filter out py files with three-characters name
    assert TargetManager.filter_excludes(all_files, ["???.py"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }
Example #13
0
def test_filter_include():
    """filter_includes keeps only paths matching some include pattern; each
    case below pins the exact surviving set for one pattern shape."""
    all_file_names = [
        "/foo/bar/baz/a.py",
        "bar/baz",
        "bar/baz/foo/a.py",
        "bar/baz/foo/b.py",
        "bar/baz/foo/c.py",
        "bar/baz/qux/foo/a.py",
        "bar/foo/baz/bar.go",
        "bar/foo/foo.py",
        "baz.go",
        "baz.java",
        "baz.py",
        "baz/foo",
        "foo",
        "foo.go",
        "foo.java",
        "foo.py",
        "foo/bar.go",
        "foo/bar.java",
        "foo/bar.py",
    ]
    all_files = frozenset({Path(elem) for elem in all_file_names})

    # All .py files
    assert TargetManager.filter_includes(all_files, ["*.py"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.py",
            "foo.py",
            "foo/bar.py",
        ]
    }

    # All go files
    assert TargetManager.filter_includes(all_files, ["*.go"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "baz.go",
            "foo.go",
            "foo/bar.go",
        ]
    }

    # All go and java files
    assert TargetManager.filter_includes(all_files, ["*.go", "*.java"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }

    # All files named foo or in a foo directory ancestor
    assert TargetManager.filter_includes(all_files, ["foo"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz/foo",
            "foo",
            "foo/bar.go",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }

    # All files with an ancestor named bar/baz
    assert TargetManager.filter_includes(all_files, ["bar/baz"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
        ]
    }

    # All go files with a direct ancestor named foo
    assert TargetManager.filter_includes(
        all_files, ["foo/*.go"]) == {Path(p)
                                     for p in [
                                         "foo/bar.go",
                                     ]}

    # All go files with an ancestor named foo
    assert TargetManager.filter_includes(all_files, ["foo/**/*.go"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "foo/bar.go",
        ]
    }

    # All py files with three-characters name
    assert TargetManager.filter_includes(all_files, ["???.py"]) == {
        Path(p)
        for p in [
            "bar/foo/foo.py",
            "baz.py",
            "foo.py",
            "foo/bar.py",
        ]
    }

    # Test that some different variations of the pattern yield the same result.
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files, ["/baz/qux"])
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files, ["baz/qux/"])
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files, ["/baz/qux/"])
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files,
                                                      ["**/baz/qux"])
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files,
                                                      ["baz/qux/**"])
    assert TargetManager.filter_includes(
        all_files,
        ["baz/qux"]) == TargetManager.filter_includes(all_files,
                                                      ["**/baz/qux/**"])
Example #14
0
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
        Run a single rule by invoking semgrep-core once per language, plus a
        pure-Python pass for regex patterns.

        Returns a tuple of (deduplicated findings, debugging steps for the
        last evaluated file, core exceptions collected across invocations).
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
            try:
                targets = target_manager.get_files(language, rule.includes,
                                                   rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex

            # Nothing to scan for this language: skip the core invocation.
            if targets == []:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]

                # Compile all regexes up front so a bad pattern fails fast
                # with a user-facing error instead of inside the worker pool.
                try:
                    patterns_re = [(pattern["id"],
                                    re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")

                # Fan regex matching out over the targets in parallel.
                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            # Hand the remaining (non-regex) patterns to semgrep-core via
            # temporary files for rules, targets, and equivalences.
            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                output_json = json.loads(
                    (core_run.stdout.decode("utf-8", "replace")))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        # NOTE(review): this loop variable shadows the `rule` parameter;
        # since by_rule_index is only ever keyed by that same rule here, the
        # behavior is unchanged — confirm before relying on the outer name below.
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
Example #15
0
def test_explicit_path(tmp_path, monkeypatch):
    """Explicitly passed targets override extension and exclusion filtering."""
    project_dir = tmp_path / "foo"
    project_dir.mkdir()
    (project_dir / "a.go").touch()
    (project_dir / "b.go").touch()
    no_ext_file = project_dir / "noext"
    no_ext_file.touch()
    py_a = project_dir / "a.py"
    py_a.touch()
    py_b = project_dir / "b.py"
    py_b.touch()

    monkeypatch.chdir(tmp_path)

    # Work with a path relative to the (now current) tmp_path directory.
    py_a = py_a.relative_to(tmp_path)
    settings = OutputSettings(
        output_format=OutputFormat.TEXT,
        output_destination=None,
        error_on_findings=False,
        verbose_errors=False,
        strict=False,
        json_stats=False,
        output_time=False,
        output_per_finding_max_lines_limit=None,
        output_per_line_max_chars_limit=None,
    )
    handler = OutputHandler(settings)

    py_lang = Language("python")

    def files_for(includes, excludes, targets, skip_unknown):
        # Small shorthand: build a TargetManager (max_target_bytes=0,
        # respect_git_ignore=False) and collect its python files.
        manager = TargetManager(includes, excludes, 0, targets, False, handler,
                                skip_unknown)
        return manager.get_files(py_lang, [], [])

    # An explicitly passed python file is included regardless of the
    # skip_unknown_extensions setting.
    assert py_a in files_for([], [], ["foo/a.py"], False)
    assert py_a in files_for([], [], ["foo/a.py"], True)

    # Excludes apply to directory expansion, but an explicitly passed file
    # wins over an exclude that names it.
    assert py_a not in files_for([], ["foo/a.py"], ["."], False)
    assert py_a in files_for([], ["foo/a.py"], [".", "foo/a.py"], False)

    # An explicitly passed .go file is ignored when requesting python.
    assert files_for([], [], ["foo/a.go"], False) == frozenset()

    # An explicitly passed file with an unknown extension is included
    # when skip_unknown_extensions=False ...
    assert cmp_path_sets(files_for([], [], ["foo/noext"], False),
                         {no_ext_file})

    # ... and dropped when skip_unknown_extensions=True.
    assert cmp_path_sets(files_for([], [], ["foo/noext"], True), set())

    # A file with the right extension survives skip_unknown_extensions=True
    # even when passed alongside an unknown-extension file.
    assert cmp_path_sets(files_for([], [], ["foo/noext", "foo/a.py"], True),
                         {py_a})
Example #16
0
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
    """Run a full semgrep scan and report everything through ``output_handler``.

    Loads rules from ``pattern``/``lang`` or from ``configs``, filters them by
    ``severity``, gathers targets with a TargetManager, invokes semgrep-core
    via CoreRunner, applies ``nosem`` ignore handling, optionally records
    metrics, and finally hands the results to ``output_handler``. When
    ``autofix`` is set, fixes are applied (only printed if ``dryrun``).

    Raises:
        SemgrepError: when no valid configuration was found, or when
            ``strict`` is set and any config failed to load.
    """
    # Normalize the optional include/exclude filters to empty lists.
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    # Load rule configurations; config parse failures come back as `errors`.
    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Keep only rules whose severity was requested (empty list = keep all).
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    # Under --strict, any config-loading error is fatal.
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    # Config-driven runs (no -e/--pattern): summarize what will run and
    # fail early if nothing valid was loaded.
    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    # Resolve which files to scan, honoring .gitignore unless disabled.
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Turn off optimizations if using features not yet supported by them.
    if optimizations == "all":
        # taint mode rules not yet supported
        if any(rule.mode == TAINT_MODE for rule in filtered_rules):
            logger.info("Running without optimizations since taint rule found")
            optimizations = "none"
        # step by step evaluation output not yet supported
        elif output_handler.settings.debug:
            logger.info(
                "Running without optimizations since step-by-step evaluation output desired"
            )
            optimizations = "none"

        elif any(rule.has_pattern_where_python() for rule in filtered_rules):
            logger.info(
                "Running without optimizations since running pattern-where-python rules"
            )
            optimizations = "none"

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        output_settings=output_handler.settings,
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)
    profiler.save("total_time", start_time)

    output_handler.handle_semgrep_errors(semgrep_errors)

    # Mark each match ignored/not-ignored based on `# nosem` comments;
    # collect any errors produced while evaluating nosem annotations.
    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(
                attr.evolve(rule_match, is_ignored=ignored))
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches

    output_handler.handle_semgrep_errors(nosem_errors)

    # Unless nosem handling is disabled, drop ignored matches and count them.
    num_findings_nosem = 0
    if not disable_nosem:
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    # Best-effort anonymous metrics: project url lookup may fail (no git,
    # no remote) and is only logged at debug level.
    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, all_targets,
                                       filtered_rules)

    # Emit findings, debug steps, stats and timing data.
    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
Example #17
0
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    testing: bool = False,
    severity: Optional[List[str]] = None,
) -> None:
    """Run a semgrep scan and report results through ``output_handler``.

    Loads rules from ``pattern``/``lang`` or from ``configs``, filters them
    by ``severity``, collects targets with a TargetManager, invokes
    semgrep-core via CoreRunner, applies ``nosem`` ignore handling, and
    hands the results to ``output_handler``. When ``autofix`` is set, fixes
    are applied (only printed if ``dryrun``).

    Raises:
        SemgrepError: when no valid configuration was found, or when
            ``strict`` is set and any config failed to load.
    """
    # Normalize the optional include/exclude filters to empty lists.
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    # Load rule configurations; config parse failures come back as `errors`.
    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Keep only rules whose severity was requested (empty list = keep all).
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    # Under --strict, any config-loading error is fatal.
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    # Config-driven runs (no -e/--pattern): summarize what will run and
    # fail if nothing valid was loaded.
    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.debug(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            raise SemgrepError(
                f"no valid configuration file found ({len(errors)} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )

        notify_user_of_work(filtered_rules, include, exclude)

    # Resolve which files to scan, honoring .gitignore unless disabled.
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    # actually invoke semgrep
    rule_matches_by_rule, debug_steps_by_rule, semgrep_errors, num_targets = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        testing=testing,
    ).invoke_semgrep(target_manager, filtered_rules)

    output_handler.handle_semgrep_errors(semgrep_errors)

    # Mark each match ignored/not-ignored based on `# nosem` comments.
    rule_matches_by_rule = {
        rule: [
            attr.evolve(rule_match,
                        is_ignored=rule_match_nosem(rule_match, strict))
            for rule_match in rule_matches
        ]
        for rule, rule_matches in rule_matches_by_rule.items()
    }

    # Unless nosem handling is disabled, drop the ignored matches.
    if not disable_nosem:
        rule_matches_by_rule = {
            rule: [
                rule_match for rule_match in rule_matches
                if not rule_match._is_ignored
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {num_targets} files: {num_findings} findings"

    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule, stats_line)

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
Example #18
0
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
) -> None:
    """Run a semgrep scan and report results through ``output_handler``.

    Loads rules from ``pattern``/``lang`` or from ``configs``, filters them
    by ``severity``, collects targets with a TargetManager, invokes
    semgrep-core via CoreRunner, applies ``nosem`` ignore handling, and
    hands results (with the scanned targets and profiler) to
    ``output_handler``. When ``autofix`` is set, fixes are applied (only
    printed if ``dryrun``).

    Raises:
        SemgrepError: when no valid configuration was found, or when
            ``strict`` is set and any config failed to load.
    """
    # Normalize the optional include/exclude filters to empty lists.
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    # Load rule configurations; config parse failures come back as `errors`.
    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Keep only rules whose severity was requested (empty list = keep all).
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    # Under --strict, any config-loading error is fatal.
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    # Config-driven runs (no -e/--pattern): summarize what will run and
    # fail early if nothing valid was loaded.
    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.debug(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    # Resolve which files to scan, honoring .gitignore unless disabled.
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiler,
    ) = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
    ).invoke_semgrep(target_manager, filtered_rules)

    output_handler.handle_semgrep_errors(semgrep_errors)

    # Mark each match ignored/not-ignored based on `# nosem` comments.
    rule_matches_by_rule = {
        rule: [
            attr.evolve(rule_match,
                        is_ignored=rule_match_nosem(rule_match, strict))
            for rule_match in rule_matches
        ]
        for rule, rule_matches in rule_matches_by_rule.items()
    }

    # Unless nosem handling is disabled, drop the ignored matches.
    if not disable_nosem:
        rule_matches_by_rule = {
            rule: [
                rule_match for rule_match in rule_matches
                if not rule_match._is_ignored
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule, stats_line,
                                              all_targets, profiler)

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
Example #19
0
def main(
    *,
    output_handler: OutputHandler,
    target: Sequence[str],
    pattern: Optional[str],
    lang: Optional[str],
    configs: Sequence[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    replacement: Optional[str] = None,
    dryrun: bool = False,
    disable_nosem: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[Sequence[str]] = None,
    optimizations: str = "none",
) -> None:
    """Run a full semgrep scan and report everything through ``output_handler``.

    Loads rules from ``pattern``/``lang``/``replacement`` or from
    ``configs``, filters them by ``severity``, splits off join-mode rules,
    gathers targets with a TargetManager, invokes semgrep-core via
    CoreRunner, runs join rules separately, applies ``nosem`` handling via
    ``process_ignores``, optionally records metrics, and finally hands
    results to ``output_handler``. When ``autofix`` is set, fixes are
    applied (only printed if ``dryrun``).

    Raises:
        SemgrepError: when no valid configuration was found, or when
            ``strict`` is set and any config failed to load.
    """
    # Normalize the optional include/exclude filters to empty lists.
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    # Load rule configurations; config parse failures come back as `errors`.
    configs_obj, errors = get_config(pattern, lang, configs, replacement)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Keep only rules whose severity was requested (falsy = keep all).
    if not severity:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity.value in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    # Under --strict, any config-loading error is fatal.
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    # Config-driven runs (no -e/--pattern): summarize what will run and
    # fail early if nothing valid was loaded.
    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
            .strip())

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    # Resolve which files to scan, honoring .gitignore unless disabled.
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Join-mode rules run through a separate engine below; keep the rest
    # for the normal CoreRunner invocation.
    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)

    # Run join-mode rules and merge their matches into the main results.
    if join_rules:
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets])
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)

    profiler.save("total_time", start_time)

    # Apply `# nosem` ignore handling to the collected matches.
    filtered_matches = process_ignores(rule_matches_by_rule,
                                       output_handler,
                                       strict=strict,
                                       disable_nosem=disable_nosem)

    output_handler.handle_semgrep_errors(semgrep_errors)
    output_handler.handle_semgrep_errors(filtered_matches.errors)

    num_findings = sum(len(v) for v in filtered_matches.matches.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    # Best-effort anonymous metrics: project url lookup may fail (no git,
    # no remote) and is only logged at debug level.
    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(filtered_matches.num_matches)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, list(all_targets),
                                       filtered_rules)

    # Emit findings, debug steps, stats and timing data.
    output_handler.handle_semgrep_core_output(
        filtered_matches.matches,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(filtered_matches.matches, dryrun)