Esempio n. 1
0
    def report_results(self, results: Results) -> None:
        """
        Upload the outcome of a scan to the semgrep app.

        Three POST requests are sent in order: the new findings, the ignored
        findings, and a completion marker carrying the scan stats.

        :param results: The scan results to report
        :raises ActionFailure: If any of the three requests fails
        """
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        scan_endpoint = f"{self.url}/api/agent/scan/{self.scan.id}"

        # 1. new findings, stripped of privacy sensitive fields
        findings_payload = {
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.new
            ],
        }
        response = self.session.post(
            f"{scan_endpoint}/findings", json=findings_payload, timeout=30
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            # the server may attach non-fatal warnings to a successful response
            for server_error in response.json()["errors"]:
                warning_text = server_error["message"]
                click.echo(
                    f"Server returned following warning: {warning_text}", err=True
                )
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 2. ignored findings
        ignores_payload = {
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.ignored
            ],
        }
        response = self.session.post(
            f"{scan_endpoint}/ignores", json=ignores_payload, timeout=30
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 3. mark as complete
        completion_payload = {"exit_code": -1, "stats": results.stats}
        response = self.session.post(
            f"{scan_endpoint}/complete", json=completion_payload, timeout=30
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
Esempio n. 2
0
    def _find_branchoff_point(self, attempt_count: int = 0) -> str:
        """
        Return the merge base of the baseline tip and the head ref.

        When git cannot find one yet, both refs are fetched again with a
        larger depth and the lookup is retried recursively.

        :param attempt_count: Number of retries done so far; 0 skips fetching
        :raises ActionFailure: If no branch-off point is found on the last attempt
        """
        is_last_attempt = attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT
        # fetch 4, 16, 64, 256, 1024, ... commits, and all commits on the last try
        # (git expects a signed 32-bit integer)
        fetch_depth = 2 ** 31 - 1 if is_last_attempt else 4 ** attempt_count

        if attempt_count != 0:  # the first try works with what is already there
            debug_echo(
                f"fetching {fetch_depth} commits to find branch-off point of pull request"
            )
            for ref in (self.base_branch_tip, self.head_ref):
                git.fetch("origin", "--depth", fetch_depth, ref)

        # check if both branches connect to the yet-unknown branch-off point now
        try:
            process = git("merge-base", self.base_branch_tip, self.head_ref)
        except sh.ErrorReturnCode as error:
            stderr_text = error.stderr.decode().strip()
            # stderr is empty when unable to find the branch-off point, and
            # mentions "Not a valid " when a ref is missing; both are retried
            if stderr_text and "Not a valid " not in stderr_text:
                exit_with_sh_error(error)

            if is_last_attempt:
                raise ActionFailure(
                    "Could not find branch-off point between "
                    f"the baseline tip {self.base_branch_tip} and current head '{self.head_ref}' "
                )

            return self._find_branchoff_point(attempt_count + 1)

        return process.stdout.decode().strip()
Esempio n. 3
0
    def _abort_on_conflicting_untracked_paths(self) -> None:
        """
        Refuse to continue when an untracked path also changed since the baseline.

        Does nothing outside of a git repo or when no baseline commit is set.

        :raises ActionFailure: If any untracked path is also an added, modified,
                               removed or unmerged path
        """
        repo = get_git_repo()
        if not repo or self._base_commit is None:
            return

        changed_paths = set(
            self._status.added
            + self._status.modified
            + self._status.removed
            + self._status.unmerged
        )

        untracked_paths = set()
        for untracked in self._dirty_paths_by_status.get(StatusCode.Untracked, []):
            untracked_paths.add(self._fname_to_path(repo, str(untracked)))

        overlapping_paths = untracked_paths & changed_paths
        if not overlapping_paths:
            return

        raise ActionFailure(
            "Some paths that changed since the baseline commit now show up as untracked files. "
            f"Please commit or stash your untracked changes in these paths: {overlapping_paths}."
        )
Esempio n. 4
0
    def report_failure(self, stderr: str, exit_code: int) -> int:
        """
        Tell the server that semgrep cli exited with a non-zero code.

        :param stderr: The stderr text to send along
        :param exit_code: The exit code semgrep finished with
        :return: The exit code the server says semgrep should exit with
        :raises ActionFailure: If the request fails
        """
        debug_echo("=== sending failure information to semgrep app")

        error_endpoint = f"{self.url}/api/agent/scan/{self.scan.id}/error"
        failure_payload = {"exit_code": exit_code, "stderr": stderr}
        response = self.session.post(error_endpoint, json=failure_payload, timeout=30)

        debug_echo(f"=== POST .../error responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        return int(response.json()["exit_code"])
Esempio n. 5
0
 def __post_init__(self) -> None:
     """
     Finish initialization: set the configured flag and build the HTTP session.

     :raises ActionFailure: If configured but the publish token fails validation
     """
     if self.token and self.deployment_id:
         self.is_configured = True

     if self.is_configured:
         token_is_valid = validate_publish_token(self.token)
         if not token_is_valid:
             raise ActionFailure(
                 f"Received invalid publish token, token length {len(self.token)}. "
                 f"Please check your publish token.")

     # the session is created whether or not the instance is configured
     self.session = requests.Session()
     self.session.headers["Authorization"] = f"Bearer {self.token}"
Esempio n. 6
0
    def _abort_on_pending_changes(self) -> None:
        """
        Refuse to continue when any status other than untracked is present.

        :raises ActionFailure: If the git repo is not in a clean state
        """
        pending_statuses = set(self._dirty_paths_by_status)
        pending_statuses.discard(StatusCode.Untracked)  # untracked files are tolerated

        if pending_statuses:
            raise ActionFailure(
                "Found pending changes in tracked files. Diff-aware runs require a clean git state."
            )
Esempio n. 7
0
def _fix_head_for_github(
    base_commit_ref: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    GHA can checkout the incorrect commit for a PR (it will create a fake merge commit),
    so we need to reset the head to the actual PR branch head before continuing.

    Note that this code is written in a generic manner, so that it becomes a no-op when
    the CI system has not artificially altered the HEAD ref.

    This is a generator that yields exactly once; the original revision is
    checked out again after the yield (presumably it is wrapped as a context
    manager by a decorator outside this block -- TODO confirm).

    :param base_commit_ref: The baseline ref; resolved with ``git rev-parse`` when given
    :param head_ref: The ref to check out for the duration of the block, if any
    :yields: The baseline ref as a commit hash, or the unresolved input when
             not inside a git repo or when no baseline was given
    :raises ActionFailure: If ``git rev-parse`` fails on the baseline ref
    """

    # what to check out again when we are done; stays None if HEAD is never moved
    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_commit_ref

    # outside of a git repo there is nothing to fix: hand back the ref untouched
    if get_git_repo() is None:
        yield base_ref
        return

    if base_ref:
        # Preserve location of head^ after we possibly change location below
        try:
            process = git(["rev-parse", base_ref])
            base_ref = process.stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(f"There is a problem with your git project:{ex}")

    if head_ref:
        # remember the current branch name, falling back to the HEAD commit hash
        # when `git branch --show-current` prints nothing
        stashed_rev = git(["branch", "--show-current"]).stdout.decode("utf-8").rstrip()
        if not stashed_rev:
            stashed_rev = git(["rev-parse", "HEAD"]).stdout.decode("utf-8").rstrip()
        click.echo(f"| not on head ref {head_ref}; checking that out now...", err=True)
        git.checkout([head_ref])

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:", err=True)
            print_git_log(f"{merge_base}..HEAD")
            # the baseline has commits of its own when the merge base is not the baseline itself
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo("| also reporting findings fixed by these commits from the baseline branch:", err=True)
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with", err=True)
                click.echo(f"|   --baseline-ref $(git merge-base {base_commit_ref} HEAD)", err=True)
            # fmt: on

        yield base_ref
    finally:
        # go back to where we started, even if the caller's block raised
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}", err=True)
            git.checkout([stashed_rev])
Esempio n. 8
0
    def fetch_rules_text(self) -> str:
        """
        Get a YAML string with the configured semgrep rules in it.

        :return: The response body exactly as returned by the server
        :raises ActionFailure: If the request fails, or if the response does not
                               contain at least one rule
        """
        response = self.session.get(
            f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
            timeout=30,
        )
        debug_echo(f"=== GET .../rules.yaml responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException as error:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}\n"
                "Failed to get configured rules") from error

        # Can remove once server guarantees will always have at least one rule
        # NOTE(review): this parses text received from the server. That is only
        # safe if `yaml` is a safe loader (e.g. a ruamel.yaml YAML instance);
        # if it is the PyYAML module, switch to `yaml.safe_load` -- confirm.
        parsed = yaml.load(response.text)

        # An empty body parses to None and a document may lack the "rules" key;
        # treat both the same as an empty rule list instead of crashing.
        rules = parsed.get("rules") if isinstance(parsed, dict) else None
        if not rules:
            raise ActionFailure("No rules returned by server for this scan.")

        return response.text
Esempio n. 9
0
    def report_start(self, meta: GitMeta) -> None:
        """
        Register a new scan with the semgrep app and remember its settings.

        The scan id, ignore patterns, policy list and autofix flag from the
        response are stored on ``self.scan``; nothing is returned.

        :param meta: Repository metadata, sent to the server as ``meta.to_dict()``
        :raises ActionFailure: If the server answers 404 or any other error status
        """
        debug_echo(f"=== reporting start to semgrep app at {self.url}")

        response = self.session.post(
            f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
            json={"meta": meta.to_dict()},
            timeout=30,
        )

        debug_echo(f"=== POST .../scan responded: {response!r}")

        # a 404 gets a dedicated message hinting at misconfigured credentials
        if response.status_code == 404:
            raise ActionFailure(
                "Failed to create a scan with given token and deployment_id. "
                "Please make sure they have been set correctly. "
                f"API server at {self.url} returned this response: {response.text}"
            )

        try:
            response.raise_for_status()
        except requests.RequestException as error:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            ) from error

        body = response.json()
        self.scan = Scan(
            id=glom(body, T["scan"]["id"]),
            ignore_patterns=glom(
                body, T["scan"]["meta"].get("ignored_files", [])),
            policy_list=glom(body, T["policy"]),
            autofix=glom(body, T.get("autofix", False)),
        )
        debug_echo(f"=== Our scan object is: {self.scan!r}")
Esempio n. 10
0
    def __post_init__(self) -> None:
        """
        Set up the HTTP session and look up the deployment when a token is set.

        Without a token nothing happens.

        :raises ActionFailure: If the token fails the length validation
        """
        if not self.token:
            return

        self.is_configured = True

        session = requests.Session()
        self.session = session
        session.mount("https://", RETRYING_ADAPTER)
        session.headers["Authorization"] = f"Bearer {self.token}"

        if not validate_token_length(self.token):
            raise ActionFailure(
                "Received invalid publish token. Length is too short.")

        self.get_deployment_from_token(self.token)
Esempio n. 11
0
 def get_deployment_from_token(self, token: str) -> None:
     """
     Ask the server for the deployment and store its id and name on self.

     :param token: Not read by this method's body
     :raises ActionFailure: If the request fails
     """
     deployment_endpoint = f"{self.url}/api/agent/deployment"
     response = self.session.get(deployment_endpoint, json={}, timeout=30)

     try:
         response.raise_for_status()
     except requests.RequestException:
         raise ActionFailure(
             f"API server at {self.url} returned this error: {response.text}\n"
             "Failed to get deployment")

     deployment = response.json().get("deployment")
     self.deployment_id = deployment.get("id")
     self.deployment_name = deployment.get("name")
Esempio n. 12
0
    def _baseline_context(self) -> Iterator[None]:
        """
        Runs a block of code with the files of the baseline commit checked out.

        Before yielding, paths listed as added are deleted and the contents of
        ``self._base_commit`` are checked out over the working tree. Afterwards
        the tree recorded before the switch is checked out again and paths
        listed as removed are deleted with ``git rm -f``.

        Outside of a git repo this only yields.

        :raises ActionFailure: If restoring the recorded tree fails for any reason
                               other than the "pathspec '.' did not match" error
        :raises ActionFailure: Presumably also from the two ``_abort_on_*`` checks
                               when the repo is not in a clean state -- confirm
        """
        repo = get_git_repo()

        if not repo:
            yield
            return

        # refuse to touch the working tree unless the pre-flight checks pass
        self._abort_on_pending_changes()
        self._abort_on_conflicting_untracked_paths()

        # record the current index as a tree so it can be checked out again below
        current_tree = git("write-tree").stdout.decode().strip()
        try:
            # presumably needed because checking out the baseline paths does not
            # delete files that are missing from the baseline -- confirm
            for a in self._status.added:
                a.unlink()
            git.checkout(self._base_commit, "--", ".")
            yield
        finally:
            # git checkout will fail if the checked-out index deletes all files in the repo
            # In this case, we still want to continue without error.
            # Note that we have no good way of detecting this issue without inspecting the checkout output
            # message, which means we are fragile with respect to git version here.
            try:
                git.checkout(current_tree.strip(), "--", ".")
            except sh.ErrorReturnCode as error:
                output = error.stderr.decode()
                if (output and len(output) >= 2 and
                        "pathspec '.' did not match any file(s) known to git"
                        in output.strip()):
                    debug_echo(
                        "Restoring git index failed due to total repository deletion; skipping checkout"
                    )
                else:
                    raise ActionFailure(
                        f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                    )

            if self._status.removed:
                # Need to check if file exists since it is possible file was deleted
                # in both the base and head. Only call if there are files to delete
                to_remove = [r for r in self._status.removed if r.exists()]
                if to_remove:
                    git.rm("-f", *(str(r) for r in to_remove))
Esempio n. 13
0
 def expand_directives(self, line: str) -> Iterable[str]:
     """
     Expand one line of an ignore file, loading ``:include`` files.

     :param line: A single line of the ignore file
     :return: The parsed lines of the included file, nothing when that file is
              missing, or the line itself when it is not a directive
     :raises ActionFailure: If the line matches CONTROL_REGEX but is not an include
     """
     include_prefix = ":include "

     if not line.startswith(include_prefix):
         if CONTROL_REGEX.match(line):
             raise ActionFailure(
                 f"Unknown ignore directive in Semgrep ignore file at {self.base_path}: '{line}'"
             )
         # a plain line passes through as a one-item generator
         return (text for text in (line,))

     include_path = self.base_path / line[len(include_prefix):]
     if not include_path.is_file():
         debug_echo(
             f"Skipping `:include {include_path}` directive, file not found"
         )
         return []

     # included files are parsed relative to their own directory
     with include_path.open() as include_lines:
         sub_parser = Parser(include_path.parent.resolve())
         return sub_parser.parse(include_lines)
Esempio n. 14
0
    def get_git_status(self) -> GitStatus:
        """
        Returns Absolute Paths to all files that are staged

        Ignores files that are symlinks to directories

        The paths are grouped by how they differ from ``self._base_commit``:
        added, modified, removed and unmerged. A rename is reported as the old
        path being removed and the new path being added.

        :return: Four empty lists when not in a git repo or when no baseline
                 commit is set
        :raises ActionFailure: If the baseline commit is not a known git ref
        """
        import gitdb.exc  # type: ignore

        repo = get_git_repo()

        if not repo or self._base_commit is None:
            return GitStatus([], [], [], [])

        try:
            repo.rev_parse(self._base_commit)
        except gitdb.exc.BadName:
            raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

        # Output of git command will be relative to git project root
        status_output = zsplit(
            git.diff(
                "--cached",
                "--name-status",
                "-z",
                "--no-ext-diff",
                "--diff-filter=ACDMRTUXB",
                "--ignore-submodules",
                self._base_commit,
            ).stdout.decode())

        added = []
        modified = []
        removed = []
        unmerged = []
        # The output is a flat token list: a status code followed by one path,
        # or by two paths (old, new) for a rename.
        while status_output:
            code = status_output[0]

            if not code.strip():
                # A blank token is not a status code. Drop it on its own so the
                # loop makes progress and the next token is read as the code.
                status_output = status_output[1:]
                continue

            fname = status_output[1]
            trim_size = 2

            if code == StatusCode.Untracked or code == StatusCode.Ignored:
                # Skip the code together with its path; without consuming them
                # the loop would never terminate.
                status_output = status_output[trim_size:]
                continue

            resolved_name = self._fname_to_path(repo, fname)

            # If file is symlink to directory, skip
            absolute_name = Path(repo.working_tree_dir) / fname
            if absolute_name.is_symlink() and resolved_name.is_dir():
                click.echo(
                    f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                    err=True,
                )
            else:
                # The following detection for unmerged codes comes from `man git-status`
                if code == StatusCode.Unmerged:
                    unmerged.append(resolved_name)
                if code[0] == StatusCode.Renamed:
                    # code is RXXX, where XXX is percent similarity; it is
                    # followed by the old path and then the new path
                    removed.append(resolved_name)
                    trim_size += 1
                    # the new path is the one that got added, not the old one
                    added.append(self._fname_to_path(repo, status_output[2]))
                if code == StatusCode.Added:
                    added.append(resolved_name)
                if code == StatusCode.Modified:
                    modified.append(resolved_name)
                if code == StatusCode.Deleted:
                    removed.append(resolved_name)

            status_output = status_output[trim_size:]
        debug_echo(
            f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
        )

        return GitStatus(added, modified, removed, unmerged)
Esempio n. 15
0
    def report_results(self, results: Results) -> None:
        """
        Upload the outcome of a scan to the semgrep app.

        Three POST requests are sent in order: the new findings, the ignored
        findings, and a completion marker carrying the scan stats. The
        "fixed_lines" field is kept in the new findings only when the
        SEMGREP_AGENT_OPT_IN_FEATURES environment variable contains
        "pr-comment-autofix".

        :param results: The scan results to report
        :raises ActionFailure: If any of the three requests fails
        """
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        # work on a copy so the shared constant is never modified
        fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
        opt_in_features = os.getenv("SEMGREP_AGENT_OPT_IN_FEATURES", "")
        if "pr-comment-autofix" in opt_in_features:
            fields_to_omit.remove("fixed_lines")

        scan_endpoint = f"{self.url}/api/agent/scan/{self.scan.id}"

        # 1. new findings
        findings_payload = {
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
        }
        response = self.session.post(
            f"{scan_endpoint}/findings", json=findings_payload, timeout=30
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            # the server may attach non-fatal warnings to a successful response
            for server_error in response.json()["errors"]:
                warning_text = server_error["message"]
                click.echo(
                    f"Server returned following warning: {warning_text}", err=True
                )
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 2. ignored findings, sent with all of their fields
        ignores_payload = {
            "findings": [finding.to_dict() for finding in results.findings.ignored],
        }
        response = self.session.post(
            f"{scan_endpoint}/ignores", json=ignores_payload, timeout=30
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 3. mark as complete
        completion_payload = {"exit_code": -1, "stats": results.stats}
        response = self.session.post(
            f"{scan_endpoint}/complete", json=completion_payload, timeout=30
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
Esempio n. 16
0
    def report_results(self, results: Results, rule_ids: Sequence[str],
                       cai_ids: Sequence[str]) -> None:
        """
        Upload the outcome of a scan to the semgrep app.

        Three POST requests are sent in order: the new findings (together with
        the searched paths and the given ids), the newly ignored findings, and
        a completion marker carrying the exit code and scan stats. The
        "fixed_lines" field is kept in the new findings only when autofix is
        enabled for the scan.

        :param results: The scan results to report
        :param rule_ids: Sent to the server as "rule_ids"
        :param cai_ids: Sent to the server as "cai_ids"
        :raises ActionFailure: If any of the three requests fails
        """
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        # work on a copy so the shared constant is never modified
        fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
        if self.scan.autofix:
            fields_to_omit.remove("fixed_lines")

        scan_endpoint = f"{self.url}/api/agent/scan/{self.scan.id}"

        # 1. new findings
        findings_payload = {
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "gitlab_token": os.getenv("GITLAB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
            "searched_paths": [str(p) for p in results.findings.searched_paths],
            "rule_ids": rule_ids,
            "cai_ids": cai_ids,
        }
        response = self.session.post(
            f"{scan_endpoint}/findings", json=findings_payload, timeout=30
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            # the server may attach non-fatal warnings to a successful response
            for server_error in response.json()["errors"]:
                warning_text = server_error["message"]
                click.echo(
                    f"Server returned following warning: {warning_text}", err=True
                )
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 2. newly ignored findings, sent with all of their fields
        ignores_payload = {
            "findings": [
                finding.to_dict() for finding in results.findings.new_ignored
            ],
        }
        response = self.session.post(
            f"{scan_endpoint}/ignores", json=ignores_payload, timeout=30
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # 3. mark as complete
        # In order to not overload our app database, we truncate target stats to the 20 heaviest hitters. This adds
        # approximately 80 kB of database load per scan when using p/ci.
        completion_payload = {
            "exit_code": results.findings.max_exit_code,
            "stats": results.stats(n_heavy_targets=20),
        }
        response = self.session.post(
            f"{scan_endpoint}/complete", json=completion_payload, timeout=30
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )