def report_results(self, results: Results) -> None:
    """Upload scan results to the semgrep app.

    Performs three sequential POSTs against the app API:
    ``…/findings`` (new findings), ``…/ignores`` (ignored findings),
    and ``…/complete`` (marks the scan finished).

    :param results: Findings and stats collected by the scan.
    :raises ActionFailure: If any of the three requests fails.
    """
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    # NOTE: removed a dead `response = None` assignment that was
    # immediately overwritten by the first POST below.
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.new
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()
        # Surface any per-finding warnings the server reported.
        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException as err:
        # Chain the original exception so the HTTP failure is visible in tracebacks.
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.ignored
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    # mark as complete
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={"exit_code": -1, "stats": results.stats},
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        ) from err
def _find_branchoff_point(self, attempt_count: int = 0) -> str: fetch_depth = 4 ** attempt_count # fetch 4, 16, 64, 256, 1024, ... if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT: # get all commits on last try fetch_depth = 2 ** 31 - 1 # git expects a signed 32-bit integer if attempt_count: # skip fetching on first try debug_echo( f"fetching {fetch_depth} commits to find branch-off point of pull request" ) git.fetch("origin", "--depth", fetch_depth, self.base_branch_tip) git.fetch("origin", "--depth", fetch_depth, self.head_ref) try: # check if both branches connect to the yet-unknown branch-off point now process = git("merge-base", self.base_branch_tip, self.head_ref) except sh.ErrorReturnCode as error: output = error.stderr.decode().strip() if ( output # output is empty when unable to find branch-off point and "Not a valid " not in output # the error when a ref is missing ): exit_with_sh_error(error) if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT: raise ActionFailure( "Could not find branch-off point between " f"the baseline tip {self.base_branch_tip} and current head '{self.head_ref}' " ) return self._find_branchoff_point(attempt_count + 1) else: return process.stdout.decode().strip()
def _abort_on_conflicting_untracked_paths(self) -> None:
    """
    Raises ActionFailure if untracked paths were touched in the baseline, too.

    :raises ActionFailure: If the git repo is not in a clean state
    """
    repo = get_git_repo()

    # Nothing to check without a repo or a baseline commit.
    if not repo or self._base_commit is None:
        return

    # All paths that differ from the baseline commit.
    changed_paths = {
        *self._status.added,
        *self._status.modified,
        *self._status.removed,
        *self._status.unmerged,
    }

    # Paths git currently reports as untracked, resolved to absolute paths.
    untracked_paths = set()
    for dirty_path in self._dirty_paths_by_status.get(StatusCode.Untracked, []):
        untracked_paths.add(self._fname_to_path(repo, str(dirty_path)))

    overlapping_paths = untracked_paths & changed_paths
    if overlapping_paths:
        raise ActionFailure(
            "Some paths that changed since the baseline commit now show up as untracked files. "
            f"Please commit or stash your untracked changes in these paths: {overlapping_paths}."
        )
def report_failure(self, stderr: str, exit_code: int) -> int:
    """
    Send semgrep cli non-zero exit code information to server
    and return what exit code semgrep should exit with.

    :param stderr: Captured standard error output of the failed semgrep run.
    :param exit_code: The non-zero exit code semgrep cli returned.
    :return: The exit code the server instructs us to exit with.
    :raises ActionFailure: If the error report request itself fails.
    """
    # No placeholders here, so a plain string (the old f-prefix was a no-op).
    debug_echo("=== sending failure information to semgrep app")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/error",
        json={
            "exit_code": exit_code,
            "stderr": stderr,
        },
        timeout=30,
    )

    debug_echo(f"=== POST .../error responded: {response!r}")

    try:
        response.raise_for_status()
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    # The server decides the final exit code (it may downgrade failures).
    exit_code = int(response.json()["exit_code"])
    return exit_code
def __post_init__(self) -> None:
    """Validate the publish token and set up an authenticated HTTP session."""
    # Configured only when both credentials are present.
    if self.token and self.deployment_id:
        self.is_configured = True

    token_is_invalid = self.is_configured and not validate_publish_token(self.token)
    if token_is_invalid:
        raise ActionFailure(
            f"Received invalid publish token, token length {len(self.token)}. "
            f"Please check your publish token."
        )

    session = requests.Session()
    session.headers["Authorization"] = f"Bearer {self.token}"
    self.session = session
def _abort_on_pending_changes(self) -> None:
    """
    Raises ActionFailure if any tracked files are changed.

    :raises ActionFailure: If the git repo is not in a clean state
    """
    # Any dirty status code other than "untracked" means tracked files changed.
    dirty_tracked_codes = set(self._dirty_paths_by_status) - {StatusCode.Untracked}
    if dirty_tracked_codes:
        raise ActionFailure(
            "Found pending changes in tracked files. Diff-aware runs require a clean git state."
        )
def _fix_head_for_github(
    base_commit_ref: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    GHA can checkout the incorrect commit for a PR (it will create a fake merge commit),
    so we need to reset the head to the actual PR branch head before continuing.

    Note that this code is written in a generic manner, so that it becomes a no-op when
    the CI system has not artifically altered the HEAD ref.

    :return: The baseline ref as a commit hash
    """
    # Revision to restore when the context exits (branch name or commit hash).
    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_commit_ref

    # Outside a git repo there is nothing to fix up.
    if get_git_repo() is None:
        yield base_ref
        return

    if base_ref:
        # Preserve location of head^ after we possibly change location below
        try:
            process = git(["rev-parse", base_ref])
            base_ref = process.stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(f"There is a problem with your git project:{ex}")

    if head_ref:
        # Remember where we are: prefer the branch name, fall back to the commit hash
        # when in detached-HEAD state (`branch --show-current` prints nothing there).
        stashed_rev = git(["branch", "--show-current"]).stdout.decode("utf-8").rstrip()
        if not stashed_rev:
            stashed_rev = git(["rev-parse", "HEAD"]).stdout.decode("utf-8").rstrip()
        click.echo(f"| not on head ref {head_ref}; checking that out now...", err=True)
        git.checkout([head_ref])

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:", err=True)
            print_git_log(f"{merge_base}..HEAD")
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo("| also reporting findings fixed by these commits from the baseline branch:", err=True)
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with", err=True)
                click.echo(f"| --baseline-ref $(git merge-base {base_commit_ref} HEAD)", err=True)
            # fmt: on

        yield base_ref
    finally:
        # Always return to the original revision, even if the body raised.
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}", err=True)
            git.checkout([stashed_rev])
def fetch_rules_text(self) -> str:
    """Get a YAML string with the configured semgrep rules in it.

    :return: The raw YAML text returned by the server.
    :raises ActionFailure: If the request fails or the server returns no rules.
    """
    response = self.session.get(
        f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
        timeout=30,
    )
    # BUG FIX: this request is a GET; the old debug message said POST.
    debug_echo(f"=== GET .../rules.yaml responded: {response!r}")

    try:
        response.raise_for_status()
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}\n"
            "Failed to get configured rules"
        ) from err

    # Can remove once server guarantees will always have at least one rule
    parsed = yaml.load(response.text)
    if not parsed["rules"]:
        raise ActionFailure("No rules returned by server for this scan.")
    else:
        return response.text
def report_start(self, meta: GitMeta) -> None:
    """
    Get scan id and file ignores

    returns name of policy used to scan

    :param meta: Git metadata describing the repository being scanned.
    :raises ActionFailure: If the server rejects the token/deployment_id
        (404) or the request otherwise fails.
    """
    debug_echo(f"=== reporting start to semgrep app at {self.url}")

    response = self.session.post(
        f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
        json={"meta": meta.to_dict()},
        timeout=30,
    )

    debug_echo(f"=== POST .../scan responded: {response!r}")

    if response.status_code == 404:
        # BUG FIX: the concatenated sentences were missing separating spaces.
        raise ActionFailure(
            "Failed to create a scan with given token and deployment_id. "
            "Please make sure they have been set correctly. "
            f"API server at {self.url} returned this response: {response.text}"
        )

    try:
        response.raise_for_status()
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        ) from err
    else:
        body = response.json()
        self.scan = Scan(
            id=glom(body, T["scan"]["id"]),
            ignore_patterns=glom(body, T["scan"]["meta"].get("ignored_files", [])),
            policy_list=glom(body, T["policy"]),
            autofix=glom(body, T.get("autofix", False)),
        )
        debug_echo(f"=== Our scan object is: {self.scan!r}")
def __post_init__(self) -> None:
    """Set up an authenticated, retrying HTTP session and resolve the deployment.

    :raises ActionFailure: If the token is present but fails length validation.
    """
    if self.token:
        self.is_configured = True
        self.session = requests.Session()
        # Retry transient failures for HTTPS requests.
        self.session.mount("https://", RETRYING_ADAPTER)
        self.session.headers["Authorization"] = f"Bearer {self.token}"
        if validate_token_length(self.token):
            self.get_deployment_from_token(self.token)
        else:
            # No placeholders, so a plain string (the old f-prefix was a no-op).
            raise ActionFailure(
                "Received invalid publish token. Length is too short."
            )
def get_deployment_from_token(self, token: str) -> None:
    """Look up the deployment that *token* belongs to and store its id/name.

    The token itself is carried by the session's Authorization header;
    the `token` parameter is kept for interface compatibility.

    :raises ActionFailure: If the deployment lookup request fails.
    """
    response = self.session.get(
        f"{self.url}/api/agent/deployment",
        json={},
        timeout=30,
    )
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}\n"
            "Failed to get deployment"
        ) from err

    data = response.json()
    # NOTE(review): assumes the response always has a "deployment" object;
    # a missing key would surface as AttributeError here — confirm with the API.
    self.deployment_id = data.get("deployment").get("id")
    self.deployment_name = data.get("deployment").get("name")
def _baseline_context(self) -> Iterator[None]:
    """
    Runs a block of code on files from the current branch HEAD.

    Checks out the baseline commit's files over the working tree for the
    duration of the block, then restores the saved index state afterwards.

    :raises ActionFailure: If git cannot detect a HEAD commit
    :raises ActionFailure: If unmerged files are detected
    """
    repo = get_git_repo()
    if not repo:
        # Not a git repo: run the block against the files as-is.
        yield
        return

    self._abort_on_pending_changes()
    self._abort_on_conflicting_untracked_paths()

    # Snapshot the current index as a tree object so we can restore it later.
    current_tree = git("write-tree").stdout.decode().strip()
    try:
        # Remove files added since the baseline so they don't pollute the scan.
        for a in self._status.added:
            a.unlink()
        git.checkout(self._base_commit, "--", ".")
        yield
    finally:
        # git checkout will fail if the checked-out index deletes all files in the repo
        # In this case, we still want to continue without error.
        # Note that we have no good way of detecting this issue without inspecting the checkout output
        # message, which means we are fragile with respect to git version here.
        try:
            git.checkout(current_tree.strip(), "--", ".")
        except sh.ErrorReturnCode as error:
            output = error.stderr.decode()
            if (
                output
                and len(output) >= 2
                and "pathspec '.' did not match any file(s) known to git"
                in output.strip()
            ):
                debug_echo(
                    "Restoring git index failed due to total repository deletion; skipping checkout"
                )
            else:
                raise ActionFailure(
                    f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                )

        if self._status.removed:
            # Need to check if file exists since it is possible file was deleted
            # in both the base and head. Only call if there are files to delete
            to_remove = [r for r in self._status.removed if r.exists()]
            if to_remove:
                git.rm("-f", *(str(r) for r in to_remove))
def expand_directives(self, line: str) -> Iterable[str]:
    """Load :include files

    Expands an ``:include <path>`` directive into the parsed lines of the
    referenced file; other lines pass through unchanged.

    :param line: A single (already stripped) line from an ignore file.
    :return: The expanded lines for this directive, or the line itself.
    :raises ActionFailure: On an unrecognized control directive.
    """
    # Derive the slice length from the directive itself instead of the
    # previous magic constant 9, so the two can never drift apart.
    include_directive = ":include "
    if line.startswith(include_directive):
        include_path = self.base_path / line[len(include_directive):]
        if include_path.is_file():
            with include_path.open() as include_lines:
                sub_base = include_path.parent.resolve()
                sub_parser = Parser(sub_base)
                return sub_parser.parse(include_lines)
        else:
            debug_echo(
                f"Skipping `:include {include_path}` directive, file not found"
            )
            return []
    elif CONTROL_REGEX.match(line):
        raise ActionFailure(
            f"Unknown ignore directive in Semgrep ignore file at {self.base_path}: '{line}'"
        )
    else:
        # A plain pattern line expands to itself (clearer than the old
        # one-shot generator expression).
        return [line]
def get_git_status(self) -> GitStatus:
    """
    Returns Absolute Paths to all files that are staged

    Ignores files that are symlinks to directories

    :return: A GitStatus of added/modified/removed/unmerged paths relative
        to the baseline commit.
    :raises ActionFailure: If the baseline ref is unknown to git.
    """
    import gitdb.exc  # type: ignore

    repo = get_git_repo()

    if not repo or self._base_commit is None:
        return GitStatus([], [], [], [])

    try:
        repo.rev_parse(self._base_commit)
    except gitdb.exc.BadName:
        raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

    # Output of git command will be relative to git project root
    status_output = zsplit(
        git.diff(
            "--cached",
            "--name-status",
            "--no-ext-diff",
            "-z",
            "--diff-filter=ACDMRTUXB",
            "--ignore-submodules",
            self._base_commit,
        ).stdout.decode()
    )

    added = []
    modified = []
    removed = []
    unmerged = []

    while status_output:
        code = status_output[0]
        fname = status_output[1]
        trim_size = 2

        if not code.strip():
            # BUG FIX: previously `continue` skipped the trim below,
            # looping forever on a blank status code.
            status_output = status_output[trim_size:]
            continue

        if code == StatusCode.Untracked or code == StatusCode.Ignored:
            # BUG FIX: same infinite-loop hazard — consume the entry first.
            status_output = status_output[trim_size:]
            continue

        resolved_name = self._fname_to_path(repo, fname)

        # If file is symlink to directory, skip
        absolute_name = Path(repo.working_tree_dir) / fname
        if absolute_name.is_symlink() and resolved_name.is_dir():
            click.echo(
                f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                err=True,
            )
        else:
            # The following detection for unmerged codes comes from `man git-status`
            if code == StatusCode.Unmerged:
                unmerged.append(resolved_name)
            if (
                code[0] == StatusCode.Renamed
            ):  # code is RXXX, where XXX is percent similarity
                removed.append(resolved_name)
                fname = status_output[2]
                trim_size += 1
                # BUG FIX: the new name of the rename goes into `added`;
                # previously the stale old-path `resolved_name` was appended.
                added.append(self._fname_to_path(repo, fname))
            if code == StatusCode.Added:
                added.append(resolved_name)
            if code == StatusCode.Modified:
                modified.append(resolved_name)
            if code == StatusCode.Deleted:
                removed.append(resolved_name)

        status_output = status_output[trim_size:]

    debug_echo(
        f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
    )

    return GitStatus(added, modified, removed, unmerged)
def report_results(self, results: Results) -> None:
    """Upload scan results to the semgrep app.

    POSTs new findings (with privacy-sensitive fields omitted), ignored
    findings, and finally marks the scan complete.

    :param results: Findings and stats collected by the scan.
    :raises ActionFailure: If any of the three requests fails.
    """
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
    # Fixed lines are only shared when the user opted in to PR-comment autofix.
    if "pr-comment-autofix" in os.getenv("SEMGREP_AGENT_OPT_IN_FEATURES", ""):
        fields_to_omit.remove("fixed_lines")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()
        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    # NOTE(review): ignored findings are serialized without the privacy-field
    # omit list — confirm this asymmetry with the new-findings POST is intended.
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [finding.to_dict() for finding in results.findings.ignored],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    # mark as complete
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={"exit_code": -1, "stats": results.stats},
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        ) from err
def report_results(
    self, results: Results, rule_ids: Sequence[str], cai_ids: Sequence[str]
) -> None:
    """Upload scan results to the semgrep app.

    POSTs new findings (with searched paths and the rule ids that ran),
    newly-ignored findings, and finally marks the scan complete with the
    scan's real exit code and truncated stats.

    :param results: Findings and stats collected by the scan.
    :param rule_ids: Ids of the semgrep rules that were run.
    :param cai_ids: Ids of the code-asset-inventory rules that were run.
    :raises ActionFailure: If any of the three requests fails.
    """
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
    # Fixed lines are only shared when autofix is enabled for this scan.
    if self.scan.autofix:
        fields_to_omit.remove("fixed_lines")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "gitlab_token": os.getenv("GITLAB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
            "searched_paths": [str(p) for p in results.findings.searched_paths],
            "rule_ids": rule_ids,
            "cai_ids": cai_ids,
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()
        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException as err:
        # Chain so the underlying HTTP error stays visible in tracebacks.
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [
                finding.to_dict() for finding in results.findings.new_ignored
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(f"API server returned this error: {response.text}") from err

    # mark as complete
    # In order to not overload our app database, we truncate target stats to the 20 heaviest hitters. This adds
    # approximately 80 kB of database load per scan when using p/ci.
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={
            "exit_code": results.findings.max_exit_code,
            "stats": results.stats(n_heavy_targets=20),
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException as err:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        ) from err