def _do_exec(self, command: List[str], preload_content=True) -> Union[WSClient, str]:
    """
    Open an exec session in the sandbox pod, retrying on API errors.

    The request is attempted up to MAGIC_KONSTANT times; after a failed
    attempt we wait ``2 * attempt + 1`` seconds before the next one.

    :param command: command (argv list) to execute inside the pod
    :param preload_content: forwarded to the kube client as ``_preload_content``;
        callers pass False when they need a client object to read from
    :return: whatever ``stream()`` returns for the exec call
    :raises SandcastleException: if every attempt ended with an ApiException
    """
    last_error = None
    for attempt in range(MAGIC_KONSTANT):
        try:
            s = stream(
                self.api.connect_get_namespaced_pod_exec,
                self.pod_name,
                self.k8s_namespace_name,
                command=command,
                stdin=False,
                stderr=True,
                stdout=True,
                tty=False,
                _preload_content=preload_content,  # <<< we need a client object
                _request_timeout=WEBSOCKET_CALL_TIMEOUT,
            )
        except ApiException as ex:
            # in packit-service prod, occasionally 'No route to host' happens here
            # let's try to repeat the request
            logger.warning("exception while initiating WS Client: %s", ex)
            last_error = ex
            # only back off when another attempt follows; sleeping after the
            # final failure would just delay the error below
            if attempt + 1 < MAGIC_KONSTANT:
                time.sleep(2 * attempt + 1)
        else:
            logger.debug("we have successfully initiated the kube api client")
            return s
    # keep the last API error attached as the cause so it is not lost
    raise SandcastleException(
        "Unable to connect to the kubernetes API server."
    ) from last_error
def _copy_path_from_pod(self, local_dir: Path, pod_dir: Path):
    """
    Sync the content of a directory inside the pod into a local directory
    using ``oc rsync``.

    :param local_dir: path to the local dir (destination)
    :param pod_dir: path within the pod (source)
    :raises SandcastleException: when the copy failed and the pod is not
        running (or the error says the container is not available)
    """
    try:
        rsync_cmd = [
            "oc",
            "rsync",
            # delete files in local_dir which are not in pod_dir
            "--delete",
            # avoid huge logs
            "--quiet=true",
            f"--namespace={self.k8s_namespace_name}",
            # trailing / to copy only content of dir
            f"{self.pod_name}:{pod_dir}/",
            f"{local_dir}",
        ]
        run_command(rsync_cmd)
    except Exception as ex:
        # There is a race condition in k8s that it tells the pod is running even
        # though it already killed an exec session and hence we couldn't copy
        # anything from the pod
        pod_is_gone = (
            not self.is_pod_running() or "not available in container" in str(ex)
        )
        if not pod_is_gone:
            # unrelated failure: let the original error propagate untouched
            raise
        logger.warning(
            "The pod is not running while we tried to copy data out of it."
        )
        raise SandcastleException(
            "Cannot copy data from the sandbox - the pod is not running."
        )
def create_pod(self, pod_manifest: Dict) -> Dict:
    """
    Create pod in a namespace, retrying with exponential backoff while the
    API server answers 403 (e.g. a time-bound quota was hit).

    :param pod_manifest: pod definition passed as the request body
    :return: response from the API server
    :raises ApiException: re-raised immediately for any non-403 API error
    :raises SandcastleException: when all attempts were answered with 403
    """
    # if we hit timebound quota, we retry with expo backoff: attempt ``idx``
    # is followed by a sleep of 2 ** idx seconds
    # 2 ** 7 = 128 = 2 minutes
    # 2 ** 8 = 256 = 4 minutes
    attempts = range(1, RETRY_CREATE_POD_MAX)
    last_error = None
    for idx in attempts:
        try:
            logger.debug(f"Creating sandbox pod via kubernetes API, try {idx}")
            return self.api.create_namespaced_pod(
                body=pod_manifest, namespace=self.k8s_namespace_name
            )
        except ApiException as ex:
            logger.info(f"Unable to create the pod: {ex}")
            # reproducer for this is to set memory quota for your cluster:
            # https://docs.openshift.com/online/pro/dev_guide/compute_resources.html#dev-memory-requests
            #
            # [2021-02-08 10:22:56,070: INFO/ForkPoolWorker-1] Unable to create the pod: (403)
            # HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-store', ...
            # HTTP response body: {"kind":"Status","status":"Failure",
            # "message":"pods \"docker-io-usercont-sandcastle-prod-...\" is forbidden...
            # "code":403}
            #
            # Prefer the HTTP status carried by the exception: searching the
            # whole text for "403" also matches headers/bodies of other errors.
            # Fall back to the text search only if no status is available.
            status = getattr(ex, "status", None)
            if status is None:
                forbidden = "403" in str(ex)
            else:
                forbidden = str(status) == "403"
            if not forbidden:
                raise
            last_error = ex
            if idx == attempts[-1]:
                # no attempt follows, so sleeping would only delay the error
                break
            sleep_time = 2**idx
            logger.debug(f"Trying again in {sleep_time}s")
            time.sleep(sleep_time)
    # keep the last 403 attached as the cause for easier debugging
    raise SandcastleException("Unable to schedule the sandbox pod.") from last_error
def get_api_client() -> CoreV1Api:
    """
    Build a CoreV1Api client for kubernetes.

    When the KUBERNETES_SERVICE_HOST environment variable is set, the
    in-cluster (service account) configuration is loaded; otherwise
    the kubeconfig is loaded.

    :return: CoreV1Api instance wrapping the loaded configuration
    :raises SandcastleException: if the loaded configuration has no api_key
    """
    logger.debug("Initialize kubernetes client")
    k8s_config = Configuration()

    inside_cluster = "KUBERNETES_SERVICE_HOST" in os.environ
    if not inside_cluster:
        logger.info("loading kubeconfig")
        load_kube_config(client_configuration=k8s_config)
    else:
        logger.info("loading incluster config")
        load_incluster_config(client_configuration=k8s_config)

    if k8s_config.api_key:
        return CoreV1Api(ApiClient(configuration=k8s_config))
    raise SandcastleException("No api_key, can't access any cluster.\n")
def _copy_path_to_pod(self, local_path: Path, pod_dir: Path, no_perms: bool = False):
    """
    Upload local_path (a directory or a single file) into the pod via
    ``oc rsync``.

    For a directory its content is synced (except ``lost+found``); for a
    file, the parent directory is synced with filters that let only that
    one file through.

    :param local_path: path to a local file or a dir
    :param pod_dir: Directory within the pod where the content of local_path
                    is extracted
    :param no_perms: If true, do not transfer permissions
    :raises SandcastleException: if local_path is neither a dir nor a file

    https://www.openshift.com/blog/transferring-files-in-and-out-of-containers-in-openshift-part-1-manually-copying-files
    """
    if local_path.is_dir():
        source_dir = local_path
        filters = [
            "--exclude=lost+found",  # can't touch that
            "--include=[]",  # default
        ]
    elif local_path.is_file():
        source_dir = local_path.parent
        filters = [
            "--exclude=*",  # everything
            f"--include={local_path.name}",  # only the file
        ]
    else:
        raise SandcastleException(f"{local_path} is neither a dir nor a file")

    rsync_cmd = [
        "oc",
        "rsync",
        *filters,
        "--quiet=true",  # avoid huge logs
        f"--namespace={self.k8s_namespace_name}",
        f"{source_dir}/",  # ??? rsync doesn't work without the trailing /
        f"{self.pod_name}:{pod_dir}",
    ]
    if no_perms:
        rsync_cmd.append("--no-perms")
    run_command(rsync_cmd)
def deploy_pod(self, command: Optional[List] = None):
    """
    Deploy the sandbox pod and watch over it; an already deployed pod of
    the same name is deleted first.

    Waits (polling once per second, at most ~600 s) until the pod leaves
    the "Pending" phase. When a command is given, additionally waits until
    the pod reaches "Succeeded" or "Failed".

    :param command: optional command to use for the pod
    :raises SandcastleException: if both a mapped dir and a command are set
    :raises RuntimeError: if the pod is still pending after the wait limit
    :raises SandcastleCommandFailed: if the pod ends up in the "Failed" phase
    """

    def failure_of(pod):
        # Build the exception describing a failed pod.
        # > pod.status.container_statuses[0].state
        # {'running': None,
        #  'terminated': {'container_id': 'docker://f3828...
        #                 'exit_code': 2,
        #                 'finished_at': datetime.datetime(2019, 6, 7,...
        #                 'message': None,
        #                 'reason': 'Error',
        #                 'signal': None,
        #                 'started_at': datetime.datetime(2019, 6, 7,...
        #  'waiting': None}
        return SandcastleCommandFailed(
            output=self.get_logs(),
            reason=str(pod.status),
            rc=self.get_rc_from_v1pod(pod),
        )

    if self.mapped_dir and command:
        raise SandcastleException(
            "Since you set your own command, we cannot sync the local dir"
            " inside because there is a race condition between the pod start"
            " and the copy process. Please use exec instead."
        )

    logger.info("Deploying pod %s", self.pod_name)
    if self.is_pod_already_deployed():
        self.delete_pod()

    self.create_pod(self.create_pod_manifest(command=command))

    # wait for the pod to start
    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
    seconds_waited = 0
    logger.debug("pod = %r" % self.pod_name)
    pod = self.get_pod()
    while pod.status.phase == "Pending":
        time.sleep(1)
        seconds_waited += 1
        if seconds_waited > 600:
            logger.error(
                "The pod did not start on time, " "status = %r" % pod.status
            )
            raise RuntimeError(
                "The pod did not start in 600 seconds: something's wrong."
            )
        pod = self.get_pod()
    logger.info("pod is no longed pending - status: %s", pod.status.phase)

    if pod.status.phase == "Failed":
        raise failure_of(pod)

    if not command:
        return

    # wait for the pod to finish since the command is set
    # TODO: can we use watch instead?
    while True:
        pod = self.get_pod()
        phase = pod.status.phase
        if phase == "Failed":
            logger.info(
                "The pod has failed execution: you should "
                "inspect logs or check `oc describe`"
            )
            raise failure_of(pod)
        if phase == "Succeeded":
            logger.info("All Containers in the pod have finished successfully.")
            return
        time.sleep(1)
def exec(
    self,
    command: List[str],
    env: Optional[Dict] = None,
    cwd: Optional[Union[str, Path]] = None,
) -> str:
    """
    exec a command in a running pod

    The command is wrapped into a script (see ``_prepare_exec``) which is run
    with bash in the pod. When a mapped dir is set, its local content is
    copied into the pod beforehand and copied back once the command finishes.

    :param command: command to run
    :param env: a Dict with env vars to set for the exec'd command
    :param cwd: run the command in this subdirectory of a mapped dir,
        defaults to a mapped dir or a temporary directory if mapped_dir is not set
    :returns: logs (output read from the exec session)
    :raises SandcastleException: if cwd is given without a mapped dir
    :raises SandcastleTimeoutReached: if the pod is not running any more
    :raises SandcastleCommandFailed: if the API reports status "Failure";
        ``rc`` is the reported exit code, or 999 when it can't be determined
    """
    if not self.mapped_dir and cwd:
        raise SandcastleException(
            "The cwd argument only works with a mapped dir - "
            "please set a mapped dir or change directory in the command you provide."
        )
    # we need to check first if the pod is running; otherwise we'd get a nasty 500
    if not self.is_pod_running():
        raise SandcastleTimeoutReached(
            "You have reached a timeout: the pod is no longer running."
        )
    logger.info("command = %s", command)

    target_dir = None if not self.mapped_dir else Path(self.mapped_dir.path)
    unique_dir, target_script_path = self._prepare_exec(
        command, target_dir=target_dir, env=env, cwd=cwd
    )
    command = ["bash", str(target_script_path)]
    if self.mapped_dir:
        self._copy_path_to_pod(self.mapped_dir.local_dir, unique_dir)

    # https://github.com/kubernetes-client/python/blob/master/examples/exec.py
    # https://github.com/kubernetes-client/python/issues/812#issuecomment-499423823
    # FIXME: refactor this junk into a dedicated function, ideally to _do_exec
    ws_client: WSClient = self._do_exec(command, preload_content=False)
    try:
        # https://github.com/packit-service/sandcastle/issues/23
        # even with a >0 number or ==0, select tends to block
        response = ""
        errors = ""
        while ws_client.is_open():
            ws_client.run_forever(timeout=WEBSOCKET_CALL_TIMEOUT)
            errors += ws_client.read_channel(ERROR_CHANNEL)
            logger.debug("%s", errors)
            # read_all would consume ERR_CHANNEL, so read_all needs to be last
            response += ws_client.read_all()

        if errors:
            # errors = '{"metadata":{},"status":"Success"}'
            j = json.loads(errors)
            status = j.get("status", None)
            if status == "Success":
                logger.info("exec command succeeded, yay!")
                self._copy_mdir_from_pod(unique_dir)
            elif status == "Failure":
                logger.info("exec command failed")
                logger.debug(j)
                logger.info(f"output:\n{response}")

                # the timeout could have been reached here which means
                # the pod is not running, so we are not able `oc rsync` things from inside:
                # we won't be needing the data any more since p-s halts execution
                # after a failure in action, we only do this b/c it's the right thing to do
                # for use cases outside p-s
                try:
                    self._copy_mdir_from_pod(unique_dir)
                except SandcastleException:
                    # yes, we eat the exception because the one raised below
                    # is much more important since it contains metadata about what happened;
                    # logs will contain info about what happened while trying to copy things
                    pass

                # ('{"metadata":{},"status":"Failure","message":"command terminated with '
                #  'non-zero exit code: Error executing in Docker Container: '
                #  '1","reason":"NonZeroExitCode","details":{"causes":[{"reason":"ExitCode","message":"1"}]}}')
                # "details"/"causes" may be absent or null: treat both as "no causes"
                details = j.get("details") or {}
                causes = details.get("causes") or []
                rc = 999
                for c in causes:
                    if c.get("reason", None) == "ExitCode":
                        try:
                            rc = int(c.get("message", None))
                        except (TypeError, ValueError):
                            # TypeError: message missing/null,
                            # ValueError: message is not a number
                            rc = 999
                raise SandcastleCommandFailed(output=response, reason=errors, rc=rc)
            else:
                logger.warning(
                    "exec didn't yield the metadata we expect, mighty suspicious, %s",
                    errors,
                )
    finally:
        # always release the websocket, even when we raise above
        ws_client.close()

    logger.debug("exec response = %r", response)
    return response