Esempio n. 1
0
 def _do_exec(self,
              command: List[str],
              preload_content=True) -> Union[WSClient, str]:
     """
     Start an exec call against the pod, retrying on API errors.

     The call is attempted up to MAGIC_KONSTANT times; after a failed
     attempt we wait ``2 * attempt + 1`` seconds before trying again.

     :param command: command to execute inside the pod
     :param preload_content: forwarded to the kube client as
            ``_preload_content``; callers pass False when they need the
            client object back instead of the output
     :return: whatever ``stream()`` returns for the given preload_content
     :raises SandcastleException: when every attempt ended with ApiException;
             the last ApiException is attached as the cause
     """
     last_error = None
     final_attempt = MAGIC_KONSTANT - 1
     for attempt in range(MAGIC_KONSTANT):
         try:
             result = stream(
                 self.api.connect_get_namespaced_pod_exec,
                 self.pod_name,
                 self.k8s_namespace_name,
                 command=command,
                 stdin=False,
                 stderr=True,
                 stdout=True,
                 tty=False,
                 _preload_content=preload_content,
                 _request_timeout=WEBSOCKET_CALL_TIMEOUT,
             )
         except ApiException as ex:
             # in packit-service prod, occasionally 'No route to host' happens here
             # let's try to repeat the request
             logger.warning("exception while initiating WS Client: %s", ex)
             # `ex` is unbound once the except block ends, keep it for chaining
             last_error = ex
             # there is no point in waiting when no further attempt follows
             if attempt < final_attempt:
                 time.sleep(2 * attempt + 1)
         else:
             logger.debug(
                 "we have successfully initiated the kube api client")
             return result
     raise SandcastleException(
         "Unable to connect to the kubernetes API server.") from last_error
Esempio n. 2
0
    def _copy_path_from_pod(self, local_dir: Path, pod_dir: Path):
        """
        Sync the content of a directory inside the pod into a local directory
        using `oc rsync`.

        :param local_dir: path to the local dir
        :param pod_dir: path within the pod
        :raises SandcastleException: when the copy failed and the pod is not
                running (or the error says the container is not available)
        """
        try:
            rsync_cmd = [
                "oc",
                "rsync",
                # delete files in local_dir which are not in pod_dir
                "--delete",
                # avoid huge logs
                "--quiet=true",
                f"--namespace={self.k8s_namespace_name}",
                # trailing / to copy only content of dir
                f"{self.pod_name}:{pod_dir}/",
                f"{local_dir}",
            ]
            run_command(rsync_cmd)
        except Exception as ex:
            # There is a race condition in k8s that it tells the pod is running even
            # though it already killed an exec session and hence we couldn't copy
            # anything from the pod
            pod_is_gone = (not self.is_pod_running()
                           or "not available in container" in str(ex))
            if not pod_is_gone:
                raise
            logger.warning(
                "The pod is not running while we tried to copy data out of it."
            )
            raise SandcastleException(
                "Cannot copy data from the sandbox - the pod is not running."
            )
Esempio n. 3
0
    def create_pod(self, pod_manifest: Dict) -> Dict:
        """
        Create pod in a namespace

        When the API server answers 403 (forbidden) the request is repeated
        with exponential backoff; any other API error is re-raised right away.

        :param pod_manifest: pod definition sent as the request body
        :return: response from the API server
        :raises ApiException: for any API error other than 403
        :raises SandcastleException: when all attempts were answered with 403
        """
        # if we hit timebound quota, let's retry with expo backoff:
        # attempt `idx` is followed by a sleep of 2 ** idx seconds, e.g.
        # 2 ** 7 = 128 = 2 minutes
        last_attempt = RETRY_CREATE_POD_MAX - 1
        for idx in range(1, RETRY_CREATE_POD_MAX):
            try:
                logger.debug(
                    "Creating sandbox pod via kubernetes API, try %s", idx)
                return self.api.create_namespaced_pod(
                    body=pod_manifest, namespace=self.k8s_namespace_name)
            except ApiException as ex:
                logger.info("Unable to create the pod: %s", ex)
                # reproducer for this is to set memory quota for your cluster:
                # https://docs.openshift.com/online/pro/dev_guide/compute_resources.html#dev-memory-requests
                #
                # this is how the exception looks like when printed:
                #   [2021-02-08 10:22:56,070: INFO/ForkPoolWorker-1] Unable to create the pod: (403)
                #   HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-store', ...
                #   HTTP response body: {"kind":"Status","status":"Failure",
                #   "message":"pods \"docker-io-usercont-sandcastle-prod-...\" is forbidden...
                #     "code":403}
                #
                # Compare the HTTP status instead of searching the whole text
                # for "403": headers, pod names or the body of an unrelated
                # error may contain those digits as well.
                if getattr(ex, "status", None) != 403:
                    raise
                # sleeping after the very last attempt would only delay
                # the failure below
                if idx < last_attempt:
                    sleep_time = 2**idx
                    logger.debug("Trying again in %ss", sleep_time)
                    time.sleep(sleep_time)
        raise SandcastleException("Unable to schedule the sandbox pod.")
Esempio n. 4
0
 def get_api_client() -> CoreV1Api:
     """
     Build a CoreV1Api client for kubernetes.

     When the KUBERNETES_SERVICE_HOST environment variable is set, the
     in-cluster config is loaded, otherwise kubeconfig is loaded.

     :return: CoreV1Api client using the loaded configuration
     :raises SandcastleException: when the loaded configuration has no api_key
     """
     logger.debug("Initialize kubernetes client")
     configuration = Configuration()
     running_in_cluster = "KUBERNETES_SERVICE_HOST" in os.environ
     if not running_in_cluster:
         logger.info("loading kubeconfig")
         load_kube_config(client_configuration=configuration)
     else:
         logger.info("loading incluster config")
         load_incluster_config(client_configuration=configuration)
     if configuration.api_key:
         return CoreV1Api(ApiClient(configuration=configuration))
     raise SandcastleException(
         "No api_key, can't access any cluster.\n")
Esempio n. 5
0
    def _copy_path_to_pod(self,
                          local_path: Path,
                          pod_dir: Path,
                          no_perms: bool = False):
        """
        Transfer local_path (a directory or a single file) into the pod
        using `oc rsync`.

        For a single file, its parent directory is synced with filters which
        exclude everything except the file itself.

        :param local_path: path to a local file or a dir
        :param pod_dir: Directory within the pod where the content of local_path is extracted
        :param no_perms: If true, do not transfer permissions
        :raises SandcastleException: when local_path is neither a dir nor a file

        https://www.openshift.com/blog/transferring-files-in-and-out-of-containers-in-openshift-part-1-manually-copying-files
        """
        if local_path.is_dir():
            rsync_filters = [
                "--exclude=lost+found",  # can't touch that
                "--include=[]",  # default
            ]
        elif local_path.is_file():
            rsync_filters = [
                "--exclude=*",  # everything
                f"--include={local_path.name}",  # only the file
            ]
            local_path = local_path.parent
        else:
            raise SandcastleException(
                f"{local_path} is neither a dir nor a file")

        cmd = ["oc", "rsync", *rsync_filters]
        cmd.append("--quiet=true")  # avoid huge logs
        cmd.append(f"--namespace={self.k8s_namespace_name}")
        # ??? rsync doesn't work without the trailing /
        cmd.append(f"{local_path}/")
        cmd.append(f"{self.pod_name}:{pod_dir}")
        if no_perms:
            cmd.append("--no-perms")
        run_command(cmd)
Esempio n. 6
0
    def deploy_pod(self, command: Optional[List] = None):
        """
        Deploy a pod and babysit it. If it exists already, remove it.

        The method polls the pod once a second until it leaves the "Pending"
        phase; when a command is set, it keeps polling until the pod reaches
        "Succeeded" or "Failed".

        :param command: command for the pod, forwarded to create_pod_manifest
        :raises SandcastleException: when both a mapped dir and a command are set
        :raises RuntimeError: when the pod is still pending after 600 seconds
        :raises SandcastleCommandFailed: when the pod ends up in "Failed" phase
        """
        if self.mapped_dir and command:
            raise SandcastleException(
                "Since you set your own command, we cannot sync the local dir"
                " inside because there is a race condition between the pod start"
                " and the copy process. Please use exec instead.")

        logger.info("Deploying pod %s", self.pod_name)
        if self.is_pod_already_deployed():
            self.delete_pod()

        self.create_pod(self.create_pod_manifest(command=command))

        # wait for the pod to start
        logger.debug("pod = %r" % self.pod_name)
        seconds_waited = 0
        pod = self.get_pod()
        # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
        while pod.status.phase == "Pending":
            time.sleep(1)
            seconds_waited += 1
            if seconds_waited > 600:
                logger.error(
                    "The pod did not start on time, status = %r" % pod.status)
                raise RuntimeError(
                    "The pod did not start in 600 seconds: something's wrong.")
            pod = self.get_pod()
        logger.info("pod is no longed pending - status: %s",
                    pod.status.phase)

        if pod.status.phase == "Failed":
            # > pod.status.container_statuses[0].state
            # {'running': None,
            #  'terminated': {'container_id': 'docker://f3828...
            #                 'exit_code': 2,
            #                 'finished_at': datetime.datetime(2019, 6, 7,...
            #                 'message': None,
            #                 'reason': 'Error',
            #                 'signal': None,
            #                 'started_at': datetime.datetime(2019, 6, 7,...
            #  'waiting': None}
            raise SandcastleCommandFailed(
                output=self.get_logs(),
                reason=str(pod.status),
                rc=self.get_rc_from_v1pod(pod),
            )

        if not command:
            return

        # wait for the pod to finish since the command is set
        while True:
            pod = self.get_pod()
            # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
            phase = pod.status.phase
            if phase == "Failed":
                logger.info("The pod has failed execution: you should "
                            "inspect logs or check `oc describe`")
                raise SandcastleCommandFailed(
                    output=self.get_logs(),
                    reason=str(pod.status),
                    rc=self.get_rc_from_v1pod(pod),
                )
            if phase == "Succeeded":
                logger.info(
                    "All Containers in the pod have finished successfully."
                )
                return
            # TODO: can we use watch instead?
            time.sleep(1)
Esempio n. 7
0
    def exec(
        self,
        command: List[str],
        env: Optional[Dict] = None,
        cwd: Optional[Union[str, Path]] = None,
    ) -> str:
        """
        exec a command in a running pod

        The command is wrapped into a script by _prepare_exec and run with
        bash; if a mapped dir is set, its local content is copied to the pod
        before the execution and copied back afterwards.

        :param command: command to run
        :param env: a Dict with env vars to set for the exec'd command
        :param cwd: run the command in this subdirectory of a mapped dir,
               defaults to a mapped dir or a temporary directory if mapped_dir is not set
        :returns logs
        :raises SandcastleException: when cwd is set but no mapped dir is
        :raises SandcastleTimeoutReached: when the pod is not running
        :raises SandcastleCommandFailed: when the exec reports status "Failure";
                rc is the reported exit code or 999 when it can't be determined
        """
        if not self.mapped_dir and cwd:
            raise SandcastleException(
                "The cwd argument only works with a mapped dir - "
                "please set a mapped dir or change directory in the command you provide."
            )
        # we need to check first if the pod is running; otherwise we'd get a nasty 500
        if not self.is_pod_running():
            raise SandcastleTimeoutReached(
                "You have reached a timeout: the pod is no longer running.")
        logger.info("command = %s", command)

        target_dir = Path(self.mapped_dir.path) if self.mapped_dir else None
        unique_dir, target_script_path = self._prepare_exec(
            command, target_dir=target_dir, env=env, cwd=cwd)
        command = ["bash", str(target_script_path)]
        if self.mapped_dir:
            self._copy_path_to_pod(self.mapped_dir.local_dir, unique_dir)
        # https://github.com/kubernetes-client/python/blob/master/examples/exec.py
        # https://github.com/kubernetes-client/python/issues/812#issuecomment-499423823
        # FIXME: refactor this junk into a dedicated function, ideally to _do_exec
        ws_client: WSClient = self._do_exec(command, preload_content=False)
        try:
            # https://github.com/packit-service/sandcastle/issues/23
            # even with a >0 number or ==0, select tends to block
            response = ""
            errors = ""
            while ws_client.is_open():
                ws_client.run_forever(timeout=WEBSOCKET_CALL_TIMEOUT)
                errors += ws_client.read_channel(ERROR_CHANNEL)
                logger.debug("%s", errors)
                # read_all would consume ERR_CHANNEL, so read_all needs to be last
                response += ws_client.read_all()
            if errors:
                # errors = '{"metadata":{},"status":"Success"}'
                j = json.loads(errors)
                status = j.get("status", None)
                if status == "Success":
                    logger.info("exec command succeeded, yay!")
                    self._copy_mdir_from_pod(unique_dir)
                elif status == "Failure":
                    logger.info("exec command failed")
                    logger.debug(j)
                    logger.info(f"output:\n{response}")
                    # the timeout could have been reached here which means
                    # the pod is not running, so we are not able `oc rsync` things from inside:
                    # we won't be needing the data any more since p-s halts execution
                    # after a failure in action, we only do this b/c it's the right thing to do
                    # for use cases outside p-s
                    try:
                        self._copy_mdir_from_pod(unique_dir)
                    except SandcastleException:
                        # yes, we eat the exception because the one raised below
                        # is much more important since it contains metadata about what happened;
                        # logs will contain info about what happened while trying to copy things
                        pass

                    # ('{"metadata":{},"status":"Failure","message":"command terminated with '
                    #  'non-zero exit code: Error executing in Docker Container: '
                    #  '1","reason":"NonZeroExitCode","details":{"causes":[{"reason":"ExitCode","message":"1"}]}}')
                    causes = j.get("details", {}).get("causes", [])
                    rc = 999
                    for c in causes:
                        if c.get("reason", None) == "ExitCode":
                            try:
                                rc = int(c.get("message", None))
                            except (TypeError, ValueError):
                                # TypeError: the "message" key is missing and
                                # int(None) was attempted;
                                # ValueError: the message is not a number.
                                # Either way, fall back to the placeholder so
                                # that SandcastleCommandFailed is still raised.
                                rc = 999
                    raise SandcastleCommandFailed(output=response,
                                                  reason=errors,
                                                  rc=rc)
                else:
                    logger.warning(
                        "exec didn't yield the metadata we expect, mighty suspicious, %s",
                        errors,
                    )
        finally:
            # close the websocket no matter how the exec ended
            ws_client.close()

        logger.debug("exec response = %r", response)
        return response