Ejemplo n.º 1
0
 def wait_pod_event(self, name, cond):
     """Watch pods in the namespace until cond(event_type, pod) is true for pod `name`.

     Stops the watch (and returns) the first time the predicate matches;
     the stream also gives up after 120 seconds on its own.
     """
     watcher = Watch()
     stream = watcher.stream(self.core_api.list_namespaced_pod,
                             self.namespace,
                             timeout_seconds=120)
     for event in stream:
         pod = event['object']
         # Only the named pod is of interest; skip everything else.
         if pod.metadata.name != name:
             continue
         if cond(event['type'], pod):
             watcher.stop()
Ejemplo n.º 2
0
 def watch_job(self, job_name, namespace):
     """Watch a Job until it completes.

     Returns True once the job reports succeeded == 1 with no active pods,
     False if the watch stream raises or ends without observing success.
     """
     watcher = Watch()
     try:
         for event in watcher.stream(self.batch.list_namespaced_job,
                                     namespace=namespace,
                                     label_selector=f'job-name={job_name}'):
             status = event['object'].status
             # A finished job has succeeded == 1 and `active` reset to None.
             if status.succeeded == 1 and status.active is None:
                 watcher.stop()
                 return True
         # Stream ended (e.g. server-side timeout) without seeing success:
         # report failure explicitly rather than falling through to None.
         return False
     except Exception as e:
         print(e)
         return False
Ejemplo n.º 3
0
 def handle_sigterm(self, k8s_watch: watch.Watch, k8s_watch_stream):
     """
     Handle a pending SIGTERM that must have arrived while synchronously
     watching on kubernetes.watch.stream.
     """
     # Set the stop flag so the stream will not pick up new events.
     k8s_watch.stop()
     # Flush out already-fetched events to release the connection and other
     # acquired resources. (Fixed typo: "Flusing" -> "Flushing".)
     Log.info(f"Flushing remaining Kubernetes {self._object} events.")
     for an_event in k8s_watch_stream:
         # Debug logging event type and metadata
         Log.debug(
             f'{self.name} flushing out the event type: {an_event[K8SEventsConst.TYPE]} \
             metadata: {an_event[K8SEventsConst.RAW_OBJECT][K8SEventsConst.METADATA]}'
         )
     # Setting flag to stop event processing loop
     Log.info(f"Stopped watching for {self._object} events.")
     self._stop_event_processing = True
Ejemplo n.º 4
0
 def watch_workflow_pods(self, experiment_id: str):
     """Watch pods of an experiment and fan container log streaming out to the pool."""
     # Known bug:
     # A pod found by the pod worker may not be found by the log worker in the
     # case of experiments.
     load_kube_config()
     core_api = client.CoreV1Api()
     watcher = Watch()
     try:
         stream = watcher.stream(core_api.list_namespaced_pod,
                                 namespace=KF_PIPELINES_NAMESPACE,
                                 label_selector=f"experiment-id={experiment_id}")
         for event in stream:
             if event["type"] != "ADDED":
                 continue
             pod = event["object"]
             for container in pod.spec.containers:
                 if container.name not in EXCLUDE_CONTAINERS \
                         and "name" in pod.metadata.annotations:
                     self.loop.run_in_executor(self.pool, self.log_stream, pod, container)
     except CancelledError:
         # Expected behavior when trying to cancel the task.
         watcher.stop()
         return
Ejemplo n.º 5
0
 def watch_deployment_pods(self, deployment_id):
     """Watch pods of a Seldon deployment and fan container log streaming out to the pool."""
     load_kube_config()
     core_api = client.CoreV1Api()
     watcher = Watch()
     try:
         stream = watcher.stream(
             core_api.list_namespaced_pod,
             namespace=KF_PIPELINES_NAMESPACE,
             label_selector=f"seldon-deployment-id={deployment_id}",
         )
         for event in stream:
             if event["type"] != "ADDED":
                 continue
             pod = event["object"]
             for container in pod.spec.containers:
                 if container.name not in EXCLUDE_CONTAINERS:
                     self.loop.run_in_executor(self.pool, self.log_stream, pod, container)
     except CancelledError:
         # Expected behavior when trying to cancel the task.
         watcher.stop()
         return
    def streamEvents(self) -> None:
        """
        Watches for changes to the mongo objects in Kubernetes and processes any changes immediately.
        """
        event_watcher = Watch()

        # start watching from the latest version that we have
        if self.cluster_versions:
            event_watcher.resource_version = max(
                self.cluster_versions.values())

        for event in event_watcher.stream(
                self.kubernetes_service.listMongoObjects,
                _request_timeout=self.STREAM_REQUEST_TIMEOUT):
            logging.info("Received event %s", event)

            if event["type"] in ("ADDED", "MODIFIED"):
                cluster_object = self._parseConfiguration(event["object"])
                if cluster_object:
                    self.checkCluster(cluster_object)
                else:
                    logging.warning(
                        "Could not validate cluster object, stopping event watcher."
                    )
                    # BUG FIX: `event_watcher.stop = True` rebound the `stop`
                    # method to a bool and never set the watcher's internal
                    # stop flag, so the stream kept running. Call stop().
                    event_watcher.stop()
            elif event["type"] in ("DELETED", ):
                self.collectGarbage()

            else:
                logging.warning(
                    "Could not parse event, stopping event watcher.")
                event_watcher.stop()

            # Change the resource version manually because of a bug fixed in a later version of the K8s client:
            # https://github.com/kubernetes-client/python-base/pull/64
            if isinstance(event.get('object'),
                          dict) and 'resourceVersion' in event['object'].get(
                              'metadata', {}):
                event_watcher.resource_version = event['object']['metadata'][
                    'resourceVersion']
Ejemplo n.º 7
0
    def monitor_pods(self):
        """Watch tracked pods until they all finish, fail, or get deleted.

        Returns True when a sufficient container succeeds or all tracked pods
        are deleted; returns False on any pod/container failure or on an
        unexpected exception. The outer while-loop re-establishes the watch
        if the stream is interrupted before we are done looking.
        """
        printed_all_up = False
        start_time = datetime.now()
        while self.resources["pods"]:
            try:
                w = Watch()
                for event in w.stream(self.core_api.list_namespaced_pod,
                                      self.namespace):
                    object = event['object']
                    etype = event['type']
                    uid = (object.metadata.namespace, object.metadata.name)
                    if uid in self.resources["pods"]:
                        if etype == "MODIFIED":

                            #print("************************************\n%s %s\n%s" \
                            #      % (etype, object.metadata.name, object))

                            ready = 0
                            total = len(object.spec.containers)
                            pod_name_ip = "n/a"
                            status = object.status.phase
                            if object.status.reason is not None:
                                status = object.status.reason
                            if object.spec.node_name and object.spec.node_name != "":
                                pod_name_ip = object.spec.node_name
                            if object.status.pod_ip and object.status.pod_ip != "":
                                pod_name_ip += "/" + object.status.pod_ip

                            initializing = False

                            # On Kubernetes 1.5, get init container status out of the annotation manually
                            if not object.status.init_container_statuses \
                               and object.metadata.annotations \
                               and "pod.alpha.kubernetes.io/init-container-statuses" in object.metadata.annotations:
                                jp = json.loads(object.metadata.annotations[
                                    "pod.alpha.kubernetes.io/init-containers"])
                                js = json.loads(object.metadata.annotations[
                                    "pod.alpha.kubernetes.io/init-container-statuses"]
                                                )
                                a = ApiClient()
                                object.spec.init_containers = \
                                 a._ApiClient__deserialize(jp, "list[V1Container]")
                                object.status.init_container_statuses = \
                                 a._ApiClient__deserialize(js, "list[V1ContainerStatus]")

                            # Derive a kubectl-style status string from init containers.
                            if object.status.init_container_statuses is not None:
                                for i, cs in enumerate(
                                        object.status.init_container_statuses):
                                    if cs.state.terminated and cs.state.terminated.exit_code == 0:
                                        continue
                                    elif cs.state.terminated:
                                        if len(cs.state.terminated.reason
                                               ) == 0:
                                            if cs.state.terminated.signal != 0:
                                                status = "Init:Signal:%d" % cs.state.terminated.signal
                                            else:
                                                status = "Init:ExitCode:%d" % cs.state.terminated.exit_code
                                        else:
                                            status = "Init:" + cs.state.terminated.reason
                                        initializing = True
                                    elif cs.state.waiting and len(cs.state.waiting.reason) > 0 \
                                      and cs.state.waiting.reason != "PodInitializing":
                                        status = "Init:" + cs.state.waiting.reason
                                        initializing = True
                                    else:
                                        status = "Init:%d/%d" % (
                                            i, len(
                                                object.spec.init_containers))
                                        initializing = True
                                    break

                            if not initializing and object.status.container_statuses is not None:
                                for cs in object.status.container_statuses:
                                    if cs.ready: ready += 1
                                    if cs.state.waiting and cs.state.waiting.reason != "":
                                        status = cs.state.waiting.reason
                                    elif cs.state.terminated and cs.state.terminated.reason != "":
                                        status = cs.state.terminated.reason
                                    elif cs.state.terminated and cs.state.terminated.reason == "":
                                        if cs.state.terminated.signal != 0:
                                            status = "Signal:%d" % cs.state.terminated.signal
                                        else:
                                            # BUG FIX: was `statis = ...`, a typo that
                                            # silently dropped the ExitCode status.
                                            status = "ExitCode:%d" % cs.state.terminated.exit_code

                            print(" - %-24s %-18s %d/%d  %s" \
                               % (object.metadata.name, status, ready, total, pod_name_ip))

                            self.resources["pods"][uid][
                                "phase"] = object.status.phase
                            self.resources["pods"][uid]["status"] = status
                            self.resources["pods"][uid]["ready"] = ready
                            self.resources["pods"][uid]["total"] = total
                            if ((object.status.phase == "Succeeded"
                                 or object.status.phase == "Failed")
                                    and object.metadata.deletion_timestamp
                                    is None):

                                if object.status.phase == "Failed":
                                    return False

                                #print("Pod %s/%s is finished" % (object.metadata.namespace, object.metadata.name))
                                #self.delete_all()

                            if object.status.container_statuses is not None:
                                for c in filter(
                                        lambda c: c.state.terminated,
                                        object.status.container_statuses):

                                    # If any container failed, assume overall failure
                                    if c.state.terminated.exit_code != 0:
                                        print(
                                            "Container '%s' of pod '%s:%s' failed"
                                            % (c.name, uid[0], uid[1]))
                                        return False

                                    # If a sufficient container completed, assume overall completion
                                    elif c.name in self.resources["pods"][uid][
                                            "sufficient_containers"]:
                                        print(
                                            "Container '%s' of pod '%s:%s' succeeded, finishing"
                                            % (c.name, uid[0], uid[1]))
                                        return True

                        if etype == "DELETED":
                            print("Pod %s/%s has been deleted" %
                                  (object.metadata.namespace,
                                   object.metadata.name))
                            del self.resources["pods"][uid]
                            if not self.resources["pods"]:
                                w.stop()
                                print("Done watching events")

                    # Report once when every tracked pod is Running and ready.
                    if not printed_all_up:
                        all_up = True
                        for k, p in self.resources["pods"].items():
                            if p["status"] != "Running":
                                all_up = False
                            if p["ready"] != p["total"]:
                                all_up = False
                        if all_up:
                            printed_all_up = True
                            all_up_time = datetime.now()
                            print("All pods up and running (setup took %s)" %
                                  str(all_up_time - start_time))

            except Exception as e:
                # "TERM" is raised by our own signal handling and is expected.
                if str(e) != "TERM":
                    print("Exception while monitoring pods")
                    print(traceback.format_exc())
                return False

        return True
Ejemplo n.º 8
0
    def delete_all(self):
        """Best-effort deletion of all tracked resources, then wait for pods
        and services to disappear.

        Individual deletion failures are logged and ignored so cleanup always
        proceeds. Bare `except:` clauses were narrowed to `except Exception:`
        so Ctrl-C (KeyboardInterrupt) and SystemExit are no longer swallowed.
        """
        # We must pass a new default API client to avoid urllib conn pool warnings
        start_time = datetime.now()
        print("Deleting items")
        for uid in self.resources["pods"]:
            print("  - Pod %s:%s" % uid)
            try:
                self.core_api.delete_namespaced_pod(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")

        for uid in self.resources["services"]:
            print("  - Service %s:%s" % uid)
            try:
                self.core_api.delete_namespaced_service(namespace=uid[0],
                                                        name=uid[1])
            except Exception:
                print("    (issue cleaning up, ignored)")

        for uid in self.resources["ingress"]:
            print("  - Ingress %s:%s" % uid)
            try:
                self.beta1_api.delete_namespaced_ingress(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")
        self.resources["ingress"] = {}

        for uid in self.resources["config_maps"]:
            print("  - ConfigMap %s:%s" % uid)
            try:
                self.core_api.delete_namespaced_config_map(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")
        self.resources["config_maps"] = {}

        for uid in self.resources["role_bindings"]:
            print("  - RoleBinding %s:%s" % uid)
            try:
                self.rbac_api.delete_namespaced_role_binding(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")
        self.resources["role_bindings"] = {}

        for uid in self.resources["roles"]:
            print("  - Role %s:%s" % uid)
            try:
                self.rbac_api.delete_namespaced_role(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")
        self.resources["roles"] = {}

        for uid in self.resources["service_accounts"]:
            print("  - ServiceAccount %s:%s" % uid)
            try:
                self.rbac_api.delete_namespaced_service_account(
                    namespace=uid[0], name=uid[1], body=V1DeleteOptions())
            except Exception:
                print("    (issue cleaning up, ignored)")
        self.resources["service_accounts"] = {}

        # Not checking for possibly deleted pods, pods take a while to
        # delete and they will not be listed anymore

        print("Waiting for pod and service deletion")
        #print("Waiting for pods to be deleted: %s" % ', '.join(["%s:%s" % uid for uid in self.resources["pods"]]))
        while self.resources["pods"]:
            current_pods = [(i.metadata.namespace, i.metadata.name)
                            for i in self.core_api.list_namespaced_pod(
                                self.namespace).items]
            #print("Current pods: %s" % ', '.join(["%s:%s" % uid for uid in current_pods]))
            deleted_pods = [
                uid for uid in self.resources["pods"]
                if uid not in current_pods
            ]
            #print("Deleted pods: %s" % ', '.join(["%s:%s" % uid for uid in deleted_pods]))
            for uid in deleted_pods:
                print("  - Pod %s:%s*" % uid)
                del self.resources["pods"][uid]
            if not self.resources["pods"]: break

            #print("Remaining: %s" % ', '.join(["%s:%s" % uid for uid in self.resources["pods"]]))
            w = Watch()
            for event in w.stream(self.core_api.list_namespaced_pod,
                                  self.namespace,
                                  timeout_seconds=30):
                object = event['object']
                etype = event['type']
                uid = (object.metadata.namespace, object.metadata.name)
                if etype == "DELETED" and uid in self.resources["pods"]:
                    print("  - Pod %s:%s" % uid)
                    del self.resources["pods"][uid]
                    if not self.resources["pods"]: w.stop()
        #print("Done deleting pods")

        #print("Waiting for services to be deleted: %s" % ', '.join(["%s:%s" % uid for uid in self.resources["services"]]))
        while self.resources["services"]:
            current_services = [(i.metadata.namespace, i.metadata.name)
                                for i in self.core_api.list_namespaced_service(
                                    self.namespace).items]
            #print("Current services: %s" % ', '.join(["%s:%s" % uid for uid in current_services]))
            deleted_services = [
                uid for uid in self.resources["services"]
                if uid not in current_services
            ]
            #print("Deleted services: %s" % ', '.join(["%s:%s" % uid for uid in deleted_services]))
            for uid in deleted_services:
                print("  - Service %s:%s*" % uid)
                del self.resources["services"][uid]
            if not self.resources["services"]: break

            # There is a short gap here that could trigger a race condition
            # but there seems to be no "query and keep watching" API that could
            # prevent that.

            #print("Remaining: %s" % ', '.join(["%s:%s" % uid for uid in self.resources["services"]]))
            w = Watch()
            for event in w.stream(self.core_api.list_namespaced_service,
                                  self.namespace,
                                  timeout_seconds=30):
                object = event['object']
                etype = event['type']
                uid = (object.metadata.namespace, object.metadata.name)
                if etype == "DELETED" and uid in self.resources["services"]:
                    print("  - Service %s:%s" % uid)
                    del self.resources["services"][uid]
                    if not self.resources["services"]: w.stop()
        #print("Done deleting services")

        all_deleted_time = datetime.now()
        print("All items deleted (deletion took %s)" %
              str(all_deleted_time - start_time))