def wait_pod_event(self, name, cond):
    """Block until an event for pod *name* satisfies *cond*, or the watch times out.

    :param name: pod name to match against event object metadata
    :param cond: callable(event_type, pod) -> bool; a truthy result stops the watch
    """
    watcher = Watch()
    stream = watcher.stream(self.core_api.list_namespaced_pod,
                            self.namespace, timeout_seconds=120)
    for event in stream:
        pod = event['object']
        event_type = event['type']
        # Skip events that belong to other pods in the namespace.
        if pod.metadata.name != name:
            continue
        if cond(event_type, pod):
            watcher.stop()
def watch_job(self, job_name, namespace):
    """Watch a Kubernetes Job until it completes.

    :param job_name: job name, matched via the ``job-name`` label selector
    :param namespace: namespace the job runs in
    :return: True once the job reports exactly one succeeded pod and no
        active pods; False if the watch raises or the stream ends first.
    """
    watcher = Watch()
    try:
        for event in watcher.stream(self.batch.list_namespaced_job,
                                    namespace=namespace,
                                    label_selector=f'job-name={job_name}'):
            status = event['object'].status
            # Done when exactly one pod succeeded and none are still active.
            if status.succeeded == 1 and status.active is None:
                watcher.stop()
                return True
    except Exception as e:
        print(e)
        return False
    # Bug fix: the stream can end (e.g. server-side timeout) without the
    # job succeeding; previously this path fell through and returned None
    # instead of the documented bool.
    return False
def handle_sigterm(self, k8s_watch: watch.Watch, k8s_watch_stream):
    """Handle a pending SIGTERM signal that arrived while synchronously
    watching on a kubernetes.watch.stream.

    Stops the watch, drains the already-fetched events so the connection
    and other acquired resources are released, then flags the event
    processing loop to stop.
    """
    # Set the stop flag so the watch will not pick up new events.
    k8s_watch.stop()
    # Flush out already fetched events and release the connection and
    # other acquired resources.
    # Bug fix: log message typo "Flusing" -> "Flushing".
    Log.info(f"Flushing remaining Kubernetes {self._object} events.")
    for an_event in k8s_watch_stream:
        # Debug logging event type and metadata
        Log.debug(
            f'{self.name} flushing out the event type: {an_event[K8SEventsConst.TYPE]} \
            metadata: {an_event[K8SEventsConst.RAW_OBJECT][K8SEventsConst.METADATA]}'
        )
    Log.info(f"Stopped watching for {self._object} events.")
    # Setting flag to stop event processing loop
    self._stop_event_processing = True
def watch_workflow_pods(self, experiment_id: str):
    """Watch pods of an experiment and dispatch log streaming to the pool.

    Known issue: a pod found by the pod worker may not be found by the
    log worker in the case of experiments.
    """
    load_kube_config()
    core_api = client.CoreV1Api()
    watcher = Watch()
    try:
        stream = watcher.stream(core_api.list_namespaced_pod,
                                namespace=KF_PIPELINES_NAMESPACE,
                                label_selector=f"experiment-id={experiment_id}")
        for event in stream:
            if event["type"] != "ADDED":
                continue
            pod = event["object"]
            named = "name" in pod.metadata.annotations
            for container in pod.spec.containers:
                if named and container.name not in EXCLUDE_CONTAINERS:
                    self.loop.run_in_executor(
                        self.pool, self.log_stream, pod, container)
    except CancelledError:
        # Expected behavior when trying to cancel the task.
        watcher.stop()
        return
def watch_deployment_pods(self, deployment_id):
    """Watch pods of a Seldon deployment and dispatch log streaming to the pool."""
    load_kube_config()
    core_api = client.CoreV1Api()
    watcher = Watch()
    try:
        stream = watcher.stream(core_api.list_namespaced_pod,
                                namespace=KF_PIPELINES_NAMESPACE,
                                label_selector=f"seldon-deployment-id={deployment_id}")
        for event in stream:
            if event["type"] != "ADDED":
                continue
            pod = event["object"]
            for container in pod.spec.containers:
                if container.name not in EXCLUDE_CONTAINERS:
                    self.loop.run_in_executor(
                        self.pool, self.log_stream, pod, container)
    except CancelledError:
        # Expected behavior when trying to cancel the task.
        watcher.stop()
        return
def streamEvents(self) -> None:
    """Watches for changes to the mongo objects in Kubernetes and processes
    any changes immediately.
    """
    event_watcher = Watch()

    # Start watching from the latest resource version that we have.
    if self.cluster_versions:
        event_watcher.resource_version = max(self.cluster_versions.values())

    for event in event_watcher.stream(
            self.kubernetes_service.listMongoObjects,
            _request_timeout=self.STREAM_REQUEST_TIMEOUT):
        logging.info("Received event %s", event)

        if event["type"] in ("ADDED", "MODIFIED"):
            cluster_object = self._parseConfiguration(event["object"])
            if cluster_object:
                self.checkCluster(cluster_object)
            else:
                logging.warning(
                    "Could not validate cluster object, stopping event watcher.")
                # Bug fix: `event_watcher.stop = True` only shadowed the
                # Watch.stop method; the stream loop checks the internal
                # flag that is set by *calling* stop(), so the watcher
                # never actually stopped.
                event_watcher.stop()
        elif event["type"] in ("DELETED",):
            self.collectGarbage()
        else:
            logging.warning("Could not parse event, stopping event watcher.")
            event_watcher.stop()

        # Change the resource version manually because of a bug fixed in a
        # later version of the K8s client:
        # https://github.com/kubernetes-client/python-base/pull/64
        if isinstance(event.get('object'), dict) and \
                'resourceVersion' in event['object'].get('metadata', {}):
            event_watcher.resource_version = \
                event['object']['metadata']['resourceVersion']
def monitor_pods(self):
    """Watch tracked pods until they all finish or fail.

    Streams pod events for the namespace and, for each tracked pod,
    prints a status line and records phase/status/readiness in
    ``self.resources["pods"]``.

    :return: False if any pod or container fails; True when a sufficient
        container succeeds or all tracked pods have been deleted.
    """
    # Wrap watch in outer loop, it might get interrupted before we
    # are finished looking
    printed_all_up = False
    start_time = datetime.now()
    while self.resources["pods"]:
        try:
            w = Watch()
            for event in w.stream(self.core_api.list_namespaced_pod,
                                  self.namespace):
                pod = event['object']
                etype = event['type']
                uid = (pod.metadata.namespace, pod.metadata.name)
                if uid in self.resources["pods"]:
                    if etype == "MODIFIED":
                        ready = 0
                        total = len(pod.spec.containers)
                        pod_name_ip = "n/a"
                        status = pod.status.phase
                        if pod.status.reason is not None:
                            status = pod.status.reason
                        if pod.spec.node_name and pod.spec.node_name != "":
                            pod_name_ip = pod.spec.node_name
                        if pod.status.pod_ip and pod.status.pod_ip != "":
                            pod_name_ip += "/" + pod.status.pod_ip

                        initializing = False
                        # On Kubernetes 1.5, get init container status out
                        # of the annotation manually
                        if not pod.status.init_container_statuses \
                                and pod.metadata.annotations \
                                and "pod.alpha.kubernetes.io/init-container-statuses" in pod.metadata.annotations:
                            jp = json.loads(pod.metadata.annotations[
                                "pod.alpha.kubernetes.io/init-containers"])
                            js = json.loads(pod.metadata.annotations[
                                "pod.alpha.kubernetes.io/init-container-statuses"])
                            a = ApiClient()
                            pod.spec.init_containers = \
                                a._ApiClient__deserialize(jp, "list[V1Container]")
                            pod.status.init_container_statuses = \
                                a._ApiClient__deserialize(js, "list[V1ContainerStatus]")

                        # Derive a kubectl-like status from init containers.
                        if pod.status.init_container_statuses is not None:
                            for i, cs in enumerate(
                                    pod.status.init_container_statuses):
                                if cs.state.terminated and \
                                        cs.state.terminated.exit_code == 0:
                                    continue
                                elif cs.state.terminated:
                                    if len(cs.state.terminated.reason) == 0:
                                        if cs.state.terminated.signal != 0:
                                            status = "Init:Signal:%d" % \
                                                cs.state.terminated.signal
                                        else:
                                            status = "Init:ExitCode:%d" % \
                                                cs.state.terminated.exit_code
                                    else:
                                        status = "Init:" + \
                                            cs.state.terminated.reason
                                    initializing = True
                                elif cs.state.waiting and \
                                        len(cs.state.waiting.reason) > 0 and \
                                        cs.state.waiting.reason != "PodInitializing":
                                    status = "Init:" + cs.state.waiting.reason
                                    initializing = True
                                else:
                                    status = "Init:%d/%d" % (
                                        i, len(pod.spec.init_containers))
                                    initializing = True
                                break

                        if not initializing and \
                                pod.status.container_statuses is not None:
                            for cs in pod.status.container_statuses:
                                if cs.ready:
                                    ready += 1
                                if cs.state.waiting and \
                                        cs.state.waiting.reason != "":
                                    status = cs.state.waiting.reason
                                elif cs.state.terminated and \
                                        cs.state.terminated.reason != "":
                                    status = cs.state.terminated.reason
                                elif cs.state.terminated and \
                                        cs.state.terminated.reason == "":
                                    if cs.state.terminated.signal != 0:
                                        status = "Signal:%d" % \
                                            cs.state.terminated.signal
                                    else:
                                        # Bug fix: was `statis = ...`, a dead
                                        # assignment that dropped the exit
                                        # code status.
                                        status = "ExitCode:%d" % \
                                            cs.state.terminated.exit_code

                        print(" - %-24s %-18s %d/%d %s" %
                              (pod.metadata.name, status, ready, total,
                               pod_name_ip))
                        self.resources["pods"][uid]["phase"] = pod.status.phase
                        self.resources["pods"][uid]["status"] = status
                        self.resources["pods"][uid]["ready"] = ready
                        self.resources["pods"][uid]["total"] = total

                        if ((pod.status.phase == "Succeeded" or
                             pod.status.phase == "Failed") and
                                pod.metadata.deletion_timestamp is None):
                            if pod.status.phase == "Failed":
                                return False
                            if pod.status.container_statuses is not None:
                                for c in filter(
                                        lambda c: c.state.terminated,
                                        pod.status.container_statuses):
                                    # If any container failed, assume
                                    # overall failure
                                    if c.state.terminated.exit_code != 0:
                                        print(
                                            "Container '%s' of pod '%s:%s' failed"
                                            % (c.name, uid[0], uid[1]))
                                        return False
                                    # If a sufficient container completed,
                                    # assume overall completion
                                    elif c.name in self.resources["pods"][uid][
                                            "sufficient_containers"]:
                                        print(
                                            "Container '%s' of pod '%s:%s' succeeded, finishing"
                                            % (c.name, uid[0], uid[1]))
                                        return True

                    if etype == "DELETED":
                        print("Pod %s/%s has been deleted" %
                              (pod.metadata.namespace, pod.metadata.name))
                        del self.resources["pods"][uid]
                        if not self.resources["pods"]:
                            w.stop()
                            print("Done watching events")

                # Report once when every tracked pod is Running and ready.
                if not printed_all_up:
                    all_up = True
                    for k, p in self.resources["pods"].items():
                        if p["status"] != "Running":
                            all_up = False
                        if p["ready"] != p["total"]:
                            all_up = False
                    if all_up:
                        printed_all_up = True
                        all_up_time = datetime.now()
                        print("All pods up and running (setup took %s)" %
                              str(all_up_time - start_time))
        except Exception as e:
            # "TERM" is raised to interrupt the watch; any other exception
            # is a genuine monitoring failure.
            if str(e) != "TERM":
                print("Exception while monitoring pods")
                print(traceback.format_exc())
                return False
    return True
def delete_all(self):
    """Delete all tracked resources, then wait for pods and services to vanish.

    Deletes the pods, services, ingresses, config maps, role bindings,
    roles and service accounts recorded in ``self.resources``, then blocks
    until the pods and services are actually gone (verified by listing
    plus watching for DELETED events). Deletion errors are printed and
    ignored (best-effort cleanup).
    """
    # We must pass a new default API client to avoid urllib conn pool warnings
    start_time = datetime.now()
    print("Deleting items")
    for uid in self.resources["pods"]:
        print(" - Pod %s:%s" % uid)
        # Bug fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # narrowed to Exception (same below).
        try:
            self.core_api.delete_namespaced_pod(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    for uid in self.resources["services"]:
        print(" - Service %s:%s" % uid)
        try:
            self.core_api.delete_namespaced_service(namespace=uid[0],
                                                    name=uid[1])
        except Exception:
            print("   (issue cleaning up, ignored)")
    for uid in self.resources["ingress"]:
        print(" - Ingress %s:%s" % uid)
        try:
            self.beta1_api.delete_namespaced_ingress(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    self.resources["ingress"] = {}
    for uid in self.resources["config_maps"]:
        print(" - ConfigMap %s:%s" % uid)
        try:
            self.core_api.delete_namespaced_config_map(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    self.resources["config_maps"] = {}
    for uid in self.resources["role_bindings"]:
        print(" - RoleBinding %s:%s" % uid)
        try:
            self.rbac_api.delete_namespaced_role_binding(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    self.resources["role_bindings"] = {}
    for uid in self.resources["roles"]:
        print(" - Role %s:%s" % uid)
        try:
            self.rbac_api.delete_namespaced_role(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    self.resources["roles"] = {}
    for uid in self.resources["service_accounts"]:
        print(" - ServiceAccount %s:%s" % uid)
        try:
            self.rbac_api.delete_namespaced_service_account(
                namespace=uid[0], name=uid[1], body=V1DeleteOptions())
        except Exception:
            print("   (issue cleaning up, ignored)")
    self.resources["service_accounts"] = {}

    # Not checking for possibly deleted pods, pods take a while to
    # delete and they will not be listed anymore
    print("Waiting for pod and service deletion")
    while self.resources["pods"]:
        # First drop pods that are already gone from the API listing.
        current_pods = [(i.metadata.namespace, i.metadata.name)
                        for i in self.core_api.list_namespaced_pod(
                            self.namespace).items]
        deleted_pods = [
            uid for uid in self.resources["pods"]
            if uid not in current_pods
        ]
        for uid in deleted_pods:
            print(" - Pod %s:%s*" % uid)
            del self.resources["pods"][uid]
        if not self.resources["pods"]:
            break
        # Then watch for DELETED events for the remainder.
        w = Watch()
        for event in w.stream(self.core_api.list_namespaced_pod,
                              self.namespace, timeout_seconds=30):
            deleted = event['object']
            etype = event['type']
            uid = (deleted.metadata.namespace, deleted.metadata.name)
            if etype == "DELETED" and uid in self.resources["pods"]:
                print(" - Pod %s:%s" % uid)
                del self.resources["pods"][uid]
                if not self.resources["pods"]:
                    w.stop()

    while self.resources["services"]:
        current_services = [(i.metadata.namespace, i.metadata.name)
                            for i in self.core_api.list_namespaced_service(
                                self.namespace).items]
        deleted_services = [
            uid for uid in self.resources["services"]
            if uid not in current_services
        ]
        for uid in deleted_services:
            print(" - Service %s:%s*" % uid)
            del self.resources["services"][uid]
        if not self.resources["services"]:
            break
        # There is a short gap here that could trigger a race condition
        # but there seems to be no "query and keep watching" API that could
        # prevent that.
        w = Watch()
        for event in w.stream(self.core_api.list_namespaced_service,
                              self.namespace, timeout_seconds=30):
            deleted = event['object']
            etype = event['type']
            uid = (deleted.metadata.namespace, deleted.metadata.name)
            if etype == "DELETED" and uid in self.resources["services"]:
                print(" - Service %s:%s" % uid)
                del self.resources["services"][uid]
                if not self.resources["services"]:
                    w.stop()

    all_deleted_time = datetime.now()
    print("All items deleted (deletion took %s)" %
          str(all_deleted_time - start_time))