async def pod_info(
    pod: V1Pod,
    client: kubernetes_tools.KubeClient,
    num_tail_lines: int,
):
    """Build a summary dictionary describing a single pod.

    The pod's event messages are awaited first; afterwards the tail lines of
    every entry in ``pod.status.container_statuses`` are awaited one at a time,
    in order.

    :param pod: the pod to describe
    :param client: client forwarded to the event, tail-line and hostname helpers
    :param num_tail_lines: forwarded to ``get_tail_lines_for_kubernetes_container``
    :returns: dict with the pod's name, host, deployed timestamp, phase,
        readiness, per-container tail lines, reason, message, events and the
        ``paasta.yelp.com/git_sha`` / ``paasta.yelp.com/config_sha`` label values
    """
    # A pod may not report container statuses; treat that as "no containers".
    statuses = pod.status.container_statuses or []
    events = await get_pod_event_messages(client, pod)

    container_infos = []
    for status in statuses:
        container_name = status.name
        tail = await get_tail_lines_for_kubernetes_container(
            client,
            pod,
            status,
            num_tail_lines,
        )
        container_infos.append(dict(name=container_name, tail_lines=tail))

    metadata = pod.metadata
    return {
        "name": metadata.name,
        "host": kubernetes_tools.get_pod_hostname(client, pod),
        "deployed_timestamp": metadata.creation_timestamp.timestamp(),
        "phase": pod.status.phase,
        "ready": kubernetes_tools.is_pod_ready(pod),
        "containers": container_infos,
        "reason": pod.status.reason,
        "message": pod.status.message,
        "events": events,
        "git_sha": metadata.labels.get("paasta.yelp.com/git_sha"),
        "config_sha": metadata.labels.get("paasta.yelp.com/config_sha"),
    }
def get_pod_status(
    pod: V1Pod,
    backends: Optional[Set[str]],
) -> Dict[str, Any]:
    """Summarise the status of a single pod as a plain dictionary.

    :param pod: the pod to describe
    :param backends: when not None, a ready pod is only reported as ready if
        its IP is contained in this set
    :returns: dict with the pod's name, ip, host, phase, reason, message,
        scheduled/ready flags, containers and create/delete timestamps
        (``delete_timestamp`` is None when the pod has no deletion timestamp)
    """
    reason = pod.status.reason
    message = pod.status.message
    scheduled = kubernetes_tools.is_pod_scheduled(pod)
    ready = kubernetes_tools.is_pod_ready(pod)
    delete_timestamp = (
        pod.metadata.deletion_timestamp.timestamp()
        if pod.metadata.deletion_timestamp
        else None
    )

    if not scheduled:
        sched_condition = kubernetes_tools.get_pod_condition(pod, "PodScheduled")
        # The condition may not be available yet (e.g. pod not fully created);
        # in that case keep the reason/message taken from the pod status
        # instead of failing on a missing condition object.
        if sched_condition:
            reason = sched_condition.reason
            message = sched_condition.message

    if ready and backends is not None:
        # Replace readiness with whether or not it is actually registered in the mesh
        # TODO: Replace this once k8s readiness reflects mesh readiness, PAASTA-17266
        ready = pod.status.pod_ip in backends

    return {
        "name": pod.metadata.name,
        "ip": pod.status.pod_ip,
        "host": pod.status.host_ip,
        "phase": pod.status.phase,
        "reason": reason,
        "message": message,
        "scheduled": scheduled,
        "ready": ready,
        "containers": get_pod_containers(pod),
        "create_timestamp": pod.metadata.creation_timestamp.timestamp(),
        "delete_timestamp": delete_timestamp,
    }
async def get_pod_status(
    pod: V1Pod,
    backends_task: "asyncio.Future[Dict[str, Any]]",
    client: Any,
    num_tail_lines: int,
) -> Dict[str, Any]:
    """Summarise the status of a single pod, including events and containers.

    Event retrieval and container inspection are started as two tasks and
    awaited together before the rest of the status is assembled.

    :param pod: the pod to describe
    :param backends_task: awaitable yielding the mesh backends; when None,
        ``mesh_ready`` is reported as None
    :param client: client forwarded to the event and container helpers
    :param num_tail_lines: forwarded to ``get_pod_containers``
    :returns: dict with the pod's name, ip, host, phase, reason, message,
        scheduled/ready/mesh_ready flags, containers, timestamps and events
    """
    # Only events from the last 15 minutes (900 seconds) are requested.
    events_task = asyncio.create_task(
        get_pod_event_messages(client, pod, max_age_in_seconds=900)
    )
    containers_task = asyncio.create_task(
        get_pod_containers(pod, client, num_tail_lines)
    )
    # return_exceptions=True lets both tasks finish; any failure is surfaced
    # later, when the individual task's result() is read.
    await asyncio.gather(events_task, containers_task, return_exceptions=True)

    status = pod.status
    metadata = pod.metadata

    reason = status.reason
    message = status.message
    scheduled = kubernetes_tools.is_pod_scheduled(pod)
    ready = kubernetes_tools.is_pod_ready(pod)

    if metadata.deletion_timestamp:
        delete_timestamp = metadata.deletion_timestamp.timestamp()
    else:
        delete_timestamp = None

    try:
        pod_event_messages = events_task.result()
    except asyncio.TimeoutError:
        pod_event_messages = [{"error": "Could not retrieve events. Please try again."}]

    if not (scheduled or reason == "Evicted"):
        sched_condition = kubernetes_tools.get_pod_condition(pod, "PodScheduled")
        # If the condition is not yet available (e.g. pod not fully created yet),
        # keep the reason/message taken from the pod status.
        if sched_condition:
            reason = sched_condition.reason
            message = sched_condition.message

    if backends_task is None:
        mesh_ready = None
    else:
        # TODO: Remove this once k8s readiness reflects mesh readiness, PAASTA-17266
        mesh_ready = status.pod_ip in (await backends_task)

    return {
        "name": metadata.name,
        "ip": status.pod_ip,
        "host": status.host_ip,
        "phase": status.phase,
        "reason": reason,
        "message": message,
        "scheduled": scheduled,
        "ready": ready,
        "mesh_ready": mesh_ready,
        "containers": containers_task.result(),
        "create_timestamp": metadata.creation_timestamp.timestamp(),
        "delete_timestamp": delete_timestamp,
        "events": pod_event_messages,
    }
def healthy_flink_containers_cnt(si_pods: Sequence[V1Pod], container_type: str) -> int:
    """Count the healthy Flink pods of the given container type.

    A pod is counted when its ``flink-container-type`` label equals
    ``container_type``, ``is_pod_ready`` holds for it, and its
    ``container_lifetime`` is longer than 60 seconds.
    """
    healthy = 0
    for pod in si_pods:
        if pod.metadata.labels["flink-container-type"] != container_type:
            continue
        if not is_pod_ready(pod):
            continue
        if container_lifetime(pod).total_seconds() > 60:
            healthy += 1
    return healthy
def get_cr_name(si_pods: Sequence[V1Pod]) -> str:
    """Derive the flink custom resource name from the jobmanager pod's name.

    Only pods labelled ``flink.yelp.com/container-type == "jobmanager"`` that
    are ready and whose container lifetime exceeds 60 seconds are considered.
    When exactly one such pod exists, the part of its name before
    ``-jobmanager-`` is returned; otherwise an empty string is returned.

    This change is related to FLINK-3129
    """
    candidates = []
    for pod in si_pods:
        if (
            pod.metadata.labels["flink.yelp.com/container-type"] == "jobmanager"
            and is_pod_ready(pod)
            and container_lifetime(pod).total_seconds() > 60
        ):
            candidates.append(pod)

    if len(candidates) != 1:
        return ""

    pod_name = candidates[0].metadata.name
    # Everything before the first "-jobmanager-" (the whole name if absent).
    return pod_name.partition("-jobmanager-")[0]
def get_version_for_controller_revision(
    cr: V1ControllerRevision,
    pods: Sequence[V1Pod],
    backends: Optional[Set[str]],
) -> KubernetesVersionDict:
    """Describe a controller revision together with the given pods.

    :param cr: the controller revision to describe
    :param pods: pods to report under this revision
    :param backends: forwarded to ``get_pod_status`` for every pod
    :returns: dict with the revision's name, the fixed type
        ``"ControllerRevision"``, total and ready replica counts, creation
        timestamp, git/config sha label values and the per-pod statuses
    """
    ready_count = sum(1 for pod in pods if kubernetes_tools.is_pod_ready(pod))
    revision_meta = cr.metadata
    return {
        "name": revision_meta.name,
        "type": "ControllerRevision",
        "replicas": len(pods),
        "ready_replicas": ready_count,
        "create_timestamp": revision_meta.creation_timestamp.timestamp(),
        "git_sha": revision_meta.labels.get("paasta.yelp.com/git_sha"),
        "config_sha": revision_meta.labels.get("paasta.yelp.com/config_sha"),
        "pods": [get_pod_status(pod, backends) for pod in pods],
    }
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: KubernetesDeploymentConfig,
    expected_count: int,
    all_pods: Sequence[V1Pod],
) -> None:
    """Report replication for a service instance based on its ready pods.

    Narrows ``all_pods`` down to the pods of the configured service instance,
    counts those for which ``is_pod_ready`` holds, and passes that count to
    ``monitoring_tools.send_replication_event_if_under_replication``.

    :param instance_config: config of the service instance being checked
    :param expected_count: number of instances expected to be available
    :param all_pods: pods to search for the service instance's pods
    """
    si_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    num_healthy_tasks = sum(1 for pod in si_pods if is_pod_ready(pod))

    log.info(
        f"Checking {instance_config.service}.{instance_config.instance} in kubernetes as it is not in smartstack"
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=num_healthy_tasks,
    )
async def get_pod_status_tasks_by_sha_and_readiness(
    pods_task: "asyncio.Future[V1Pod]",
    backends_task: "asyncio.Future[Dict[str, Any]]",
    client: kubernetes_tools.KubeClient,
    verbose: int,
) -> DefaultDict[
    Tuple[str, str], DefaultDict[bool, List["asyncio.Future[Dict[str, Any]]"]]
]:
    """Start a pod-status task per pod, grouped by version and readiness.

    :param pods_task: awaitable yielding the pods to inspect
    :param backends_task: forwarded unchanged to every ``get_pod_status`` call
    :param client: forwarded unchanged to every ``get_pod_status`` call
    :param verbose: verbosity level used to calculate the number of tail lines
    :returns: nested defaultdict keyed first by ``(git_sha, config_sha)`` (read
        from the pod's labels) and then by the pod's readiness, holding the
        created status tasks
    """
    num_tail_lines = calculate_tail_lines(verbose)
    grouped_tasks: DefaultDict[
        Tuple[str, str], DefaultDict[bool, List["asyncio.Future[Dict[str, Any]]"]]
    ] = defaultdict(lambda: defaultdict(list))

    pods = await pods_task
    for pod in pods:
        labels = pod.metadata.labels
        version_key = (
            labels["paasta.yelp.com/git_sha"],
            labels["paasta.yelp.com/config_sha"],
        )
        pod_is_ready = kubernetes_tools.is_pod_ready(pod)
        status_task = asyncio.create_task(
            get_pod_status(pod, backends_task, client, num_tail_lines)
        )
        grouped_tasks[version_key][pod_is_ready].append(status_task)

    return grouped_tasks
async def job_status(
    kstatus: MutableMapping[str, Any],
    client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    pod_list: Sequence[V1Pod],
    replicaset_list: Sequence[V1ReplicaSet],
    verbose: int,
    namespace: str,
) -> None:
    """Fill ``kstatus`` in place with the status of a long-running service.

    Per-pod and per-replicaset details are only collected when ``verbose`` is
    greater than zero; otherwise ``kstatus["pods"]`` and
    ``kstatus["replicasets"]`` stay empty lists. App-level fields (expected and
    running instance counts, deploy status, creation timestamp, namespace) are
    always written.

    :param kstatus: mapping that receives the status fields
    :param client: client used for the tail-line, hostname and app lookups
    :param job_config: config providing the deployment name and instance count
    :param pod_list: pods to describe when verbose
    :param replicaset_list: replicasets to describe when verbose
    :param verbose: verbosity level; also used to calculate the tail-line count
    :param namespace: namespace in which the app is looked up
    """
    app_id = job_config.get_sanitised_deployment_name()
    kstatus["app_id"] = app_id
    kstatus["pods"] = []
    kstatus["replicasets"] = []

    if verbose > 0:
        num_tail_lines = calculate_tail_lines(verbose)

        for pod in pod_list:
            # A pod may not report container statuses; treat that as none.
            statuses = pod.status.container_statuses or []
            container_infos = []
            for status in statuses:
                container_name = status.name
                tail = await get_tail_lines_for_kubernetes_container(
                    client,
                    pod,
                    status,
                    num_tail_lines,
                )
                container_infos.append(dict(name=container_name, tail_lines=tail))

            pod_summary = {
                "name": pod.metadata.name,
                "host": kubernetes_tools.get_pod_hostname(client, pod),
                "deployed_timestamp": pod.metadata.creation_timestamp.timestamp(),
                "phase": pod.status.phase,
                "ready": kubernetes_tools.is_pod_ready(pod),
                "containers": container_infos,
                "reason": pod.status.reason,
                "message": pod.status.message,
            }
            kstatus["pods"].append(pod_summary)

        for replicaset in replicaset_list:
            # A missing attribute and an explicit None both count as zero
            # ready replicas.
            try:
                ready_replicas = replicaset.status.ready_replicas
            except AttributeError:
                ready_replicas = None
            if ready_replicas is None:
                ready_replicas = 0

            replicaset_summary = {
                "name": replicaset.metadata.name,
                "replicas": replicaset.spec.replicas,
                "ready_replicas": ready_replicas,
                "create_timestamp": replicaset.metadata.creation_timestamp.timestamp(),
            }
            kstatus["replicasets"].append(replicaset_summary)

    kstatus["expected_instance_count"] = job_config.get_instances()

    app = kubernetes_tools.get_kubernetes_app_by_name(
        name=app_id, kube_client=client, namespace=namespace
    )
    deploy_status = kubernetes_tools.get_kubernetes_app_deploy_status(
        app=app, desired_instances=job_config.get_instances()
    )
    kstatus["deploy_status"] = kubernetes_tools.KubernetesDeployStatus.tostring(
        deploy_status
    )
    # A falsy ready_replicas value (e.g. None) is reported as 0.
    kstatus["running_instance_count"] = app.status.ready_replicas or 0
    kstatus["create_timestamp"] = app.metadata.creation_timestamp.timestamp()
    kstatus["namespace"] = app.metadata.namespace