def health(self) -> Health:
        """Collect the results of all ansible health probes into one Health.

        When no ansible client is configured (``self._ansible`` is falsy) no
        probe is run and the freshly created, empty Health is returned.

        Returns:
            A Health sourced from this instance with every probe result merged in.
        """
        combined = Health(source=self._instance_id)

        # Without an ansible client there is nothing to probe.
        if not self._ansible:
            return combined

        probes = (self._health_all_ping,)
        for probe in probes:
            combined.merge(probe())

        return combined
Exemple #2
0
    def health(self) -> Health:
        """Translate the helm release status into a Health report.

        The release status is fetched once through ``self.status()`` and then
        matched against groups of ``Status`` values; each matching group adds
        one entry of the corresponding severity to the result.

        Returns:
            A Health sourced from this instance describing the release status.
        """
        health: Health = Health(source=self._instance_id)

        status = self.status()
        state = status.status

        # (states, reporter, wording used in the message) in reporting order.
        release_rules = (
            ((Status.UNKNOWN,), health.unknown, "unknown"),
            ((Status.SUPERSEDED, Status.UNINSTALLING), health.warning, "at issue"),
            ((Status.DEPLOYED, Status.UNINSTALLED), health.healthy, "good"),
            ((Status.FAILED,), health.error, "not good"),
        )
        pending_states = (
            Status.PENDING_INSTALL,
            Status.PENDING_UPGRADE,
            Status.PENDING_ROLLBACK,
        )

        for states, report, wording in release_rules:
            if state in states:
                report(
                    f"Helm: {self._instance_id} release status is {wording}: {state} "
                    f": {status.description}")

        # Pending states use a shorter message without the description.
        if state in pending_states:
            health.warning(f"Helm status pending: {state}")

        return health
    def _health_k8s_allpod_health(self) -> Health:
        """Report on pods, in any namespace, whose phase is "Failed".

        Every failed pod adds an error entry naming the pod. A summary entry
        follows: healthy when no pod failed, warning for exactly one failed
        pod, error for two or more.

        Returns:
            A Health sourced from this instance with the pod findings.
        """
        health = Health(source=self._instance_id)

        pod_list = self.get_api("CoreV1Api").list_pod_for_all_namespaces()

        failed_pods = 0
        for pod in pod_list.items:
            if pod.status.phase != "Failed":
                continue
            failed_pods += 1
            health.error(f"KubeAPI: pod failed: {pod.metadata.name}")

        if not failed_pods:
            health.healthy("KubeAPI: all pods report as healthy")
        elif failed_pods == 1:
            health.warning("KubeAPI: some pods report as failed")
        else:
            health.error("KubeAPI: Kubernetes Reports cluster is unhealthy (pod health)")

        return health
    def _health_k8s_livez(self) -> Health:
        """Report what ``self.livez()`` says about the API liveness.

        A truthy answer is recorded as healthy and a falsy one as a warning.
        Any exception raised while asking is recorded as an error entry
        instead of being propagated.

        Returns:
            A Health sourced from this instance with the livez finding.
        """
        health = Health(source=self._instance_id)

        try:
            if not self.livez():
                health.warning("KubeAPI: livez reports NOT live.")
            else:
                health.healthy("KubeAPI: livez reports live")
        except Exception as err:  # pylint: disable=broad-except
            health.error(f"Could not retrieve livez: {err}")

        return health
Exemple #5
0
    def health(self) -> Health:
        """Determine the health of the swarm.

        Runs every swarm health probe and merges each result into a single
        Health that starts out as UNKNOWN. A probe that raises an Exception
        is recorded as a critical entry instead of aborting the whole check.

        Returns:
            The merged Health of all probes.
        """
        swarm_health = Health(source=self._instance_id,
                              status=HealthStatus.UNKNOWN)

        for test_health_function in [self._health_swarm_nodes]:
            try:
                test_health = test_health_function()
            # pylint: disable=broad-except
            except Exception as err:
                test_health = Health(source=self._instance_id)
                test_health.critical(
                    f"{test_health_function} exception: {err}")
            # Merge after the try block rather than in a `finally`: when a
            # non-Exception (e.g. KeyboardInterrupt) escapes the probe,
            # `test_health` is unbound or stale, and touching it in `finally`
            # would mask the real exception with an UnboundLocalError.
            swarm_health.merge(test_health)
        return swarm_health
    def health(self) -> Health:
        """Determine the health of the K8s instance.

        Runs every kubernetes health probe in order and merges each result
        into a single Health that starts out as UNKNOWN. A probe that raises
        an Exception is recorded as a critical entry so that the remaining
        probes still run.

        Returns:
            The merged Health of all probes.
        """
        k8s_health = Health(source=self._instance_id, status=HealthStatus.UNKNOWN)

        for test_health_function in [
            self._health_k8s_readyz,
            self._health_k8s_livez,
            self._health_k8s_node_health,
            self._health_k8s_alldeployment_health,
            self._health_k8s_alldaemonset_health,
            self._health_k8s_allstatefulset_health,
            self._health_k8s_allpod_health,
        ]:
            try:
                test_health = test_health_function()
            # pylint: disable=broad-except
            except Exception as err:
                test_health = Health(source=self._instance_id)
                test_health.critical(f"{test_health_function} exception: {err}")
            # Merge after the try block rather than in a `finally`: when a
            # non-Exception (e.g. KeyboardInterrupt) escapes a probe,
            # `test_health` is unbound or left over from the previous probe,
            # and using it in `finally` would either mask the real exception
            # with an UnboundLocalError or merge a stale result twice.
            k8s_health.merge(test_health)
        return k8s_health
Exemple #7
0
    def health(self) -> Health:
        """Report the health of this K8s deployment.

        The result starts out as UNKNOWN. If no deployment object has been
        recorded yet (``self._deployment`` is None) only an info entry is
        added; otherwise the deployment status probe is run and merged in.

        Returns:
            A Health sourced from this instance.
        """
        dep_health = Health(source=self._instance_id,
                            status=HealthStatus.UNKNOWN)

        if self._deployment is not None:
            probes = (self._health_deployment_status,)
            for probe in probes:
                dep_health.merge(probe())
            return dep_health

        dep_health.info(
            f"Deployment: {self._instance_id} not yet started.")
        return dep_health
Exemple #8
0
    def _health_deployment_status(self) -> Health:
        """Check if kubernetes thinks the deployment is healthy.

        Reads the deployment ``self.namespace``/``self.name`` through the
        AppsV1Api and turns its status conditions into health entries:

        * "Available" with status "True" is healthy;
        * otherwise "Progressing" with status "True" is a warning;
        * otherwise the deployment is reported as an error;
        * every other condition is healthy when its status is "True" and an
          error when it is not.

        A missing status is reported as an error and missing (or empty)
        conditions as a warning; in both cases no further checks are made.

        Returns:
            A Health sourced from this instance. A kubernetes ApiException
            raised while reading the deployment is recorded as an error entry
            rather than propagated.
        """
        health = Health(source=self._instance_id)

        apps_v1 = self._kubeapi_client.get_api("AppsV1Api")

        try:
            deployment = apps_v1.read_namespaced_deployment(
                self.name, self.namespace)
        except kubernetes.client.rest.ApiException as err:
            health.error(f"Deployment: K8S REST API exception occured: {err}")
            return health

        status = deployment.status

        # Stop here: without a status there are no conditions to inspect, and
        # reading `status.conditions` would raise AttributeError.
        if status is None:
            health.error(
                f"Deployment: [{self.namespace}/{self.name}] retrieved no status."
            )
            return health

        conditions = status.conditions
        if not conditions:
            health.warning(
                f"Deployment: [{self.namespace}/{self.name}] retrieved no status conditions."
            )
            return health

        # Either condition may be absent, so default to None instead of
        # letting next() raise StopIteration.
        available_condition = next(
            (condition for condition in conditions
             if condition.type == "Available"),
            None)
        progressing_condition = next(
            (condition for condition in conditions
             if condition.type == "Progressing"),
            None)

        if available_condition and available_condition.status == "True":
            health.healthy(
                f"Deployment: [{self.namespace}/{self.name}] "
                "Deployment is available "
                f"-> {available_condition.message}")
        elif progressing_condition and progressing_condition.status == "True":
            health.warning(
                f"Deployment: [{self.namespace}/{self.name}] "
                "Deployment is progressing "
                f"-> {progressing_condition.message}")
        else:
            # Only quote the conditions that actually exist.
            messages = " && ".join(
                str(condition.message)
                for condition in (available_condition, progressing_condition)
                if condition is not None)
            health.error(
                f"Deployment: [{self.namespace}/{self.name}] "
                "Deployment is neither progressing nor available "
                f"-> {messages}"
            )

        for condition in conditions:
            # Available/Progressing were already evaluated above.
            if condition.type in ["Available", "Progressing"]:
                continue

            if condition.status == "True":
                health.healthy(
                    f"Deployment: [{self.namespace}/{self.name}] {condition.type} "
                    f"-> {condition.message}")
            else:
                health.error(
                    f"Deployment: [{self.namespace}/{self.name}] {condition.type} "
                    f"-> {condition.message}")

        return health
Exemple #9
0
    def _health_swarm_nodes(self) -> Health:
        """Check whether every swarm node reports itself as healthy.

        For each node returned by ``self.nodes.list()`` the raw attributes
        are inspected:

        * a "Status" whose "State" is not "ready" is an error;
        * a "ManagerStatus" whose "Reachability" is not "reachable" is an
          error;
        * a "Spec" "Availability" other than "active" is a warning.

        A per-node summary entry follows: healthy when none of the above
        applied, otherwise a warning with the number of issues.

        Returns:
            A Health sourced from this instance. Any exception raised while
            listing or inspecting nodes is recorded as an error entry.
        """
        health = Health(source=self._instance_id)

        try:
            for node in self.nodes.list():
                attrs = node.attrs
                description = attrs["Description"]["Hostname"]
                role = attrs["Spec"]["Role"]

                errors: int = 0

                if "Status" in attrs:
                    node_status = attrs["Status"]
                    state = node_status.get("State")
                    if state != "ready":
                        # "Message" is optional in the node status, so it is
                        # only read here and with a fallback; reading it
                        # unconditionally raised KeyError for ready nodes.
                        message = node_status.get("Message") or f"state is {state}"
                        health.error(
                            f"Docker:Node: {role} {description} : {message}")
                        errors += 1

                if "ManagerStatus" in attrs:
                    manager_status = attrs["ManagerStatus"]
                    if manager_status["Reachability"] != "reachable":
                        health.error(
                            f"Docker:Node: {role} {description} : manager is not reachable"
                        )
                        errors += 1

                if attrs["Spec"]["Availability"] != "active":
                    health.warning(
                        f"Docker:Node: {role} {description} : is not available"
                    )
                    errors += 1

                if errors == 0:
                    health.healthy(
                        f"Docker:Node: {role} {description} : reports healthy")
                else:
                    health.warning(
                        f"Docker:Node: {role} {description} : is not health ({errors} issues.)"
                    )
        # pylint: disable=broad-except
        except Exception as err:
            health.error(f"Docker: could not retrieve node health: {err}")

        return health
Exemple #10
0
    def _health_k8s_allstatefulset_health(self) -> Health:
        """Report on the statefulsets of all namespaces.

        For every statefulset a positive collision count is a warning, and
        each listed condition is healthy when its status is "True" and a
        warning otherwise. The number of warnings decides the summary entry:
        none is healthy, one or two is a warning, three or more is an error.

        Returns:
            A Health sourced from this instance with the statefulset findings.
        """
        health = Health(source=self._instance_id)

        apps_v1_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")
        # pylint: disable=no-member
        statefulsets = apps_v1_api.list_stateful_set_for_all_namespaces().items

        issue_total = 0
        for item in statefulsets:
            namespace = item.metadata.namespace
            name = item.metadata.name
            status = item.status
            label = f"KubeAPI:Statefulset: [{namespace}/{name}]"

            # Example of a status as returned by the API:
            # {'collision_count': 0,
            #  'conditions': None,
            #  'current_replicas': 1,
            #  'current_revision': 'loki-workload-67877b465c',
            #  'observed_generation': 1,
            #  'ready_replicas': 1,
            #  'replicas': 1,
            #  'update_revision': 'loki-workload-67877b465c',
            #  'updated_replicas': 1}

            collisions = status.collision_count
            if collisions is not None and collisions > 0:
                issue_total += 1
                health.warning(f"{label} -> Reports some collisions: {collisions}")

            # `conditions` may be None (see the example above).
            for condition in status.conditions or []:
                entry = f"{label} {condition.type} -> {condition.message}"
                if condition.status == "True":
                    health.healthy(entry)
                else:
                    health.warning(entry)
                    issue_total += 1

        if not issue_total:
            health.healthy("KubeAPI: all statefulsets report as healthy")
        elif issue_total < 3:
            health.warning("KubeAPI: some statefulsets report condition failures")
        else:
            health.error("KubeAPI: Kubernetes Reports cluster is unhealthy (statefulset health)")

        return health
Exemple #11
0
    def _health_k8s_alldaemonset_health(self) -> Health:
        """Check if kubernetes thinks all the daemonsets are healthy.

        For every daemonset in any namespace a warning is added when:

        * its collision count is positive;
        * it reports unavailable pods;
        * fewer pods are currently scheduled than desired;
        * one of its conditions has a status other than "True".

        The number of warnings decides the summary entry: none is healthy,
        one or two is a warning, three or more is an error.

        Returns:
            A Health sourced from this instance with the daemonset findings.
        """
        health = Health(source=self._instance_id)

        apps_v1_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")

        unhealthy_dae_count = 0
        # pylint: disable=no-member
        for daemonset in apps_v1_api.list_daemon_set_for_all_namespaces().items:
            namespace = daemonset.metadata.namespace
            name = daemonset.metadata.name
            status = daemonset.status

            if status.collision_count is not None and status.collision_count > 0:
                health.warning(
                    f"Daemonset: [{namespace}/{name}] collision_count "
                    "-> Reports some collisions: "
                    f"{status.collision_count}"
                )
                unhealthy_dae_count += 1
            if status.number_unavailable is not None and status.number_unavailable > 0:
                health.warning(
                    f"Daemonset: [{namespace}/{name}] number_unavailable "
                    "-> Reports some unavailable pods: "
                    f"{status.number_unavailable}"
                )
                unhealthy_dae_count += 1
            # The daemonset is short of its target when FEWER pods are
            # scheduled than desired; the previous `desired < current` test
            # was inverted and never flagged an under-scheduled daemonset.
            if status.current_number_scheduled < status.desired_number_scheduled:
                health.warning(
                    f"Daemonset: [{namespace}/{name}] desired_number_scheduled "
                    "-> Does not have the desired number scheduled: "
                    f"{status.current_number_scheduled} < "
                    f"{status.desired_number_scheduled}"
                )
                unhealthy_dae_count += 1

            if status.conditions:
                for condition in status.conditions:
                    if condition.status == "True":
                        health.healthy(
                            f"Daemonset: [{namespace}/{name}] {condition.type} "
                            f"-> {condition.message}"
                        )
                    else:
                        health.warning(
                            f"Daemonset: [{namespace}/{name}] {condition.type} "
                            f"-> {condition.message}"
                        )
                        unhealthy_dae_count += 1

        if unhealthy_dae_count == 0:
            health.healthy("KubeAPI: all daemonsets report as healthy")
        elif unhealthy_dae_count < 3:
            health.warning("KubeAPI: some daemonsets report condition failures")
        else:
            health.error("KubeAPI: Kubernetes Reports cluster is unhealthy (daemonset health)")

        return health
Exemple #12
0
    def _health_k8s_alldeployment_health(self) -> Health:
        """Report on the deployments of all namespaces.

        A deployment without conditions gets an "unknown" entry and is not
        counted. Otherwise it has an issue when it is not "Available" (a
        warning is added, worded differently depending on whether it is
        still "Progressing") or when any other condition has a status other
        than "True". Each deployment then gets a healthy or warning summary.

        The number of deployments with issues decides the final entry: none
        is healthy, one or two is a warning, three or more is an error.

        Returns:
            A Health sourced from this instance with the deployment findings.
        """

        def first_condition(all_conditions, wanted_type):
            """Return the first condition of the given type, or None."""
            for candidate in all_conditions:
                if candidate.type == wanted_type:
                    return candidate
            return None

        health = Health(source=self._instance_id)

        apps_v1_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")
        # pylint: disable=no-member
        deployments = apps_v1_api.list_deployment_for_all_namespaces().items

        troubled_deployments = 0
        for deployment in deployments:
            namespace = deployment.metadata.namespace
            name = deployment.metadata.name
            label = f"KubeAPI:Deployment: [{namespace}/{name}]"

            if not deployment.status.conditions:
                health.unknown(
                    f"{label} Deployment does not have any conditions (yet?)"
                )
                continue

            has_issue = False

            available = first_condition(deployment.status.conditions, "Available")
            progressing = first_condition(deployment.status.conditions, "Progressing")

            is_available = bool(available) and available.status == "True"
            if not is_available:
                has_issue = True
                if progressing and progressing.status == "True":
                    health.warning(
                        f"{label} Deployment is progressing -> {progressing.message}"
                    )
                else:
                    # Quote whichever of the two conditions exist.
                    messages = "\n".join(
                        [
                            found.message
                            for found in (progressing, available)
                            if found is not None
                        ]
                    )
                    health.warning(
                        f"{label} Deployment is neither progressing nor available "
                        f"-> {messages}"
                    )

            for condition in deployment.status.conditions:
                # Available/Progressing were handled above.
                if condition.type in ("Available", "Progressing"):
                    continue
                if condition.status != "True":
                    health.warning(
                        f"{label} {condition.type} -> {condition.message}"
                    )
                    has_issue = True

            if has_issue:
                health.warning(f"{label} is not healthy")
                troubled_deployments += 1
            else:
                health.healthy(f"{label} is healthy")

        if not troubled_deployments:
            health.healthy("KubeAPI: all deployments report healthy")
        elif troubled_deployments < 3:
            health.warning("KubeAPI: some deployments report condition failures")
        else:
            health.error("KubeAPI: Kubernetes Reports cluster is unhealthy (deployment health)")

        return health
Exemple #13
0
    def _health_k8s_node_health(self) -> Health:
        """Report on the conditions of every node from ``self.nodes()``.

        Five condition types are looked up per node. "Ready" is a problem
        when its status is not "True"; "NetworkUnavailable",
        "MemoryPressure", "DiskPressure" and "PIDPressure" are problems when
        their status is "True". Each problem adds a warning carrying the
        condition message; a condition type that is absent is ignored. The
        node then gets a healthy or an error summary entry.

        Returns:
            A Health sourced from this instance. Any exception raised while
            inspecting the nodes is recorded as an error entry.
        """
        # (condition type, whether a "True" status signals a problem)
        watched_conditions = (
            ("Ready", False),
            ("NetworkUnavailable", True),
            ("MemoryPressure", True),
            ("DiskPressure", True),
            ("PIDPressure", True),
        )

        health = Health(source=self._instance_id)

        try:
            for node in self.nodes():
                name = node.metadata.name
                problems = 0

                for wanted_type, true_is_bad in watched_conditions:
                    found = next(
                        (
                            candidate
                            for candidate in node.status.conditions
                            if candidate.type == wanted_type
                        ),
                        None,
                    )
                    if found is None:
                        continue
                    if (found.status == "True") is true_is_bad:
                        health.warning(f"KubeAPI: {name}: {found.message}")
                        problems += 1

                if problems:
                    health.error(f"KubeAPI: Node {name} reporting issues.")
                else:
                    health.healthy(f"KubeAPI: Node {name} reports healthy.")

        except Exception as err:  # pylint: disable=broad-except
            health.error(f"KubeAPI:Exception occured when check kubelet health: {err}")

        return health
Exemple #14
0
    def health(self) -> Health:
        """Report the state of the sonobuoy run as a Health.

        The run status comes from ``self.status()``. A failed run is an
        error; every other status only adds an info entry. When the status
        cannot be read (CalledProcessError or AttributeError) an "unknown"
        entry is added instead.

        Returns:
            A Health sourced from this instance.
        """
        health = Health(source=self._instance_id)

        try:
            run_state = self.status().status

            if run_state in (Status.POSTPROCESS,):
                health.info(
                    "Sonobuoy: run has finished, but result is not yet avaialble."
                )
            elif run_state in (Status.COMPLETE, Status.PASSED):
                health.info("Sonobuoy: completed.")
            elif run_state in (Status.FAILED,):
                health.error("Sonobuoy: run has produced a failure.")
            else:
                # Anything else is treated as still running.
                health.info("Sonobuoy: Running")

        except (subprocess.CalledProcessError, AttributeError) as err:
            health.unknown(
                f"No status found. Sonobuoy is likely not running: {err}")

        return health
Exemple #15
0
 def health(self) -> Health:
     """Return a new Health whose source is "<client id>-<node id>".

     No checks are performed; the Health is returned as constructed.
     """
     node_source = f"{self._client_id}-{self._id}"
     return Health(source=node_source)
    def _health_all_ping(self) -> Health:
        """Health check that tries to ping all of the hosts.

        Runs ``self.ping()`` and reads the per-host counters from its
        "stats" section. The first non-zero counter, checked in the order
        ok, unreachable, ignored, failures, skipped, decides the entry added
        for the host; a host with none of them set gets a warning quoting
        its raw task result.

        Returns:
            A Health sourced from this instance. A ping result that does not
            have the expected structure is recorded as a single error entry.
        """
        ping_health = Health(source=self._instance_id)

        ping = self.ping()

        try:
            ping_task_result_hosts = ping["plays"][0]["tasks"][0]["hosts"]
            stats_hosts = ping["stats"]
        except (KeyError, IndexError, TypeError):
            # Missing keys, empty play/task lists or a non-mapping result all
            # mean the output cannot be interpreted.
            ping_health.error("ansible ping gave unexpected results.")
            return ping_health

        for host, host_stats in stats_hosts.items():
            # .get() keeps a missing counter from raising; it counts as zero.
            if host_stats.get("ok"):
                ping_health.healthy(f"Ansible: {host} ping response ok.")
            elif host_stats.get("unreachable"):
                ping_health.warning(f"Ansible: {host} unreachable during ping.")
            elif host_stats.get("ignored"):
                ping_health.warning(f"Ansible: {host} ping ignored.")
            elif host_stats.get("failures"):
                ping_health.error(f"Ansible: {host} ping failed.")
            # Was the misspelled key "unknowwn", which raised KeyError.
            elif host_stats.get("skipped"):
                ping_health.error(f"Ansible: {host} ping skipped.")
            else:
                ping_health.warning(
                    f"Ansible: {host} status not understood: {ping_task_result_hosts.get(host)}."
                )

        return ping_health