Code example #1
 def __init__(self, kube_config_file=None):
     super(OcpMachineSet, self).__init__(kube_config_file=kube_config_file)
     self.api_version = "machine.openshift.io/v1beta1"
     self.kind = "MachineSet"
     self.machineset = self.dyn_client.resources.get(
         api_version=self.api_version, kind=self.kind)
     self.machine = OcpMachines(kube_config_file=kube_config_file)
     self.node = OcpNodes(kube_config_file=kube_config_file)
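
A minimal usage sketch for the constructor above, assuming OcpMachineSet is importable from the surrounding package (the module path and kubeconfig location below are placeholders, not part of the source):

# Sketch: instantiate OcpMachineSet against a cluster.
# The import path and kubeconfig path are assumptions.
from ocp_machineset import OcpMachineSet  # hypothetical module path

machine_set_api = OcpMachineSet(kube_config_file="~/.kube/config")
print(machine_set_api.api_version)  # machine.openshift.io/v1beta1
print(machine_set_api.kind)         # MachineSet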
Code example #2
 def __init__(self, kube_config_file):
     self.kube_config_file = kube_config_file
     super(OcpHealthChecker, self).__init__(kube_config_file=self.kube_config_file)
     self.ocp_node = OcpNodes(kube_config_file=self.kube_config_file)
     self.ocp_cluster_operator = OcpClusterOperator(kube_config_file=self.kube_config_file)
     self.ocp_control_plane = OcpControlPlane(kube_config_file=self.kube_config_file)
     self.ocp_cluster_version = OcpClusterVersion(kube_config_file=self.kube_config_file)
     self.ocp_route = OcpRoutes(kube_config_file=self.kube_config_file)
     self.ocp_pod = OcpPods(kube_config_file=self.kube_config_file)
     self.ocp_deployment = OcpDeploymentconfigs(kind="Deployment", kube_config_file=self.kube_config_file)
     self.ocp_config = OcpConfig(kind="Config", api_version="imageregistry.operator.openshift.io/v1",
                                 kube_config_file=self.kube_config_file)
     self.ocp_secret = OcpSecret(kube_config_file=self.kube_config_file)
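
The constructor above wires up one helper client per component it checks. A minimal instantiation sketch, assuming OcpHealthChecker is importable from the surrounding package (import path and kubeconfig path are placeholders):

# Sketch: build a health checker and run a single check.
# The import path and kubeconfig location are assumptions.
from ocp_health_checker import OcpHealthChecker  # hypothetical module path

checker = OcpHealthChecker(kube_config_file="~/.kube/config")
all_nodes_healthy, unhealthy_node_info = checker.check_node_health()
print(all_nodes_healthy, unhealthy_node_info)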
Code example #3
def ocp_node(get_kubeconfig):
    return OcpNodes(kube_config_file=get_kubeconfig)
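
This is a pytest fixture handing an OcpNodes client to tests. A hedged sketch of a consuming test, assuming a get_kubeconfig fixture exists in the same conftest and that get_all_nodes behaves as used in code example #4:

# Sketch of a test consuming the ocp_node fixture above.
def test_cluster_reports_nodes(ocp_node):
    node_list = ocp_node.get_all_nodes()
    assert node_list is not None
    assert len(node_list.items) > 0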
Code example #4
class OcpHealthChecker(OcpBase):
    """
    OcpHealthChecker will check the health of certain critical openshift components.
    - Node/Controller
    - Router
    - ImageRegistry
    - Persistence Storage for ImageRegistry
    - API Server
    - Web Console
    - Cluster Version
    - Control Planes
    - Cluster Operators

    Each component's health check returns the overall health of that component (bool) and, optionally, the
    unhealthy components (list or dict). Unhealthy components are displayed in tabular format when checking
    the health of the openshift cluster.

    """
    def __init__(self, kube_config_file):
        self.kube_config_file = kube_config_file
        super(OcpHealthChecker,
              self).__init__(kube_config_file=self.kube_config_file)
        self.ocp_node = OcpNodes(kube_config_file=self.kube_config_file)
        self.ocp_cluster_operator = OcpClusterOperator(
            kube_config_file=self.kube_config_file)
        self.ocp_control_plane = OcpControlPlane(
            kube_config_file=self.kube_config_file)
        self.ocp_cluster_version = OcpClusterVersion(
            kube_config_file=self.kube_config_file)
        self.ocp_route = OcpRoutes(kube_config_file=self.kube_config_file)
        self.ocp_pod = OcpPods(kube_config_file=self.kube_config_file)
        self.ocp_deployment = OcpDeploymentconfigs(
            kind="Deployment", kube_config_file=self.kube_config_file)
        self.ocp_config = OcpConfig(
            kind="Config",
            api_version="imageregistry.operator.openshift.io/v1",
            kube_config_file=self.kube_config_file)
        self.ocp_secret = OcpSecret(kube_config_file=self.kube_config_file)

    def check_node_health(self):
        """
        Check the health of each cluster node
        This method checks for:
            - DiskPressure: all nodes have sufficient disk space
            - MemoryPressure: all nodes have sufficient memory
            - PIDPressure: all nodes have a sufficient number of available process IDs
            - If all of the above are False, the node is in a Ready (healthy) state
        :return: Tuple of all_nodes_healthy (boolean) and unhealthy_node_info (dict of node name to failure reasons)
        """
        logger.info("Checking all cluster nodes health")
        unhealthy_node_info = dict()
        all_nodes_healthy = False
        individual_node_health_status_list = list()
        node_list_info = self.ocp_node.get_all_nodes()
        if node_list_info:
            for node_info in node_list_info.items:
                temp_list = list()
                for condition in node_info.status.conditions:
                    if condition["type"] == "Ready":
                        individual_node_health_status_list.append(condition["status"])
                    elif condition["type"] == "MemoryPressure" and condition["status"] == "True":
                        temp_list.append({"MemoryPressure": "The node memory is low"})
                    elif condition["type"] == "DiskPressure" and condition["status"] == "True":
                        temp_list.append({"DiskPressure": "The disk capacity is low"})
                    elif condition["type"] == "PIDPressure" and condition["status"] == "True":
                        temp_list.append({"PIDPressure": "There are too many processes on the node"})

                # Only record nodes that actually reported a pressure condition
                if temp_list:
                    unhealthy_node_info[node_info["metadata"]["name"]] = temp_list

        logger.info(
            "Check overall health of cluster nodes by checking each node health")
        if individual_node_health_status_list and \
                set(individual_node_health_status_list) == {"True"}:
            all_nodes_healthy = True

        return all_nodes_healthy, unhealthy_node_info

    def check_router_health(self):
        """
        Check openshift router health
        - Check if the router pods are running fine in the openshift-ingress namespace
        - Check if the router deployment has a matching number of replicas
        :return: Tuple of bool (is_router_healthy) and dict (unhealthy_router_info)
        """
        logger.info("Check the health of openshift router operator pod")
        is_router_healthy = False
        unhealthy_router_info = dict()
        is_router_pod_healthy = False
        is_replicas_count_matching = False
        unhealthy_pods = dict()
        pods_response = self.ocp_pod.list_pods_in_a_namespace(
            namespace="openshift-ingress")
        for pod in pods_response.items:
            for condition in pod["status"]["conditions"]:
                if condition["type"] == "Ready" and condition["status"] == "False":
                    unhealthy_pods[pod["metadata"]["name"]] = condition["status"]

        if not unhealthy_pods:
            is_router_pod_healthy = True
        # Record the names of any unhealthy pods
        unhealthy_pod_names = list(unhealthy_pods.keys())
        unhealthy_router_info.update({"router_pod": unhealthy_pod_names})
        logger.info("Is router pod/s healthy : %s", is_router_pod_healthy)

        logger.info(
            "Check replicas count of openshift router deployment are matching")
        deployment_response = self.ocp_deployment.list_all_deployments_in_a_namespace(
            namespace="openshift-ingress")
        # Initialize so the comparison below cannot hit unbound names if the
        # router-default deployment is missing
        replicas = available_replicas = ready_replicas = None
        for deployment in deployment_response.items:
            if deployment.metadata.name == "router-default":
                replicas = deployment.status.replicas
                available_replicas = deployment.status.availableReplicas
                ready_replicas = deployment.status.readyReplicas
        if replicas is not None and replicas == available_replicas == ready_replicas:
            is_replicas_count_matching = True
        # If replicas count doesn't match, add them into unhealthy component list
        if not is_replicas_count_matching:
            unhealthy_router_info.update(
                {"router_replicas": is_replicas_count_matching})
        logger.info("Is router deployment replicas count matching : %s",
                    is_replicas_count_matching)

        logger.info("Check overall health of router operator")
        if is_router_pod_healthy and is_replicas_count_matching:
            is_router_healthy = True

        return is_router_healthy, unhealthy_router_info

        logger.info("Check overall health of router operator")
        if is_router_pod_healthy and is_replicas_count_matching:
            is_router_healthy = True

        return is_router_healthy, unhealthy_router_info

    def check_image_registry_health(self):
        """
        Check openshift cluster image registry health
        - Check if the image registry pods are running fine in the openshift-image-registry namespace
        - Check if the image registry deployments have matching replica counts
        :return: Tuple of bool (is_image_registry_healthy) and dict (unhealthy_image_registry_info)
        """
        logger.info("Check health of openshift image registry pods")
        is_image_registry_healthy = False
        unhealthy_image_registry_info = dict()
        is_image_registry_pods_healthy = False
        unhealthy_pods = dict()
        pods_response = self.ocp_pod.list_pods_in_a_namespace(
            namespace="openshift-image-registry")
        for pod in pods_response.items:
            if "cluster-image-registry-operator" in pod.metadata.name or \
                    "image-registry" in pod.metadata.name:
                for condition in pod["status"]["conditions"]:
                    if condition["type"] == "Ready" and condition["status"] == "False":
                        unhealthy_pods[pod["metadata"]["name"]] = condition["status"]

        if not unhealthy_pods:
            is_image_registry_pods_healthy = True
        # Record the names of any unhealthy pods
        unhealthy_pod_names = list(unhealthy_pods.keys())
        unhealthy_image_registry_info.update(
            {"image_registry_pod": unhealthy_pod_names})
        logger.info("Is image registry pod/s healthy : %s",
                    is_image_registry_pods_healthy)

        logger.info(
            "Check replicas count of openshift image registry deployment are matching"
        )
        is_replicas_count_matching = False
        replica_count_dict = dict()
        deployment_response = \
            self.ocp_deployment.list_all_deployments_in_a_namespace(namespace="openshift-image-registry")
        for deployment in deployment_response.items:
            # Compare replica counts only for the image registry deployments;
            # doing this outside the name filter could read unbound variables
            # and record a status for unrelated deployments
            if deployment.metadata.name in ("cluster-image-registry-operator",
                                            "image-registry"):
                replicas = deployment.status.replicas
                available_replicas = deployment.status.availableReplicas
                ready_replicas = deployment.status.readyReplicas
                is_replicas_count_matching_for_each_image_registry = (
                    replicas == available_replicas == ready_replicas)
                replica_count_dict[deployment.metadata.name] = \
                    is_replicas_count_matching_for_each_image_registry
                logger.info("Is replicas count for %s matching? : %s",
                            deployment.metadata.name,
                            is_replicas_count_matching_for_each_image_registry)

        if replica_count_dict and all(replica_count_dict.values()):
            is_replicas_count_matching = True
        # If replicas count doesn't match, add them into unhealthy component list
        if not is_replicas_count_matching:
            unhealthy_image_registry_info.update(
                {"image_registry_replicas": is_replicas_count_matching})
        logger.info(
            "Is replicas count matching for all image registry deployment : %s",
            is_replicas_count_matching)

        logger.info(
            "Check overall health of image registry operator by checking pod/s status and replicas count match"
        )
        if is_image_registry_pods_healthy and is_replicas_count_matching:
            is_image_registry_healthy = True

        return is_image_registry_healthy, unhealthy_image_registry_info

    def check_persistence_storage_for_image_registry(self):
        """
        Check if persistent storage is configured for the cluster image registry
        - Check the managementState of the image registry. An IPI installation defaults to "Managed";
          a UPI installation defaults to "Removed". The managementState field should be "Managed".
            Managed: The Operator updates the registry as configuration resources are updated.
            Unmanaged: The Operator ignores changes to the configuration resources.
            Removed: The Operator removes the registry instance and tears down any storage the Operator provisioned.
        - Check if persistent storage is configured for the image registry
        :return: (boolean) Return True if persistent storage is configured for the image registry, otherwise False.

        NOTE : Missing persistent storage for the image registry does not mean it is unhealthy; it is a WARNING.
        Without persistent storage, images will be inaccessible after a reboot.
        """
        is_image_registry_storage_configured = False
        is_management_state_correct = False
        is_persistence_storage_configured = True
        logger.info("Check managementState for image registry")
        image_config_response = self.ocp_config.get_ocp_config(name="cluster")
        if image_config_response["spec"]["managementState"] == "Managed":
            is_management_state_correct = True
        logger.info("Is managementState correct : %s",
                    {is_management_state_correct})

        logger.info(
            "Check if persistent storage is configured for the image registry")
        if "emptyDir" in dict(image_config_response["spec"]["storage"]):
            is_persistence_storage_configured = False
        logger.info("Is persistence Storage Configured: %s",
                    is_persistence_storage_configured)

        if is_management_state_correct and is_persistence_storage_configured:
            is_image_registry_storage_configured = True

        return is_image_registry_storage_configured

    def check_api_server_health(self):
        """
        Check that the openshift API server is reachable and healthy
        :return: (boolean) Return True if api server is healthy otherwise False
        """
        status_codes = dict()
        is_api_server_healthy = False
        kubeconfig_data = self.get_data_from_kubeconfig_v4()

        logger.info("Check health of API Server")
        api_server_url = kubeconfig_data["api_server_url"]
        final_api_server_url = api_server_url + "/healthz"
        logger.info("API Server URL : %s", final_api_server_url)
        bearer_token = self.ocp_secret.get_long_live_bearer_token()
        headers = {'Authorization': 'Bearer ' + bearer_token}

        # Suppress only the InsecureRequestWarning from urllib3.
        requests.packages.urllib3.disable_warnings(
            category=InsecureRequestWarning)

        api_server_response = requests.get(final_api_server_url,
                                           headers=headers,
                                           verify=False)
        logger.info("API Server Status Code : %s",
                    api_server_response.status_code)
        status_codes["api_server_status"] = api_server_response.status_code

        for value in status_codes.values():
            if value in range(200, 300):
                is_api_server_healthy = True

        return is_api_server_healthy

    def check_web_console_health(self):
        """
        Check if web console is reachable and healthy
        :return:(boolean) Return True if web console is healthy otherwise False
        """
        status_codes = dict()
        is_web_console_healthy = False

        logger.info("Check health of web-console")
        web_console_route = self.ocp_route.get_route_in_namespace(
            namespace="openshift-console", route_name="console")
        web_console_url = "https://" + web_console_route + ":443" + "/healthz"
        logger.info("Web Console URL : %s", web_console_url)

        # Suppress only the InsecureRequestWarning from urllib3.
        requests.packages.urllib3.disable_warnings(
            category=InsecureRequestWarning)

        web_console_response = requests.get(web_console_url, verify=False)
        logger.info("Web Console Status Code : %s",
                    web_console_response.status_code)
        status_codes["web_console_status"] = web_console_response.status_code

        for value in status_codes.values():
            if value in range(200, 300):
                is_web_console_healthy = True

        return is_web_console_healthy

    def check_cluster_version_operator_health(self):
        """
        Check ClusterVersion operator health
        :return: (boolean) Return True if the ClusterVersion operator is healthy otherwise False
        """
        logger.info("Check health of ClusterVersion operator")
        is_cluster_version_operator_healthy = False

        cluster_version_response = self.ocp_cluster_version.get_cluster_version()
        for cluster_version in cluster_version_response.items:
            if cluster_version["metadata"]["name"] == "version":
                for condition in cluster_version["status"]["conditions"]:
                    if condition["type"] == "Available" and condition["status"] == "True":
                        is_cluster_version_operator_healthy = True

        return is_cluster_version_operator_healthy

    def check_control_plane_status(self):
        """
        Check health of cluster control plane components
        Command : "oc get cs OR oc get componentstatus"
        :return: (tuple) Return tuple of overall health of control plane component (boolean) and list of unhealthy
        components if any
        """
        logger.info("Checking control plan status")
        all_control_plane_components_healthy = False
        unhealthy_components_list = list()
        control_plane_components = self.ocp_control_plane.get_all_control_plane_components()
        if control_plane_components:
            for control_plane_component in control_plane_components.items:
                for condition in control_plane_component.conditions:
                    # componentstatus conditions use type "Healthy"; a component is
                    # unhealthy when that condition's status is not "True"
                    if condition["type"] == "Healthy" and condition["status"] != "True":
                        unhealthy_components_list.append(
                            control_plane_component["metadata"]["name"])

        # Set control plane health by checking all control plane components health
        if len(unhealthy_components_list) == 0:
            all_control_plane_components_healthy = True

        return all_control_plane_components_healthy, unhealthy_components_list

    def check_cluster_operators_health(self):
        """
        Check health of cluster operators
        Command : "oc get co OR oc get clusteroperator"
        :return: (tuple) Return overall health of cluster operators (boolean) and list of unhealthy operators if any
        """
        logger.info("Checking all cluster operators health")
        all_cluster_operators_healthy = False
        unhealthy_operators_list = list()
        cluster_operators = self.ocp_cluster_operator.get_all_cluster_operators()
        if cluster_operators:
            for cluster_operator in cluster_operators.items:
                for condition in cluster_operator.status.conditions:
                    if condition["type"] == "Available" and condition["status"] == "False":
                        unhealthy_operators_list.append(
                            cluster_operator["metadata"]["name"])

        # Set cluster operator health status by checking that all operators are healthy
        if len(unhealthy_operators_list) == 0:
            all_cluster_operators_healthy = True

        return all_cluster_operators_healthy, unhealthy_operators_list
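
The class docstring promises a tabular display of unhealthy components. A minimal driver sketch that aggregates the checks above into one report; the report function itself is an assumption, only the method names come from the class:

# Sketch: aggregate the individual checks into a simple tabular report.
def report_cluster_health(checker):
    rows = [
        ("nodes",) + checker.check_node_health(),
        ("router",) + checker.check_router_health(),
        ("image-registry",) + checker.check_image_registry_health(),
        ("control-plane",) + checker.check_control_plane_status(),
        ("cluster-operators",) + checker.check_cluster_operators_health(),
        ("api-server", checker.check_api_server_health(), None),
        ("web-console", checker.check_web_console_health(), None),
        ("cluster-version", checker.check_cluster_version_operator_health(), None),
    ]
    for component, healthy, details in rows:
        status = "healthy" if healthy else "UNHEALTHY"
        print(f"{component:<20} {status:<10} {details if details else ''}")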
Code example #5
class OcpMachineSet(OcpBase):
    """
    OcpMachineSet class extends OcpBase and encapsulates all methods
    related to managing Openshift Machine Sets.
    :param kube_config_file: A kubernetes config file.
    :return: None
    """
    def __init__(self, kube_config_file=None):
        super(OcpMachineSet, self).__init__(kube_config_file=kube_config_file)
        self.api_version = "machine.openshift.io/v1beta1"
        self.kind = "MachineSet"
        self.machineset = self.dyn_client.resources.get(
            api_version=self.api_version, kind=self.kind)
        self.machine = OcpMachines(kube_config_file=kube_config_file)
        self.node = OcpNodes(kube_config_file=kube_config_file)

    def get_machine_sets(self) -> ResourceList:
        """
        Get all Machine sets in a cluster
        :return: MachineSetList on success OR an empty list on failure
        """
        api_response = list()
        try:
            api_response = self.machineset.get(namespace=MACHINE_NAMESPACE)
        except ApiException as e:
            logger.error("Exception while getting all Machine Sets: %s\n", e)

        return api_response

    def get_machine_set(self, machine_set_name: str) -> ResourceInstance:
        """
        Get a Machine set by name
        :param machine_set_name: (str) name of the machine set
        :return: MachineSet object on success OR None on failure
        """
        api_response = None
        try:
            api_response = self.machineset.get(name=machine_set_name,
                                               namespace=MACHINE_NAMESPACE)
        except ApiException as e:
            logger.error("Exception while getting Machine set: %s\n", e)

        return api_response

    def get_machine_set_role(self, machine_set_name: str) -> str:
        """
        Get a Machine set role
        :param machine_set_name: (str) name of the machine set
        :return: Machine set role on success OR empty string on failure
        """
        role = str()
        machine_set = self.get_machine_set(machine_set_name)
        # Guard against a lookup failure so we return an empty string as documented
        if machine_set:
            role = machine_set.metadata.labels[
                "machine.openshift.io/cluster-api-machine-role"]
        return role

    def is_machine_set_ready(self, machine_set_name: str) -> bool:
        """
        Verify that a Machine set reflects the desired number of user-specified replicas
        :param machine_set_name: (str) name of the machine set
        :return: (bool) True when readyReplicas == replicas OR False otherwise
        """
        field_selector = f"metadata.name={machine_set_name}"
        for event in self.machineset.watch(namespace=MACHINE_NAMESPACE,
                                           field_selector=field_selector,
                                           timeout=TIMEOUT):
            requested_replicas = event["object"]["status"]["replicas"]
            ready_replicas = event["object"]["status"]["readyReplicas"]
            if requested_replicas == ready_replicas:
                return True
            else:
                logger.info("Waiting for replicas to match ready replicas")
        return False

    def scale_machine_set(self, machine_set_name: str,
                          replicas: int) -> bool:  # noqa: C901
        """
        Scale a Machine set to the desired number of user-specified replicas
        :param machine_set_name: (str) name of the machine set
        :param replicas: (int) the number of desired machine replicas
        :return: (bool) True when successfully scaling a Machine set object OR False otherwise
        """
        def _verify_successful_scale_up(machine_set_name: str) -> bool:
            """
            Once a patch operation is successfully completed, a scale up is deemed successful
            if the following conditions are met:
                1. The newly generated machines reach a ready state
                2. New nodes corresponding to the newly created machines are created
                   and reach a ready state.
            :param machine_set_name: (str) name of the machine set
            :return: (bool) True if the given machine set is successfully scaled up OR False otherwise.
            """
            scaled_up_machines_list = self.machine.get_machines_in_machineset(
                machine_set_name)
            creation_phases = {"Provisioning", "Provisioned"}
            new_machine_names = [
                machine.metadata.name
                for machine in scaled_up_machines_list.items
                if machine.status.phase in creation_phases
            ]
            new_machines_ready = True
            for machine_name in new_machine_names:
                new_machines_ready = new_machines_ready and self.machine.is_machine_created(
                    machine_name)
            if new_machines_ready:
                new_nodes_ready = True
                for machine in new_machine_names:
                    logger.debug("Checking that new nodes are available")
                    node_name = self.machine.get_machine_node_ref(machine)
                    new_nodes_ready = new_nodes_ready and self.node.is_node_ready(
                        node_name)
                return new_nodes_ready
            else:
                raise AssertionError(
                    "New machine(s) resulting from scaling did not reach a ready state"
                )

        def _verify_successful_scale_down(machine_set_name: str) -> bool:
            """
            Once a patch operation is successfully completed, a scale down is deemed successful
            if the following conditions are met:
                1. Enough machines are deleted to meet the desired number of replicas
                2. Nodes corresponding to the deleted machines are in turn deleted as well.
            :param machine_set_name: (str) name of the machine set
            :return: (bool) True if the given machine set is successfully scaled down OR False otherwise.
            """
            scaled_down_machines_list = self.machine.get_machines_in_machineset(
                machine_set_name)
            machine_names_to_be_deleted = [
                machine.metadata.name
                for machine in scaled_down_machines_list.items
                if machine.status.phase == "Deleting"
            ]
            node_names_to_be_deleted = list()
            for machine in machine_names_to_be_deleted:
                node_names_to_be_deleted.append(
                    self.machine.get_machine_node_ref(machine))
            logger.debug("Machines to be deleted are: {}".format(
                machine_names_to_be_deleted))
            excess_machines_deleted = True
            for machine_name in machine_names_to_be_deleted:
                excess_machines_deleted = excess_machines_deleted and self.machine.is_machine_deleted(
                    machine_name)
            if excess_machines_deleted:
                excess_nodes_deleted = True
                for node in node_names_to_be_deleted:
                    logger.debug("Checking that scaled down nodes are removed")
                    excess_nodes_deleted = excess_nodes_deleted and self.node.is_node_deleted(
                        node)
                return excess_nodes_deleted
            else:
                raise AssertionError(
                    "Scale down operation did not complete successfully")

        def _is_watched_desired(machine_set_name: str,
                                desired_replicas: int) -> bool:
            """
            After patching a Machine set object with a different replica value, this method
            is meant to verify that the 'replicas' value reflects the value we used with
            the patch operation.
            :param machine_set_name: (str) The name of the machine set
            :param desired_replicas: (int) The number of replicas to be watched
            :return: (bool) True when values match OR False otherwise
            """
            field_selector = f"metadata.name={machine_set_name}"
            for event in self.machineset.watch(namespace=MACHINE_NAMESPACE,
                                               field_selector=field_selector,
                                               timeout=TIMEOUT):
                if event["object"]["status"]["replicas"] == desired_replicas:
                    return True
                else:
                    logger.debug(
                        "Waiting for MachineSet to reflect new number of desired replicas"
                    )
            return False

        initial_machines = self.machine.get_machines_in_machineset(
            machine_set_name)
        initial_machine_names = {
            machine.metadata.name for machine in initial_machines.items}
        initial_machines_count = len(initial_machine_names)
        # If the number of existing machines equals the desired replicas, nothing to do.
        if initial_machines_count == replicas:
            logger.info(
                "Desired replicas is already equal to number of machines. No scaling required"
            )
            return True
        body = {"spec": {"replicas": replicas}}
        api_response = None
        try:
            api_response = self.machineset.patch(
                name=machine_set_name,
                body=body,
                namespace=MACHINE_NAMESPACE,
                content_type="application/merge-patch+json",
            )
        except ApiException as e:
            logger.error("Exception while updating MachineSet: %s\n", e)
        if not _is_watched_desired(machine_set_name, replicas):
            raise AssertionError(
                "The MachineSet does not reflect the desired number of replicas"
            )

        if api_response:
            if initial_machines_count < replicas:
                return _verify_successful_scale_up(machine_set_name)
            else:
                return _verify_successful_scale_down(machine_set_name)
        # The patch never succeeded; report failure as the bool return promises
        return False
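
An end-to-end usage sketch for the class above; the machine set name and kubeconfig path are placeholders, and the current replica count is read from the object's spec:

# Sketch: list machine sets, then scale one up by a single replica.
ms_api = OcpMachineSet(kube_config_file="~/.kube/config")  # path is a placeholder
ms_list = ms_api.get_machine_sets()
if ms_list:
    for ms in ms_list.items:
        print(ms.metadata.name, ms_api.get_machine_set_role(ms.metadata.name))

target = "example-worker-machineset"  # hypothetical name
current = ms_api.get_machine_set(target)
if current and ms_api.scale_machine_set(target, current.spec.replicas + 1):
    print("Scale-up complete; new machines and nodes are ready")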