def __init__(self, kube_config_file=None):
    """Set up the MachineSet dynamic client and its machine/node helpers.

    :param kube_config_file: optional path to a kubernetes config file
    """
    super(OcpMachineSet, self).__init__(kube_config_file=kube_config_file)
    # Companion helpers for the machines and nodes backing a MachineSet,
    # bound to the same kubeconfig.
    self.machine = OcpMachines(kube_config_file=kube_config_file)
    self.node = OcpNodes(kube_config_file=kube_config_file)
    self.api_version = "machine.openshift.io/v1beta1"
    self.kind = "MachineSet"
    # Dynamic-client resource handle used for all MachineSet API calls.
    self.machineset = self.dyn_client.resources.get(api_version=self.api_version,
                                                    kind=self.kind)
def __init__(self, kube_config_file):
    """Wire up one OCP API helper per component this checker inspects.

    :param kube_config_file: path to a kubernetes config file
    """
    self.kube_config_file = kube_config_file
    super(OcpHealthChecker, self).__init__(kube_config_file=self.kube_config_file)
    # Single local alias keeps the repeated keyword argument readable.
    cfg = self.kube_config_file
    self.ocp_node = OcpNodes(kube_config_file=cfg)
    self.ocp_cluster_operator = OcpClusterOperator(kube_config_file=cfg)
    self.ocp_control_plane = OcpControlPlane(kube_config_file=cfg)
    self.ocp_cluster_version = OcpClusterVersion(kube_config_file=cfg)
    self.ocp_route = OcpRoutes(kube_config_file=cfg)
    self.ocp_pod = OcpPods(kube_config_file=cfg)
    self.ocp_deployment = OcpDeploymentconfigs(kind="Deployment",
                                               kube_config_file=cfg)
    self.ocp_config = OcpConfig(kind="Config",
                                api_version="imageregistry.operator.openshift.io/v1",
                                kube_config_file=cfg)
    self.ocp_secret = OcpSecret(kube_config_file=cfg)
def ocp_node(get_kubeconfig):
    """Provide an OcpNodes client built from the supplied kubeconfig path."""
    nodes_client = OcpNodes(kube_config_file=get_kubeconfig)
    return nodes_client
class OcpHealthChecker(OcpBase):
    """
    OcpHealthChecker will check the health of certain critical openshift
    components.

    - Node/Controller
    - Router
    - ImageRegistry
    - Persistence Storage for ImageRegistry
    - API Server
    - Web Console
    - Cluster Version
    - Control Planes
    - Cluster Operators

    Every component's health check returns the overall health of that
    component (bool) and, optionally, the unhealthy sub-components
    (list or dict). The optional unhealthy components are displayed in
    tabular format when the health of the openshift cluster is checked.
    """

    def __init__(self, kube_config_file):
        """Create one API helper per component this checker inspects.

        :param kube_config_file: path to a kubernetes config file
        """
        self.kube_config_file = kube_config_file
        super(OcpHealthChecker, self).__init__(
            kube_config_file=self.kube_config_file)
        self.ocp_node = OcpNodes(kube_config_file=self.kube_config_file)
        self.ocp_cluster_operator = OcpClusterOperator(
            kube_config_file=self.kube_config_file)
        self.ocp_control_plane = OcpControlPlane(
            kube_config_file=self.kube_config_file)
        self.ocp_cluster_version = OcpClusterVersion(
            kube_config_file=self.kube_config_file)
        self.ocp_route = OcpRoutes(kube_config_file=self.kube_config_file)
        self.ocp_pod = OcpPods(kube_config_file=self.kube_config_file)
        self.ocp_deployment = OcpDeploymentconfigs(
            kind="Deployment", kube_config_file=self.kube_config_file)
        self.ocp_config = OcpConfig(
            kind="Config",
            api_version="imageregistry.operator.openshift.io/v1",
            kube_config_file=self.kube_config_file)
        self.ocp_secret = OcpSecret(kube_config_file=self.kube_config_file)

    def check_node_health(self):
        """Check the health of each cluster node.

        Method checks for:
        - DiskPressure: all nodes have sufficient disk space
        - MemoryPressure: all nodes have sufficient memory
        - PIDPressure: all nodes have a sufficient number of processes
        - If ALL of the above are False, the node is in a ready (healthy)
          state.

        :return: tuple of all_nodes_healthy (bool) and node health info
                 (dict mapping node name to a list of failure reasons)
        """
        logger.info("Checking all cluster nodes health")
        unhealthy_node_info = dict()
        all_nodes_healthy = False
        individual_node_health_status_list = list()
        node_list_info = self.ocp_node.get_all_nodes()
        if node_list_info:
            for node_info in node_list_info.items:
                failure_reasons = list()
                for condition in node_info.status.conditions:
                    if condition["type"] == "Ready":
                        individual_node_health_status_list.append(
                            condition["status"])
                    elif condition["type"] == "MemoryPressure" and \
                            condition["status"] == "True":
                        failure_reasons.append(
                            {"MemoryPressure": "The node memory is low"})
                    elif condition["type"] == "DiskPressure" and \
                            condition["status"] == "True":
                        failure_reasons.append(
                            {"DiskPressure": "The disk capacity is low"})
                    elif condition["type"] == "PIDPressure" and \
                            condition["status"] == "True":
                        failure_reasons.append({
                            "PIDPressure":
                            "There are too many processes on the node"
                        })
                # NOTE(review): every node gets an entry here, even with an
                # empty reason list — downstream display appears to rely on
                # that shape, so it is kept as-is.
                unhealthy_node_info[
                    node_info["metadata"]["name"]] = failure_reasons
        logger.info(
            "Check overall health of cluster nodes by checking each node health"
        )
        # Healthy only when every collected "Ready" status is the single
        # value "True" (empty list => unhealthy/unknown).
        if len(set(individual_node_health_status_list)) == 1 and \
                list(set(individual_node_health_status_list))[0] == "True":
            all_nodes_healthy = True
        return all_nodes_healthy, unhealthy_node_info

    def check_router_health(self):
        """Check openshift router health.

        - Check that the router pods are running fine in the
          "openshift-ingress" namespace.
        - Check that the "router-default" deployment has matching
          replicas / availableReplicas / readyReplicas counts.

        :return: tuple of is_router_healthy (bool) and
                 unhealthy_router_info (dict)
        """
        logger.info("Check the health of openshift router operator pod")
        is_router_healthy = False
        unhealthy_router_info = dict()
        is_router_pod_healthy = False
        is_replicas_count_matching = False
        unhealthy_pods = dict()
        pods_response = self.ocp_pod.list_pods_in_a_namespace(
            namespace="openshift-ingress")
        for pod in pods_response.items:
            for condition in pod["status"]["conditions"]:
                if condition["type"] == "Ready" and \
                        condition["status"] == "False":
                    unhealthy_pods[
                        pod["metadata"]["name"]] = condition["status"]
        if len(unhealthy_pods) == 0:
            is_router_pod_healthy = True
        # Record (possibly empty) list of unhealthy pod names.
        unhealthy_router_info.update({"router_pod": list(unhealthy_pods)})
        logger.info("Is router pod/s healthy : %s", is_router_pod_healthy)
        logger.info(
            "Check replicas count of openshift router deployment are matching")
        deployment_response = \
            self.ocp_deployment.list_all_deployments_in_a_namespace(
                namespace="openshift-ingress")
        for deployment in deployment_response.items:
            if deployment.metadata.name == "router-default":
                replicas = deployment.status.replicas
                available_replicas = deployment.status.availableReplicas
                ready_replicas = deployment.status.readyReplicas
                if replicas == available_replicas == ready_replicas:
                    is_replicas_count_matching = True
        # If replicas count doesn't match, add it to the unhealthy info.
        if not is_replicas_count_matching:
            unhealthy_router_info.update(
                {"router_replicas": is_replicas_count_matching})
        logger.info("Is router deployment replicas count matching : %s",
                    is_replicas_count_matching)
        logger.info("Check overall health of router operator")
        # BUG FIX: the original repeated this entire final block twice;
        # the second copy sat after the return and was unreachable dead
        # code. It has been removed.
        if is_router_pod_healthy and is_replicas_count_matching:
            is_router_healthy = True
        return is_router_healthy, unhealthy_router_info

    def check_image_registry_health(self):
        """Check openshift cluster image registry health.

        - Check that the image registry pods are running fine in the
          "openshift-image-registry" namespace.
        - Check that the image registry deployments have matching
          replicas / availableReplicas / readyReplicas counts.

        :return: tuple of is_image_registry_healthy (bool) and
                 unhealthy_image_registry_info (dict)
        """
        logger.info("Check health of openshift image registry pods")
        is_image_registry_healthy = False
        unhealthy_image_registry_info = dict()
        is_image_registry_pods_healthy = False
        unhealthy_pods = dict()
        pods_response = self.ocp_pod.list_pods_in_a_namespace(
            namespace="openshift-image-registry")
        for pod in pods_response.items:
            if "cluster-image-registry-operator" in pod.metadata.name or \
                    "image-registry" in pod.metadata.name:
                for condition in pod["status"]["conditions"]:
                    if condition["type"] == "Ready" and \
                            condition["status"] == "False":
                        unhealthy_pods[
                            pod["metadata"]["name"]] = condition["status"]
        if len(unhealthy_pods) == 0:
            is_image_registry_pods_healthy = True
        # Record (possibly empty) list of unhealthy pod names.
        unhealthy_image_registry_info.update(
            {"image_registry_pod": list(unhealthy_pods)})
        logger.info("Is image registry pod/s healthy : %s",
                    is_image_registry_pods_healthy)
        logger.info(
            "Check replicas count of openshift image registry deployment are matching"
        )
        is_replicas_count_matching = False
        replica_count_dict = dict()
        deployment_response = \
            self.ocp_deployment.list_all_deployments_in_a_namespace(
                namespace="openshift-image-registry")
        for deployment in deployment_response.items:
            if deployment.metadata.name == "cluster-image-registry-operator" or \
                    deployment.metadata.name == "image-registry":
                replicas = deployment.status.replicas
                available_replicas = deployment.status.availableReplicas
                ready_replicas = deployment.status.readyReplicas
                # BUG FIX: the original only assigned the per-deployment
                # flag when the counts matched, so a mismatch either raised
                # NameError (first iteration) or silently reused the
                # previous deployment's stale True value.
                deployment_replicas_match = (
                    replicas == available_replicas == ready_replicas)
                replica_count_dict[
                    deployment.metadata.name] = deployment_replicas_match
                logger.info("Is replicas count for %s matching? : %s",
                            deployment.metadata.name,
                            deployment_replicas_match)
        if all(replica_count_dict.values()):
            is_replicas_count_matching = True
        # If replicas count doesn't match, add it to the unhealthy info.
        if not is_replicas_count_matching:
            unhealthy_image_registry_info.update(
                {"image_registry_replicas": is_replicas_count_matching})
        logger.info(
            "Is replicas count matching for all image registry deployment : %s",
            is_replicas_count_matching)
        logger.info(
            "Check overall health of image registry operator by checking pod/s status and replicas count match"
        )
        if is_image_registry_pods_healthy and is_replicas_count_matching:
            is_image_registry_healthy = True
        return is_image_registry_healthy, unhealthy_image_registry_info

    def check_persistence_storage_for_image_registry(self):
        """Check if persistence storage is configured for the image registry.

        - Check the managementState of image-registry. IPI installation
          defaults to "Managed"; UPI installation defaults to "Removed".
          The managementState field should be "Managed".
            Managed: the Operator updates the registry as configuration
                resources are updated.
            Unmanaged: the Operator ignores changes to the configuration
                resources.
            Removed: the Operator removes the registry instance and tears
                down any storage that the Operator provisioned.
        - Check that the registry storage is not "emptyDir" (i.e. that
          persistent storage is configured).

        :return: (bool) True if persistence storage is configured for the
                 image registry, otherwise False.

        NOTE: missing persistence storage for the image registry is not
        unhealthy per se — it is a WARNING. Without persistent storage,
        images become inaccessible after a reboot.
        """
        is_image_registry_storage_configured = False
        is_management_state_correct = False
        is_persistence_storage_configured = True
        logger.info("Check managementState for image registry")
        image_config_response = self.ocp_config.get_ocp_config(name="cluster")
        if image_config_response["spec"]["managementState"] == "Managed":
            is_management_state_correct = True
        # BUG FIX: the original logged {is_management_state_correct} — a
        # one-element set literal — instead of the bool itself.
        logger.info("Is managementState correct : %s",
                    is_management_state_correct)
        logger.info(
            "Check if persistence storage configured for image registry")
        # "emptyDir" storage is ephemeral, i.e. NOT persistent.
        if "emptyDir" in dict(image_config_response["spec"]["storage"]):
            is_persistence_storage_configured = False
        logger.info("Is persistence Storage Configured: %s",
                    is_persistence_storage_configured)
        if is_management_state_correct and is_persistence_storage_configured:
            is_image_registry_storage_configured = True
        return is_image_registry_storage_configured

    def check_api_server_health(self):
        """Check that the openshift apiserver is reachable and healthy.

        Performs a GET on <api_server_url>/healthz with a bearer token and
        treats any 2xx status code as healthy.

        :return: (bool) True if the api server is healthy, otherwise False
        """
        is_api_server_healthy = False
        kubeconfig_data = self.get_data_from_kubeconfig_v4()
        logger.info("Check health of API Server")
        api_server_url = kubeconfig_data["api_server_url"]
        final_api_server_url = api_server_url + "/healthz"
        logger.info("API Server URL : %s", final_api_server_url)
        bearer_token = self.ocp_secret.get_long_live_bearer_token()
        headers = {'Authorization': 'Bearer ' + bearer_token}
        # Suppress only the single warning from urllib3 needed
        # (self-signed cluster certs; verification intentionally off).
        requests.packages.urllib3.disable_warnings(
            category=InsecureRequestWarning)
        api_server_response = requests.get(final_api_server_url,
                                           headers=headers,
                                           verify=False)
        logger.info("API Server Status Code : %s",
                    api_server_response.status_code)
        # Any 2xx status is considered healthy.
        if 200 <= api_server_response.status_code < 300:
            is_api_server_healthy = True
        return is_api_server_healthy

    def check_web_console_health(self):
        """Check that the web console is reachable and healthy.

        Performs a GET on the console route's /healthz endpoint and treats
        any 2xx status code as healthy.

        :return: (bool) True if the web console is healthy, otherwise False
        """
        is_web_console_healthy = False
        logger.info("Check health of web-console")
        web_console_route = self.ocp_route.get_route_in_namespace(
            namespace="openshift-console", route_name="console")
        web_console_url = "https://" + web_console_route + ":443" + "/healthz"
        logger.info("Web Console URL : %s", web_console_url)
        # Suppress only the single warning from urllib3 needed
        # (self-signed cluster certs; verification intentionally off).
        requests.packages.urllib3.disable_warnings(
            category=InsecureRequestWarning)
        web_console_response = requests.get(web_console_url, verify=False)
        logger.info("Web Console Status Code : %s",
                    web_console_response.status_code)
        # Any 2xx status is considered healthy.
        if 200 <= web_console_response.status_code < 300:
            is_web_console_healthy = True
        return is_web_console_healthy

    def check_cluster_version_operator_health(self):
        """Check ClusterVersion operator health.

        The operator is healthy when the "version" ClusterVersion object
        has an "Available" condition with status "True".

        :return: (bool) True when the ClusterVersion operator is available,
                 otherwise False
        """
        logger.info("Check health of ClusterVersion operator")
        is_cluster_version_operator_healthy = False
        cluster_version_response = \
            self.ocp_cluster_version.get_cluster_version()
        for cluster_version in cluster_version_response.items:
            if cluster_version["metadata"]["name"] == "version":
                for condition in cluster_version["status"]["conditions"]:
                    if condition["type"] == "Available" and \
                            condition["status"] == "True":
                        is_cluster_version_operator_healthy = True
        return is_cluster_version_operator_healthy

    def check_control_plane_status(self):
        """Check health of cluster control plane components.

        Command : "oc get cs" OR "oc get componentstatus"

        :return: (tuple) overall health of the control plane components
                 (bool) and a list of unhealthy component names, if any
        """
        logger.info("Checking control plane status")
        all_control_plane_components_healthy = False
        unhealthy_components_list = list()
        control_plane_components = \
            self.ocp_control_plane.get_all_control_plane_components()
        if control_plane_components:
            for control_plane_component in control_plane_components.items:
                for condition in control_plane_component.conditions:
                    # BUG FIX: the original tested
                    # type != "Healthy" and not status == "False", which
                    # can never flag a failing "Healthy" condition (the
                    # only condition type componentstatus reports). A
                    # component is unhealthy when its "Healthy" condition
                    # reports status "False".
                    if condition["type"] == "Healthy" and \
                            condition["status"] == "False":
                        unhealthy_components_list.append(
                            control_plane_component["metadata"]["name"])
        # Control plane is healthy only when no component was flagged.
        if len(unhealthy_components_list) == 0:
            all_control_plane_components_healthy = True
        return all_control_plane_components_healthy, unhealthy_components_list

    def check_cluster_operators_health(self):
        """Check health of the cluster operators.

        Command : "oc get co" OR "oc get clusteroperator"

        :return: (tuple) overall health of the cluster operators (bool)
                 and a list of unhealthy operator names, if any
        """
        logger.info("Checking all cluster operators health")
        all_cluster_operators_healthy = False
        unhealthy_operators_list = list()
        cluster_operators = \
            self.ocp_cluster_operator.get_all_cluster_operators()
        if cluster_operators:
            for cluster_operator in cluster_operators.items:
                for condition in cluster_operator.status.conditions:
                    # An operator is unhealthy when its "Available"
                    # condition reports status "False".
                    if condition["type"] == "Available" and \
                            condition["status"] == "False":
                        unhealthy_operators_list.append(
                            cluster_operator["metadata"]["name"])
        # All operators healthy only when none was flagged.
        if len(unhealthy_operators_list) == 0:
            all_cluster_operators_healthy = True
        return all_cluster_operators_healthy, unhealthy_operators_list
class OcpMachineSet(OcpBase):
    """
    OcpMachineSet class extends OcpBase and encapsulates all methods
    related to managing Openshift Machine Sets.

    :param kube_config_file: A kubernetes config file.
    :return: None
    """

    def __init__(self, kube_config_file=None):
        """Set up the MachineSet dynamic client and machine/node helpers."""
        super(OcpMachineSet, self).__init__(kube_config_file=kube_config_file)
        self.api_version = "machine.openshift.io/v1beta1"
        self.kind = "MachineSet"
        # Dynamic-client resource handle used for all MachineSet API calls.
        self.machineset = self.dyn_client.resources.get(
            api_version=self.api_version, kind=self.kind)
        # Helpers for the machines and nodes backing a MachineSet.
        self.machine = OcpMachines(kube_config_file=kube_config_file)
        self.node = OcpNodes(kube_config_file=kube_config_file)

    def get_machine_sets(self) -> ResourceList:
        """
        Get all Machine sets in a cluster
        :return: MachineSetList on success OR an empty list on failure
        """
        api_response = list()
        try:
            api_response = self.machineset.get(namespace=MACHINE_NAMESPACE)
        except ApiException as e:
            logger.error("Exception while getting all Machine Sets: %s\n", e)
        return api_response

    def get_machine_set(self, machine_set_name: str) -> ResourceInstance:
        """
        Get a Machine set by name
        :param machine_set_name: (str) name of the machine set
        :return: MachineSet object on success OR None on failure
        """
        api_response = None
        try:
            api_response = self.machineset.get(name=machine_set_name,
                                               namespace=MACHINE_NAMESPACE)
        except ApiException as e:
            logger.error("Exception while getting Machine set: %s\n", e)
        return api_response

    def get_machine_set_role(self, machine_set_name: str) -> str:
        """
        Get a Machine set role
        :param machine_set_name: (str) name of the machine set
        :return: Machine set role on success OR empty string on failure
        """
        role = str()
        machine_set = self.get_machine_set(machine_set_name)
        # BUG FIX: get_machine_set returns None on failure; guard so this
        # method honors its documented "empty string on failure" contract
        # instead of raising AttributeError.
        if machine_set is not None:
            role = machine_set.metadata.labels[
                "machine.openshift.io/cluster-api-machine-role"]
        return role

    def is_machine_set_ready(self, machine_set_name: str) -> bool:
        """
        Verify that a Machine set reflects the desired number of user
        specified replicas. Watches the MachineSet until
        readyReplicas == replicas or the watch times out.

        :param machine_set_name: (str) name of the machine set
        :return: (bool) True when readyReplicas == replicas OR False
                 otherwise
        """
        field_selector = f"metadata.name={machine_set_name}"
        for event in self.machineset.watch(namespace=MACHINE_NAMESPACE,
                                           field_selector=field_selector,
                                           timeout=TIMEOUT):
            requested_replicas = event["object"]["status"]["replicas"]
            ready_replicas = event["object"]["status"]["readyReplicas"]
            if requested_replicas == ready_replicas:
                return True
            else:
                logger.info("Waiting for replicas to match ready replicas")
        # Watch timed out without the counts ever matching.
        return False

    def scale_machine_set(self, machine_set_name: str, replicas: int) -> bool:  # noqa: C901
        """
        Scale a Machine set to the desired number of user specified
        replicas and verify the operation completed.

        :param machine_set_name: (str) name of the machine set
        :param replicas: (int) the number of desired machine replicas
        :return: (bool) True when successfully scaling a Machine set
                 object OR False otherwise
        :raises AssertionError: when the MachineSet never reflects the
                 desired replica count, or when the resulting machines or
                 nodes fail to reach the expected state
        """

        def _verify_successful_scale_up(machine_set_name: str) -> bool:
            """
            Once a patch operation is successfully completed, a scale up
            is deemed successful if the following conditions are met:
            1. The newly generated machines reach a ready state
            2. New nodes corresponding to the newly created machines are
               created and reach a ready state.
            :param machine_set_name: (str) name of the machine set
            :return: (bool) True if the given machine set is successfully
                     scaled up OR False otherwise.
            """
            scaled_up_machines_list = self.machine.get_machines_in_machineset(
                machine_set_name)
            # Machines still coming up are in one of these phases.
            creation_phases = {"Provisioning", "Provisioned"}
            new_machine_names = [
                machine.metadata.name
                for machine in scaled_up_machines_list.items
                if machine.status.phase in creation_phases
            ]
            new_machines_ready = True
            for machine_name in new_machine_names:
                new_machines_ready = (new_machines_ready and
                                      self.machine.is_machine_created(
                                          machine_name))
            if new_machines_ready:
                new_nodes_ready = True
                for machine in new_machine_names:
                    logger.debug("Checking that new nodes are available")
                    node_name = self.machine.get_machine_node_ref(machine)
                    new_nodes_ready = (new_nodes_ready and
                                       self.node.is_node_ready(node_name))
                return new_nodes_ready
            else:
                raise AssertionError(
                    "New machine(s) resulting from scaling did not reach a ready state"
                )

        def _verify_successful_scale_down(machine_set_name: str) -> bool:
            """
            Once a patch operation is successfully completed, a scale down
            is deemed successful if the following conditions are met:
            1. Enough machines are deleted to meet the desired number of
               replicas
            2. Nodes corresponding to the deleted machines are in turn
               deleted as well.
            :param machine_set_name: (str) name of the machine set
            :return: (bool) True if the given machine set is successfully
                     scaled down OR False otherwise.
            """
            scaled_down_machines_list = \
                self.machine.get_machines_in_machineset(machine_set_name)
            machine_names_to_be_deleted = [
                machine.metadata.name
                for machine in scaled_down_machines_list.items
                if machine.status.phase == "Deleting"
            ]
            node_names_to_be_deleted = list()
            for machine in machine_names_to_be_deleted:
                node_names_to_be_deleted.append(
                    self.machine.get_machine_node_ref(machine))
            logger.debug("Machines to be deleted are: {}".format(
                machine_names_to_be_deleted))
            excess_machines_deleted = True
            for machine_name in machine_names_to_be_deleted:
                excess_machines_deleted = (excess_machines_deleted and
                                           self.machine.is_machine_deleted(
                                               machine_name))
            if excess_machines_deleted:
                excess_nodes_deleted = True
                for node in node_names_to_be_deleted:
                    logger.debug(
                        "Checking that scaled down nodes are removed")
                    excess_nodes_deleted = (excess_nodes_deleted and
                                            self.node.is_node_deleted(node))
                return excess_nodes_deleted
            else:
                raise AssertionError(
                    "Scale down operation did not complete successfully")

        def _is_watched_desired(machine_set_name: str,
                                desired_replicas: int) -> bool:
            """
            After patching a Machine set object with a different replica
            value, this method is meant to verify that the 'replicas'
            value reflects the value we used with the patch operation.
            :param machine_set_name: (str) The name of the machine set
            :param desired_replicas: (int) The number of replicas to be
                   watched
            :return: (bool) True when values match OR False otherwise
            """
            field_selector = f"metadata.name={machine_set_name}"
            for event in self.machineset.watch(namespace=MACHINE_NAMESPACE,
                                               field_selector=field_selector,
                                               timeout=TIMEOUT):
                # BUG FIX: the original compared against the closed-over
                # 'replicas' and ignored this parameter (which was also
                # mis-annotated as str).
                if event["object"]["status"]["replicas"] == desired_replicas:
                    return True
                else:
                    logger.debug(
                        "Waiting for MachineSet to reflect new number of desired replicas"
                    )
            return False

        initial_machines = self.machine.get_machines_in_machineset(
            machine_set_name)
        initial_machine_names = set(
            [machine.metadata.name for machine in initial_machines.items])
        initial_machines_count = len(initial_machine_names)
        # If number of existing machines is the same as replicas, nothing
        # to do.
        if initial_machines_count == replicas:
            logger.info(
                "Desired replicas is already equal to number of machines. No scaling required"
            )
            return True
        body = {"spec": {"replicas": replicas}}
        api_response = None
        try:
            api_response = self.machineset.patch(
                name=machine_set_name,
                body=body,
                namespace=MACHINE_NAMESPACE,
                content_type="application/merge-patch+json",
            )
        except ApiException as e:
            logger.error("Exception while updating MachineSet: %s\n", e)
        if not _is_watched_desired(machine_set_name, replicas):
            raise AssertionError(
                "The MachineSet does not reflect the desired number of replicas"
            )
        if api_response:
            if initial_machines_count < replicas:
                return _verify_successful_scale_up(machine_set_name)
            else:
                return _verify_successful_scale_down(machine_set_name)
        # BUG FIX: the original fell off the end (implicit None) when the
        # patch call failed; return the documented bool explicitly.
        return False