def test_rolling_reboot_node(self, node_type):
    """
    Reboot the nodes of the requested type one by one and validate the
    cluster afterwards

    Args:
        node_type (str): Type of the nodes to reboot. For
            constants.WORKER_MACHINE only the nodes whose name is listed by
            machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL) are
            rebooted.

    Raises:
        FileNotFoundError: If SCALE_DATA_FILE does not exist

    """
    # The scale data file supplies the values used by the final validations
    if not os.path.exists(SCALE_DATA_FILE):
        raise FileNotFoundError
    file_data = templating.load_yaml(SCALE_DATA_FILE)
    namespace = file_data.get("NAMESPACE")
    pod_scale_list = file_data.get("POD_SCALE_LIST")
    pvc_scale_list = file_data.get("PVC_SCALE_LIST")

    # Build the list of nodes that will be rebooted
    if node_type == constants.WORKER_MACHINE:
        candidates = get_nodes(node_type=node_type)
        ocs_node_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        node_list = [cand for cand in candidates if cand.name in ocs_node_list]
    else:
        node_list = get_nodes(node_type=node_type)

    # Rolling reboot: one node per restart call, validated before the next one
    nodes = platform_nodes.PlatformNodesFactory().get_nodes_platform()
    for target in node_list:
        nodes.restart_nodes(nodes=[target])
        scale_lib.validate_node_and_oc_services_are_up_after_reboot()

    # Storage pods must be running again
    wait_for_storage_pods()

    # Ceph must report a healthy state
    ceph_ok = utils.ceph_health_check(delay=180)
    assert ceph_ok, "Ceph health in bad state after node reboots"

    # PVCs recorded in the scale data file are validated
    assert scale_lib.validate_all_pvcs_and_check_state(
        namespace=namespace, pvc_scale_list=pvc_scale_list
    )

    # Pods recorded in the scale data file are validated
    assert scale_lib.validate_all_pods_and_check_state(
        namespace=namespace, pod_scale_list=pod_scale_list
    )
def __init__(self, **kwargs):
    """
    Store the settings, create the namespace, clone the repo and label
    the nodes returned by get_nodes()

    Args:
        kwargs (dict): Following kwargs are valid (all optional)
            repo: Ripsaw repo to use - a github link
            branch: branch to use from the repo
            namespace: namespace for the operator

    Example Usage:
        r1 = RipSaw()
        r1.apply_crd(crd='ripsaw_v1alpha1_ripsaw_crd.yaml')
        # use oc apply to apply custom modified bench
        my_custom_bench = my_custom_bench.yaml
        run_cmd('oc apply -f my_custom_bench')

    """
    default_repo = "https://github.com/cloud-bulldozer/benchmark-operator"

    self.args = kwargs
    self.repo = kwargs.get("repo", default_repo)
    self.branch = kwargs.get("branch", "master")
    self.namespace = kwargs.get("namespace", RIPSAW_NAMESPACE)
    self.pgsql_is_setup = False

    self.ocp = OCP()
    self.ns_obj = OCP(kind="namespace")
    # NOTE(review): uses RIPSAW_NAMESPACE even when a custom "namespace"
    # kwarg was given - confirm this is intended
    self.pod_obj = OCP(namespace=RIPSAW_NAMESPACE, kind="pod")

    self._create_namespace()
    self._clone_ripsaw()

    worker_names = []
    for worker in get_nodes():
        worker_names.append(worker.name)
    self.worker_nodes = worker_names
    helpers.label_worker_node(
        self.worker_nodes, label_key="kernel-cache-dropper", label_value="yes"
    )
def test_osd_balance(self, es):
    """
    Grow the cluster in steps and collect OSD layout statistics after each
    step

    Current pattern is:
        add 6 osds (9 total, 3 nodes)
        add 3 nodes
        add 9 osds (18 total, 6 nodes)
        add 3 nodes
        add 9 osds (27 total, 9 nodes)

    """
    crd_data = templating.load_yaml(constants.OSD_SCALE_BENCHMARK_YAML)
    run_uuid = uuid4().hex
    self.elastic_info = ElasticData(run_uuid, crd_data)
    self.elastic_info.es_connect()
    collect_stats(INITIAL_SETUP, self.elastic_info)

    for iteration in range(MAX_TIMES_ADDED):
        worker_count = len(get_nodes(constants.WORKER_MACHINE))
        # Only the very first round on a START_NODE_NUM cluster skips the
        # node scale-up and adds fewer device sets
        initial_round = iteration == 0 and worker_count == START_NODE_NUM
        if initial_round:
            osd_incr = 2
        else:
            osd_incr = 3
            scale_ocs_node()
            collect_stats("Three nodes have been added", self.elastic_info)
        cntval = 3 * osd_incr
        logging.info(f"Adding {cntval} osds to nodes")
        scale_capacity_with_deviceset(add_deviceset_count=osd_incr, timeout=900)
        collect_stats("OSD capacity increase", self.elastic_info)

    collect_stats(FINAL_REPORT, self.elastic_info)
def test_nodes_restart(self, nodes, node_type):
    """
    Test nodes restart (from the platform layer)

    All nodes are restarted for constants.WORKER_MACHINE; for any other
    node type the selection is limited to two nodes.

    Args:
        nodes: Platform nodes object providing restart_nodes()
        node_type (str): Type of the nodes to restart

    """
    node_count = len(get_nodes(node_type=node_type))

    selection = {"node_type": node_type}
    if node_type != constants.WORKER_MACHINE:
        selection["num_of_nodes"] = 2
    ocp_nodes = get_nodes(**selection)

    nodes.restart_nodes(nodes=ocp_nodes, wait=False)
    wait_for_node_count_to_reach_status(node_count=node_count, node_type=node_type)
    self.sanity_helpers.health_check()
    self.create_resources()
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    The module level ``data_template`` dictionary is filled in place and
    returned.

    Returns:
        dict: A dictionary contains the data to push to the dashboard

    """
    log.info("Initializing the dashboard data")

    # worker type is relevant only for cloud instances.
    worker_lbl = get_nodes(num_of_nodes=1)[0].data["metadata"]["labels"]
    if "beta.kubernetes.io/instance-type" in worker_lbl:
        worker_type = worker_lbl["beta.kubernetes.io/instance-type"]
    else:
        # TODO: Maybe for None cloud we can add the Arch ?
        #   worker_type = worker_lbl['kubernetes.io/arch']
        worker_type = ""
    log.info(f"The worker type is {worker_type}")

    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info["status"]["desired"]["version"]

    # Extract <major>.<minor>. The dots are escaped and every component may
    # have several digits, so versions such as "4.10.0" are handled too.
    m = re.match(r"(\d+\.\d+)", ocs_ver_full)
    # Fall back to the full version string so ocs_ver is always defined
    ocs_ver = m.group(1) if m else ocs_ver_full
    log.info(f"ocs_ver is {ocs_ver_full}")

    platform = config.ENV_DATA["platform"].upper()
    if platform.lower() not in ["vsphere", "baremetal"]:
        platform = f"{platform.upper()} {worker_type}"

    data_template["commitid"] = ocs_ver_full
    data_template["project"] = f"OCS{ocs_ver}"
    data_template["branch"] = ocs_ver_info["spec"]["channel"]
    data_template["executable"] = ocs_ver
    data_template["environment"] = platform
    return data_template
def test_2_nodes_different_types(self, pvc_factory, pod_factory):
    """
    OCS-1274:
    - Maintenance (mark as unschedulable and drain) 1 worker node and 1
      master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the nodes as schedulable
    - Check cluster and Ceph health

    """
    # Pick the first node of each type
    nodes = []
    for node_type in ("worker", "master"):
        nodes.append(get_nodes(node_type=node_type, num_of_nodes=1)[0])
    assert nodes, "Failed to find a nodes for the test"

    node_names = []
    for picked in nodes:
        node_names.append(picked.name)

    # Unschedule and drain the selected nodes
    drain_nodes(node_names)

    # Exercise the cluster while the nodes are under maintenance:
    # create resources and then delete them
    helpers_obj = self.sanity_helpers
    helpers_obj.create_resources(pvc_factory, pod_factory)
    helpers_obj.delete_resources()

    # Bring the nodes back to schedulable
    schedule_nodes(node_names)

    # Final cluster and Ceph health checks
    helpers_obj.health_check()
def get_node_name_where_jenkins_pod_not_hosted(
    self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
):
    """
    Get node names for the requested node type

    For master nodes the names come straight from get_nodes(). For worker
    nodes, the nodes reported by get_app_pod_running_nodes() for the
    "jenkins" pods of every project in self.projects are removed from the
    result of get_worker_nodes().

    Args:
        node_type (str): The node type (e.g. worker, master)
        num_of_nodes (int): The number of nodes to be returned

    Returns:
        list: At most num_of_nodes node names

    Raises:
        ValueError: If node_type is neither worker nor master

    """
    if node_type not in (constants.MASTER_MACHINE, constants.WORKER_MACHINE):
        raise ValueError("The node type is worker or master")

    if node_type == constants.MASTER_MACHINE:
        masters = get_nodes(node_type=node_type, num_of_nodes=num_of_nodes)
        candidates = [master.name for master in masters]
    else:
        jenkins_pods = []
        for project in self.projects:
            for pod_name in get_pod_name_by_pattern(
                pattern="jenkins", namespace=project
            ):
                jenkins_pods.append(get_pod_obj(name=pod_name, namespace=project))
        occupied = set(get_app_pod_running_nodes(jenkins_pods))
        candidates = set(get_worker_nodes()).difference(occupied)

    return list(candidates)[:num_of_nodes]
def test_pv_after_reboot_node(self, nodes):
    """
    Verify unexpected PV is not created after node reboot on LSO cluster

    Each selected worker node is restarted in turn. After every restart
    the PV names are compared with the names collected before the test;
    a new PV whose storage class is "localblock" fails the test.

    Args:
        nodes: Platform nodes object providing restart_nodes()

    """
    baseline_pvs = set(get_pv_names())
    worker_nodes = get_nodes(node_type=constants.WORKER_MACHINE, num_of_nodes=3)
    ocp_obj = OCP(kind=constants.PV)

    for worker_node in worker_nodes:
        # Restart a single worker node and wait for it
        nodes.restart_nodes(nodes=[worker_node], wait=True)
        self.sanity_helpers.health_check(cluster_check=False, tries=60)

        logger.info(f"Verify PV after reboot {worker_node}")
        added_pvs = set(get_pv_names()) - baseline_pvs
        pv_new = [
            pv
            for pv in added_pvs
            if ocp_obj.get(resource_name=pv)["spec"]["storageClassName"]
            == "localblock"
        ]
        assert (
            not pv_new
        ), f"Unexpected PV {pv_new} created after reboot {worker_node}"

    logger.info("SUCCESS - No new PV was created.")
def get_node_by_attached_volume(self, volume):
    """
    Get the worker node to which the given volume is attached on IBM Cloud.

    The instance name of the first volume attachment is compared with the
    "ibm-cloud.kubernetes.io/worker-id" label of every worker node.

    Args:
        volume (str): volume id.

    Raises:
        NodeHasNoAttachedVolume: In case the volume is not attached to node

    Returns:
        object: The matching worker node object (an item of
            get_nodes(node_type="worker")), or None when no worker node
            carries the matching worker-id label

    """
    cmd = f"ibmcloud is volume {volume} --output json"
    out = json.loads(run_ibmcloud_cmd(cmd))

    if not out["volume_attachments"]:
        logger.info("volume is not attached to node")
        raise NodeHasNoAttachedVolume("volume not attached to node")

    worker_id = out["volume_attachments"][0]["instance"]["name"]
    logger.info(f"volume is attached to node: {worker_id}")

    for worker_node in get_nodes(node_type="worker"):
        # Query the node once per iteration and reuse the label value for
        # both the log message and the comparison
        node_worker_id = worker_node.get()["metadata"]["labels"][
            "ibm-cloud.kubernetes.io/worker-id"
        ]
        logger.info(f"worker node id is:{node_worker_id}")
        if node_worker_id == worker_id:
            logger.info(f"return worker node is:{worker_id}")
            return worker_node

    logger.warning(f"no worker node found with worker id: {worker_id}")
    return None
def node_operations_entry_criteria(
    self,
    node_type,
    number_of_nodes,
    operation_name="Node Operation",
    network_fail_time=None,
):
    """
    Entry criteria function for node related operations

    The cluster is validated first, then the nodes are selected.

    Args:
        node_type (str): Type of node
        number_of_nodes (int): Number of nodes
        operation_name (str): Name of the node operation
        network_fail_time (int): Total time to fail the network in a node

    Returns:
        tuple: (typed_nodes, network_fail_time) when network_fail_time is
            truthy; otherwise only the typed_nodes value is returned

    """
    self.validate_cluster(node_status=True, operation_name=operation_name)
    logger.info(f"Getting parameters related to: {operation_name}")

    typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=number_of_nodes)
    if not network_fail_time:
        return typed_nodes
    return typed_nodes, network_fail_time
def test_rebootnodes():
    """
    Check basic consistency in platform handling: restart all the worker
    nodes through an IBMCloud object.
    """
    IBMCloud().restart_nodes(node.get_nodes(node_type="worker"))
def test_node_maintenance_restart_activate(
    self, nodes, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    Args:
        nodes: Platform nodes object providing restart_nodes()
        pvc_factory: Passed through to sanity_helpers.create_resources()
        pod_factory: Passed through to sanity_helpers.create_resources()
        node_type (str): Type of the node to run the test on

    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    reboot_events_cmd = (
        f"get events -A --field-selector involvedObject.name="
        f"{typed_node_name},reason=Rebooted -o yaml"
    )

    # Find the number of reboot events in 'typed_node_name'
    num_events = len(typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=False)

    try:
        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
        )
    except ResourceWrongStatusException:
        # Sometimes, the node will be back to running state quickly so
        # that the status change won't be detected. Verify the node was
        # actually restarted by checking the reboot events count
        new_num_events = len(
            typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"]
        )
        # The two sentences are separated by a space so the failure
        # message reads correctly
        assert new_num_events > num_events, (
            f"Reboot event not found. Node {typed_node_name} did not restart."
        )

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED,
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and checking basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_node_maintenance(
    self, reduce_and_resume_cluster_load, node_type, pvc_factory, pod_factory
):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health

    """
    # One node of the requested type is enough for this iteration
    typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    maintenance_targets = [typed_nodes[0].name]

    # Unschedule and drain the node
    drain_nodes(maintenance_targets)

    # Exercise the cluster while the node is under maintenance:
    # create resources and then delete them
    helpers_obj = self.sanity_helpers
    helpers_obj.create_resources(pvc_factory, pod_factory)
    helpers_obj.delete_resources()

    # Bring the node back to schedulable
    schedule_nodes(maintenance_targets)

    # Final cluster and Ceph health checks
    helpers_obj.health_check(tries=90)
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over Baremetal cluster

    The device-by-id repo is cloned and its playbook is run; the generated
    "local-storage-block.yaml" is then filtered into "local-block.yaml",
    keeping at most one not-yet-used device path per worker node whose line
    contains the requested size. The working directory is restored and the
    clone is removed even when a step fails.

    Args:
        device_sets_required (int) : Count of device sets to be added
        osd_size_capacity_requested (int) : Requested OSD size capacity

    Returns:
        list : List containing the current device paths followed by the
            newly added device paths

    Raises:
        ValueError: If the generated file has no "devicePaths:" line
        AssertionError: If the number of new device paths does not equal
            number of workers * device_sets_required

    """
    ocp_obj = OCP(
        kind="localvolume", namespace=config.ENV_DATA["local_storage_namespace"]
    )
    workers = get_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    config.ENV_DATA["worker_replicas"] = len(worker_names)
    output = ocp_obj.get(resource_name="local-block")
    # Fetch device paths present in the current LVCR
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]

    # Clone repo and run playbook to fetch all device paths from each node
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    try:
        run_cmd("ansible-playbook devices_by_id.yml")

        # Filter unused/unallocated device paths
        with open("local-storage-block.yaml", "r") as cloned_file:
            with open("local-block.yaml", "w") as our_file:
                device_from_worker = [1] * config.ENV_DATA["worker_replicas"]

                # Copy everything up to and including the "devicePaths:"
                # line. Stop at EOF so a file without that line cannot
                # cause an endless loop.
                cur_line = cloned_file.readline()
                while cur_line and "devicePaths:" not in cur_line:
                    our_file.write(cur_line)
                    cur_line = cloned_file.readline()
                if not cur_line:
                    raise ValueError(
                        "No 'devicePaths:' line found in local-storage-block.yaml"
                    )
                our_file.write(cur_line)

                # Add required number of device path from each worker node
                size_marker = str(osd_size_capacity_requested)
                cur_line = cloned_file.readline()
                while cur_line:
                    if size_marker in cur_line:
                        for i, worker_name in enumerate(worker_names):
                            if (
                                device_from_worker[i]
                                and str(worker_name) in cur_line
                                and not any(s in cur_line for s in cur_device_list)
                            ):
                                our_file.write(cur_line)
                                device_from_worker[i] -= 1
                    cur_line = cloned_file.readline()

        # The context manager closes the file once it has been parsed
        with open("local-block.yaml") as local_block_yaml:
            lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
        new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
        logger.info(f"Newly added devices are: {new_dev_paths}")
        if new_dev_paths:
            assert len(new_dev_paths) == (
                len(worker_names) * device_sets_required
            ), f"Current devices available = {len(new_dev_paths)}"
    finally:
        # Leave the clone directory before deleting it, also on failure
        os.chdir(constants.TOP_DIR)
        shutil.rmtree(path)

    # Return list of old device paths and newly added device paths.
    # An empty "devicePaths:" section is loaded as None, hence the guard.
    cur_device_list.extend(new_dev_paths or [])
    return cur_device_list
def test_registry_rolling_reboot_node(self, node_type, nodes):
    """
    Test registry workload when backed by OCS and reboot node one by one

    Args:
        node_type (str): Type of the nodes to reboot
        nodes: Platform nodes object providing restart_nodes()

    """
    retry_on = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )
    waiting_time = 40

    # Nodes that will be rebooted
    node_list = get_nodes(node_type)

    # Push images and make sure they landed in the registry
    log.info("Pull and push images to registries")
    image_pull_and_push(project_name=self.project_name)
    validate_image_exists()

    for current in node_list:
        # Reboot without waiting, then give the node some time
        log.info(current.name)
        nodes.restart_nodes([current], wait=False)
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Cluster connectivity and node status are polled with retries
        retry(retry_on, tries=60, delay=15)(wait_for_cluster_connectivity)(
            tries=400
        )
        retry(retry_on, tries=60, delay=15)(wait_for_nodes_status)(timeout=900)

    # Cluster health check
    self.sanity_helpers.health_check(tries=40)

    # Storage pods check
    wait_for_storage_pods()

    # Registry pods check
    validate_registry_pod_status()

    # The image must still exist in the registry
    validate_image_exists()
def test_attachvolume(get_volume):
    """
    Check basic consistency in platform handling: attach the volume to the
    node selection returned for one worker node.

    Args:
        get_volume: Volume passed to IBMCloud.attach_volume()

    """
    IBMCloud().attach_volume(
        get_volume, node.get_nodes(node_type="worker", num_of_nodes=1)
    )
def verify_image_versions(old_images, upgrade_version, version_before_upgrade):
    """
    Verify if all the images of OCS objects got upgraded

    verify_pods_upgraded() is called once per pod selector, in a fixed
    order; the RGW pods are checked only on on-prem platforms.

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS
        version_before_upgrade (float): version of OCS before upgrade

    """
    number_of_worker_nodes = len(get_nodes())
    osd_count = get_osd_count()

    # in 4.3 app selector nooba have those pods: noobaa-core-ID, noobaa-db-ID,
    # noobaa-operator-ID but in 4.2 only 2: noobaa-core-ID, noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version("4.3") else 3
    osd_timeout = 600 if upgrade_version >= parse_version("4.5") else 750

    # (selector, extra keyword arguments) in the order they are verified
    checks = [
        (constants.OCS_OPERATOR_LABEL, {}),
        (constants.OPERATOR_LABEL, {}),
        (constants.NOOBAA_APP_LABEL, {"count": nooba_pods}),
        (constants.CSI_CEPHFSPLUGIN_LABEL, {"count": number_of_worker_nodes}),
        (constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, {"count": 2}),
        (constants.CSI_RBDPLUGIN_LABEL, {"count": number_of_worker_nodes}),
        (constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, {"count": 2}),
        (constants.MON_APP_LABEL, {"count": 3}),
        (constants.MGR_APP_LABEL, {}),
        (
            constants.OSD_APP_LABEL,
            {"count": osd_count, "timeout": osd_timeout * osd_count},
        ),
        (constants.MDS_APP_LABEL, {"count": 2}),
    ]
    for selector, extra in checks:
        verify_pods_upgraded(old_images, selector=selector, **extra)

    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        rgw_count = get_rgw_count(
            upgrade_version.base_version, True, version_before_upgrade
        )
        verify_pods_upgraded(
            old_images, selector=constants.RGW_APP_LABEL, count=rgw_count
        )
def add_worker_based_on_pods_count_per_node(
    node_count, expected_count, role_type=None, machineset_name=None
):
    """
    Function to evaluate number of pods up in node and add new node accordingly.

    Nodes are added only on AWS IPI deployments, and only when at most one
    node runs fewer than expected_count pods.

    Args:
        machineset_name (list): Machineset_names to add more nodes if required.
        node_count (int): Additional nodes to be added
        expected_count (int): Expected pod count in one node
        role_type (str): To add type to the nodes getting added

    Returns:
        bool: True if Nodes gets added, else false. Nothing (None) is
            returned for a deployment type/platform pair handled by
            neither branch.

    Raises:
        UnsupportedPlatformError: On UPI deployments over vsphere,
            baremetal or azure

    """
    env = config.ENV_DATA

    if env["deployment_type"] == "ipi" and env["platform"].lower() == "aws":
        # Collect the nodes that still have room for more pods
        app_nodes = node.get_nodes(node_type=role_type)
        pod_count_dict = node.get_running_pod_count_from_node(node_type=role_type)
        less_count_nodes = [
            node_obj.name
            for node_obj in app_nodes
            if not pod_count_dict[f"{node_obj.name}"] >= expected_count
        ]

        if len(less_count_nodes) > 1:
            logging.info(
                f"Enough pods can be created with available nodes {pod_count_dict}"
            )
            return False

        for name in machineset_name:
            count = machine.get_replica_count(machine_set=name)
            machine.add_node(machine_set=name, count=(count + node_count))
            machine.wait_for_new_node_to_be_ready(name)
        return True

    if env["deployment_type"] == "upi" and env["platform"].lower() in (
        "vsphere",
        "baremetal",
        "azure",
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
def add_worker_based_on_cpu_utilization(
    node_count, expected_percent, role_type=None, machineset_name=None
):
    """
    Function to evaluate CPU utilization of nodes and add node if required.

    Nodes are added only on AWS IPI deployments, and only when at most one
    node has a CPU utilization that does not exceed expected_percent.

    Args:
        machineset_name (list): Machineset_names to add more nodes if required.
        node_count (int): Additional nodes to be added
        expected_percent (int): Expected utilization percent
        role_type (str): To add type to the nodes getting added

    Returns:
        bool: True if Nodes gets added, else false. Nothing (None) is
            returned for a deployment type/platform pair handled by
            neither branch.

    Raises:
        UnsupportedPlatformError: On UPI deployments over vsphere,
            baremetal or azure

    """
    env = config.ENV_DATA

    if env["deployment_type"] == "ipi" and env["platform"].lower() == "aws":
        # Collect the nodes whose CPU utilization is within the limit
        app_nodes = node.get_nodes(node_type=role_type)
        uti_dict = node.get_node_resource_utilization_from_oc_describe(
            node_type=role_type
        )
        uti_less_nodes = [
            node_obj.name
            for node_obj in app_nodes
            if not uti_dict[f"{node_obj.name}"]["cpu"] > expected_percent
        ]

        if len(uti_less_nodes) > 1:
            logging.info(f"Enough resource available for more pod creation {uti_dict}")
            return False

        for name in machineset_name:
            count = machine.get_replica_count(machine_set=name)
            machine.add_node(machine_set=name, count=(count + node_count))
            machine.wait_for_new_node_to_be_ready(name)
        return True

    if env["deployment_type"] == "upi" and env["platform"].lower() in (
        "vsphere",
        "baremetal",
        "azure",
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
def get_environment_info():
    """
    Getting the environment information, Information that will be collected

    Versions:
        OCP - version / build / channel
        OCS - version / build
        Ceph - version
        Rook - version

    Platform:
        BM / VmWare / Cloud provider etc.
        Instance type / architecture

    Cluster name

    User name that run the test

    Return:
        dict: dictionary that contain the environment information. The
            "ocs_version" key is present only when a <major>.<minor>
            prefix can be extracted from the OCS build name.

    """
    results = {}

    # getting the name and email of the user that running the test.
    try:
        user = utils.run_cmd("git config --get user.name").strip()
        email = utils.run_cmd("git config --get user.email").strip()
        results["user"] = f"{user} <{email}>"
    except CommandFailed:
        # if no git user define, the default user is none
        results["user"] = ""

    results["clustername"] = ocp.get_clustername()
    results["platform"] = node.get_provider()
    if results["platform"].lower() not in constants.ON_PREM_PLATFORMS:
        results["platform"] = results["platform"].upper()

    results["ocp_build"] = ocp.get_build()
    results["ocp_channel"] = ocp.get_ocp_channel()
    results["ocp_version"] = utils.get_ocp_version()

    results["ceph_version"] = utils.get_ceph_version()
    results["rook_version"] = utils.get_rook_version()

    results["ocs_build"] = ocp.get_ocs_version()

    # Extracting the version number x.y from the full build name. The dot
    # is escaped and each component may have several digits, so builds
    # such as "4.10.0-..." are handled too.
    m = re.match(r"(\d+\.\d+)", results["ocs_build"])
    if m:
        results["ocs_version"] = m.group(1)

    # Getting the instance type for cloud or Arch type for None cloud
    worker_lbl = node.get_nodes(num_of_nodes=1)[0].data["metadata"]["labels"]
    if "beta.kubernetes.io/instance-type" in worker_lbl:
        results["worker_type"] = worker_lbl["beta.kubernetes.io/instance-type"]
    else:
        results["worker_type"] = worker_lbl["kubernetes.io/arch"]

    return results
def test_registry_shutdown_and_recovery_node(self, nodes):
    """
    Test registry workload when backed by OCS and
    its impact when node is shutdown and recovered

    Args:
        nodes: Platform nodes object providing stop_nodes()/start_nodes()

    """
    retry_on = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )

    # Push images through the registry
    log.info("Pull and push images to registries")
    image_pull_and_push(
        project_name=self.project_name,
        template="eap-cd-basic-s2i",
        image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
        pattern="eap-app",
    )

    # Every worker node is stopped and started in turn
    for worker in get_nodes(node_type="worker"):
        # Stop the node and wait until it is reported NotReady
        nodes.stop_nodes(nodes=[worker])
        wait_for_nodes_status(
            node_names=[worker.name], status=constants.NODE_NOT_READY
        )

        # Start the node again and poll the node status with retries
        nodes.start_nodes(nodes=[worker])
        retry(retry_on, tries=60, delay=15)(wait_for_nodes_status)(timeout=900)

    # Storage pods check
    wait_for_storage_pods()

    # Cluster health check
    self.sanity_helpers.health_check(tries=40)

    # Registry pods check
    validate_registry_pod_status()

    # The image must still exist in the registry
    validate_image_exists(namespace=self.project_name)
def test_run_pgsql_node_drain(self, pgsql, transactions=5600, node_type="worker"):
    """
    Test pgsql workload while a node not running pgbench is drained

    Args:
        pgsql: Object providing the pgbench helper methods used below
        transactions (int): Value passed to create_pgbench_benchmark()
        node_type (str): Type of the nodes the drained node is picked from

    """
    # Start the pgbench benchmark and the timer
    pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3)
    start_time = datetime.now()

    # pgbench pods must reach the running state first
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)

    # Pick a random node out of the list returned by the pgbench filter
    candidate_names = []
    for candidate in node.get_nodes(node_type=node_type):
        candidate_names.append(candidate.name)
    filter_list = pgsql.filter_pgbench_nodes_from_nodeslist(candidate_names)
    picked_index = random.randint(0, len(filter_list) - 1)
    typed_node_name = filter_list[picked_index]
    log.info(f"Selected node {typed_node_name} for node drain operation")

    # Drain the node, then make it schedulable again
    node.drain_nodes([typed_node_name])
    node.schedule_nodes([typed_node_name])

    # Cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)

    # pgbench pods must complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Time elapsed between benchmark creation and completion
    diff_time = datetime.now() - start_time
    log.info(
        f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
    )

    # Validate the pgbench run and parse its logs
    pgsql.validate_pgbench_run(pgsql.get_pgbench_pods())
def test_rolling_nodes_restart(self, nodes, node_type):
    """
    Test restart nodes one after the other and check health status in between

    Args:
        nodes: Platform nodes object providing restart_nodes()
        node_type (str): Type of the nodes to restart

    """
    for current in get_nodes(node_type=node_type):
        nodes.restart_nodes(nodes=[current], wait=False)
        self.sanity_helpers.health_check(cluster_check=False, tries=60)

    self.create_resources()
def collect_stats(action_text, elastic_info):
    """
    Record the current OSD layout through the elastic_info object.

    The record holds the OSD pods, the worker nodes, the OSD name to node
    name pairings and the minimum, maximum and skew of the OSD count per
    node. Worker nodes without any OSD count as zero.

    Args:
        action_text (str): Title of last action taken
            (usually adding nodes or adding osds)
        elastic_info (es): ElasticData object for stat collection

    Raises:
        AssertionError: OSD layout is unbalanced

    """
    pod_obj = ocp.OCP(
        kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    osd_list = pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]

    # Group the OSD pods by the node they run on
    node_stats = {}
    for osd_ent in osd_list:
        node_stats.setdefault(osd_ent["spec"]["nodeName"], []).append(osd_ent)

    osds_per_node = [len(osds) for osds in node_stats.values()]
    wnodes = get_nodes(constants.WORKER_MACHINE)
    osds_per_node.extend(0 for wnode in wnodes if wnode.name not in node_stats)

    maxov = max(osds_per_node)
    minov = min(osds_per_node)
    this_skew = maxov - minov
    logging.info(f"Skew found is {this_skew}")

    output_info = {"title": action_text}
    output_info.update(
        {
            "osds": osd_list,
            "worker_nodes": wnodes,
            "pairings": {
                entry["metadata"]["name"]: entry["spec"]["nodeName"]
                for entry in osd_list
            },
            "maxov": maxov,
            "minov": minov,
            "skew_value": this_skew,
        }
    )

    elastic_info.add_key(elastic_info.record_counter, output_info)
    elastic_info.log_recent_activity()
    elastic_info.record_counter += 1

    ceph_health_check(tries=30, delay=60)
    assert is_balanced(this_skew, maxov), NOT_BALANCED
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot

    Args:
        cb_setup: Fixture, not referenced in the test body
        nodes: Platform nodes object providing restart_nodes()
        pod_name_of_node (str): Which node to restart: "couchbase" and
            "osd" restart a random node out of the respective node list,
            "master" restarts one master node

    Raises:
        ValueError: If pod_name_of_node is not one of the values above

    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)
    get_node_resource_utilization_from_adm_top(node_type="master", print_table=True)

    if pod_name_of_node == "couchbase":
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == "osd":
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == "master":
        master_node = get_nodes(pod_name_of_node, num_of_nodes=1)
    else:
        # Fail with a clear message instead of an unbound local later on
        raise ValueError(
            f"Unsupported pod_name_of_node: {pod_name_of_node}; "
            "expected 'couchbase', 'osd' or 'master'"
        )

    # Restart relevant node
    if pod_name_of_node == "master":
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        restart_node = get_node_objs(
            node_list[random.randint(0, len(node_list) - 1)]
        )
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up.
    # retry(...) returns a decorator: it has to wrap the callable itself,
    # and the wrapped callable is then invoked with its arguments. Passing
    # the result of an already executed call would skip the retries.
    retry_on = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )
    retry(retry_on, tries=60, delay=15)(ocp.wait_for_cluster_connectivity)(
        tries=400
    )
    retry(retry_on, tries=60, delay=15)(wait_for_nodes_status)(timeout=1800)

    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry((CommandFailed), tries=60, delay=15)(bg_handler.wait_for_bg_operations)(
        bg_ops, timeout=3600
    )
    self.sanity_helpers.health_check(tries=40)
def test_run_pgsql_node_drain(self, pgsql, transactions=900, node_type="master"):
    """
    Test pgsql workload while one node of the given type is drained

    Args:
        pgsql: Object providing the pgbench helper methods used below
        transactions (int): Value passed to create_pgbench_benchmark()
        node_type (str): Type of the node to drain

    """
    # Start the pgbench benchmark and the timer
    pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3)
    start_time = datetime.now()

    # pgbench pods must reach the running state first
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)

    # The first node of the requested type is the drain target
    drain_targets = [node.get_nodes(node_type=node_type, num_of_nodes=1)[0].name]

    # Drain the node, then make it schedulable again
    node.drain_nodes(drain_targets)
    node.schedule_nodes(drain_targets)

    # Cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # pgbench pods must complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Time elapsed between benchmark creation and completion
    diff_time = datetime.now() - start_time
    log.info(
        f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
    )

    # Validate the pgbench run and parse its logs
    pgsql.validate_pgbench_run(pgsql.get_pgbench_pods())
def test_registry_reboot_node(self, node_type, nodes):
    """
    Test registry workload when backed by OCS and reboot node

    Args:
        node_type (str): Type of the node to reboot
        nodes: Platform nodes object providing restart_nodes()

    """
    retry_on = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )

    # Single-node selection that will be rebooted
    reboot_target = get_nodes(node_type, num_of_nodes=1)

    # Push images and make sure they landed in the registry
    log.info("Pull and push images to registries")
    image_pull_and_push(
        project_name=self.project_name,
        template="eap-cd-basic-s2i",
        image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
        pattern="eap-app",
    )
    validate_image_exists(namespace=self.project_name)

    # Reboot without waiting
    nodes.restart_nodes(reboot_target, wait=False)

    # Cluster connectivity and node status are polled with retries
    retry(retry_on, tries=60, delay=15)(wait_for_cluster_connectivity)(tries=400)
    retry(retry_on, tries=60, delay=15)(wait_for_nodes_status)(timeout=900)

    # Cluster health check
    self.sanity_helpers.health_check(tries=40)

    # Storage pods check
    wait_for_storage_pods()

    # Registry pods check
    validate_registry_pod_status()

    # The image must still exist in the registry
    validate_image_exists(namespace=self.project_name)
def finalizer():
    """
    Switch to the provider, try to recover every worker node, check the
    Ceph health and switch back to the context stored in self.orig_index
    """
    config.switch_to_provider()
    log.info("Verify that all the worker nodes are in a Ready state on the provider")
    for wnode in get_nodes(node_type=constants.WORKER_MACHINE):
        # A failed recovery is only logged, the remaining nodes are still tried
        if not recover_node_to_ready_state(wnode):
            log.warning(f"The node {wnode.name} has failed to recover")

    log.info("Verify again that the ceph health is OK")
    ceph_health_check()
    config.switch_ctx(self.orig_index)
def get_attached_volume(request, get_volume):
    """
    Attach the volume to a worker node and register its detachment as a
    finalizer

    Args:
        request: Object providing addfinalizer()
        get_volume: Volume passed to the IBMCloud attach/detach calls

    Returns:
        The value returned by IBMCloud.get_node_by_attached_volume() for
        the volume

    """

    def _detach_volume():
        # The node selection is queried again at teardown time
        teardown_nodes = node.get_nodes(node_type="worker", num_of_nodes=1)
        ibmcloud.detach_volume(get_volume, teardown_nodes)

    request.addfinalizer(_detach_volume)

    ibmcloud = IBMCloud()
    ibmcloud.attach_volume(
        get_volume, node.get_nodes(node_type="worker", num_of_nodes=1)
    )
    return ibmcloud.get_node_by_attached_volume(get_volume)
def get_max_pvc_count():
    """
    Return the maximum number of pvcs to test for.

    Returns:
        int: constants.SCALE_MAX_PVCS_PER_NODE times the number of worker
            nodes that carry both the "node-role.kubernetes.io/worker" and
            the "cluster.ocs.openshift.io/openshift-storage" labels

    """
    required_labels = (
        "node-role.kubernetes.io/worker",
        "cluster.ocs.openshift.io/openshift-storage",
    )

    def _has_required_labels(wnode):
        # True when every required label key is present on the node
        labels = wnode.data["metadata"]["labels"].keys()
        return all(label in labels for label in required_labels)

    storage_workers = sum(
        1 for wnode in get_nodes(node_type="worker") if _has_required_labels(wnode)
    )
    return storage_workers * constants.SCALE_MAX_PVCS_PER_NODE