def stop(self, node, timeout):
    """
    Stop the kubelet service through the parent service class, then wait
    until the matching OCP node reports the NotReady status.

    Args:
        node (object): Node object whose kubelet service is stopped
        timeout (int): Seconds to wait; passed both to the parent ``stop``
            call and to the node status wait

    """
    super().stop(node, timeout)

    # Only the node that was acted on is checked, not the whole cluster
    not_ready_wait_args = {
        "node_names": [node.name],
        "status": constants.NODE_NOT_READY,
        "timeout": timeout,
    }
    wait_for_nodes_status(**not_ready_wait_args)
def test_registry_shutdown_and_recovery_node(self, nodes):
    """
    Verify the OCS-backed image registry keeps working when every worker
    node is shut down and recovered, one node at a time.

    Args:
        nodes: Platform nodes fixture exposing ``stop_nodes`` and
            ``start_nodes``

    """
    log.info("Pull and push images to registries")
    image_pull_and_push(project_name=self.project_name)

    for worker in get_nodes(node_type="worker"):
        # Power the node off and confirm the cluster noticed it
        nodes.stop_nodes(nodes=[worker])
        wait_for_nodes_status(
            node_names=[worker.name], status=constants.NODE_NOT_READY
        )

        # Power it back on; the status check itself is retried on failure
        nodes.start_nodes(nodes=[worker])
        tolerated_errors = (
            CommandFailed,
            TimeoutError,
            AssertionError,
            ResourceWrongStatusException,
        )
        wait_with_retry = retry(tolerated_errors, tries=60, delay=15)(
            wait_for_nodes_status
        )
        wait_with_retry(timeout=900)

    # Storage pods, overall cluster health, then the registry itself
    wait_for_storage_pods()
    self.sanity_helpers.health_check(tries=40)
    validate_registry_pod_status()
    validate_image_exists()
def finalizer():
    """
    Recover any node that is left in NotReady status.

    Nodes reporting NotReady are restarted by stop and start, and the
    function then waits on ``wait_for_nodes_status`` with its default
    arguments. Nothing is restarted when no such node is found.
    """
    not_ready_nodes = [
        n
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    if not_ready_nodes:
        # Warn only when there is something to report; an unconditional
        # warning with an empty list is noise in a healthy teardown
        log.warning(
            f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
        )
        nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
        wait_for_nodes_status()

    log.info("All nodes are in Ready status")
def restart(self, node, timeout):
    """
    Restart the kubelet service through the parent service class, then
    wait for the cluster to be connectable and for the matching OCP node
    to report the Ready status.

    Args:
        node (object): Node object whose kubelet service is restarted
        timeout (int): Seconds to wait; passed both to the parent
            ``restart`` call and to the node status wait

    """
    super().restart(node, timeout)

    # Connectivity is checked before the node status is polled
    wait_for_cluster_connectivity(tries=900)

    ready_wait_args = {
        "node_names": [node.name],
        "status": constants.NODE_READY,
        "timeout": timeout,
    }
    wait_for_nodes_status(**ready_wait_args)
def restart_ocs_operator_node(self):
    """
    Restart the node hosting the OCS operator pod, then wait for the
    nodes and for the operator pod to be running.
    """
    operator_pod = pod.get_ocs_operator_pod()
    hosting_node = pod.get_pod_node(operator_pod)

    self.nodes.restart_nodes([hosting_node])
    wait_for_nodes_status()

    # The pod is waited on by the name it had before the restart
    storage_namespace = constants.OPENSHIFT_STORAGE_NAMESPACE
    operator_pod_names = [operator_pod.name]
    pod.wait_for_pods_to_be_running(
        namespace=storage_namespace, pod_names=operator_pod_names
    )
def teardown():
    """
    Delete the app pods, PVCs and projects created for the test, switch
    back to the default rook cluster project and wait for the nodes.
    """
    # Remove the created application pods and PVCs
    assert pod.delete_pods(pod_objs)
    assert pvc.delete_pvcs(pvc_objs)

    # Go back to the default project before removing the test projects
    switched = ocp.switch_to_default_rook_cluster_project()
    assert switched, 'Failed to switch to default rook cluster project'

    for project in namespace_list:
        project.delete(resource_name=project.namespace)

    # Final node status validation with the helper's default arguments
    wait_for_nodes_status()
def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods):
    """
    Reboot the node hosting the mgr pod and verify that the data
    collected on the prometheus pod is still available afterwards.

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``
        pods: App pods whose PVC metrics are looked up on prometheus

    """
    # Locate the node hosting the first mgr pod and reboot it
    mgr_pods = pod.get_mgr_pods()
    mgr_node = pod.get_pod_node(mgr_pods[0])
    nodes.restart_nodes([mgr_node])

    wait_for_nodes_status()

    # Ceph pods must be back in Running; mgr has no explicit count
    ceph_pods = ocp.OCP(
        kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    expected_running = (
        {"selector": "app=rook-ceph-mgr"},
        {"selector": "app=rook-ceph-mon", "resource_count": 3},
        {"selector": "app=rook-ceph-osd", "resource_count": 3},
    )
    for selector_kwargs in expected_running:
        assert ceph_pods.wait_for_resource(
            condition="Running", timeout=600, **selector_kwargs
        )

    self.sanity_helpers.health_check()

    # Ceph health metrics should now come from the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # PVC metrics created before the reboot must still be collected
    for app_pod in pods:
        assert check_pvcdata_collected_on_prometheus(app_pod.pvc.name), (
            f"On prometheus pod for created pvc {app_pod.pvc.name} related data is not collected"
        )
def finalizer():
    """
    Remove huge pages from the nodes and verify the cluster recovers.

    After disabling huge pages the function waits for the nodes to be
    Ready, asserts that every node returned by ``get_nodes`` reports "0"
    allocatable ``hugepages-2Mi``, waits for the pods to be running and
    runs a Ceph health check.
    """
    disable_huge_pages()
    wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

    for cluster_node in get_nodes():
        allocatable = cluster_node.get()["status"]["allocatable"]
        # The check is for removal, so the message must say so; the
        # previous text claimed huge pages were "not applied"
        assert (
            allocatable["hugepages-2Mi"] == "0"
        ), f"Huge pages were not removed from {cluster_node.name}"

    log.info("Wait for all pods to be in running state")
    wait_for_pods_to_be_running(timeout=600)
    sanity_helpers.ceph_health_check(tries=120)
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Wait for the nodes, then verify PVC metrics and prometheus health.

    Args:
        pods: App pods whose PVC metrics are looked up on prometheus

    """
    wait_for_nodes_status(timeout=900)

    # Every PVC created for the test must still have data on prometheus
    for app_pod in pods:
        collected = check_pvcdata_collected_on_prometheus(app_pod.pvc.name)
        assert collected, (
            f"On prometheus pod for created pvc {app_pod.pvc.name} related data is not collected"
        )

    healthy = prometheus_health_check()
    assert healthy, "Prometheus health is degraded"
def start_baremetal_machines(self, baremetal_machine, wait=True):
    """
    Power on baremetal machines through IPMI and wait for the cluster.

    Args:
        baremetal_machine (list): BM objects
        wait (bool): Wait for each BM to report the powered-on status

    """

    def open_ipmi_ctx(details):
        # Build an IPMI context from one machine's management details
        return self.get_ipmi_ctx(
            host=details["mgmt_console"],
            user=details["mgmt_username"],
            password=details["mgmt_password"],
        )

    for node in baremetal_machine:
        details = self.mgmt_details[node.name]
        if details:
            ipmi_ctx = open_ipmi_ctx(details)
            logger.info(f"Powering On {node.name}")
            ipmi_ctx.chassis_control_power_up()

        # Nothing to poll when waiting is off or no management data exists
        if not wait or not details:
            continue

        # A fresh context is opened for polling the power status
        ipmi_ctx = open_ipmi_ctx(details)
        for status in TimeoutSampler(600, 5, self.get_power_status, ipmi_ctx):
            logger.info(
                f"Waiting for Baremetal Machine {node.name} to power on. "
                f"Current Baremetal status: {status}"
            )
            if status != VM_POWERED_ON:
                continue
            logger.info(f"Baremetal Machine {node.name} reached poweredOn status")
            ipmi_ctx.session.close()
            break

    wait_for_cluster_connectivity(tries=400)

    # Masters are validated first, then workers
    wait_for_nodes_status(
        node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
    )
    wait_for_nodes_status(
        node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
    )
def stop_node(): """ Turn off one worker node for 6 minutes. Returns: str: Node that was turned down """ # run_time of operation run_time = 60 * 6 nonlocal node logger.info(f"Turning off node {node.name}") nodes.stop_nodes(nodes=[node]) # Validate node reached NotReady state wait_for_nodes_status(node_names=[node.name], status=constants.NODE_NOT_READY) logger.info(f"Waiting for {run_time} seconds") time.sleep(run_time) return node.name
def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
    """
    Restart an osd node and verify its osd pods come back on it.

    Steps:
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node change to a
           Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that new osd pods with the same ids start on the node.
        5) Check the worker nodes security groups.

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``

    """
    # Workaround for https://github.com/red-hat-storage/ocs-ci/issues/6162
    if is_ms_consumer_cluster():
        logger.info(
            "The test is applicable only for an MS provider cluster. "
            "Switching to the provider cluster..."
        )
        config.switch_to_provider()

    self.create_resources()

    # Pick a random osd node and remember which osds run on it
    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    logger.info(f"osd pod ids: {old_osd_pod_ids}")

    osd_pod_names = [
        osd_pod.name for osd_pod in pod.get_osd_pods_having_ids(old_osd_pod_ids)
    ]

    logger.info(f"Going to restart the node {osd_node_name}")
    nodes.restart_nodes(nodes=[osd_node], wait=False)

    logger.info("Verify the node osd pods go into a Terminating state")
    all_terminating = pod.wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], osd_pod_names
    )
    assert all_terminating, "Not all the node osd pods are in a Terminating state"

    wait_for_nodes_status(node_names=[osd_node_name])

    assert wait_for_osd_ids_come_up_on_node(
        osd_node_name, old_osd_pod_ids, timeout=300
    )
    logger.info(
        f"the osd ids {old_osd_pod_ids} Successfully come up on the node {osd_node_name}"
    )

    logger.info("Verify the worker nodes security groups on the provider...")
    assert verify_worker_nodes_security_groups()
def stop_nodes(): """ Turn off test nodes for 5 minutes. Returns: list: Names of nodes that were turned down """ # run_time of operation run_time = 60 * 5 nonlocal test_nodes node_names = [node.name for node in test_nodes] logger.info(f"Turning off nodes {node_names}") nodes.stop_nodes(nodes=test_nodes) # Validate node reached NotReady state wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY) logger.info(f"Waiting for {run_time} seconds") time.sleep(run_time) return node_names
def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods):
    """
    Shut down and recover each node hosting a prometheus pod and verify
    that monitoring keeps working afterwards.

    Args:
        nodes: Platform nodes fixture exposing ``stop_nodes`` and
            ``start_nodes``
        pods: App pods whose PVC metrics are looked up on prometheus

    """
    prometheus_pods = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for prometheus_pod in prometheus_pods:
        hosting_node = pod.get_pod_node(prometheus_pod)

        # Stop the node, leave it down briefly, then start it again
        nodes.stop_nodes([hosting_node])
        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)
        nodes.start_nodes([hosting_node])

        wait_for_nodes_status()

    # Every prometheus pod has to be Running again
    for prometheus_pod in prometheus_pods:
        wait_for_resource_state(
            resource=prometheus_pod, state=constants.STATUS_RUNNING, timeout=180
        )

    self.sanity_helpers.health_check()

    # PVC metrics created before the outage must still be collected
    for app_pod in pods:
        assert check_pvcdata_collected_on_prometheus(app_pod.pvc.name), (
            f"On prometheus pod for created pvc {app_pod.pvc.name} related data is not collected"
        )
def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``
        pvc_factory: PVC factory passed to the sanity helpers
        pod_factory: Pod factory passed to the sanity helpers
        node_type (str): Type of node to run the maintenance on

    """
    # Two candidates are fetched so that the one hosting the rook
    # operator can be avoided
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node = typed_nodes[0]

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node.name:
        typed_node = typed_nodes[1]
    # End of workaround for BZ 1778488
    typed_node_name = typed_node.name

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restart only the drained node. Passing both candidates would also
    # reboot the undrained one, which may be the operator node the
    # workaround above deliberately skipped.
    nodes.restart_nodes(nodes=[typed_node], wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED,
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Health check, then basic functionality: create resources, run IO
    # and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def finalizer():
    """
    Recover nodes left in NotReady status and nodes hosting a crashing
    osd pod.

    Nodes reporting NotReady are restarted and waited on. Afterwards,
    for each osd pod whose first container state equals
    ``constants.STATUS_CLBO``, the hosting node is restarted and waited
    on as well.
    """
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    if not_ready_nodes:
        # Warn only when there is something to report; an unconditional
        # warning with an empty list is noise in a healthy teardown
        logger.warning(
            f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
        )
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()

    # Restart node if the osd stays at CLBO state
    for osd_pod in get_osd_pods():
        container_statuses = osd_pod.get().get("status").get("containerStatuses")
        # NOTE(review): the raw "state" value is compared directly with
        # constants.STATUS_CLBO - confirm this matches the shape of the
        # container state returned by the API
        if container_statuses[0].get("state") == constants.STATUS_CLBO:
            node_obj = get_pod_node(osd_pod)
            nodes.restart_nodes([node_obj])
            node.wait_for_nodes_status([node_obj.name])
def test_hugepages_post_odf_deployment(
    self,
    pvc_factory,
    pod_factory,
    bucket_factory,
    rgw_bucket_factory,
    node_restart_teardown,
):
    """
    Enable huge pages and verify the nodes report the larger page size
    and the cluster pods come back up.

    Args:
        pvc_factory: PVC factory passed to the sanity helpers
        pod_factory: Pod factory passed to the sanity helpers
        bucket_factory: Bucket factory passed to the sanity helpers
        rgw_bucket_factory: RGW bucket factory passed to the sanity helpers
        node_restart_teardown: Fixture requested for its teardown only;
            it is not referenced in the test body

    """
    enable_huge_pages()

    log.info("Wait for all worker node to be READY state")
    wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

    # Each node must expose 64Mi of allocatable 2Mi huge pages
    for cluster_node in get_nodes():
        allocatable = cluster_node.get()["status"]["allocatable"]
        assert (
            allocatable["hugepages-2Mi"] == "64Mi"
        ), f"Huge pages is not applied on {cluster_node.name}"

    log.info("Wait for all storage cluster pods to be in running state")
    wait_for_pods_to_be_running(timeout=600)

    log.info("Creating Resources using sanity helpers")
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory, False
    )

    log.info("Deleting the resources created")
    self.sanity_helpers.delete_resources()

    log.info("Verifying All resources are Running and matches expected result")
    self.sanity_helpers.health_check(tries=120)
def finalizer():
    """
    Start powered-off nodes, restart any node still NotReady and assert
    that Ceph health is OK.
    """
    nodes.restart_nodes_teardown()

    try:
        node.wait_for_nodes_status(status=constants.NODE_READY)
    except ResourceWrongStatusException:
        # Collect the nodes that are still NotReady and restart them
        not_ready_nodes = []
        for candidate in node.get_node_objs():
            status = candidate.ocp.get_resource_status(candidate.name)
            if status == constants.NODE_NOT_READY:
                not_ready_nodes.append(candidate)

        if not_ready_nodes:
            not_ready_names = [n.name for n in not_ready_nodes]
            logger.info(f"Nodes in NotReady status found: {not_ready_names}")
            nodes.restart_nodes(not_ready_nodes)
            node.wait_for_nodes_status(status=constants.NODE_READY)

    assert ceph_health_check(), "Ceph cluster health is not OK"
    logger.info("Ceph cluster health is OK")
def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods):
    """
    Restart each node hosting a prometheus pod and verify monitoring
    keeps working afterwards.

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``
        pods: App pods whose PVC metrics are looked up on prometheus

    """
    prometheus_pods = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for prometheus_pod in prometheus_pods:
        # Take down the node hosting this prometheus pod
        hosting_node = pod.get_pod_node(prometheus_pod)
        nodes.restart_nodes([hosting_node])

        wait_for_nodes_status()

    self.sanity_helpers.health_check()

    # Every prometheus pod has to be Running again
    for prometheus_pod in prometheus_pods:
        wait_for_resource_state(
            resource=prometheus_pod, state=constants.STATUS_RUNNING, timeout=180
        )

    # PVC metrics created before the restart must still be collected
    for app_pod in pods:
        assert check_pvcdata_collected_on_prometheus(app_pod.pvc.name), (
            f"On prometheus pod for created pvc {app_pod.pvc.name} related data is not collected"
        )
        log.info(
            f"On prometheus pod for created pvc {app_pod.pvc.name} related data is collected"
        )
def test_node_maintenance_restart_activate(
    self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node's ec2 instance
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    Args:
        ec2_instances: Fixture not referenced in the test body
        aws_obj: AWS helper exposing ``restart_ec2_instances``
        pvc_factory: PVC factory passed to the sanity helpers
        pod_factory: Pod factory passed to the sanity helpers
        node_type (str): Type of node to run the maintenance on

    """
    selected_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert selected_nodes, f"Failed to find a {node_type} node for the test"
    target_name = selected_nodes[0].name

    # Unschedule and drain; the helper does its own logging
    node.drain_nodes([target_name])

    ec2_instance = aws.get_instances_ids_and_names(selected_nodes)
    assert ec2_instance, f"Failed to get ec2 instances for node {target_name}"

    aws_obj.restart_ec2_instances(instances=ec2_instance, wait=True)

    # The node stays cordoned until it is explicitly scheduled again
    node.wait_for_nodes_status(
        node_names=[target_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED,
    )
    node.schedule_nodes([target_name])

    # Health check, then basic functionality: create resources, run IO
    # and delete the resources
    helpers_obj = self.sanity_helpers
    helpers_obj.health_check()
    helpers_obj.create_resources(pvc_factory, pod_factory)
    helpers_obj.delete_resources()
def test_add_node(self):
    """
    Add worker nodes to an IPI cluster through its machinesets and wait
    for the workers to be Ready. Skipped for other deployment types.
    """
    dt = config.ENV_DATA['deployment_type']
    if dt == 'ipi':
        count = 2
        machines = machine_utils.get_machinesets()

        # Replica counts before the change, one lookup per machineset
        before_replica_counts = {
            machine: machine_utils.get_replica_count(machine)
            for machine in machines
        }

        worker_nodes_before = helpers.get_worker_nodes()
        logger.info(
            f'The worker nodes number before adding a new node is {len(worker_nodes_before)}'
        )

        # Scale every machineset and record the resulting replica count
        after_replica_counts = {}
        for machine in machines:
            machine_utils.add_node(machine, count=count)
            after_replica_counts[machine] = machine_utils.get_replica_count(machine)
        logger.info(after_replica_counts)

        # Poll until the worker count matches the expected total
        expected_workers = count * len(machines)
        sampler = TimeoutSampler(
            timeout=300, sleep=3, func=helpers.get_worker_nodes
        )
        for sample in sampler:
            if len(sample) == expected_workers:
                break

        worker_nodes_after = helpers.get_worker_nodes()
        logger.info(
            f'The worker nodes number after adding a new node is {len(worker_nodes_after)}'
        )
        wait_for_nodes_status(
            node_names=worker_nodes_after, status=constants.NODE_READY
        )
    else:
        pytest.skip("UPI not yet supported")
def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``
        pvc_factory: PVC factory passed to the sanity helpers
        pod_factory: Pod factory passed to the sanity helpers
        node_type (str): Type of node to run the maintenance on

    """
    selected_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert selected_nodes, f"Failed to find a {node_type} node for the test"
    target_name = selected_nodes[0].name
    target_names = [target_name]

    # Unschedule and drain; the helper does its own logging
    drain_nodes([target_name])

    # Restart without blocking so both status transitions can be observed
    nodes.restart_nodes(nodes=selected_nodes, wait=False)
    for expected_status in (
        constants.NODE_NOT_READY_SCHEDULING_DISABLED,
        constants.NODE_READY_SCHEDULING_DISABLED,
    ):
        wait_for_nodes_status(node_names=target_names, status=expected_status)

    schedule_nodes([target_name])

    # Health check, then basic functionality: create resources, run IO
    # and delete the resources
    helpers_obj = self.sanity_helpers
    helpers_obj.health_check()
    helpers_obj.create_resources(pvc_factory, pod_factory)
    helpers_obj.delete_resources()
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Run the couchbase workload while a node is rebooted.

    Args:
        cb_setup: Couchbase setup fixture; not referenced in the body
        nodes: Platform nodes fixture exposing ``restart_nodes``
        pod_name_of_node (str): Which node to reboot: "couchbase", "osd"
            or "master"

    Raises:
        ValueError: If ``pod_name_of_node`` is not one of the supported
            values

    """
    # Check node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)
    get_node_resource_utilization_from_adm_top(node_type="master", print_table=True)

    # Restart relevant node
    if pod_name_of_node == "master":
        master_node = get_nodes(pod_name_of_node, num_of_nodes=1)
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        else:
            # Previously this fell through to a NameError on node_list
            raise ValueError(f"Unsupported pod_name_of_node: {pod_name_of_node}")
        # Pass a list of names: a bare string would be matched by
        # substring rather than by exact node name
        restart_node = get_node_objs([random.choice(node_list)])
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up.
    # retry() must wrap the callable itself; wrapping the result of an
    # already executed call means nothing is ever retried.
    tolerated_errors = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )
    retry(tolerated_errors, tries=60, delay=15)(
        ocp.wait_for_cluster_connectivity
    )(tries=400)
    retry(tolerated_errors, tries=60, delay=15)(
        wait_for_nodes_status
    )(timeout=1800)

    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry(CommandFailed, tries=60, delay=15)(
        bg_handler.wait_for_bg_operations
    )(bg_ops, timeout=3600)

    self.sanity_helpers.health_check(tries=40)
def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
    """
    Reboot one node of the given type and verify the amq workloads
    running in the background are not affected.

    Args:
        node_type (str): Type of node to reboot
        nodes: Platform nodes fixture exposing ``restart_nodes``
        amq_setup: AMQ setup fixture; not referenced in the body

    """
    # Get all amq pods
    pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

    # Pick one node of the requested type and reboot it
    node = get_nodes(node_type, num_of_nodes=1)
    nodes.restart_nodes(node, wait=False)

    # Wait some time after the reboot was triggered
    waiting_time = 40
    log.info(f"Waiting {waiting_time} seconds...")
    time.sleep(waiting_time)

    # Validate all nodes and services are in READY state and up.
    # retry() must wrap the callable itself; wrapping the result of an
    # already executed call means nothing is ever retried.
    tolerated_errors = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )
    retry(tolerated_errors, tries=60, delay=15)(
        ocp.wait_for_cluster_connectivity
    )(tries=400)
    retry(tolerated_errors, tries=60, delay=15)(
        wait_for_nodes_status
    )(timeout=1800)

    # Check the node are Ready state and check cluster is health ok
    self.sanity_helpers.health_check()

    # Check all amq pods are up and running
    assert POD.wait_for_resource(
        condition="Running", resource_count=len(pod_obj_list), timeout=300
    )

    # Validate the results
    log.info("Validate message run completely")
    for thread in self.threads:
        thread.result(timeout=1800)
def test_amq_after_shutdown_and_recovery_worker_node(self, nodes, amq_setup):
    """
    Shut down and recover one worker node and verify the amq workloads
    running in the background are not affected.

    Args:
        nodes: Platform nodes fixture exposing ``stop_nodes`` and
            ``start_nodes``
        amq_setup: AMQ setup fixture; not referenced in the body

    """
    # Get all amq pods
    pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

    # Pick one worker node
    node = get_typed_nodes(node_type='worker', num_of_nodes=1)

    # Stop the worker, leave it down briefly, then start it again
    nodes.stop_nodes(nodes=node)
    waiting_time = 20
    log.info(f"Waiting for {waiting_time} seconds")
    time.sleep(waiting_time)
    nodes.start_nodes(nodes=node)

    # Validate all nodes are in READY state and up.
    # retry() must wrap the callable itself; wrapping the result of an
    # already executed call means nothing is ever retried.
    tolerated_errors = (
        CommandFailed,
        TimeoutError,
        AssertionError,
        ResourceWrongStatusException,
    )
    retry(tolerated_errors, tries=30, delay=15)(
        wait_for_nodes_status
    )(timeout=1800)

    # Check the node are Ready state and check cluster is health ok
    self.sanity_helpers.health_check()

    # Check all amq pods are up and running
    assert POD.wait_for_resource(
        condition='Running', resource_count=len(pod_obj_list), timeout=300
    )

    # Validate the results
    log.info("Validate message run completely")
    for thread in self.threads:
        thread.result(timeout=1800)
def add_nodes(self):
    """
    Add new RHEL worker nodes to the cluster.

    The scale-up repository is cloned and modified, terraform is applied
    from a dedicated data directory, OCP is installed on the resulting
    RHEL nodes, and the function finally waits for the nodes to be ready.
    The working directory is restored even when terraform fails.
    """
    # create separate directory for scale-up terraform data
    scaleup_terraform_data_dir = os.path.join(
        self.cluster_path,
        constants.TERRAFORM_DATA_DIR,
        constants.SCALEUP_TERRAFORM_DATA_DIR,
    )
    create_directory_path(scaleup_terraform_data_dir)
    logger.info(
        f"scale-up terraform data directory: {scaleup_terraform_data_dir}"
    )

    # git clone repo from openshift-misc
    clone_repo(constants.VSPHERE_SCALEUP_REPO, self.upi_scale_up_repo_path)

    # modify scale-up repo
    self.modify_scaleup_repo()

    config.ENV_DATA['vsphere_resource_pool'] = config.ENV_DATA.get(
        "cluster_name"
    )

    # sync guest time with host
    if config.ENV_DATA.get('sync_time_with_host'):
        sync_time_with_host(constants.SCALEUP_VSPHERE_MACHINE_CONF, True)

    # get the RHCOS worker list
    self.rhcos_ips = get_node_ips()
    logger.info(f"RHCOS IP's: {json.dumps(self.rhcos_ips)}")

    # generate terraform variable for scaling nodes
    self.generate_terraform_vars_for_scaleup()

    # Add nodes using terraform. The terraform calls run from the data
    # directory; try/finally guarantees the previous working directory
    # is restored when any of them raises.
    scaleup_terraform = Terraform(constants.SCALEUP_VSPHERE_DIR)
    previous_dir = os.getcwd()
    os.chdir(scaleup_terraform_data_dir)
    try:
        scaleup_terraform.initialize()
        scaleup_terraform.apply(self.scale_up_terraform_var)
        scaleup_terraform_tfstate = os.path.join(
            scaleup_terraform_data_dir, "terraform.tfstate"
        )
        out = scaleup_terraform.output(scaleup_terraform_tfstate, "rhel_worker")
    finally:
        os.chdir(previous_dir)

    rhel_worker_nodes = json.loads(out)['value']
    logger.info(f"RHEL worker nodes: {rhel_worker_nodes}")

    # Install OCP on rhel nodes
    rhel_install = OCPINSTALLRHEL(rhel_worker_nodes)
    rhel_install.prepare_rhel_nodes()
    rhel_install.execute_ansible_playbook()

    # Giving some time to settle down the new nodes
    time.sleep(self.wait_time)

    # wait for nodes to be in READY state
    wait_for_nodes_status(timeout=300)
def test_worker_node_restart_during_pvc_expansion(self, nodes):
    """
    Verify PVC expansion succeeds when a worker node is restarted while
    the expansion is in progress, and that IO works afterwards.

    Args:
        nodes: Platform nodes fixture exposing ``restart_nodes``

    """
    pvc_size_expanded = 30
    pool = ThreadPoolExecutor(max_workers=len(self.pods))
    worker_nodes = node.get_nodes(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )
    target_worker = worker_nodes[0]

    # Kick off the restart in the background
    log.info(f"Restart node {target_worker.name}")
    restart_future = pool.submit(nodes.restart_nodes, nodes=worker_nodes)

    # Start every expansion while the restart is running
    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
        pvc_obj.expand_proc = pool.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, True
        )

    # Surfaces any exception raised by 'restart_nodes'
    restart_future.result()

    log.info("Verify status of node.")
    restarted_name = node.get_node_name(target_worker)
    node.wait_for_nodes_status(
        node_names=[restarted_name], status=constants.NODE_READY, timeout=300
    )

    # Verify pvc expansion status
    for pvc_obj in self.pvcs:
        expanded = pvc_obj.expand_proc.result()
        assert expanded, f"Expansion failed for PVC {pvc_obj.name}"
    log.info("PVC expansion was successful on all PVCs")

    # Run IO on every pod, block or filesystem depending on the PVC mode
    log.info("Run IO after PVC expansion.")
    for pod_obj in self.pods:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        if pod_obj.pvc.volume_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.io_proc = pool.submit(
            pod_obj.run_io,
            storage_type=storage_type,
            size="6G",
            runtime=30,
            fio_filename=f"{pod_obj.name}_file",
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in self.pods:
        pod_obj.io_proc.result()
        fio_result = pod_obj.get_fio_results()
        first_job = fio_result.get("jobs")[0]
        err_count = first_job.get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. " f"FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
    """
    Drain each node hosting a prometheus pod and verify the pod is
    re-spun on a different node with the same PVC and no metrics loss.

    Args:
        pods: App pods whose PVC metrics are looked up on prometheus

    """
    # Get the prometheus pod
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # One lookup provides both the mounted pvc and the hosting node
        pod_info = pod_obj.get()
        pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ]
        prometheus_node = pod_info['spec']['nodeName']

        # Drain node where the prometheus pod hosted
        drain_nodes([prometheus_node])

        # Validate node is in SchedulingDisabled state
        wait_for_nodes_status(
            [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Validate all prometheus pod is running
        monitoring_pods = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )
        assert monitoring_pods.wait_for_resource(
            condition='Running', selector='app=prometheus', timeout=180
        ), "One or more prometheus pods are not in running state"

        # Validate prometheus pod is re-spinned on new healthy node.
        # Names are compared for equality: a substring test would treat
        # e.g. "node-1" as the same node as "node-10".
        pod_info = pod_obj.get()
        new_node = pod_info['spec']['nodeName']
        assert new_node != prometheus_node, (
            'Promethues pod not re-spinned on new node'
        )
        log.info(f"Prometheus pod re-spinned on new node {new_node}")

        # Validate same pvc is mounted on prometheus pod (exact match)
        new_pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ]
        assert new_pvc_name == pvc_name, (
            f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
        )

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"

        # Mark the nodes back to schedulable
        schedule_nodes([prometheus_node])

        # Wait some time after node scheduling back
        waiting_time = 30
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate node is in Ready State
        wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

    # Check the node are Ready state and check cluster is health ok
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after the nodes were drained
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
def test_monitoring_when_one_of_the_prometheus_node_down(self, test_fixture):
    """
    Restart the ec2 instance of each node hosting a prometheus pod and
    verify monitoring keeps working for existing and new PVCs.

    Args:
        test_fixture (tuple): ``(namespace_list, pvc_objs, pod_objs, sc)``
            as unpacked on the first line of the test

    """
    namespace_list, pvc_objs, pod_objs, sc = test_fixture
    aws_obj = aws.AWS()
    monitoring_ns = defaults.OCS_MONITORING_NAMESPACE

    # Snapshot of monitoring pods, the workers and the prometheus pods
    monitoring_pods = pod.get_all_pods(namespace=monitoring_ns)
    workers = get_typed_nodes(node_type='worker')
    prometheus_pods = pod.get_all_pods(
        namespace=monitoring_ns, selector=['prometheus']
    )

    for prometheus_pod in prometheus_pods:
        # Resolve the worker object hosting this prometheus pod
        hosting_node_name = prometheus_pod.get()['spec']['nodeName']
        hosting_nodes = []
        for worker in workers:
            if worker.get().get('metadata').get('name') == hosting_node_name:
                hosting_nodes.append(worker)

        # Force-restart the matching ec2 instance
        instances = aws.get_instances_ids_and_names(hosting_nodes)
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        wait_for_nodes_status()

    self.sanity_helpers.health_check()

    # All monitoring pods seen before the restarts must be Running
    for monitoring_pod in monitoring_pods:
        wait_for_resource_state(
            resource=monitoring_pod, state=constants.STATUS_RUNNING
        )

    # Metrics of the PVCs created before the restarts
    for existing_pvc in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(existing_pvc.name), (
            f"On prometheus pod for created pvc {existing_pvc.name} related data is not collected"
        )

    # New project, registered with the fixture list
    namespaces = helpers.create_multilpe_projects(number_of_project=1)
    namespace_list.extend(namespaces)

    # One new PVC per new project
    pvcs = []
    for each_namespace in namespaces:
        pvcs.append(
            helpers.create_pvc(
                sc_name=sc.name, namespace=each_namespace.namespace
            )
        )
    for new_pvc in pvcs:
        helpers.wait_for_resource_state(new_pvc, constants.STATUS_BOUND)
        new_pvc.reload()
    pvc_objs.extend(pvcs)

    # One new app pod per new PVC
    pods = []
    for each_pvc in pvcs:
        pods.append(
            helpers.create_pod(
                interface_type=constants.CEPHBLOCKPOOL,
                pvc_name=each_pvc.name,
                namespace=each_pvc.namespace,
            )
        )
    for new_pod in pods:
        helpers.wait_for_resource_state(new_pod, constants.STATUS_RUNNING)
        new_pod.reload()
    pod_objs.extend(pods)

    # Metrics of the PVCs created after the restarts
    for new_pvc in pvcs:
        assert check_pvcdata_collected_on_prometheus(new_pvc.name), (
            f"On prometheus pod for created pvc {new_pvc.name} related data is not collected"
        )
def add_nodes(self):
    """
    Add new RHEL worker nodes to the cluster.

    The scale-up and cluster-launcher repositories are cloned, terraform
    is applied from a dedicated data directory (using the
    cluster-launcher layout for OCP 4.5 and above), OCP is installed on
    the resulting RHEL nodes, and the function finally waits for the
    nodes to be ready. The working directory is restored even when
    terraform fails.
    """
    # create separate directory for scale-up terraform data
    scaleup_terraform_data_dir = os.path.join(
        self.cluster_path,
        constants.TERRAFORM_DATA_DIR,
        constants.SCALEUP_TERRAFORM_DATA_DIR,
    )
    create_directory_path(scaleup_terraform_data_dir)
    logger.info(
        f"scale-up terraform data directory: {scaleup_terraform_data_dir}"
    )

    # git clone repo from openshift-misc
    clone_repo(constants.VSPHERE_SCALEUP_REPO, self.upi_scale_up_repo_path)

    # git clone repo from cluster-launcher
    clone_repo(constants.VSPHERE_CLUSTER_LAUNCHER, self.cluster_launcer_repo_path)

    helpers = VSPHEREHELPERS()
    helpers.modify_scaleup_repo()

    config.ENV_DATA["vsphere_resource_pool"] = config.ENV_DATA.get("cluster_name")

    # sync guest time with host
    sync_time_with_host_file = constants.SCALEUP_VSPHERE_MACHINE_CONF
    if config.ENV_DATA["folder_structure"]:
        sync_time_with_host_file = os.path.join(
            constants.CLUSTER_LAUNCHER_VSPHERE_DIR,
            f"aos-{get_ocp_version(seperator='_')}",
            constants.CLUSTER_LAUNCHER_MACHINE_CONF,
        )
    if config.ENV_DATA.get("sync_time_with_host"):
        sync_time_with_host(sync_time_with_host_file, True)

    # get the RHCOS worker list
    rhcos_ips = get_node_ips()
    logger.info(f"RHCOS IP's: {json.dumps(rhcos_ips)}")

    # generate terraform variable for scaling nodes
    self.scale_up_terraform_var = helpers.generate_terraform_vars_for_scaleup(
        rhcos_ips
    )

    # choose the vsphere_dir based on OCP version; for 4.5 and above the
    # cluster_info and config yaml files are generated as well
    vsphere_dir = constants.SCALEUP_VSPHERE_DIR
    rhel_module = "rhel-worker"
    if Version.coerce(self.ocp_version) >= Version.coerce("4.5"):
        vsphere_dir = os.path.join(
            constants.CLUSTER_LAUNCHER_VSPHERE_DIR,
            f"aos-{get_ocp_version('_')}",
            "vsphere",
        )
        helpers.generate_cluster_info()
        helpers.generate_config_yaml()
        rhel_module = "RHEL_WORKER_LIST"

    # Add nodes using terraform. The terraform calls run from the data
    # directory; try/finally guarantees the previous working directory
    # is restored when any of them raises.
    scaleup_terraform = Terraform(vsphere_dir)
    previous_dir = os.getcwd()
    os.chdir(scaleup_terraform_data_dir)
    try:
        scaleup_terraform.initialize()
        scaleup_terraform.apply(self.scale_up_terraform_var)
        scaleup_terraform_tfstate = os.path.join(
            scaleup_terraform_data_dir, "terraform.tfstate"
        )
        out = scaleup_terraform.output(scaleup_terraform_tfstate, rhel_module)
    finally:
        os.chdir(previous_dir)

    # The output format depends on the folder structure in use
    if config.ENV_DATA["folder_structure"]:
        rhel_worker_nodes = out.strip().replace('"', "").split(",")
    else:
        rhel_worker_nodes = json.loads(out)["value"]
    logger.info(f"RHEL worker nodes: {rhel_worker_nodes}")

    # Install OCP on rhel nodes
    rhel_install = OCPINSTALLRHEL(rhel_worker_nodes)
    rhel_install.prepare_rhel_nodes()
    rhel_install.execute_ansible_playbook()

    # Giving some time to settle down the new nodes
    time.sleep(self.wait_time)

    # wait for nodes to be in READY state
    wait_for_nodes_status(timeout=300)