def test_add_node(self):
    """
    Test for adding worker nodes to the cluster while IOs are running
    """
    new_nodes = 3
    if config.ENV_DATA['platform'].lower() in constants.CLOUD_PLATFORMS:
        dt = config.ENV_DATA['deployment_type']
        if dt == 'ipi':
            machines = machine_utils.get_machinesets()
            logger.info(
                f'The number of worker nodes before expansion is {len(helpers.get_worker_nodes())}'
            )
            for machine in machines:
                add_new_node_and_label_it(machine)
            logger.info(
                f'The number of worker nodes after expansion is {len(helpers.get_worker_nodes())}'
            )
        else:
            logger.info(
                f'The number of worker nodes before expansion is {len(helpers.get_worker_nodes())}'
            )
            if config.ENV_DATA.get('rhel_workers'):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
            logger.info(
                f'The number of worker nodes after expansion is {len(helpers.get_worker_nodes())}'
            )
    elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM:
        pytest.skip(
            "Skipping add node on the VMware platform due to "
            "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
        )
def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the node name associated with an OSD
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Verify that the pods are in running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            # 'rook-ceph-crashcollector' on the failed node gets stuck in
            # Pending state. BZ 1810014 tracks it.
            # As a workaround, ignore the 'rook-ceph-crashcollector' pod
            # health check and delete its deployment so that the pod
            # disappears. Revert this workaround once the BZ is fixed
            except ResourceWrongStatusException:
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP()
                    name = pod_obj.name[:-17]
                    command = f"delete deployment {name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
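# Worked example (not from the original source) of the ``pod_obj.name[:-17]``
# slice used above: crashcollector pod names end with a
# "-<replicaset-hash>-<pod-hash>" suffix, which with the usual 10- and
# 5-character hashes is exactly 17 characters including both dashes, so
# stripping it recovers the owning deployment name. The sample name below is
# hypothetical.
pod_name = "rook-ceph-crashcollector-worker-0-6f8b5c9d4b-x2k7q"
# "-6f8b5c9d4b-x2k7q" is 17 characters, so [:-17] yields the deployment name
assert pod_name[:-17] == "rook-ceph-crashcollector-worker-0"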
def add_new_storage_node(self, node_name):
    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(node_name)
    log.info(f"{node_name} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)
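# Hedged usage sketch (not part of the original source): the helper above lets
# a test grow the cluster from whichever node currently hosts an OSD.
# ``get_osd_running_nodes`` is assumed to be importable as in the other
# snippets in this section, and the caller is assumed to be a method on the
# same test class.
def scale_out_from_first_osd_node(self):
    osd_node_name = get_osd_running_nodes()[0]
    self.add_new_storage_node(osd_node_name)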
def test_add_node(self):
    """
    Test for adding worker nodes to the cluster while IOs are running
    """
    new_nodes = 3
    if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        dt = config.ENV_DATA["deployment_type"]
        if dt == "ipi":
            machines = machine_utils.get_machinesets()
            logger.info(
                f"The number of worker nodes before expansion is {len(node.get_worker_nodes())}"
            )
            for machine in machines:
                add_new_node_and_label_it(machine)
            logger.info(
                f"The number of worker nodes after expansion is {len(node.get_worker_nodes())}"
            )
        else:
            logger.info(
                f"The number of worker nodes before expansion is {len(node.get_worker_nodes())}"
            )
            if config.ENV_DATA.get("rhel_workers"):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
            logger.info(
                f"The number of worker nodes after expansion is {len(node.get_worker_nodes())}"
            )
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        pytest.skip(
            "Skipping add node on the VMware platform due to "
            "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
        )
        # Issue to remove skip code: https://github.com/red-hat-storage/ocs-ci/issues/2403
        # logger.info(f'The worker nodes number before expansion {len(node.get_worker_nodes())}')
        # if config.ENV_DATA.get('rhel_user'):
        #     pytest.skip("Skipping add RHEL node, code unavailable")
        # node_type = constants.RHCOS
        # assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
        # logger.info(f'The worker nodes number after expansion {len(node.get_worker_nodes())}')

    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=3600
    ), "Data re-balance failed to complete"
def factory(additional_nodes=3):
    """
    Args:
        additional_nodes (int): Number of additional nodes to be added (default=3)

    """
    log.info("Creating machineset")
    machineset_name.append(
        machine.create_custom_machineset(instance_type="m5.4xlarge", zone="a")
    )
    machine.wait_for_new_node_to_be_ready(machineset_name[0])

    log.info(
        f"Adding {additional_nodes} more nodes to machineset {machineset_name[0]}"
    )
    node.add_new_node_and_label_it(
        machineset_name=machineset_name[0],
        num_nodes=additional_nodes,
        mark_for_ocs_label=False,
    )
    machine.wait_for_new_node_to_be_ready(machineset_name[0])
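# Minimal sketch (not from the original source) of the wiring implied above:
# the ``machineset_name.append(...)`` / ``machineset_name[0]`` pattern suggests
# the list is owned by an enclosing pytest fixture so its finalizer can delete
# whatever the factory created. ``machine.delete_custom_machineset`` is an
# assumption; the suite's actual cleanup helper may differ.
import pytest


@pytest.fixture
def add_nodes(request):
    machineset_name = []

    def finalizer():
        # Remove the machineset (and its nodes) created by the factory
        if machineset_name:
            machine.delete_custom_machineset(machineset_name[0])

    request.addfinalizer(finalizer)
    # ``factory`` is the inner function defined above, closing over
    # ``machineset_name`` so the finalizer can see what was created
    return factory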
def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the node name associated with an OSD
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Verify that the pods are in running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
        )

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_add_node_aws(self):
    """
    Test for adding worker nodes to the cluster while IOs are running
    """
    dt = config.ENV_DATA['deployment_type']
    if dt == 'ipi':
        machines = machine_utils.get_machinesets()
        logger.info(
            f'The number of worker nodes before expansion is {len(helpers.get_worker_nodes())}'
        )
        for machine in machines:
            add_new_node_and_label_it(machine)
        logger.info(
            f'The number of worker nodes after expansion is {len(helpers.get_worker_nodes())}'
        )
    else:
        new_nodes = 3
        logger.info(
            f'The number of worker nodes before expansion is {len(helpers.get_worker_nodes())}'
        )
        if config.ENV_DATA.get('rhel_workers'):
            node_type = constants.RHEL_OS
        else:
            node_type = constants.RHCOS
        assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
        logger.info(
            f'The number of worker nodes after expansion is {len(helpers.get_worker_nodes())}'
        )
def identify_and_add_nodes(self, scenario, num_of_nodes):
    """
    Fetches info about the worker nodes and adds nodes (if required)

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test

    Returns:
        tuple: tuple containing:
            list: list of OCS nodes name
            list: list of non-OCS nodes name

    """
    nodes_to_add = 0
    initial_worker_nodes = node.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(initial_worker_nodes)

    if "dedicated" in scenario and len(non_ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(non_ocs_nodes)

    if nodes_to_add > 0:
        logger.info(f"{nodes_to_add} extra worker nodes needed")

        if config.ENV_DATA["deployment_type"] == "ipi":
            machine_name = random.choice(
                machine.get_machines(machine_type=constants.WORKER_MACHINE)
            ).name
            machineset_name = machine.get_machineset_from_machine_name(machine_name)
            node.add_new_node_and_label_it(
                machineset_name=machineset_name,
                num_nodes=nodes_to_add,
                mark_for_ocs_label=False,
            )
        else:
            is_rhel = config.ENV_DATA.get("rhel_workers") or config.ENV_DATA.get(
                "rhel_user"
            )
            node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
            node.add_new_node_and_label_upi(
                node_type=node_type,
                num_nodes=nodes_to_add,
                mark_for_ocs_label=False,
            )

        new_worker_nodes = node.get_worker_nodes()
        new_nodes_added = list(set(new_worker_nodes) - set(initial_worker_nodes))
        assert (
            len(new_nodes_added) == nodes_to_add
        ), "Extra nodes not added in the cluster"
        non_ocs_nodes += new_nodes_added

    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        logger.info("Adding OCS storage label to non-OCS workers")
        node_obj = ocp.OCP(kind=constants.NODE)
        nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
        for node_name in nodes_to_label:
            node_obj.add_label(
                resource_name=node_name, label=constants.OPERATOR_NODE_LABEL
            )
            ocs_nodes.append(node_name)
        non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

    logger.info(f"The OCS nodes are: {ocs_nodes}")
    logger.info(f"The non-OCS nodes are: {non_ocs_nodes}")
    return ocs_nodes, non_ocs_nodes
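# Hedged usage sketch (not in the original source): consuming the
# (ocs_nodes, non_ocs_nodes) tuple returned by identify_and_add_nodes. The
# 'dedicated' scenario mirrors the node-selection logic used elsewhere in
# this section; the method name is hypothetical.
def example_pick_app_nodes(self):
    # Ensure at least 3 dedicated (non-OCS) workers exist, then use them
    ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
        scenario="dedicated", num_of_nodes=3
    )
    app_pod_nodes = non_ocs_nodes[:3]
    logger.info(f"App pods will run on {app_pod_nodes}")
    return app_pod_nodes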
def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    interface = (
        constants.CEPHBLOCKPOOL if interface == 'rbd' else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get the nodes where both an OSD and an app pod are running
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    msg = "Common OSD and app running node(s) NOT found"
    assert len(common_nodes) > 0, msg
    log.info(f"Common OSD and app pod running nodes are {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # DC app pods on the failed node will get automatically created on
    # another running node. Waiting for all dc app pods to reach running
    # state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
    log.info("All the dc pods reached running state")

    # Check all OCS pods status, they should be in running state
    all_pod_obj = pod.get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node gets stuck in
                # Pending state. BZ 1810014 tracks it.
                # As a workaround, ignore the 'rook-ceph-crashcollector' pod
                # health check and delete its deployment so that the pod
                # disappears. Revert this workaround once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
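# Worked example (not from the original source) of the deployment-name
# derivation used above: splitting on '-' and dropping the last two segments
# (ReplicaSet hash and pod hash) recovers the owning deployment regardless of
# hash length, which is more robust than the fixed [:-17] slice seen in an
# earlier snippet. The sample name is hypothetical.
pod_name = "rook-ceph-crashcollector-worker-0-6f8b5c9d4b-x2k7q"
deployment_name = "-".join(pod_name.split("-")[:-2])
assert deployment_name == "rook-ceph-crashcollector-worker-0"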
def test_simultaneous_drain_of_two_ocs_nodes(
    self,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    interface,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    OCS-2128/OCS-2129:
    - Create PVCs and start IO on DC based app pods
    - Add one extra node in two of the AZs and label the nodes
      with OCS storage label
    - Maintenance (mark as unschedulable and drain) 2 worker nodes
      simultaneously
    - Confirm that OCS and DC pods are in running state
    - Remove unscheduled nodes
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Check cluster and Ceph health

    """
    # Get OSD running nodes
    osd_running_worker_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(
        osd_running_worker_nodes, label_key="dc", label_value="fedora"
    )
    log.info("Successfully labeled worker nodes with {dc:fedora}")

    # Create DC app pods
    log.info("Creating DC based app pods and starting IO in background")
    interface = (
        constants.CEPHBLOCKPOOL if interface == "rbd" else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get the machine names using the node names
    machine_names = [
        machine.get_machine_from_node_name(osd_running_worker_node)
        for osd_running_worker_node in osd_running_worker_nodes[:2]
    ]
    log.info(f"{osd_running_worker_nodes} associated machines are {machine_names}")

    # Get the machineset names using the machine names
    machineset_names = [
        machine.get_machineset_from_machine_name(machine_name)
        for machine_name in machine_names
    ]
    log.info(
        f"{osd_running_worker_nodes} associated machinesets are {machineset_names}"
    )

    # Add new nodes and label them
    add_new_node_and_label_it(machineset_names[0])
    add_new_node_and_label_it(machineset_names[1])

    # Drain 2 nodes
    drain_nodes(osd_running_worker_nodes[:2])

    # Verify that the pods are in running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        if ("-1-deploy" not in pod_obj.name
                and "ocs-deviceset" not in pod_obj.name):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node gets stuck in
                # Pending state. BZ 1810014 tracks it.
                # As a workaround, ignore the 'rook-ceph-crashcollector' pod
                # health check and delete its deployment so that the pod
                # disappears. Revert this workaround once the BZ is fixed
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # DC app pods on the drained node will get automatically created on another
    # running node in the same AZ. Waiting for all dc app pods to reach running state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")

    # Remove unscheduled nodes
    # In scenarios where the drain is attempted on a >3 worker setup,
    # post completion of drain we remove the unscheduled nodes so
    # that we maintain 3 worker nodes.
    log.info(f"Removing unscheduled nodes {osd_running_worker_nodes[:2]}")
    remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
    remove_nodes(remove_node_objs)

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def factory(ocs_nodes=False, node_count=3, taint_label=None):
    """
    Args:
        ocs_nodes (bool): True if new nodes are OCS, False otherwise
        node_count (int): Number of nodes to be added
        taint_label (str): Taint label to be added

    """
    new_nodes = []
    if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        dt = config.ENV_DATA["deployment_type"]
        if dt == "ipi":
            machines = machine_utils.get_machinesets()
            log.info(
                f"The number of worker nodes before expansion is {len(get_worker_nodes())}"
            )
            for machine in machines:
                new_nodes.append(
                    add_new_node_and_label_it(machine, mark_for_ocs_label=ocs_nodes)
                )
            log.info(
                f"The number of worker nodes after expansion is {len(get_worker_nodes())}"
            )
        else:
            log.info(
                f"The number of worker nodes before expansion is {len(get_worker_nodes())}"
            )
            if config.ENV_DATA.get("rhel_workers"):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            new_nodes.append(
                add_new_node_and_label_upi(
                    node_type, node_count, mark_for_ocs_label=ocs_nodes
                )
            )
            log.info(
                f"The number of worker nodes after expansion is {len(get_worker_nodes())}"
            )
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        log.info(
            f"The number of worker nodes before expansion is {len(get_worker_nodes())}"
        )
        if config.ENV_DATA.get("rhel_user"):
            node_type = constants.RHEL_OS
        else:
            node_type = constants.RHCOS
        new_nodes.append(
            add_new_node_and_label_upi(
                node_type, node_count, mark_for_ocs_label=ocs_nodes
            )
        )
        log.info(
            f"The number of worker nodes after expansion is {len(get_worker_nodes())}"
        )

    nodes = [node for sublist in new_nodes for node in sublist]

    if taint_label:
        assert taint_nodes(nodes=nodes, taint_label=taint_label), "Failed to taint nodes"
        log.info(f"Successfully tainted nodes {nodes} with {taint_label}")
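# Hedged usage sketch (not in the original source): a test requesting three
# tainted, non-OCS workers from the factory above. The fixture name and the
# taint value are hypothetical placeholders.
def test_scale_out_with_taint(add_nodes_factory):
    add_nodes_factory(
        ocs_nodes=False,
        node_count=3,
        taint_label="nodetype=app-pod:NoSchedule",  # hypothetical taint
    )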
def setup(
    self,
    request,
    scenario,
    nodes,
    multi_pvc_factory,
    service_account_factory,
    dc_pod_factory,
):
    """
    Identify the nodes and start multiple dc pods for the test

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (eg., 'colocated', 'dedicated')
        nodes: A fixture to get an instance of the relevant platform nodes class
        multi_pvc_factory: A fixture to create a set of new PVCs
        service_account_factory: A fixture to create a service account
        dc_pod_factory: A fixture to create dc pod

    Returns:
        list: dc pod objs

    """
    worker_nodes = node.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    non_ocs_nodes = list(set(worker_nodes) - set(ocs_nodes))

    def finalizer():
        helpers.remove_label_from_worker_node(
            node_list=worker_nodes, label_key="nodetype"
        )

        # Check ceph health
        ceph_health_check(tries=80)

    request.addfinalizer(finalizer)

    if (scenario == "dedicated") and len(non_ocs_nodes) == 0:
        if config.ENV_DATA.get("deployment_type").lower() == "ipi":
            machines = machine.get_machinesets()
            node.add_new_node_and_label_it(
                machines[0], num_nodes=1, mark_for_ocs_label=False
            )
        else:
            if config.ENV_DATA.get("platform").lower() == constants.VSPHERE_PLATFORM:
                pytest.skip(
                    "Skipping add node in vSphere due to "
                    "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                )
            is_rhel = config.ENV_DATA.get("rhel_workers") or config.ENV_DATA.get(
                "rhel_user"
            )
            node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
            node.add_new_node_and_label_upi(
                node_type=node_type, num_nodes=1, mark_for_ocs_label=False
            )
        non_ocs_nodes = list(set(node.get_worker_nodes()) - set(ocs_nodes))

    app_pod_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes

    # Label nodes to be able to run app pods
    helpers.label_worker_node(
        node_list=app_pod_nodes, label_key="nodetype", label_value="app-pod"
    )

    access_modes_rbd = [
        constants.ACCESS_MODE_RWO,
        f"{constants.ACCESS_MODE_RWX}-Block",
    ]

    access_modes_cephfs = [constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX]

    pvcs_rbd = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        size=self.pvc_size,
        access_modes=access_modes_rbd,
        status=constants.STATUS_BOUND,
        num_of_pvc=len(access_modes_rbd),
    )

    project = pvcs_rbd[0].project

    pvcs_cephfs = multi_pvc_factory(
        interface=constants.CEPHFILESYSTEM,
        project=project,
        size=self.pvc_size,
        access_modes=access_modes_cephfs,
        status=constants.STATUS_BOUND,
        num_of_pvc=len(access_modes_cephfs),
    )

    pvcs = pvcs_cephfs + pvcs_rbd

    # Set volume mode on PVC objects
    for pvc_obj in pvcs:
        pvc_info = pvc_obj.get()
        setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

    sa_obj = service_account_factory(project=project)
    pods = []

    # Create pods
    for pvc_obj in pvcs:
        if constants.CEPHFS_INTERFACE in pvc_obj.storageclass.name:
            interface = constants.CEPHFILESYSTEM
        else:
            interface = constants.CEPHBLOCKPOOL

        num_pods = 2 if pvc_obj.access_mode == constants.ACCESS_MODE_RWX else 1
        logger.info("Creating app pods")
        for _ in range(num_pods):
            pods.append(
                dc_pod_factory(
                    interface=interface,
                    pvc=pvc_obj,
                    node_selector={"nodetype": "app-pod"},
                    raw_block_pv=pvc_obj.volume_mode == "Block",
                    sa_obj=sa_obj,
                )
            )

    logger.info(
        f"Created {len(pods)} pods using {len(pvcs_cephfs)} cephfs, "
        f"{len(pvcs_rbd)} rbd PVCs."
    )

    return pods
def test_automated_recovery_from_stopped_node_and_start(
        self, nodes, additional_node):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI

    0) A - add a new node, B - don't add a new node
    1) Stop node
    2) Validate result:
         A - pods should respin on the new node
         B - pods should remain in Pending state on the stopped node
    3) Start node
    4) Validate result:
         A - pods should start on the new node
         B - pods should start on the stopped node after starting it
    """
    wnode_name = get_worker_nodes()[0]
    machine_name = machine.get_machine_from_node_name(wnode_name)
    self.machineset_name = machine.get_machineset_from_machine_name(machine_name)
    self.start_ready_replica_count = machine.get_ready_replica_count(
        self.machineset_name
    )

    if additional_node:
        new_ocs_node_names = add_new_node_and_label_it(self.machineset_name)
        failure_domain = get_failure_domain()
        log.info("Wait for the nodes racks or zones to appear...")
        wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

        new_ocs_node = get_node_objs(new_ocs_node_names)[0]
        log.info(f"Successfully created a new OCS node '{new_ocs_node.name}'")
        self.extra_node = True
        log.info("Get another OSD node in the same rack or zone...")
        self.osd_worker_node = get_another_osd_node_in_same_rack_or_zone(
            failure_domain, new_ocs_node
        )
        assert (
            self.osd_worker_node
        ), "Didn't find another osd node in the same rack or zone"
    else:
        osd_node_names = get_osd_running_nodes()
        self.osd_worker_node = get_node_objs(osd_node_names)[0]

    osd_pods = get_osd_pods()
    temp_osd = get_node_pods(self.osd_worker_node.name, pods_to_search=osd_pods)[0]
    osd_real_name = "-".join(temp_osd.name.split("-")[:-1])

    nodes.stop_nodes([self.osd_worker_node], wait=True)
    log.info(f"Successfully powered off node: {self.osd_worker_node.name}")

    timeout = 420
    assert pod.wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], [temp_osd.name], timeout=timeout
    ), (
        f"The pod {osd_real_name} didn't reach the status "
        f"{constants.STATUS_TERMINATING} after {timeout} seconds"
    )

    # Validate that the OSD in Terminating state has a new OSD in Pending
    all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    new_osd = None
    for pod_obj in all_pod_obj:
        if osd_real_name == "-".join(pod_obj.name.split("-")[:-1]) and (
            temp_osd.name != pod_obj.name
        ):
            new_osd = pod_obj
            break

    nodes.start_nodes(nodes=[self.osd_worker_node], wait=True)
    log.info(f"Successfully powered on node: {self.osd_worker_node.name}")
    wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)

    if additional_node:
        new_osd_node = get_pod_node(new_osd)
        assert (
            new_osd_node.name != self.osd_worker_node.name
        ), "New OSD is expected to run on the new additional node"
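# The A/B cases in the docstring above map naturally onto pytest
# parametrization of ``additional_node``. A minimal sketch (not the suite's
# actual decorators; the ids are assumptions), shown as a method on the same
# test class:
import pytest


@pytest.mark.parametrize(
    "additional_node",
    [
        pytest.param(True, id="with-additional-node"),  # case A
        pytest.param(False, id="without-additional-node"),  # case B
    ],
)
def test_automated_recovery_from_stopped_node_and_start(self, nodes, additional_node):
    ...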
def test_automated_recovery_from_failed_nodes_IPI_reactive(
    self,
    nodes,
    pvc_factory,
    pod_factory,
    failure,
    dc_pod_factory,
    interface,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == "rbd":
        interface = constants.CEPHBLOCKPOOL
    elif interface == "cephfs":
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
        self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get the nodes where both an OSD and an app pod are running
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    new_ocs_node_names = add_new_node_and_label_it(machineset_name)
    failure_domain = get_failure_domain()
    log.info("Wait for the nodes racks or zones to appear...")
    wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

    new_ocs_node = get_node_objs(new_ocs_node_names)[0]
    osd_node_in_same_rack_or_zone = get_another_osd_node_in_same_rack_or_zone(
        failure_domain, new_ocs_node, common_nodes
    )
    # Get the failure node obj
    failure_node_obj = get_node_objs([osd_node_in_same_rack_or_zone.name])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "shutdown":
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "terminate":
        nodes.terminate_nodes(failure_node_obj, wait=True)
        log.info(
            f"Successfully terminated node: {failure_node_obj[0].name} instance"
        )

    try:
        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all dc app pods to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720)
        log.info("All the dc pods reached running state")
        pod.wait_for_storage_pods(timeout=300)

    except ResourceWrongStatusException:
        if failure == "shutdown":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node: {failure_node_obj[0].name} instance"
            )
        raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        tries = 200
    else:
        tries = 40
    self.sanity_helpers.health_check(tries=tries)
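# The ``failure`` and ``interface`` arguments above are typically supplied by
# class-level parametrization. A hedged sketch of that wiring (the
# combinations and class name are assumptions, not the suite's actual
# decorators):
import pytest


@pytest.mark.parametrize(
    "interface,failure",
    [
        ("rbd", "shutdown"),
        ("rbd", "terminate"),
        ("cephfs", "shutdown"),
        ("cephfs", "terminate"),
    ],
)
class TestAutomatedRecoveryFromFailedNodesReactiveIPI:
    ...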
def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
        interface):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == 'rbd':
        interface = constants.CEPHBLOCKPOOL
    elif interface == 'cephfs':
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
        self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get the nodes where both an OSD and an app pod are running
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "shutdown":
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "terminate":
        nodes.terminate_nodes(failure_node_obj, wait=True)
        log.info(
            f"Successfully terminated node: {failure_node_obj[0].name} instance"
        )

    try:
        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all dc app pods to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720)
        log.info("All the dc pods reached running state")
        pod.wait_for_storage_pods()

    except ResourceWrongStatusException:
        if failure == "shutdown":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node: {failure_node_obj[0].name} instance"
            )
        raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    interface = (
        constants.CEPHBLOCKPOOL if interface == 'rbd' else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get the nodes where both an OSD and an app pod are running
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    msg = "Common OSD and app running node(s) NOT found"
    assert len(common_nodes) > 0, msg
    log.info(f"Common OSD and app pod running nodes are {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # DC app pods on the failed node will get automatically created on
    # another running node. Waiting for all dc app pods to reach running
    # state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
    log.info("All the dc pods reached running state")
    pod.wait_for_storage_pods()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def factory(ocs_nodes=False, node_count=3, taint_label=None):
    """
    Args:
        ocs_nodes (bool): True if new nodes are OCS, False otherwise
        node_count (int): Number of nodes to be added
        taint_label (str): Taint label to be added

    """
    new_nodes = []
    if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        dt = config.ENV_DATA["deployment_type"]
        if dt == "ipi":
            machines = machine_utils.get_machinesets()
            log.info(
                f"The number of worker nodes before expansion is {len(get_worker_nodes())}"
            )
            for machine in machines:
                new_nodes.append(
                    add_new_node_and_label_it(machine, mark_for_ocs_label=ocs_nodes)
                )
            log.info(
                f"The number of worker nodes after expansion is {len(get_worker_nodes())}"
            )
        else:
            log.info(
                f"The number of worker nodes before expansion is {len(get_worker_nodes())}"
            )
            if config.ENV_DATA.get("rhel_workers"):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            new_nodes.append(
                add_new_node_and_label_upi(
                    node_type, node_count, mark_for_ocs_label=ocs_nodes
                )
            )
            log.info(
                f"The number of worker nodes after expansion is {len(get_worker_nodes())}"
            )
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        pytest.skip(
            "Skipping add node on the VMware platform due to "
            "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
        )
        # Issue to remove skip code: https://github.com/red-hat-storage/ocs-ci/issues/2403
        # log.info(
        #     f"The worker nodes number before expansion {len(get_worker_nodes())}"
        # )
        # if config.ENV_DATA.get("rhel_user"):
        #     pytest.skip("Skipping add RHEL node, code unavailable")
        # node_type = constants.RHCOS
        # new_nodes.append(
        #     add_new_node_and_label_upi(node_type, num_nodes=node_count)
        # )
        # log.info(
        #     f"The worker nodes number after expansion {len(get_worker_nodes())}"
        # )

    nodes = [node for sublist in new_nodes for node in sublist]

    if taint_label:
        assert taint_nodes(nodes=nodes, taint_label=taint_label), "Failed to taint nodes"
        log.info(f"Successfully tainted nodes {nodes} with {taint_label}")