def reboot_gluster_node_and_wait_for_services(self):
    """Reboot the first configured gluster node and verify recovery.

    Reboots the node hosting a glusterfs pod, waits for the pod to be
    'Running' again and then checks that every gluster-related service
    inside the pod reached its expected systemd sub-state.

    :raises ExecutionError: if no glusterfs pod is hosted on the node.
    """
    # Storage IP of the first gluster server from the test config.
    node_ip = g.config["gluster_servers"][self.gluster_servers[0]]["storage"]

    # Locate the glusterfs pod running on that host.
    matching_pods = [
        pod for pod in get_ocp_gluster_pod_details(self.oc_node)
        if pod["pod_host_ip"] == node_ip
    ]
    if not matching_pods:
        raise ExecutionError("Gluster pod Host IP '%s' not matched."
                             % node_ip)
    pod_name = matching_pods[0]["pod_name"]

    # Register the wait as cleanup BEFORE rebooting so the pod is awaited
    # even if anything below fails mid-way.
    self.addCleanup(wait_for_pod_be_ready, self.oc_node, pod_name)
    node_reboot_by_command(node_ip, timeout=600, wait_step=10)

    # Wait for the gluster pod to be in 'Running' state again.
    wait_for_pod_be_ready(self.oc_node, pod_name)

    # Each gluster service must be 'active' with the given sub-state;
    # gluster-block-target is a oneshot unit, hence 'exited'.
    expected_states = {
        "glusterd": "running",
        "gluster-blockd": "running",
        "tcmu-runner": "running",
        "gluster-block-target": "exited",
    }
    for service, state in expected_states.items():
        check_service_status_on_pod(
            self.oc_node, pod_name, service, "active", state)
def test_dynamic_provisioning_glusterfile_gluster_pod_or_node_failure(
        self):
    """Create glusterblock PVC when gluster pod or node is down.

    Starts background IO on an app pod, then kills either the glusterfs
    pod (containerized deployments) or reboots the gluster node
    (independent deployments), and finally asserts the IO completed
    without interruption.
    """
    mount_path = "/mnt"
    datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

    # Create secret and storage class
    self.create_storage_class()

    # Create PVC
    pvc_name = self.create_and_wait_for_pvc()

    # Create app POD with attached volume
    pod_name = oc_create_tiny_pod_with_volume(
        self.node, pvc_name, "test-pvc-mount-on-app-pod",
        mount_path=mount_path,
        image=self.io_container_image_cirros)
    # Cleanups run LIFO: delete the pod first, then wait for absence.
    self.addCleanup(wait_for_resource_absence, self.node, 'pod', pod_name)
    self.addCleanup(oc_delete, self.node, 'pod', pod_name)

    # Wait for app POD be up and running
    wait_for_pod_be_ready(self.node, pod_name, timeout=60, wait_step=2)

    # Run IO in background; it keeps writing (~900MB) while the gluster
    # pod/node failure is injected below.
    io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
        pod_name, datafile_path)
    async_io = g.run_async(self.node, io_cmd, "root")

    # Check for containerized Gluster
    if self.is_containerized_gluster():
        # Pick up one of the hosts which stores PV brick (4+ nodes case)
        gluster_pod_data = get_gluster_pod_names_by_pvc_name(
            self.node, pvc_name)[0]

        # Delete glusterfs POD from chosen host and wait for
        # spawn of new one
        oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
        # Shell pipeline that returns the replacement pod's name on the
        # same host, ignoring the old pod while it is Terminating.
        cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
               "grep -v Terminating | awk '{print $1}'") % (
                   gluster_pod_data["pod_hostname"])
        # Poll up to 600s (every 15s) for the new pod to appear.
        for w in Waiter(600, 15):
            new_gluster_pod_name = self.cmd_run(cmd)
            if new_gluster_pod_name:
                break
        if w.expired:
            error_msg = "exceeded timeout, new gluster pod not created"
            g.log.error(error_msg)
            raise AssertionError(error_msg)
        g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
        wait_for_pod_be_ready(self.node, new_gluster_pod_name)
    else:
        # Independent (non-containerized) mode: reboot the gluster node
        # that hosts the PV instead of deleting a pod.
        pvc_hosting_node_ip = get_gluster_host_ips_by_pvc_name(
            self.node, pvc_name)[0]
        heketi_nodes = heketi_node_list(self.heketi_client_node,
                                        self.heketi_server_url)
        node_ip_for_reboot = None
        for heketi_node in heketi_nodes:
            heketi_node_ip = heketi_node_info(
                self.heketi_client_node, self.heketi_server_url,
                heketi_node, json=True)["hostnames"]["storage"][0]
            if heketi_node_ip == pvc_hosting_node_ip:
                node_ip_for_reboot = heketi_node_ip
                break
        # NOTE(review): if heketi_nodes were empty, heketi_node_ip would
        # be unbound here and this would raise NameError, not the
        # intended AssertionError — relies on at least one heketi node.
        if not node_ip_for_reboot:
            raise AssertionError(
                "Gluster node IP %s not matched with heketi node %s" % (
                    pvc_hosting_node_ip, heketi_node_ip))

        node_reboot_by_command(node_ip_for_reboot)

    # Check that async IO was not interrupted
    ret, out, err = async_io.async_communicate()
    self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
def test_heketi_metrics_validation_with_node_reboot(self):
    """Validate heketi metrics after node reboot using prometheus.

    Samples heketi metrics and the corresponding prometheus data before
    and after rebooting the node that hosts the heketi pod, and asserts
    both sets of values are unchanged.
    """
    # Placeholders; both are reassigned with tuples below.
    initial_metrics, final_metrics = {}, {}

    # Use storage project
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)

    # Get initial metrics result — first value of each metric listed in
    # self.metrics, straight from the heketi metrics endpoint.
    h_node, h_server = self.heketi_client_node, self.heketi_server_url
    initial_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get initial prometheus result
    initial_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    # Get hosted node IP of heketi pod
    openshift_ops.switch_oc_project(
        self._master, self.storage_project_name)
    heketi_pod = openshift_ops.get_pod_name_from_dc(
        self._master, self.heketi_dc_name)
    heketi_node = openshift_ops.oc_get_custom_resource(
        self._master, 'pod', '.:spec.nodeName', heketi_pod)[0]

    # Reboot the node on which heketi pod is scheduled.
    # Cleanup is registered BEFORE the reboot so pod readiness is
    # re-checked even if anything below fails.
    self.addCleanup(
        self._check_heketi_and_gluster_pod_after_node_reboot, heketi_node)
    node_ops.node_reboot_by_command(heketi_node)

    # Wait node to become NotReady (confirms the reboot actually took
    # the node down before waiting for it to come back).
    custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
    for w in waiter.Waiter(300, 10):
        status = openshift_ops.oc_get_custom_resource(
            self._master, 'node', custom, heketi_node)
        if status[0] == 'False':
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to bring down node {}".format(heketi_node))

    # Wait for node to become ready
    openshift_ops.wait_for_ocp_node_be_ready(self._master, heketi_node)

    # Wait for heketi and glusterfs pod to become ready
    self._check_heketi_and_gluster_pod_after_node_reboot(heketi_node)

    # Use prometheus project
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)

    # Get final metrics result
    final_metrics = tuple(
        heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
        for metric in self.metrics)

    # Get final prometheus result
    final_prometheus = self._get_and_manipulate_metric_data(
        self.metrics)

    # Metrics must survive the reboot unchanged, both as reported by
    # heketi itself and as scraped by prometheus.
    err_msg = "Initial value {} is not same as final value {}"
    self.assertEqual(
        initial_metrics, final_metrics, err_msg.format(
            initial_metrics, final_metrics))
    self.assertEqual(
        initial_prometheus, final_prometheus, err_msg.format(
            initial_prometheus, final_prometheus))
def test_udev_usage_in_container(self):
    """Validate LVM inside container does not use udev.

    Verifies dmeventd is absent in the glusterfs pod, that pvscan works
    there, then adds a device and reboots the node to confirm pvscan
    still succeeds after recovery.
    """
    # Skip the TC if independent mode deployment
    if not self.is_containerized_gluster():
        self.skipTest("Skipping this test case as it needs to run on "
                      "converged mode deployment")

    h_client, h_url = self.heketi_client_node, self.heketi_server_url
    # First configured gluster server: its management hostname and a
    # spare device to add via heketi later on.
    server_info = list(g.config.get('gluster_servers').values())[0]
    server_node = server_info.get('manage')
    additional_device = server_info.get('additional_devices')[0]

    # command to run pvscan
    cmd_pvscan = "timeout 300 pvscan"

    # Get the glusterfs pod running on the chosen host.
    # NOTE(review): assumes self.pod_name contains an entry for
    # server_node; otherwise pod_name stays unbound — confirm upstream
    # setup guarantees this.
    for pod_info in self.pod_name:
        if pod_info.get('pod_hostname') == server_node:
            pod_name = pod_info.get('pod_name')
            break

    # Create file volume
    vol_info = heketi_ops.heketi_volume_create(h_client, h_url,
                                               self.volume_size, json=True)
    self.addCleanup(heketi_ops.heketi_volume_delete, h_client, h_url,
                    vol_info.get("id"))

    # Create block volume
    block_vol_info = heketi_ops.heketi_blockvolume_create(
        h_client, h_url, self.volume_size, json=True)
    self.addCleanup(heketi_ops.heketi_blockvolume_delete, h_client, h_url,
                    block_vol_info.get("id"))

    # Check dmeventd service in container: the rsh command is expected
    # to fail (service absent), which surfaces as an AssertionError.
    err_msg = "dmeventd.service is running on setup"
    with self.assertRaises(AssertionError, msg=err_msg):
        openshift_ops.oc_rsh(self.oc_node, pod_name,
                             "systemctl is-active dmeventd.service")

    # Service dmeventd should not be running in background
    with self.assertRaises(AssertionError, msg=err_msg):
        openshift_ops.oc_rsh(self.oc_node, pod_name,
                             "ps aux | grep dmeventd.service")

    # Perform a pvscan in container
    openshift_ops.oc_rsh(self.oc_node, pod_name, cmd_pvscan)

    # Get heketi node to add new device: leave h_node_id pointing at the
    # node whose manage hostname matches server_node.
    heketi_node_list = heketi_ops.heketi_node_list(h_client, h_url)
    for h_node_id in heketi_node_list:
        h_node_info = heketi_ops.heketi_node_info(
            h_client, h_url, h_node_id, json=True)
        h_node_host = h_node_info.get('hostnames', {}).get('manage')[0]
        if h_node_host == server_node:
            break

    # Add new device to the node
    heketi_ops.heketi_device_add(h_client, h_url, additional_device,
                                 h_node_id)
    h_node_info = heketi_ops.heketi_node_info(h_client, h_url, h_node_id,
                                              json=True)
    h_device_id = [
        device.get('id')
        for device in h_node_info.get('devices')
        if device.get('name') == additional_device
    ]
    # Cleanups run LIFO, so the effective teardown order is:
    # disable -> remove -> delete, as heketi requires.
    self.addCleanup(heketi_ops.heketi_device_delete, h_client, h_url,
                    h_device_id[0])
    self.addCleanup(heketi_ops.heketi_device_remove, h_client, h_url,
                    h_device_id[0])
    self.addCleanup(heketi_ops.heketi_device_disable, h_client, h_url,
                    h_device_id[0])

    # Reboot the node on which device is added
    self.addCleanup(self._check_heketi_and_gluster_pod_after_node_reboot,
                    server_node)
    node_ops.node_reboot_by_command(server_node)

    # Wait node to become NotReady (confirms the reboot actually took
    # effect before waiting for recovery).
    custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
    for w in waiter.Waiter(300, 10):
        status = openshift_ops.oc_get_custom_resource(
            self.oc_node, 'node', custom, server_node)
        if status[0] == 'False':
            break
    if w.expired:
        raise exceptions.ExecutionError(
            "Failed to bring node down {}".format(server_node))

    # Wait for node to become ready
    openshift_ops.wait_for_ocp_node_be_ready(self.oc_node, server_node)

    # Wait for heketi and glusterfs pod to become ready
    self._check_heketi_and_gluster_pod_after_node_reboot(server_node)

    # Perform a pvscan in container again after the reboot
    openshift_ops.oc_rsh(self.oc_node, pod_name, cmd_pvscan)