def test_brick_multiplex_pids_with_diff_vol_option_values(self):
    """Test brick PIDs should be the same when vol option values differ"""
    h_client, h_url = self.heketi_client_node, self.heketi_server_url

    # Disable all heketi nodes except the first three
    h_nodes_list = heketi_node_list(h_client, h_url)
    for node_id in h_nodes_list[3:]:
        heketi_node_disable(h_client, h_url, node_id)
        self.addCleanup(heketi_node_enable, h_client, h_url, node_id)

    # Create storage classes with different volume options
    sc1 = self.create_storage_class(volumeoptions='user.heketi.abc 1')
    sc2 = self.create_storage_class(volumeoptions='user.heketi.abc 2')

    # Create PVCs with the above storage classes
    pvc1 = self.create_and_wait_for_pvcs(sc_name=sc1)
    pvc2 = self.create_and_wait_for_pvcs(sc_name=sc2)

    # Get vol info and status
    vol_info1 = get_gluster_vol_info_by_pvc_name(self.node, pvc1[0])
    vol_info2 = get_gluster_vol_info_by_pvc_name(self.node, pvc2[0])
    vol_status1 = get_gluster_vol_status(vol_info1['gluster_vol_id'])
    vol_status2 = get_gluster_vol_status(vol_info2['gluster_vol_id'])

    # Verify vol options
    err_msg = ('Volume option "user.heketi.abc %s" did not match for '
               'volume %s in gluster vol info')
    self.assertEqual(
        vol_info1['options']['user.heketi.abc'], '1',
        err_msg % (1, vol_info1['gluster_vol_id']))
    self.assertEqual(
        vol_info2['options']['user.heketi.abc'], '2',
        err_msg % (2, vol_info2['gluster_vol_id']))

    # Get the PIDs and match them
    pids1 = set()
    for brick in vol_info1['bricks']['brick']:
        host, bname = brick['name'].split(":")
        pids1.add(vol_status1[host][bname]['pid'])

    pids2 = set()
    for brick in vol_info2['bricks']['brick']:
        host, bname = brick['name'].split(":")
        pids2.add(vol_status2[host][bname]['pid'])

    err_msg = ('PIDs of volumes %s and %s are expected to be the same, '
               'but got different PIDs "%s" and "%s".' % (
                   vol_info1['gluster_vol_id'],
                   vol_info2['gluster_vol_id'], pids1, pids2))
    self.assertEqual(pids1, pids2, err_msg)
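# A minimal sketch of a helper that could factor out the PID collection done
# twice in the test above. It assumes the same data shapes used there:
# entries under vol_info['bricks']['brick'] carry a 'name' formatted as
# "<host>:<brick-path>", and vol_status[host][brick_path]['pid'] holds the
# brick PID. The helper name _collect_brick_pids is hypothetical.
def _collect_brick_pids(self, vol_info, vol_status):
    """Return the set of brick PIDs for the given volume info and status."""
    pids = set()
    for brick in vol_info['bricks']['brick']:
        host, bname = brick['name'].split(":")
        pids.add(vol_status[host][bname]['pid'])
    return pids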
def check_vol_status(self):
    # Check status of all vols
    status = get_gluster_vol_status('all')
    pids = defaultdict(int)
    down_bricks = 0
    for vol in status.keys():
        for host in status[vol].keys():
            for brick_or_shd in status[vol][host].keys():
                if status[vol][host][brick_or_shd]['status'] != "1":
                    down_bricks += 1
                pid = status[vol][host][brick_or_shd]['pid']
                pids[pid] += 1

    # Fail if any PID hosts more than 250 bricks or any brick/shd is down
    exhausted_pids = [pd for pd in pids.keys() if pids[pd] > 250]
    self.assertFalse(
        (exhausted_pids and down_bricks),
        'PIDs {} have more than 250 bricks attached to them. {} bricks '
        'or shd are down.'.format(exhausted_pids, down_bricks))
    self.assertFalse(
        exhausted_pids, 'PIDs {} have more than 250 bricks attached to '
        'them.'.format(exhausted_pids))
    self.assertFalse(
        down_bricks, '{} bricks or shd are down.'.format(down_bricks))
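# A standalone sketch of the counting logic above, mostly to document the
# nested mapping shape the check assumes get_gluster_vol_status('all')
# returns: {vol: {host: {brick_or_shd: {'status': ..., 'pid': ...}}}}.
# The function name and the 250-brick threshold mirror the check above but
# are otherwise illustrative.
from collections import defaultdict

def count_bricks_per_pid(status, max_bricks=250):
    """Return (exhausted_pids, down_bricks) for a vol-status mapping."""
    pids, down_bricks = defaultdict(int), 0
    for vol_data in status.values():
        for host_data in vol_data.values():
            for proc_data in host_data.values():
                if proc_data['status'] != "1":
                    down_bricks += 1
                pids[proc_data['pid']] += 1
    exhausted_pids = [pid for pid, count in pids.items()
                      if count > max_bricks]
    return exhausted_pids, down_bricks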
def test_kill_bhv_fsd_while_es_pod_running(self):
    """Validate that killing a BHV brick fsd does not affect ES pod I/O"""
    # Fetch pod and PVC names and validate iscsi and multipath
    es_pod, pvc_name = self._get_es_pod_and_verify_iscsi_sessions()

    # Get the bhv name
    gluster_node = list(self._registry_servers_info.keys())[0]
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    bhv_name = self.get_block_hosting_volume_by_pvc_name(
        pvc_name, heketi_server_url=self._registry_heketi_server_url,
        gluster_node=gluster_node)

    # Get the PID of one of the bhv's bricks
    gluster_volume_status = gluster_ops.get_gluster_vol_status(bhv_name)
    pid = None
    for g_node, g_node_data in gluster_volume_status.items():
        if g_node != gluster_node:
            continue
        for process_name, process_data in g_node_data.items():
            if not process_name.startswith("/var"):
                continue
            pid = process_data["pid"]
            # When a brick is down, its PID is returned as -1, which is
            # an unexpected situation, so assert on it explicitly.
            self.assertNotEqual(
                pid, "-1", "Got unexpected PID (-1) for '{}' gluster "
                "vol on '{}' node.".format(bhv_name, gluster_node))
            break
        self.assertTrue(
            pid, "Could not find 'pid' in Gluster vol data for '{}' "
            "Gluster node. Data: {}".format(
                gluster_node, gluster_volume_status))
        break

    # Kill the gluster vol brick process using the found PID
    cmd_kill = "kill -9 {}".format(pid)
    cmd_start_vol = "gluster v start {} force".format(bhv_name)
    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self._master, cmd_kill, gluster_node)
    self.addCleanup(
        openshift_ops.cmd_run_on_gluster_pod_or_node, self._master,
        cmd_start_vol, gluster_node)
    self.addCleanup(
        openshift_ops.switch_oc_project, self._master,
        self._registry_project_name)

    # Run I/O on the ES pod
    openshift_ops.switch_oc_project(
        self._master, self._logging_project_name)
    file_name = '/elasticsearch/persistent/file1'
    cmd_run_io = 'dd if=/dev/urandom of={} bs=4k count=10000'.format(
        file_name)
    cmd_remove_file = 'rm {}'.format(file_name)
    openshift_ops.oc_rsh(self._master, es_pod, cmd_run_io)
    self.addCleanup(
        openshift_ops.oc_rsh, self._master, es_pod, cmd_remove_file)
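# The cleanup above restarts the BHV with "gluster v start ... force" but
# does not wait for the killed brick to come back online. A hedged polling
# sketch that could close that gap, reusing
# gluster_ops.get_gluster_vol_status; the helper name, timeout and interval
# are assumptions:
import time

def _wait_for_bricks_online(self, vol_name, timeout=120, interval=5):
    """Poll vol status until every brick reports status '1'."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = gluster_ops.get_gluster_vol_status(vol_name)
        bricks = [
            proc_data
            for g_node_data in status.values()
            for name, proc_data in g_node_data.items()
            if name.startswith("/var")]
        if bricks and all(
                proc_data['status'] == "1" for proc_data in bricks):
            return
        time.sleep(interval)
    raise AssertionError(
        "Bricks of volume {} did not come online within {}s".format(
            vol_name, timeout))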
def _get_bricks_pids(self, vol_name):
    """Return a list of [gluster_node_ip, brick_pid] pairs for a volume"""
    pids = []
    g_volume_status = get_gluster_vol_status(vol_name)
    self.assertTrue(
        g_volume_status, "Failed to get the gluster volume status for "
        "the volume {}".format(vol_name))
    for g_node, g_node_data in g_volume_status.items():
        for process_name, process_data in g_node_data.items():
            if process_name.startswith("/var"):
                pid = process_data["pid"]
                pids.append([g_node, pid])
    return pids
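# A hedged usage sketch for the helper above; the method name is an
# assumption, and the check mirrors the -1-PID convention noted in
# test_kill_bhv_fsd_while_es_pod_running:
def _assert_all_bricks_have_pids(self, vol_name):
    """Fail if any brick of the volume reports a -1 (down) PID."""
    for g_node, pid in self._get_bricks_pids(vol_name):
        self.assertNotEqual(
            pid, "-1",
            "Brick on node {} of volume {} is down".format(
                g_node, vol_name))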
def test_prometheous_kill_bhv_brick_process(self):
    """Validate killing a brick process of a block hosting
    volume while a prometheus workload is running"""

    # Add check for CRS version
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    if not self.is_containerized_gluster():
        self.skipTest(
            "Skipping this test case as CRS version check "
            "cannot be implemented")

    # Get one of the prometheus pod names and the respective pvc name
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    prometheus_pods = openshift_ops.oc_get_pods(
        self._master, selector=self._prometheus_resources_selector)
    if not prometheus_pods:
        self.skipTest("Skipping test as prometheus pod is not present")

    # Validate iscsi and multipath
    prometheus_pod = list(prometheus_pods.keys())[0]
    pvc_name = openshift_ops.oc_get_custom_resource(
        self._master, "pod",
        ":.spec.volumes[*].persistentVolumeClaim.claimName",
        prometheus_pod)
    self.assertTrue(pvc_name, "Failed to get PVC name")
    pvc_name = pvc_name[0]
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Try to fetch metric from prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')

    # Kill the brick process of a BHV
    gluster_node = list(self._registry_servers_info.keys())[0]
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    bhv_name = self.get_block_hosting_volume_by_pvc_name(
        pvc_name, heketi_server_url=self._registry_heketi_server_url,
        gluster_node=gluster_node, ocp_client_node=self._master)
    vol_status = gluster_ops.get_gluster_vol_status(bhv_name)
    gluster_node_ip, brick_pid = None, None
    for g_node, g_node_data in vol_status.items():
        for process_name, process_data in g_node_data.items():
            if process_name.startswith("/var"):
                gluster_node_ip = g_node
                brick_pid = process_data["pid"]
                break
        if gluster_node_ip and brick_pid:
            break
    self.assertIsNotNone(brick_pid, "Could not find pid for brick")
    cmd = "kill -9 {}".format(brick_pid)
    openshift_ops.cmd_run_on_gluster_pod_or_node(
        self._master, cmd, gluster_node_ip)
    self.addCleanup(self._guster_volume_cleanup, bhv_name)

    # Check that the brick process has been killed
    killed_pid_cmd = "ps -p {} -o pid --no-headers".format(brick_pid)
    try:
        openshift_ops.cmd_run_on_gluster_pod_or_node(
            self._master, killed_pid_cmd, gluster_node_ip)
    except exceptions.ExecutionError:
        g.log.info("Brick process {} was killed "
                   "successfully".format(brick_pid))

    # Try to fetch metric from prometheus pod
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')

    # Start the bhv using force
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    start_vol, _, _ = volume_ops.volume_start(
        gluster_node_ip, bhv_name, force=True)
    self.assertFalse(
        start_vol, "Failed to start volume {} using force".format(
            bhv_name))

    # Validate iscsi and multipath
    openshift_ops.switch_oc_project(
        self._master, self._prometheus_project_name)
    self.verify_iscsi_sessions_and_multipath(
        pvc_name, prometheus_pod, rtype='pod',
        heketi_server_url=self._registry_heketi_server_url,
        is_registry_gluster=True)

    # Try to fetch metric from prometheus pod
    self._fetch_metric_from_promtheus_pod(
        metric='heketi_device_brick_count')
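# The single post-kill "ps" check above can race with process teardown. A
# hedged retry sketch using the same calls; the helper name, attempt count
# and interval are assumptions:
import time

def _wait_for_pid_to_die(self, pid, g_node, attempts=12, interval=5):
    """Poll ps on the gluster node until the given PID is gone."""
    cmd = "ps -p {} -o pid --no-headers".format(pid)
    for _ in range(attempts):
        try:
            openshift_ops.cmd_run_on_gluster_pod_or_node(
                self._master, cmd, g_node)
        except exceptions.ExecutionError:
            # ps failed, so the process no longer exists
            return
        time.sleep(interval)
    raise AssertionError(
        "Process {} is still alive on node {}".format(pid, g_node))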