def test_volume_status_xml(self):

    # create a two node cluster
    ret = peer_probe_servers(self.servers[0], self.servers[1])
    self.assertTrue(
        ret,
        "Peer probe failed to %s from %s" % (self.mnode, self.servers[1]))

    # create a distributed volume with a single node
    number_of_bricks = 1
    servers_info_from_single_node = {}
    servers_info_from_single_node[self.servers[0]] = self.all_servers_info[
        self.servers[0]]

    bricks_list = form_bricks_list(self.mnode, self.volname,
                                   number_of_bricks, self.servers[0],
                                   servers_info_from_single_node)
    ret, _, _ = volume_create(self.servers[0], self.volname, bricks_list)
    self.assertEqual(ret, 0, "Volume creation failed")
    g.log.info("Volume %s created successfully", self.volname)

    # Get volume status; this should fail since the volume is not started
    ret, _, err = volume_status(self.servers[1], self.volname)
    self.assertNotEqual(ret, 0, ("Unexpected: volume status is success for"
                                 " %s, even though volume is not started "
                                 "yet" % self.volname))
    self.assertIn("is not started", err, ("volume status exited with"
                                          " incorrect error message"))

    # Get volume status with --xml; this should also fail
    vol_status = get_volume_status(self.servers[1], self.volname)
    self.assertIsNone(vol_status, ("Unexpected: volume status --xml for %s"
                                   " is success even though the volume is"
                                   " not started yet" % self.volname))

    # start the volume
    ret, _, _ = volume_start(self.servers[1], self.volname)
    self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)

    # Get volume status
    ret, _, _ = volume_status(self.servers[1], self.volname)
    self.assertEqual(ret, 0,
                     ("Failed to get volume status for %s" % self.volname))

    # Get volume status with --xml
    vol_status = get_volume_status(self.servers[1], self.volname)
    self.assertIsNotNone(vol_status, ("Failed to get volume "
                                      "status --xml for %s"
                                      % self.volname))

    # Verify there are no crashes while executing gluster volume status
    status = True
    glusterd_log = (self._get_test_specific_glusterd_log(
        self.mnode).split("\n"))
    for line in glusterd_log:
        # Check each log line (not the whole log list) for error entries
        if ' E ' in line:
            status = False
            g.log.info("Unexpected! Error found %s", line)

    self.assertTrue(status, "Error found in glusterd logs")
def get_brick_and_volume_status(self, volume_name):
    """Status of each brick in a volume for background validation."""
    volume_info = volume_ops.get_volume_info(
        'auto_get_gluster_endpoint', volume_name)
    self.assertIsNotNone(
        volume_info, "'%s' volume info is empty" % volume_name)

    volume_status = volume_ops.get_volume_status(
        'auto_get_gluster_endpoint', volume_name)
    self.assertIsNotNone(
        volume_status, "'%s' volume status is empty" % volume_name)

    self.assertEqual(int(volume_info[volume_name]["status"]), 1,
                     "Volume not up")

    brick_info = []
    for brick_details in volume_info[volume_name]["bricks"]["brick"]:
        brick_info.append(brick_details["name"])
    self.assertTrue(
        brick_info, "Brick details are empty for %s" % volume_name)

    for brick in brick_info:
        brick_data = brick.strip().split(":")
        brick_ip = brick_data[0]
        brick_name = brick_data[1]
        self.assertEqual(int(volume_status[volume_name][brick_ip]
                             [brick_name]["status"]), 1,
                         "Brick %s not up" % brick_name)
def are_bricks_online(mnode, volname, bricks_list):
    """Verify all the specified list of bricks are online.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.
        bricks_list (list): List of bricks to verify online status.

    Returns:
        bool : True if all bricks online. False otherwise.
        NoneType: None on failure in getting volume status
    """
    _rc = True
    offline_bricks_list = []
    volume_status = get_volume_status(mnode, volname)
    if not volume_status:
        g.log.error("Unable to check if bricks are online for the "
                    "volume %s", volname)
        return None
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")
        status = int(
            volume_status[volname][brick_node][brick_path]['status'])
        if status != 1:
            g.log.error("BRICK : %s is not online", brick)
            offline_bricks_list.append(brick)
            _rc = False

    if not _rc:
        g.log.error("Some of the bricks %s are not online",
                    offline_bricks_list)
        return False

    g.log.info("All the bricks %s are online", bricks_list)
    return True
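# The helpers in this collection all index the dict returned by
# get_volume_status() the same way:
#     status[<volname>][<node>][<brick path or service name>][<field>]
# A minimal sketch of the assumed shape is shown below. The concrete keys
# ('status', 'pid', 'port', 'Self-heal Daemon') appear in the functions
# above and below; the host names, paths, and values here are hypothetical.
example_volume_status = {
    "testvol": {
        "server1.example.com": {
            "/bricks/brick0/testvol": {
                "status": "1",      # "1" => online
                "pid": "12345",     # brick process pid ("-1" when down)
                "port": "49152",    # brick port
            },
            "Self-heal Daemon": {"status": "1", "pid": "12346"},
        },
    },
}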
def get_online_bricks_list(mnode, volname):
    """Get list of bricks which are online.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.

    Returns:
        list : List of bricks in the volume which are online.
        NoneType: None on failure in getting volume status
    """
    online_bricks_list = []
    volume_status = get_volume_status(mnode, volname)
    if not volume_status:
        g.log.error("Unable to get online bricks_list for the volume %s",
                    volname)
        return None
    bricks_list = get_all_bricks(mnode, volname)
    if not bricks_list:
        # Guard against get_all_bricks() failing and returning None
        g.log.error("Unable to get the bricks list for the volume %s",
                    volname)
        return None
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")
        try:
            status = int(
                volume_status[volname][brick_node][brick_path]['status'])
        except KeyError:
            continue
        if status == 1:
            online_bricks_list.append(brick)
    return online_bricks_list
def is_snapd_running(mnode, volname):
    """Checks if snapd is running on the given node

    Args:
        mnode (str): Node on which cmd has to be executed.
        volname (str): volume name

    Returns:
        True on success, False otherwise

    Example:
        is_snapd_running("abc.com", "testvol")
    """
    vol_status = get_volume_status(mnode, volname=volname)

    if vol_status is None:
        g.log.error("Failed to get volume status in is_snapd_running()")
        return False

    if 'Snapshot Daemon' not in vol_status[volname][mnode]:
        g.log.error("uss is not enabled in volume %s" % volname)
        return False

    snapd_status = vol_status[volname][mnode]['Snapshot Daemon']['status']
    if snapd_status != '1':
        g.log.error("Snapshot Daemon is not running in node %s" % mnode)
        return False
    return True
def is_shd_daemon_running(mnode, node, volname):
    """
    Verifies whether the shd daemon is up and running on a particular node
    by checking the existence of the shd pid and parsing the get volume
    status output.

    Args:
        mnode (str): The first node in servers list
        node (str): The node to be checked for whether the glustershd
            process is up or not
        volname (str): Name of the volume created

    Returns:
        boolean: True if shd is running on the node, False otherwise
    """
    # Get glustershd pid from node.
    ret, glustershd_pids = get_self_heal_daemon_pid(node)
    if not ret and glustershd_pids[node] != -1:
        return False

    # Verifying the glustershd process from the get status output.
    vol_status = get_volume_status(mnode, volname)
    if vol_status is None:
        return False
    try:
        _ = vol_status[volname][node]['Self-heal Daemon']
        return True
    except KeyError:
        return False
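# Daemon state is not reflected in `gluster volume status` immediately
# after glusterd restarts, so several tests below poll in a sleep loop.
# A minimal reusable sketch of that pattern, assuming the
# is_shd_daemon_running() helper above; wait_for_shd_daemon is a
# hypothetical name, not part of the library.
from time import sleep

def wait_for_shd_daemon(mnode, node, volname, timeout=120, interval=2):
    """Poll until the shd daemon shows up in volume status or timeout."""
    elapsed = 0
    while elapsed < timeout:
        if is_shd_daemon_running(mnode, node, volname):
            return True
        sleep(interval)
        elapsed += interval
    return False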
def is_daemon_process_running(self):
    """ function for checking daemon process. """
    vol_status_shd_pid_list = []
    vol_status_quotad_pid_list = []
    g.log.info(
        "Total self-heal and quota daemon process should be %d for "
        "%d nodes", len(self.servers) * 2, len(self.servers))

    # Getting vol status in dictionary format
    vol_status = get_volume_status(self.mnode, self.volname)

    # Getting self-heal daemon and quota daemon pids of every host from
    # gluster volume status
    for server in self.servers:
        vol_status_quotad_pid_list.append(
            vol_status[self.volname][server]['Quota Daemon']['pid'])
        vol_status_shd_pid_list.append(
            vol_status[self.volname][server]['Self-heal Daemon']['pid'])

    g.log.info("shd list from get volume status: %s",
               vol_status_shd_pid_list)
    g.log.info("quotad list from get volume status: %s",
               vol_status_quotad_pid_list)

    sh_daemon_list = []
    quotad_list = []

    # Find and store every host's self-heal daemon pid in sh_daemon_list
    # and every host's quota daemon pid in quotad_list using the ps command
    for daemon_name, daemon_list in (('glustershd', sh_daemon_list),
                                     ('quotad', quotad_list)):
        for host in self.servers:
            cmd = "ps -eaf |grep %s |grep -v grep | awk '{ print $2 }'" % (
                daemon_name)
            ret, out, err = g.run(host, cmd)
            err_msg = ("Failed to find '%s' daemon on the '%s' host using "
                       "'ps -eaf' command.\nret: %s\nout: %s\nerr: %s" % (
                           daemon_name, host, ret, out, err))
            self.assertEqual(ret, 0, err_msg)
            daemon_list.append(out.strip())

    g.log.info("shd list :%s", sh_daemon_list)
    g.log.info("quotad list :%s", quotad_list)

    # Check on all hosts whether the quota daemon and self-heal daemon are
    # running by comparing the list of daemons obtained from the ps command
    # with the list obtained from the vol status command; all daemons
    # should match across both lists
    if sorted(sh_daemon_list + quotad_list) == sorted(
            vol_status_shd_pid_list + vol_status_quotad_pid_list):
        return len(sh_daemon_list + quotad_list) == len(self.servers) * 2

    return False
def test_gluster_volume_status_xml_dump(self):
    """
    Steps:
    1. Stop one of the volumes, i.e. gluster volume stop <vol-name>
    2. Get the status of the volumes with --xml dump;
       the XML dump should be consistent
    """
    ret, _, _ = volume_stop(self.mnode, volname=self.volname_2,
                            force=True)
    self.assertFalse(ret,
                     "Failed to stop volume '{}'".format(self.volname_2))
    out = get_volume_status(self.mnode)
    self.assertIsNotNone(
        out, "Failed to get volume status on {}".format(self.mnode))
    for _ in range(4):
        sleep(2)
        out1 = get_volume_status(self.mnode)
        self.assertIsNotNone(
            out1, "Failed to get volume status on {}".format(self.mnode))
        self.assertEqual(out1, out)
def are_all_self_heal_daemons_are_online(mnode, volname):
    """Verifies whether all the self-heal-daemons are online for the
    specified volume.

    Args:
        mnode (str): Node on which cmd has to be executed.
        volname (str): volume name

    Returns:
        bool : True if all the self-heal-daemons are online for the
            volume. False otherwise.
        NoneType: None if unable to get the volume status
    """
    from glustolibs.gluster.volume_libs import is_distribute_volume
    if is_distribute_volume(mnode, volname):
        g.log.info("Volume %s is a distribute volume. "
                   "Hence not checking for self-heal daemons "
                   "to be online", volname)
        return True

    service = 'shd'
    failure_msg = ("Verifying all self-heal-daemons are online failed for "
                   "volume %s" % volname)

    # Get volume status
    vol_status = get_volume_status(mnode=mnode, volname=volname,
                                   service=service)
    if vol_status is None:
        g.log.error(failure_msg)
        return None

    # Get all nodes from pool list
    from glustolibs.gluster.peer_ops import nodes_from_pool_list
    all_nodes = nodes_from_pool_list(mnode)
    if not all_nodes:
        g.log.error(failure_msg)
        return False

    online_status = True
    for node in all_nodes:
        node_shd_status_value = (
            vol_status[volname][node]['Self-heal Daemon']['status'])
        if node_shd_status_value != '1':
            online_status = False
    g.run(mnode, ("gluster volume status %s shd" % volname))
    if online_status is True:
        g.log.info("All self-heal Daemons are online")
        return True
    else:
        g.log.error("Some of the self-heal Daemons are offline")
        return False
def get_gluster_vol_status(file_vol):
    """Get Gluster vol status.

    Args:
        file_vol (str): file volume name.
    """
    # Get Gluster vol info
    gluster_volume_status = get_volume_status(
        "auto_get_gluster_endpoint", file_vol)
    if not gluster_volume_status:
        raise AssertionError("Failed to get volume status for gluster "
                             "volume '%s'" % file_vol)
    if file_vol in gluster_volume_status:
        gluster_volume_status = gluster_volume_status.get(file_vol)
    return gluster_volume_status
def get_gluster_vol_status(file_vol, is_detail=False):
    """Get Gluster vol status.

    Args:
        file_vol (str): file volume name.
        is_detail (bool): True for detailed output else False
    """
    # Get Gluster vol info
    options = 'detail' if is_detail else ''
    gluster_volume_status = get_volume_status(
        "auto_get_gluster_endpoint", file_vol, options=options)
    if not gluster_volume_status:
        raise AssertionError("Failed to get volume status for gluster "
                             "volume '%s'" % file_vol)
    if file_vol in gluster_volume_status:
        gluster_volume_status = gluster_volume_status.get(file_vol)
    return gluster_volume_status
def restart_block_hosting_volume(gluster_pod, block_hosting_vol,
                                 sleep_time=120, hostname=None):
    """Restarts block hosting volume service.

    Args:
        hostname (str): hostname on which gluster pod exists
        gluster_pod (podcmd | str): gluster pod class object has gluster
            pod and ocp master node or gluster pod name
        block_hosting_vol (str): name of block hosting volume
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)
    gluster_volume_status = get_volume_status(gluster_pod,
                                              block_hosting_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get gluster volume status")

    g.log.info("Gluster volume %s status\n%s : " % (
        block_hosting_vol, gluster_volume_status))

    ret, out, err = volume_stop(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = "failed to stop gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    # Explicit wait to stop ios and pvc creation for 2 mins
    time.sleep(sleep_time)

    ret, out, err = volume_start(gluster_pod, block_hosting_vol,
                                 force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    ret, out, err = volume_status(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = ("failed to get status for gluster volume %s on pod %s "
                   "error: %s" % (block_hosting_vol, gluster_pod, err))
        g.log.error(err_msg)
        raise AssertionError(err_msg)
def check_for_cpu_usage_spikes_on_glusterfsd(nodes, test_name, threshold=3):
    """Check for CPU usage spikes in glusterfsd

    Args:
        nodes(list): Servers on which CPU usage spikes have to be checked
        test_name(str): Name of testcase for which CPU usage spikes have
            to be checked

    Kwargs:
        threshold(int): Accepted amount of instances of 100% CPU usage
            (Default:3)

    Returns:
        bool: True if CPU spikes are more than threshold else False

    NOTE:
        This function should be executed with the volumes present on the
        cluster.
    """
    is_there_a_spike = []
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        dataframe = create_dataframe_from_csv(node, 'glusterfsd',
                                              test_name)
        if dataframe.empty:
            return False

        for volume in volume_status.keys():
            for process in volume_status[volume][node].keys():
                # Skipping if process isn't a brick process
                if process in ('Self-heal Daemon', 'Quota Daemon'):
                    continue

                # Call function to check for cpu spikes
                cpu_spikes = _check_for_cpu_usage_spikes(
                    dataframe, node, 'glusterfsd', threshold,
                    volume_status, volume, process)
                if cpu_spikes:
                    g.log.error("CPU usage spikes observed more than "
                                "threshold %d on node %s on volume %s for "
                                "brick process %s",
                                threshold, node, volume, process)
                is_there_a_spike.append(cpu_spikes)

    return any(is_there_a_spike)
def check_for_memory_leaks_in_glusterfsd(nodes, test_name, gain=30.0):
    """Check for memory leaks in glusterfsd

    Args:
        nodes(list): Servers on which memory leaks have to be checked
        test_name(str): Name of testcase for which memory leaks have to
            be checked

    Kwargs:
        gain(float): Accepted amount of leak for a given testcase in MB
            (Default:30)

    Returns:
        bool: True if memory leak was observed else False

    NOTE:
        This function should be executed with the volumes present on the
        cluster.
    """
    is_there_a_leak = []
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        dataframe = create_dataframe_from_csv(node, 'glusterfsd',
                                              test_name)
        if dataframe.empty:
            return False

        for volume in volume_status.keys():
            for process in volume_status[volume][node].keys():
                # Skipping if process isn't a brick process
                if not process.count('/'):
                    continue

                # Call 3 point check function
                three_point_check = (
                    _perform_three_point_check_for_memory_leak(
                        dataframe, node, 'glusterfsd', gain,
                        volume_status, volume, process))
                if three_point_check:
                    g.log.error("Memory leak observed on node %s in brick "
                                "process for brick %s on volume %s",
                                node, process, volume)
                is_there_a_leak.append(three_point_check)

    return any(is_there_a_leak)
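# A minimal usage sketch for the two resource checks above, e.g. from a
# test's tearDown. Calling them with self.servers and unittest's self.id()
# as the test name is an assumption about the surrounding test class, and
# the assertion wording is illustrative.
def tearDown(self):
    # Fail the test if any brick process leaked memory or spiked CPU
    if check_for_memory_leaks_in_glusterfsd(self.servers, self.id()):
        raise AssertionError("Memory leak observed in glusterfsd")
    if check_for_cpu_usage_spikes_on_glusterfsd(self.servers, self.id(),
                                                threshold=3):
        raise AssertionError("CPU usage spikes observed in glusterfsd")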
def check_brick_pid_matches_glusterfsd_pid(mnode, volname):
    # pylint: disable=invalid-name
    """Checks for brick process(es) that both volume status and
    'ps -eaf | grep glusterfsd' match for the given volume

    Args:
        mnode (str): Node on which cmd has to be executed.
        volname (str): Name of the volume.

    Returns:
        bool : True if pids match. False otherwise.
    """
    from glustolibs.gluster.brick_libs import get_all_bricks
    _rc = True
    bricks_list = get_all_bricks(mnode, volname)
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")

        ret = get_volume_status(mnode, volname)
        brick_pid = ret[volname][brick_node][brick_path]["pid"]
        if brick_pid == "None":
            g.log.error("Failed to get brick pid on node %s "
                        "of brick path %s", brick_node, brick_path)
            _rc = False

        cmd = "pgrep -x glusterfsd"
        ret, pid, _ = g.run(brick_node, cmd)
        if ret != 0:
            g.log.error("Failed to run the command %s on "
                        "node %s", cmd, brick_node)
            _rc = False
        else:
            glusterfsd_pid = pid.split('\n')[:-1]
            if brick_pid not in glusterfsd_pid:
                g.log.error("Brick pid %s doesn't match glusterfsd "
                            "pid %s of the node %s", brick_pid,
                            glusterfsd_pid, brick_node)
                _rc = False
    return _rc
def is_heal_enabled(mnode, volname):
    """Check if heal is enabled for a volume.

    Args:
        mnode : Node on which commands are executed
        volname : Name of the volume

    Returns:
        bool : True if heal is enabled on volume. False otherwise.
        NoneType: None if unable to get the volume status.
    """
    enabled = True
    vol_status_dict = get_volume_status(mnode, volname, service='shd')
    if vol_status_dict is None:
        g.log.error("Failed to check if heal is enabled on volume %s "
                    "or not" % volname)
        return None
    for node in vol_status_dict[volname].keys():
        if not ('Self-heal Daemon' in vol_status_dict[volname][node]):
            enabled = False
    return enabled
def restart_file_volume(file_vol, sleep_time=120):
    """Restart file volume (stop and start volume).

    Args:
        file_vol (str): name of a file volume
    """
    gluster_volume_status = get_volume_status(
        "auto_get_gluster_endpoint", file_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get gluster volume status")

    g.log.info("Gluster volume %s status\n%s : " % (
        file_vol, gluster_volume_status))

    ret, out, err = volume_stop("auto_get_gluster_endpoint", file_vol)
    if ret != 0:
        err_msg = "Failed to stop gluster volume %s. error: %s" % (
            file_vol, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    # Explicit wait to stop ios and pvc creation for 2 mins
    time.sleep(sleep_time)

    ret, out, err = volume_start(
        "auto_get_gluster_endpoint", file_vol, force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s error: %s" % (
            file_vol, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    ret, out, err = volume_status("auto_get_gluster_endpoint", file_vol)
    if ret != 0:
        err_msg = ("Failed to get status for gluster volume %s error: %s"
                   % (file_vol, err))
        g.log.error(err_msg)
        raise AssertionError(err_msg)
def get_brick_pids(gluster_pod, block_hosting_vol, hostname=None):
    """Gets brick pids from gluster pods.

    Args:
        hostname (str): hostname on which gluster pod exists
        gluster_pod (podcmd | str): gluster pod class object has gluster
            pod and ocp master node or gluster pod name
        block_hosting_vol (str): Block hosting volume id
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)
    gluster_volume_status = get_volume_status(gluster_pod,
                                              block_hosting_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get volume status for gluster "
                             "volume '%s' on pod '%s'" % (
                                 block_hosting_vol, gluster_pod))

    gluster_volume_status = gluster_volume_status.get(block_hosting_vol)
    assert gluster_volume_status, ("gluster volume %s not present" % (
        block_hosting_vol))

    pids = {}
    for parent_key, parent_val in gluster_volume_status.items():
        for child_key, child_val in parent_val.items():
            if not child_key.startswith("/var"):
                continue
            pid = child_val["pid"]
            # When a brick is down, its pid is returned as -1, which is
            # an unexpected situation, hence raising an error.
            if pid == "-1":
                raise AssertionError("Something went wrong brick pid is -1")
            pids[parent_key] = pid
    return pids
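# A minimal usage sketch for get_brick_pids() above, passing a gluster pod
# name as the docstring permits; the pod, host, and volume names are
# hypothetical.
brick_pids = get_brick_pids("glusterfs-storage-abc12",
                            "vol_0123456789abcdef",
                            hostname="ocp-master.example.com")
# The result maps each brick's host (parent_key above) to its glusterfsd
# pid, e.g. {"10.70.46.1": "4321", "10.70.46.2": "4322"}
for host, pid in brick_pids.items():
    g.log.info("Brick on %s is served by glusterfsd pid %s", host, pid)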
def test_replace_brick_quorum(self):
    '''
    -> Create volume
    -> Set quorum type
    -> Set quorum ratio to 95%
    -> Start the volume
    -> Stop the glusterd on one node
    -> Now quorum is in not met condition
    -> Check all bricks went to offline or not
    -> Perform replace brick operation
    -> Start glusterd on same node which is already stopped
    -> Check all bricks are in online or not
    -> Verify in vol info that old brick not replaced with new brick
    '''

    # Forming brick list, 6 bricks for creating volume, 7th brick for
    # performing replace brick operation
    brick_list = form_bricks_list(self.mnode, self.volname, 7,
                                  self.servers, self.all_servers_info)

    # Create Volume
    ret, _, _ = volume_create(self.mnode, self.volname,
                              brick_list[0:6], replica_count=3)
    self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname)
    g.log.info("Volume created successfully %s", self.volname)

    # Enabling server quorum
    ret = set_volume_options(self.mnode, self.volname,
                             {'cluster.server-quorum-type': 'server'})
    self.assertTrue(
        ret, "Failed to set server quorum on volume %s" % self.volname)
    g.log.info("Able to set server quorum successfully on volume %s",
               self.volname)

    # Setting Quorum ratio in percentage
    ret = set_volume_options(self.mnode, 'all',
                             {'cluster.server-quorum-ratio': '95%'})
    self.assertTrue(
        ret, "Failed to set server quorum ratio on %s" % self.servers)
    g.log.info("Able to set server quorum ratio successfully on %s",
               self.servers)

    # Start the volume
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
    g.log.info("Volume started successfully %s", self.volname)

    # Stop glusterd on one of the nodes
    random_server = random.choice(self.servers[1:])
    ret = stop_glusterd(random_server)
    self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server)
    g.log.info("Glusterd stopped successfully on server %s",
               random_server)

    # Checking whether glusterd is running or not
    ret = is_glusterd_running(random_server)
    self.assertEqual(ret, 1, "Glusterd is still running on the node %s "
                             "where glusterd stopped" % random_server)
    g.log.info("Glusterd is not running on the server %s", random_server)

    # Verifying node count in volume status after glusterd stopped on one
    # of the servers; it's not possible to check the brick status
    # immediately in volume status after glusterd stops
    count = 0
    while count < 100:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname].keys())
        if servers_count == 5:
            break
        sleep(2)
        count += 1

    # creating brick list from volume status
    offline_bricks = []
    vol_status = get_volume_status(self.mnode, self.volname)
    for node in vol_status[self.volname]:
        for brick_path in vol_status[self.volname][node]:
            if brick_path != 'Self-heal Daemon':
                offline_bricks.append(':'.join([node, brick_path]))

    # Checking bricks are offline or not with quorum ratio(95%)
    ret = are_bricks_offline(self.mnode, self.volname, offline_bricks)
    self.assertTrue(ret, "Bricks are online when quorum is in not met "
                         "condition for %s" % self.volname)
    g.log.info("Bricks are offline when quorum is in not met "
               "condition for %s", self.volname)

    # Getting random brick from offline brick list
    self.random_brick = random.choice(offline_bricks)

    # Performing replace brick commit force when quorum not met
    self.replace_brick_failed = False
    ret, _, _ = replace_brick(self.mnode, self.volname,
                              self.random_brick, brick_list[6])
    self.assertNotEqual(ret, 0, "Replace brick should fail when quorum "
                                "is in not met condition but replace "
                                "brick success on %s" % self.volname)
    g.log.info("Failed to replace brick when quorum is in not met "
               "condition %s", self.volname)
    self.replace_brick_failed = True

    # Start glusterd on the node where it was stopped
    ret = start_glusterd(random_server)
    self.assertTrue(ret, "Failed to start glusterd on server %s"
                    % random_server)
    g.log.info("Glusterd started successfully on server %s",
               random_server)

    # Verifying node count in volume status after glusterd started on one
    # of the servers; it's not possible to check the brick status
    # immediately in volume status after glusterd starts
    count = 0
    while count < 100:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname].keys())
        if servers_count == 6:
            break
        sleep(2)
        count += 1

    # Checking bricks are online or not
    count = 0
    while count < 100:
        ret = are_bricks_online(self.mnode, self.volname,
                                brick_list[0:6])
        if ret:
            break
        sleep(2)
        count += 1
    self.assertTrue(ret,
                    "All bricks are not online for %s" % self.volname)
    g.log.info("All bricks are online for volume %s", self.volname)

    # Comparing brick lists from before and after performing the replace
    # brick operation
    after_brick_list = get_all_bricks(self.mnode, self.volname)
    self.assertListEqual(after_brick_list, brick_list[0:6],
                         "Bricks are not same before and after performing "
                         "replace brick operation for volume %s"
                         % self.volname)
    g.log.info("Bricks are same before and after performing replace "
               "brick operation for volume %s", self.volname)
def test_create_heketi_volume(self):
    """Test heketi volume creation and background gluster validation"""
    hosts = []
    gluster_servers = []
    brick_info = []

    output_dict = heketi_ops.heketi_volume_create(
        self.heketi_client_node, self.heketi_server_url, 10, json=True)
    self.assertNotEqual(output_dict, False, "Volume could not be created")

    volume_name = output_dict["name"]
    volume_id = output_dict["id"]
    self.addCleanup(heketi_ops.heketi_volume_delete,
                    self.heketi_client_node, self.heketi_server_url,
                    volume_id)

    self.assertEqual(output_dict["durability"]["replicate"]["replica"], 3,
                     "Volume %s is not replica 3" % volume_id)
    self.assertEqual(output_dict["size"], 10,
                     "Volume %s is not of intended size" % volume_id)

    mount_node = (
        output_dict["mount"]["glusterfs"]["device"].strip().split(":")[0])
    hosts.append(mount_node)

    for backup_volfile_server in (
            output_dict["mount"]["glusterfs"]["options"]
            ["backup-volfile-servers"].strip().split(",")):
        hosts.append(backup_volfile_server)

    for gluster_server in self.gluster_servers:
        gluster_servers.append(
            g.config["gluster_servers"][gluster_server]["storage"])

    self.assertEqual(
        set(hosts), set(gluster_servers),
        "Hosts and gluster servers not matching for %s" % volume_id)

    volume_info = volume_ops.get_volume_info(
        'auto_get_gluster_endpoint', volume_name)
    self.assertIsNotNone(volume_info, "get_volume_info returned None")

    volume_status = volume_ops.get_volume_status(
        'auto_get_gluster_endpoint', volume_name)
    self.assertIsNotNone(volume_status,
                         "get_volume_status returned None")

    self.assertEqual(int(volume_info[volume_name]["status"]), 1,
                     "Volume %s status down" % volume_id)

    for brick_details in volume_info[volume_name]["bricks"]["brick"]:
        brick_info.append(brick_details["name"])
    self.assertNotEqual(brick_info, [],
                        "Brick details are empty for %s" % volume_name)

    for brick in brick_info:
        brick_data = brick.strip().split(":")
        brick_ip = brick_data[0]
        brick_name = brick_data[1]
        self.assertEqual(
            int(volume_status[volume_name][brick_ip][brick_name]
                ["status"]), 1,
            "Brick %s is not up" % brick_name)
def test_remove_brick_status(self):
    '''
    -> Create volume
    -> Enable server quorum on volume
    -> Stop glusterd on all nodes except first node
    -> Verify brick status of nodes where glusterd is running with
       default quorum ratio(51%)
    -> Change the cluster.server-quorum-ratio from default to 95%
    -> Start glusterd on all servers except last node
    -> Verify the brick status again
    '''

    # Enabling server quorum
    ret = set_volume_options(self.mnode, self.volname,
                             {'cluster.server-quorum-type': 'server'})
    self.assertTrue(
        ret, "Failed to set server quorum on volume %s" % self.volname)
    g.log.info("Able to set server quorum on volume %s successfully",
               self.volname)

    # Getting brick list
    brick_list = get_all_bricks(self.mnode, self.volname)

    # Stopping glusterd on remaining servers except first node
    ret = stop_glusterd(self.servers[1:])
    self.assertTrue(ret, "Failed to stop glusterd on some of the servers "
                         "%s" % self.servers[1:])
    g.log.info("Glusterd stopped successfully on servers %s",
               self.servers[1:])

    # Checking brick status for glusterd running nodes with
    # default quorum ratio(51%)
    ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
    self.assertTrue(ret, "Bricks are online when quorum is in not "
                         "met condition for %s" % self.volname)
    g.log.info("Bricks are offline when quorum is in not met "
               "condition for %s", self.volname)

    # Setting quorum ratio to 95%
    ret = set_volume_options(self.mnode, 'all',
                             {'cluster.server-quorum-ratio': '95%'})
    self.assertTrue(ret, "Failed to set quorum ratio to 95 percentage on "
                         "servers %s" % self.servers)
    g.log.info("Able to set server quorum ratio to 95 percentage "
               "on servers %s", self.servers)

    # Starting glusterd on remaining servers except last node
    ret = start_glusterd(self.servers[1:5])
    self.assertTrue(ret, "Failed to start glusterd on some of the servers"
                         " %s" % self.servers[1:5])
    g.log.info("Glusterd started successfully on all servers except "
               "last node %s", self.servers[1:5])

    # Verifying node count in volume status after glusterd started on the
    # servers. It's not possible to check the brick status immediately
    # after glusterd starts, so verify that all glusterd-started nodes
    # are available in gluster volume status
    count = 0
    while count < 50:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname].keys())
        if servers_count == 5:
            break
        sleep(2)
        count += 1

    # Checking brick status with quorum ratio(95%)
    ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:5])
    self.assertTrue(ret, "Bricks are online when quorum is in not "
                         "met condition for %s" % self.volname)
    g.log.info("Bricks are offline when quorum is in not met "
               "condition for %s", self.volname)
def test_add_identical_brick(self):
    """
    In this test case:
    1. Create Dist Volume on Node 1
    2. Down brick on Node 1
    3. Peer Probe N2 from N1
    4. Add identical brick on newly added node
    5. Check volume status
    """
    # pylint: disable=too-many-statements
    # Create a distributed volume on Node1
    number_of_brick = 1
    servers_info_from_single_node = {
        self.servers[0]: self.all_servers_info[self.servers[0]]
    }
    self.volname = "testvol"
    bricks_list = form_bricks_list(self.servers[0], self.volname,
                                   number_of_brick, self.servers[0],
                                   servers_info_from_single_node)
    ret, _, _ = volume_create(self.servers[0], self.volname,
                              bricks_list, force=False)
    self.assertEqual(ret, 0, "Volume create failed")
    g.log.info("Volume %s created successfully", self.volname)

    ret, _, _ = volume_start(self.servers[0], self.volname, True)
    self.assertEqual(ret, 0, ("Failed to start the "
                              "volume %s", self.volname))

    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    ret, _, _ = peer_probe(self.servers[0], self.servers[1])
    self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                              self.servers[0], self.servers[1]))
    g.log.info("peer probe is success from %s to "
               "%s", self.servers[0], self.servers[1])

    # wait for some time before add-brick
    time.sleep(2)

    # Replace just the host IP to create an identical brick on the newly
    # probed node (str.replace instead of the removed Python 2
    # string.replace helper)
    add_bricks = []
    add_bricks.append(
        bricks_list[0].replace(self.servers[0], self.servers[1]))
    ret, _, _ = add_brick(self.mnode, self.volname, add_bricks)
    self.assertEqual(ret, 0, "Failed to add the bricks to the volume")
    g.log.info("Successfully added bricks to volume %s", add_bricks[0])

    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")

    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)
def test_status_string(self):
    '''
    -> Create Volume
    -> Start rebalance
    -> Check task type in volume status
    -> Check task status string in volume status
    -> Check task type in volume status xml
    -> Check task status string in volume status xml
    -> Start Remove brick operation
    -> Check task type in volume status
    -> Check task status string in volume status
    -> Check task type in volume status xml
    -> Check task status string in volume status xml
    '''

    # Start rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to start rebalance for volume %s"
                     % self.volname)
    g.log.info("Rebalance started successfully on volume %s",
               self.volname)

    # Wait for rebalance to complete
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, "Rebalance failed for volume %s" % self.volname)
    g.log.info("Rebalance completed successfully on volume %s",
               self.volname)

    # Getting volume status after rebalance start
    ret, out, _ = volume_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                     % self.volname)
    g.log.info("Volume status successful on volume %s", self.volname)
    status_list = out.splitlines()

    # Verifying task type from volume status for rebalance
    self.assertIn('Rebalance', status_list[len(status_list) - 4],
                  "Incorrect task type found in volume status for %s"
                  % self.volname)
    g.log.info("Correct task type found in volume status for %s",
               self.volname)

    # Verifying task status string in volume status for rebalance
    self.assertIn('completed', status_list[len(status_list) - 2],
                  "Incorrect task status found in volume status for %s"
                  % self.volname)
    g.log.info("Correct task status found in volume status for %s",
               self.volname)

    # Getting volume status --xml after rebalance start
    vol_status = get_volume_status(self.mnode, self.volname,
                                   options='tasks')

    # Verifying task type from volume status --xml for rebalance
    self.assertEqual('Rebalance',
                     vol_status[self.volname]['task_status'][0]['type'],
                     "Incorrect task type found in volume status xml "
                     "for %s" % self.volname)
    g.log.info("Correct task type found in volume status xml for %s",
               self.volname)

    # Verifying task status string from volume status --xml for rebalance
    self.assertEqual(
        'completed',
        vol_status[self.volname]['task_status'][0]['statusStr'],
        "Incorrect task status found in volume status "
        "xml for %s" % self.volname)
    g.log.info("Correct task status found in volume status xml %s",
               self.volname)

    # Getting sub vols
    subvol_dict = get_subvols(self.mnode, self.volname)
    subvol = subvol_dict['volume_subvols'][1]

    # Perform remove brick start
    ret, _, _ = remove_brick(self.mnode, self.volname, subvol,
                             'start', replica_count=3)
    self.assertEqual(ret, 0, "Failed to start remove brick operation "
                             "for %s" % self.volname)
    g.log.info("Remove brick operation started successfully on volume %s",
               self.volname)

    # Getting volume status after remove brick start
    ret, out, _ = volume_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                     % self.volname)
    g.log.info("Volume status successful on volume %s", self.volname)
    status_list = out.splitlines()

    # Verifying task type from volume status after remove brick start
    self.assertIn('Remove brick', status_list[len(status_list) - 8],
                  "Incorrect task type found in volume status for "
                  "%s" % self.volname)
    g.log.info("Correct task type found in volume status task for %s",
               self.volname)

    # Verifying task status string in volume status after remove
    # brick start
    ret = False
    remove_status = ['completed', 'in progress']
    if (status_list[len(status_list) - 2].split(':')[1].strip() in
            remove_status):
        ret = True
    self.assertTrue(ret, "Incorrect task status found in volume status "
                         "task for %s" % self.volname)
    g.log.info("Correct task status found in volume status task for %s",
               self.volname)

    # Getting volume status --xml after remove brick start
    vol_status = get_volume_status(self.mnode, self.volname,
                                   options='tasks')

    # Verifying task type from volume status --xml after
    # remove brick start
    self.assertEqual('Remove brick',
                     vol_status[self.volname]['task_status'][0]['type'],
                     "Incorrect task type found in volume status xml "
                     "for %s" % self.volname)
    g.log.info("Correct task type found in volume status xml for %s",
               self.volname)

    # Verifying task status string from volume status --xml
    # after remove brick start
    ret = False
    if (vol_status[self.volname]['task_status'][0]['statusStr'] in
            remove_status):
        ret = True
    self.assertTrue(ret, "Incorrect task status found in volume status "
                         "xml for %s" % self.volname)
    g.log.info("Correct task status found in volume status xml %s",
               self.volname)
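# The pattern above for reading a task out of the --xml status recurs in
# several tests, so a small sketch of a reusable accessor may help; this
# get_first_task_status helper is a hypothetical name, not part of the
# library, and only assumes the 'task_status' list indexed above.
def get_first_task_status(mnode, volname):
    """Return (type, statusStr) of the first task, or None on failure."""
    vol_status = get_volume_status(mnode, volname, options='tasks')
    if not vol_status:
        return None
    task = vol_status[volname]['task_status'][0]
    return task['type'], task['statusStr']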
def test_peer_probe_validation(self):
    # pylint: disable=too-many-statements
    '''
    -> Create trusted storage pool, by probing with network short names
    -> Create volume using IP of host
    -> Perform basic operations like
        -> gluster volume start <vol>
        -> gluster volume info <vol>
        -> gluster volume status <vol>
        -> gluster volume stop <vol>
    -> Create a volume using the FQDN of the host
    -> Perform basic operations like
        -> gluster volume start <vol>
        -> gluster volume info <vol>
        -> gluster volume status <vol>
        -> gluster volume stop <vol>
    '''
    # Peer probing using short name
    for server in self.servers[1:]:
        ret, hostname, _ = g.run(server, "hostname -s")
        self.assertEqual(ret, 0, ("Unable to get short name "
                                  "for server %s" % server))
        ret, _, _ = peer_probe(self.mnode, hostname)

        if ret == 1:
            ret, hostname, _ = g.run(server, "hostname")
            self.assertEqual(ret, 0, ("Unable to get short name "
                                      "for server %s" % server))
            hostname = (hostname.split(".")[0] + "."
                        + hostname.split(".")[1])
            ret, _, _ = peer_probe(self.mnode, hostname)

        self.assertEqual(ret, 0, "Unable to peer "
                                 "probe to the server %s" % hostname)
        g.log.info("Peer probe succeeded for server %s", hostname)

    # Create a volume
    self.volname = "test-vol"
    self.brick_list = form_bricks_list(self.mnode, self.volname, 3,
                                       self.servers,
                                       self.all_servers_info)
    g.log.info("Creating a volume")
    ret, _, _ = volume_create(self.mnode, self.volname,
                              self.brick_list, force=False)
    self.assertEqual(ret, 0,
                     "Unable to create volume %s" % self.volname)
    g.log.info("Volume created successfully %s", self.volname)

    # Start a volume
    g.log.info("Start a volume")
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0,
                     "Unable to start volume %s" % self.volname)
    g.log.info("Volume started successfully %s", self.volname)

    # Get volume info
    g.log.info("get volume info")
    volinfo = get_volume_info(self.mnode, self.volname)
    self.assertIsNotNone(volinfo, "Failed to get the volume "
                                  "info for %s" % self.volname)

    # Get volume status
    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)

    # stop volume
    ret, _, _ = volume_stop(self.mnode, self.volname)
    self.assertEqual(ret, 0,
                     "Unable to stop volume %s" % self.volname)
    g.log.info("Volume stopped successfully %s", self.volname)

    # Create a volume
    self.volname = "test-vol-fqdn"
    self.brick_list = form_bricks_list(self.mnode, self.volname, 3,
                                       self.servers,
                                       self.all_servers_info)

    # Get the FQDN (fully qualified domain name) of each host and replace
    # the IP with the FQDN for each brick. For example, for the brick
    # 10.70.37.219:/bricks/brick0/vol1, the IP is replaced with the FQDN
    # so the brick becomes
    # dhcp35-219.lab.eng.blr.redhat.com:/bricks/brick0/vol1
    my_brick_list = []
    for brick in self.brick_list:
        fqdn_list = brick.split(":")
        fqdn = socket.getfqdn(fqdn_list[0])
        fqdn = fqdn + ":" + fqdn_list[1]
        my_brick_list.append(fqdn)

    g.log.info("Creating a volume")
    ret, _, _ = volume_create(self.mnode, self.volname,
                              my_brick_list, force=False)
    self.assertEqual(ret, 0,
                     "Unable to create volume %s" % self.volname)
    g.log.info("Volume created successfully %s", self.volname)

    # Start a volume
    g.log.info("Start a volume")
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0,
                     "Unable to start volume %s" % self.volname)
    g.log.info("Volume started successfully %s", self.volname)

    # Get volume info
    g.log.info("get volume info")
    volinfo = get_volume_info(self.mnode, self.volname)
    self.assertIsNotNone(volinfo, "Failed to get the volume "
                                  "info for %s" % self.volname)

    # Get volume status
    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)

    # stop volume
    ret, _, _ = volume_stop(self.mnode, self.volname)
    self.assertEqual(ret, 0,
                     "Unable to stop volume %s" % self.volname)
    g.log.info("Volume stopped successfully %s", self.volname)
def test_brick_port(self):
    # pylint: disable=too-many-statements, too-many-branches
    """
    In this test case:
    1. Trusted storage Pool of 2 nodes
    2. Create a distributed volume with 2 bricks
    3. Start the volume
    4. Stop glusterd on node 2
    5. Modify any of the volume options on node 1
    6. Start glusterd on node 2
    7. Check volume status, brick should get port
    """
    my_server_info = {
        self.servers[0]: self.all_servers_info[self.servers[0]]
    }
    my_servers = self.servers[0:2]
    index = 1
    ret, _, _ = peer_probe(self.servers[0], self.servers[index])
    self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                              self.servers[0], self.servers[index]))
    g.log.info("peer probe is success from %s to "
               "%s", self.servers[0], self.servers[index])
    key = self.servers[index]
    my_server_info[key] = self.all_servers_info[key]

    self.volname = "testvol"
    bricks_list = form_bricks_list(self.mnode, self.volname, 2,
                                   my_servers, my_server_info)
    g.log.info("Creating a volume %s ", self.volname)
    ret = volume_create(self.mnode, self.volname,
                        bricks_list, force=False)
    self.assertEqual(ret[0], 0,
                     ("Unable to create volume %s" % self.volname))
    g.log.info("Volume created successfully %s", self.volname)

    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start the "
                              "volume %s", self.volname))

    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)
    totport = 0
    for _, value in vol_status.items():
        for _, val in value.items():
            for _, value1 in val.items():
                if int(value1["port"]) > 0:
                    totport += 1
    self.assertEqual(totport, 2,
                     ("Volume %s is not started successfully because the "
                      "no. of brick ports is not equal to 2",
                      self.volname))

    ret = stop_glusterd(self.servers[1])
    self.assertTrue(ret, "Failed to stop glusterd on one of the nodes")
    ret = wait_for_glusterd_to_start(self.servers[1])
    self.assertFalse(ret, "glusterd is still running on %s"
                     % self.servers[1])
    g.log.info("Glusterd stop on the nodes : %s "
               "succeeded", self.servers[1])

    option = {'performance.readdir-ahead': 'on'}
    ret = set_volume_options(self.servers[0], self.volname, option)
    self.assertTrue(ret, "gluster volume set %s "
                         "performance.readdir-ahead on is failed on "
                         "server %s" % (self.volname, self.servers[0]))
    g.log.info("gluster volume set %s performance.readdir-ahead on "
               "successfully on :%s", self.volname, self.servers[0])

    ret = start_glusterd(self.servers[1])
    self.assertTrue(ret, "Failed to start glusterd on one of the nodes")
    g.log.info("Glusterd start on the nodes : %s "
               "succeeded", self.servers[1])
    ret = wait_for_glusterd_to_start(self.servers[1])
    self.assertTrue(ret, "glusterd is not running on %s"
                    % self.servers[1])
    g.log.info("Glusterd start on the nodes : %s "
               "succeeded", self.servers[1])

    ret = wait_for_peers_to_connect(self.servers[0], self.servers[1])
    self.assertTrue(ret, "glusterd is not connected %s with peer %s"
                    % (self.servers[0], self.servers[1]))

    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)
    totport = 0
    for _, value in vol_status.items():
        for _, val in value.items():
            for _, value1 in val.items():
                if int(value1["port"]) > 0:
                    totport += 1
    self.assertEqual(totport, 2,
                     ("Volume %s is not started successfully because the "
                      "no. of brick ports is not equal to 2",
                      self.volname))
def test_rebalance_hang(self):
    """
    In this test case:
    1. Trusted storage Pool of 2 nodes
    2. Create a distributed volume with 2 bricks
    3. Start the volume
    4. Mount the volume
    5. Add some data files on the mount
    6. Start rebalance with force
    7. Kill glusterd on the 2nd node
    8. Issue volume related commands
    """
    # pylint: disable=too-many-statements
    my_server_info = {
        self.servers[0]: self.all_servers_info[self.servers[0]]
    }
    my_servers = self.servers[0:2]
    index = 1
    ret, _, _ = peer_probe(self.servers[0], self.servers[index])
    self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                              self.servers[0], self.servers[index]))
    g.log.info("peer probe is success from %s to "
               "%s", self.servers[0], self.servers[index])
    key = self.servers[index]
    my_server_info[key] = self.all_servers_info[key]

    self.volname = "testvol"
    bricks_list = form_bricks_list(self.mnode, self.volname, 2,
                                   my_servers, my_server_info)
    g.log.info("Creating a volume %s ", self.volname)
    ret, _, _ = volume_create(self.mnode, self.volname,
                              bricks_list, force=False)
    self.assertEqual(ret, 0,
                     ("Unable to create volume %s" % self.volname))
    g.log.info("Volume created successfully %s", self.volname)

    ret, _, _ = volume_start(self.mnode, self.volname, False)
    self.assertEqual(ret, 0, ("Failed to start the "
                              "volume %s", self.volname))

    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    # Mounting a volume
    ret, _, _ = mount_volume(self.volname, mtype=self.mount_type,
                             mpoint=self.mounts[0].mountpoint,
                             mserver=self.mnode,
                             mclient=self.mounts[0].client_system)
    self.assertEqual(ret, 0,
                     ("Volume %s is not mounted") % self.volname)
    g.log.info("Volume mounted successfully : %s", self.volname)

    self.all_mounts_procs = []
    # Creating files
    command = ("cd %s/ ; "
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=128k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")

    g.log.info("Starting rebalance with force on the volume")
    ret, _, _ = rebalance_start(self.mnode, self.volname, False, True)
    self.assertEqual(ret, 0, ("Failed to start rebalance for volume %s",
                              self.volname))
    g.log.info("Successfully started rebalance on the volume %s",
               self.volname)

    ret = stop_glusterd(self.servers[1])
    self.assertTrue(ret, "Failed to stop glusterd on one of the nodes")
    ret = is_glusterd_running(self.servers[1])
    self.assertNotEqual(ret, 0, ("Glusterd is not stopped on servers %s",
                                 self.servers[1]))
    g.log.info("Glusterd stop on the nodes : %s succeeded",
               self.servers[1])

    # Wait for fix-layout to complete
    g.log.info("Waiting for rebalance to complete")
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                          "%s", self.volname))
    g.log.info("Rebalance is successfully complete on the volume %s",
               self.volname)

    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(vol_status, "Failed to get volume "
                                     "status for %s" % self.volname)

    # Start glusterd on the node where it is stopped
    ret = start_glusterd(self.servers[1])
    self.assertTrue(ret, "glusterd start on the node failed")
    count = 0
    while count < 60:
        ret = is_glusterd_running(self.servers[1])
        if not ret:
            break
        sleep(2)
        count += 1
    self.assertEqual(ret, 0, "glusterd is not running on %s"
                     % self.servers[1])
    g.log.info("Glusterd start on the nodes : %s "
               "succeeded", self.servers[1])
def compute_data_usage_stats_on_servers(nodes, test_name):
    """Compute min, max, mean and median for servers

    Args:
        nodes(list): Servers from which data is to be used to compute min,
            max, mean, mode and median
        test_name(str): Name of testcase for which data has to be
            processed

    Returns:
        dict: dict of min, max, mean and median for a given process

    NOTE:
        This function has to be always run before cleanup.
    """
    data_dict = {}
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        data_dict[node] = {}
        for process in ('glusterd', 'glusterfs', 'glusterfsd'):

            # Generate a dataframe from the csv file
            dataframe = create_dataframe_from_csv(node, process, test_name)
            if dataframe.empty:
                return {}

            data_dict[node][process] = {}
            if process == 'glusterd':
                # Checking if glusterd is restarted.
                if len(set(dataframe['Process ID'])) > 1:
                    data_dict[node][process]['is_restarted'] = True
                else:
                    data_dict[node][process]['is_restarted'] = False

                # Call function to compute min, max, mean and median
                _compute_min_max_mean_median(dataframe, data_dict,
                                             process, node)
                continue

            # Map volumes to volume process
            for volume in volume_status.keys():
                for proc in volume_status[volume][node].keys():
                    if (proc == 'Self-heal Daemon'
                            and process == 'glusterfs'):
                        # Fetch the pid from the volume status output and
                        # create a dataframe with only that pid's entries
                        pid = volume_status[volume][node][proc]['pid']
                        proc_dataframe = dataframe[
                            dataframe['Process ID'] == pid]

                        # Call function to compute min, max, mean
                        # and median
                        _compute_min_max_mean_median(proc_dataframe,
                                                     data_dict, process,
                                                     node, volume)

                    if (proc.count('/') >= 2 and process == 'glusterfsd'):
                        # Fetch the pid from the volume status output and
                        # create a dataframe with only that pid's entries
                        pid = volume_status[volume][node][proc]['pid']
                        proc_dataframe = dataframe[
                            dataframe['Process ID'] == pid]

                        # Call function to compute min, max, mean and
                        # median
                        _compute_min_max_mean_median(proc_dataframe,
                                                     data_dict, process,
                                                     node, volume, proc)

    return data_dict
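# A minimal usage sketch for compute_data_usage_stats_on_servers() above,
# run before cleanup as the docstring requires. Calling it from a test
# class with self.servers and self.id() is an assumption; the sketch only
# reads the 'is_restarted' flag the function itself stores, since the rest
# of the nested stats layout is filled in by _compute_min_max_mean_median().
stats = compute_data_usage_stats_on_servers(self.servers, self.id())
if not stats:
    g.log.error("No data usage stats could be computed")
else:
    for node in stats:
        g.log.info("glusterd on %s restarted during the test: %s",
                   node, stats[node]['glusterd']['is_restarted'])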
def test_add_brick_when_quorum_not_met(self):
    # pylint: disable=too-many-statements
    # create and start a volume
    ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertTrue(ret, ("Failed to create "
                          "and start volume %s" % self.volname))
    g.log.info("Volume is created and started successfully")

    # set cluster.server-quorum-type as server
    ret = set_volume_options(self.mnode, self.volname,
                             {'cluster.server-quorum-type': 'server'})
    self.assertTrue(ret, ("Failed to set the quorum type as a server"
                          " on volume %s", self.volname))
    g.log.info("Able to set server quorum successfully on volume %s",
               self.volname)

    # Setting quorum ratio to 95%
    ret = set_volume_options(self.mnode, 'all',
                             {'cluster.server-quorum-ratio': '95%'})
    self.assertTrue(ret, "Failed to set server quorum ratio on %s"
                    % self.volname)
    g.log.info("Able to set server quorum ratio successfully on %s",
               self.servers)

    # bring down glusterd on half of the nodes
    num_of_servers = len(self.servers)
    # use integer division so the result can be used as a range bound
    num_of_nodes_to_bring_down = num_of_servers // 2

    for node in range(num_of_nodes_to_bring_down, num_of_servers):
        ret = stop_glusterd(self.servers[node])
        self.assertTrue(ret, ("Failed to stop glusterd on %s"
                              % self.servers[node]))
        g.log.info("Glusterd stopped successfully on server %s",
                   self.servers[node])

    for node in range(num_of_nodes_to_bring_down, num_of_servers):
        count = 0
        while count < 80:
            ret = is_glusterd_running(self.servers[node])
            if ret:
                break
            sleep(2)
            count += 1
        self.assertNotEqual(ret, 0, "glusterd is still running on %s"
                            % self.servers[node])

    # Verifying node count in volume status after glusterd stopped on
    # half of the servers; it's not possible to check the brick status
    # immediately in volume status after glusterd stops
    count = 0
    while count < 100:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname])
        if servers_count == (num_of_servers
                             - num_of_nodes_to_bring_down):
            break
        sleep(2)
        count += 1

    # confirm that quorum is not met, brick process should be down
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    bricks_to_check = bricks_list[0:num_of_nodes_to_bring_down]
    ret = are_bricks_offline(self.mnode, self.volname, bricks_to_check)
    self.assertTrue(ret, "Unexpected: Server quorum is not met, "
                         "Bricks are up")
    g.log.info("Server quorum is not met, bricks are down as expected")

    # try add brick operation, which should fail
    num_bricks_to_add = 1
    brick = form_bricks_list(self.mnode, self.volname,
                             num_bricks_to_add, self.servers,
                             self.all_servers_info)
    ret, _, _ = add_brick(self.mnode, self.volname, brick)
    self.assertNotEqual(ret, 0, ("Unexpected: add brick is success, "
                                 "when quorum is not met"))
    g.log.info("Add brick is failed as expected, when quorum is not met")

    # confirm that the newly added brick is not part of the volume
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    if brick in bricks_list:
        ret = False
        self.assertTrue(ret, ("Unexpected: add brick is success, "
                              "when quorum is not met"))
    g.log.info("Add brick is failed as expected, when quorum is not met")

    # set cluster.server-quorum-type as none
    ret = set_volume_options(self.mnode, self.volname,
                             {'cluster.server-quorum-type': 'none'})
    self.assertTrue(ret, ("Failed to set the quorum type as none"
                          " on volume %s", self.volname))
    g.log.info("Able to set server quorum successfully on volume %s",
               self.volname)
def test_xml_dump_of_gluster_volume_status_during_rebalance(self):
    """
    1. Create a trusted storage pool by peer probing the node
    2. Create a distributed-replicated volume
    3. Start the volume and fuse mount the volume and start IO
    4. Create another replicated volume and start it and stop it
    5. Start rebalance on the volume
    6. While rebalance is in progress, stop glusterd on one of the nodes
       in the Trusted Storage pool.
    7. Get the status of the volumes with --xml dump
    """
    self.volname_2 = "test_volume_2"

    # create volume
    # Fetching all the parameters for volume_create
    list_of_three_servers = []
    server_info_for_three_nodes = {}

    for server in self.servers[:3]:
        list_of_three_servers.append(server)
        server_info_for_three_nodes[server] = self.all_servers_info[
            server]

    bricks_list = form_bricks_list(self.mnode, self.volname, 3,
                                   list_of_three_servers,
                                   server_info_for_three_nodes)
    # Creating volumes using 3 servers
    ret, _, _ = volume_create(self.mnode, self.volname_2,
                              bricks_list, force=True)
    self.assertFalse(ret, "Volume creation failed")
    g.log.info("Volume %s created successfully", self.volname_2)
    ret, _, _ = volume_start(self.mnode, self.volname_2)
    self.assertFalse(ret,
                     "Failed to start volume {}".format(self.volname_2))
    ret, _, _ = volume_stop(self.mnode, self.volname_2)
    self.assertFalse(ret,
                     "Failed to stop volume {}".format(self.volname_2))

    # Start Rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                              "%s", self.volname))

    # Get rebalance status
    status_info = get_rebalance_status(self.mnode, self.volname)
    status = status_info['aggregate']['statusStr']

    self.assertIn('in progress', status,
                  "Rebalance process is not running")
    g.log.info("Rebalance process is running")

    # Stop glusterd
    ret = stop_glusterd(self.servers[2])
    self.assertTrue(ret, "Failed to stop glusterd")

    # Compare the rebalance status reported by the plain CLI output with
    # the one reported by the --xml dump
    ret, out, _ = g.run(
        self.mnode,
        "gluster v status | grep -A 4 'Rebalance' | awk 'NR==3{print "
        "$3,$4}'")

    ret = get_volume_status(self.mnode, self.volname, options="tasks")
    rebalance_status = ret[self.volname]['task_status'][0]['statusStr']
    self.assertIn(rebalance_status, out.replace("\n", ""))