@classmethod
def setUpClass(cls):
    """
    Setup volume, mount volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUpClass
    cls.get_super_method(cls, 'setUpClass')()

    # Setup Volume and Mount Volume
    ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts)
    if not ret:
        raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
    g.log.info("Successful in Setup Volume and Mount Volume")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(cls.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")

    # Upload IO script to clients
    cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/"
                              "file_dir_ops.py")
    ret = upload_scripts(cls.clients, [cls.script_upload_path])
    if not ret:
        raise ExecutionError("Failed to upload IO scripts to clients")
def setUp(self):
    """
    Setup volume, mount volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUp
    self.get_super_method(self, 'setUp')()

    self.all_mounts_procs = []
    self.io_validation_complete = False

    # Setup Volume and Mount Volume
    g.log.info("Starting to Setup Volume and Mount Volume")
    ret = self.setup_volume_and_mount_volume(mounts=self.mounts)
    if not ret:
        raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
    g.log.info("Successful in Setup Volume and Mount Volume")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(self.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")

    self.glustershd = "/var/lib/glusterd/glustershd/glustershd-server.vol"
def setUp(self):
    # Calling GlusterBaseClass setUp
    self.get_super_method(self, 'setUp')()

    self.extra_servers = self.servers[-2:]
    self.servers = self.servers[:-2]

    # Performing peer detach
    for server in self.extra_servers:
        # Peer detach
        ret, _, _ = peer_detach(self.mnode, server)
        if ret:
            raise ExecutionError("Peer detach failed")
        g.log.info("Peer detach successful.")

    # Create volume using the first four nodes
    servers_info_from_four_nodes = {}
    for server in self.servers:
        servers_info_from_four_nodes[server] = self.all_servers_info[server]

    self.volume['servers'] = self.servers
    ret = setup_volume(self.mnode, servers_info_from_four_nodes,
                       self.volume, force=False)
    if not ret:
        raise ExecutionError("Volume create failed on four nodes")
    g.log.info("Distributed replicated volume created successfully")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(self.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")
@classmethod
def setUpClass(cls):
    """
    Setup volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUpClass
    cls.get_super_method(cls, 'setUpClass')()

    list_of_vol = [
        'distributed-dispersed', 'replicated', 'dispersed',
        'distributed', 'distributed-replicated'
    ]
    cls.volume_configs = []
    if cls.default_volume_type_config['distributed']['dist_count'] > 3:
        cls.default_volume_type_config['distributed']['dist_count'] = 3

    for volume_type in list_of_vol:
        cls.volume_configs.append({
            'name': 'testvol_%s' % volume_type,
            'servers': cls.servers,
            'voltype': cls.default_volume_type_config[volume_type]
        })

    for volume_config in cls.volume_configs:
        ret = setup_volume(mnode=cls.mnode,
                           all_servers_info=cls.all_servers_info,
                           volume_config=volume_config, multi_vol=True)
        volname = volume_config['name']
        if not ret:
            raise ExecutionError("Failed to setup Volume %s" % volname)
        g.log.info("Successful in setting volume %s", volname)

        # Verify all processes of the volume are online within 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(cls.mnode, volname, 60)
        if not ret:
            raise ExecutionError("Volume %s : All process are not online"
                                 % volname)
        g.log.info("Successfully verified volume %s processes are online",
                   volname)

    # Verify glustershd process releases its parent process
    g.log.info("Verifying Self Heal Daemon process is daemonized")
    ret = is_shd_daemonized(cls.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")
def setUp(self):
    """
    Setup volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUp
    self.get_super_method(self, 'setUp')()

    # Setup Volume for all the volume types
    self.volume_configs = []
    for volume_type in self.default_volume_type_config:
        self.volume_configs.append({
            'name': 'testvol_%s' % volume_type,
            'servers': self.servers,
            'voltype': self.default_volume_type_config[volume_type]
        })

    for volume_config in self.volume_configs[1:]:
        ret = setup_volume(mnode=self.mnode,
                           all_servers_info=self.all_servers_info,
                           volume_config=volume_config, multi_vol=True)
        volname = volume_config['name']
        if not ret:
            raise ExecutionError("Failed to setup Volume %s" % volname)
        g.log.info("Successful in setting volume %s", volname)

        # Verify all processes of the volume are online within 60 sec
        ret = wait_for_volume_process_to_be_online(self.mnode, volname, 60)
        if not ret:
            raise ExecutionError("Volume %s : All process are not online"
                                 % volname)
        g.log.info("Successfully verified volume %s processes are online",
                   volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(self.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")

    self.glustershd = "/var/lib/glusterd/glustershd/glustershd-server.vol"
def setUp(self):
    """
    Setup volume, mount volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUp
    self.get_super_method(self, 'setUp')()

    # Setup Volume and Mount Volume
    g.log.info("Starting to Setup Volume and Mount Volume")
    ret = self.setup_volume_and_mount_volume(mounts=self.mounts)
    if not ret:
        raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
    g.log.info("Successful in Setup Volume and Mount Volume")

    # Verify glustershd process releases its parent process
    g.log.info("Verifying Self Heal Daemon process is daemonized")
    ret = is_shd_daemonized(self.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")
@classmethod
def setUpClass(cls):
    """
    Setup volume, mount volume and initialize necessary variables
    which are used in tests
    """
    # Calling GlusterBaseClass setUpClass
    cls.get_super_method(cls, 'setUpClass')()

    # Setup Volume and Mount Volume
    g.log.info("Starting to Setup Volume and Mount Volume")
    ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts)
    if not ret:
        raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
    g.log.info("Successful in Setup Volume and Mount Volume")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(cls.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")

    cls.GLUSTERSHD = "/var/lib/glusterd/glustershd/glustershd-server.vol"
def setUp(self):
    # Calling GlusterBaseClass setUp
    self.get_super_method(self, 'setUp')()

    # Upload IO script to clients
    self.script_upload_path = ("/usr/share/glustolibs/io/scripts/"
                               "file_dir_ops.py")
    ret = upload_scripts(self.clients, [self.script_upload_path])
    if not ret:
        raise ExecutionError("Failed to upload IO scripts to clients")

    # Setup Volume and Mount Volume
    ret = self.setup_volume_and_mount_volume(mounts=self.mounts)
    if not ret:
        raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
    g.log.info("Successful in Setup Volume and Mount Volume")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(self.servers)
    if not ret:
        raise ExecutionError("Self Heal Daemon process was still"
                             " holding parent process.")
    g.log.info("Self Heal Daemon processes are online")
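The fixtures above and the tests below call into glusto and glustolibs helpers without showing their imports. A sketch of the import block such a test module would need follows; the module paths mirror the usual glusto-tests layout and should be treated as assumptions rather than verified against a specific release.

from glusto.core import Glusto as g

from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.brick_libs import (
    get_all_bricks, bring_bricks_offline, bring_bricks_online,
    are_bricks_offline, select_bricks_to_bring_offline)
from glustolibs.gluster.brick_ops import replace_brick
from glustolibs.gluster.heal_libs import (
    get_self_heal_daemon_pid, is_shd_daemonized,
    do_bricks_exist_in_shd_volfile, monitor_heal_completion,
    is_heal_complete, is_volume_in_split_brain)
from glustolibs.gluster.heal_ops import trigger_heal, trigger_heal_full
from glustolibs.gluster.peer_ops import peer_detach
from glustolibs.gluster.rebalance_ops import (
    rebalance_start, rebalance_status, wait_for_rebalance_to_complete)
from glustolibs.gluster.volume_libs import (
    setup_volume, expand_volume, shrink_volume, get_subvols,
    log_volume_info_and_status, verify_all_process_of_volume_are_online,
    wait_for_volume_process_to_be_online)
from glustolibs.gluster.volume_ops import (
    set_volume_options, get_volume_options)
from glustolibs.io.utils import (
    validate_io_procs, collect_mounts_arequal, collect_bricks_arequal)
from glustolibs.misc.misc_libs import upload_scripts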
def test_glustershd_with_add_remove_brick(self):
    """
    Test script to verify glustershd process with adding and
    removing bricks

    * check glustershd process - only 1 glustershd process should
      be running
    * bricks must be present in glustershd-server.vol file for
      the replicated involved volumes
    * Add bricks
    * check glustershd process - only 1 glustershd process should
      be running and it should be different from the previous one
    * bricks which are added must be present in
      glustershd-server.vol file
    * remove bricks
    * check glustershd process - only 1 glustershd process should
      be running and it should be different from the previous one
    * bricks which are removed should not be present in
      glustershd-server.vol file
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']
    bricks_list = []
    glustershd_pids = {}

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info with
    # glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # expanding volume
    g.log.info("Start adding bricks to volume %s", self.volname)
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to add bricks to "
                          "volume %s " % self.volname))
    g.log.info("Add brick successful")

    # Log Volume Info and Status after expanding the volume
    g.log.info("Logging volume info and Status after expanding volume")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully verified volume %s processes are online",
               self.volname)

    # Start Rebalance
    g.log.info("Starting Rebalance on the volume")
    ret, _, err = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start rebalance on "
                              "the volume %s with error %s" %
                              (self.volname, err)))
    g.log.info("Successfully started rebalance on the "
               "volume %s", self.volname)

    # Log Rebalance status
    g.log.info("Log Rebalance status")
    _, _, _ = rebalance_status(self.mnode, self.volname)

    # Wait for rebalance to complete
    g.log.info("Waiting for rebalance to complete")
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, ("Rebalance is not yet complete "
                          "on the volume %s", self.volname))
    g.log.info("Rebalance is successfully complete on "
               "the volume %s", self.volname)

    # Check Rebalance status after rebalance is complete
    g.log.info("Checking Rebalance status")
    ret, _, _ = rebalance_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to get rebalance status for "
                              "the volume %s", self.volname))
    g.log.info("Successfully got rebalance status of the "
               "volume %s", self.volname)

    # Check the self-heal daemon process after adding bricks
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    glustershd_pids_after_expanding = {}
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    g.log.info("Successful in getting self-heal daemon process "
               "on nodes %s", nodes)

    glustershd_pids_after_expanding = pids
    g.log.info("Self Heal Daemon Process ID's after expanding "
               "volume: %s", glustershd_pids_after_expanding)

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_expanding,
                        "Self Heal Daemon process is same before and"
                        " after adding bricks")
    g.log.info("Self Heal Daemon Process is different before and "
               "after adding bricks")

    # get the bricks for the volume after expanding
    bricks_list_after_expanding = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List after expanding "
               "volume: %s", bricks_list_after_expanding)

    # validate the bricks present in volume info
    # with glustershd server volume file after adding bricks
    g.log.info("Starting parsing file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_expanding)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "expanding bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # shrink the volume
    g.log.info("Starting volume shrink")
    ret = shrink_volume(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to shrink the volume on "
                          "volume %s", self.volname))
    g.log.info("Shrinking volume is successful on "
               "volume %s", self.volname)

    # Log Volume Info and Status after shrinking the volume
    g.log.info("Logging volume info and Status after shrinking volume")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s", self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)

    # get the bricks after shrinking the volume
    bricks_list_after_shrinking = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List after shrinking "
               "volume: %s", bricks_list_after_shrinking)

    self.assertEqual(len(bricks_list_after_shrinking), len(bricks_list),
                     "Brick Count is mismatched after "
                     "shrinking the volume %s" % self.volname)
    g.log.info("Brick Count matched before expanding "
               "and after shrinking volume")

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # check the self-heal daemon process after removing bricks
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    glustershd_pids_after_shrinking = {}
    ret, pids = get_self_heal_daemon_pid(nodes)
    glustershd_pids_after_shrinking = pids
    self.assertNotEqual(glustershd_pids_after_expanding,
                        glustershd_pids_after_shrinking,
                        "Self Heal Daemon process is same "
                        "after adding bricks and shrinking volume")
    g.log.info("Self Heal Daemon Process is different after adding "
               "bricks and shrinking volume")

    # validate bricks present in volume info
    # with glustershd server volume file after removing bricks
    g.log.info("Starting parsing file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_shrinking)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "removing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)
def test_data_self_heal_daemon_off(self):
    """
    Test Data-Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - Get arequal before getting bricks offline
    - set the volume option
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - set the volume option
      "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - add bricks
    - do rebalance
    - create 1k files
    - while creating files - kill bricks and bring bricks online
      one by one in cycle
    - validate IO
    """
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 --fixed-file-size 1k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and '
                     'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 "
                   "--fixed-file-size 10k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process found or "
                         "more than one self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
    g.log.info("Expanding volume is successful on "
               "volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Create 1k files
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 1000 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Kill all bricks in cycle
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing bricks %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Bricks %s are not offline' % brick)
        g.log.info('Bringing bricks %s offline is successful', brick)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
        g.log.info('Bringing bricks %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either no self heal daemon process found "
                             "or more than one self heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")
def test_self_heal_50k_files_heal_command_by_add_brick(self):
    """
    Test self-heal of 50k files (heal command)
    Description:
    - Set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - Bring down all bricks processes from selected set
    - Create IO (50k files)
    - Get arequal before getting bricks online
    - Bring bricks online
    - Set the volume option
      "self-heal-daemon": "on"
    - Check for daemons
    - Start healing
    - Check if heal is completed
    - Check for split-brain
    - Get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - Add bricks
    - Do rebalance
    - Get arequal after adding bricks and compare with
      arequal after getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "self-heal-daemon": "off"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating files on client side
    all_mounts_procs = []

    # Create 50k files
    g.log.info('Creating files...')
    command = ("cd %s ; "
               "for i in `seq 1 50000` ; "
               "do dd if=/dev/urandom of=test.$i "
               "bs=100k count=1 ; "
               "done ;" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts[0]),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    ret, result_before_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    ret = set_volume_options(self.mnode, self.volname,
                             {"self-heal-daemon": "on"})
    self.assertTrue(ret, 'Failed to set option self-heal-daemon to ON.')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process found or "
                         "more than one self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums before and '
                          'after bringing bricks online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')

    # Add bricks
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume when IO in "
                          "progress on volume %s", self.volname))
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance and wait for it to complete
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                         timeout=3600)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Get arequal after adding bricks
    ret, result_after_adding_bricks = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after adding bricks '
               'is successful')

    # Checking arequals after bringing bricks online
    # and after adding bricks
    self.assertItemsEqual(result_after_online, result_after_adding_bricks,
                          'Checksums after bringing bricks online and '
                          'after adding bricks are not equal')
    g.log.info('Checksums after bringing bricks online and '
               'after adding bricks are equal')
def test_replace_brick_self_heal_io_in_progress(self):
    """
    - Create directory on mount point and write files/dirs
    - Create another set of files (1.5K files)
    - While creation of files/dirs are in progress Kill one brick
    - Remove the contents of the killed brick (simulating disk
      replacement)
    - When the IO's are still in progress, restart glusterd on the nodes
      where we simulated disk replacement to bring back bricks online
    - Start volume heal
    - Wait for IO's to complete
    - Verify whether the files are self-healed
    - Calculate arequals of the mount point and all the bricks
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    # Create dirs with files
    g.log.info('Creating dirs with files...')
    command = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "-d 2 -l 2 -n 2 -f 10 %s" % (self.script_upload_path,
                                            self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, command,
                        user=self.mounts[0].user)
    self.assertFalse(ret, err)
    g.log.info("IO is successful")

    # Creating another set of files (1.5K files)
    self.all_mounts_procs = []

    # Create files
    g.log.info('Creating 1.5K files...')
    command = ("/usr/bin/env python %s create_files "
               "-f 1500 --fixed-file-size 10k %s" % (
                   self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Remove the content of the killed bricks
    for brick in bricks_to_bring_offline:
        brick_node, brick_path = brick.split(':')

        # Removing files
        command = ('cd %s ; rm -rf *' % brick_path)
        ret, _, err = g.run(brick_node, command)
        self.assertFalse(ret, err)
        g.log.info('Files are deleted on brick %s', brick)

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process found or "
                         "more than one self heal daemon process found")
    g.log.info("All self-heal daemons are online")

    # Start healing
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Check arequals for "replicated"
    all_bricks = get_all_bricks(self.mnode, self.volname)
    if self.volume_type == "replicated":
        # Get arequal after bricks are online
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after successfully bringing '
                   'bricks online.')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        ret, arequals = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and brick '
                             'are not equal')
            g.log.info('Arequals for mountpoint and brick are equal')

    # Check arequals for "distributed-replicated"
    if self.volume_type == "distributed-replicated":
        # Get the subvolumes
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s: %s",
                   self.volname, num_subvols)

        # Get arequals and compare
        for i in range(0, num_subvols):
            # Get arequal for first brick
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
            self.assertTrue(ret, 'Failed to get arequal on first brick')
            first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with first brick
            ret, arequals = collect_bricks_arequal(subvol_brick_list)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(first_brick_total, brick_total,
                                 'Arequals for subvol and brick are '
                                 'not equal')
                g.log.info('Arequals for subvol and brick are equal')
def test_impact_of_replace_brick_for_glustershd(self):
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    nodes = self.volume['servers']
    replaced_bricks = []

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info with
    # glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %s",
               self.volname, num_subvols)

    # replace brick from each sub-vol
    for i in range(0, num_subvols):
        subvol_brick_list = subvols_dict['volume_subvols'][i]
        g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
        brick_to_replace = subvol_brick_list[-1]
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully',
                   brick_to_replace, new_brick)
        replaced_bricks.append(brick_to_replace)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on nodes "
               "%s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or"
                          " more than one self heal daemon process"
                          " found : %s" % pids))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare pids before and after replacing
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_replacement,
                        "Self Heal Daemon process is same before and"
                        " after replacing bricks")
    g.log.info("Self Heal Daemon Process is different before and "
               "after replacing bricks")

    # get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List after replacing "
               "bricks: %s", bricks_list_after_replacing)

    # validate the bricks present in volume info
    # with glustershd server volume file after replacing bricks
    g.log.info("Starting parsing file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)
    g.log.info("Starting to delete replaced brick dirs")

    # Remove brick directories of the replaced bricks as this is not
    # handled by the tearDown class
    for brick in replaced_bricks:
        node, brick_path = brick.split(':')
        cmd = "rm -rf " + brick_path
        ret, _, _ = g.run(node, cmd)
        if ret:
            raise ExecutionError("Failed to delete the brick dir for"
                                 " %s and brick %s" % (node, brick_path))
    g.log.info("Successfully deleted brick dirs of replaced bricks")
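do_bricks_exist_in_shd_volfile works by matching each brick against entries in glustershd-server.vol. For a quick manual cross-check on the management node, grepping the client-xlator options out of the volfile shows roughly the same information; the exact option names are an assumption about the volfile format, not a documented interface:

# Manual cross-check (assumes protocol/client entries in the glustershd
# server volfile carry remote-host and remote-subvolume options):
ret, out, _ = g.run(self.mnode,
                    "grep -E 'option remote-(host|subvolume)' "
                    "/var/lib/glusterd/glustershd/glustershd-server.vol")
g.log.info("glustershd volfile brick entries:\n%s", out)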
def test_conservative_merge_of_files_heal_command(self):
    """
    - set options:
      "metadata-self-heal": "off",
      "entry-self-heal": "off",
      "data-self-heal": "off",
      "self-heal-daemon": "off"
    - Bring brick 0 offline
    - Creating files on client side
    - Bring brick 0 online
    - Bring brick 1 offline
    - Creating files on client side
    - Bring brick 1 online
    - Get arequal on bricks
    - Setting option
      "self-heal-daemon": "on"
    - Start healing
    - Get arequal on bricks and compare with arequals before healing
      and mountpoint
    """
    # pylint: disable=too-many-statements,too-many-locals
    # set options
    bricks_list = get_all_bricks(self.mnode, self.volname)
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "self-heal-daemon": "off"}
    g.log.info("setting options %s", options)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Bring brick 0 offline
    g.log.info('Bringing bricks %s offline', bricks_list[0])
    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[0])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[0]])
    self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0])
    g.log.info('Bringing bricks %s offline is successful',
               bricks_list[0])

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_deep_dirs_with_files "
                   "-d 0 -l 5 -f 10 --dirname-start-num 1 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick 0 online
    g.log.info('Bringing bricks %s online...', bricks_list[0])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[0]])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_list[0])
    g.log.info('Bringing bricks %s online is successful', bricks_list[0])

    # Bring brick 1 offline
    g.log.info('Bringing bricks %s offline', bricks_list[1])
    ret = bring_bricks_offline(self.volname, bricks_list[1])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[1])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[1]])
    self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1])
    g.log.info('Bringing bricks %s offline is successful',
               bricks_list[1])

    # Creating files on client side
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_deep_dirs_with_files "
                   "-d 0 -l 5 -f 10 --dirname-start-num 6 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick 1 online
    g.log.info('Bringing bricks %s online...', bricks_list[1])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[1]])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_list[1])
    g.log.info('Bringing bricks %s online is successful', bricks_list[1])

    # Get arequal on bricks
    arequals_before_heal = {}
    g.log.info('Getting arequal on bricks...')
    for brick in bricks_list:
        g.log.info('Getting arequal on bricks %s...', brick)
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_before_heal[brick] = brick_total

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process found or "
                         "more than one self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal for mount after healing
    g.log.info('Getting arequal on the mount point after healing...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after healing is successful')
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    g.log.info('Getting arequal on bricks...')
    arequals_after_heal = {}
    for brick in bricks_list:
        g.log.info('Getting arequal on bricks %s...', brick)
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_after_heal[brick] = brick_total
        self.assertEqual(mount_point_total, brick_total,
                         'Arequals for mountpoint and %s are not equal'
                         % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)

    g.log.info('All arequals are equal for replicated')

    # cmp() is Python 2-only; compare the per-brick dicts directly
    self.assertNotEqual(arequals_before_heal, arequals_after_heal,
                        'Arequals are equal for bricks '
                        'before and after healing')
def test_data_self_heal_command(self):
    """
    Test Data-Self-Heal (heal command)

    Description:
    - get the client side healing volume options and check
      if they have already been disabled by default
      NOTE: Client side healing has been disabled by default
      since GlusterFS 6.0
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - Get arequal before getting bricks offline
    - set the volume option
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - set the volume option
      "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - create 1k files
    - while creating files - kill bricks and bring bricks online
      one by one in cycle
    - validate IO
    """
    # pylint: disable=too-many-statements

    # Checking if client side healing options are disabled by default
    g.log.info('Checking client side healing is disabled by default')
    options = ('cluster.metadata-self-heal', 'cluster.data-self-heal',
               'cluster.entry-self-heal')
    for option in options:
        ret = get_volume_options(self.mnode, self.volname, option)[option]
        self.assertIn(ret, ('off', 'off (DEFAULT)'),
                      "{} option is not disabled by default"
                      .format(option))
    g.log.info("Client side healing options are disabled by default")

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1k %s" % (
                       self.script_upload_path,
                       mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and '
                     'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 10k %s" % (
                       self.script_upload_path,
                       mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all processes are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process found or "
                         "more than one self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Create 1k files
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 1000 %s" % (
            self.script_upload_path,
            mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Kill all bricks in cycle
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing bricks %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Bricks %s are not offline' % brick)
        g.log.info('Bringing bricks %s offline is successful', brick)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
        g.log.info('Bringing bricks %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either no self heal daemon process found "
                             "or more than one self heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True
def test_data_self_heal_algorithm_diff_heal_command(self): """ Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff' Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" "data-self-heal-algorithm": "diff" "self-heal-daemon": "off" - create IO - calculate arequal - bring down all bricks processes from selected set - modify the data - get arequal before getting bricks online - bring bricks online - expand volume by adding bricks to the volume - do rebalance - set the volume option "self-heal-daemon": "on" and check for daemons - start healing - check if heal is completed - check for split-brain - calculate arequal and compare with arequal before bringing bricks offline and after bringing bricks online """ # pylint: disable=too-many-branches,too-many-statements # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "data-self-heal-algorithm": "diff" } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "'self-heal-daemon' " "are set to 'off'," "'data-self-heal-algorithm' " "is set to 'diff' successfully") # Creating files on client side all_mounts_procs = [] g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Creating files command = "/usr/bin/env python %s create_files -f 100 %s" % ( self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'self-heal-daemon' is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list( filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Modify the data all_mounts_procs = [] g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) command = ("/usr/bin/env python %s create_files -f 100 " "--fixed-file-size 1M %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = 
collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is successful') # Bring brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Expand volume by adding bricks to the volume g.log.info("Start adding bricks to volume...") ret = expand_volume(self.mnode, self.volname, self.servers, self.all_servers_info) self.assertTrue(ret, ("Failed to expand the volume when IO in " "progress on volume %s", self.volname)) g.log.info("Expanding volume is successful on volume %s", self.volname) # Do rebalance ret, _, _ = rebalance_start(self.mnode, self.volname) self.assertEqual(ret, 0, 'Failed to start rebalance') g.log.info('Rebalance is started') ret = wait_for_rebalance_to_complete(self.mnode, self.volname) self.assertTrue(ret, 'Rebalance is not completed') g.log.info('Rebalance is completed successfully') # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks offline # and after bringing bricks online self.assertItemsEqual(result_before_online, result_after_online, 'Checksums are not equal') g.log.info('Checksums are equal')
def test_self_heal_50k_files_heal_default(self): """ Test self-heal of 50k files by heal default Description: - bring down all bricks processes from selected set - create IO (50k files) - Get arequal before getting bricks online - check for daemons to come online - heal daemon should pick up entries to heal automatically - check if heal is completed - check for split-brain - get arequal after getting bricks online and compare with arequal before getting bricks online """ # pylint: disable=too-many-locals,too-many-statements # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create 50k files g.log.info('Creating files...') command = ("/usr/bin/env python %s create_files -f 50000 %s" % ( self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients" ) self.io_validation_complete = True # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is successful') # Bring brick online g.log.info('Bringing bricks %s online', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info("Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue(ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Default Heal testing, wait for shd to pick up healing # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname, timeout_period=3600) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = 
is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks online # and after bringing bricks online self.assertEqual(result_before_online, result_after_online, 'Checksums before and after bringing bricks online ' 'are not equal') g.log.info('Checksums before and after bringing bricks online ' 'are equal')
def test_glustershd_with_restarting_glusterd(self): """ Test Script to verify the self heal daemon process with restarting glusterd and rebooting the server * stop all volumes * restart glusterd - should not run self heal daemon process * start replicated involved volumes * single self heal daemon process running * restart glusterd * self heal daemon pid will change * bring down brick and restart glusterd * self heal daemon pid will change and its different from previous * brought up the brick """ # pylint: disable=too-many-statements nodes = self.volume['servers'] # stop the volume g.log.info("Stopping the volume %s", self.volname) ret = volume_stop(self.mnode, self.volname) self.assertTrue(ret, ("Failed to stop volume %s" % self.volname)) g.log.info("Successfully stopped volume %s", self.volname) # check the self heal daemon process after stopping the volume g.log.info("Verifying the self heal daemon process for " "volume %s", self.volname) ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname) self.assertFalse(ret, ("Self Heal Daemon process is still running " "even after stopping volume %s" % self.volname)) g.log.info("Self Heal Daemon is not running after stopping " "volume %s", self.volname) # restart glusterd service on all the servers g.log.info("Restarting glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) self.assertTrue( wait_for_glusterd_to_start(self.servers), "Failed to start glusterd on %s" % self.servers) # check the self heal daemon process after restarting glusterd process g.log.info("Starting to get self-heal daemon process on" " nodes %s", nodes) ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname) self.assertFalse(ret, ("Self Heal Daemon process is running after " "glusterd restart with volume %s in " "stop state" % self.volname)) g.log.info("Self Heal Daemon is not running after stopping " "volume and restarting glusterd %s", self.volname) # start the volume g.log.info("Starting the volume %s", self.volname) ret = volume_start(self.mnode, self.volname) self.assertTrue(ret, ("Failed to start volume %s" % self.volname)) g.log.info("Volume %s started successfully", self.volname) # Verfiy glustershd process releases its parent process g.log.info("Checking whether glustershd process is daemonized or not") ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) g.log.info("Single self heal daemon process on all nodes %s", nodes) # get the self heal daemon pids after starting volume g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) g.log.info("Successful in getting self heal daemon pids") glustershd_pids = pids # get the bricks for the volume g.log.info("Fetching bricks for the volume : %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List : %s", bricks_list) # validate the bricks present in volume info # with glustershd server volume file g.log.info("Starting parsing file %s on " "node %s", self.glustershd, self.mnode) ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick List from volume info is different from " 
"glustershd server volume file. " "Please check log file for details.")) g.log.info("Successfully parsed %s file", self.glustershd) # restart glusterd service on all the servers g.log.info("Restarting glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname, 60) self.assertTrue(ret, ("Volume %s : All process are not " "online", self.volname)) g.log.info("Successfully Verified volume %s processes are online", self.volname) # Verfiy glustershd process releases its parent process ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) # check the self heal daemon process after starting volume and # restarting glusterd process g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) glustershd_pids_after_glusterd_restart = pids self.assertNotEqual(glustershd_pids, glustershd_pids_after_glusterd_restart, ("Self Heal Daemon pids are same after " "restarting glusterd process")) g.log.info("Self Heal Daemon process are different before and " "after restarting glusterd process") # select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list(filter(None, ( bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # bring bricks offline g.log.info("Going to bring down the brick process " "for %s", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to bring down the bricks. 
Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", bricks_to_bring_offline) # restart glusterd after brought down the brick g.log.info("Restart glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname, 60) self.assertTrue(ret, ("Volume %s : All process are not " "online", self.volname)) g.log.info("Successfully Verified volume %s processes are online", self.volname) # Verfiy glustershd process releases its parent process ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) # check the self heal daemon process after killing brick and # restarting glusterd process g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) glustershd_pids_after_killing_brick = pids self.assertNotEqual(glustershd_pids_after_glusterd_restart, glustershd_pids_after_killing_brick, ("Self Heal Daemon process are same from before " "killing the brick,restarting glusterd process")) g.log.info("Self Heal Daemon process are different after killing the " "brick, restarting the glusterd process") # brought the brick online g.log.info("bringing up the bricks : %s online", bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to brought the bricks online")) g.log.info("Successfully brought the bricks online") # check all bricks are online g.log.info("Verifying all bricka are online or not.....") ret = are_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Not all bricks are online")) g.log.info("All bricks are online.")
def test_self_heal_symbolic_links(self): """ Test Self-Heal of Symbolic Links (heal command) Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" "data-self-heal-algorithm": "diff" "self-heal-daemon": "off" - create IO - calculate arequal - bring down all bricks processes from selected set - calculate arequals and compare with arequal before bringing bricks offline - modify the data and verify whether the links are properly created - calculate arequal before getting bricks online - bring bricks online - set the volume option "self-heal-daemon": "on" - check daemons and start healing - check is heal is complited - check for split-brain - calculate arequal after getting bricks online and compare with arequal before getting bricks online """ # pylint: disable=too-many-locals,too-many-statements # Setting options g.log.info('Setting options...') options = {"metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "'self-heal-daemon' " "are set to 'off' successfully") # Creating files on client side all_mounts_procs = [] test_sym_link_self_heal_folder = 'test_sym_link_self_heal' g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Creating files command = ("cd %s/ ; " "mkdir %s ; " "cd %s/ ;" "for i in `seq 1 5` ; " "do mkdir dir.$i ; " "for j in `seq 1 10` ; " "do dd if=/dev/urandom of=dir.$i/file.$j " "bs=1k count=$j ; " "done ; " "done ;" % (self.mounts[0].mountpoint, test_sym_link_self_heal_folder, test_sym_link_self_heal_folder)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # wait for io to complete self.assertTrue( wait_for_io_to_complete(all_mounts_procs, self.mounts), "Io failed to complete on some of the clients") # Get arequal before getting bricks offline g.log.info('Getting arequal before getting bricks offline...') ret, result_before_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks offline ' 'is successful') # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Get arequal after getting bricks offline g.log.info('Getting arequal after getting bricks offline...') ret, result_after_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks offline ' 'is successful') # Checking arequals before bringing bricks offline # and after bringing bricks offline self.assertEqual(sorted(result_before_offline), sorted(result_after_offline), 'Checksums before and after bringing bricks ' 'online are not 
equal') g.log.info('Checksums before and after bringing bricks online ' 'are equal') # Modify the data g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Create symlinks g.log.info('Creating symlinks...') command = ("cd %s/%s/ ; " "for i in `seq 1 5` ; " "do ln -s dir.$i sym_link_dir.$i ; " "done ;" % (self.mounts[0].mountpoint, test_sym_link_self_heal_folder)) ret, _, _ = g.run(self.mounts[0].client_system, command) self.assertEqual(ret, 0, 'Failed to modify the data for %s...' % self.mounts[0].mountpoint) g.log.info('Modifying the data for %s is successful', self.mounts[0].mountpoint) # Verify whether the links are properly created # Get symlink list command = ("cd %s/%s/ ; " "ls |grep 'sym'" % (self.mounts[0].mountpoint, test_sym_link_self_heal_folder)) _, out, _ = g.run(self.mounts[0].client_system, command) symlink_list = out.strip().split('\n') # Get folder list command = ("cd %s/%s/ ; " "ls |grep -v 'sym'" % (self.mounts[0].mountpoint, test_sym_link_self_heal_folder)) _, out, _ = g.run(self.mounts[0].client_system, command) folder_list = out.strip().split('\n') # Compare symlinks and folders for symlink in symlink_list: symlink_index = symlink_list.index(symlink) command = ("cd %s/%s/ ; " "readlink %s" % (self.mounts[0].mountpoint, test_sym_link_self_heal_folder, symlink)) _, out, _ = g.run(self.mounts[0].client_system, command) symlink_to_folder = out.strip() self.assertEqual(symlink_to_folder, folder_list[symlink_index], 'Links are not properly created') g.log.info('Links for %s are properly created', self.mounts[0].mountpoint) # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is successful') # Bring brick online g.log.info('Bringing bricks %s online', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info("Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue(ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # 
Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks online # and after bringing bricks online self.assertEqual(sorted(result_before_online), sorted(result_after_online), 'Checksums before and after bringing bricks ' 'online are not equal') g.log.info('Checksums before and after bringing bricks online ' 'are equal')
def test_entry_self_heal_heal_command(self): """ Test Entry-Self-Heal (heal command) Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" - create IO - get arequal before getting bricks offline - set the volume option "self-heal-daemon": "off" - bring down all bricks processes from selected set - get arequal after getting bricks offline and compare with arequal after bringing bricks offline - modify the data - get arequal before getting bricks online - bring bricks online - set the volume option "self-heal-daemon": "on" - check daemons and start healing - check if heal is completed - check for split-brain - get arequal after getting bricks online and compare with arequal before bringing bricks online """ # pylint: disable=too-many-statements # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off" } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "are set to 'off'") # Start IO on mounts g.log.info("Starting IO on all mounts...") g.log.info("Starting IO on %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) cmd = ("python %s create_deep_dirs_with_files " "--dir-length 2 " "--dir-depth 2 " "--max-num-of-dirs 2 " "--num-of-files 20 %s/files" % (self.script_upload_path, self.mounts[0].mountpoint)) ret, _, err = g.run(self.mounts[0].client_system, cmd, user=self.mounts[0].user) self.assertFalse( ret, 'Failed to create the data for %s: %s' % (self.mounts[0].mountpoint, err)) g.log.info('Created IO for %s is successfully', self.mounts[0].mountpoint) # Command list to do different operations with data - # create, rename, copy and delete cmd_list = [ "python %s create_files -f 20 %s/files", "python %s mv %s/files", # 'copy' command works incorrect. 
disable until fixed # "python %s copy --dest-dir %s/new_dir %s/files", "python %s delete %s" ] for cmd in cmd_list: # Get arequal before getting bricks offline g.log.info('Getting arequal before getting bricks offline...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') result_before_offline = arequals[0].splitlines()[-1].split(':')[-1] g.log.info('Getting arequal before getting bricks offline ' 'is successful') # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' " "is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = filter( None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks'])) # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Get arequal after getting bricks offline g.log.info('Getting arequal after getting bricks offline...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') result_after_offline = arequals[0].splitlines()[-1].split(':')[-1] g.log.info('Getting arequal after getting bricks offline ' 'is successful') # Checking arequals before bringing bricks offline # and after bringing bricks offline self.assertEqual(result_before_offline, result_after_offline, 'Checksums are not equal') g.log.info('Checksums before bringing bricks offline ' 'and after bringing bricks offline are equal') # Modify the data g.log.info("Start modifying IO on all mounts...") g.log.info("Modifying IO on %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) if 'copy --dest-dir' in cmd: parsed_cmd = cmd % (self.script_upload_path, self.mounts[0].mountpoint, self.mounts[0].mountpoint) else: parsed_cmd = cmd % (self.script_upload_path, self.mounts[0].mountpoint) ret, _, err = g.run(self.mounts[0].client_system, parsed_cmd, user=self.mounts[0].user) self.assertFalse( ret, 'Failed to modify the data for %s: %s' % (self.mounts[0].mountpoint, err)) g.log.info('Modified IO for %s is successfully', self.mounts[0].mountpoint) # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') result_before_online = arequals[0].splitlines()[-1].split(':')[-1] g.log.info('Getting arequal before getting bricks online ' 'is successful') # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(self.mounts) if not ret: raise ExecutionError("Failed to list all files and dirs") g.log.info("Listing all files and directories is successful") # Bring brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, 
bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online( self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') result_after_online = arequals[0].splitlines()[-1].split(':')[-1] g.log.info('Getting arequal after getting bricks online ' 'is successful') # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(self.mounts) if not ret: raise ExecutionError("Failed to list all files and dirs") g.log.info("Listing all files and directories is successful") # Checking arequals before bringing bricks online # and after bringing bricks online self.assertEqual(result_before_online, result_after_online, 'Checksums are not equal') g.log.info('Checksums before bringing bricks online ' 'and after bringing bricks online are equal')
def test_replacing_all_arbiters(self): """ - Create an arbiter volume 4(2+1) distributed replicate - Start writing IO - While the I/O's are going on replace all the arbiter bricks - check for the new bricks attached successfully - Check for heals - Validate IO """ # pylint: disable=too-many-locals,too-many-statements # get the bricks for the volume g.log.info("Fetching bricks for the volume: %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick list: %s", bricks_list) # Clear all brick folders. Its need to prevent healing with old files for brick in bricks_list: g.log.info('Clearing brick %s', brick) node, brick_path = brick.split(':') ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path) self.assertFalse(ret, err) g.log.info('Clearing brick %s is successful', brick) g.log.info('Clearing for all brick is successful') # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create dirs with file g.log.info('Creating dirs with file...') command = ("/usr/bin/env python %s create_deep_dirs_with_files " "-d 3 -l 3 -n 3 -f 20 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # replace bricks subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] for subvol in subvols: g.log.info('Replacing arbiter brick for %s', subvol) brick_to_replace = subvol[-1] self.bricks_to_clean.append(brick_to_replace) new_brick = brick_to_replace + 'new' g.log.info("Replacing the brick %s for the volume: %s", brick_to_replace, self.volname) ret, _, err = replace_brick(self.mnode, self.volname, brick_to_replace, new_brick) self.assertFalse(ret, err) g.log.info('Replaced brick %s to %s successfully', brick_to_replace, new_brick) # check replaced bricks subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] index = 0 for subvol in subvols: expected_brick_path = self.bricks_to_clean[index] + 'new' brick_to_check = subvol[-1] self.assertEqual(expected_brick_path, brick_to_check, 'Brick %s is not replaced brick' % brick_to_check) index += 1 # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s: All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) 
self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Validate IO ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True
def test_glustershd_on_newly_probed_server(self): """ Test script to verify glustershd process on newly probed server * check glustershd process - only 1 glustershd process should be running * Add new node to cluster * check glustershd process - only 1 glustershd process should be running on all servers inclusing newly probed server * stop the volume * add another node to cluster * check glustershd process - glustershd process shouldn't be running on servers including newly probed server * start the volume * check glustershd process - only 1 glustershd process should be running on all servers inclusing newly probed server """ # pylint: disable=too-many-statements nodes = self.volume['servers'][:-2] # check the self-heal daemon process g.log.info("Starting to get self heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either no self heal daemon process found or " "more than one self heal daemon process " "found : %s" % pids)) g.log.info( "Successful in getting single self heal daemon process" " on all nodes %s", nodes) # Add new node to the cluster g.log.info("Peer probe for %s", self.extra_servers[0]) ret = peer_probe_servers(self.mnode, self.extra_servers[0]) self.assertTrue( ret, "Failed to peer probe server : %s" % self.extra_servers[0]) g.log.info( "Peer probe success for %s and all peers are in " "connected state", self.extra_servers[0]) nodes.append(self.extra_servers[0]) # check the self-heal daemon process and it should be running on # newly probed servers g.log.info("Starting to get self-heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either no self heal daemon process found or " "more than one self heal daemon process " "found : %s" % pids)) g.log.info( "Successful in getting single self heal daemon process" " on all nodes %s", nodes) # stop the volume g.log.info("Stopping the volume %s", self.volname) ret = volume_stop(self.mnode, self.volname) self.assertTrue(ret, ("Failed to stop volume %s" % self.volname)) g.log.info("Successfully stopped volume %s", self.volname) # Add another new node to the cluster g.log.info("peer probe for %s", self.extra_servers[1]) ret = peer_probe_servers(self.mnode, self.extra_servers[1]) self.assertTrue( ret, "Failed to peer probe server : %s" % self.extra_servers[1]) g.log.info( "Peer probe success for %s and all peers are in " "connected state", self.extra_servers[1]) nodes.append(self.extra_servers[1]) # check the self-heal daemon process after stopping volume and # no self heal daemon should be running including newly probed node g.log.info("Starting to get self-heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertFalse(ret, ("Self Heal Daemon process is running even " "after stopping volume %s" % self.volname)) for node in pids: self.assertEquals(pids[node][0], -1, ("Self Heal Daemon is still " "running on node %s even " "after stopping all " "volumes" % node)) g.log.info("Expected : No self heal daemon process is running " "after stopping all volumes") # start the volume g.log.info("Starting volume %s", self.volname) ret = volume_start(self.mnode, self.volname) self.assertTrue(ret, ("Failed to start volume %s" % self.volname)) g.log.info("Volume %s started successfully", self.volname) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname, 60) 
self.assertTrue(ret, ("Volume %s : All process are not " "online", self.volname)) g.log.info("Successfully Verified volume %s processes are online", self.volname) # Verfiy glustershd process releases its parent process g.log.info("verifying self heal daemon process is daemonized") ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either no self heal daemon process found or " "more than one self heal daemon process " "found : %s" % pids)) # check the self-heal daemon process g.log.info("Starting to get self-heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either no self heal daemon process found or " "more than one self heal daemon process " "found : %s" % pids)) g.log.info( "Successful in getting single self heal daemon process" " on all nodes %s", nodes) # detach extra servers from the cluster g.log.info("peer detaching extra servers %s from cluster", self.extra_servers) ret = peer_detach_servers(self.mnode, self.extra_servers) self.assertTrue( ret, "Failed to peer detach extra servers : %s" % self.extra_servers) g.log.info("Peer detach success for %s ", self.extra_servers)
def setUpClass(cls): """ setup volume and initialize necessary variables which is used in tests """ # calling GlusterBaseClass setUpClass cls.get_super_method(cls, 'setUpClass')() cls.default_volume_type_config = { 'replicated': { 'type': 'replicated', 'replica_count': 2, 'transport': 'tcp' }, 'dispersed': { 'type': 'dispersed', 'disperse_count': 6, 'redundancy_count': 2, 'transport': 'tcp' }, 'distributed': { 'type': 'distributed', 'dist_count': 2, 'transport': 'tcp' }, 'distributed-replicated': { 'type': 'distributed-replicated', 'dist_count': 2, 'replica_count': 3, 'transport': 'tcp' } } # Setup Volume for all the volume types cls.volume_configs = [] for volume_type in cls.default_volume_type_config: cls.volume_configs.append({ 'name': 'testvol_%s' % volume_type, 'servers': cls.servers, 'voltype': cls.default_volume_type_config[volume_type] }) for volume_config in cls.volume_configs: ret = setup_volume(mnode=cls.mnode, all_servers_info=cls.all_servers_info, volume_config=volume_config) volname = volume_config['name'] if not ret: raise ExecutionError("Failed to setup Volume" " %s" % volname) g.log.info("Successful in setting volume %s", volname) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(cls.mnode, volname, 60) if not ret: raise ExecutionError("Volume %s : All process are not online" % volname) g.log.info("Successfully Verified volume %s processes are online", volname) # Verfiy glustershd process releases its parent process g.log.info("Verifying Self Heal Daemon process is daemonized") ret = is_shd_daemonized(cls.servers) if not ret: raise ExecutionError("Self Heal Daemon process was still" " holding parent process.") g.log.info("Self Heal Daemon processes are online") cls.GLUSTERSHD = "/var/lib/glusterd/glustershd/glustershd-server.vol"
def test_existing_glustershd_should_take_care_of_self_healing(self): """ Test Script which verifies that the existing glustershd should take care of self healing * Create and start the Replicate volume * Check the glustershd processes - Note the pids * Bring down the One brick ( lets say brick1) without affecting the cluster * Create 1000 files on volume * bring the brick1 up which was killed in previous steps * check the heal info - proactive self healing should start * Bring down brick1 again * wait for 60 sec and brought up the brick1 * Check the glustershd processes - pids should be different * Monitor the heal till its complete """ # pylint: disable=too-many-locals,too-many-lines,too-many-statements nodes = self.servers # check the self-heal daemon process g.log.info("Starting to get self-heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process " "found : %s" % pids)) g.log.info( "Successful in getting Single self heal daemon process" " on all nodes %s", nodes) glustershd_pids = pids # select the bricks to bring offline g.log.info("Selecting bricks to brought offline for volume %s", self.volname) bricks_to_bring_offline = \ select_volume_bricks_to_bring_offline(self.mnode, self.volname) g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline) # Bring down the selected bricks g.log.info("Going to bring down the brick process " "for %s", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to bring down the bricks. Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", bricks_to_bring_offline) # get the bricks which are running g.log.info("getting the brick list which are online") online_bricks = get_online_bricks_list(self.mnode, self.volname) g.log.info("Online Bricks for volume %s : %s", self.volname, online_bricks) # write 1MB files to the mounts g.log.info("Starting IO on all mounts...") g.log.info("mounts: %s", self.mounts) all_mounts_procs = [] cmd = ("for i in `seq 1 1000`; " "do dd if=/dev/urandom of=%s/file_$i " "bs=1M count=1; " "done" % self.mounts[0].mountpoint) g.log.info(cmd) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # check the heal info g.log.info("Get the pending heal info for the volume %s", self.volname) heal_info = get_heal_info_summary(self.mnode, self.volname) g.log.info("Successfully got heal info for the volume %s", self.volname) g.log.info("Heal Info for volume %s : %s", self.volname, heal_info) # Bring bricks online g.log.info("Bring bricks: %s online", bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline, 'glusterd_restart') self.assertTrue( ret, ("Failed to bring bricks: %s online" % bricks_to_bring_offline)) g.log.info("Successfully brought all bricks: %s online", bricks_to_bring_offline) # Wait for 90 sec to start self healing g.log.info('Waiting for 90 sec to start self healing') time.sleep(90) # check the heal info g.log.info("Get the pending heal info for the volume %s", self.volname) heal_info_after_brick_online = get_heal_info_summary( self.mnode, self.volname) g.log.info("Successfully got heal info for the volume %s", self.volname) 
g.log.info("Heal Info for volume %s : %s", self.volname, heal_info_after_brick_online) # check heal pending is decreased flag = False for brick in online_bricks: if int(heal_info_after_brick_online[brick]['numberOfEntries'])\ < int(heal_info[brick]['numberOfEntries']): flag = True break self.assertTrue(flag, "Pro-active self heal is not started") g.log.info("Pro-active self heal is started") # bring down bricks again g.log.info("Going to bring down the brick process " "for %s", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to bring down the bricks. Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", bricks_to_bring_offline) # wait for 60 sec and brought up the brick again g.log.info('waiting for 60 sec and brought up the brick again') time.sleep(60) g.log.info("Bring bricks: %s online", bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline, 'glusterd_restart') self.assertTrue( ret, ("Failed to bring bricks: %s online" % bricks_to_bring_offline)) g.log.info("Successfully brought all bricks: %s online", bricks_to_bring_offline) # Verfiy glustershd process releases its parent process ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) # check the self-heal daemon process g.log.info("Starting to get self-heal daemon process on " "nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process " "found : %s" % pids)) g.log.info( "Successful in getting Single self heal daemon process" " on all nodes %s", nodes) shd_pids_after_bricks_online = pids # compare the glustershd pids self.assertNotEqual(glustershd_pids, shd_pids_after_bricks_online, ("self heal daemon process are same before and " "after bringing up bricks online")) g.log.info("EXPECTED : self heal daemon process are different before " "and after bringing up bricks online") # wait for heal to complete g.log.info("Monitoring the heal.....") ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, ("Heal is not completed on volume %s" % self.volname)) g.log.info("Heal Completed on volume %s", self.volname) # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully')
def test_metadata_self_heal(self): """ Test MetaData Self-Heal (heal command) Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" - create IO - set the volume option "self-heal-daemon": "off" - bring down all bricks processes from selected set - Change the permissions, ownership and the group of the files under "test_meta_data_self_heal" folder - get arequal before getting bricks online - bring bricks online - set the volume option "self-heal-daemon": "on" - check daemons and start healing - check is heal is completed - check for split-brain - get arequal after getting bricks online and compare with arequal before getting bricks online - check group and user are 'qa' """ # pylint: disable=too-many-locals,too-many-statements # Setting options g.log.info('Setting options...') options = {"metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "are set to 'off' successfully") # Creating files on client side all_mounts_procs = [] test_meta_data_self_heal_folder = 'test_meta_data_self_heal' g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Create files g.log.info('Creating files...') command = ("cd %s/ ; " "mkdir %s ;" "cd %s/ ;" "for i in `seq 1 50` ; " "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; " "done ;" % (self.mounts[0].mountpoint, test_meta_data_self_heal_folder, test_meta_data_self_heal_folder)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # wait for io to complete self.assertTrue( wait_for_io_to_complete(all_mounts_procs, self.mounts), "Io failed to complete on some of the clients") # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'self-heal-daemon' is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list(filter(None, ( bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Changing the permissions, ownership and the group # of the files under "test_meta_data_self_heal" folder g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Change permissions to 444 g.log.info('Changing permissions...') command = ("cd %s/%s/ ; " "chmod -R 444 *" % (self.mounts[0].mountpoint, test_meta_data_self_heal_folder)) ret, out, err = g.run(self.mounts[0].client_system, command) self.assertEqual(ret, 0, err) g.log.info('Permissions are changed successfully') # Change the ownership to qa g.log.info('Changing the 
ownership...')
    command = ("cd %s/%s/ ; "
               "chown -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, out, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Ownership is changed successfully')

    # Change the group to qa
    g.log.info('Changing the group...')
    command = ("cd %s/%s/ ; "
               "chgrp -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, out, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Group is changed successfully')

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring bricks online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Volume %s processes are not online despite "
                         "waiting for 5 minutes" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "No self-heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Check arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks online '
                     'are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')

    # Add servers and client in a single dict to check permissions
    nodes_to_check = {}
    all_bricks = get_all_bricks(self.mnode, self.volname)
    for brick in all_bricks:
        node, brick_path = brick.split(':')
        nodes_to_check[node] = brick_path
    nodes_to_check[self.mounts[0].client_system] = \
        self.mounts[0].mountpoint

    # Check the user and group on every node
    for node in nodes_to_check:
        # Get file list
        command = ("cd %s/%s/ ; "
                   "ls"
                   % (nodes_to_check[node],
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(node, command)
        self.assertEqual(ret, 0, err)
        file_list = out.split()

        for file_name in file_list:
            file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                          test_meta_data_self_heal_folder,
                                          file_name)

            g.log.info('Checking permissions, user and group for %s',
                       file_name)

            # Check for permissions
            cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                   .format(file_to_check))
            ret, permissions, _ = g.run(node, cmd)
            self.assertEqual(permissions.split('\n')[0], '444',
                             'Permissions %s are not equal to 444'
                             % permissions)
            g.log.info("Permissions are '444' for %s", file_name)

            # Check for user
            cmd = ("ls -ld {} | awk '{{print $3}}'"
                   .format(file_to_check))
            ret, username, _ = g.run(node, cmd)
            self.assertEqual(username.split('\n')[0], 'qa',
                             'User %s is not equal to qa' % username)
            g.log.info("User is 'qa' for %s", file_name)

            # Check for group
            cmd = ("ls -ld {} | awk '{{print $4}}'"
                   .format(file_to_check))
            ret, groupname, _ = g.run(node, cmd)
            self.assertEqual(groupname.split('\n')[0], 'qa',
                             'Group %s is not equal to qa' % groupname)
            g.log.info("Group is 'qa' for %s", file_name)
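# A minimal sketch (not part of the original test): the per-file checks
# above shell out three times per file (stat plus two ls/awk pipes). A single
# "stat -c '%a %U %G'" call returns all three attributes at once. The helper
# name `_verify_file_attrs` is hypothetical; it assumes the same `g.run`
# transport and unittest assertions used throughout this module.
def _verify_file_attrs(self, node, file_path,
                       perms='444', user='qa', group='qa'):
    """Assert access bits, owner and group of a file on a given node."""
    # %a = octal permissions, %U = owner name, %G = group name
    ret, out, err = g.run(node, "stat -c '%%a %%U %%G' %s" % file_path)
    self.assertEqual(ret, 0, err)
    actual_perms, actual_user, actual_group = out.split()
    self.assertEqual(actual_perms, perms,
                     'Permissions %s are not %s for %s'
                     % (actual_perms, perms, file_path))
    self.assertEqual(actual_user, user,
                     'User %s is not %s for %s'
                     % (actual_user, user, file_path))
    self.assertEqual(actual_group, group,
                     'Group %s is not %s for %s'
                     % (actual_group, group, file_path))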
def test_self_heal_50k_files_heal_command_by_add_brick(self):
    """
    Test self-heal of 50k files (heal command)

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - create IO (50k files)
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - add bricks
    - do rebalance
    - get arequal after adding bricks and compare with
      arequal after getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Create files on the client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create 50k files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 50000 %s" % (
            self.script_upload_path,
            mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "No self-heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Check arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks online '
                     'are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to expand the volume when IO is in "
                         "progress on volume %s" % self.volname)
    g.log.info("Expanding volume is successful on volume %s",
               self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Get arequal after adding bricks
    g.log.info('Getting arequal after adding bricks...')
    ret, result_after_adding_bricks = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after adding bricks '
               'is successful')

    # Check arequals after bringing bricks online
    # and after adding bricks
    self.assertEqual(sorted(result_after_online),
                     sorted(result_after_adding_bricks),
                     'Checksums after bringing bricks online and '
                     'after adding bricks are not equal')
    g.log.info('Checksums after bringing bricks online and '
               'after adding bricks are equal')
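# A conceptual sketch (not part of the original test): collect_mounts_arequal
# shells out to the external `arequal-checksum` tool on each mount. The idea
# it implements — one deterministic checksum over an entire directory tree,
# so that two replicas can be compared with a single string equality — can be
# approximated in plain Python as below. This illustrates the concept only;
# it is not the actual arequal algorithm or its output format.
def _tree_checksum(root):
    """Hash file paths and contents of a tree in deterministic order."""
    import hashlib
    import os

    digest = hashlib.sha256()
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # force a stable traversal order
        for name in sorted(filenames):
            path = os.path.join(dirpath, name)
            digest.update(os.path.relpath(path, root).encode())
            with open(path, 'rb') as handle:
                for chunk in iter(lambda: handle.read(1 << 20), b''):
                    digest.update(chunk)
    return digest.hexdigest()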
def test_self_heal_differing_in_file_type(self):
    """
    Test self-heal of files differing in file type
    with default configuration

    Description:
    - create IO
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequal and compare with arequal before
      bringing bricks offline
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - check daemons and monitor heal completion
    - calculate arequal and compare with arequal before bringing bricks
      online and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Create files on the client side
    all_mounts_procs = []
    test_file_type_differs_self_heal_folder = \
        'test_file_type_differs_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Create a nested directory tree with files of varying sizes
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=1k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Wait for IO to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "IO failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring bricks offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Check arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(sorted(result_before_offline),
                     sorted(result_after_offline),
                     'Checksums before and after bringing bricks'
                     ' offline are not equal')
    g.log.info('Checksums before and after '
               'bringing bricks offline are equal')

    # Modify the data: replace each file with a directory of the same name
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 10` ; "
               "do for j in `seq 1 5` ; "
               "do for k in `seq 1 10` ; "
               "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
               "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
               "done ; "
               "done ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "No self-heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Check arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks'
                     ' online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
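# A minimal sketch (not part of the original test): the monitor-heal /
# heal-complete / split-brain triple recurs verbatim in every test of this
# module and could be bundled into one hypothetical helper. It assumes only
# the glustolibs heal APIs already imported by this file; `timeout` feeds
# monitor_heal_completion's timeout_period argument, as used in the
# 50k-files test above.
def _assert_healed(self, timeout=1200):
    """Wait for heal to finish and assert the volume is consistent."""
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=timeout)
    self.assertTrue(ret, 'Heal has not yet completed')
    self.assertTrue(is_heal_complete(self.mnode, self.volname),
                    'Heal is not complete')
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     'Volume is in split-brain state')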
def test_data_self_heal_daemon_off(self):
    """
    Test Data-Self-Heal (heal command)

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - get arequal before getting bricks offline
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - add bricks
    - do rebalance
    - create 1k files
    - while creating files, kill bricks and bring them back online
      one by one in a cycle
    - validate IO
    """
    # pylint: disable=too-many-statements,too-many-locals
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Create files on the client side
    g.log.info("Starting IO on %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    cmd = ("/usr/bin/env python %s create_files -f 100"
           " --fixed-file-size 1k %s" % (
               self.script_upload_path,
               self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, cmd,
                        user=self.mounts[0].user)
    self.assertFalse(ret, 'Failed to create the data for %s: %s'
                     % (self.mounts[0].mountpoint, err))
    g.log.info('Successfully created IO for %s',
               self.mounts[0].mountpoint)

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    result_before_offline = arequals[0].splitlines()[-1].split(':')[-1]
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(
        None,
        (bricks_to_bring_offline_dict['hot_tier_bricks'] +
         bricks_to_bring_offline_dict['cold_tier_bricks'] +
         bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    result_after_offline = arequals[0].splitlines()[-1].split(':')[-1]
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Check arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and '
                     'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    cmd = ("/usr/bin/env python %s create_files -f 100"
           " --fixed-file-size 10k %s" % (
               self.script_upload_path,
               self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, cmd,
                        user=self.mounts[0].user)
    self.assertFalse(ret, 'Failed to modify the data for %s: %s'
                     % (self.mounts[0].mountpoint, err))
    g.log.info('Successfully modified IO for %s',
               self.mounts[0].mountpoint)

    # Bring bricks online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "No self-heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
    g.log.info("Expanding volume is successful on "
               "volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Create 1k files
    all_mounts_procs = []
    g.log.info("Creating 1000 files on %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 1000"
               " --base-file-name newfile %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Kill all bricks in a cycle
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing brick %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring brick %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Brick %s is not offline' % brick)
        g.log.info('Bringing brick %s offline is successful', brick)

        # Introduce a 30-second window with the brick down and IO ongoing
        g.log.info("Waiting for 30 seconds, with ongoing IO while "
                   "brick %s is offline", brick)
        sleep(30)

        # Bring brick online
        g.log.info('Bringing brick %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick)
        g.log.info('Bringing brick %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes "
                             "to be online" % self.volname)
        g.log.info("Successful in waiting for volume %s processes to "
                   "be online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(
            self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s : All process are not online"
                        % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either no self-heal daemon process found "
                             "or more than one self-heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
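# A minimal sketch (not part of the original test): the offline/sleep/online
# cycle from the loop above, factored into a hypothetical helper. `interval`
# mirrors the hard-coded 30-second window during which IO keeps running
# against the degraded volume; the brick APIs are the ones already imported
# by this module.
def _cycle_brick(self, brick, interval=30):
    """Take one brick offline, wait with IO ongoing, bring it back."""
    ret = bring_bricks_offline(self.volname, [brick])
    self.assertTrue(ret, 'Failed to bring brick %s offline' % brick)
    ret = are_bricks_offline(self.mnode, self.volname, [brick])
    self.assertTrue(ret, 'Brick %s is not offline' % brick)
    sleep(interval)
    ret = bring_bricks_online(self.mnode, self.volname, [brick])
    self.assertTrue(ret, 'Failed to bring brick %s online' % brick)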
def test_data_self_heal_algorithm_full_default(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

    Description:
    - set the volume option "data-self-heal-algorithm" to value "full"
    - create IO
    - bring down all bricks processes from selected set
    - modify the data
    - calculate arequal
    - bring bricks online
    - start healing
    - calculate arequal and compare with arequal before bringing bricks
      offline and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options
    g.log.info('Setting options "data-self-heal-algorithm": "full"...')
    options = {"data-self-heal-algorithm": "full"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
               "successfully")

    # Create files on the client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path,
        self.mounts[0].mountpoint)

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(
        None,
        (bricks_to_bring_offline_dict['hot_tier_bricks'] +
         bricks_to_bring_offline_dict['cold_tier_bricks'] +
         bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s"
               % (self.script_upload_path,
                  self.mounts[0].mountpoint))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring bricks online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "No self-heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Check arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks online '
                     'are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')
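# Note (not part of the original test): set_volume_options above is the
# library-side equivalent of the CLI command
#
#     gluster volume set <VOLNAME> cluster.data-self-heal-algorithm full
#
# With 'full', the self-heal daemon copies the whole file from a good copy;
# the alternative 'diff' algorithm checksums file blocks on each replica and
# rewrites only the blocks that differ.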
def test_impact_of_replace_brick_for_glustershd(self):
    nodes = self.volume['servers']

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, "Either no self-heal daemon process found or "
                         "more than one self-heal daemon process "
                         "found : %s" % pids)
    g.log.info("Successful in getting a single self-heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list : %s", bricks_list)

    # Validate the bricks present in volume info with
    # glustershd server volume file
    g.log.info("Starting to parse file %s on "
               "node %s", self.GLUSTERSHD, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from the glustershd server volume file. "
                          "Please check the log file for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)

    # Replace brick
    brick_to_replace = bricks_list[-1]
    new_brick = brick_to_replace + 'new'
    g.log.info("Replacing the brick %s for the volume : %s",
               brick_to_replace, self.volname)
    ret, out, err = replace_brick(self.mnode, self.volname,
                                  brick_to_replace, new_brick)
    self.assertFalse(ret, err)
    g.log.info('Replaced brick %s with %s successfully',
               brick_to_replace, new_brick)

    # Check bricks
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertEqual(bricks_list[-1], new_brick,
                     'Replaced brick and new brick are not equal')

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Successfully verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, "Either no self-heal daemon process found or "
                         "more than one self-heal daemon process found")

    # Check the self-heal daemon process after replacing the brick
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, "Either no self-heal daemon process found or "
                         "more than one self-heal daemon process "
                         "found : %s" % pids)
    g.log.info("Successful in getting a single self-heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare pids before and after replacing
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_replacement,
                        "Self-heal daemon process is the same before and"
                        " after replacing bricks")
    g.log.info("Self-heal daemon process is different before and "
               "after replacing bricks")

    # Get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after replacing the brick: %s",
               bricks_list_after_replacing)

    # Validate the bricks present in volume info
    # with glustershd server volume file after replacing bricks
    g.log.info("Starting to parse file %s", self.GLUSTERSHD)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from the glustershd server volume file after "
                          "replacing bricks. Please check the log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)
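# A simplified sketch (not part of the original test) of what
# do_bricks_exist_in_shd_volfile checks: every brick's host and export path
# should appear in the glustershd server volume file (as remote-host /
# remote-subvolume options). The helper below is hypothetical and far cruder
# than the library's parser — it only greps the volfile text.
def _bricks_in_volfile(mnode, volfile, bricks_list):
    """Return True if every brick's host and path occur in the volfile."""
    ret, out, _ = g.run(mnode, 'cat %s' % volfile)
    if ret != 0:
        return False
    for brick in bricks_list:
        host, path = brick.split(':')
        if host not in out or path not in out:
            return False
    return True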