def _restart_volume_and_bring_all_offline_bricks_online(self):
    """Bring ``self.bricks_to_bring_offline`` back online via a forced start.

    Asserts that ``is_heal_complete`` is falsy beforehand (heal is expected
    to still be pending), brings the bricks online using the
    ``volume_start_force`` method and asserts that ``are_bricks_online``
    reports them online afterwards.
    """
    mnode, volname = self.mnode, self.volname

    # Heal is expected to be pending at this point
    heal_done = is_heal_complete(mnode, volname)
    self.assertFalse(heal_done, 'Heal is completed')
    g.log.info('Heal is pending')

    offline_bricks = self.bricks_to_bring_offline
    started = bring_bricks_online(
        mnode, volname, offline_bricks,
        bring_bricks_online_methods=['volume_start_force'])
    self.assertTrue(
        started, 'Failed to bring bricks %s online' % offline_bricks)

    # Confirm the bricks are actually reported online
    online = are_bricks_online(mnode, volname, offline_bricks)
    self.assertTrue(
        online,
        'Bricks not online %s even after restart' % offline_bricks)
    g.log.info('Bringing bricks %s online is successful', offline_bricks)
def _bring_bricks_online_heal(self, mnode, volname, bricks_list):
    """Bring bricks online and check that the volume heals cleanly.

    Args:
        mnode: node passed to every helper call.
        volname: name of the volume under test.
        bricks_list: bricks handed to ``bring_bricks_online``.

    The test fails if bringing the bricks online, waiting for / verifying
    the volume processes or ``monitor_heal_completion`` returns a falsy
    value, or if ``is_volume_in_split_brain`` returns a truthy one.
    """
    # Force-start the volume to get the bricks back
    started = bring_bricks_online(
        mnode, volname, bricks_list,
        bring_bricks_online_methods=['volume_start_force'])
    self.assertTrue(started, 'Failed to bring bricks online')

    # Volume processes must come up ...
    procs_ready = wait_for_volume_process_to_be_online(mnode, volname)
    self.assertTrue(
        procs_ready,
        "Failed to wait for volume {} processes to be online".format(
            volname))

    # ... and all of them must be reported online
    all_online = verify_all_process_of_volume_are_online(mnode, volname)
    self.assertTrue(
        all_online,
        "Volume {} : All process are not online".format(volname))
    g.log.info("Volume %s : All process are online", volname)

    # Heal has to finish
    healed = monitor_heal_completion(mnode, volname)
    self.assertTrue(healed, 'Heal has not yet completed')

    # No split-brain may be left behind
    split_brain = is_volume_in_split_brain(mnode, volname)
    self.assertFalse(split_brain, 'Volume is in split-brain state')
def toggle_bricks_and_perform_io(self, file_list, brick_list):
    """Take bricks down, write files from the first client, bring bricks up.

    Args:
        file_list: file names written (1 KiB of random data each) under
            ``<mounts[0].mountpoint>/test_gfid_split_brain/``.
        brick_list: bricks to bring offline and back online.
    """
    mnode, volname = self.mnode, self.volname

    # Take the bricks down and confirm they are offline
    g.log.info("Going to bring down the brick process for %s", brick_list)
    went_down = bring_bricks_offline(volname, brick_list)
    self.assertTrue(
        went_down,
        "Failed to bring down the bricks. Please "
        "check the log file for more details.")
    g.log.info("Brought down the brick process for %s successfully",
               brick_list)

    offline = are_bricks_offline(mnode, volname, brick_list)
    self.assertTrue(offline, 'Bricks %s are not offline' % brick_list)

    # Write every file while the bricks are down
    for name in file_list:
        target = "".join(
            [self.mounts[0].mountpoint, "/test_gfid_split_brain/", name])
        dd_cmd = "dd if=/dev/urandom of=%s bs=1024 count=1" % target
        rcode, _, _ = g.run(self.clients[0], dd_cmd)
        self.assertEqual(rcode, 0, "Creating %s failed" % target)

    # Bring the bricks back
    came_up = bring_bricks_online(mnode, volname, brick_list)
    self.assertTrue(came_up, 'Failed to bring brick %s online' % brick_list)
    g.log.info('Bringing brick %s online is successful', brick_list)

    # Give the brick processes up to 30 (passed as timeout) to come online
    g.log.info("Waiting for brick process to come online")
    online_timeout = 30
    online = wait_for_bricks_to_be_online(mnode, volname, online_timeout)
    self.assertTrue(online, "bricks didn't come online after adding bricks")
    g.log.info("Bricks are online")
def _test_brick_down_with_file_rename(self, pfile, rfile, brick):
    """Rename a file on the mount while one brick is offline.

    Args:
        pfile: current file name, relative to ``mounts[0].mountpoint``.
        rfile: new file name, relative to ``mounts[0].mountpoint``.
        brick: brick taken offline for the rename and brought back online
            afterwards.
    """
    mnode, volname = self.mnode, self.volname

    # Take the brick down and verify it
    g.log.info('Bringing brick %s offline', brick)
    went_down = bring_bricks_offline(volname, brick)
    self.assertTrue(went_down, 'Failed to bring brick %s offline' % brick)

    offline = are_bricks_offline(mnode, volname, [brick])
    self.assertTrue(offline, 'Brick %s is not offline' % brick)
    g.log.info('Bringing brick %s offline is successful', brick)

    # Rename through the first client's mount
    mountpoint = self.mounts[0].mountpoint
    mv_cmd = "mv %s/%s %s/%s" % (mountpoint, pfile, mountpoint, rfile)
    rcode, _, _ = g.run(self.clients[0], mv_cmd)
    self.assertEqual(rcode, 0, "rename of file failed")

    # Restore the brick
    g.log.info('Bringing brick %s online', brick)
    came_up = bring_bricks_online(mnode, volname, brick)
    self.assertTrue(came_up, 'Failed to bring brick %s online' % brick)
    g.log.info('Bringing brick %s online is successful', brick)
def mkdir_post_hashdown(self, subvols, parent_dir):
    """Create a directory while a non-hashed subvolume is down.

    case -1:
        - bring down a subvol
        - create a directory so that it does not hash to down subvol
        - make sure stat is successful on the dir

    Args:
        subvols: subvolumes of the volume; ``subvols[count]`` (``count``
            being returned by ``find_nonhashed_subvol``) is the one that is
            brought offline and online again.
        parent_dir: path passed to ``mkdir`` / ``rmdir`` on the first
            client.

    Returns:
        bool: True when every step succeeded, False when finding the
        subvolume, toggling it, or the ``mkdir`` / ``rmdir`` command
        failed. An incomplete layout fails the test through an assertion
        instead of returning.
    """
    # pylint: disable=protected-access
    # pylint: disable=pointless-string-statement
    # Find a non hashed subvolume(or brick)
    # NOTE(review): the lookup uses the fixed name "parent" rather than a
    # name derived from parent_dir -- confirm against the callers.
    nonhashed_subvol, count = find_nonhashed_subvol(subvols, "/", "parent")
    if nonhashed_subvol is None:
        g.log.error('Error in finding nonhashed subvol for parent')
        return False

    # bring nonhashed_subvol offline
    ret = bring_bricks_offline(self.volname, subvols[count])
    if not ret:
        g.log.error('Error in bringing down subvolume %s', subvols[count])
        return False
    g.log.info('target subvol %s is offline', subvols[count])

    # create parent dir
    ret, _, err = g.run(self.clients[0], ("mkdir %s" % parent_dir))
    if ret != 0:
        g.log.error('mkdir failed for %s err: %s', parent_dir, err)
        return False
    g.log.info("mkdir of parent directory %s successful", parent_dir)

    # this confirms both layout and stat of the directory
    # NOTE(review): the validated path is the literal '/parent_dir' under
    # the mountpoint, not the parent_dir argument -- looks unintended,
    # left unchanged until confirmed.
    ret = validate_files_in_dir(self.clients[0],
                                self.mounts[0].mountpoint + '/parent_dir',
                                test_type=LAYOUT_IS_COMPLETE,
                                file_type=FILETYPE_DIRS)
    self.assertTrue(ret, "Layout is not complete")
    g.log.info('Layout is complete')

    # bring up the subvol
    ret = bring_bricks_online(self.mnode, self.volname, subvols[count],
                              bring_bricks_online_methods=None)
    if not ret:
        g.log.error("Error in bringing back subvol online")
        return False
    g.log.info('Subvol is back online')

    # delete parent_dir; a failed rmdir must not be reported as success
    ret, _, err = g.run(self.clients[0], ("rmdir %s" % parent_dir))
    if ret != 0:
        g.log.error('rmdir failed for %s err: %s', parent_dir, err)
        return False
    g.log.info("rmdir of directory %s successful", parent_dir)

    return True
def _bring_bricks_online(self):
    """Force-start the volume and wait for its bricks to be online.

    Brings ``self.bricks_to_bring_offline`` online with the
    ``volume_start_force`` method, then asserts that
    ``wait_for_bricks_to_be_online`` succeeds for the volume.
    """
    mnode, volname = self.mnode, self.volname

    # Bring the bricks back
    started = bring_bricks_online(
        mnode, volname, self.bricks_to_bring_offline,
        bring_bricks_online_methods=['volume_start_force'])
    self.assertTrue(started, 'Failed to bring bricks online')

    # Wait until the bricks are reported online
    bricks_up = wait_for_bricks_to_be_online(mnode, volname)
    self.assertTrue(
        bricks_up,
        "Failed to wait for volume {} processes to be online".format(
            self.volname))
def test_disperse_vol(self):
    """Toggle the first two bricks of the volume and log its status.

    - bring the first two bricks offline
    - bring the same bricks online again
    - verify that all bricks of the volume are online
    - log volume info and status
    """
    bricks_list = get_all_bricks(self.mnode, self.volname)

    ret = bring_bricks_offline(self.volname, bricks_list[0:2])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:2])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")

    # Verifying all bricks online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "All bricks are not online")

    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    # The message must be a formatted string; a (fmt, arg) tuple would be
    # printed verbatim on failure.
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s" % self.volname))
    g.log.info(
        "Successful in logging volume info and status "
        "of volume %s", self.volname)
def tearDown(self):
    """Stop client IO, restore offline bricks and clean up the volume.

    Raises:
        ExecutionError: if the IO process cannot be killed, the offline
            bricks cannot be brought online, or unmount/cleanup fails.
    """
    # Kill the IO on client
    if self.is_io_started:
        ret = kill_process(self.client, process_names=[self.file_name])
        if not ret:
            raise ExecutionError("Not able to kill/stop IO in client")
        g.log.info('Successfully stopped IO in client')

    # Bring back any bricks the test left offline
    if self.offline_bricks:
        ret = bring_bricks_online(self.mnode, self.volname,
                                  self.offline_bricks)
        if not ret:
            # Message only, like the other raises here; the falsy return
            # value carries no information for the reader.
            raise ExecutionError(
                'Not able to bring bricks {} '
                'online'.format(self.offline_bricks))

    # Cleanup and unmount volume
    ret = self.unmount_volume_and_cleanup_volume(mounts=[self.mount_obj])
    if not ret:
        raise ExecutionError("Failed to unmount and cleanup volume")
    g.log.info("Unmount and Cleanup of volume is successful")

    self.get_super_method(self, 'tearDown')()
def test_heal_command_unsuccessful_as_bricks_down(self):
    """Heal command must fail while a brick is down.

    - start writing a 2 Gb file on every mount
    - while the write is in progress, kill brick b0
    - start heal on the volume (should fail and have error message)
    - bring b0 back up
    - repeat the same with brick b1
    - trigger heal and wait for it to complete
    - check there is no split-brain and validate the IO
    """
    # pylint: disable=too-many-statements
    mnode, volname = self.mnode, self.volname

    all_bricks = get_all_bricks(mnode, volname)
    self.assertIsNotNone(all_bricks, 'Brick list is None')

    # Start the large write on every mount (asynchronously)
    for mount in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount.client_system, mount.mountpoint)
        g.log.info('Creating files...')
        dd_cmd = ("cd %s ; dd if=/dev/zero of=file1 bs=10M count=200"
                  % mount.mountpoint)
        self.all_mounts_procs.append(
            g.run_async(mount.client_system, dd_cmd, user=mount.user))
    self.io_validation_complete = False

    # Same sequence for brick0 and then brick1: take it down, expect the
    # heal command to fail, bring it back up.
    for index in (0, 1):
        brick = all_bricks[index]

        g.log.info('Bringing bricks %s offline...', brick)
        went_down = bring_bricks_offline(volname, [brick])
        self.assertTrue(went_down,
                        'Failed to bring bricks %s offline' % brick)

        offline = are_bricks_offline(mnode, volname, [brick])
        self.assertTrue(offline, 'Bricks %s are not offline' % brick)
        g.log.info('Bringing bricks %s offline is successful', brick)

        # The raw CLI is used so the error output can be inspected
        heal_cmd = "gluster volume heal %s" % volname
        rcode, _, heal_err = g.run(mnode, heal_cmd)
        self.assertTrue(rcode, 'Heal is started')

        expected_err = ("Launching heal operation to perform index self "
                        "heal on volume %s has been unsuccessful"
                        % volname)
        self.assertIn(expected_err, heal_err,
                      "Error message is not present or not valid")
        g.log.info('Expected: Healing is not started')

        g.log.info("Bring bricks: %s online", brick)
        came_up = bring_bricks_online(mnode, volname, [brick])
        self.assertTrue(came_up,
                        "Failed to bring bricks: %s online" % brick)
        g.log.info("Successfully brought all bricks:%s online", brick)

    # With every brick up, heal must start ...
    triggered = trigger_heal(mnode, volname)
    self.assertTrue(triggered, 'Heal is not started')
    g.log.info('Healing is started')

    # ... and run to completion
    monitored = monitor_heal_completion(mnode, volname)
    self.assertTrue(monitored, 'Heal has not yet completed')

    healed = is_heal_complete(mnode, volname)
    self.assertTrue(healed, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # No split-brain may remain
    split_brain = is_volume_in_split_brain(mnode, volname)
    self.assertFalse(split_brain, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Finally make sure the background writes succeeded
    io_ok = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(io_ok, "IO failed on some of the clients")
    self.io_validation_complete = True
def test_data_split_brain_resolution(self):
    """Create a data split-brain and resolve it with ``source-brick``.

    - turn off metadata/entry/data self-heal and create files and dirs
    - check arequals of the bricks (``self.verify_brick_arequals``)
    - turn off self-heal-daemon
    - for the first and then the second brick: bring it offline, rewrite
      the files, bring it online and verify all bricks are online
    - turn self-heal-daemon on and expect the volume in split-brain
    - resolve with ``gluster v heal <vol> split-brain source-brick``
      using the second brick as source
    - trigger heal, wait for completion, read the files, check that no
      split-brain is left and check the brick arequals again
    """
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system

    # Setting options
    g.log.info('Setting options...')
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files and directories on client side
    g.log.info('Creating files and directories...')
    cmd = ("for i in `seq 1 10`; do mkdir %s/dir.$i; for j in `seq 1 5`;"
           "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1K count=1;"
           "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done"
           % (mountpoint, mountpoint, mountpoint))
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Creating files and directories failed")
    g.log.info("Files & directories created successfully")

    # Check arequals for all the bricks
    g.log.info('Getting arequal before getting bricks offline...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Set option self-heal-daemon to OFF
    g.log.info('Setting option self-heal-daemon to off...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    bricks_list = get_all_bricks(self.mnode, self.volname)

    # Rewrite the files once with each of the first two bricks down; the
    # second pass uses count=2 for the dd commands instead of count=1.
    for index, dd_count in ((0, 1), (1, 2)):
        brick = bricks_list[index]

        # Bring the brick offline
        g.log.info('Bringing brick %s offline', brick)
        ret = bring_bricks_offline(self.volname, brick)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Brick %s is not offline' % brick)
        g.log.info('Bringing brick %s offline is successful', brick)

        # Modify the contents of the files
        cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=%d;"
               "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=%d; done"
               % (mountpoint, dd_count, mountpoint, dd_count))
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Updating file contents failed")
        g.log.info("File contents updated successfully")

        # Bring the brick online and check the status
        g.log.info('Bringing brick %s online', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick)
        g.log.info('Bringing brick %s online is successful', brick)

        g.log.info("Verifying if brick %s is online", brick)
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        # Format the message; a (fmt, arg) tuple is printed verbatim
        self.assertTrue(ret, "Brick %s did not come up" % brick)
        g.log.info("Brick %s has come online.", brick)

    # Set option self-heal-daemon to ON
    g.log.info('Setting option self-heal-daemon to on...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    g.log.info("Checking if files are in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "Unable to create split-brain scenario")
    g.log.info("Successfully created split brain scenario")

    g.log.info("Resolving split-brain by using the source-brick option "
               "by choosing second brick as source for all the files")
    # The command is run on the host part of the "<host>:<path>" brick
    node, _ = bricks_list[1].split(':')
    command = ("gluster v heal " + self.volname + " split-brain "
               "source-brick " + bricks_list[1])
    ret, _, _ = g.run(node, command)
    self.assertEqual(ret, 0, "Command execution not successful")

    # triggering heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Heal not triggered")

    # waiting for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=120)
    self.assertTrue(ret, "Heal not completed")

    # Try accessing the file content from the mount
    cmd = ("for i in `seq 1 10`; do cat %s/file.$i > /dev/null;"
           "for j in `seq 1 5` ; do cat %s/dir.$i/file.$j > /dev/null;"
           "done ; done" % (mountpoint, mountpoint))
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Unable to access the file contents")
    g.log.info("File contents are accessible")

    # checking if file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "File still in split-brain")
    g.log.info("Successfully resolved split brain situation using "
               "CLI based resolution")

    # Check arequals for all the bricks
    g.log.info('Getting arequal for all the bricks after heal...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal after heal is successful')
def _bring_back_brick_online(self, brick):
    """Bring the given (previously downed) brick of the volume online.

    Args:
        brick: brick passed to ``bring_bricks_online``.
    """
    is_online = bring_bricks_online(self.mnode, self.volname, brick)
    self.assertTrue(is_online, "Failed to bring brick online")
def test_self_heal_differing_in_file_type(self):
    """
    testing self heal of files with different file types
    with default configuration

    Description:
    - create IO
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequal and compare with arequal before
      getting bricks offline
    - modify the data (replace every file by a directory of the same name)
    - arequal before getting bricks online
    - bring bricks online
    - check daemons and healing completion
    - calculate arequal and compare with arequal before bringing bricks
      online and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    all_mounts_procs = []
    test_file_type_differs_self_heal_folder = \
        'test_file_type_differs_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=1k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(sorted(result_before_offline),
                     sorted(result_after_offline),
                     'Checksums before and after bringing bricks'
                     ' offline are not equal')
    g.log.info('Checksums before and after '
               'bringing bricks offline are equal')

    # Modify the data: every file is removed and re-created as a
    # directory, so the file type differs between the bricks
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 10` ; "
               "do for j in `seq 1 5` ; "
               "do for k in `seq 1 10` ; "
               "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
               "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
               "done ; "
               "done ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # Format the message; a (fmt, arg) tuple is printed verbatim
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks'
                     ' online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_handling_data_split_brain(self):
    """
    - create IO
    - calculate arequal from mountpoint
    - set volume option 'self-heal-daemon' to value "off"
    - kill data brick1
    - calculate arequal checksum and compare it
    - modify files and directories
    - bring back all bricks processes online
    - kill data brick3
    - modify files and directories
    - calculate arequal from mountpoint
    - bring back all bricks processes online
    - run the read script to trigger heal from mountpoint
    - enable the self-heal daemon
    - check if heal is completed
    - check for split-brain
    - read files
    - calculate arequal checksum and compare it
    """
    # pylint: disable=too-many-locals,too-many-statements
    # The same rewrite command is used for both modification rounds
    modify_cmd_template = ("cd %s ; "
                           "for i in `seq 1 10` ; "
                           "do for j in `seq 1 5` ; "
                           "do dd if=/dev/urandom of=dir.$i/file.$j "
                           "bs=1M count=1 ; "
                           "done ; "
                           "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                           "done")

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do mkdir dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1K count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Setting options
    options = {"self-heal-daemon": "off"}
    g.log.info('Setting options %s for volume %s', options, self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)

    # Bring brick 1 offline
    bricks_to_bring_offline = [bricks_list[0]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Comparing arequals before getting bricks offline
    # and after getting bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Arequals before getting bricks offline '
                     'and after getting bricks offline are not equal')
    g.log.info('Arequals before getting bricks offline '
               'and after getting bricks offline are equal')

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Modify files
        g.log.info('Modifying files...')
        command = modify_cmd_template % mount_obj.mountpoint

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring 1-st brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Bring brick 3rd offline (the last brick in the list)
    bricks_to_bring_offline = [bricks_list[-1]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Modify files
        g.log.info('Modifying files...')
        command = modify_cmd_template % mount_obj.mountpoint

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring 3rd brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Unmount and mount the mounts again
    ret = self.unmount_volume(self.mounts)
    self.assertTrue(ret, 'Failed to unmount %s' % self.volname)

    ret = self.mount_volume(self.mounts)
    self.assertTrue(ret, 'Unable to mount %s' % self.volname)

    # Start heal from mount point
    g.log.info('Starting heal from mount point...')
    for mount_obj in self.mounts:
        g.log.info("Start heal for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Read through this client's own mountpoint, not mounts[0]'s
        command = "/usr/bin/env python %s read %s" % (
            self.script_upload_path,
            mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Heal triggered for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Heal triggered for all mountpoints')

    # Enable self-heal daemon
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Reading files
    g.log.info('Reading files...')
    for mount_obj in self.mounts:
        g.log.info("Start reading files for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        command = ('cd %s/ ; '
                   'for i in `seq 1 10` ; '
                   'do cat file.$i > /dev/null ; '
                   'for j in `seq 1 5` ; '
                   'do cat dir.$i/file.$j > /dev/null ; '
                   'done ; done'
                   % mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Reading files successfully for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Reading files successfully for all mountpoints')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Comparing arequals before getting bricks online
    # and after getting bricks online
    self.assertEqual(result_before_online, result_after_online,
                     'Arequals before getting bricks online '
                     'and after getting bricks online are not equal')
    g.log.info('Arequals before getting bricks online '
               'and after getting bricks online are equal')
def test_self_heal_daemon(self):
    """
    Test Data-Self-Heal(heal command)

    Description:
    - Create directory test_hardlink_self_heal
    - Create directory test_data_self_heal
    - Creating files for hardlinks and data files
    - Get arequal before getting bricks offline
    - Select bricks to bring offline
    - Bring brick offline
    - Create hardlinks and append data to data files
    - Bring brick online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal after getting bricks online
    - Select bricks to bring offline
    - Bring brick offline
    - Truncate data to data files and verify hardlinks
    - Bring brick online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal again
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    client = self.mounts[0].client_system
    mountpoint = self.mounts[0].mountpoint

    # Creating directory test_hardlink_self_heal
    ret = mkdir(client, "{}/test_hardlink_self_heal".format(mountpoint))
    self.assertTrue(ret, "Failed to create directory")
    g.log.info(
        "Directory 'test_hardlink_self_heal' on %s created "
        "successfully", self.mounts[0])

    # Creating directory test_data_self_heal
    ret = mkdir(client, "{}/test_data_self_heal".format(mountpoint))
    self.assertTrue(ret, "Failed to create directory")
    g.log.info(
        "Directory 'test_data_self_heal' on %s created "
        "successfully", self.mounts[0])

    # Creating files for hardlinks and data files
    cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5`;'
           'do mkdir dir.$i ; for j in `seq 1 10` ; do dd if='
           '/dev/urandom of=dir.$i/file.$j bs=1k count=$j;done; done;'
           'cd ..' % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
           'do dd if=/dev/urandom of=file.$i bs=128K count=$i;done;'
           'cd ..' % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    # Get arequal before getting bricks offline
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Arequal before getting bricks online-%s',
               result_before_online)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Append data to data files and create hardlinks
    cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
           'do dd if=/dev/urandom of=file.$i bs=512K count=$i ; done ;'
           'cd .. ' % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to modify data files.")
    g.log.info("Successfully modified data files")

    cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5` ;do '
           'for j in `seq 1 10`;do ln dir.$i/file.$j dir.$i/link_file.$j;'
           'done ; done ; cd .. ' % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Hardlinks creation failed")
    g.log.info("Successfully created hardlinks of files")

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} online'.format(
            bricks_to_bring_offline))
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))
    g.log.info("Volume %s : All process are online", self.volname)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Arequal after getting bricks online '
               'is %s', result_after_online)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Truncate data to data files and verify hardlinks
    cmd = ('cd %s/test_data_self_heal ; for i in `seq 1 100` ;'
           'do truncate -s $(( $i * 128)) file.$i ; done ; cd ..'
           % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to truncate files")
    g.log.info("Successfully truncated files on mountpoint")

    # NOTE(review): the files above are created as dir.$i/file.$j and
    # dir.$i/link_file.$j, while these patterns use dir{1..5}/file{1..10};
    # the mount object (not its client_system) is also passed as the host.
    # This comparison may be vacuous - confirm against get_file_stat.
    file_path = ('%s/test_hardlink_self_heal/dir{1..5}/file{1..10}'
                 % (mountpoint))
    link_path = ('%s/test_hardlink_self_heal/dir{1..5}/link_file{1..10}'
                 % (mountpoint))
    file_stat = get_file_stat(self.mounts[0], file_path)
    link_stat = get_file_stat(self.mounts[0], link_path)
    self.assertEqual(file_stat, link_stat, "Verification of hardlinks "
                     "failed")
    g.log.info("Successfully verified hardlinks")

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} online'.format(
            bricks_to_bring_offline))
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))
    g.log.info("Volume %s : All process are online", self.volname)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain (second round, as listed in the description)
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal again
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Arequal after getting bricks online '
               'is %s', result_after_online)
def test_multiple_clients_dd_on_same_file_default(self):
    """
    Heal a file that is being written and read while a brick goes
    offline and online.

    - Create 2GB file
    - While creating file, start reading file
    - Bring down brick1
    - Bring back the brick brick1
    - Start healing
    - Bring down brick1
    - Wait for IO to complete
    - Wait for reading to complete
    - Bring back the brick brick1
    - Start healing
    - Wait for heal to complete
    - Check for split-brain
    - Calculate arequals on all the bricks and compare with mountpoint
    """
    # pylint: disable=too-many-statements,too-many-locals
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')

    # The one brick that is taken down and brought back in this test
    target_brick = bricks_list[1]

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "dd if=/dev/urandom of=test_file bs=1M count=2020"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Reading files on client side
    all_mounts_procs_read = []
    for mount_obj in self.mounts:
        g.log.info("Reading data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Read files while the write above is still in progress
        g.log.info('Reading files...')
        command = ("python %s read %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        all_mounts_procs_read.append(proc)

    # Bring brick1 offline
    g.log.info('Bringing bricks %s offline...', target_brick)
    ret = bring_bricks_offline(self.volname, [target_brick])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % target_brick)

    ret = are_bricks_offline(self.mnode, self.volname, [target_brick])
    self.assertTrue(ret, 'Bricks %s are not offline' % target_brick)
    g.log.info('Bringing bricks %s offline is successful', target_brick)

    # Bring brick1 online
    g.log.info('Bringing bricks %s online...', target_brick)
    ret = bring_bricks_online(self.mnode, self.volname, [target_brick])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % target_brick)
    g.log.info('Bringing bricks %s online is successful', target_brick)

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Bring brick1 offline
    g.log.info('Bringing bricks %s offline...', target_brick)
    ret = bring_bricks_offline(self.volname, [target_brick])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % target_brick)

    ret = are_bricks_offline(self.mnode, self.volname, [target_brick])
    self.assertTrue(ret, 'Bricks %s are not offline' % target_brick)
    g.log.info('Bringing bricks %s offline is successful', target_brick)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )

    # Validate reading
    self.assertTrue(
        validate_io_procs(all_mounts_procs_read, self.mounts),
        "Reading failed on some of the clients"
    )
    self.io_validation_complete = True

    # Bring brick1 online
    g.log.info('Bringing bricks %s online...', target_brick)
    ret = bring_bricks_online(self.mnode, self.volname, [target_brick])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % target_brick)
    g.log.info('Bringing bricks %s online is successful', target_brick)

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal for mount
    g.log.info('Getting arequal...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after healing is successful')
    # The total is the text after the last ':' on the last output line
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    g.log.info('Getting arequal on bricks...')
    for brick in bricks_list:
        g.log.info('Getting arequal on bricks %s...', brick)
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        self.assertEqual(mount_point_total, brick_total,
                         'Arequals for mountpoint and %s are not equal'
                         % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)
    g.log.info('All arequals are equal')
def test_entry_heal_with_quota(self):
    """
    Entry heal must not resurrect a deleted file when a quota object
    limit is in force.

    - Create a 1x3 volume
    - Set quota object limit
    - Create files less than the limit
    - Bring down a brick and create more files until limit is hit
    - Delete one file so that we are below the limit, and create one more
      file
    - Bring the brick back up and launch heal
    - Verify that after heal is complete, the deleted file does not
      re-appear in any of the bricks.
    """
    # pylint: disable=too-many-statements
    mountpoint = self.mounts[0].mountpoint

    # Enable Quota
    g.log.info("Enabling quota on the volume %s", self.volname)
    ret, _, _ = quota_enable(self.mnode, self.volname)
    self.assertEqual(
        ret, 0, "Failed to enable quota on the volume %s" % self.volname)
    g.log.info("Successfully enabled quota on the volume %s",
               self.volname)

    # Check if quota is enabled
    g.log.info("Validate Quota is enabled on the volume %s",
               self.volname)
    ret = is_quota_enabled(self.mnode, self.volname)
    self.assertTrue(
        ret, "Quota is not enabled on the volume %s" % self.volname)
    g.log.info("Successfully Validated quota is enabled on volume %s",
               self.volname)

    # Set quota related options
    options = {
        "quota-deem-statfs": "on",
        "soft-timeout": "0",
        "hard-timeout": "0"
    }
    g.log.info("setting quota volume options %s", options)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options,
               self.volname)

    # Create directory on mount
    ret = mkdir(self.mounts[0].client_system, "%s/dir" % mountpoint)
    self.assertTrue(ret, "mkdir failed")

    # Set Quota limit on the directory
    path = "/dir"
    g.log.info(
        "Setting Quota Limit object on the path %s of the "
        "volume %s", path, self.volname)
    ret, _, _ = quota_limit_objects(self.mnode, self.volname,
                                    path=path, limit="10")
    self.assertEqual(ret, 0, ("Failed to set quota limit object "
                              "on path %s of the volume %s"
                              % (path, self.volname)))
    g.log.info(
        "Successfully set the Quota limit object on %s of the "
        "volume %s", path, self.volname)

    cmd = ("touch %s/dir/file{1..5}" % mountpoint)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "file creation failed")

    # Bring brick3 offline
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info('Bringing brick %s offline', bricks_list[2])
    ret = bring_bricks_offline(self.volname, [bricks_list[2]])
    self.assertTrue(ret, 'Failed to bring brick %s offline'
                    % bricks_list[2])

    ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]])
    self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2])
    g.log.info('Bringing brick %s offline was successful',
               bricks_list[2])

    # Create files until quota object limit
    cmd = ("touch %s/dir/file{6..9}" % mountpoint)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "file creation failed")

    # The next create must fail
    cmd = ("touch %s/dir/file10" % mountpoint)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(
        ret, 1, ("Creation of %s/dir/file10 succeeded while "
                 "it was not supposed to." % mountpoint))
    g.log.info(
        "Creation of %s/dir/file10 failed as expected due to "
        "quota object limit.", mountpoint)

    # Delete one file and re-try the create to succeed.
    cmd = ("rm %s/dir/file1" % mountpoint)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "File deletion failed")

    cmd = ("touch %s/dir/file10" % mountpoint)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "File creation failed")

    # Bring brick3 online and check status
    g.log.info('Bringing brick %s online...', bricks_list[2])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[2]])
    self.assertTrue(ret, 'Failed to bring brick %s online'
                    % bricks_list[2])
    g.log.info('Bringing brick %s online is successful', bricks_list[2])

    g.log.info("Verifying if brick3 is online....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, ("brick3 did not come up"))
    g.log.info("brick3 has come online.")

    # Trigger heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # After heal every brick must hold file10 (created while brick3 was
    # down) and must NOT hold file1 (deleted while brick3 was down); a
    # re-appearing file1 would indicate an accidental conservative merge.
    for brick in bricks_list:
        node, brick_path = brick.split(':')

        ret, _, _ = g.run(node, 'stat %s/dir/file10' % brick_path)
        self.assertEqual(
            ret, 0, 'file10 is missing on brick %s after heal' % brick)

        ret, _, _ = g.run(node, 'stat %s/dir/file1' % brick_path)
        self.assertNotEqual(
            ret, 0, 'Deleted file1 re-appeared on brick %s' % brick)
def test_heal_gfid_1x3(self):
    """
    Description: This test case verifies the gfid self-heal on a 1x3
    replicate volume.

    1. file created at mount point
    2. 2 bricks brought down
    3. file deleted
    4. created a new file from the mount point
    5. all bricks brought online
    6. check if gfid worked correctly
    """
    # Name of the single file this test works on; the same name is used
    # for the delete, the mount stat and the per-brick stat below.
    test_file = '/test_file0.txt'

    g.log.info("setting the quorum type to fixed")
    options = {"cluster.quorum-type": "fixed"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "unable to set the quorum type to fixed")
    g.log.info("Successfully set the quorum type to fixed")

    g.log.info("creating a file from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 10k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a file from mount point")

    # getting list of all bricks
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "unable to get list of bricks")

    g.log.info("bringing down brick1 and brick2")
    ret = bring_bricks_offline(self.volname, all_bricks[:2])
    self.assertTrue(ret, "unable to bring bricks offline")
    g.log.info("Successfully brought the following bricks offline "
               ": %s", str(all_bricks[:2]))

    # Delete the file that was actually created above. 'rm -f' exits 0
    # even for a missing path, so a wrong name here would go unnoticed.
    g.log.info("deleting the file from mount point")
    command = "rm -f " + self.mounts[0].mountpoint + test_file
    ret, _, _ = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, "unable to remove file from mount point")
    g.log.info("Successfully deleted file from mountpoint")

    g.log.info("creating a new file of same name and different size "
               "from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 1M %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a new file of same name "
               "from mount point")

    g.log.info("bringing bricks 1 and 2 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[:2])
    self.assertTrue(ret, "unable to bring bricks online")
    g.log.info("Successfully brought the following bricks online "
               ": %s", str(all_bricks[:2]))

    g.log.info("checking if stat structure of the file is returned")
    ret = get_file_stat(self.mounts[0].client_system,
                        self.mounts[0].mountpoint + test_file)
    self.assertTrue(ret, "unable to get file stats")
    g.log.info("file stat structure returned successfully")

    g.log.info("checking if the heal has completed")
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, "heal not completed")
    g.log.info("Self heal was completed successfully")

    g.log.info("checking if the areequal checksum of all the bricks in "
               "the subvol match")
    checksum_list = []
    for brick in all_bricks:
        node, brick_path = brick.split(':')
        command = "arequal-checksum -p " + brick_path + \
                  " -i .glusterfs -i .landfill"
        ret, out, _ = g.run(node, command)
        self.assertEqual(ret, 0, "unable to get the arequal checksum "
                         "of the brick")
        checksum_list.append(out)

        # checking file size of healed file on each brick to verify
        # correctness of choice for sink and source
        stat_dict = get_file_stat(node, brick_path + test_file)
        self.assertIsNotNone(
            stat_dict, "unable to get file stats on brick %s" % brick)
        self.assertEqual(stat_dict['size'], '1048576',
                         "file size of healed file is different "
                         "than expected")

    flag = all(val == checksum_list[0] for val in checksum_list)
    self.assertTrue(flag, "the arequal checksum of all bricks is "
                    "not same")
    g.log.info("the arequal checksum of all the bricks in the subvol "
               "is same")
def test_ec_version(self):
    """
    Check that EC version and size xattrs agree on the online bricks.

    Create a directory on the mountpoint
    Create files on the mountpoint
    Bring down a brick say b1
    Create more files on the mountpoint
    Bring down another brick b2
    Bring up brick b1
    Wait for healing to complete
    Check if EC version is updated
    Check is EC size is updated
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    client = self.mounts[0].client_system
    mountpoint = self.mounts[0].mountpoint

    # Creating dir1 on the mountpoint
    ret = mkdir(client, "%s/dir1" % mountpoint)
    self.assertTrue(ret, "Failed to create dir1")
    g.log.info("Directory dir1 on %s created successfully",
               self.mounts[0])

    # Creating files on client side for dir1
    g.log.info("Generating data for %s:%s", client, mountpoint)

    # Create dirs with file
    command = ("cd %s/dir1; for i in {1..10};do"
               " dd if=/dev/urandom of=file.$i "
               "bs=1024 count=10000; done" % mountpoint)

    proc = g.run_async(client, command, user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validating IO's and waiting to complete
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts[0]),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Bringing brick b1 offline
    sub_vols = get_subvols(self.mnode, self.volname)
    self.bricks_list1 = list(choice(sub_vols['volume_subvols']))
    brick_b1_down = choice(self.bricks_list1)
    ret = bring_bricks_offline(self.volname, brick_b1_down)
    self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
    g.log.info('Brick %s is offline successfully', brick_b1_down)

    # Drop the finished procs so only the next IO batch is validated
    del self.all_mounts_procs[:]

    # Creating files on client side for dir1
    g.log.info("Generating data for %s:%s", client, mountpoint)

    # Create dirs with file
    command = ("cd %s/dir1; for i in {11..20};do"
               " dd if=/dev/urandom of=file.$i "
               "bs=1024 count=10000; done" % mountpoint)

    proc = g.run_async(client, command, user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validating IO's and waiting to complete
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts[0]),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Changing mode owner and group of files
    dir_file_range = '2..5'
    cmd = ('chmod 777 %s/dir1/file.{%s}' % (mountpoint, dir_file_range))
    ret, _, _ = g.run(client, cmd)
    self.assertFalse(ret, "Changing mode of files has failed")
    g.log.info("Mode of files have been changed successfully")

    cmd = ('chown root %s/dir1/file.{%s}' % (mountpoint, dir_file_range))
    ret, _, _ = g.run(client, cmd)
    self.assertFalse(ret, "Changing owner of files has failed")
    g.log.info("Owner of files have been changed successfully")

    cmd = ('chgrp root %s/dir1/file.{%s}' % (mountpoint, dir_file_range))
    ret, _, _ = g.run(client, cmd)
    self.assertFalse(ret, "Changing group of files has failed")
    g.log.info("Group of files have been changed successfully")

    # Create softlink and hardlink of files in mountpoint.
    cmd = ('cd %s/dir1/; '
           'for FILENAME in *; '
           'do ln -s $FILENAME softlink_$FILENAME; '
           'done;' % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertFalse(ret, "Creating Softlinks have failed")
    g.log.info("Softlink of files have been created successfully")

    cmd = ('cd %s/dir1/; '
           'for FILENAME in *; '
           'do ln $FILENAME hardlink_$FILENAME; '
           'done;' % (mountpoint))
    ret, _, _ = g.run(client, cmd)
    self.assertFalse(ret, "Creating Hardlinks have failed")
    g.log.info("Hardlink of files have been created successfully")

    # Bringing brick b2 offline: any brick of the same subvol except b1
    bricks_list2 = deepcopy(self.bricks_list1)
    bricks_list2.remove(brick_b1_down)
    brick_b2_down = choice(bricks_list2)
    ret = bring_bricks_offline(self.volname, brick_b2_down)
    self.assertTrue(ret, 'Brick %s is not offline' % brick_b2_down)
    g.log.info('Brick %s is offline successfully', brick_b2_down)

    # Bring brick b1 online
    ret = bring_bricks_online(self.mnode, self.volname,
                              [brick_b1_down],
                              'glusterd_restart')
    self.assertTrue(ret, 'Brick %s is not brought '
                    'online' % brick_b1_down)
    g.log.info('Brick %s is online successfully', brick_b1_down)

    # Delete brick2 from brick list as we are not checking for heal
    # completion in brick 2 as it is offline
    self.bricks_list1.remove(brick_b2_down)

    # Check if EC version is same on all bricks which are up
    ret = self.get_xattr("ec.version")
    self.assertTrue(ret, "Healing not completed and EC version is "
                    "not updated")
    g.log.info("Healing is completed and EC version is updated")

    # Check if EC size is same on all bricks which are up
    ret = self.get_xattr("ec.size")
    self.assertTrue(ret, "Healing not completed and EC size is "
                    "not updated")
    g.log.info("Healing is completed and EC size is updated")
def test_heal_on_file_appends(self):
    """
    Validate heal of a file that is continuously appended to.

    Test steps:
    - create and mount EC volume 4+2
    - start append to a file from client
    - bring down one of the bricks (say b1)
    - wait for ~minute and bring down another brick (say b2)
    - after ~minute bring up first brick (b1)
    - check the xattrs 'ec.size', 'ec.version'
    - xattrs of online bricks should be same as an indication to heal
    """
    # Fetch the bricks that are currently online
    bricks_list = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Not able to get bricks list')

    # Start an endless append loop on the client
    self.file_name = 'test_file'
    append_cmd = ("cd %s ;"
                  "while true; do "
                  "cat /dev/urandom | tr -dc [:space:][:print:] "
                  "| head -c 4K >> %s; sleep 2; "
                  "done;" % (self.mount_obj.mountpoint, self.file_name))
    io_proc = g.run_async(self.client, append_cmd,
                          user=self.mount_obj.user)
    self.assertIsNotNone(io_proc, "Not able to start IO on client")
    g.log.info('Started generating and appending data to the file')
    self.is_io_started = True

    # Pick 3 bricks: the first two go offline, the third stays healthy
    brick_1, brick_2, brick_3 = sample(bricks_list, 3)

    # Take the first and then the second brick down, one after another
    for victim in (brick_1, brick_2):
        # Let the IO fill the bricks before each outage
        sleep(30)

        went_down = bring_bricks_offline(self.volname, [victim])
        self.assertTrue(
            went_down,
            'Failed to bring brick {} offline'.format(victim))

        is_down = are_bricks_offline(self.mnode, self.volname, [victim])
        self.assertTrue(
            is_down,
            'Not able to validate brick {} being '
            'offline'.format(victim))
        g.log.info("Brick %s is brought offline successfully", victim)
        self.offline_bricks.append(victim)

    # Let the IO fill the remaining bricks
    sleep(30)

    # Revive the first brick and make sure the peers are still connected
    came_up = bring_bricks_online(
        self.mnode, self.volname, [brick_1],
        bring_bricks_online_methods=['glusterd_restart'])
    self.assertTrue(came_up, 'Not able to bring brick {} '
                    'online'.format(brick_1))
    g.log.info("Offlined brick %s is brought online successfully",
               brick_1)

    peers_ok = self.validate_peers_are_connected()
    self.assertTrue(
        peers_ok,
        "Peers are not in connected state after bringing "
        "an offline brick to online via `glusterd restart`")
    g.log.info("Successfully validated peers are in connected state")

    # Give the revived brick time to catch up with the healthy bricks
    sleep(30)

    # The xattrs must match on the revived and the healthy brick
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(online_bricks, 'Unable to fetch online bricks')
    g.log.info('All online bricks are fetched successfully')
    for xattr in ('trusted.ec.size', 'trusted.ec.version'):
        matches = validate_xattr_on_all_bricks(
            [brick_1, brick_3], self.file_name, xattr)
        self.assertTrue(
            matches,
            "{} is not same on all online "
            "bricks".format(xattr))

    # Record the client's epoch time as the reference point
    rcode, prev_ctime, _ = g.run(self.client, 'date +%s')
    self.assertEqual(rcode, 0, 'Not able to get epoch time from client')

    # Headroom for file ctime to get updated
    sleep(5)

    # The file must have been appended to during the xattr checks
    file_stat = get_file_stat(
        self.client,
        '{}/{}'.format(self.mount_obj.mountpoint, self.file_name))
    self.assertIsNotNone(file_stat, "Not able to get stats of the file")
    curr_ctime = file_stat['epoch_ctime']
    self.assertGreater(
        int(curr_ctime), int(prev_ctime),
        "Not able "
        "to validate data is appended to the file "
        "while checking for xaatrs")

    g.log.info("Data on all online bricks is healed and consistent")
def test_server_side_healing_happens_only_when_glustershd_running(self):
    """
    Verify that server side healing happens only when the heal daemon
    is running on the node where the source brick resides.

    * Create and start the Replicate volume
    * Check the glustershd processes - Only 1 glustershd should be listed
    * Bring down the bricks without affecting the cluster
    * Create files on volume
    * kill the glustershd on node where bricks is running
    * bring the bricks up which was killed in previous steps
    * check the heal info - heal info must show pending heal info, heal
      shouldn't happen since glustershd is down on source node
    * issue heal
    * trigger client side heal
    * heal should complete successfully
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-lines
    mnode, volname = self.mnode, self.volname

    # Turn on the client side self-heal options
    options = {
        "metadata-self-heal": "on",
        "entry-self-heal": "on",
        "data-self-heal": "on"
    }
    ret = set_volume_options(mnode, volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, volname)

    # Confirm the self-heal daemon processes on the servers
    ret, pids = get_self_heal_daemon_pid(self.servers)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process "
                          "found : %s" % pids))
    g.log.info(
        "Successful in verifying self heal daemon process"
        " on all nodes %s", self.servers)

    # Choose and take down the bricks
    bricks_to_bring_offline = (
        select_volume_bricks_to_bring_offline(mnode, volname))
    g.log.info("Brick List to bring offline : %s",
               bricks_to_bring_offline)

    ret = bring_bricks_offline(volname, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Brought down the brick process "
               "for %s", bricks_to_bring_offline)

    # Create files from every mount while the bricks are down
    num_files_to_write = 100
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s create_files "
               "-f %s --base-file-name file %s"
               % (self.script_upload_path, num_files_to_write,
                  mount_obj.mountpoint))
        all_mounts_procs.append(
            g.run_async(mount_obj.client_system, cmd,
                        user=mount_obj.user))

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # Bricks that stayed online act as heal sources
    online_bricks = get_online_bricks_list(mnode, volname)
    g.log.info("Online Bricks for volume %s : %s", volname, online_bricks)

    # Host part of every online brick: glustershd is killed there
    bring_offline_glustershd_nodes = [
        brick.split(":")[0] for brick in online_bricks]
    g.log.info("self heal deamon on nodes %s to be killed",
               bring_offline_glustershd_nodes)

    ret = bring_self_heal_daemon_process_offline(
        bring_offline_glustershd_nodes)
    self.assertTrue(
        ret, ("Unable to bring self heal daemon process"
              " offline for nodes %s" % bring_offline_glustershd_nodes))
    g.log.info(
        "Sucessfully brought down self heal process for "
        "nodes %s", bring_offline_glustershd_nodes)

    # Heal info before the bricks come back (logged only)
    heal_info = get_heal_info_summary(mnode, volname)
    g.log.info("Successfully got heal info %s for the volume %s",
               heal_info, volname)

    # Bring the killed bricks back online
    ret = bring_bricks_online(mnode, volname, bricks_to_bring_offline,
                              'glusterd_restart')
    self.assertTrue(
        ret, ("Failed to bring bricks: %s online"
              % bricks_to_bring_offline))

    # A full heal is expected to be refused here
    ret = trigger_heal_full(mnode, volname)
    self.assertFalse(ret, ("Able to trigger heal on volume %s where "
                           "self heal daemon is not running" % volname))
    g.log.info(
        "Expected : Unable to trigger heal on volume %s where "
        "self heal daemon is not running", volname)

    # Heal is expected to still be pending after this wait
    ret = monitor_heal_completion(mnode, volname, 130)
    self.assertFalse(ret, ("Heal Completed on volume %s" % volname))
    g.log.info("Expected : Heal pending on volume %s", volname)

    # Heal info after the refused heal
    heal_info_after_triggering_heal = get_heal_info_summary(
        mnode, volname)
    g.log.info("Successfully got heal info for the volume %s", volname)

    # Each source brick must report at least the written file count
    for node in online_bricks:
        pending = int(
            heal_info_after_triggering_heal[node]['numberOfEntries'])
        self.assertGreaterEqual(
            pending, num_files_to_write,
            ("Some of the files are healed from source bricks %s where "
             "self heal daemon is not running" % node))
    g.log.info("EXPECTED: No files are healed from source bricks where "
               "self heal daemon is not running")

    # Remount, because the volume options were set after mounting
    for mount_obj in self.mounts:
        ret, _, _ = umount_volume(mount_obj.client_system,
                                  mount_obj.mountpoint)
        self.assertEqual(ret, 0, "Failed to unmount %s"
                         % mount_obj.client_system)
        ret, _, _ = mount_volume(volname,
                                 mtype='glusterfs',
                                 mpoint=mount_obj.mountpoint,
                                 mserver=mnode,
                                 mclient=mount_obj.client_system)
        self.assertEqual(ret, 0, "Failed to mount %s"
                         % mount_obj.client_system)

    # Read from every mount
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s read %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        all_mounts_procs.append(
            g.run_async(mount_obj.client_system, cmd,
                        user=mount_obj.user))

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "Reads failed on some of the clients")
    g.log.info("Reads successful on all mounts")

    # Wait for heal to complete
    ret = monitor_heal_completion(mnode, volname)
    self.assertTrue(ret, "Unable to heal the pending entries")
    g.log.info("Successfully healed the pending entries for volume %s",
               volname)
def test_self_heal(self):
    """
    Verify self-heal after files are removed while bricks are offline.

    Description:-
    - Create files on mount point
    - Kill one brick from volume
    - rm -rfv on mount point
    - bring bricks online
    - wait for heals
    - list

    Raises:
        ExecutionError: if removing the files from the mount point fails.
    """
    # pylint: disable=too-many-statements
    # IO on the mount point
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d "
               "--dir-depth 2 "
               "--dir-length 35 "
               "--max-num-of-dirs 5 "
               "--num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.counter, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        # Offset the start number for the next mount (presumably so that
        # each mount creates distinctly named directories - TODO confirm).
        self.counter = self.counter + 10

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Killing one brick from the volume set
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    # The message must be a formatted string; passing a (fmt, arg) tuple
    # would print the raw tuple on failure.
    self.assertTrue(ret, "Failed to bring bricks: %s offline"
                    % bricks_to_bring_offline)
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list: %s are offline"
                    % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Checking volume status
    g.log.info("Logging volume info and Status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed on "
                    "volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Removing files from the mount point when one brick is down
    g.log.info("Removing files from the mount point")
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system
    cmd = "rm -rfv %s/*" % mountpoint
    ret, _, _ = g.run(client, cmd)
    if ret != 0:
        raise ExecutionError("failed to delete the files")

    # Bringing bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Check if bricks are online
    g.log.info("Checking bricks are online or not")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Monitoring heals on the volume
    g.log.info("Wait for heal completion...")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                    "for 20 minutes.")
    g.log.info("self-heal is successful after changing the volume type "
               "from replicated to arbitered volume")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_client_side_quorum_with_fixed_for_cross3(self):
    """
    Test Script to verify the Client Side Quorum with fixed
    for cross 3 volume

    * Disable self heal daemon
    * set cluster.quorum-type to fixed.
    * start I/O (write and read) from the mount point - must succeed
    * Bring down brick1
    * start I/O (write and read) - must succeed
    * Bring down brick2
    * start I/O (write and read) - must succeed
    * set the cluster.quorum-count to 1
    * start I/O (write and read) - must succeed
    * set the cluster.quorum-count to 2
    * start I/O (write and read) - read and write will fail
    * bring back the brick1 online
    * start I/O (write and read) - must succeed
    * Bring back brick2 online
    * start I/O (write and read) - must succeed
    * set cluster.quorum-type to auto
    * start I/O (write and read) - must succeed
    * Bring down brick1 and brick2
    * start I/O (write and read) - read and write will fail
    * set the cluster.quorum-count to 1
    * start I/O (write and read) - read and write will fail
    * set the cluster.quorum-count to 3
    * start I/O (write and read) - read and write will fail
    * set the quorum-type to none
    * start I/O (write and read) - must succeed

    All I/O is driven from the first mount (``self.mounts[0]``).
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    mount = self.mounts[0]

    def _set_options(options):
        """Set the given volume options; fail the test if that fails."""
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "Unable to set %s for volume %s"
                        % (options, self.volname))
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

    def _start_on_mount(cmd):
        """Start cmd asynchronously on the first mount's client.

        Returns a one-element list holding the async proc, the form the
        IO validation helpers are called with.
        """
        proc = g.run_async(mount.client_system, cmd, user=mount.user)
        return [proc]

    def _write_files(base_name):
        """Create 10 files named after base_name; the IO must succeed."""
        g.log.info("Starting IO on mountpoint %s", mount.mountpoint)
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name %s %s"
               % (self.script_upload_path, base_name, mount.mountpoint))
        procs = _start_on_mount(cmd)
        self.assertTrue(
            validate_io_procs(procs, self.mounts),
            "IO failed on mountpoint %s" % mount.mountpoint)

    def _read_files():
        """Read the files on the mount point; the reads must succeed."""
        g.log.info("Start reading files on mountpoint %s",
                   mount.mountpoint)
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  mount.mountpoint)
        procs = _start_on_mount(cmd)
        self.assertTrue(
            validate_io_procs(procs, self.mounts),
            "Reads failed on mountpoint %s" % mount.mountpoint)

    def _assert_io_fails(cmd, action):
        """Run cmd and assert is_io_procs_fail_with_error reports failure.

        action is only used in the log line describing what was attempted.
        """
        procs = _start_on_mount(cmd)
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, procs, self.mounts,
                                             self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while %s", action)

    def _assert_write_fails(file_name):
        """Try to create file_name with dd; the write must fail."""
        g.log.info("Start creating files on mountpoint %s",
                   mount.mountpoint)
        cmd = ("dd if=/dev/urandom of=%s/%s bs=1M count=1"
               % (mount.mountpoint, file_name))
        _assert_io_fails(cmd, "creating files")

    def _assert_read_fails():
        """Try to cat a previously created file; the read must fail."""
        g.log.info("Start reading files on mountpoint %s",
                   mount.mountpoint)
        cmd = "cat %s/file1.txt" % mount.mountpoint
        _assert_io_fails(cmd, "reading file")

    def _bring_down_in_each_subvol(select):
        """Bring offline select(brick_list) for every subvolume.

        Returns the list of selections that were brought offline, in
        subvolume order.
        """
        brought_down = []
        for i, subvol_brick_list in enumerate(
                subvols_dict['volume_subvols']):
            g.log.info("sub-volume %s brick list : %s",
                       i, subvol_brick_list)
            target = select(subvol_brick_list)
            g.log.info("Going to bring down the brick process "
                       "for %s", target)
            ret = bring_bricks_offline(self.volname, target)
            self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                                  "check the log file for more details."))
            g.log.info("Brought down the brick process "
                       "for %s successfully", target)
            brought_down.append(target)
        return brought_down

    def _bring_online(bricks):
        """Bring bricks back online using the glusterd_restart method."""
        g.log.info("bringing up the brick : %s online", bricks)
        ret = bring_bricks_online(
            self.mnode, self.volname, bricks,
            bring_bricks_online_methods='glusterd_restart')
        self.assertTrue(ret, "Failed to brought the brick %s online"
                        % bricks)
        g.log.info("Successfully brought the brick %s online", bricks)

    # Disable self heal daemon
    _set_options({"cluster.self-heal-daemon": "off"})

    # set cluster.quorum-type to fixed
    _set_options({"cluster.quorum-type": "fixed"})

    # start I/O ( write and read ) - must succeed
    _write_files("file")
    _read_files()

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %s",
               self.volname, num_subvols)

    # bring down brick1 for all the subvolumes
    offline_brick1_from_replicasets = _bring_down_in_each_subvol(
        lambda bricks: bricks[0])

    # start I/O ( write and read ) - must succeed
    _write_files("testfile")
    _read_files()

    # bring down brick2 for all the subvolumes
    offline_brick2_from_replicasets = _bring_down_in_each_subvol(
        lambda bricks: bricks[1])

    # start I/O ( write and read ) - must succeed
    _write_files("newfile")
    _read_files()

    # set the cluster.quorum-count to 1
    _set_options({"cluster.quorum-count": "1"})

    # start I/O ( write and read ) - must succeed
    _write_files("filename")
    _read_files()

    # set the cluster.quorum-count to 2
    _set_options({"cluster.quorum-count": "2"})

    # start I/O ( write and read ) - read and write will fail
    _assert_write_fails("test_file")
    _assert_read_fails()

    # bring back the brick1 online for all subvolumes
    _bring_online(offline_brick1_from_replicasets)

    # start I/O ( write and read ) - must succeed
    _write_files("newfilename")
    _read_files()

    # Bring back brick2 online
    _bring_online(offline_brick2_from_replicasets)

    # start I/O ( write and read ) - must succeed
    _write_files("textfile")
    _read_files()

    # set cluster.quorum-type to auto
    _set_options({"cluster.quorum-type": "auto"})

    # start I/O ( write and read ) - must succeed
    _write_files("newtextfile")
    _read_files()

    # bring down brick1 and brick2 for all the subvolumes
    _bring_down_in_each_subvol(lambda bricks: bricks[0:2])

    # start I/O ( write and read ) - read and write will fail
    _assert_write_fails("new_test_file")
    _assert_read_fails()

    # set the cluster.quorum-count to 1
    _set_options({"cluster.quorum-count": "1"})

    # start I/O ( write and read ) - read and write will fail
    _assert_write_fails("new_test_file")
    _assert_read_fails()

    # set the cluster.quorum-count to 3
    _set_options({"cluster.quorum-count": "3"})

    # start I/O ( write and read ) - read and write will fail
    _assert_write_fails("new_test_file")
    _assert_read_fails()

    # set the quorum-type to none
    _set_options({"cluster.quorum-type": "none"})

    # start I/O ( write and read ) - must succeed
    _write_files("lastfile")
    _read_files()
def test_self_heal_when_io_in_progress(self):
    """Test self-heal is successful when IO is in progress.

    Description:
        - simulate brick down.
        - bring bricks online
        - wait for heal to complete
        - validate IO

    This method only reads ``self.all_mounts_procs``; the IO it validates
    must have been started before the method runs.
    """
    # Log Volume Info and Status before simulating brick failure
    g.log.info(
        "Logging volume info and Status before bringing bricks "
        "offlien from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed on "
                    "volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # Materialize the selection: on Python 3 ``filter`` returns a one-shot
    # iterator, which would be empty for every use after the first.
    bricks_to_bring_offline = list(filter(
        None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
               bricks_to_bring_offline_dict['cold_tier_bricks'] +
               bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret,
        "Failed to bring bricks: %s offline" % bricks_to_bring_offline)
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Wait for gluster processes to be offline
    time.sleep(10)

    # Log Volume Info and Status
    g.log.info(
        "Logging volume info and Status after bringing bricks "
        "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed on "
                    "volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list:%s are offline"
                    % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Add delay before bringing bricks online
    time.sleep(40)

    # Bring bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        "Failed to bring bricks: %s online" % bricks_to_bring_offline)
    g.log.info("Successfully brought all bricks:%s online",
               bricks_to_bring_offline)

    # Wait for gluster processes to be online
    time.sleep(10)

    # Log Volume Info and Status
    g.log.info(
        "Logging volume info and Status after bringing bricks "
        "online from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed on "
                    "volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal to complete
    g.log.info("Wait for self-heal to complete")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(
        ret, "Self heal didn't complete even after waiting "
        "for 20 minutes. 20 minutes is too much a time for "
        "current test workload")
    g.log.info("self-heal is successful after replace-brick operation")

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    # Record that IO has been validated before asserting on the result,
    # so the flag is set even when the assertion below fails.
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_data_self_heal_algorithm_diff_heal_command(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "data-self-heal-algorithm": "diff"
    - create IO
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - expand volume by adding bricks to the volume
    - do rebalance
    - set the volume option "self-heal-daemon": "on" and check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - calculate arequal and compare it with the arequal taken before
      bringing the bricks online
    """
    # pylint: disable=too-many-branches,too-many-statements

    # Setting options
    g.log.info('Setting options...')
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "data-self-heal-algorithm": "diff"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    # NOTE: 'self-heal-daemon' is not part of `options` above; it is only
    # switched off further down, after the initial IO has completed.
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "'self-heal-daemon' "
               "are set to 'off',"
               "'data-self-heal-algorithm' "
               "is set to 'diff' successfully")

    # Creating files on client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path, self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s" % (self.script_upload_path,
                                            self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Expand volume by adding bricks to the volume
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    # The message must be a formatted string; passing a (fmt, arg) tuple
    # would print the raw tuple on failure.
    self.assertTrue(ret, "Failed to expand the volume when IO in "
                         "progress on volume %s" % self.volname)
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks online
    # NOTE(review): assertItemsEqual only exists on Python 2's
    # unittest.TestCase (Python 3 renamed it assertCountEqual) - confirm
    # the interpreter this suite targets.
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums are not equal')
    g.log.info('Checksums are equal')
def _perform_brick_ops_and_enable_self_heal(self, op_type):
    '''Refactor of steps common to all tests: Brick down and perform
    metadata/data operations.

    For every online brick except the first one, the brick is brought
    offline, the shell command registered for `op_type` and that brick's
    index is run on the client, and the brick is started again. Once all
    bricks were cycled, pending heals are asserted and the self heal
    daemon is enabled.

    Args:
        op_type (str): Key into `self.op_cmd`; one of 'metadata', 'data',
            'gfid', 'file_type' or 'symlink'.

    Reads `self.mnode`, `self.volname`, `self.client`, `self.fqpath` and
    `self.io_cmd`; (re)assigns `self.op_cmd`. Commands exist only for
    indexes 2-4, i.e. for at most three bricks after the first one.
    '''
    # First brick in the subvol will always be online and used for self
    # heal, so make keys match brick index
    self.op_cmd = {
        # The operation with key `4` in every op_type will be used for
        # final data consistency check

        # Metadata Operations (owner and permission changes)
        'metadata': {
            2: '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
            dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
            3: '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            4: '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
        },
        # Data Operations (append data to the files)
        'data': {
            2: '''cd {0}; for i in `seq 1 3`;
                do {1} 2K >> file.$i;
                for j in `seq 1 3`;
                do {1} 2K >> dir.$i/file.$j; done;
                done;''',
            3: '''cd {0}; for i in `seq 1 3`;
                do {1} 3K >> file.$i;
                for j in `seq 1 3`;
                do {1} 3K >> dir.$i/file.$j; done;
                done;''',
            4: '''cd {0}; for i in `seq 1 6`;
                do {1} 4K >> file.$i;
                for j in `seq 1 6`;
                do {1} 4K >> dir.$i/file.$j; done;
                done;''',
        },
        # Create files and directories when brick is down with no
        # initial IO
        'gfid': {
            2: '''cd {0}; for i in `seq 1 3`;
                do {1} 2K > file.2.$i; mkdir dir.2.$i;
                for j in `seq 1 3`;
                do {1} 2K > dir.2.$i/file.2.$j; done;
                done;''',
            3: '''cd {0}; for i in `seq 1 3`;
                do {1} 2K > file.3.$i; mkdir dir.3.$i;
                for j in `seq 1 3`;
                do {1} 2K > dir.3.$i/file.3.$j; done;
                done;''',
            4: '''cd {0}; for i in `seq 4 6`;
                do {1} 2K > file.$i; mkdir dir.$i;
                for j in `seq 4 6`;
                do {1} 2K > dir.$i/file.$j; done;
                done;''',
        },
        # Create different file type with same name while a brick was down
        # with no initial IO and validate failure
        'file_type': {
            2: 'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
            3: 'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
            4: '''cd {0}; for i in `seq 1 6`;
                do {1} 2K > file.$i;
                for j in `seq 1 6`;
                do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
                done;''',
        },
        # Create symlinks for files and directories while a brick was down
        # Out of 6 files, 6 dirs and 6 files in each dir, symlink
        # outer 2 files, inner 2 files in each dir, 2 dirs and
        # verify it's a symlink(-L) and linking file exists(-e)
        'symlink': {
            2: '''cd {0}; for i in `seq 1 2`;
                do ln -sr file.$i sl_file.2.$i;
                [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
                for j in `seq 1 2`;
                do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
                [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] \
                || exit -1; done;
                for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
                [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
                done;''',
            3: '''cd {0}; for i in `seq 1 2`;
                do ln -sr file.$i sl_file.3.$i;
                [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
                for j in `seq 1 2`;
                do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
                [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] \
                || exit -1; done;
                for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
                [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
                done;''',
            4: '''cd {0}; ln -sr dir.4 sl_dir_new.4;
                mkdir sl_dir_new.4/dir.1;
                {1} 4K >> sl_dir_new.4/dir.1/test_file;
                {1} 4K >> sl_dir_new.4/test_file;
                ''',
        },
    }
    bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks,
                         'Not able to get list of bricks in the volume')

    # Make first brick always online and start operations from second brick
    for index, brick in enumerate(bricks[1:], start=2):

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, brick)
        self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
        self.assertTrue(
            are_bricks_offline(self.mnode, self.volname, [brick]),
            'Brick {} is not offline'.format(brick))

        # Perform file/dir operation
        cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        if op_type == 'file_type' and index == 3:
            # Should fail with ENOTCONN as one brick is down, lookup can't
            # happen and quorum is not met
            self.assertNotEqual(
                ret, 0, '{0} should fail as lookup fails, quorum is not '
                'met'.format(cmd))
            self.assertIn(
                'Transport', err, '{0} should fail with ENOTCONN '
                'error'.format(cmd))
        else:
            self.assertEqual(ret, 0,
                             '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Bring brick online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            brick,
            bring_bricks_online_methods='volume_start_force')
        # Check the helper's own verdict instead of silently dropping it
        self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
        self.assertTrue(
            are_bricks_online(self.mnode, self.volname, [brick]),
            'Brick {} is not online'.format(brick))

    # Assert metadata/data operations resulted in pending heals
    self.assertFalse(is_heal_complete(self.mnode, self.volname))

    # Enable and wait self heal daemon to be online
    self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                    'Enabling self heal daemon failed')
    self.assertTrue(
        wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
        'Not all self heal daemons are online')
def test_gfid_split_brain_resolution(self):
    """
    Description: Simulates gfid split brain on multiple files in a dir and
    resolve them via `bigger-file`, `mtime` and `source-brick` methods

    Steps:
    - Create and mount a replicated volume, create a dir and ~10 data files
    - Simulate gfid splits in 9 of the files
    - Resolve each 3 set of files using `bigger-file`, `mtime` and
      `source-bricks` split-brain resolution methods
    - Trigger and monitor for heal completion
    - Validate all the files are healed and arequal matches for bricks in
      subvols
    """
    io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
    client, m_point = (self.mounts[0].client_system,
                       self.mounts[0].mountpoint)
    arbiter = self.volume_type.find('arbiter') >= 0

    # Disable self-heal daemon and set `quorum-type` option to `none`
    ret = set_volume_options(self.mnode, self.volname, {
        'self-heal-daemon': 'off',
        'cluster.quorum-type': 'none'
    })
    self.assertTrue(
        ret, 'Not able to disable `quorum-type` and '
        '`self-heal` daemon volume options')

    # Create required dir and files from the mount
    split_dir = 'gfid_split_dir'
    file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
               ' 1M > %s/file$i; done;')
    ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
    self.assertTrue(ret, 'Unable to create a directory from mount point')
    ret, _, _ = g.run(client, file_io % (m_point, split_dir))
    # Every later step depends on these files, so fail right here if the
    # initial IO did not succeed.
    self.assertEqual(
        ret, 0, 'Unable to create files under {} dir'.format(split_dir))

    # `file{4,5,6}` are re-created every time to be used in `bigger-file`
    # resolution method
    cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
    split_cmds = {
        1:
        ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
        2:
        ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
        3: ';'.join(
            cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
        4: ';'.join(
            cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
    }

    # Get subvols and simulate entry split brain
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    self.assertTrue(subvols, 'Not able to get list of subvols')
    msg = ('Unable to bring files under {} dir to entry split brain while '
           '{} are down')
    for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                   1):
        # Bring down two bricks from each subvol
        ret = bring_bricks_offline(self.volname, list(bricks))
        self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

        ret, _, _ = g.run(client,
                          'cd {}; {}'.format(m_point, split_cmds[index]))
        self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bricks will be brought down only two times in case of arbiter and
        # bringing remaining files into split brain for `latest-mtime` heal
        if arbiter and index == 2:
            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[4]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bring offline bricks online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            bricks,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

    # Enable self-heal daemon, trigger heal and assert volume is in split
    # brain condition
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, 'Not all self heal daemons are online')

    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger index heal on the volume')

    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, 'Volume should be in split brain condition')

    # Select source brick and take note of files in source brick
    # (the last brick of each subvol is left out for arbiter volumes)
    stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
    source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
    files = [
        self._get_files_in_brick(path, split_dir) for path in source_bricks
    ]

    # Resolve `file1, file2, file3` gfid split files using `source-brick`
    cmd = ('gluster volume heal ' + self.volname + ' split-brain '
           'source-brick {} /' + split_dir + '/{}')
    for index, source_brick in enumerate(source_bricks):
        for each_file in files[index]:
            run_cmd = cmd.format(source_brick, each_file)
            self._run_cmd_and_assert(run_cmd)

    # Resolve `file4, file5, file6` gfid split files using `bigger-file`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain bigger-file /' + split_dir + '/{}')
    for each_file in ('file4', 'file5', 'file6'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain latest-mtime /' + split_dir + '/{}')
    for each_file in ('file7', 'file8', 'file9'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Unless `shd` is triggered manually/automatically files will still
    # appear in `heal info`
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full self heal')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(
        ret, 'All files in volume should be healed after healing files via'
        ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

    # Validate normal file `file10` and healed files don't differ in
    # subvols via an `arequal`
    for subvol in subvols:
        # Disregard last brick if volume is of arbiter type
        ret, arequal = collect_bricks_arequal(subvol[0:stop])
        self.assertTrue(
            ret, 'Unable to get `arequal` checksum on '
            '{}'.format(subvol[0:stop]))
        self.assertEqual(
            len(set(arequal)), 1, 'Mismatch of `arequal` '
            'checksum among {} is identified'.format(subvol[0:stop]))

    g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
               '`bigger-file` and `latest-mtime` methods is complete')
def test_metadata_self_heal(self):
    """
    Test MetaData Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - set the volume option
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Change the permissions, ownership and the group
      of the files under "test_meta_data_self_heal" folder
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option
      "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - check that permissions are '444' and that group and user are 'qa'
      on every brick and on the mount
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off' successfully")

    # Creating files on client side
    all_mounts_procs = []
    test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Create files
    g.log.info('Creating files...')
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 50` ; "
               "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder,
                  test_meta_data_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Changing the permissions, ownership and the group
    # of the files under "test_meta_data_self_heal" folder
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Change permissions to 444
    g.log.info('Changing permissions...')
    command = ("cd %s/%s/ ; "
               "chmod -R 444 *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Permissions are changed successfully')

    # Change the ownership to qa
    g.log.info('Changing the ownership...')
    command = ("cd %s/%s/ ; "
               "chown -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Ownership is changed successfully')

    # Change the group to qa
    g.log.info('Changing the group...')
    command = ("cd %s/%s/ ; "
               "chgrp -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Group is changed successfully')

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, "Volume process %s not online "
                         "despite waiting for 5 minutes" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    # NOTE(review): assertItemsEqual only exists on Python 2's
    # unittest.TestCase (Python 3 renamed it assertCountEqual) - confirm
    # the interpreter this suite targets.
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')

    # Collect every (node, path) pair to check: all bricks plus the mount.
    # A list of pairs is used instead of a dict keyed by node so that a
    # node hosting several bricks (or a client that is also a server)
    # does not lose all but its last path.
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, 'Failed to get the bricks list')
    paths_to_check = [tuple(brick.split(':')) for brick in all_bricks]
    paths_to_check.append((self.mounts[0].client_system,
                           self.mounts[0].mountpoint))

    # Checking for user and group
    for node, base_path in paths_to_check:
        # Get file list
        command = ("cd %s/%s/ ; "
                   "ls"
                   % (base_path,
                      test_meta_data_self_heal_folder))
        _, out, _ = g.run(node, command)
        file_list = out.split()

        for file_name in file_list:
            file_to_check = '%s/%s/%s' % (base_path,
                                          test_meta_data_self_heal_folder,
                                          file_name)

            g.log.info('Checking for permissions, user and group for %s',
                       file_name)

            # Check for permissions
            cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                   .format(file_to_check))
            _, permissions, _ = g.run(node, cmd)
            self.assertEqual(permissions.split('\n')[0], '444',
                             'Permissions %s is not equal to 444'
                             % permissions)
            g.log.info("Permissions are '444' for %s", file_name)

            # Check for user
            cmd = ("ls -ld {} | awk '{{print $3}}'"
                   .format(file_to_check))
            _, username, _ = g.run(node, cmd)
            self.assertEqual(username.split('\n')[0],
                             'qa', 'User %s is not equal qa'
                             % username)
            g.log.info("User is 'qa' for %s", file_name)

            # Check for group
            cmd = ("ls -ld {} | awk '{{print $4}}'"
                   .format(file_to_check))
            _, groupname, _ = g.run(node, cmd)
            self.assertEqual(groupname.split('\n')[0],
                             'qa', 'Group %s is not equal qa'
                             % groupname)
            g.log.info("Group is 'qa' for %s", file_name)
def test_heal_client_io_hang(self):
    """
    Verify client IO completes while files are healed from the client.

    Steps:
    - Disable heal on the volume via `disable_heal`
    - Create 100 empty files under `test/` on the first mount
    - Bring the first brick of the volume offline
    - Overwrite all files with 5M of random data while the brick is down
    - Bring the brick back online and verify all bricks are online
    - Append to and read all files from the client to start client side
      heal
    - Check heal info and completion via `ec_check_heal_comp`
    """
    mountpoint = self.mounts[0].mountpoint

    # disable server side heal
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to disable server side heal"))
    g.log.info("Successfully disabled server side heal")

    # Log Volume Info and Status after disabling server side heal
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, "Logging volume info and status failed "
                         "on volume %s" % self.volname)

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

    # Create files
    cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
           "do touch file$i; done" % mountpoint)

    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating files while all the bricks are UP')

    # Bring bricks offline
    ret = bring_bricks_offline(self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    # Start pumping IO from client
    # (`mkdir test` fails here because the dir exists; the commands are
    # chained with `;`, so the exit status is that of the final loop)
    cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
           "do dd if=/dev/urandom of=file$i bs=1M "
           "count=5;done" % mountpoint)

    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished writing on files while a brick is DOWN')

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")

    # Verifying all bricks online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "All bricks are not online")

    # Start client side heal by reading/writing files.
    appendcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                 "do dd if=/dev/urandom of=file$i bs=1M "
                 "count=1 oflag=append conv=notrunc;done" % mountpoint)

    readcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do dd if=file$i of=/dev/zero bs=1M "
               "count=5;done" % mountpoint)

    ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished append on files after bringing bricks online')

    ret, _, err = g.run(self.mounts[0].client_system, readcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished read on files after bringing bricks online')

    # check the heal info and completion
    ec_check_heal_comp(self)

    # Log Volume Info and Status after bringing the brick up
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed "
                         "on volume %s" % self.volname)
    g.log.info(
        "Successful in logging volume info and status "
        "of volume %s", self.volname)
def test_data_self_heal_algorithm_full_default(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

    Description:
    - set the volume option "data-self-heal-algorithm" to value "full"
    - create IO
    - bring down all bricks processes from selected set
    - modify the data
    - calculate arequal
    - bring bricks online
    - wait for volume processes and self-heal daemons to be online
    - monitor heal completion and check for split-brain
    - calculate arequal and compare it with the arequal taken before
      bringing the bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Setting options
    g.log.info('Setting options "data-self-heal-algorithm": "full"...')
    options = {"data-self-heal-algorithm": "full"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
               "successfully")

    # Creating files on client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path, self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s" % (self.script_upload_path,
                                            self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    # NOTE(review): assertItemsEqual only exists on Python 2's
    # unittest.TestCase (Python 3 renamed it assertCountEqual) - confirm
    # the interpreter this suite targets.
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')
def test_heal_info_should_have_fixed_fields(self):
    """Verify that heal info summary reports the expected fields per brick.

    Steps:
    - Create IO
    - While IO is creating - bring down a couple of bricks
    - Wait for IO to complete
    - Bring up the down bricks
    - Wait for heal to complete
    - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
    """
    # Creating files on client side (asynchronously, one job per mount)
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -f 50 %s" % (self.script_upload_path,
                                           mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    # IO is in flight and has not been validated yet
    self.io_validation_complete = False

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    # Merge the per-category brick lists, dropping empty/None entries
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline while the IO started above is still running
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get heal info
    g.log.info('Getting heal info...')
    heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
    # Validate the value that was just fetched. The previous check asserted
    # on the stale split-brain result ('ret'), so it could never fail.
    self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
    g.log.info(heal_info_dicts)

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')

    # Check all fields in heal info dict
    g.log.info('Checking for all the fields in heal info...')
    for brick in bricks_list:
        g.log.info('Checking fields for %s', brick)
        # Fail with a clear message (instead of a KeyError) when the brick
        # or one of the expected fields is absent from the summary.
        self.assertIn(brick, heal_info_dicts,
                      'Heal info has no entry for brick %s' % brick)
        brick_info = heal_info_dicts[brick]
        for field in ('status', 'numberOfEntries'):
            self.assertIn(field, brick_info,
                          'Field %s is missing in heal info for brick %s'
                          % (field, brick))
        self.assertEqual(brick_info['status'], 'Connected',
                         'Status is not Connected for brick %s' % brick)
        self.assertEqual(brick_info['numberOfEntries'], '0',
                         'numberOfEntries is not 0 for brick %s' % brick)

    g.log.info('Successfully checked for all the fields in heal info')