def _bring_bricks_online_heal(self, mnode, volname, bricks_list):
    """Bring the given bricks back online, wait for all volume
    processes, then confirm heal finishes without split-brain."""
    # Restart the bricks via a forced volume start
    self.assertTrue(
        bring_bricks_online(
            mnode, volname, bricks_list,
            bring_bricks_online_methods=['volume_start_force']),
        'Failed to bring bricks online')

    # Volume processes must come up before heal can be monitored
    self.assertTrue(
        wait_for_volume_process_to_be_online(mnode, volname),
        "Failed to wait for volume {} processes to "
        "be online".format(volname))

    self.assertTrue(
        verify_all_process_of_volume_are_online(mnode, volname),
        "Volume {} : All process are not online".format(volname))
    g.log.info("Volume %s : All process are online", volname)

    # Heal must complete and leave no files in split-brain
    self.assertTrue(monitor_heal_completion(mnode, volname),
                    'Heal has not yet completed')
    self.assertFalse(is_volume_in_split_brain(mnode, volname),
                     'Volume is in split-brain state')
def _validate_heal_completion_and_arequal(self, op_type):
    '''Shared validation steps: wait for self-heal, check for
    split-brain, verify arequal, run extra IO for `op_type` and
    verify arequal again.'''
    # Heal must finish cleanly before checksums are compared
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    'Self heal is not completed within timeout')
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     'Volume is in split brain even after heal completion')

    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    self.assertTrue(subvols, 'Not able to get list of subvols')

    # Arbiter bricks hold no data, so skip the last brick of each
    # subvol in the arequal comparison on arbiter volumes
    is_arbiter = 'arbiter' in self.volume_type
    stop = len(subvols[0]) - (1 if is_arbiter else 0)

    # Validate arequal
    self._validate_arequal_and_perform_lookup(subvols, stop)

    # Perform the additional metadata/data operation for this op_type
    cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
    rcode, _, err = g.run(self.client, cmd)
    failure_msg = '{0} failed with {1}'.format(cmd, err)
    self.assertEqual(rcode, 0, failure_msg)
    self.assertFalse(err, failure_msg)

    # Checksums must still match after the extra operations
    self._validate_arequal_and_perform_lookup(subvols, stop)
def test_replica_to_arbiter_volume_with_io(self):
    """
    Description: Replica 3 to arbiter conversion with ongoing IO's

    Steps :
    1) Create a replica 3 volume and start volume.
    2) Set client side self heal off.
    3) Fuse mount the volume.
    4) Create directory dir1 and write data.
       Example: untar linux tar from the client into the dir1
    5) When IO's is running, execute remove-brick command,
       and convert replica 3 to replica 2 volume
    6) Execute add-brick command and convert to arbiter volume,
       provide the path of new arbiter brick.
    7) Issue gluster volume heal.
    8) Heal should be completed with no files in split-brain.
    """
    # pylint: disable=too-many-statements
    # Directory that will hold the kernel-untar workload
    self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                          "linuxuntar")
    self.assertTrue(mkdir(self.clients[0], self.linux_untar_dir),
                    "Failed to create dir linuxuntar for untar")

    # Kick off the untar in the background so IO overlaps conversion
    self.io_process = run_linux_untar(self.clients[0],
                                      self.mounts[0].mountpoint,
                                      dirs=('linuxuntar',))
    self.is_io_running = True

    # Convert replicated to arbiter volume while IO is in progress
    self._convert_replicated_to_arbiter_volume()

    # Wait for IO to complete
    self.assertFalse(self._wait_for_untar_completion(),
                     "IO didn't complete or failed on client")
    self.is_io_running = False

    # Start healing
    self.assertTrue(trigger_heal(self.mnode, self.volname),
                    'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion; the untar workload leaves a large
    # backlog, hence the extended 3600s timeout
    self.assertTrue(
        monitor_heal_completion(self.mnode, self.volname,
                                timeout_period=3600),
        'Heal has not yet completed')

    # Check if heal is completed
    self.assertTrue(is_heal_complete(self.mnode, self.volname),
                    'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def _check_heal_is_completed_and_not_in_split_brain(self):
    """Assert that heal has finished and the volume has no
    files in split-brain."""
    # Check if heal is completed
    self.assertTrue(is_heal_complete(self.mnode, self.volname),
                    'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check if volume is in split brain or not
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def test_heal_info_should_have_fixed_fields(self):
    """
    - Create IO
    - While IO is creating - bring down a couple of bricks
    - Wait for IO to complete
    - Bring up the down bricks
    - Wait for heal to complete
    - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
    """
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -f 50 %s" % (self.script_upload_path,
                                           mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    # IO stays "not validated" until validate_io_procs() below passes
    self.io_validation_complete = False

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get heal info
    g.log.info('Getting heal info...')
    heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
    # BUG FIX: the original asserted on the stale 'ret' from the
    # split-brain check; validate the heal-info result itself instead
    self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
    g.log.info(heal_info_dicts)

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')

    # Check all fields in heal info dict
    g.log.info('Checking for all the fields in heal info...')
    for brick in bricks_list:
        g.log.info('Checking fields for %s', brick)
        self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                         'Status is not Connected for brick %s' % brick)
        self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                         'numberOfEntries is not 0 for brick %s' % brick)
    g.log.info('Successfully checked for all the fields in heal info')
def test_metadata_self_heal(self):
    """
    Test MetaData Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - set the volume option
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Change the permissions, ownership and the group
      of the files under "test_meta_data_self_heal" folder
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option
      "self-heal-daemon": "on"
    - check daemons and start healing
    - check is heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - check group and user are 'qa'
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Setting options: disable client-side heals so the metadata
    # changes stay unhealed until the self-heal daemon repairs them
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off' successfully")

    # Creating files on client side
    all_mounts_procs = []
    test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Create 50 files of 10k random data under the test folder
    g.log.info('Creating files...')
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 50` ; "
               "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder,
                  test_meta_data_self_heal_folder))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Turn the self-heal daemon off before killing bricks
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Changing the permissions, ownership and the group
    # of the files under "test_meta_data_self_heal" folder
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Change permissions to 444
    g.log.info('Changing permissions...')
    command = ("cd %s/%s/ ; "
               "chmod -R 444 *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Permissions are changed successfully')

    # Change the ownership to qa
    g.log.info('Changing the ownership...')
    command = ("cd %s/%s/ ; "
               "chown -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Ownership is changed successfully')

    # Change the group to qa
    g.log.info('Changing the group...')
    command = ("cd %s/%s/ ; "
               "chgrp -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Group is changed successfully')

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # BUG FIX: message was a (format, args) tuple; format it properly
    self.assertTrue(ret, "Volume process %s not online "
                         "despite waiting for 5 minutes" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online.
    # BUG FIX: assertItemsEqual is Python 2-only (removed in Python 3);
    # the per-mount checksum lists are ordered, so exact equality applies
    self.assertEqual(result_before_online, result_after_online,
                     'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')

    # Adding servers and client in single dict to check permissions
    nodes_to_check = {}
    all_bricks = get_all_bricks(self.mnode, self.volname)
    for brick in all_bricks:
        node, brick_path = brick.split(':')
        nodes_to_check[node] = brick_path
    nodes_to_check[self.mounts[0].client_system] = \
        self.mounts[0].mountpoint

    # Checking for user and group
    for node in nodes_to_check:
        # Get file list
        command = ("cd %s/%s/ ; "
                   "ls"
                   % (nodes_to_check[node],
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(node, command)
        # ROBUSTNESS: fail early if the listing itself failed
        self.assertEqual(ret, 0, err)
        file_list = out.split()

        for file_name in file_list:
            file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                          test_meta_data_self_heal_folder,
                                          file_name)

            g.log.info('Checking for permissions, user and group for %s',
                       file_name)

            # Check for permissions
            cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                   .format(file_to_check))
            ret, permissions, _ = g.run(node, cmd)
            self.assertEqual(ret, 0, 'Failed to get permissions of %s'
                             % file_to_check)
            self.assertEqual(permissions.split('\n')[0], '444',
                             'Permissions %s is not equal to 444'
                             % permissions)
            g.log.info("Permissions are '444' for %s", file_name)

            # Check for user
            cmd = ("ls -ld {} | awk '{{print $3}}'"
                   .format(file_to_check))
            ret, username, _ = g.run(node, cmd)
            self.assertEqual(ret, 0, 'Failed to get owner of %s'
                             % file_to_check)
            self.assertEqual(username.split('\n')[0], 'qa',
                             'User %s is not equal qa' % username)
            g.log.info("User is 'qa' for %s", file_name)

            # Check for group
            cmd = ("ls -ld {} | awk '{{print $4}}'"
                   .format(file_to_check))
            ret, groupname, _ = g.run(node, cmd)
            self.assertEqual(ret, 0, 'Failed to get group of %s'
                             % file_to_check)
            self.assertEqual(groupname.split('\n')[0], 'qa',
                             'Group %s is not equal qa' % groupname)
            g.log.info("Group is 'qa' for %s", file_name)
def test_replace_brick_self_heal_io_in_progress(self):
    """
    - Create directory on mount point and write files/dirs
    - Create another set of files (1K files)
    - While creation of files/dirs are in progress Kill one brick
    - Remove the contents of the killed brick(simulating disk replacement)
    - When the IO's are still in progress, restart glusterd on the nodes
      where we simulated disk replacement to bring back bricks online
    - Start volume heal
    - Wait for IO's to complete
    - Verify whether the files are self-healed
    - Calculate arequals of the mount point and all the bricks
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches

    # Create dirs with files
    g.log.info('Creating dirs with file...')
    command = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "-d 2 -l 2 -n 2 -f 10 %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, command,
                        user=self.mounts[0].user)
    self.assertFalse(ret, err)
    g.log.info("IO is successful")

    # Creating another set of files (1500 files of 10k each)
    self.all_mounts_procs = []

    # Create dirs with files
    g.log.info('Creating 1K files...')
    command = ("/usr/bin/env python %s create_files "
               "-f 1500 --fixed-file-size 10k %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    # BUG FIX: pass the mounts as a list, consistent with every other
    # validate_io_procs() call in this file (a bare mount object was
    # passed before)
    ret = validate_io_procs(self.all_mounts_procs, [self.mounts[0]])
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Remove the content of the killed bricks (simulates replacing
    # the disk behind the brick)
    for brick in bricks_to_bring_offline:
        brick_node, brick_path = brick.split(':')

        # Removing files
        command = ('cd %s ; rm -rf *' % brick_path)
        ret, _, err = g.run(brick_node, command)
        self.assertFalse(ret, err)
        g.log.info('Files are deleted on brick %s', brick)

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # BUG FIX: message was a (format, args) tuple; format it properly
    self.assertTrue(ret, "Failed to wait for volume %s processes to "
                         "be online" % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal daemons are online")

    # Start healing - full heal, since the emptied brick must be
    # repopulated from scratch
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Check arequals for "replicated"
    all_bricks = get_all_bricks(self.mnode, self.volname)
    if self.volume_type == "replicated":
        # Get arequal after bricks are online
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        # BUG FIX: missing space between the concatenated log literals
        g.log.info('Getting arequal after successfully bringing '
                   'bricks online.')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        ret, arequals = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and brick '
                             'are not equal')
            g.log.info('Arequals for mountpoint and brick are equal')

    # Check arequals for "distributed-replicated"
    if self.volume_type == "distributed-replicated":
        # Get the subvolumes
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # Get arequals and compare within each subvolume
        for i in range(0, num_subvols):
            # Get arequal for first brick
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
            self.assertTrue(ret, 'Failed to get arequal on first brick')
            first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with first brick
            ret, arequals = collect_bricks_arequal(subvol_brick_list)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(first_brick_total, brick_total,
                                 'Arequals for subvol and brick are '
                                 'not equal')
                g.log.info('Arequals for subvol and brick are equal')
def test_manual_heal_should_trigger_heal(self):
    """
    - create a single brick volume
    - add some files and directories
    - get arequal from mountpoint
    - add-brick such that this brick makes the volume a replica vol 1x2
    - start heal
    - make sure heal is completed
    - get arequals from all bricks and compare with arequal from
      mountpoint
    """
    # pylint: disable=too-many-statements,too-many-locals
    # Start IO on mounts
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        # CONSISTENCY FIX: use '/usr/bin/env python' like the other
        # IO commands in this file (was a bare 'python')
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dir-length 1 "
               "--dir-depth 1 "
               "--max-num-of-dirs 1 "
               "--num-of-files 10 %s" % (self.script_upload_path,
                                         mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   mount_obj.client_system, mount_obj.mountpoint)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal for mount before adding bricks
    g.log.info('Getting arequal before adding bricks...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    # BUG FIX: log message wrongly said "after healing" here
    g.log.info('Getting arequal before adding bricks is successful')
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Form brick list to add
    g.log.info('Forming brick list to add...')
    bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                     self.servers,
                                     self.all_servers_info)
    g.log.info('Brick list to add: %s', bricks_to_add)

    # Add bricks, converting the volume into a 1x2 replica
    g.log.info("Start adding bricks to volume...")
    ret, _, _ = add_brick(self.mnode, self.volname, bricks_to_add,
                          force=True, replica_count=2)
    self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
    g.log.info("Adding bricks is successful on volume %s", self.volname)

    # Make sure the newly added bricks are available in the volume
    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)
    for brick in bricks_to_add:
        self.assertIn(brick, bricks_list,
                      'Brick %s is not in brick list' % brick)
    g.log.info('New bricks are present in the volume')

    # Make sure volume change from distribute to replicate volume
    vol_info_dict = get_volume_type_info(self.mnode, self.volname)
    vol_type = vol_info_dict['volume_type_info']['typeStr']
    self.assertEqual('Replicate', vol_type,
                     'Volume type is not converted to Replicate '
                     'after adding bricks')
    g.log.info('Volume type is successfully converted to Replicate '
               'after adding bricks')

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    g.log.info('Getting arequal on bricks...')
    arequals_after_heal = {}
    for brick in bricks_list:
        g.log.info('Getting arequal on bricks %s...', brick)
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_after_heal[brick] = brick_total
        self.assertEqual(
            mount_point_total, brick_total,
            'Arequals for mountpoint and %s are not equal' % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)
    g.log.info('All arequals are equal for replicated')
def test_afr_heal_with_brickdown_hardlink(self):
    """
    Steps:
    1. On a 2 x 3 distribute-replicate volume disable all client-side
       heals and the self-heal daemon.
    2. Create a file FILE and hardlinks HLINK1..HLINK3 to it from the
       fuse mount.
    3. For each hardlink, bring one brick of the second replica set
       (bricks_list[3], [4], [5]) offline, rename the hardlink, and
       bring the brick back - a different brick is down each time.
    4. Re-enable the self-heal daemon and trigger heal.
    5. Heal should complete without split-brains.
    """
    bricks_list = get_all_bricks(self.mnode, self.volname)

    # Disable every heal path so the renames accumulate pending heals
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "self-heal-daemon": "off"
    }
    g.log.info("setting options %s", options)
    self.assertTrue(
        set_volume_options(self.mnode, self.volname, options),
        ("Unable to set volume option %s forvolume %s"
         % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Seed file on the mount
    cmd = ("touch %s/FILE" % self.mounts[0].mountpoint)
    retcode, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(retcode, 0, "file creation failed")

    # Creating three hardlinks to the seed file
    for link_num in range(1, 4):
        created = create_link_file(
            self.clients[0],
            '{}/FILE'.format(self.mounts[0].mountpoint),
            '{}/HLINK{}'.format(self.mounts[0].mountpoint, link_num))
        self.assertTrue(created, "Unable to create hard link file ")

    # Rename each hardlink while a different brick of the second
    # replica set is offline, then bring that brick back online
    for offset, link_name in enumerate(("HLINK1", "HLINK2", "HLINK3")):
        self._test_brick_down_with_file_rename(
            link_name, "NEW-" + link_name, bricks_list[3 + offset])

    # Re-enable the self-heal daemon
    options = {"self-heal-daemon": "on"}
    self.assertTrue(set_volume_options(self.mnode, self.volname, options),
                    'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Start healing
    self.assertTrue(trigger_heal(self.mnode, self.volname),
                    'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    'Heal has not yet completed')

    # Check if heal is completed
    self.assertTrue(is_heal_complete(self.mnode, self.volname),
                    'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Mount must still be listable after heal
    cmd = ("ls %s" % (self.mounts[0].mountpoint))
    retcode, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(retcode, 0, "failed to fetch data from mount point")
def test_handling_data_split_brain(self):
    """
    - create IO
    - calculate arequal from mountpoint
    - set volume option 'self-heal-daemon' to value "off"
    - kill data brick1
    - calculate arequal checksum and compare it
    - modify files and directories
    - bring back all bricks processes online
    - kill data brick3
    - modify files and directories
    - calculate arequal from mountpoint
    - bring back all bricks processes online
    - run the find command to trigger heal from mountpoint
    - set volume option 'self-heal-daemon' to value "on"
    - check if heal is completed
    - check for split-brain
    - read files
    - calculate arequal checksum and compare it
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do mkdir dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1K count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Disable the self-heal daemon so writes done while a brick is down
    # are not healed automatically
    options = {"self-heal-daemon": "off"}
    g.log.info('Setting options %s for volume %s', options, self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)

    # Bring brick 1 offline
    bricks_to_bring_offline = [bricks_list[0]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Comparing arequals before getting bricks offline
    # and after getting bricks offline: killing a brick must not change
    # what the client sees
    self.assertEqual(result_before_offline, result_after_offline,
                     'Arequals before getting bricks offline '
                     'and after getting bricks offline are not equal')
    g.log.info('Arequals before getting bricks offline '
               'and after getting bricks offline are equal')

    # Modify the data while brick 1 is down
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Modify files
        g.log.info('Modifying files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1M count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring 1-st brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Bring brick 3rd offline (last brick of the replica set)
    bricks_to_bring_offline = [bricks_list[-1]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data again while brick 3 is down, so the two bricks
    # hold divergent data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Modifying files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1M count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring 3rd brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Mount and unmount mounts to drop client caches before triggering
    # heal from the mount
    ret = self.unmount_volume(self.mounts)
    self.assertTrue(ret, 'Failed to unmount %s' % self.volname)
    ret = self.mount_volume(self.mounts)
    self.assertTrue(ret, 'Unable to mount %s' % self.volname)

    # Start heal from mount point by reading all the files
    g.log.info('Starting heal from mount point...')
    for mount_obj in self.mounts:
        g.log.info("Start heal for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # BUG FIX: read from the mount the command runs on; previously
        # self.mounts[0].mountpoint was used for every client
        command = "/usr/bin/env python %s read %s" % (
            self.script_upload_path,
            mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Heal triggered for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Heal triggered for all mountpoints')

    # Enable self-heal daemon
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    # BUG FIX: assertion message now describes the failure case instead
    # of the success case
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Reading files to make sure all files are accessible after heal
    g.log.info('Reading files...')
    for mount_obj in self.mounts:
        g.log.info("Start reading files for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        command = ('cd %s/ ; '
                   'for i in `seq 1 10` ; '
                   'do cat file.$i > /dev/null ; '
                   'for j in `seq 1 5` ; '
                   'do cat dir.$i/file.$j > /dev/null ; '
                   'done ; done'
                   % mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Reading files successfully for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Reading files successfully for all mountpoints')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Comparing arequals before getting bricks online
    # and after getting bricks online: heal must reproduce the latest
    # data everywhere
    self.assertEqual(result_before_online, result_after_online,
                     'Arequals before getting bricks online '
                     'and after getting bricks online are not equal')
    g.log.info('Arequals before getting bricks online '
               'and after getting bricks online are equal')
def test_gfid_split_brain_resolution(self):
    """
    Description: Simulates gfid split brain on multiple files in a dir and
    resolve them via `bigger-file`, `mtime` and `source-brick` methods

    Steps:
    - Create and mount a replicated volume, create a dir and ~10 data files
    - Simulate gfid splits in 9 of the files
    - Resolve each 3 set of files using `bigger-file`, `mtime` and
      `source-bricks` split-brain resolution methods
    - Trigger and monitor for heal completion
    - Validate all the files are healed and arequal matches for bricks in
      subvols
    """
    io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
    client, m_point = (self.mounts[0].client_system,
                       self.mounts[0].mountpoint)
    arbiter = self.volume_type.find('arbiter') >= 0

    # Disable self-heal daemon and set `quorum-type` option to `none`
    # so writes succeed with a majority of bricks down
    ret = set_volume_options(self.mnode, self.volname, {
        'self-heal-daemon': 'off',
        'cluster.quorum-type': 'none'
    })
    self.assertTrue(
        ret, 'Not able to disable `quorum-type` and '
        '`self-heal` daemon volume options')

    # Create required dir and files from the mount
    split_dir = 'gfid_split_dir'
    file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
               ' 1M > %s/file$i; done;')
    ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
    self.assertTrue(ret, 'Unable to create a directory from mount point')
    ret, _, _ = g.run(client, file_io % (m_point, split_dir))
    # BUG FIX: assert the data files were actually created; the return
    # code of this command was previously ignored
    self.assertEqual(ret, 0,
                     'Unable to create data files from mount point')

    # `file{4,5,6}` are re-created every time to be used in `bigger-file`
    # resolution method
    cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
    split_cmds = {
        1:
        ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
        2:
        ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
        3:
        ';'.join(
            cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
        4:
        ';'.join(
            cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
    }

    # Get subvols and simulate entry split brain
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    self.assertTrue(subvols, 'Not able to get list of subvols')
    msg = ('Unable to bring files under {} dir to entry split brain while '
           '{} are down')
    for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                   1):
        # Bring down two bricks from each subvol and re-create the files
        # so each side ends up with a different gfid
        ret = bring_bricks_offline(self.volname, list(bricks))
        self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

        ret, _, _ = g.run(client,
                          'cd {}; {}'.format(m_point, split_cmds[index]))
        self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bricks will be brought down only two times in case of arbiter
        # and bringing remaining files into split brain for `latest-mtime`
        # heal
        if arbiter and index == 2:
            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[4]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bring offline bricks online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            bricks,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

    # Enable self-heal daemon, trigger heal and assert volume is in split
    # brain condition
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, 'Not all self heal daemons are online')

    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger index heal on the volume')

    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, 'Volume should be in split brain condition')

    # Select source brick and take note of files in source brick
    # (exclude the arbiter brick, which holds no data)
    stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
    source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
    files = [
        self._get_files_in_brick(path, split_dir) for path in source_bricks
    ]

    # Resolve `file1, file2, file3` gfid split files using `source-brick`
    cmd = ('gluster volume heal ' + self.volname + ' split-brain '
           'source-brick {} /' + split_dir + '/{}')
    for index, source_brick in enumerate(source_bricks):
        for each_file in files[index]:
            run_cmd = cmd.format(source_brick, each_file)
            self._run_cmd_and_assert(run_cmd)

    # Resolve `file4, file5, file6` gfid split files using `bigger-file`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain bigger-file /' + split_dir + '/{}')
    for each_file in ('file4', 'file5', 'file6'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain latest-mtime /' + split_dir + '/{}')
    for each_file in ('file7', 'file8', 'file9'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Unless `shd` is triggered manually/automatically files will still
    # appear in `heal info`
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full self heal')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(
        ret, 'All files in volume should be healed after healing files via'
        ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

    # Validate normal file `file10` and healed files don't differ in
    # subvols via an `arequal`
    for subvol in subvols:
        # Disregard last brick if volume is of arbiter type
        ret, arequal = collect_bricks_arequal(subvol[0:stop])
        self.assertTrue(
            ret, 'Unable to get `arequal` checksum on '
            '{}'.format(subvol[0:stop]))
        self.assertEqual(
            len(set(arequal)), 1, 'Mismatch of `arequal` '
            'checksum among {} is identified'.format(subvol[0:stop]))

    g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
               '`bigger-file` and `latest-mtime` methods is complete')
def test_gluster_clone_heal(self):
    """
    Test gluster compilation on mount point(Heal command)
    - Creating directory test_compilation
    - Compile gluster on mountpoint
    - Select bricks to bring offline
    - Bring brick offline
    - Validate IO
    - Bring bricks online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal after getting bricks online
    - Compile gluster on mountpoint again
    - Select bricks to bring offline
    - Bring brick offline
    - Validate IO
    - Bring bricks online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal after getting bricks online
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    # Creating directory test_compilation
    ret = mkdir(self.mounts[0].client_system,
                "{}/test_compilation".format(self.mounts[0].mountpoint))
    self.assertTrue(ret, "Failed to create directory")
    g.log.info(
        "Directory 'test_compilation' on %s created "
        "successfully", self.mounts[0])

    # Compile gluster on mountpoint asynchronously; the build doubles as
    # a sustained IO workload while bricks go down and come back below.
    # NOTE(review): GitHub no longer serves the git:// protocol --
    # confirm this clone still succeeds or switch to https://
    cmd = ("cd %s/test_compilation ; rm -rf glusterfs; git clone"
           " git://github.com/gluster/glusterfs.git ; cd glusterfs ;"
           " ./autogen.sh ;./configure CFLAGS='-g3 -O0 -DDEBUG'; make ;"
           " cd ../..;" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, cmd)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline while the compilation IO is running
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Validate IO (wait for the compilation to finish successfully)
    # NOTE(review): validate_io_procs is usually given a list of mounts;
    # confirm a single mount object is accepted here
    self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                    "IO failed on some of the clients")

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} online'.format(bricks_to_bring_offline))

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info("Arequal of mountpoint %s", result_after_online)

    # Compile gluster on mountpoint again (second round with a fresh
    # choice of offline bricks)
    proc1 = g.run_async(self.mounts[0].client_system, cmd)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))

    # Validate IO
    self.assertTrue(validate_io_procs([proc1], self.mounts[0]),
                    "IO failed on some of the clients")

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} online'.format(bricks_to_bring_offline))

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info("Arequal of mountpoint %s", result_after_online)
def test_self_heal_with_diff_algorithm(self):
    """
    Test Steps:
    1. Create a replicated/distributed-replicate volume and mount it
    2. Set data/metadata/entry-self-heal to off and
       data-self-heal-algorithm to diff
    3. Create few files inside a directory with some data
    4. Check arequal of the subvol and all the bricks in the subvol should
       have same checksum
    5. Bring down a brick from the subvol and validate it is offline
    6. Modify the data of existing files under the directory
    7. Bring back the brick online and wait for heal to complete
    8. Check arequal of the subvol and all the brick in the same subvol
       should have same checksum
    """

    def check_arequal_on_subvols(subvols):
        """Assert every data brick of each subvol reports one checksum."""
        for subvol in subvols:
            ret, arequal_from_the_bricks = collect_bricks_arequal(subvol)
            # BUG FIX: assertion messages below now describe the failure,
            # not the success case
            self.assertTrue(
                ret, "Failed to collect arequal across the bricks in "
                "the subvol {}".format(subvol))
            # The arbiter brick holds no data, so leave it out of the
            # checksum comparison
            checksums = arequal_from_the_bricks
            if (self.volume_type == "arbiter"
                    or self.volume_type == "distributed-arbiter"):
                checksums = arequal_from_the_bricks[:2]
            self.assertEqual(
                len(set(checksums)), 1,
                "Arequal is not same on all the bricks in the subvol")

    # Disable all client-side heals and force the 'diff' algorithm so the
    # heal below is performed by the self-heal daemon using diff
    for key, value in (("data-self-heal", "off"),
                       ("metadata-self-heal", "off"),
                       ("entry-self-heal", "off"),
                       ("data-self-heal-algorithm", "diff")):
        ret = set_volume_options(self.mnode, self.volname, {key: value})
        self.assertTrue(ret, 'Failed to set %s to %s.' % (key, value))
        g.log.info("%s set to %s successfully", key, value)

    # Create few files under a directory with data
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system
    cmd = ("mkdir %s/test_diff_self_heal ; cd %s/test_diff_self_heal ;"
           "for i in `seq 1 100` ; do dd if=/dev/urandom of=file.$i "
           " bs=1M count=1; done;" % (mountpoint, mountpoint))
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    # Check arequal checksum of all the bricks is same
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    check_arequal_on_subvols(subvols)

    # List a brick in each subvol and bring them offline
    brick_to_bring_offline = []
    for subvol in subvols:
        self.assertTrue(subvol, "List is empty")
        brick_to_bring_offline.extend(sample(subvol, 1))

    ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
    self.assertTrue(
        ret,
        "Unable to bring brick: {} offline".format(brick_to_bring_offline))

    # Validate the brick is offline
    ret = are_bricks_offline(self.mnode, self.volname,
                             brick_to_bring_offline)
    self.assertTrue(
        ret, "Brick:{} is still online".format(brick_to_bring_offline))

    # Modify files under test_diff_self_heal directory
    # BUG FIX: 'cd' into the test directory first; g.run starts a fresh
    # shell, so without it the truncates ran in the login directory and
    # never touched the files created above
    cmd = ("cd %s/test_diff_self_heal ; "
           "for i in `seq 1 100` ; do truncate -s 0 file.$i ; "
           "truncate -s 2M file.$i ; done;" % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to modify the files")
    g.log.info("Successfully modified files")

    # Start volume with force to bring all bricks online
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")
    g.log.info("Volume: %s started successfully", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    # BUG FIX: the message was a tuple; format it into a string
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)

    # Monitor heal completion
    self.assertTrue(
        monitor_heal_completion(self.mnode, self.volname,
                                interval_check=10),
        "Heal failed after 20 mins")

    # Check are there any files in split-brain
    self.assertFalse(
        is_volume_in_split_brain(self.mnode, self.volname),
        "Some files are in split brain for "
        "volume: {}".format(self.volname))

    # Check arequal checksum of all the bricks is same after heal
    check_arequal_on_subvols(subvols)
def test_metadata_self_heal_client_side_heal(self):
    """
    Testcase steps:
    1.Turn off the options self heal daemon
    2.Create IO
    3.Calculate arequal of the bricks and mount point
    4.Bring down "brick1" process
    5.Change the permissions of the directories and files
    6.Change the ownership of the directories and files
    7.Change the group of the directories and files
    8.Bring back the brick "brick1" process
    9.Execute "find . | xargs stat" from the mount point to trigger heal
    10.Verify the changes in permissions are not self healed on brick1
    11.Verify the changes in permissions on all bricks but brick1
    12.Verify the changes in ownership are not self healed on brick1
    13.Verify the changes in ownership on all the bricks but brick1
    14.Verify the changes in group are not successfully self-healed
       on brick1
    15.Verify the changes in group on all the bricks but brick1
    16.Turn on the option metadata-self-heal
    17.Execute "find . | xargs md5sum" from the mount point to trigger heal
    18.Wait for heal to complete
    19.Verify the changes in permissions are self-healed on brick1
    20.Verify the changes in ownership are successfully self-healed
       on brick1
    21.Verify the changes in group are successfully self-healed on brick1
    22.Calculate arequal check on all the bricks and mount point
    """
    # Disable the self-heal daemon so metadata changed while a brick is
    # down is NOT healed until explicitly enabled later in the test
    ret = set_volume_options(self.mnode, self.volname,
                             {"self-heal-daemon": "off"})
    self.assertTrue(ret, 'Failed to set options self-heal-daemon '
                    'and metadata-self-heal to OFF')
    g.log.info("Options are set successfully")

    # Creating files on client side: 100 dirs with 5 files each,
    # created as root so defaults are root:root with 755/644 modes
    self.test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    for mount_object in self.mounts:
        command = ("cd {0}/ ; mkdir {1} ; cd {1}/ ;"
                   "for i in `seq 1 100` ; "
                   "do mkdir dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1K count=$j ; done ; done ;".format
                   (mount_object.mountpoint,
                    self.test_meta_data_self_heal_folder))
        proc = g.run_async(mount_object.client_system, command,
                           user=mount_object.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    self.validate_io_on_clients()

    # Calculate and check arequal of the bricks and mount point
    self.check_arequal_from_mount_point_and_bricks()

    # Select bricks to bring offline from a replica set: the first brick
    # of every subvol goes down, the remaining replicas stay up
    subvols_dict = get_subvols(self.mnode, self.volname)
    subvols = subvols_dict['volume_subvols']
    bricks_to_bring_offline = []
    bricks_to_be_online = []
    for subvol in subvols:
        bricks_to_bring_offline.append(subvol[0])
        for brick in subvol[1:]:
            bricks_to_be_online.append(brick)

    # Bring bricks offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Change the permissions of the directories and files while one brick
    # of each replica set is down
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        command = ('cd {}/{}; '
                   'for i in `seq 1 100` ; '
                   'do chmod 555 dir.$i ; done ; '
                   'for i in `seq 1 50` ; '
                   'do for j in `seq 1 5` ; '
                   'do chmod 666 dir.$i/file.$j ; done ; done ; '
                   'for i in `seq 51 100` ; '
                   'do for j in `seq 1 5` ; '
                   'do chmod 444 dir.$i/file.$j ; done ; done ;'
                   .format(mount_obj.mountpoint,
                           self.test_meta_data_self_heal_folder))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.validate_io_on_clients()

    # Change the ownership of the directories and files
    # NOTE(review): assumes users qa_func, qa_system and qa_perf exist on
    # the clients (presumably created in setup) -- confirm
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        command = ('cd {}/{} ; '
                   'for i in `seq 1 35` ; '
                   'do chown -R qa_func dir.$i ; done ; '
                   'for i in `seq 36 70` ; '
                   'do chown -R qa_system dir.$i ; done ; '
                   'for i in `seq 71 100` ; '
                   'do chown -R qa_perf dir.$i ; done ;'
                   .format(mount_obj.mountpoint,
                           self.test_meta_data_self_heal_folder))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.validate_io_on_clients()

    # Change the group of the directories and files
    # NOTE(review): assumes group qa_all exists on the clients -- confirm
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        command = ('cd {}/{}; '
                   'for i in `seq 1 100` ; '
                   'do chgrp -R qa_all dir.$i ; done ;'
                   .format(mount_obj.mountpoint,
                           self.test_meta_data_self_heal_folder))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.validate_io_on_clients()

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Trigger heal from mount point
    self.trigger_heal_from_mount_point()

    # Verify the changes are not self healed on brick1 for each subvol:
    # with self-heal daemon off and metadata-self-heal still off, the
    # formerly-offline bricks must keep the original root:root ownership
    # and the default 755 (dirs) / 644 (files) modes
    for brick in bricks_to_bring_offline:
        node, brick_path = brick.split(':')
        dir_list = get_dir_contents(node, "{}/{}".format(
            brick_path, self.test_meta_data_self_heal_folder))
        self.assertIsNotNone(dir_list, "Dir list from "
                             "brick is empty")
        g.log.info("Successfully got dir list from bick")

        # Verify changes for dirs
        for folder in dir_list:
            ret = get_file_stat(node, "{}/{}/{}".format(
                brick_path, self.test_meta_data_self_heal_folder,
                folder))
            self.assertEqual('755', ret['access'],
                             "Permissions mismatch on node {}"
                             .format(node))
            self.assertEqual('root', ret['username'],
                             "User id mismatch on node {}"
                             .format(node))
            self.assertEqual('root', ret['groupname'],
                             "Group id mismatch on node {}"
                             .format(node))

            # Get list of files for each dir
            file_list = get_dir_contents(node, "{}/{}/{}".format(
                brick_path, self.test_meta_data_self_heal_folder,
                folder))
            self.assertIsNotNone(file_list, "File list from "
                                 "brick is empty.")
            g.log.info("Successfully got file list from bick.")

            if file_list:
                for file_name in file_list:
                    ret = get_file_stat(node, "{}/{}/{}/{}".format(
                        brick_path,
                        self.test_meta_data_self_heal_folder, folder,
                        file_name))
                    self.assertEqual('644', ret['access'],
                                     "Permissions mismatch on node"
                                     " {} for file {}".format(node,
                                                              file_name))
                    self.assertEqual('root', ret['username'],
                                     "User id mismatch on node"
                                     " {} for file {}".format(node,
                                                              file_name))
                    self.assertEqual('root', ret['groupname'],
                                     "Group id mismatch on node"
                                     " {} for file {}".format(node,
                                                              file_name))

    # Verify the changes are self healed on all bricks except brick1
    # for each subvol
    self.check_permssions_on_bricks(bricks_to_be_online)

    # Enable metadata-self-heal so the client-side lookup below heals
    # the pending metadata changes
    ret = set_volume_options(self.mnode, self.volname,
                             {"metadata-self-heal": "on"})
    self.assertTrue(ret, 'Failed to set options to ON.')
    g.log.info("Options are set successfully")

    # Trigger heal from mount point
    self.trigger_heal_from_mount_point()

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Verify the changes are self healed on brick1 for each subvol
    self.check_permssions_on_bricks(bricks_to_bring_offline)

    # Calculate and check arequal of the bricks and mount point
    self.check_arequal_from_mount_point_and_bricks()
def test_data_self_heal_daemon_off(self):
    """
    Test Data-Self-Heal (heal command)

    Description:
    - set the volume options "metadata-self-heal", "entry-self-heal"
      and "data-self-heal" to "off"
    - create IO
    - get arequal before bringing bricks offline
    - set the volume option "self-heal-daemon" to "off"
    - bring down all brick processes from the selected set
    - get arequal after bringing bricks offline and compare with the
      arequal taken before bringing bricks offline
    - modify the data
    - bring bricks online
    - set the volume option "self-heal-daemon" to "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - add bricks
    - do rebalance
    - create 1k files
    - while creating files - kill bricks and bring bricks online one by
      one in a cycle
    - validate IO
    """
    # Disable client-side heals so only the self-heal daemon heals
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 --fixed-file-size 1k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Turn the self-heal daemon off before killing bricks
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline.  Wrap filter() in list() so the
    # result survives being iterated more than once on Python 3 (a bare
    # filter object is exhausted after its first use).
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Merely killing bricks must not change the data seen on the mount
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and after bringing bricks '
                     'offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data while the bricks are down
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 --fixed-file-size 10k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Re-enable the self-heal daemon so healing can run
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process was found "
                         "or more than one self heal daemon process "
                         "was found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance
    ret, out, err = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Create 1k files in the background while bricks are cycled below
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 1000 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Kill and revive each brick in a cycle while IO runs
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing brick %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring brick %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Brick %s is not offline' % brick)
        g.log.info('Bringing brick %s offline is successful', brick)

        # Bring brick online
        g.log.info('Bringing brick %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick)
        g.log.info('Bringing brick %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret,
                        "Either no self heal daemon process was found or"
                        " more than one self heal daemon process was"
                        " found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")
def test_self_heal_50k_files_heal_command_by_add_brick(self):
    """
    Test self-heal of 50k files (heal command)

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - bring down all brick processes from the selected set
    - create IO (50k files)
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - add bricks
    - do rebalance
    - get arequal after adding bricks and compare with arequal after
      getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Disable all heals so the files created below stay pending heal
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating 50k files on client side
    all_mounts_procs = []
    g.log.info('Creating files...')
    command = ("cd %s ; "
               "for i in `seq 1 50000` ; "
               "do dd if=/dev/urandom of=test.$i "
               "bs=100k count=1 ; "
               "done ;" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts[0]),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    ret, result_before_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Re-enable the self-heal daemon so healing can run
    ret = set_volume_options(self.mnode, self.volname,
                             {"self-heal-daemon": "on"})
    self.assertTrue(ret, 'Failed to set option self-heal-daemon to ON.')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process was found "
                         "or more than one self heal daemon process "
                         "was found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # 50k files can take a while: allow up to an hour for heal
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Heal must restore the data the offline bricks missed.
    # assertEqual (not the Python2-only assertItemsEqual) keeps this
    # consistent with the sibling tests and Python3-safe.
    self.assertEqual(result_before_online, result_after_online,
                     'Checksums before and after bringing bricks online '
                     'are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')

    # Add bricks
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume when IO in "
                          "progress on volume %s" % self.volname))
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance and wait for it to complete
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                         timeout=3600)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Get arequal after adding bricks
    ret, result_after_adding_bricks = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after adding bricks is successful')

    # Rebalance must not change the data visible on the mount
    self.assertEqual(result_after_online, result_after_adding_bricks,
                     'Checksums after bringing bricks online and '
                     'after adding bricks are not equal')
    g.log.info('Checksums after bringing bricks online and '
               'after adding bricks are equal')
def test_heal_full_after_deleting_files(self):
    """
    - Create IO
    - Calculate arequal from mount
    - Delete data from backend from the EC volume
    - Trigger heal full
    - Check if heal is completed
    - Check for split-brain
    - Calculate arequal checksum and compare it
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create dirs with file
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s" % (
                       self.script_upload_path,
                       mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before deleting the files from brick
    g.log.info('Getting arequal before deleting the files from brick...')
    ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before deleting the files from brick '
               'is successful')

    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

    # Delete data directly from the backend of one brick per subvol
    for subvol in subvols:
        erasure = subvol[-1]
        g.log.info('Clearing ec brick %s', erasure)
        node, brick_path = erasure.split(':')
        ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
        # Fail (and report) only when the rm actually failed; the
        # original logged an error unconditionally, even on success.
        self.assertFalse(ret, 'Clearing ec brick %s failed: %s'
                         % (erasure, err))
    g.log.info('Clearing data from brick is successful')

    # Trigger heal full and verify it actually started; the original
    # ignored the return value of trigger_heal_full()
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after healing
    g.log.info('Getting arequal after healing...')
    ret, result_after_healing = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after healing is successful')

    # Full heal must restore exactly the data that was deleted
    self.assertEqual(
        result_before_killing_procs, result_after_healing,
        'Arequals before deleting the files from brick and '
        'after healing are not equal')
    g.log.info('Arequals before deleting the files from brick and '
               'after healing are equal')
def test_resolving_meta_data(self):
    """
    - Create a file test_file.txt
    - Find out which brick the file resides on and kill arbiter brick
      in the replica pair
    - Modify the permissions of the file
    - Bring back the killed brick
    - Kill the other brick in the replica pair
    - Modify the permissions of the file
    - Bring back the killed brick
    - Trigger heal
    - Check if heal is completed
    - Check for split-brain
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    file_to_create = 'test_file.txt'
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create file
        g.log.info('Creating file...')
        command = ("cd %s ; "
                   "touch %s"
                   % (mount_obj.mountpoint, file_to_create))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get bricks with file; search for file_to_create instead of the
    # hard-coded name the original duplicated here
    g.log.info('Getting bricks with file...')
    subvols_dict = get_subvols(self.mnode, self.volname)
    brick_list_with_file = []
    for subvol in subvols_dict['volume_subvols']:
        for brick in subvol:
            node, brick_path = brick.split(':')
            ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
            if file_to_create in brick_file_list:
                brick_list_with_file.append(brick)
    g.log.info('Bricks with file: %s', brick_list_with_file)

    # Bring arbiter brick (last brick of the replica set) offline
    bricks_to_bring_offline = [brick_list_with_file[-1]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the permissions while the arbiter brick is down
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Modifying the permissions of the file...')
        command = ("cd %s ; "
                   "chmod 600 %s"
                   % (mount_obj.mountpoint, file_to_create))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring arbiter brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Bring 1-st data brick offline
    bricks_to_bring_offline = [brick_list_with_file[0]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the permissions again while the data brick is down
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Modifying the permissions of the file...')
        command = ("cd %s ; "
                   "chmod 644 %s"
                   % (mount_obj.mountpoint, file_to_create))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring 1-st data brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def test_data_self_heal_algorithm_diff_heal_command(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "data-self-heal-algorithm": "diff"
      "self-heal-daemon": "off"
    - create IO
    - bring down all brick processes from the selected set
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - expand volume by adding bricks to the volume
    - do rebalance
    - set the volume option "self-heal-daemon": "on" and check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - calculate arequal and compare with arequal before bringing bricks
      offline and after bringing bricks online
    """
    # pylint: disable=too-many-branches,too-many-statements
    # Disable client-side heals and select the 'diff' heal algorithm.
    # 'self-heal-daemon' is switched off in a separate step below, so
    # it is deliberately not part of this options dict.
    g.log.info('Setting options...')
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "data-self-heal-algorithm": "diff"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal' "
               "are set to 'off', "
               "'data-self-heal-algorithm' "
               "is set to 'diff' successfully")

    # Creating files on client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    # Creating files
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path, self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients")

    # Turn the self-heal daemon off before killing bricks
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data while the bricks are down
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Expand volume by adding bricks to the volume
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume when IO in "
                          "progress on volume %s" % self.volname))
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Re-enable the self-heal daemon so healing can run
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either no self heal daemon process was found "
                         "or more than one self heal daemon process "
                         "was found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checksums must match once heal finished; assertEqual (not the
    # Python2-only assertItemsEqual) is consistent with sibling tests
    self.assertEqual(result_before_online, result_after_online,
                     'Checksums are not equal')
    g.log.info('Checksums are equal')
def test_afr_gfid_heal(self):
    """Exercise CLI-based split-brain resolution on a 1x2 volume.

    Creates 5 files in split-brain (by writing to each brick while the
    other is down with self-heal disabled), resolves the split-brain via
    ``gluster volume heal ... split-brain source-brick``, and finally
    verifies that the resolution command fails on a file that is no
    longer in split-brain.
    """
    # Self-heal must stay off so the two divergent writes are not healed
    # before we deliberately create the split-brain.
    g.log.info("disabling the self heal daemon")
    ret = disable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "unable to disable self heal daemon")
    g.log.info("Successfully disabled the self heal daemon")

    # getting list of all bricks
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "failed to get list of bricks")

    g.log.info("bringing down brick1")
    ret = bring_bricks_offline(self.volname, all_bricks[0:1])
    self.assertTrue(ret, "unable to bring brick1 offline")
    g.log.info("Successfully brought the following brick offline "
               ": %s", str(all_bricks[0]))

    g.log.info("verifying if brick1 is offline")
    ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(ret, "brick1 is still online")
    g.log.info("verified: brick1 is offline")

    # First write: brick1 down, so only brick2 holds this content.
    g.log.info("creating 5 files from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 5 --base-file-name test_file --fixed-file-size 1k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    g.log.info("Wait for IO to complete and validate IO.....")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")
    g.log.info("Successfully created a file from mount point")

    g.log.info("bringing brick 1 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1])
    # BUG FIX: bring_bricks_online returns a bool; the original
    # assertIsNotNone(ret, ...) could never fail (False is not None).
    self.assertTrue(ret, "unable to bring brick 1 online")
    g.log.info("Successfully brought the following brick online "
               ": %s", str(all_bricks[0]))

    g.log.info("verifying if brick1 is online")
    ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(ret, "brick1 is not online")
    g.log.info("verified: brick1 is online")

    g.log.info("bringing down brick2")
    ret = bring_bricks_offline(self.volname, all_bricks[1:2])
    self.assertTrue(ret, "unable to bring brick2 offline")
    g.log.info("Successfully brought the following brick offline "
               ": %s", str(all_bricks[1]))

    g.log.info("verifying if brick2 is offline")
    ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2])
    self.assertTrue(ret, "brick2 is still online")
    g.log.info("verified: brick2 is offline")

    # Second write of the SAME file names (different size) while brick2
    # is down -> bricks now hold divergent copies => split-brain.
    g.log.info("creating 5 new files of same name from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 5 --base-file-name test_file --fixed-file-size 10k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    g.log.info("Wait for IO to complete and validate IO.....")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")
    g.log.info("Successfully created a new file of same name "
               "from mount point")

    g.log.info("bringing brick2 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2])
    # BUG FIX: same as above — assert the boolean result, not non-None.
    self.assertTrue(ret, "unable to bring brick2 online")
    g.log.info("Successfully brought the following brick online "
               ": %s", str(all_bricks[1]))

    g.log.info("verifying if brick2 is online")
    ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2])
    self.assertTrue(ret, "brick2 is not online")
    g.log.info("verified: brick2 is online")

    g.log.info("enabling the self heal daemon")
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "failed to enable self heal daemon")
    g.log.info("Successfully enabled the self heal daemon")

    g.log.info("checking if volume is in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "unable to create split-brain scenario")
    g.log.info("Successfully created split brain scenario")

    # Resolve each file's split-brain picking brick1 as the source.
    g.log.info("resolving split-brain by choosing first brick as "
               "the source brick")
    node, _ = all_bricks[0].split(':')
    for fcount in range(5):
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + all_bricks[0] + ' /test_file' +
                   str(fcount) + '.txt')
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "command execution not successful")

    # triggering heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, "heal not triggered")
    g.log.info("Successfully triggered heal")

    # waiting for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=240)
    self.assertTrue(ret, "heal not completed")
    g.log.info("Heal completed successfully")

    # checking if any file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "file still in split-brain")
    g.log.info("Successfully resolved split brain situation using "
               "CLI based resolution")

    # Negative case: resolution must FAIL on a file that is healthy.
    g.log.info("resolving split-brain on a file not in split-brain")
    node, _ = all_bricks[0].split(':')
    command = ("gluster v heal " + self.volname + " split-brain "
               "source-brick " + all_bricks[1] + " /test_file0.txt")
    ret, _, _ = g.run(node, command)
    self.assertNotEqual(
        ret, 0, "Unexpected: split-brain resolution "
        "command is successful on a file which"
        " is not in split-brain")
    g.log.info("Expected: split-brian resolution command failed on "
               "a file which is not in split-brain")

    g.log.info("checking the split-brain status of each file")
    for fcount in range(5):
        fpath = (self.mounts[0].mountpoint + '/test_file' +
                 str(fcount) + '.txt')
        status = get_fattr(self.mounts[0].client_system, fpath,
                           'replica.split-brain-status')
        # Robustness: get_fattr may return None on failure; the original
        # would have crashed on .rstrip() instead of failing cleanly.
        self.assertIsNotNone(
            status, "failed to get replica.split-brain-status of "
            "file test_file%s" % str(fcount))
        compare_string = ("The file is not under data or metadata "
                          "split-brain")
        self.assertEqual(
            status.rstrip('\x00'), compare_string,
            "file test_file%s is under"
            " split-brain" % str(fcount))
    g.log.info("none of the files are under split-brain")
def test_self_heal_differing_in_file_type(self):
    """
    testing self heal of files with different file types with default
    configuration

    Description:
    - create IO
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequal and compare with arequal
      before getting bricks offline
    - modify the data
    - arequal before getting bricks online
    - bring bricks online
    - check daemons and healing completion
    - start healing
    - calculate arequal and compare with arequal
      before bringing bricks online
      and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    all_mounts_procs = []
    test_file_type_differs_self_heal_folder = \
        'test_file_type_differs_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files: a 3-level tree (10 x 5 dirs) of small files whose
    # size varies with the innermost index ($k kB each).
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=1k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder,
                  test_file_type_differs_self_heal_folder))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline: data must be unchanged by
    # merely taking bricks down (reads served by remaining bricks).
    self.assertEqual(sorted(result_before_offline),
                     sorted(result_after_offline),
                     'Checksums before and after bringing bricks'
                     ' offline are not equal')
    g.log.info('Checksums before and after '
               'bringing bricks offline are equal')

    # Modify the data: replace every regular file with a directory of
    # the same name — this is the "differing file type" the heal must fix.
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 10` ; "
               "do for j in `seq 1 5` ; "
               "do for k in `seq 1 10` ; "
               "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
               "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
               "done ; "
               "done ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # NOTE(review): the message below is a tuple, not a %-formatted
    # string — on failure it prints the raw tuple. Harmless but ugly.
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online: heal must not have altered the
    # data visible from the mount.
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks'
                     ' online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_heal_for_conservative_merge_with_two_bricks_blame(self):
    """
    1) Create 1x3 volume and fuse mount the volume
    2) On mount created a dir dir1
    3) Pkill glusterfsd on node n1 (b2 on node2 and b3 and node3 up)
    4) touch f{1..10} on the mountpoint
    5) b2 and b3 xattrs would be blaming b1 as files are created while
       b1 is down
    6) Reset the b3 xattrs to NOT blame b1 by using setattr
    7) Now pkill glusterfsd of b2 on node2
    8) Restart glusterd on node1 to bring up b1
    9) Now bricks b1 online , b2 down, b3 online
    10) touch x{1..10} under dir1 itself
    11) Again reset xattr on node3 of b3 so that it doesn't blame b2,
    as done for b1 in step 6
    12) Do restart glusterd on node2 hosting b2 to bring all bricks online
    13) Check for heal info, split-brain and arequal for the bricks
    """
    # pylint: disable=too-many-locals
    # Create dir `dir1/` on mountpont
    path = self.mounts[0].mountpoint + "/dir1"
    ret = mkdir(self.mounts[0].client_system, path, parents=True)
    self.assertTrue(ret, "Directory {} creation failed".format(path))

    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
    brick1, brick2, brick3 = all_bricks

    # Bring first brick offline
    self._bring_brick_offline_and_check(brick1)

    # touch f{1..10} files on the mountpoint
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch f$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Check b2 and b3 xattrs are blaming b1 and are same
    self.assertEqual(self._get_fattr_for_the_brick(brick2),
                     self._get_fattr_for_the_brick(brick3),
                     "Both the bricks xattrs are not blaming "
                     "brick: {}".format(brick1))

    # Reset the xattrs of dir1 on b3 for brick b1 (client-0 == b1),
    # i.e. make b3 stop blaming b1 while b2 still blames it.
    first_xattr_to_reset = "trusted.afr.{}-client-0".format(self.volname)
    xattr_value = "0x000000000000000000000000"
    host, brick_path = brick3.split(":")
    brick_path = brick_path + "/dir1"
    ret = set_fattr(host, brick_path, first_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Kill brick2 on the node2
    self._bring_brick_offline_and_check(brick2)

    # Restart glusterd on node1 to bring the brick1 online
    self.assertTrue(restart_glusterd([brick1.split(":")[0]]), "Unable to "
                    "restart glusterd")

    # checking for peer status post glusterd restart
    self._check_peers_status()

    # Check if the brick b1 on node1 is online or not
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
    self.assertIn(brick1, online_bricks, "Brick:{} is still offline after "
                  "glusterd restart".format(brick1))

    # Create 10 files under dir1 naming x{1..10}
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch x$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Reset the xattrs from brick3 on to brick2 (client-1 == b2), again
    # on b3's copy of dir1 (host/brick_path still point at b3).
    second_xattr_to_reset = "trusted.afr.{}-client-1".format(self.volname)
    ret = set_fattr(host, brick_path, second_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Bring brick2 online
    self.assertTrue(restart_glusterd([brick2.split(":")[0]]), "Unable to "
                    "restart glusterd")
    self._check_peers_status()

    # BUG FIX: the original assert had no failure message.
    self.assertTrue(are_bricks_online(self.mnode, self.volname, [brick2]),
                    "Brick: {} is not online after glusterd "
                    "restart".format(brick2))

    # Check are there any files in split-brain and heal completion
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     "Some files are in split brain for "
                     "volume: {}".format(self.volname))
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    "Conservative merge of files failed")

    # Check arequal checksum of all the bricks is same
    ret, arequal_from_the_bricks = collect_bricks_arequal(all_bricks)
    # BUG FIX: the failure messages below described success ("is
    # collected successfully" / "is same"), which is misleading when
    # the assertion actually fails.
    self.assertTrue(ret, "Failed to collect arequal across the"
                    " bricks in the subvol {}".format(all_bricks))
    self.assertEqual(len(set(arequal_from_the_bricks)), 1, "Arequal is "
                     "not same on all the bricks in the subvol")
def test_data_split_brain_resolution(self):
    """Create a data split-brain and resolve it via the CLI.

    Disables all client-side self-heal and the self-heal daemon, writes
    divergent data to each replica while the other brick is down, then
    resolves the split-brain with ``gluster v heal ... split-brain
    source-brick`` (second brick as source) and verifies heal, file
    accessibility, and brick arequals.
    """
    # Setting options: turn off client-side heals so the divergent
    # writes are not healed before we create the split-brain.
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files and directories on client side
    g.log.info('Creating files and directories...')
    cmd = ("for i in `seq 1 10`; do mkdir %s/dir.$i; for j in `seq 1 5`;"
           "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1K count=1;"
           "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done"
           % (self.mounts[0].mountpoint, self.mounts[0].mountpoint,
              self.mounts[0].mountpoint))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Creating files and directories failed")
    g.log.info("Files & directories created successfully")

    # Check arequals for all the bricks
    g.log.info('Getting arequal before getting bricks offline...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Set option self-heal-daemon to OFF
    g.log.info('Setting option self-heal-daemon to off...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    bricks_list = get_all_bricks(self.mnode, self.volname)

    # Bring brick1 offline
    g.log.info('Bringing brick %s offline', bricks_list[0])
    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[0])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[0]])
    self.assertTrue(ret, 'Brick %s is not offline'
                    % bricks_list[0])
    g.log.info('Bringing brick %s offline is successful',
               bricks_list[0])

    # Modify the contents of the files (only brick2 sees this write)
    cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;"
           "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=1;"
           "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done"
           % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Updating file contents failed")
    g.log.info("File contents updated successfully")

    # Bring brick1 online and check the status
    g.log.info('Bringing brick %s online', bricks_list[0])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[0]])
    self.assertTrue(ret, 'Failed to bring brick %s online' %
                    bricks_list[0])
    g.log.info('Bringing brick %s online is successful', bricks_list[0])

    g.log.info("Verifying if brick %s is online", bricks_list[0])
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    # NOTE(review): message below is a tuple, not a formatted string —
    # on failure it prints the raw tuple rather than interpolating %s.
    self.assertTrue(ret, ("Brick %s did not come up", bricks_list[0]))
    g.log.info("Brick %s has come online.", bricks_list[0])

    # Bring brick2 offline
    g.log.info('Bringing brick %s offline', bricks_list[1])
    ret = bring_bricks_offline(self.volname, bricks_list[1])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[1])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[1]])
    self.assertTrue(ret, 'Brick %s is not offline'
                    % bricks_list[1])
    g.log.info('Bringing brick %s offline is successful',
               bricks_list[1])

    # Modify the contents of the files again (only brick1 sees this
    # write) — the two bricks now hold divergent data => split-brain.
    cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;"
           "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=2;"
           "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=2; done"
           % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Updating file contents failed")
    g.log.info("File contents updated successfully")

    # Bring brick2 online and check the status
    g.log.info('Bringing brick %s online', bricks_list[1])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[1]])
    self.assertTrue(ret, 'Failed to bring brick %s online' %
                    bricks_list[1])
    g.log.info('Bringing brick %s online is successful', bricks_list[1])

    g.log.info("Verifying if brick %s is online", bricks_list[1])
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, ("Brick %s did not come up", bricks_list[1]))
    g.log.info("Brick %s has come online.", bricks_list[1])

    # Set option self-heal-daemon to ON
    g.log.info('Setting option self-heal-daemon to on...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    g.log.info("Checking if files are in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "Unable to create split-brain scenario")
    g.log.info("Successfully created split brain scenario")

    g.log.info("Resolving split-brain by using the source-brick option "
               "by choosing second brick as source for all the files")
    node, _ = bricks_list[1].split(':')
    command = ("gluster v heal " + self.volname + " split-brain "
               "source-brick " + bricks_list[1])
    ret, _, _ = g.run(node, command)
    self.assertEqual(ret, 0, "Command execution not successful")

    # triggering heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Heal not triggered")

    # waiting for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=120)
    self.assertTrue(ret, "Heal not completed")

    # Try accessing the file content from the mount
    cmd = ("for i in `seq 1 10`; do cat %s/file.$i > /dev/null;"
           "for j in `seq 1 5` ; do cat %s/dir.$i/file.$j > /dev/null;"
           "done ; done"
           % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to access the file contents")
    g.log.info("File contents are accessible")

    # checking if file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "File still in split-brain")
    g.log.info("Successfully resolved split brain situation using "
               "CLI based resolution")

    # Check arequals for all the bricks
    g.log.info('Getting arequal for all the bricks after heal...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal after heal is successful')
def test_self_heal_daemon(self):
    """
    Test Data-Self-Heal(heal command)
    Description:
    - Create directory test_hardlink_self_heal
    - Create directory test_data_self_heal
    - Creating files for hardlinks and data files
    - Get arequal before getting bricks offline
    - Select bricks to bring offline
    - Bring brick offline
    - Create hardlinks and append data to data files
    - Bring brick online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal after getting bricks online
    - Select bricks to bring offline
    - Bring brick offline
    - Truncate data to data files and verify hardlinks
    - Bring brick online
    - Wait for volume processes to be online
    - Verify volume's all process are online
    - Monitor heal completion
    - Check for split-brain
    - Get arequal again
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    # Creating directory test_hardlink_self_heal
    ret = mkdir(
        self.mounts[0].client_system,
        "{}/test_hardlink_self_heal".format(self.mounts[0].mountpoint))
    self.assertTrue(ret, "Failed to create directory")
    g.log.info(
        "Directory 'test_hardlink_self_heal' on %s created "
        "successfully", self.mounts[0])

    # Creating directory test_data_self_heal
    ret = mkdir(self.mounts[0].client_system,
                "{}/test_data_self_heal".format(self.mounts[0].mountpoint))
    self.assertTrue(ret, "Failed to create directory")
    g.log.info(
        "Directory test_hardlink_self_heal on %s created "
        "successfully", self.mounts[0])

    # Creating files for hardlinks and data files
    cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5`;'
           'do mkdir dir.$i ; for j in `seq 1 10` ; do dd if='
           '/dev/urandom of=dir.$i/file.$j bs=1k count=$j;done; done;'
           'cd ..' % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
           'do dd if=/dev/urandom of=file.$i bs=128K count=$i;done;'
           'cd ..' % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    # Get arequal before getting bricks offline
    # NOTE(review): variable is named *_before_online but holds the
    # checksum taken BEFORE bricks go offline; it is logged only and
    # never compared, so this is a naming nit, not a bug.
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Arequal before getting bricks online-%s',
               result_before_online)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Append data to data files and create hardlinks while the selected
    # bricks are down, so they accumulate pending heals.
    cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
           'do dd if=/dev/urandom of=file.$i bs=512K count=$i ; done ;'
           'cd .. ' % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to modify data files.")
    g.log.info("Successfully modified data files")

    cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5` ;do '
           'for j in `seq 1 10`;do ln dir.$i/file.$j dir.$i/link_file.$j;'
           'done ; done ; cd .. ' % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Hardlinks creation failed")
    g.log.info("Successfully created hardlinks of files")

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))
    g.log.info("Volume %s : All process are online", self.volname)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Arequal after getting bricks online '
               'is %s', result_after_online)

    # Select bricks to bring offline
    bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
        self.mnode, self.volname)
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Truncate data to data files and verify hardlinks
    cmd = ('cd %s/test_data_self_heal ; for i in `seq 1 100` ;'
           'do truncate -s $(( $i * 128)) file.$i ; done ; cd ..'
           % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to truncate files")
    g.log.info("Successfully truncated files on mountpoint")

    # NOTE(review): these paths use shell brace-expansion syntax but are
    # passed to get_file_stat as a single path; also the first argument
    # is the mount OBJECT rather than a host string as elsewhere in this
    # file (cf. g.run(self.mounts[0].client_system, ...)) — confirm
    # get_file_stat accepts both, else this comparison is vacuous.
    file_path = ('%s/test_hardlink_self_heal/dir{1..5}/file{1..10}'
                 % (self.mounts[0].mountpoint))
    link_path = ('%s/test_hardlink_self_heal/dir{1..5}/link_file{1..10}'
                 % (self.mounts[0].mountpoint))
    file_stat = get_file_stat(self.mounts[0], file_path)
    link_stat = get_file_stat(self.mounts[0], link_path)
    self.assertEqual(file_stat, link_stat, "Verification of hardlinks "
                     "failed")
    g.log.info("Successfully verified hardlinks")

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                          "be online".format(self.volname)))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret,
        ("Volume {} : All process are not online".format(self.volname)))
    g.log.info("Volume %s : All process are online", self.volname)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    def test_heal_command_unsuccessful_as_bricks_down(self):
        """Verify 'gluster volume heal' refuses to launch while a brick is down.

        Steps:
        - write 2 Gb file on mount
        - while write is in progress, kill brick b0
        - start heal on the volume (should fail and have error message)
        - bring up the brick which was down (b0)
        - bring down another brick (b1)
        - start heal on the volume (should fail and have error message)
        - bring bricks up
        - wait for heal to complete
        """
        # pylint: disable=too-many-statements
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Creating files on client side. The dd runs asynchronously so a
        # brick can be killed while the write is still in progress.
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create 2 Gb file
            g.log.info('Creating files...')
            command = ("cd %s ; dd if=/dev/zero of=file1 bs=10M count=200"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            # Teardown flag: IO is outstanding until validated at the end.
            self.io_validation_complete = False

        # Bring brick0 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[0])
        ret = bring_bricks_offline(self.volname, [bricks_list[0]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[0])

        # Start healing
        # Need to use 'gluster volume heal' command to check error message
        # after g.run
        cmd = "gluster volume heal %s" % self.volname
        ret, _, err = g.run(self.mnode, cmd)
        # A NON-zero exit status is expected here: the heal launch must fail
        # while a brick is down, so assertTrue(ret) passes only on failure.
        self.assertTrue(ret, 'Heal is started')

        # Check for error message
        self.assertIn("Launching heal operation to perform index self heal on "
                      "volume %s has been unsuccessful" % self.volname, err,
                      "Error message is not present or not valid")
        g.log.info('Expected: Healing is not started')

        # Bring brick0 online
        g.log.info("Bring bricks: %s online", bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[0]])
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_list[0])
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_list[0])

        # Bring brick1 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Start healing
        # Need to use 'gluster volume heal' command to check error message
        # after g.run
        cmd = "gluster volume heal %s" % self.volname
        ret, _, err = g.run(self.mnode, cmd)
        # Again: heal must refuse to start while brick1 is down.
        self.assertTrue(ret, 'Heal is started')

        # Check for error message
        self.assertIn("Launching heal operation to perform index self heal on "
                      "volume %s has been unsuccessful" % self.volname, err,
                      "Error message is not present or not valid")
        g.log.info('Expected: Healing is not started')

        # Bring brick 1 online
        g.log.info("Bring bricks: %s online", bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_list[1])
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_list[1])

        # Start healing - now all bricks are up so the launch must succeed
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True
def test_conservative_merge_of_files_heal_command(self): """ - set options: "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "self-heal-daemon": "off" - Bring brick 0 offline - Creating files on client side - Bring brick 0 online - Bring brick 1 offline - Creating files on client side - Bring brick 1 online - Get arequal on bricks - Setting option "self-heal-daemon": "on" - Start healing - Get arequal on bricks and compare with arequals before healing and mountpoint """ # pylint: disable=too-many-statements,too-many-locals # set options bricks_list = get_all_bricks(self.mnode, self.volname) options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "self-heal-daemon": "off" } g.log.info("setting options %s", options) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set volume option %s for" "volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # Bring brick 0 offline g.log.info('Bringing bricks %s offline', bricks_list[0]) ret = bring_bricks_offline(self.volname, bricks_list[0]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_list[0]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0]) g.log.info('Bringing bricks %s offline is successful', bricks_list[0]) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create files g.log.info('Creating files...') command = ("python %s create_deep_dirs_with_files " "-d 0 -l 5 -f 10 --dirname-start-num 1 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, 
self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Bring brick 0 online g.log.info('Bringing bricks %s online...', bricks_list[0]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_list[0]) g.log.info('Bringing bricks %s online is successful', bricks_list[0]) # Bring brick 1 offline g.log.info('Bringing bricks %s offline', bricks_list[1]) ret = bring_bricks_offline(self.volname, bricks_list[1]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_list[1]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1]) g.log.info('Bringing bricks %s offline is successful', bricks_list[1]) # Creating files on client side self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create files g.log.info('Creating files...') command = ("python %s create_deep_dirs_with_files " "-d 0 -l 5 -f 10 --dirname-start-num 6 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Bring brick 1 online g.log.info('Bringing bricks %s online...', bricks_list[1]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_list[1]) g.log.info('Bringing bricks %s online is successful', bricks_list[1]) # Get arequal on bricks arequals_before_heal = {} g.log.info('Getting arequal on bricks...') for brick in bricks_list: g.log.info('Getting arequal on bricks %s...', brick) node, brick_path = brick.split(':') command = 
('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, arequal, _ = g.run(node, command) self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick) g.log.info('Getting arequal for %s is successful', brick) brick_total = arequal.splitlines()[-1].split(':')[-1] arequals_before_heal[brick] = brick_total # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret 
= is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequals for mount g.log.info('Getting arequal before getting bricks offline...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after healing is successful') mount_point_total = arequals[0].splitlines()[-1].split(':')[-1] # Get arequal on bricks and compare with mount_point_total # It should be the same g.log.info('Getting arequal on bricks...') arequals_after_heal = {} for brick in bricks_list: g.log.info('Getting arequal on bricks %s...', brick) node, brick_path = brick.split(':') command = ('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, arequal, _ = g.run(node, command) self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick) g.log.info('Getting arequal for %s is successful', brick) brick_total = arequal.splitlines()[-1].split(':')[-1] arequals_after_heal[brick] = brick_total self.assertEqual( mount_point_total, brick_total, 'Arequals for mountpoint and %s are not equal' % brick) g.log.info('Arequals for mountpoint and %s are equal', brick) g.log.info('All arequals are equal for replicated') self.assertNotEqual( cmp(arequals_before_heal, arequals_after_heal), 0, 'Arequals are equal for bricks ' 'before and after healing')
    def test_metadata_split_brain_resolution(self):
        """Create a metadata split-brain and resolve it via the CLI
        'source-brick' policy, then verify heal and brick consistency.
        """
        # Setting options: disable client-side heals so the two divergent
        # metadata updates below are not healed before the daemon runs
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Creating files and directories on client side
        g.log.info('Creating files and directories...')
        cmd = ("mkdir %s/test_metadata_sb && cd %s/test_metadata_sb &&"
               "for i in `seq 1 3`; do mkdir dir.$i; for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=dir.$i/file.$j bs=1K count=1;"
               "done; dd if=/dev/urandom of=file.$i bs=1K count=1; done"
               % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating files and directories failed")
        g.log.info("Files & directories created successfully")

        # Check arequals for all the bricks
        g.log.info('Getting arequal before getting bricks offline...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Set option self-heal-daemon to OFF
        g.log.info('Setting option self-heal-daemon to off...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        bricks_list = get_all_bricks(self.mnode, self.volname)

        # Bring brick1 offline
        g.log.info('Bringing brick %s offline', bricks_list[0])
        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0])
        g.log.info('Bringing brick %s offline is successful', bricks_list[0])

        # Change metadata of some files & directories (brick0 misses this)
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2`; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick1 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[0]])
        self.assertTrue(ret, 'Failed to bring brick %s online'
                        % bricks_list[0])
        g.log.info('Bringing brick %s online is successful', bricks_list[0])

        g.log.info("Verifying if brick %s is online", bricks_list[0])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        # NOTE(review): message is a (fmt, arg) tuple, so the brick name is
        # not substituted on failure - consider %-formatting it
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[0]))
        g.log.info("Brick %s has come online.", bricks_list[0])

        # Bring brick2 offline
        g.log.info('Bringing brick %s offline', bricks_list[1])
        ret = bring_bricks_offline(self.volname, bricks_list[1])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[1])
        g.log.info('Bringing brick %s offline is successful', bricks_list[1])

        # Change metadata of same files & directories as before, so the two
        # bricks now hold conflicting metadata -> metadata split-brain
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0777 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick2 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring brick %s online'
                        % bricks_list[1])
        g.log.info('Bringing brick %s online is successful', bricks_list[1])

        g.log.info("Verifying if brick %s is online", bricks_list[1])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[1]))
        g.log.info("Brick %s has come online.", bricks_list[1])

        # Set option self-heal-daemon to ON
        g.log.info('Setting option self-heal-daemon to on...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        g.log.info("Checking if files are in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        # Split-brain is the EXPECTED state at this point
        self.assertTrue(ret, "Unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("Resolving split-brain by using the source-brick option "
                   "by choosing second brick as source for all the files")
        node, _ = bricks_list[1].split(':')
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + bricks_list[1])
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "Command execution not successful")

        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal not completed")

        # Do lookup on the files from mount
        cmd = ("ls -lR %s/test_metadata_sb" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to lookup")
        g.log.info("Lookup successful")

        # Checking if files are still in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "File still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks after heal...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal after heal is successful')

        # Change metadata of same files & directories as before, to confirm
        # the volume behaves normally after resolution
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Do lookup on the mount
        cmd = ("find %s | xargs stat" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Lookup on the mount failed")
        g.log.info("Lookup on the mount is successful")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal is successful')
def test_self_heal_50k_files(self): """ Description: - Select bricks to bring offline - Bring brick offline - Create 50k files - Validate IO - Bring bricks online - Monitor heal - Check for split-brain - Validate IO """ # pylint: disable=too-many-statements,too-many-locals # Select bricks to bring offline bricks_to_bring_offline_dict = select_bricks_to_bring_offline( self.mnode, self.volname) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Bring brick offline ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) self.assertIsNotNone(bricks_to_bring_offline, "List is empty") ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Create 50k files command = ("cd %s ; " "for i in `seq 1 50000` ; " "do dd if=/dev/urandom of=test.$i " "bs=100k count=1 ; " "done ;" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) # Validate IO self.assertTrue(validate_io_procs([proc], self.mounts[0]), "IO failed on some of the clients") # Bring brick online ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Wait for volume processes to be online ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : 
All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname, timeout_period=3000) self.assertTrue(ret, 'Heal has not yet completed') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state')
def test_data_self_heal_algorithm_full_default(self): """ Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full' Description: - set the volume option "data-self-heal-algorithm" to value "full" - create IO - bring down all bricks processes from selected set - modify the data - calculate arequal - bring bricks online - start healing - calculate arequal and compare with arequal before bringing bricks offline and after bringing bricks online """ # pylint: disable=too-many-locals,too-many-statements # Setting options g.log.info('Setting options "data-self-heal-algorithm": "full"...') options = {"data-self-heal-algorithm": "full"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'data-self-heal-algorithm' is set to 'full' " "successfully") # Creating files on client side all_mounts_procs = [] g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Creating files command = "/usr/bin/env python %s create_files -f 100 %s" % ( self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list( filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) 
self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Modify the data all_mounts_procs = [] g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) command = ("/usr/bin/env python %s create_files -f 100 " "--fixed-file-size 1M %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is successful') # Bring brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online 
g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks online # and after bringing bricks online self.assertItemsEqual(result_before_online, result_after_online, 'Checksums are not equal') g.log.info('Checksums before bringing bricks online ' 'and after bringing bricks online are equal')
    def test_multiple_clients_dd_on_same_file_default(self):
        """Concurrent writers/readers on one file across brick failures.

        - Create 2GB file
        - While creating file, start reading file
        - Bring down brick1
        - Bring back the brick brick1
        - Start healing
        - Bring down brick1
        - Wait for IO to complete
        - Wait for reading to complete
        - Bring back the brick brick1
        - Start healing
        - Wait for heal to complete
        - Check for split-brain
        - Calculate arequals on all the bricks and compare with mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Creating files on client side (asynchronous 2GB dd per mount)
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "dd if=/dev/urandom of=test_file bs=1M count=2020"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            # Teardown flag: IO is outstanding until validated below.
            self.io_validation_complete = False

        # Reading files on client side, concurrently with the writers above
        all_mounts_procs_read = []
        for mount_obj in self.mounts:
            g.log.info("Reading data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Read files
            g.log.info('Reading files...')
            command = ("python %s read %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            all_mounts_procs_read.append(proc)

        # Bring brick1 offline while IO is in flight
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Bring brick1 online
        g.log.info('Bringing bricks %s online...', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s online'
                        % bricks_list[1])
        g.log.info('Bringing bricks %s online is successful',
                   bricks_list[1])

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Bring brick1 offline again, while heal and IO are both running
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Validate reading
        self.assertTrue(
            validate_io_procs(all_mounts_procs_read, self.mounts),
            "Reading failed on some of the clients"
        )
        self.io_validation_complete = True

        # Bring brick1 online
        g.log.info('Bringing bricks %s online...', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s online'
                        % bricks_list[1])
        g.log.info('Bringing bricks %s online is successful',
                   bricks_list[1])

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal for mount
        g.log.info('Getting arequal...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after healing is successful')

        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        arequals_after_heal = {}
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan'
                       % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s'
                             % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            # Last line of arequal-checksum output holds the total checksum
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            arequals_after_heal[brick] = brick_total
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and %s are not equal'
                             % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal')