def test_mount_point_not_go_to_rofs(self):
    """Delete data from every mount after bricks were brought offline.

    Precondition (per the original description, set up outside this
    method): two volumes with arbiter1 mounted on the same client.

    Steps:
    - create IO on all mounts and validate it
    - for every volume reported by the management node, select bricks
      and bring them offline
    - delete the files from all mountpoints
    - check that the delete IO succeeded on all the clients
    """
    # Create files on all mounts
    g.log.info("Starting IO on all mounts...")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files "
                   "-f 100 "
                   "--fixed-file-size 1M "
                   "%s" % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Select bricks to bring offline, volume by volume
    volume_list = get_volume_list(self.mnode)
    for volname in volume_list:
        bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
            self.mnode, volname)
        # Materialize the selection: on Python 3 filter() is a one-shot
        # iterator, which would log as "<filter object ...>" and be empty
        # for every consumer after the first one.
        bricks_to_bring_offline = list(filter(
            None,
            (bricks_to_bring_offline_dict['hot_tier_bricks'] +
             bricks_to_bring_offline_dict['cold_tier_bricks'] +
             bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring bricks offline
        g.log.info("Going to bring down the brick process for %s",
                   bricks_to_bring_offline)
        ret = bring_bricks_offline(volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process for %s successfully",
                   bricks_to_bring_offline)

    # Delete files on all mounts
    g.log.info("Deleting IO on all mounts...")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Deleting data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Delete files
        g.log.info('Deleting files...')
        command = ("python %s delete %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
def test_data_self_heal_daemon_off(self):
    """Test Data-Self-Heal (heal command).

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - Get arequal before getting bricks offline
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - add bricks
    - do rebalance
    - create 1k files
    - while creating files - kill bricks and bring bricks online one by
      one in cycle
    - validate IO
    """
    # pylint: disable=too-many-statements,too-many-locals
    first_mount = self.mounts[0]

    # Setting options
    g.log.info('Setting options...')
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    g.log.info("Starting IO on %s:%s", first_mount.client_system,
               first_mount.mountpoint)
    cmd = ("/usr/bin/env python %s create_files -f 100"
           " --fixed-file-size 1k %s"
           % (self.script_upload_path, first_mount.mountpoint))
    ret, _, err = g.run(first_mount.client_system, cmd,
                        user=first_mount.user)
    self.assertFalse(
        ret, 'Failed to create the data for %s: %s'
        % (first_mount.mountpoint, err))
    g.log.info('Created IO for %s is successfully', first_mount.mountpoint)

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    # Only the value after the last ':' on the last output line is compared
    result_before_offline = arequals[0].splitlines()[-1].split(':')[-1]
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = list(filter(
        None,
        (bricks_to_bring_offline_dict['hot_tier_bricks'] +
         bricks_to_bring_offline_dict['cold_tier_bricks'] +
         bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    result_after_offline = arequals[0].splitlines()[-1].split(':')[-1]
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(
        result_before_offline, result_after_offline,
        'Checksums before and '
        'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    g.log.info("Modifying data for %s:%s", first_mount.client_system,
               first_mount.mountpoint)
    cmd = ("/usr/bin/env python %s create_files -f 100"
           " --fixed-file-size 10k %s"
           % (self.script_upload_path, first_mount.mountpoint))
    ret, _, err = g.run(first_mount.client_system, cmd,
                        user=first_mount.user)
    self.assertFalse(
        ret, 'Failed to modify the data for %s: %s'
        % (first_mount.mountpoint, err))
    g.log.info('Modified IO for %s is successfully', first_mount.mountpoint)

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (template, arg) tuple
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
    g.log.info("Expanding volume is successful on "
               "volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Create 1k files (asynchronously, so bricks can be cycled meanwhile)
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s", first_mount.client_system,
               first_mount.mountpoint)
    command = ("/usr/bin/env python %s create_files -f 1000"
               " --base-file-name newfile %s"
               % (self.script_upload_path, first_mount.mountpoint))
    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    all_mounts_procs.append(proc)

    # Kill all bricks in cycle
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing bricks %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Bricks %s are not offline' % brick)
        g.log.info('Bringing bricks %s offline is successful', brick)

        # Introducing 30 second sleep when brick is down
        g.log.info(
            "Waiting for 30 seconds, with ongoing IO while "
            "brick %s is offline", brick)
        sleep(30)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        # Report the brick handled in this iteration, not the earlier set
        self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
        g.log.info('Bringing bricks %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(
            self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(
            ret, "Either No self heal daemon process found or "
            "more than one self heal daemon process "
            "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
def test_snap_self_heal(self):
    """Verify self-heal on a volume cloned from a snapshot.

    Steps:
    1. create a volume
    2. mount volume
    3. create snapshot of that volume
    4. Activate snapshot
    5. Clone snapshot and Mount
    6. Perform I/O
    7. Bring Down Few bricks from volume without
       affecting the volume or cluster.
    8. Perform I/O
    9. Bring back down bricks to online
    10. Validate heal is complete with arequal
    """
    # pylint: disable=too-many-statements, too-many-locals
    # Creating snapshot:
    g.log.info("Starting to Create snapshot")
    ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
    self.assertEqual(
        ret, 0, ("Failed to create snapshot for volume %s" % self.volname))
    g.log.info("Snapshot %s created successfully for volume %s", self.snap,
               self.volname)

    # Activating snapshot
    g.log.info("Starting to Activate Snapshot")
    ret, _, _ = snap_activate(self.mnode, self.snap)
    self.assertEqual(ret, 0,
                     ("Failed to Activate snapshot %s" % self.snap))
    g.log.info("Snapshot %s activated successfully", self.snap)

    # snapshot list
    ret, _, _ = snap_list(self.mnode)
    self.assertEqual(ret, 0, ("Failed to list all the snapshot"))
    g.log.info("Snapshot list command was successful")

    # Creating a Clone volume from snapshot:
    g.log.info("Starting to Clone volume from Snapshot")
    ret, _, _ = snap_clone(self.mnode, self.snap, self.clone)
    self.assertEqual(ret, 0, ("Failed to clone %s from snapshot %s"
                              % (self.clone, self.snap)))
    g.log.info("%s created successfully", self.clone)

    # start clone volumes
    g.log.info("start to created clone volumes")
    ret, _, _ = volume_start(self.mnode, self.clone)
    self.assertEqual(ret, 0, "Failed to start clone %s" % self.clone)
    g.log.info("clone volume %s started successfully", self.clone)

    # Mounting a clone volume
    g.log.info("Mounting a clone volume")
    ret, _, _ = mount_volume(self.clone, self.mount_type, self.mount1,
                             self.mnode, self.clients[0])
    self.assertEqual(ret, 0,
                     "Failed to mount clone Volume %s" % self.clone)
    g.log.info("Clone volume %s mounted Successfully", self.clone)

    # Checking cloned volume mounted or not
    ret = is_mounted(self.clone, self.mount1, self.mnode, self.clients[0],
                     self.mount_type)
    self.assertTrue(
        ret, "Failed to mount clone volume on mount point: %s"
        % self.mount1)
    g.log.info("clone Volume %s mounted on %s", self.clone, self.mount1)

    # write files on all mounts
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mount1)
    cmd = ("python %s create_files "
           "-f 10 --base-file-name file %s"
           % (self.script_upload_path, self.mount1))
    # NOTE(review): the result of this run is not checked, so the
    # success message below is logged unconditionally.
    g.run(self.clients[0], cmd)
    g.log.info("Successful in creating I/O on mounts")

    # get the bricks from the volume
    g.log.info("Fetching bricks for the volume : %s", self.clone)
    bricks_list = get_all_bricks(self.mnode, self.clone)
    g.log.info("Brick List : %s", bricks_list)

    # Select bricks to bring offline
    g.log.info("Starting to bring bricks to offline")
    # NOTE(review): bricks are selected from self.volname but brought
    # offline on self.clone - confirm which volume is intended.
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # Materialize the selection: on Python 3 filter() is a one-shot
    # iterator, and this value is consumed several times below (offline
    # call, assertIn membership checks, online call).
    bricks_to_bring_offline = list(filter(
        None,
        (bricks_to_bring_offline_dict['hot_tier_bricks'] +
         bricks_to_bring_offline_dict['cold_tier_bricks'] +
         bricks_to_bring_offline_dict['volume_bricks'])))
    g.log.info("Brick to bring offline: %s ", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.clone, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring the bricks offline")
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Offline Bricks list
    offline_bricks = get_offline_bricks_list(self.mnode, self.clone)
    self.assertIsNotNone(
        offline_bricks, "Failed to get offline bricklist "
        "for volume %s" % self.clone)
    for offline_brick in offline_bricks:
        self.assertIn(offline_brick, bricks_to_bring_offline,
                      "Failed to validate "
                      "Bricks offline")
    g.log.info("Bricks Offline: %s", offline_bricks)

    # Online Bricks list
    online_bricks = get_online_bricks_list(self.mnode, self.clone)
    self.assertIsNotNone(
        online_bricks, "Failed to get online bricks"
        " for volume %s" % self.clone)
    g.log.info("Bricks Online: %s", online_bricks)

    # write files mountpoint
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mount1)
    cmd = ("python %s create_files "
           "-f 10 --base-file-name file %s"
           % (self.script_upload_path, self.mount1))
    # NOTE(review): as above, the result of this run is not checked.
    g.run(self.clients[0], cmd)
    g.log.info("Successful in creating I/O on mounts")

    # Bring all bricks online
    g.log.info("bring all bricks online")
    ret = bring_bricks_online(self.mnode, self.clone,
                              bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring bricks online")
    g.log.info("Successful in bringing all bricks online")

    # Validate Bricks are online
    g.log.info("Validating all bricks are online")
    ret = are_bricks_online(self.mnode, self.clone, bricks_list)
    self.assertTrue(ret, "Failed to bring all the bricks online")
    g.log.info("bricks online: %s", bricks_list)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.clone)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.clone))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.clone)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.clone)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.clone))
    g.log.info("Volume %s : All process are online", self.clone)

    # wait for the heal process to complete
    g.log.info("waiting for heal process to complete")
    # NOTE(review): heal is monitored on self.volname while the bricks
    # were cycled on self.clone - confirm which volume is intended.
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to complete the heal process")
    g.log.info("Successfully completed heal process")

    # Check arequal
    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.clone)
    subvols = get_subvols(self.mnode, self.clone)
    num_subvols = len(subvols['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s:", num_subvols)

    # Get arequals and compare
    g.log.info("Starting to Compare areequals")
    for i in range(0, num_subvols):
        # Get arequal for first brick
        subvol_brick_list = subvols['volume_subvols'][i]
        node, brick_path = subvol_brick_list[0].split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        # Only the value after the last ':' on the last line is compared
        first_brick_total = arequal.splitlines()[-1].split(':')[-1]

        # Get arequal for every brick and compare with first brick
        for brick in subvol_brick_list:
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan'
                       % brick_path)
            ret, brick_arequal, _ = g.run(node, command)
            self.assertFalse(ret,
                             'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(
                first_brick_total, brick_total,
                'Arequals for subvol and %s are not equal' % brick)
            g.log.info('Arequals for subvol and %s are equal', brick)
    g.log.info('All arequals are equal for distributed-replicated')
def test_data_self_heal_daemon_off(self):
    """Test Data-Self-Heal (heal command).

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - Get arequal before getting bricks offline
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - add bricks
    - do rebalance
    - create 1k files
    - while creating files - kill bricks and bring bricks online one by
      one in cycle
    - validate IO
    """
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s",
               options, self.volname)

    # Creating files on client side
    # (self.all_mounts_procs is not reset here; it is expected to be
    # initialised outside this method)
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 --fixed-file-size 1k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting areequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting areequal before getting bricks offline '
               'is successful')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # Materialize the selection: on Python 3 filter() is a one-shot
    # iterator, and this value is consumed several times below.
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting areequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting areequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and '
                     'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 100 --fixed-file-size 10k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (template, arg) tuple
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
    g.log.info("Expanding volume is successful on "
               "volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Create 1k files (asynchronously, so bricks can be cycled meanwhile)
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("python %s create_files -f 1000 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Kill all bricks in cycle
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing bricks %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Bricks %s are not offline' % brick)
        # Report the brick handled in this iteration, not the earlier set
        g.log.info('Bringing bricks %s offline is successful', brick)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
        g.log.info('Bringing bricks %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found or "
                             "more than one self heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")
def test_entry_self_heal_heal_command(self):
    """Test Entry-Self-Heal (heal command).

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO

    Then, once for each data operation (create, rename, copy, delete):
    - get arequal before getting bricks offline
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - get arequal after getting bricks offline and compare with
      arequal before bringing bricks offline
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before bringing bricks online
    """
    # Setting options
    g.log.info('Setting options...')
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off'")

    # Start IO on mounts
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        io_cmd = (
            "python %s create_deep_dirs_with_files "
            "--dirname-start-num %d "
            "--dir-length 2 "
            "--dir-depth 2 "
            "--max-num-of-dirs 2 "
            "--num-of-files 20 %s"
            % (self.script_upload_path, self.counter,
               mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, io_cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        # Each mount gets its own --dirname-start-num value
        self.counter = self.counter + 10
        g.log.info("IO on %s:%s is started successfully",
                   mount_obj.client_system, mount_obj.mountpoint)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Command list to do different operations with data -
    # create, rename, copy and delete
    cmd_list = [
        "python %s create_files -f 20 %s",
        "python %s mv -i '.trashcan' %s",
        "python %s copy --dest-dir new_dir %s",
        "python %s delete %s",
    ]

    for cmd in cmd_list:
        # Get arequal before getting bricks offline
        g.log.info('Getting areequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal before getting bricks offline '
                   'is successful')

        # Setting options
        g.log.info('Setting options...')
        options = {
            "self-heal-daemon": "off",
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' "
                   "is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
            self.mnode, self.volname)
        # Materialize the selection: on Python 3 filter() is a one-shot
        # iterator, and this value is consumed several times below.
        bricks_to_bring_offline = list(filter(
            None,
            (bricks_to_bring_offline_dict['hot_tier_bricks'] +
             bricks_to_bring_offline_dict['cold_tier_bricks'] +
             bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...',
                   bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting areequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks offline '
                   'and after bringing bricks offline are equal')

        # Modify the data
        g.log.info("Start modifying IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Format into a separate name: overwriting the template
            # would leave no placeholders for the next mount and raise
            # TypeError on the second iteration.
            mount_cmd = cmd % (self.script_upload_path,
                               mount_obj.mountpoint)
            proc = g.run_async(mount_obj.client_system, mount_cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is modified successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks online
        g.log.info('Getting areequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal before getting bricks online '
                   'is successful')

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        if not ret:
            raise ExecutionError("Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Bring brick online
        g.log.info('Bringing bricks %s online...',
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {
            "self-heal-daemon": "on",
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        # The message must be a formatted string, not a tuple
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(
            self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting areequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal after getting bricks online '
                   'is successful')

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        if not ret:
            raise ExecutionError("Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
def test_replace_brick_self_heal_io_in_progress(self):
    """
    Simulate a disk replacement on selected bricks and verify full heal.

    Scenario:
    - Create a deep directory tree with files on the first mount
    - Create another set of files (1500 files of 10k each)
    - Validate the IO
    - Bring the selected bricks offline
    - Remove the contents of the killed bricks (simulating disk
      replacement)
    - Bring the bricks back online
    - Verify volume processes and self-heal daemons are online
    - Trigger a full heal and monitor it until completion
    - Verify the volume is not in split-brain
    - Compare arequals of the mount point and the bricks
      ("replicated"), or of the bricks inside every subvolume
      ("distributed-replicated")

    NOTE(review): the original scenario talks about killing the brick
    while IO is still in progress; this implementation validates the IO
    before the bricks are brought offline - confirm this is intended.
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    first_mount = self.mounts[0]

    # Create dirs with files
    g.log.info('Creating dirs with file...')
    command = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "-d 2 -l 2 -n 2 -f 10 %s"
               % (self.script_upload_path, first_mount.mountpoint))
    ret, _, err = g.run(first_mount.client_system, command,
                        user=first_mount.user)
    self.assertFalse(ret, err)
    g.log.info("IO is successful")

    # Creating another set of files (asynchronously)
    self.all_mounts_procs = []
    g.log.info('Creating 1K files...')
    command = ("/usr/bin/env python %s create_files "
               "-f 1500 --fixed-file-size 10k %s"
               % (self.script_upload_path, first_mount.mountpoint))
    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    self.all_mounts_procs.append(proc)
    # Flag is cleared while async IO is outstanding and set again once
    # the IO has been validated.
    self.io_validation_complete = False

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, first_mount)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Remove the content of the killed bricks
    for brick in bricks_to_bring_offline:
        brick_node, brick_path = brick.split(':')

        # '&&' instead of ';': never run 'rm -rf *' when the 'cd' into
        # the brick directory failed, otherwise the removal would happen
        # in whatever directory the remote shell started in.
        command = ('cd %s && rm -rf *' % brick_path)
        ret, _, err = g.run(brick_node, command)
        self.assertFalse(ret, err)
        g.log.info('Files are deleted on brick %s', brick)

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple.
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal daemons are online")

    # Start healing
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Check arequals for "replicated"
    all_bricks = get_all_bricks(self.mnode, self.volname)
    if self.volume_type == "replicated":
        # Get arequal after bricks are online
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after successfully bringing '
                   'bricks online.')
        # The total is taken from the text after the last ':' on the
        # last line of the arequal output.
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        ret, arequals = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and brick '
                             'are not equal')
            g.log.info('Arequals for mountpoint and brick are equal')

    # Check arequals for "distributed-replicated"
    if self.volume_type == "distributed-replicated":
        # Get the subvolumes
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s: %s",
                   self.volname, num_subvols)

        # Get arequals and compare
        for i in range(0, num_subvols):
            # Get arequal for first brick
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
            self.assertTrue(ret, 'Failed to get arequal on first brick')
            first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with first brick
            ret, arequals = collect_bricks_arequal(subvol_brick_list)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(first_brick_total, brick_total,
                                 'Arequals for subvol and brick are '
                                 'not equal')
                g.log.info('Arequals for subvol and brick are equal')
def test_self_heal_differing_in_file_type(self):
    """
    Test self-heal of entries whose file type differs, using the
    default volume configuration.

    Description:
    - create IO (directory tree with regular files)
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequal and compare with arequal before
      getting bricks offline
    - modify the data (replace every regular file by a directory with
      the same name)
    - calculate arequal before getting bricks online
    - bring bricks online
    - check volume processes and self-heal daemons
    - monitor healing completion (no heal command is issued by this
      test itself)
    - check for split-brain
    - calculate arequal and compare with arequal before bringing
      bricks online and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    first_mount = self.mounts[0]

    # Creating files on client side
    all_mounts_procs = []
    test_file_type_differs_self_heal_folder = \
        'test_file_type_differs_self_heal'
    g.log.info("Generating data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)

    # Creating files
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=1k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (first_mount.mountpoint,
                  test_file_type_differs_self_heal_folder,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline; empty entries are dropped from
    # the combined hot tier / cold tier / volume brick list.
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(sorted(result_before_offline),
                     sorted(result_after_offline),
                     'Checksums before and after bringing bricks'
                     ' offline are not equal')
    g.log.info('Checksums before and after '
               'bringing bricks offline are equal')

    # Modify the data: every regular file becomes a directory with the
    # same name, so the entry type differs from what the offline bricks
    # still hold.
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 10` ; "
               "do for j in `seq 1 5` ; "
               "do for k in `seq 1 10` ; "
               "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
               "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
               "done ; "
               "done ; "
               "done ;"
               % (first_mount.mountpoint,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple.
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks'
                     ' online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_self_heal_symbolic_links(self):
    """
    Test Self-Heal of Symbolic Links (heal command)

    Description:
    - set the volume options
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - create IO
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequals and compare with arequal
      before bringing bricks offline
    - modify the data (create symlinks to the directories) and verify
      whether the links are properly created
    - calculate arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - calculate arequal after getting bricks online and compare with
      arequal before getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    first_mount = self.mounts[0]

    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "'self-heal-daemon' "
               "are set to 'off' successfully")

    # Creating files on client side
    all_mounts_procs = []
    test_sym_link_self_heal_folder = 'test_sym_link_self_heal'
    g.log.info("Generating data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)

    # Creating files
    command = ("cd %s/ ; "
               "mkdir %s ; "
               "cd %s/ ;"
               "for i in `seq 1 5` ; "
               "do mkdir dir.$i ; "
               "for j in `seq 1 10` ; "
               "do dd if=/dev/urandom of=dir.$i/file.$j "
               "bs=1k count=$j ; "
               "done ; "
               "done ;"
               % (first_mount.mountpoint,
                  test_sym_link_self_heal_folder,
                  test_sym_link_self_heal_folder))

    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline.
    # The messages say "offline": this comparison is the offline one.
    self.assertEqual(sorted(result_before_offline),
                     sorted(result_after_offline),
                     'Checksums before and after bringing bricks '
                     'offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data
    g.log.info("Modifying data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)

    # Create symlinks
    g.log.info('Creating symlinks...')
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 5` ; "
               "do ln -s dir.$i sym_link_dir.$i ; "
               "done ;"
               % (first_mount.mountpoint,
                  test_sym_link_self_heal_folder))
    ret, _, _ = g.run(first_mount.client_system, command)
    self.assertEqual(ret, 0, 'Failed to modify the data for %s...'
                     % first_mount.mountpoint)
    g.log.info('Modifying the data for %s is successful',
               first_mount.mountpoint)

    # Verify whether the links are properly created
    # Get symlink list
    command = ("cd %s/%s/ ; "
               "ls |grep 'sym'"
               % (first_mount.mountpoint,
                  test_sym_link_self_heal_folder))
    _, out, _ = g.run(first_mount.client_system, command)
    symlink_list = out.strip().split('\n')

    # Get folder list
    command = ("cd %s/%s/ ; "
               "ls |grep -v 'sym'"
               % (first_mount.mountpoint,
                  test_sym_link_self_heal_folder))
    _, out, _ = g.run(first_mount.client_system, command)
    folder_list = out.strip().split('\n')

    # Compare symlinks and folders: the N-th symlink in the listing is
    # expected to point at the N-th folder in the listing.
    for symlink_index, symlink in enumerate(symlink_list):
        command = ("cd %s/%s/ ; "
                   "readlink %s"
                   % (first_mount.mountpoint,
                      test_sym_link_self_heal_folder,
                      symlink))
        _, out, _ = g.run(first_mount.client_system, command)
        symlink_to_folder = out.strip()
        self.assertEqual(symlink_to_folder, folder_list[symlink_index],
                         'Links are not properly created')
    g.log.info('Links for %s are properly created',
               first_mount.mountpoint)

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple.
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and after bringing bricks '
                     'online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_metadata_self_heal(self):
    """
    Test MetaData Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Change the permissions, ownership and the group
      of the files under "test_meta_data_self_heal" folder
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with arequal
      before getting bricks online
    - check that permissions are '444' and that group and user are
      'qa' on every brick and on the client mount
    """
    # pylint: disable=too-many-locals,too-many-statements
    first_mount = self.mounts[0]

    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off' successfully")

    # Creating files on client side
    all_mounts_procs = []
    test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    g.log.info("Generating data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)

    # Create files
    g.log.info('Creating files...')
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 50` ; "
               "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
               "done ;"
               % (first_mount.mountpoint,
                  test_meta_data_self_heal_folder,
                  test_meta_data_self_heal_folder))

    proc = g.run_async(first_mount.client_system, command,
                       user=first_mount.user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Changing the permissions, ownership and the group
    # of the files under "test_meta_data_self_heal" folder
    g.log.info("Modifying data for %s:%s",
               first_mount.client_system, first_mount.mountpoint)

    # Change permissions to 444
    g.log.info('Changing permissions...')
    command = ("cd %s/%s/ ; "
               "chmod -R 444 *"
               % (first_mount.mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(first_mount.client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Permissions are changed successfully')

    # Change the ownership to qa
    g.log.info('Changing the ownership...')
    command = ("cd %s/%s/ ; "
               "chown -R qa *"
               % (first_mount.mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(first_mount.client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Ownership is changed successfully')

    # Change the group to qa
    g.log.info('Changing the group...')
    command = ("cd %s/%s/ ; "
               "chgrp -R qa *"
               % (first_mount.mountpoint,
                  test_meta_data_self_heal_folder))
    ret, _, err = g.run(first_mount.client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Group is changed successfully')

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple.
    self.assertTrue(ret, ("Volume process %s not online "
                          "despite waiting for 5 minutes" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')

    # Collect every (node, path) location to check: all bricks plus the
    # client mount. A list of pairs is used instead of a dict keyed by
    # node so that several bricks hosted on the same node (or a client
    # that is also a server) do not overwrite each other.
    all_bricks = get_all_bricks(self.mnode, self.volname)
    locations_to_check = [tuple(brick.split(':')) for brick in all_bricks]
    locations_to_check.append((first_mount.client_system,
                               first_mount.mountpoint))

    # Checking for permissions, user and group
    for node, base_path in locations_to_check:
        # Get file list
        command = ("cd %s/%s/ ; "
                   "ls"
                   % (base_path, test_meta_data_self_heal_folder))
        _, out, _ = g.run(node, command)
        file_list = out.split()

        for file_name in file_list:
            file_to_check = '%s/%s/%s' % (base_path,
                                          test_meta_data_self_heal_folder,
                                          file_name)

            g.log.info('Checking for permissions, user and group for %s',
                       file_name)

            # Check for permissions
            cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'".format(
                file_to_check))
            _, permissions, _ = g.run(node, cmd)
            self.assertEqual(permissions.split('\n')[0], '444',
                             'Permissions %s is not equal to 444'
                             % permissions)
            g.log.info("Permissions are '444' for %s", file_name)

            # Check for user
            cmd = ("ls -ld {} | awk '{{print $3}}'".format(file_to_check))
            _, username, _ = g.run(node, cmd)
            self.assertEqual(username.split('\n')[0], 'qa',
                             'User %s is not equal qa' % username)
            g.log.info("User is 'qa' for %s", file_name)

            # Check for group
            cmd = ("ls -ld {} | awk '{{print $4}}'".format(file_to_check))
            _, groupname, _ = g.run(node, cmd)
            self.assertEqual(groupname.split('\n')[0], 'qa',
                             'Group %s is not equal qa' % groupname)
            g.log.info("Group is 'qa' for %s", file_name)
def test_entry_self_heal_heal_command(self):
    """
    Test Entry-Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - for every data-modifying command (create, rename, delete):
      - get arequal before getting bricks offline
      - set the volume option "self-heal-daemon": "off"
      - bring down all bricks processes from selected set
      - get arequal after getting bricks offline and compare with
        arequal before bringing bricks offline
      - modify the data
      - get arequal before getting bricks online
      - bring bricks online
      - set the volume option "self-heal-daemon": "on"
      - check daemons and start healing
      - check if heal is completed
      - check for split-brain
      - get arequal after getting bricks online and compare with
        arequal before bringing bricks online

    Only the first mount (self.mounts[0]) is used to create and modify
    data, and only the first arequal result is compared.
    """
    # pylint: disable=too-many-statements

    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off'")

    # Start IO on mounts
    g.log.info("Starting IO on all mounts...")
    g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
               self.mounts[0].mountpoint)
    cmd = ("python %s create_deep_dirs_with_files "
           "--dir-length 2 "
           "--dir-depth 2 "
           "--max-num-of-dirs 2 "
           "--num-of-files 20 %s/files"
           % (self.script_upload_path, self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, cmd,
                        user=self.mounts[0].user)
    # g.run returns the command's exit status first; 0 means success
    self.assertFalse(ret, 'Failed to create the data for %s: %s'
                     % (self.mounts[0].mountpoint, err))
    g.log.info('Created IO for %s is successfully',
               self.mounts[0].mountpoint)

    # Command list to do different operations with data -
    # create, rename, copy and delete
    cmd_list = [
        "python %s create_files -f 20 %s/files",
        "python %s mv %s/files",
        # 'copy' command works incorrect. disable until fixed
        # "python %s copy --dest-dir %s/new_dir %s/files",
        "python %s delete %s"
    ]

    for cmd in cmd_list:
        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        # Keep only the value after the last ':' on the last output line
        result_before_offline = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' "
                   "is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
            self.mnode, self.volname)
        # Materialise the selection: the list is logged, formatted and
        # passed to several helpers below, and a bare filter() object
        # is a one-shot iterator on Python 3.
        bricks_to_bring_offline = list(filter(
            None,
            (bricks_to_bring_offline_dict['hot_tier_bricks'] +
             bricks_to_bring_offline_dict['cold_tier_bricks'] +
             bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...',
                   bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        result_after_offline = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks offline '
                   'and after bringing bricks offline are equal')

        # Modify the data
        g.log.info("Start modifying IO on all mounts...")
        g.log.info("Modifying IO on %s:%s",
                   self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # The (currently disabled) copy command takes the mountpoint
        # twice; every other command takes it once.
        if 'copy --dest-dir' in cmd:
            parsed_cmd = cmd % (self.script_upload_path,
                                self.mounts[0].mountpoint,
                                self.mounts[0].mountpoint)
        else:
            parsed_cmd = cmd % (self.script_upload_path,
                                self.mounts[0].mountpoint)
        ret, _, err = g.run(self.mounts[0].client_system, parsed_cmd,
                            user=self.mounts[0].user)
        self.assertFalse(ret, 'Failed to modify the data for %s: %s'
                         % (self.mounts[0].mountpoint, err))
        g.log.info('Modified IO for %s is successfully',
                   self.mounts[0].mountpoint)

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        result_before_online = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        if not ret:
            raise ExecutionError("Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Bring brick online
        g.log.info('Bringing bricks %s online...',
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        # The message must be a formatted string, not a (fmt, arg) tuple
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        result_after_online = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        if not ret:
            raise ExecutionError("Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
def test_self_heal_when_io_in_progress(self):
    """Test self-heal is successful when IO is in progress.

    Description:
        - simulate brick down.
        - bring bricks online
        - wait for heal to complete
        - validate IO

    The IO processes validated at the end are read from
    self.all_mounts_procs; this method does not start them itself
    (presumably they are launched in setUp -- confirm).
    """
    # Log Volume Info and Status before simulating brick failure
    g.log.info("Logging volume info and Status before bringing bricks "
               "offlien from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    # Assertion messages are %-formatted here; passing a (fmt, arg)
    # tuple would print the raw tuple instead of the message.
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # Materialise the selection: it is reused by several helpers and
    # log calls below, and filter() is a one-shot iterator on Python 3.
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring bricks: %s offline"
                          % bricks_to_bring_offline))
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    # NOTE(review): this waits for processes to be online right after
    # the selected bricks were taken offline -- confirm it is intended.
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s processes not online despite waiting "
                          "for 5 mins" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Log Volume Info and Status
    g.log.info("Logging volume info and Status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, ("Not all the bricks in list: %s are offline"
                          % bricks_to_bring_offline))
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Add delay before bringing bricks online
    time.sleep(40)

    # Bring bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring bricks: %s online"
                          % bricks_to_bring_offline))
    g.log.info("Successfully brought all bricks:%s online",
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Log Volume Info and Status
    g.log.info("Logging volume info and Status after bringing bricks "
               "online from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal to complete
    g.log.info("Wait for self-heal to complete")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                         "for 20 minutes. 20 minutes is too much a time for "
                         "current test workload")
    # NOTE(review): the log text mentions replace-brick, but this test
    # only cycles bricks offline/online.
    g.log.info("self-heal is successful after replace-brick operation")

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    # Flag is set before asserting so it is recorded even on failure
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_self_heal_50k_files_heal_default(self):
    """
    Test self-heal of 50k files by heal default

    Description:
    - bring down all bricks processes from selected set
    - create IO (50k files)
    - Get arequal before getting bricks online
    - bring bricks online
    - check for daemons to come online
    - heal daemon should pick up entries to heal automatically
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Create 50k files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 50000 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Default Heal testing, wait for shd to pick up healing
    # (no explicit heal trigger is issued in this test)
    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertEqual(result_before_online, result_after_online,
                     'Checksums before and after bringing bricks online '
                     'are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_data_self_heal_command(self):
    """
    Test Data-Self-Heal (heal command)

    Description:
    - get the client side healing volume options and check
      if they have already been disabled by default
      NOTE: Client side healing has been disabled by default
      since GlusterFS 6.0
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - Get arequal before getting bricks offline
    - bring down all bricks processes from selected set
    - Get arequal after getting bricks offline and compare with
      arequal before getting bricks offline
    - modify the data
    - bring bricks online
    - check daemons and start healing
    - check if heal is completed
    - check for split-brain
    - create 1k files
    - while creating files - kill bricks and bring bricks online one by one
      in cycle
    - validate IO
    """
    # pylint: disable=too-many-statements

    # Checking if Client side healing options are disabled by default
    g.log.info('Checking Client side healing is disabled by default')
    options = ('cluster.metadata-self-heal',
               'cluster.data-self-heal',
               'cluster.entry-self-heal')
    for option in options:
        ret = get_volume_options(self.mnode, self.volname, option)[option]
        self.assertTrue(ret in ('off', 'off (DEFAULT)'),
                        "{} option is not disabled by default"
                        .format(option))
    g.log.info("Client side healing options are disabled by default")

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Checksums before and '
                     'after bringing bricks offline are not equal')
    g.log.info('Checksums before and after bringing bricks offline '
               'are equal')

    # Modify the data (same file count, larger fixed size)
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 10k %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all processes are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Create 1k files; this IO keeps running while bricks are cycled
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 1000 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Kill all bricks in cycle
    # NOTE(review): the loop extent was reconstructed from flattened
    # source; the online/shd waits are assumed to run once per brick.
    bricks_list = get_all_bricks(self.mnode, self.volname)
    for brick in bricks_list:
        # Bring brick offline
        g.log.info('Bringing bricks %s offline', brick)
        ret = bring_bricks_offline(self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Bricks %s are not offline' % brick)
        # Report the brick handled in this iteration, not the list
        # selected earlier in the test.
        g.log.info('Bringing bricks %s offline is successful', brick)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', brick)
        ret = bring_bricks_online(self.mnode, self.volname, [brick])
        self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
        g.log.info('Bringing bricks %s online is successful', brick)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found or "
                             "more than one self heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True
def test_data_self_heal_algorithm_diff_heal_command(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "data-self-heal-algorithm": "diff"
    - create IO
    - set the volume option "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - expand volume by adding bricks to the volume
    - do rebalance
    - set the volume option "self-heal-daemon": "on" and check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before bringing bricks online

    Only the first mount (self.mounts[0]) is used for IO.
    """
    # pylint: disable=too-many-branches,too-many-statements

    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "data-self-heal-algorithm": "diff"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    # 'self-heal-daemon' is not part of this call; it is switched off
    # separately once the initial IO has been created.
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal' "
               "are set to 'off', "
               "'data-self-heal-algorithm' "
               "is set to 'diff' successfully")

    # Creating files on client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path, self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # list(): the selection is reused by several helpers below
    bricks_to_bring_offline = list(filter(
        None,
        (bricks_to_bring_offline_dict['hot_tier_bricks'] +
         bricks_to_bring_offline_dict['cold_tier_bricks'] +
         bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data (same file count, now with a fixed 1M size)
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Expand volume by adding bricks to the volume
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    # The message must be a formatted string, not a (fmt, arg) tuple
    self.assertTrue(ret, ("Failed to expand the volume when IO in "
                          "progress on volume %s" % self.volname))
    g.log.info("Expanding volume is successful on volume %s", self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online (order-insensitive comparison)
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums are not equal')
    g.log.info('Checksums are equal')
def test_self_heal_50k_files(self):
    """Verify self-heal of 50k files created while bricks were offline.

    Description:
    - Select bricks to bring offline
    - Bring brick offline
    - Create 50k files from the first mount
    - Validate IO
    - Bring bricks online
    - Wait for and verify that all volume processes are online
    - Monitor heal
    - Check for split-brain
    """
    # pylint: disable=too-many-statements,too-many-locals
    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
    # Check the selection before it is handed to any brick operation
    self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Create 50k files
    command = ("cd %s ; "
               "for i in `seq 1 50000` ; "
               "do dd if=/dev/urandom of=test.$i "
               "bs=100k count=1 ; "
               "done ;" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)

    # Validate IO
    self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                    "IO failed on some of the clients")

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online.
    # The failure message is %-formatted here; passing a tuple as ``msg``
    # would print the raw tuple instead of the volume name.
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Failed to wait for volume %s processes to be online"
        % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Monitor heal completion (extended timeout for the 50k-file workload)
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3000)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def test_self_heal_50k_files_heal_command_by_add_brick(self):
    """Test self-heal of 50k files (heal command), then add bricks.

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - create IO (50k files)
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option "self-heal-daemon": "on"
    - check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - add bricks
    - do rebalance
    - get arequal after adding bricks and compare with
      arequal after getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Successfully set %s for volume %s",
               options, self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # list() so the selection can be reused by every call below
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create 50k files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 50000 %s" % (
            self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online.
    # The message is %-formatted; a tuple ``msg`` would be printed raw.
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Failed to wait for volume %s processes to be online"
        % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online.
    # Order-insensitive comparison via sorted(): assertItemsEqual is not
    # available on Python 3's unittest.TestCase.
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and '
                     'after bringing bricks online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(
        ret, "Failed to expand the volume when IO in progress on "
        "volume %s" % self.volname)
    g.log.info("Expanding volume is successful on volume %s",
               self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Get arequal after adding bricks
    g.log.info('Getting arequal after adding bricks...')
    ret, result_after_adding_bricks = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after adding bricks '
               'is successful')

    # Checking arequals after bringing bricks online
    # and after adding bricks
    self.assertEqual(sorted(result_after_online),
                     sorted(result_after_adding_bricks),
                     'Checksums after bringing bricks online and '
                     'after adding bricks are not equal')
    g.log.info('Checksums after bringing bricks online and '
               'after adding bricks are equal')
def test_heal_info_should_have_fixed_fields(self):
    """Check that heal info reports the expected fields for every brick.

    - Create IO
    - While IO is creating - bring down a couple of bricks
    - Wait for IO to complete
    - Bring up the down bricks
    - Wait for heal to complete
    - Check for fields 'status' and 'numberOfEntries' in the heal info
      summary of each brick
    """
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -f 50 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    # IO is still running; it is validated after the bricks go down
    self.io_validation_complete = False

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get heal info.
    # Assert on the value just fetched, not on the stale ``ret`` left
    # over from the split-brain check above.
    # NOTE(review): presumably get_heal_info_summary returns None on
    # failure - confirm against the library.
    g.log.info('Getting heal info...')
    heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
    self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
    g.log.info(heal_info_dicts)

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')

    # Check all fields in heal info dict
    g.log.info('Checking for all the fields in heal info...')
    for brick in bricks_list:
        g.log.info('Checking fields for %s', brick)
        self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                         'Status is not Connected for brick %s' % brick)
        self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                         'numberOfEntries is not 0 for brick %s' % brick)

    g.log.info('Successfully checked for all the fields in heal info')
def test_self_heal_algorithm_full_daemon_off(self):
    """Check healing with algorithm "full" and self-heal daemon "off".

    Description:
    - Disable metadata/entry/data self-heal, set
      "data-self-heal-algorithm" to "full" and turn the
      self-heal daemon off
    - Bring the selected bricks offline
    - Create 100 files from the first mount and validate IO
    - Collect arequal
    - Turn the self-heal daemon on and bring the bricks online
    - Wait for bricks, volume processes and self-heal daemons
    - Wait for heal to complete
    - Collect arequal again and compare with the earlier one
    """
    # pylint: disable=too-many-statements

    # Setting volume option of self heal & algorithm
    options = {
        "metadata-self-heal": "disable",
        "entry-self-heal": "disable",
        "data-self-heal": "disable",
        "data-self-heal-algorithm": "full",
        "self-heal-daemon": "off"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "Failed to set the volume options %s" % options)
    g.log.info(" Volume set options success")

    # Select bricks to bring down
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)

    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, "Failed to bring bricks: %s offline" % bricks_to_bring_offline)
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, "Not all the bricks in list:%s are offline"
        % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks %s are all offline",
               bricks_to_bring_offline)

    # IO on the mount point
    all_mounts_procs = []
    g.log.info("Creating Files on %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    cmd = ("cd %s ;for i in `seq 1 100` ;"
           "do dd if=/dev/urandom of=file$i bs=1M "
           "count=1;done" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, cmd,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Collecting arequal before bringing the bricks up.
    # Unpack the (status, checksums) pair and check the status, so that
    # two failed collections cannot compare as "equal" further down.
    g.log.info("Collecting Arequal before bringing the bricks online")
    ret, result_before = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')

    # Turning self heal daemon ON
    optionstwo = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, optionstwo)
    self.assertTrue(ret, "Failed to turn self-heal ON")
    g.log.info("Volume set options %s: success", optionstwo)

    # Bring bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, "Failed to bring bricks: %s online" % bricks_to_bring_offline)
    g.log.info("Successfully brought all bricks:%s online",
               bricks_to_bring_offline)

    # Waiting for bricks to come online
    g.log.info("Waiting for brick process to come online")
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname,
                                       timeout=30)
    self.assertTrue(ret, "bricks didn't come online after adding bricks")
    g.log.info("Bricks are online")

    # Verifying all bricks online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self heal processes to come online
    g.log.info("Wait for selfheal process to come online")
    ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                  self.volname,
                                                  timeout=300)
    self.assertTrue(ret, "Self-heal process are not online")
    g.log.info("All self heal process are online")

    # Wait for self-heal to complete
    g.log.info("Wait for self-heal to complete")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(
        ret, "Self heal didn't complete even after waiting "
        "for 20 minutes. 20 minutes is too much a time for "
        "current test workload")
    g.log.info("self-heal is successful after replace-brick operation")

    # arequal after healing
    g.log.info("Collecting Arequal after healing")
    ret, result_after = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')

    # Comparing the results
    g.log.info("comparing both the results")
    self.assertEqual(result_before, result_after,
                     "Arequals are not equal")
def test_self_heal_50k_files_heal_command_by_add_brick(self):
    """Test self-heal of 50k files (heal command), then add bricks.

    Description:
    - Set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "self-heal-daemon": "off"
    - Bring down all bricks processes from selected set
    - Create IO (50k files) from the first mount
    - Get arequal before getting bricks online
    - Bring bricks online
    - Set the volume option "self-heal-daemon": "on"
    - Check for daemons
    - Start healing
    - Check if heal is completed
    - Check for split-brain
    - Get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - Add bricks
    - Do rebalance
    - Get arequal after adding bricks and compare with
      arequal after getting bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options
    options = {
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "data-self-heal": "off",
        "self-heal-daemon": "off"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    # list() so the selection can be reused by every call below
    bricks_to_bring_offline = list(
        filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                      bricks_to_bring_offline_dict['cold_tier_bricks'] +
                      bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating files on client side
    all_mounts_procs = []

    # Create 50k files
    g.log.info('Creating files...')
    command = ("cd %s ; "
               "for i in `seq 1 50000` ; "
               "do dd if=/dev/urandom of=test.$i "
               "bs=100k count=1 ; "
               "done ;" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts[0]),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    ret, result_before_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options
    ret = set_volume_options(self.mnode, self.volname,
                             {"self-heal-daemon": "on"})
    self.assertTrue(ret, 'Failed to set option self-heal-daemon to ON.')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online.
    # The message is %-formatted; a tuple ``msg`` would be printed raw.
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Failed to wait for volume %s processes to be online"
        % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=3600)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    ret, result_after_online = collect_mounts_arequal(self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online.
    # Order-insensitive comparison via sorted(): assertItemsEqual is not
    # available on Python 3's unittest.TestCase.
    self.assertEqual(sorted(result_before_online),
                     sorted(result_after_online),
                     'Checksums before and '
                     'after bringing bricks online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')

    # Add bricks
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(
        ret, "Failed to expand the volume when IO in progress on "
        "volume %s" % self.volname)
    g.log.info("Expanding volume is successful on volume %s",
               self.volname)

    # Do rebalance and wait for it to complete
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                         timeout=3600)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Get arequal after adding bricks
    ret, result_after_adding_bricks = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after adding bricks '
               'is successful')

    # Checking arequals after bringing bricks online
    # and after adding bricks
    self.assertEqual(sorted(result_after_online),
                     sorted(result_after_adding_bricks),
                     'Checksums after bringing bricks online and '
                     'after adding bricks are not equal')
    g.log.info('Checksums after bringing bricks online and '
               'after adding bricks are equal')
def test_self_heal_when_io_in_progress(self):
    """Test self-heal is successful when IO is in progress.

    Description:
    - simulate brick down.
    - bring bricks online
    - wait for heal to complete
    - validate IO

    The IO validated at the end is whatever ``self.all_mounts_procs``
    holds; this method does not start it itself.
    """
    # pylint: disable=too-many-statements
    # Check if volume type is dispersed. If the volume type is
    # dispersed, set the volume option 'disperse.optimistic-change-log'
    # to 'off'
    # Refer to: https://bugzilla.redhat.com/show_bug.cgi?id=1470938
    # pylint: disable=unsupported-membership-test
    if 'dispersed' in self.volume_type and 'nfs' in self.mount_type:
        g.log.info("Set volume option 'disperse.optimistic-change-log' "
                   "to 'off' on a dispersed volume . "
                   "Refer to bug: "
                   "https://bugzilla.redhat.com/show_bug.cgi?id=1470938")
        ret = set_volume_options(self.mnode, self.volname,
                                 {'disperse.optimistic-change-log': 'off'})
        self.assertTrue(
            ret, "Failed to set the volume option %s to off on volume %s"
            % ('disperse.optimistic-change-log', self.volname))
        g.log.info("Successfully set the volume option "
                   "'disperse.optimistic-change-log' to 'off'")

    # Log Volume Info and Status before simulating brick failure
    g.log.info("Logging volume info and Status before bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(
        ret, "Logging volume info and status failed on volume %s"
        % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Select bricks to bring offline.
    # Materialise the selection with list(): on Python 3 filter() returns
    # a one-shot iterator, which would be exhausted by the first helper
    # below and leave the later calls with nothing to act on.
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, "Failed to bring bricks: %s offline" % bricks_to_bring_offline)
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Log Volume Info and Status
    g.log.info("Logging volume info and Status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(
        ret, "Logging volume info and status failed on volume %s"
        % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(
        ret, "Not all the bricks in list: %s are offline"
        % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Add delay before bringing bricks online
    time.sleep(40)

    # Bring bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, "Failed to bring bricks: %s online" % bricks_to_bring_offline)
    g.log.info("Successfully brought all bricks:%s online",
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Failed to wait for volume %s processes to be online"
        % self.volname)
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Log Volume Info and Status
    g.log.info("Logging volume info and Status after bringing bricks "
               "online from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(
        ret, "Logging volume info and status failed on volume %s"
        % self.volname)
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal to complete
    g.log.info("Wait for self-heal to complete")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                    "for 20 minutes. 20 minutes is too much a time for "
                    "current test workload")
    g.log.info("self-heal is successful after replace-brick operation")

    # Validate IO.
    # The flag is set before asserting so it records that validation ran
    # even when the IO itself failed.
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_brick_process_not_started_on_read_only_node_disks(self):
    """Brick process must not start while its disk is mounted read-only.

    Steps:
    * bring one set of bricks offline
    * start IO on every mount
    * unmount the brick directory on its node
    * remount the brick directory with the read-only option
    * start the volume with the "force" option
    * look for 'posix: initializing translator failed' in the brick log
      and check it was logged between the two timestamps taken on the node
    * unmount and remount the brick directory read-write
    * start the volume with the "force" option again
    * validate IO
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Pick the bricks that will be taken down
    selection = select_bricks_to_bring_offline(self.mnode, self.volname)
    offline_bricks = selection['volume_bricks']

    # Take them down and confirm they really are down
    g.log.info('Bringing bricks %s offline...', offline_bricks)
    went_down = bring_bricks_offline(self.volname, offline_bricks)
    self.assertTrue(
        went_down, 'Failed to bring bricks %s offline' % offline_bricks)

    are_down = are_bricks_offline(self.mnode, self.volname, offline_bricks)
    self.assertTrue(
        are_down, 'Bricks %s are not offline' % offline_bricks)
    g.log.info('Bringing bricks %s offline is successful', offline_bricks)

    # Kick off file creation on every mount
    self.all_mounts_procs = []
    for mount in self.mounts:
        g.log.info("Starting IO on %s:%s",
                   mount.client_system, mount.mountpoint)
        io_cmd = ("/usr/bin/env python %s create_files -f 100 "
                  "%s/test_dir" % (self.script_upload_path,
                                   mount.mountpoint))
        io_proc = g.run_async(mount.client_system, io_cmd, user=mount.user)
        self.all_mounts_procs.append(io_proc)

    # Work out the node and the directory to (un)mount from the first
    # offline brick ("host:/path/...")
    brick_host, brick_path = offline_bricks[0].split(':')
    path_parts = brick_path.split('/')
    mount_dir = '/'.join(path_parts[:3])

    # Unmount the brick directory
    g.log.info('Start umount brick %s...', mount_dir)
    rcode, _, _ = g.run(brick_host, 'umount %s' % mount_dir)
    self.assertFalse(rcode, 'Failed to umount brick %s' % mount_dir)
    g.log.info('Successfully umounted %s', mount_dir)

    # Timestamp on the node before the read-only remount
    g.log.info('Getting time before remount the directory and '
               'checking logs for error...')
    _, stamp_before, _ = g.run(brick_host, 'date -u +%s')
    g.log.info('Time before remount the directory and checking logs - %s',
               stamp_before)

    # Remount read-only
    g.log.info('Start remount brick %s with read-only option...',
               mount_dir)
    rcode, _, _ = g.run(brick_host, 'mount -o ro %s' % mount_dir)
    self.assertFalse(rcode, 'Failed to remount brick %s' % mount_dir)
    g.log.info('Successfully remounted %s with read-only option',
               mount_dir)

    # Force-start the volume
    g.log.info('starting volume with "force" option...')
    rcode, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertFalse(
        rcode,
        'Failed to start volume %s with "force" option' % self.volname)
    g.log.info('Successfully started volume %s with "force" option',
               self.volname)

    # The brick log must contain the translator-initialisation error
    g.log.info(
        "Checking logs for an 'initializing translator failed' "
        "error for %s brick...", mount_dir)
    error_msg = 'posix: initializing translator failed'
    grep_cmd = ("cat /var/log/glusterfs/bricks/bricks-%s-%s.log | "
                "grep '%s'" % (path_parts[-2], path_parts[-1], error_msg))
    rcode, grep_out, _ = g.run(brick_host, grep_cmd)
    # Only the most recent matching line is of interest
    last_match = grep_out.rstrip().split('\n')[-1]
    self.assertTrue(error_msg in last_match, 'No errors in logs')
    g.log.info('EXPECTED: %s', error_msg)

    # Extract the timestamp from the log line: text before the first 'E',
    # minus the first character and last two, with fractional seconds
    # dropped
    log_prefix = last_match.split('E')[0]
    log_stamp_text = log_prefix[1:-2].split('.')[0]
    log_stamp_epoch = calendar.timegm(
        time.strptime(log_stamp_text, '%Y-%m-%d %H:%M:%S'))
    g.log.info('Time_msg from logs - %s ', log_stamp_text)
    g.log.info('Time from logs - %s ', log_stamp_epoch)

    # Timestamp on the node after the log check
    g.log.info('Getting time after remount the directory and '
               'checking logs for error...')
    _, stamp_after, _ = g.run(brick_host, 'date -u +%s')
    g.log.info('Time after remount the directory and checking logs - %s',
               stamp_after)

    # The error must have been logged inside the measured window
    g.log.info('Checking if an error is in right time period...')
    in_window = (int(stamp_before)
                 <= int(log_stamp_epoch)
                 <= int(stamp_after))
    self.assertTrue(in_window,
                    'Expected error is not in right time period')
    g.log.info('Expected error is in right time period')

    # Unmount the brick directory again
    g.log.info('Start umount brick %s...', mount_dir)
    rcode, _, _ = g.run(brick_host, 'umount %s' % mount_dir)
    self.assertFalse(rcode, 'Failed to umount brick %s' % mount_dir)
    g.log.info('Successfully umounted %s', mount_dir)

    # Remount read-write
    g.log.info('Start remount brick %s with read-write option...',
               mount_dir)
    rcode, _, _ = g.run(brick_host, 'mount %s' % mount_dir)
    self.assertFalse(rcode, 'Failed to remount brick %s' % mount_dir)
    g.log.info('Successfully remounted %s with read-write option',
               mount_dir)

    # Force-start the volume once more
    g.log.info('starting volume with "force" option...')
    rcode, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertFalse(
        rcode,
        'Failed to start volume %s with "force" option' % self.volname)
    g.log.info('Successfully started volume %s with "force" option',
               self.volname)

    # Finally make sure the IO started earlier succeeded
    io_ok = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(io_ok, "IO failed on some of the clients")
    self.io_validation_complete = True
def test_data_self_heal_algorithm_full_default(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

    Description:
    - set the volume option "data-self-heal-algorithm" to value "full"
    - create IO
    - bring down all bricks processes from selected set
    - modify the data
    - calculate arequal
    - bring bricks online
    - start healing
    - calculate arequal and compare with arequal before bringing bricks
      offline and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Setting options
    g.log.info('Setting options "data-self-heal-algorithm": "full"...')
    options = {"data-self-heal-algorithm": "full"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
               "successfully")

    # Creating files on client side (first mount only)
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("python %s create_files -f 100 %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Select bricks to bring offline.
    # The result is materialised with list(): on Python 3 filter() returns
    # a one-shot iterator, and this collection is reused several times
    # below (bring offline, offline check, bring online, log messages).
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data while the selected bricks are down
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("python %s create_files -f 100 --fixed-file-size 1M %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    # The message must be a formatted string; a (fmt, arg) tuple would be
    # printed verbatim by unittest on failure.
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online.
    # assertItemsEqual exists only on Python 2; Python 3 provides the same
    # order-insensitive comparison as assertCountEqual.
    assert_same_items = getattr(self, 'assertCountEqual', None)
    if assert_same_items is None:
        assert_same_items = self.assertItemsEqual
    assert_same_items(result_before_online, result_after_online,
                      'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')
def test_glustershd_with_restarting_glusterd(self):
    """
    Test Script to verify the self heal daemon process with restarting
    glusterd and rebooting the server

    * stop all volumes
    * restart glusterd - should not run self heal daemon process
    * start replicated involved volumes
    * single self heal daemon process running
    * restart glusterd
    * self heal daemon pid will change
    * bring down brick and restart glusterd
    * self heal daemon pid will change and its different from previous
    * brought up the brick
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']

    # stop the volume
    # NOTE(review): volume_stop is assumed to return a (ret, out, err)
    # tuple like volume_start does - confirm against its definition. With
    # a tuple, the bare "assertTrue(ret)" used previously could never fail.
    g.log.info("Stopping the volume %s", self.volname)
    ret, _, _ = volume_stop(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to stop volume %s" % self.volname))
    g.log.info("Successfully stopped volume %s", self.volname)

    # check the self heal daemon process after stopping the volume
    g.log.info("Verifying the self heal daemon process for "
               "volume %s", self.volname)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is still running "
                           "even after stopping volume %s" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume %s", self.volname)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s"
                          % nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Failed to start glusterd on %s" % self.servers)

    # check the self heal daemon process after restarting glusterd process
    g.log.info("Starting to get self-heal daemon process on"
               " nodes %s", nodes)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is running after "
                           "glusterd restart with volume %s in "
                           "stop state" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume and restarting glusterd %s", self.volname)

    # start the volume
    # volume_start returns (ret, out, err) with ret == 0 on success, so
    # only the return code is asserted on.
    g.log.info("Starting the volume %s", self.volname)
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start volume %s" % self.volname))
    g.log.info("Volume %s started successfully", self.volname)

    # Verify glustershd process releases its parent process
    g.log.info("Checking whether glustershd process is daemonized or not")
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Single self heal daemon process on all nodes %s", nodes)

    # get the self heal daemon pids after starting volume
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Successful in getting self heal daemon pids")
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info
    # with glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different from "
                          "glustershd server volume file. "
                          "Please check log file for details."))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s"
                          % nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online" % self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after starting volume and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_glusterd_restart = pids

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_glusterd_restart,
                        ("Self Heal Daemon pids are same after "
                         "restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different before and "
               "after restarting glusterd process")

    # select bricks to bring offline
    # list() keeps the selection reusable: it is consumed by the offline,
    # online and verification helpers below.
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # bring bricks offline
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # restart glusterd after brought down the brick
    g.log.info("Restart glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s"
                          % nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online" % self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after killing brick and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_killing_brick = pids

    self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                        glustershd_pids_after_killing_brick,
                        ("Self Heal Daemon process are same from before "
                         "killing the brick,restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different after killing the "
               "brick, restarting the glusterd process")

    # brought the brick online
    g.log.info("bringing up the bricks : %s online",
               bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to brought the bricks online"))
    g.log.info("Successfully brought the bricks online")

    # check all bricks are online
    g.log.info("Verifying all bricka are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, ("Not all bricks are online"))
    g.log.info("All bricks are online.")
def test_self_heal(self):
    """
    Description:-
    - Create files on mount point
    - Kill one brick from volume
    - rm -rfv on mount point
    - bring bricks online
    - wait for heals
    - list
    """
    # pylint: disable=too-many-statements

    # IO on the mount point
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d "
               "--dir-depth 2 "
               "--dir-length 35 "
               "--max-num-of-dirs 5 "
               "--num-of-files 5 %s" % (self.script_upload_path,
                                        self.counter,
                                        mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        # Advance the start number so the next mount gets a different
        # --dirname-start-num value.
        self.counter += 10

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.volname)
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Killing one brick from the volume set
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    # The failure message has to be a formatted string; passing a
    # (fmt, arg) tuple would print the raw tuple instead.
    self.assertTrue(ret, ("Failed to bring bricks: %s offline"
                          % bricks_to_bring_offline))
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list: %s are offline"
                    % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Checking volume status
    g.log.info("Logging volume info and Status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Removing files from the mount point when one brick is down
    g.log.info("Removing files from the mount point")
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system
    cmd = "rm -rfv %s/*" % mountpoint
    ret, _, _ = g.run(client, cmd)
    if ret != 0:
        raise ExecutionError("failed to delete the files")

    # Bringing bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Check if bricks are online
    g.log.info("Checking bricks are online or not")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not online' % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Monitoring heals on the volume
    g.log.info("Wait for heal completion...")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                         "for 20 minutes.")
    # This test never changes the volume type, so the success message
    # describes what actually happened here.
    g.log.info("self-heal is successful after bringing bricks online")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")