def test_dynamic_provisioning_glusterfile_glusterpod_failure(self):
    """Create glusterfile PVC when gluster pod is down."""
    # Check that we work with containerized Gluster
    if not self.is_containerized_gluster():
        self.skipTest("Only containerized Gluster clusters are supported.")

    mount_path = "/mnt"
    datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id())

    # Create secret and storage class
    self.create_storage_class()

    # Create PVC
    pvc_name = self.create_and_wait_for_pvc()

    # Create app POD with attached volume
    pod_name = oc_create_tiny_pod_with_volume(
        self.node, pvc_name, "test-pvc-mount-on-app-pod",
        mount_path=mount_path)
    self.addCleanup(
        wait_for_resource_absence, self.node, 'pod', pod_name)
    self.addCleanup(oc_delete, self.node, 'pod', pod_name)

    # Wait for app POD to be up and running
    wait_for_pod_be_ready(
        self.node, pod_name, timeout=60, wait_step=2)

    # Run IO in background
    io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % (
        pod_name, datafile_path)
    async_io = g.run_async(self.node, io_cmd, "root")

    # Pick up one of the hosts which stores PV brick (4+ nodes case)
    gluster_pod_data = get_gluster_pod_names_by_pvc_name(
        self.node, pvc_name)[0]

    # Delete glusterfs POD from chosen host and wait for spawn of new one
    oc_delete(self.node, 'pod', gluster_pod_data["pod_name"])
    cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
           "grep -v Terminating | awk '{print $1}'") % (
               gluster_pod_data["host_name"])
    for w in Waiter(600, 15):
        out = self.cmd_run(cmd)
        new_gluster_pod_name = out.strip().split("\n")[0].strip()
        if not new_gluster_pod_name:
            continue
        else:
            break
    if w.expired:
        error_msg = "exceeded timeout, new gluster pod not created"
        g.log.error(error_msg)
        raise ExecutionError(error_msg)
    new_gluster_pod_name = out.strip().split("\n")[0].strip()
    g.log.info("new gluster pod name is %s" % new_gluster_pod_name)
    wait_for_pod_be_ready(self.node, new_gluster_pod_name)

    # Check that async IO was not interrupted
    ret, out, err = async_io.async_communicate()
    self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
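# Illustrative consolidation (not an existing library API): the
# wait-for-a-replacement-glusterfs-pod loop above could be factored into a
# test-class method like the sketch below. It reuses Waiter, self.cmd_run,
# wait_for_pod_be_ready, ExecutionError and the glusto handle `g` exactly as
# they are used in the test; the method name is hypothetical.
def wait_for_new_gluster_pod(self, host_name, timeout=600, wait_step=15):
    """Poll 'oc get pods' until a non-Terminating glusterfs pod is scheduled
    on 'host_name', wait for it to become Ready and return its name."""
    cmd = ("oc get pods -o wide | grep glusterfs | grep %s | "
           "grep -v Terminating | awk '{print $1}'" % host_name)
    for w in Waiter(timeout, wait_step):
        out = self.cmd_run(cmd)
        pod_name = out.strip().split("\n")[0].strip()
        if pod_name:
            break
    if w.expired:
        error_msg = ("exceeded %ss timeout, no new glusterfs pod was "
                     "scheduled on %s" % (timeout, host_name))
        g.log.error(error_msg)
        raise ExecutionError(error_msg)
    wait_for_pod_be_ready(self.node, pod_name)
    g.log.info("replacement glusterfs pod %s is ready" % pod_name)
    return pod_name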
def test_uss_snap_active_deactive(self): # pylint: disable=too-many-statements """ Steps: * Create volume * Mount volume * Perform I/O on mounts * Create 2 snapshots snapy1 & snapy2 * Validate snap created * Enable USS * Validate USS is enabled * Validate snapd is running * Activate snapy1 & snapy2 * List snaps under .snap directory -- snap1 and snap2 should be listed under .snaps * Deactivate snapy2 * List snaps under .snap directory -- snapy2 is not listed as it is deactivated * Activate snapy2 * List snaps under .snap directory -- snap1 and snap2 should be listed under .snaps """ # Perform I/O g.log.info("Starting IO on all mounts...") self.counter = 1 self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ( "/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 2 " "--max-num-of-dirs 2 " "--num-of-files 2 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("I/O successful on clients") # Enable USS g.log.info("Enable USS on volume") ret, _, _ = enable_uss(self.mnode, self.volname) self.assertEqual(ret, 0, "Failed to enable USS on volume") g.log.info("Successfully enabled USS on volume") # Validate USS is enabled g.log.info("Validating USS is enabled") ret = is_uss_enabled(self.mnode, self.volname) self.assertTrue(ret, "USS is disabled on volume " "%s" % self.volname) g.log.info("USS enabled on volume %s", self.volname) # Validate snapd running for server in self.servers: g.log.info("Validating snapd daemon on:%s", server) ret = is_snapd_running(server, self.volname) self.assertTrue(ret, "Snapd is Not running on " "%s" % server) g.log.info("Snapd Running on node: %s", server) # Create 2 snapshot g.log.info("Creating 2 snapshots for volume %s", self.volname) for i in range(1, 3): ret, _, _ = snap_create(self.mnode, self.volname, "snapy%s" % i) self.assertEqual( ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot %s created successfully for volume %s", "snapy%s" % i, self.volname) # Check for no of snaps using snap_list it should be 2 now snap_list = get_snap_list(self.mnode) self.assertEqual( 2, len(snap_list), "No of snaps not consistent " "for volume %s" % self.volname) g.log.info("Successfully validated number of snaps.") # Activate snapshot snapy1 & snapy2 g.log.info("Activating snapshot snapy1 & snapy2") for i in range(1, 3): ret, _, _ = snap_activate(self.mnode, "snapy%s" % i) self.assertEqual(ret, 0, "Failed to activate snapshot snapy%s" % i) g.log.info("Both snapshots activated successfully") # list activated snapshots directory under .snaps g.log.info("Listing activated snapshots under .snaps") for mount_obj in self.mounts: ret, out, _ = uss_list_snaps(mount_obj.client_system, mount_obj.mountpoint) self.assertEqual( ret, 0, "Directory Listing Failed for" " Activated Snapshot") validate_dir = out.split('\n') self.assertIn( "snapy1", validate_dir, "Failed to " "validate snapy1 under .snaps directory") g.log.info("Activated Snapshot snapy1 listed Successfully") self.assertIn( "snapy2", validate_dir, "Successfully listed" " 
snapy2 under .snaps directory") g.log.info("Both activated snapshots are listed under .snaps") # Deactivate snapshot snapy2 g.log.info("Deactivating snapshot snapy2") ret, _, _ = snap_deactivate(self.mnode, "snapy2") self.assertEqual(ret, 0, "Failed to deactivate snapshot snapy2") g.log.info("Successfully deactivated snapshot snapy2") # Validate snapy2 is not present in mountpoint ret = view_snaps_from_mount(self.mounts, "snapy2") self.assertFalse( ret, "Unexpected: still able to view snapy2" " from mount") g.log.info("Successfully verified deactivated snapshot " "snapy2 is not listed") # Activate snapshot snapy2 ret, _, _ = snap_activate(self.mnode, "snapy2") self.assertEqual(ret, 0, "Failed to activate snapshot snapy2") g.log.info("Snapshot snapy2 activated successfully") # list activated snapshots directory under .snaps g.log.info("Listing activated snapshots under .snaps") for mount_obj in self.mounts: ret, out, _ = uss_list_snaps(mount_obj.client_system, mount_obj.mountpoint) self.assertEqual( ret, 0, "Directory listing failed for" " activated snapshot") validate_dir = out.split('\n') self.assertIn( "snapy1", validate_dir, "Failed to " "validate snapy1 under .snaps directory") g.log.info("Activated snapshot snapy1 listed successfully") self.assertIn( "snapy2", validate_dir, "Failed to " "validate snapy2 under .snaps directory") g.log.info("Activated snapshot snapy2 listed successfully")
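# Illustrative helper (hypothetical name): the two .snaps listings in the
# test above perform the same per-mount check. A consolidated assertion,
# reusing uss_list_snaps with the signature shown above, could look like
# this; call it e.g. with (["snapy1"], unexpected=["snapy2"]) right after
# deactivating snapy2.
def assert_snaps_under_dot_snaps(self, expected, unexpected=()):
    """Assert that every snapshot in 'expected' is visible under .snaps on
    each mount and that nothing in 'unexpected' is visible."""
    for mount_obj in self.mounts:
        ret, out, _ = uss_list_snaps(mount_obj.client_system,
                                     mount_obj.mountpoint)
        self.assertEqual(
            ret, 0, "Directory listing of .snaps failed on %s:%s"
            % (mount_obj.client_system, mount_obj.mountpoint))
        listed = out.split('\n')
        for snap in expected:
            self.assertIn(
                snap, listed,
                "Activated snapshot %s is not listed under .snaps" % snap)
        for snap in unexpected:
            self.assertNotIn(
                snap, listed,
                "Deactivated snapshot %s is still listed under .snaps"
                % snap)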
def test_data_self_heal_daemon_off(self): """ Test Data-Self-Heal (heal command) Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" - create IO - Get areequal before getting bricks offline - set the volume option "self-heal-daemon": "off" - bring down all bricks processes from selected set - Get areequal after getting bricks offline and compare with areequal before getting bricks offline - modify the data - bring bricks online - set the volume option "self-heal-daemon": "on" - check daemons and start healing - check if heal is completed - check for split-brain - add bricks - do rebalance - create 5k files - while creating files - kill bricks and bring bricks online one by one in cycle - validate IO """ # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Successfully set %s for volume %s" % (options, self.volname)) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s" % (mount_obj.client_system, mount_obj.mountpoint)) # Create files g.log.info('Creating files...') command = ( "python %s create_files -f 100 --fixed-file-size 1k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts") # Get areequal before getting bricks offline g.log.info('Getting areequal before getting bricks offline...') ret, result_before_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before getting bricks offline ' 'is successful') # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = filter( None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks'])) # Bring brick offline g.log.info('Bringing bricks %s offline...' 
% bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful' % bricks_to_bring_offline) # Get areequal after getting bricks offline g.log.info('Getting areequal after getting bricks offline...') ret, result_after_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal after getting bricks offline ' 'is successful') # Checking areequals before bringing bricks offline # and after bringing bricks offline self.assertEqual( result_before_offline, result_after_offline, 'Checksums before and ' 'after bringing bricks online are not equal') g.log.info('Checksums before and after bringing bricks online ' 'are equal') # Modify the data self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Modifying data for %s:%s" % (mount_obj.client_system, mount_obj.mountpoint)) # Create files g.log.info('Creating files...') command = ( "python %s create_files -f 100 --fixed-file-size 10k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts") # Bring brick online g.log.info('Bringing bricks %s online...' 
% bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful' % bricks_to_bring_offline) # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online" % self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Add bricks g.log.info("Start adding bricks to volume...") ret = expand_volume(self.mnode, self.volname, self.servers, self.all_servers_info) self.assertTrue(ret, ("Failed to expand the volume %s", self.volname)) g.log.info("Expanding volume is successful on " "volume %s" % self.volname) # Do rebalance ret, out, err = rebalance_start(self.mnode, self.volname) self.assertEqual(ret, 0, 'Failed to start rebalance') g.log.info('Rebalance is started') ret = wait_for_rebalance_to_complete(self.mnode, self.volname) self.assertTrue(ret, 'Rebalance is not completed') g.log.info('Rebalance is completed successfully') # Create 1k files self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Modifying data for %s:%s" % (mount_obj.client_system, mount_obj.mountpoint)) # Create files g.log.info('Creating files...') command = ("python %s create_files -f 1000 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Kill all bricks in cycle bricks_list = get_all_bricks(self.mnode, self.volname) for brick in bricks_list: # Bring brick offline g.log.info('Bringing bricks %s offline' % brick) ret = bring_bricks_offline(self.volname, [brick]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick) ret = are_bricks_offline(self.mnode, self.volname, [brick]) self.assertTrue(ret, 'Bricks %s are not 
offline' % brick) g.log.info('Bringing brick %s offline is successful' % brick) # Bring brick online g.log.info('Bringing brick %s online...' % brick) ret = bring_bricks_online(self.mnode, self.volname, [brick]) self.assertTrue( ret, 'Failed to bring brick %s online' % brick) g.log.info('Bringing brick %s online is successful' % brick) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online( self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online" % self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue( ret, "Either no self-heal daemon process was found or " "more than one self-heal daemon process was found") g.log.info("All self-heal-daemons are online") # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts")
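# Illustrative helper (hypothetical name): the kill-and-revive loop at the
# end of the test above repeats the same offline/online/verify sequence for
# every brick. It could be expressed once as below, reusing
# bring_bricks_offline, are_bricks_offline and bring_bricks_online with the
# signatures already used in these tests.
def cycle_brick_offline_online(self, brick):
    """Bring a single brick offline, verify it, then bring it back online."""
    ret = bring_bricks_offline(self.volname, [brick])
    self.assertTrue(ret, 'Failed to bring brick %s offline' % brick)

    ret = are_bricks_offline(self.mnode, self.volname, [brick])
    self.assertTrue(ret, 'Brick %s is not offline' % brick)
    g.log.info('Brick %s is offline', brick)

    ret = bring_bricks_online(self.mnode, self.volname, [brick])
    self.assertTrue(ret, 'Failed to bring brick %s online' % brick)
    g.log.info('Brick %s is back online', brick)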
def test_rebalance_with_quota_enabled_on_subdirectory(self): """ Test rebalance with quota enabled on subdirectory. 1. Create Volume of type distribute 2. Set Quota limit on subdirectory 3. Do some IO to reach the Hard limit 4. After IO ends, compute arequal checksum 5. Add bricks to the volume. 6. Start rebalance 7. After rebalance is completed, check arequal checksum """ # Creating main directory. ret = mkdir(self.mounts[0].client_system, "{}/main".format(self.mounts[0].mountpoint)) self.assertTrue(ret, "mkdir of dir main failed") # Enable Quota ret, _, _ = quota_enable(self.mnode, self.volname) self.assertEqual( ret, 0, ("Failed to enable quota on the volume %s", self.volname)) g.log.info("Successfully enabled quota on volume %s", self.volname) # Set the Quota timeouts to 0 for strict accounting ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0) self.assertEqual( ret, 0, ("Failed to set hard-timeout to 0 for %s", self.volname)) ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0) self.assertEqual( ret, 0, ("Failed to set soft-timeout to 0 for %s", self.volname)) g.log.info( "Quota soft and hard timeout has been set to 0 for %s", self.volname) # Set the quota limit of 1 GB on /main dir of the volume ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/main", "1GB") self.assertEqual(ret, 0, "Failed to set Quota for dir /main") g.log.info("Successfully set quota limit for dir /main") # Do some IO until hard limit is reached. cmd = ( "/usr/bin/env python %s create_files " "-f 1024 --fixed-file-size 1M --base-file-name file %s/main/" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async( self.mounts[0].client_system, cmd, user=self.mounts[0].user) self.all_mounts_procs.append(proc) # Wait for IO to complete and validate IO self.assertTrue(wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]), "IO failed on some of the clients") g.log.info("IO completed on the clients") # Validate quota ret = quota_validate(self.mnode, self.volname, path='/main', hard_limit=1073741824, sl_exceeded=True, hl_exceeded=True) self.assertTrue(ret, "Quota validate Failed for '/main'") g.log.info("Quota Validated for path '/main'") # Compute arequal checksum. arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts) # Log Volume info and status before expanding volume. log_volume_info_and_status(self.mnode, self.volname) # Expand the volume. ret = expand_volume(self.mnode, self.volname, self.servers, self.all_servers_info) self.assertTrue(ret, ("Failed to expand the volume %s", self.volname)) g.log.info("Expanding volume is successful on " "volume %s", self.volname) # Log volume info and status after expanding volume. log_volume_info_and_status(self.mnode, self.volname) # Perform rebalance start operation. ret, _, _ = rebalance_start(self.mnode, self.volname) self.assertEqual(ret, 0, ("Failed to start rebalance on the volume " "%s", self.volname)) g.log.info("Rebalance started.") # Check rebalance is in progress rebalance_status = get_rebalance_status(self.mnode, self.volname) ret = rebalance_status['aggregate']['statusStr'] self.assertEqual(ret, "in progress", ("Rebalance is not in " "'in progress' state, either " "rebalance is in completed state" " or failed to get rebalance " "status")) # Wait till rebalance ends. 
ret = wait_for_rebalance_to_complete(self.mnode, self.volname) self.assertTrue(ret, ("Rebalance is not yet complete on the volume " "%s", self.volname)) g.log.info("Rebalance is successfully complete on the volume %s", self.volname) # Validate quota ret = quota_validate(self.mnode, self.volname, path='/main', hard_limit=1073741824, sl_exceeded=True, hl_exceeded=True) self.assertTrue(ret, "Quota validate Failed for '/main'") g.log.info("Quota Validated for path '/main'") # Compute arequal checksum. arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts) # Comparing arequals checksum before and after rebalance. self.assertEqual(arequal_checksum_before_rebalance, arequal_checksum_after_rebalance, "arequal checksum is NOT MATCHING") g.log.info("arequal checksum is SAME")
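# Illustrative helper (hypothetical name): the quota setup at the start of
# the test above (enable quota, zero both accounting timeouts for strict
# accounting, set a hard limit on a directory) is a reusable sequence. The
# sketch below reuses quota_enable, quota_set_hard_timeout,
# quota_set_soft_timeout and quota_limit_usage with the signatures shown
# above; the default limit string is just an example.
def enable_strict_quota_on_dir(self, path, limit="1GB"):
    """Enable quota with zero soft/hard timeouts and set a hard limit on
    'path' (for example '/main')."""
    ret, _, _ = quota_enable(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to enable quota on %s" % self.volname)

    # Zero timeouts so usage is accounted immediately rather than cached.
    for set_timeout in (quota_set_hard_timeout, quota_set_soft_timeout):
        ret, _, _ = set_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, "Failed to set a quota timeout to 0 on %s" % self.volname)

    ret, _, _ = quota_limit_usage(self.mnode, self.volname, path, limit)
    self.assertEqual(ret, 0, "Failed to set quota limit on %s" % path)
    g.log.info("Quota of %s set on %s%s", limit, self.volname, path)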
def test_replace_brick_self_heal_io_in_progress(self): """ - Create directory on mount point and write files/dirs - Create another set of files (1K files) - While creation of files/dirs are in progress Kill one brick - Remove the contents of the killed brick(simulating disk replacement) - When the IO's are still in progress, restart glusterd on the nodes where we simulated disk replacement to bring back bricks online - Start volume heal - Wait for IO's to complete - Verify whether the files are self-healed - Calculate arequals of the mount point and all the bricks """ # pylint: disable=too-many-locals,too-many-statements,too-many-branches # Create dirs with files g.log.info('Creating dirs with file...') command = ("/usr/bin/env python %s create_deep_dirs_with_files " "-d 2 -l 2 -n 2 -f 10 %s" % (self.script_upload_path, self.mounts[0].mountpoint)) ret, _, err = g.run(self.mounts[0].client_system, command, user=self.mounts[0].user) self.assertFalse(ret, err) g.log.info("IO is successful") # Creating another set of files (1K files) self.all_mounts_procs = [] # Create dirs with files g.log.info('Creating 1K files...') command = ("/usr/bin/env python %s create_files " "-f 1500 --fixed-file-size 10k %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO ret = validate_io_procs(self.all_mounts_procs, self.mounts[0]) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list( filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # Bring brick offline ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Remove the content of the killed bricks for brick in bricks_to_bring_offline: brick_node, brick_path = brick.split(':') # Removing files command = ('cd %s ; rm -rf *' % brick_path) ret, _, err = g.run(brick_node, command) self.assertFalse(ret, err) g.log.info('Files are deleted on brick %s', brick) # Bring brick online ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Wait for volume processes to be online ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online ret = 
is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal daemons are online") # Start healing ret = trigger_heal_full(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Check arequals for "replicated" all_bricks = get_all_bricks(self.mnode, self.volname) if self.volume_type == "replicated": # Get arequal after bricks are online ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after successfully bringing' 'bricks online.') mount_point_total = arequals[0].splitlines()[-1].split(':')[-1] # Get arequal on bricks and compare with mount_point_total ret, arequals = collect_bricks_arequal(all_bricks) self.assertTrue(ret, 'Failed to get arequal on bricks') for arequal in arequals: brick_total = arequal.splitlines()[-1].split(':')[-1] self.assertEqual( mount_point_total, brick_total, 'Arequals for mountpoint and brick ' 'are not equal') g.log.info('Arequals for mountpoint and brick are equal') # Check arequals for "distributed-replicated" if self.volume_type == "distributed-replicated": # Get the subvolumes subvols_dict = get_subvols(self.mnode, self.volname) num_subvols = len(subvols_dict['volume_subvols']) g.log.info("Number of subvolumes in volume %s:", num_subvols) # Get arequals and compare for i in range(0, num_subvols): # Get arequal for first brick subvol_brick_list = subvols_dict['volume_subvols'][i] ret, arequal = collect_bricks_arequal(subvol_brick_list[0]) self.assertTrue(ret, 'Failed to get arequal on first brick') first_brick_total = arequal[0].splitlines()[-1].split(':')[-1] # Get arequal for every brick and compare with first brick ret, arequals = collect_bricks_arequal(subvol_brick_list) self.assertTrue(ret, 'Failed to get arequal on bricks') for arequal in arequals: brick_total = arequal.splitlines()[-1].split(':')[-1] self.assertEqual( first_brick_total, brick_total, 'Arequals for subvol and brick are ' 'not equal') g.log.info('Arequals for subvol and brick are equal')
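# Illustrative helper (hypothetical name): both arequal branches above
# compare a reference checksum against the last line of each brick's
# arequal output. The comparison against the mountpoint can be written once,
# reusing collect_mounts_arequal and collect_bricks_arequal as called above.
def assert_bricks_match_mount_arequal(self, bricks):
    """Assert that the arequal total of every brick equals the arequal
    total computed on the mountpoint."""
    ret, mount_arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal on mounts')
    mount_total = mount_arequals[0].splitlines()[-1].split(':')[-1]

    ret, brick_arequals = collect_bricks_arequal(bricks)
    self.assertTrue(ret, 'Failed to get arequal on bricks')
    for arequal in brick_arequals:
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        self.assertEqual(
            mount_total, brick_total,
            'Arequal of a brick does not match the mountpoint arequal')
    g.log.info('Arequals for mountpoint and all %d bricks are equal',
               len(bricks))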
def setUpClass(cls): # Calling GlusterBaseClass setUpClass GlusterBaseClass.setUpClass.im_func(cls) # Setup Volume and Mount Volume g.log.info("Starting to Setup Volume and Mount Volume") ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts) if not ret: raise ExecutionError("Failed to Setup_Volume and Mount_Volume") g.log.info("Successful in Setup Volume and Mount Volume") # Upload io scripts for running IO on mounts g.log.info("Upload io scripts to clients %s for running IO on " "mounts", cls.clients) script_local_path = ("/usr/share/glustolibs/io/scripts/" "file_dir_ops.py") cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/" "file_dir_ops.py") ret = upload_scripts(cls.clients, script_local_path) if not ret: raise ExecutionError("Failed to upload IO scripts to clients %s" % cls.clients) g.log.info("Successfully uploaded IO scripts to clients %s", cls.clients) # The --dir-length argument value for # file_dir_ops.py create_deep_dirs_with_files is set to 10 # (refer to the cmd in setUp method). This means every mount will # create # 10 top level dirs. For every mountpoint/testcase to create new set of # dirs, we are incrementing the counter by --dir-length value i.e 10 # in this test suite. # # If we are changing the --dir-length to new value, ensure the counter # is also incremented by same value to create new set of files/dirs. # Start IO on mounts g.log.info("Starting IO on all mounts...") cls.all_mounts_procs = [] for index, mount_obj in enumerate(cls.mounts, start=1): g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 1 " "--dir-length 2 " "--max-num-of-dirs 2 " "--num-of-files 55 %s" % (cls.script_upload_path, index + 10, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) cls.all_mounts_procs.append(proc) cls.io_validation_complete = False # Wait for IO to complete if not cls.io_validation_complete: g.log.info("Wait for IO to complete") ret = wait_for_io_to_complete(cls.all_mounts_procs, cls.mounts) if not ret: raise ExecutionError("IO failed on some of the clients") g.log.info("IO is successful on all mounts") # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(cls.mounts) if not ret: raise ExecutionError("Failed to list all files and dirs") g.log.info("Listing all files and directories is successful")
def test_fops_ec_volume(self): # pylint: disable=too-many-branches,too-many-statements,too-many-locals """ - 1.Start resource consumption tool - 2.Create directory dir1 - 3.Create 5 dir and 5 files in each dir in directory 1 - 4.Rename all file inside dir1 - 5.Truncate at any dir in mountpoint inside dir1 - 6.Create softlink and hardlink of files in mountpoint - 7.Delete op for deleting all file in one of the dirs - 8.chmod, chown, chgrp inside dir1 - 9.Create tiny, small, medium nd large file - 10.Creating files on client side for dir1 - 11.Validating IO's and waiting to complete - 12.Get areequal before killing the brick - 13.Killing 1st brick manually - 14.Get areequal after killing 1st brick - 15.Killing 2nd brick manually - 16.Get areequal after killing 2nd brick - 17.Getting arequal and comparing the arequals - 18.Deleting dir1 """ # Starting resource consumption using top log_file_mem_monitor = getcwd() + '/mem_usage.log' cmd = 'for i in {1..100};do top -n 1 -b|egrep \ "RES|gluster" & free -h 2>&1 >> ' + \ log_file_mem_monitor + ' ;sleep 10;done' g.log.info(cmd) for mount_obj in self.mounts: g.run_async(mount_obj.client_system, cmd) bricks_list = [] # get the bricks from the volume g.log.info("Fetching bricks for the volume : %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List : %s", bricks_list) # Creating dir1 cmd = ('mkdir %s/dir1' % self.mounts[0].mountpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to create directory1") g.log.info("Directory 1 created successfully for %s", self.mounts[0]) # Create 5 dir and 5 files in each dir at mountpoint on dir1 start = 1 end = 5 for mount_obj in self.mounts: # Number of dir and files to be created. dir_range = str(start) + ".." + str(end) file_range = str(start) + ".." + str(end) # Create dir 1-5 at mountpoint. cmd = ('mkdir %s/dir1/dir{%s};' % (mount_obj.mountpoint, dir_range)) g.run(mount_obj.client_system, cmd) # Create files inside each dir. cmd = ('touch %s/dir1/dir{%s}/file{%s};' % (mount_obj.mountpoint, dir_range, file_range)) g.run(mount_obj.client_system, cmd) # Increment counter so that at next client dir and files are made # with diff offset. Like at next client dir will be named # dir6, dir7...dir10. Same with files. start += 5 end += 5 # Rename all files inside dir1 at mountpoint on dir1 clients = [] for mount_obj in self.mounts: clients.append(mount_obj.client_system) cmd = ('cd %s/dir1/dir1/; ' 'for FILENAME in *;' 'do mv $FILENAME Unix_$FILENAME; ' 'done;' % mount_obj.mountpoint) g.run_parallel(clients, cmd) # Truncate at any dir in mountpoint inside dir1 # start is an offset to be added to dirname to act on # diff files at diff clients. start = 1 for mount_obj in self.mounts: cmd = ('cd %s/dir1/dir%s/; ' 'for FILENAME in *;' 'do echo > $FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start))) g.run(mount_obj.client_system, cmd) # Create softlink and hardlink of files in mountpoint. Start is an # offset to be added to dirname to act on diff files at diff clients. 
start = 1 for mount_obj in self.mounts: cmd = ('cd %s/dir1/dir%s; ' 'for FILENAME in *; ' 'do ln -s $FILENAME softlink_$FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start))) g.run(mount_obj.client_system, cmd) cmd = ('cd %s/dir1/dir%s; ' 'for FILENAME in *; ' 'do ln $FILENAME hardlink_$FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start + 1))) g.run(mount_obj.client_system, cmd) start += 5 # chmod, chown, chgrp inside dir1 # start and end used as offset to access diff files # at diff clients. start = 2 end = 5 for mount_obj in self.mounts: dir_file_range = '%s..%s' % (str(start), str(end)) cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) g.run(mount_obj.client_system, cmd) cmd = ('chown root %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) g.run(mount_obj.client_system, cmd) cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) g.run(mount_obj.client_system, cmd) start += 5 end += 5 # Create tiny, small, medium nd large file # at mountpoint. Offset to differ filenames # at diff clients. offset = 1 for mount_obj in self.mounts: cmd = 'fallocate -l 100 tiny_file%s.txt' % str(offset) g.run(mount_obj.client_system, cmd) cmd = 'fallocate -l 20M small_file%s.txt' % str(offset) g.run(mount_obj.client_system, cmd) cmd = 'fallocate -l 200M medium_file%s.txt' % str(offset) g.run(mount_obj.client_system, cmd) cmd = 'fallocate -l 1G large_file%s.txt' % str(offset) g.run(mount_obj.client_system, cmd) offset += 1 # Creating 2TB file if volume is greater # than equal to 3TB list1 = [] command = ("df %s" % mount_obj.mountpoint) rcode, rout, rerr = g.run(mount_obj.client_system[0], command) if rcode == 0: list1 = rout.split("\n")[1].split() avail = list1[3] if int(avail) >= 3000000000: cmd = 'fallocate -l 2TB tiny_file_large.txt' g.run(mount_obj.client_system[0], cmd) g.log.error("Get mountpoint failed: %s", rerr) # Creating files on client side for dir1 # Write IO all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validating IO's and waiting to complete g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get areequal before killing the brick g.log.info('Getting areequal before killing of brick...') ret, result_before_killing_brick = (collect_mounts_arequal( self.mounts[0])) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before killing of brick ' 'is successful') # Kill 1st brick manually ret = bring_bricks_offline(self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Brick not offline') g.log.info('Brick is offline successfully') # Get areequal after killing 1st brick g.log.info('Getting areequal after killing of brick...') ret, result_after_killing_brick = (collect_mounts_arequal( self.mounts[0])) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before killing of brick ' 'is successful') # Kill 2nd brick manually ret = 
bring_bricks_offline(self.volname, [bricks_list[3]]) self.assertTrue(ret, 'Brick not offline') g.log.info('Brick is offline successfully') # Get areequal after killing 2nd brick g.log.info('Getting areequal after killing of brick...') ret, result_after_killing_brick_2 = (collect_mounts_arequal( self.mounts[0])) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before killing of brick ' 'is successful') # Comparing areequals self.assertEqual( result_before_killing_brick, result_after_killing_brick, 'Areequals are not equals before killing brick' 'processes and after offlining 1 redundant bricks') g.log.info('Areequals are equals before killing brick' 'processes and after offlining 1 redundant bricks') # Comparing areequals self.assertEqual(result_after_killing_brick, result_after_killing_brick_2, 'Areequals are not equals after killing 2' ' bricks') g.log.info('Areequals are equals after offlining 2 redundant bricks') # Delete op for deleting all file in one of the dirs. start is being # used as offset like in previous testcase in dir1 start = 1 for mount_obj in self.mounts: cmd = ('cd %s/dir1/dir%s; ' 'for FILENAME in *; ' 'do rm -f $FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start))) g.run(mount_obj.client_system, cmd) start += 5 # Deleting dir1 cmd = ('rm -rf %s/dir1' % self.mounts[0].mountpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to delete directory1") g.log.info("Directory 1 deleted successfully for %s", self.mounts[0])
def test_brick_log_messages(self): ''' -> Create volume -> Mount volume -> write files on mount point -> delete files from mount point -> check for any errors filled in all brick logs ''' # checking volume mounted or not for mount_obj in self.mounts: ret = is_mounted(self.volname, mount_obj.mountpoint, self.mnode, mount_obj.client_system, self.mount_type) self.assertTrue(ret, "Not mounted on %s" % mount_obj.client_system) g.log.info("Mounted on %s", mount_obj.client_system) # run IOs g.log.info("Starting IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 5 " "--max-num-of-dirs 3 " "--num-of-files 10 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.counter = self.counter + 10 # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients" ) # Getting timestamp _, timestamp, _ = g.run_local('date +%s') timestamp = timestamp.strip() # Getting all bricks brick_list = get_all_bricks(self.mnode, self.volname) self.assertIsNotNone(brick_list, "Failed to get brick list") g.log.info("Successful in getting brick list %s", brick_list) # Creating dictionary for each node brick path, # here nodes are keys and brick paths are values brick_path_dict = {} for brick in brick_list: node, brick_path = brick.split(r':') brick_path_list = brick_path.split(r'/') del brick_path_list[0] brick_log_path = '-'.join(brick_path_list) brick_path_dict[node] = brick_log_path for node in brick_path_dict: # Copying brick logs into other file for backup purpose ret, _, _ = g.run(node, 'cp /var/log/glusterfs/bricks/%s.log ' '/var/log/glusterfs/bricks/%s_%s.log' % (brick_path_dict[node], brick_path_dict[node], timestamp)) if ret: raise ExecutionError("Failed to copy brick logs of %s" % node) g.log.info("Brick logs copied successfully on node %s", node) # Clearing the existing brick log file ret, _, _ = g.run(node, 'echo > /var/log/glusterfs/bricks/%s.log' % brick_path_dict[node]) if ret: raise ExecutionError("Failed to clear brick log file on %s" % node) g.log.info("Successfully cleared the brick log files on node %s", node) # Deleting files from mount point ret, _, _ = g.run(self.mounts[0].client_system, 'rm -rf %s/*' % self.mounts[0].mountpoint) self.assertEqual(ret, 0, "Failed to delete files from mountpoint %s" % self.mounts[0].mountpoint) g.log.info("Files deleted successfully from mountpoint %s", self.mounts[0].mountpoint) # Searching for error messages in brick logs after deleting # files from mountpoint for node in brick_path_dict: ret, out, _ = g.run( node, "grep ' E ' /var/log/glusterfs/bricks/%s.log | wc -l" % brick_path_dict[node]) self.assertEqual(int(out), 0, "Found Error messages in brick " "log %s" % node) g.log.info("No error messages found in brick log %s", node)
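# Illustrative helper (hypothetical name): the error scan at the end of the
# test above greps each truncated brick log for error-severity lines. As a
# standalone function it reuses only g.run and ExecutionError, both already
# used throughout this module.
def count_brick_log_errors(node, brick_log_name):
    """Return the number of ' E ' (error severity) lines in
    /var/log/glusterfs/bricks/<brick_log_name>.log on 'node'."""
    cmd = ("grep ' E ' /var/log/glusterfs/bricks/%s.log | wc -l"
           % brick_log_name)
    ret, out, _ = g.run(node, cmd)
    if ret:
        raise ExecutionError("Failed to scan the brick log on %s" % node)
    return int(out.strip())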
def test_conservative_merge_of_files_heal_command(self): """ - set options: "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "self-heal-daemon": "off" - Bring brick 0 offline - Creating files on client side - Bring brick 0 online - Bring brick 1 offline - Creating files on client side - Bring brick 1 online - Get arequal on bricks - Setting option "self-heal-daemon": "on" - Start healing - Get arequal on bricks and compare with arequals before healing and mountpoint """ # pylint: disable=too-many-statements,too-many-locals # set options bricks_list = get_all_bricks(self.mnode, self.volname) options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "self-heal-daemon": "off" } g.log.info("setting options %s", options) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set volume option %s for" "volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # Bring brick 0 offline g.log.info('Bringing bricks %s offline', bricks_list[0]) ret = bring_bricks_offline(self.volname, bricks_list[0]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_list[0]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0]) g.log.info('Bringing bricks %s offline is successful', bricks_list[0]) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create files g.log.info('Creating files...') command = ("/usr/bin/env python %s create_deep_dirs_with_files " "-d 0 -l 5 -f 10 --dirname-start-num 1 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Bring brick 0 online g.log.info('Bringing bricks %s online...', bricks_list[0]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_list[0]) g.log.info('Bringing bricks %s online is successful', bricks_list[0]) # Bring brick 1 offline g.log.info('Bringing bricks %s offline', bricks_list[1]) ret = bring_bricks_offline(self.volname, bricks_list[1]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_list[1]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1]) g.log.info('Bringing bricks %s offline is successful', bricks_list[1]) # Creating files on client side self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create files g.log.info('Creating files...') command = ("/usr/bin/env python %s create_deep_dirs_with_files " "-d 0 -l 5 -f 10 --dirname-start-num 6 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Bring brick 1 online g.log.info('Bringing bricks %s 
online...', bricks_list[1]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks_list[1]) g.log.info('Bringing bricks %s online is successful', bricks_list[1]) # Get arequal on bricks arequals_before_heal = {} g.log.info('Getting arequal on bricks...') for brick in bricks_list: g.log.info('Getting arequal on bricks %s...', brick) node, brick_path = brick.split(':') command = ('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, arequal, _ = g.run(node, command) self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick) g.log.info('Getting arequal for %s is successful', brick) brick_total = arequal.splitlines()[-1].split(':')[-1] arequals_before_heal[brick] = brick_total # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequals for mount g.log.info('Getting arequal before getting bricks offline...') ret, arequals = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after healing is successful') mount_point_total = arequals[0].splitlines()[-1].split(':')[-1] # Get arequal on bricks and compare with mount_point_total # It should be the same g.log.info('Getting arequal on bricks...') arequals_after_heal = {} for brick in bricks_list: g.log.info('Getting arequal on bricks %s...', brick) node, brick_path = brick.split(':') command = ('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, arequal, _ = g.run(node, command) self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick) g.log.info('Getting arequal for %s is successful', brick) brick_total = arequal.splitlines()[-1].split(':')[-1] arequals_after_heal[brick] = brick_total 
self.assertEqual( mount_point_total, brick_total, 'Arequals for mountpoint and %s are not equal' % brick) g.log.info('Arequals for mountpoint and %s are equal', brick) g.log.info('All arequals are equal for replicated') self.assertNotEqual( arequals_before_heal, arequals_after_heal, 'Arequals are equal for bricks before (%s) and after (%s) ' 'healing' % (arequals_before_heal, arequals_after_heal))
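# Illustrative helper (hypothetical name): the per-brick checksums taken
# before and after healing in the test above both shell out to the
# arequal-checksum binary while skipping gluster-internal directories. The
# collection step could be written once as below; arequal-checksum must be
# installed on the brick nodes, as the test already assumes.
def get_brick_arequal_totals(self, bricks):
    """Return {brick: arequal-total} for every 'host:path' entry in bricks."""
    totals = {}
    for brick in bricks:
        node, brick_path = brick.split(':')
        cmd = ('arequal-checksum -p %s '
               '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
        ret, arequal, _ = g.run(node, cmd)
        self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
        totals[brick] = arequal.splitlines()[-1].split(':')[-1]
    return totals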
def test_validate_snaps_restore(self): # pylint: disable=too-many-statements # Start IO on all mounts. all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Setting some volume option related to snapshot option_before_restore = { 'volumeConfig': [{ 'softLimit': '100', 'effectiveHardLimit': '200', 'hardLimit': '256' }], 'systemConfig': { 'softLimit': '90%', 'activateOnCreate': 'disable', 'hardLimit': '256', 'autoDelete': 'disable' } } ret = set_snap_config(self.mnode, option_before_restore) self.assertTrue(ret, ("Failed to set vol option on %s" % self.volname)) g.log.info("Volume options for%s is set successfully", self.volname) # Get brick list before taking snap_restore bricks_before_snap_restore = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List before snap restore " "volume: %s", bricks_before_snap_restore) # Creating snapshot ret = snap_create(self.mnode, self.volname, "snap1") self.assertTrue(ret, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap1 created successfully for volume %s", self.volname) # Again start IO on all mounts. all_mounts_procs = [] count = 1000 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. 
g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Reset volume to make sure volume options will reset ret = volume_reset(self.mnode, self.volname, force=False) self.assertTrue(ret, ("Failed to reset %s" % self.volname)) g.log.info("Reset Volume %s is Successful", self.volname) # Removing one brick g.log.info("Starting volume shrink") ret = shrink_volume(self.mnode, self.volname, force=True) self.assertTrue(ret, ("Failed to shrink the volume on " "volume %s", self.volname)) g.log.info("Shrinking volume is successful on " "volume %s", self.volname) # Restore snapshot ret = snap_restore_complete(self.mnode, self.volname, "snap1") self.assertTrue(ret, ("Failed to restore snap snap1 on the " "volume %s", self.volname)) g.log.info( "Restore of volume is successful from snap1 on " "volume %s", self.volname) # Validate volume is up and running g.log.info("Verifying volume is up and process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online", self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Get volume options post restore option_after_restore = get_snap_config(self.mnode) # Compare volume options self.assertNotEqual(option_before_restore, option_after_restore, "Volume Options are not same after snap restore") # Get brick list post restore bricks_after_snap_restore = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List after snap restore " "volume: %s", bricks_after_snap_restore) # Compare brick_list self.assertNotEqual(bricks_before_snap_restore, bricks_after_snap_restore, "Bricks are not same after snap restore") # Creating snapshot ret = snap_create(self.mnode, self.volname, "snap2") self.assertTrue(ret, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap2 created successfully for volume %s", self.volname) # Again start IO on all mounts after restore all_mounts_procs = [] count = 1000 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created")
def test_oom_on_client_heal_in_progress(self): """ - Create a 1x(2+1) arbiter replicate volume - Create IO - Bring down the 1-st data brick while creating IO - Bring up the 1-st data brick after creating and checking IO - Bring down the 3-d arbiter brick - Bring up the 3-d arbiter brick - Check there no any oom by listing the files from mountpoint """ # Creating IO on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create files g.log.info('Creating files...') command = ("python %s create_files " "-f 1000 " "--fixed-file-size 10k " "%s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # get the bricks for the volume g.log.info("Fetching bricks for the volume: %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick list: %s", bricks_list) # Bring brick 1 offline bricks_to_bring_offline = [bricks_list[0]] g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Bring 1-st brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Bring brick 3 offline bricks_to_bring_offline = [bricks_list[-1]] g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Bring brick 3 online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Get file list from mountpoint g.log.info('Getting file list from mountpoints...') for mount_obj in self.mounts: g.log.info("Getting file list for %s:%s", mount_obj.client_system, mount_obj.mountpoint) g.log.info('Getting file list...') file_list = list_files(mount_obj.client_system, mount_obj.mountpoint) self.assertIsNotNone(file_list) g.log.info('Getting file list from mountpoints finished successfully')
def test_data_self_heal_algorithm_diff_heal_command(self): """ Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff' Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" "data-self-heal-algorithm": "diff" "self-heal-daemon": "off" - create IO - calculate arequal - bring down all bricks processes from selected set - modify the data - get arequal before getting bricks online - bring bricks online - expand volume by adding bricks to the volume - do rebalance - set the volume option "self-heal-daemon": "on" and check for daemons - start healing - check if heal is completed - check for split-brain - calculate arequal and compare with arequal before bringing bricks offline and after bringing bricks online """ # pylint: disable=too-many-branches,too-many-statements # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", "data-self-heal-algorithm": "diff" } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "'self-heal-daemon' " "are set to 'off'," "'data-self-heal-algorithm' " "is set to 'diff' successfully") # Creating files on client side all_mounts_procs = [] g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Creating files command = "/usr/bin/env python %s create_files -f 100 %s" % ( self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'self-heal-daemon' is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Modify the data all_mounts_procs = [] g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) command = ("/usr/bin/env python %s create_files -f 100 " "--fixed-file-size 1M %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is 
successful') # Bring brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Expand volume by adding bricks to the volume g.log.info("Start adding bricks to volume...") ret = expand_volume(self.mnode, self.volname, self.servers, self.all_servers_info) self.assertTrue(ret, ("Failed to expand the volume when IO in " "progress on volume %s", self.volname)) g.log.info("Expanding volume is successful on volume %s", self.volname) # Do rebalance ret, _, _ = rebalance_start(self.mnode, self.volname) self.assertEqual(ret, 0, 'Failed to start rebalance') g.log.info('Rebalance is started') ret = wait_for_rebalance_to_complete(self.mnode, self.volname) self.assertTrue(ret, 'Rebalance is not completed') g.log.info('Rebalance is completed successfully') # Setting options g.log.info('Setting options...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks offline # and after bringing bricks online self.assertEqual(sorted(result_before_online), sorted(result_after_online), 'Checksums are not equal') g.log.info('Checksums are equal')
def test_subdir_with_removebrick(self): # pylint: disable=too-many-statements """ Mount the volume Create 2 subdir on client subdir1 and subdir2 Auth allow - Client1(subdir1,subdir2),Client2(subdir1,subdir2) Mount the subdir to their respective clients Start IO's on both subdirs Perform remove-brick Validate on client if subdir's are mounted post remove-brick operation is performed """ # Create directories subdir1 and subdir2 on mount point ret = mkdir(self.mounts[0].client_system, "%s/subdir1" % self.mounts[0].mountpoint) self.assertTrue( ret, ("Failed to create directory 'subdir1' in" "volume %s from client %s" % (self.mounts[0].volname, self.mounts[0].client_system))) ret = mkdir(self.mounts[0].client_system, "%s/subdir2" % self.mounts[0].mountpoint) self.assertTrue( ret, ("Failed to create directory 'subdir2' in" "volume %s from client %s" % (self.mounts[0].volname, self.mounts[0].client_system))) # unmount volume ret = self.unmount_volume(self.mounts) self.assertTrue(ret, "Volumes UnMount failed") g.log.info("Volumes UnMounted successfully") # Set authentication on the subdirectory subdir1 # and subdir2 to access by 2 clients g.log.info( 'Setting authentication on subdir1 and subdir2' 'for client %s and %s', self.clients[0], self.clients[0]) ret = set_auth_allow( self.volname, self.mnode, { '/subdir1': [self.clients[0], self.clients[1]], '/subdir2': [self.clients[0], self.clients[1]] }) self.assertTrue( ret, 'Failed to set Authentication on volume %s' % self.volume) self.mpoint = "/mnt/Mount_Point1" # Mount Subdir1 mount on client 1 _, _, _ = mount_volume("%s/subdir1" % self.volname, self.mount_type, self.mpoint, self.mnode, self.clients[0]) # Checking subdir1 is mounted or not ret = is_mounted("%s/subdir1" % self.volname, self.mpoint, self.mnode, self.clients[0], self.mount_type) self.assertTrue(ret, "Volume not mounted on mount point: %s" % self.mpoint) g.log.info("Volume %s mounted on %s/subdir1", self.volname, self.mpoint) # Mount Subdir2 mount on client 2 _, _, _ = mount_volume("%s/subdir2" % self.volname, self.mount_type, self.mpoint, self.mnode, self.clients[1]) # Checking subdir2 is mounted or not ret = is_mounted("%s/subdir2" % self.volname, self.mpoint, self.mnode, self.clients[1], self.mount_type) self.assertTrue(ret, "Volume not mounted on mount point: %s" % self.mpoint) g.log.info("Volume %s mounted on %s/subdir2", self.volname, self.mpoint) # Start IO on all the subdir mounts. self.subdir_mounts = [ copy.deepcopy(self.mounts[0]), copy.deepcopy(self.mounts[1]) ] self.subdir_mounts[0].volname = "%s/subdir1" % self.volname self.subdir_mounts[1].volname = "%s/subdir2" % self.volname all_mounts_procs = [] count = 1 for mount_obj in self.subdir_mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, self.mpoint) cmd = ("python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, self.mpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.subdir_mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. 
g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.subdir_mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Perform remove brick operation when subdir is mounted on client g.log.info("Start removing bricks from volume") ret = shrink_volume(self.mnode, self.volname) self.assertTrue(ret, ("Remove brick operation failed on " "%s", self.volname)) g.log.info("Remove brick operation is successful on " "volume %s", self.volname) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("All volume %s processes failed to come up " "online", self.volname)) g.log.info("All volume %s processes came up " "online successfully", self.volname) # Log Volume Info and Status after performing remove brick g.log.info("Logging volume info and Status after shrinking volume") ret = log_volume_info_and_status(self.mnode, self.volname) self.assertTrue(ret, ("Logging volume info and status failed on " "volume %s", self.volname)) g.log.info("Successful in logging volume info and status of volume %s", self.volname) # Again Checking subdir1 is mounted or not on Client 1 ret = is_mounted("%s/subdir1" % self.volname, self.mpoint, self.mnode, self.clients[0], self.mount_type) self.assertTrue(ret, "Volume not mounted on mount point: %s" % self.mpoint) g.log.info("Volume %s mounted on %s/subdir1", self.volname, self.mpoint) # Again Checking subdir2 is mounted or not on Client 2 ret = is_mounted("%s/subdir2" % self.volname, self.mpoint, self.mnode, self.clients[1], self.mount_type) self.assertTrue(ret, "Volume not mounted on mount point: %s" % self.mpoint) g.log.info("Volume %s mounted on %s/subdir2", self.volname, self.mpoint)
def test_restore_online_vol(self): # pylint: disable=too-many-statements """ Steps: 1. Create volume 2. Mount volume 3. Perform I/O on mounts 4. Create 1 snapshot snapy1 5. Validate snap created 6. Perform some more I/O 7. Create 1 more snapshot snapy2 8. Restore volume to snapy2 -- Restore should fail with message "volume needs to be stopped before restore" """ # Performing steps 3 to 7 in a loop here for i in range(1, 3): # Perform I/O g.log.info("Starting IO on all mounts...") self.counter = 1 self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 2 " "--max-num-of-dirs 2 " "--num-of-files 2 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Create snapshot g.log.info("Creating snapshot for volume %s", self.volname) ret, _, _ = snap_create(self.mnode, self.volname, "snapy%s" % i) self.assertEqual( ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot created successfully for volume %s", self.volname) # Check for no of snaps using snap_list snap_list = get_snap_list(self.mnode) self.assertEqual( i, len(snap_list), "No of snaps not consistent " "for volume %s" % self.volname) g.log.info("Successfully validated number of snaps.") # Increase counter for next iteration self.counter = 1000 # Restore volume to snapshot snapy2, it should fail i = 2 g.log.info("Starting to restore volume to snapy%s", i) ret, _, err = snap_restore(self.mnode, "snapy%s" % i) errmsg = ("snapshot restore: failed: Volume (%s) has been started. " "Volume needs to be stopped before restoring a snapshot.\n" % self.volname) log_msg = "Expected : %s, but Returned : %s" % (errmsg, err) self.assertEqual(err, errmsg, log_msg) g.log.info("Expected : Failed to restore volume to snapy%s", i)
def test_validate_snaps_max_limit(self): # pylint: disable=too-many-statements # Start IO on all mounts. all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % ( self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # set config snap-max-hard-limit for 10 snpas cmd_str = ("gluster snapshot config snap-max-hard-limit 10" " --mode=script") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, "Failed to set snap-max-hard-limit to 10.") g.log.info("snap-max-hard-limit successfully set for 10.") # set config snap-max-soft-limit to 50% cmd_str = ("gluster snapshot config snap-max-soft-limit 50" " --mode=script") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, "Failed to set snap-max-soft-limit to 50%.") g.log.info("snap-max-soft-limit successfully set for 50%.") # Create 5 snaps for i in range(1, 6): cmd_str = "gluster snapshot create %s %s %s" % ("snapy%s" % i, self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snapy%s created successfully" " for volume %s", i, self.volname) # Check for no. of snaps using snap_list it should be 5 snap_list = get_snap_list(self.mnode) self.assertEqual(5, len(snap_list), "Expected 5 snapshots. 
" "Found %s snapshots" % len(snap_list)) g.log.info("Successfully validated number of snapshots.") # Validate all 5 snap names created during for i in range(1, 6): self.assertTrue(("snapy%s" % i in snap_list), "%s snap not " "found " % ("snapy%s" % i)) g.log.info("Successfully validated names of snapshots") # create 6th snapshot cmd_str = "gluster snapshot create %s %s %s" % ("snapy6", self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snap6 " "for %s" % self.volname)) g.log.info("Snapshot 'snapy6' created as it is 6th snap") # set config snap-max-soft-limit to 100% cmd_str = ("gluster snapshot config snap-max-soft-limit 100" " --mode=script") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, "Failed to set snap-max-soft-limit to 100%.") g.log.info("snap-max-soft-limit successfully set for 100%.") # create 7th snapshot cmd_str = "gluster snapshot create %s %s %s" % ("snapy7", self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create " "snap7 for %s" % self.volname)) g.log.info("Snapshot 'snapy7' created as it is 7th snap") # Create 3 snaps for i in range(8, 11, 1): cmd_str = "gluster snapshot create %s %s %s" % ("snapy%s" % i, self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snapy%s created successfully " "for volume %s", i, self.volname) # Check for no. of snaps using snap_list it should be 10 snap_list = get_snap_list(self.mnode) self.assertEqual(len(snap_list), 10, "Expected 10 snapshots. " "found %s snapshots" % len(snap_list)) g.log.info("Successfully validated number of snapshots.") # Validate all 10 snap names created for i in range(1, 11, 1): self.assertTrue(("snapy%s" % i in snap_list), "%s snap not " "found " % ("snapy%s" % i)) g.log.info("Successfully validated names of snapshots") # create 11th snapshot cmd_str = "gluster snapshot create %s %s %s" % ("snap", self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertNotEqual(ret, 0, ("Unexpected: successfully created 'snap' " "for %s" % self.volname)) g.log.info("Expected: Snapshot 'snap' not created as it is 11th snap") # Check for no. of snaps using snap_list it should be 10 snap_list = get_snap_list(self.mnode) self.assertEqual(len(snap_list), 10, "Expected 10 snapshots. " "found %s snapshots" % len(snap_list)) g.log.info("Successfully validated number of snapshots.") # modify config snap-max-hard-limit for 20 snpas cmd_str = ("gluster snapshot config snap-max-hard-limit 20" " --mode=script") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, "Failed to set snap-max-hard-limit to 20.") g.log.info("snap-max-hard-limit successfully set for 20.") # Create 10 snaps for i in range(11, 21, 1): cmd_str = "gluster snapshot create %s %s %s" % ("snapy%s" % i, self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snapy%s created successfully for " "volume %s", i, self.volname) # Check for no. of snaps using snap_list it should be 20 snap_list = get_snap_list(self.mnode) self.assertEqual(len(snap_list), 20, "Expected 20 snapshots. " "found %s snapshots" % len(snap_list)) g.log.info("Successfully validated number of snaps.")
def test_heal_info_shouldnot_list_files_being_accessed(self): """ - bring brick 1 offline - create files and validate IO - get entries before accessing file - get first filename from active subvol without offline bricks - access and modify the file - while accessing - get entries - Compare entries before accessing and while accessing - validate IO """ # Bring 1st brick offline brick_to_bring_offline = [self.bricks_list[0]] g.log.info('Bringing bricks %s offline...', brick_to_bring_offline) ret = bring_bricks_offline(self.volname, brick_to_bring_offline) self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, brick_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % brick_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', brick_to_bring_offline) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ("python %s create_files -f 100 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients" ) self.io_validation_complete = True # Get entries before accessing file g.log.info("Getting entries_before_accessing file...") entries_before_accessing = get_heal_info_summary( self.mnode, self.volname) self.assertNotEqual(entries_before_accessing, None, "Can't get heal info summary") g.log.info( "Getting entries_before_accessing file finished successfully") # Get filename to access from active subvol without offline bricks # Get last subvol subvols = get_subvols(self.mnode, self.volname) subvol_without_offline_brick = subvols['volume_subvols'][-1] # Get first brick server and brick path # and get first file from filelist subvol_mnode, mnode_brick = subvol_without_offline_brick[0].split(':') ret, file_list, _ = g.run(subvol_mnode, 'ls %s' % mnode_brick) file_to_edit = file_list.splitlines()[0] # Access and modify the file g.log.info("Start modifying IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Modifying IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("cd %s/ ; " "dd if=/dev/zero of=%s bs=1G count=1" % (mount_obj.mountpoint, file_to_edit)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) g.log.info("IO on %s:%s is modified successfully", mount_obj.client_system, mount_obj.mountpoint) self.io_validation_complete = False # Get entries while accessing file g.log.info("Getting entries while accessing file...") entries_while_accessing = get_heal_info_summary( self.mnode, self.volname) self.assertNotEqual(entries_while_accessing, None, "Can't get heal info summary") g.log.info("Getting entries while accessing file " "finished successfully") # Compare dicts before accessing and while accessing g.log.info('Comparing entries before modifying and while modifying...') self.assertEqual(entries_before_accessing, entries_while_accessing, 'Entries before modifying and while modifying ' 'are not equal') g.log.info('Comparison of entries before modifying and while modifying ' 'finished successfully.') # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients" )
self.io_validation_complete = True
def test_replacing_all_arbiters(self): """ - Create an arbiter volume 4(2+1) distributed replicate - Start writing IO - While the I/O's are going on replace all the arbiter bricks - check for the new bricks attached successfully - Check for heals - Validate IO """ # pylint: disable=too-many-locals,too-many-statements # get the bricks for the volume g.log.info("Fetching bricks for the volume: %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick list: %s", bricks_list) # Clear all brick folders. Its need to prevent healing with old files for brick in bricks_list: g.log.info('Clearing brick %s', brick) node, brick_path = brick.split(':') ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path) self.assertFalse(ret, err) g.log.info('Clearing brick %s is successful', brick) g.log.info('Clearing for all brick is successful') # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Create dirs with file g.log.info('Creating dirs with file...') command = ("python %s create_deep_dirs_with_files " "-d 3 " "-l 3 " "-n 3 " "-f 20 " "%s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, command, user=mount_obj.user) self.all_mounts_procs.append(proc) self.io_validation_complete = False # replace bricks subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] for subvol in subvols: g.log.info('Replacing arbiter brick for %s', subvol) brick_to_replace = subvol[-1] self.bricks_to_clean.append(brick_to_replace) new_brick = brick_to_replace + 'new' g.log.info("Replacing the brick %s for the volume: %s", brick_to_replace, self.volname) ret, _, err = replace_brick(self.mnode, self.volname, brick_to_replace, new_brick) self.assertFalse(ret, err) g.log.info('Replaced brick %s to %s successfully', brick_to_replace, new_brick) # check replaced bricks subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] index = 0 for subvol in subvols: expected_brick_path = self.bricks_to_clean[index] + 'new' brick_to_check = subvol[-1] self.assertEqual(expected_brick_path, brick_to_check, 'Brick %s is not replaced brick' % brick_to_check) index += 1 # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s: All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) 
self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Validate IO ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True
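# A small illustrative sketch of how the replacement loop above picks the
# arbiter brick of each replica set: in an arbiter volume the arbiter is the
# last brick of every subvolume, so subvol[-1] selects it. The brick paths
# below are made-up sample data.
subvols = [["n1:/bricks/b1", "n2:/bricks/b2", "n3:/bricks/arbiter1"],
           ["n4:/bricks/b3", "n5:/bricks/b4", "n6:/bricks/arbiter2"]]
arbiter_bricks = [subvol[-1] for subvol in subvols]
assert arbiter_bricks == ["n3:/bricks/arbiter1", "n6:/bricks/arbiter2"]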
def test_glusterd_rebalance(self): ''' -> Create Volume -> Fuse mount the volume -> Perform I/O on fuse mount -> Add bricks to the volume -> Perform rebalance on the volume -> While rebalance is in progress, -> restart glusterd on all the nodes in the cluster ''' # run IOs g.log.info("Starting IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 4 " "--dir-length 6 " "--max-num-of-dirs 3 " "--num-of-files 25 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.counter = self.counter + 10 # Validate IO self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients" ) # Forming brick list self.brick_list = form_bricks_list_to_add_brick( self.mnode, self.volname, self.servers, self.all_servers_info) # Adding Bricks ret, _, _ = add_brick(self.mnode, self.volname, self.brick_list) self.assertEqual(ret, 0, "Failed to add brick to the volume %s" % self.volname) g.log.info("Brick added successfully to the volume %s", self.volname) # Performing rebalance ret, _, _ = rebalance_start(self.mnode, self.volname) self.assertEqual(ret, 0, 'Failed to start rebalance on volume %s' % self.volname) g.log.info("Rebalance started successfully on volume %s", self.volname) # Checking Rebalance is in progress or not rebalance_status = get_rebalance_status(self.mnode, self.volname) if rebalance_status['aggregate']['statusStr'] != 'in progress': raise ExecutionError("Rebalance is not in 'in progress' state, " "either rebalance is in compeleted state or" " failed to get rebalance status") # Restart glusterd ret = restart_glusterd(self.servers) self.assertTrue(ret, "Failed to restart glusterd on servers") g.log.info("Glusterd restarted successfully on %s", self.servers) # Checking glusterd status ret = wait_for_glusterd_to_start(self.servers) self.assertTrue(ret, "Glusterd is not running on some of the " "servers") g.log.info("Glusterd is running on all servers %s", self.servers)
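# The test above raises immediately when rebalance is not already 'in progress'.
# A hedged, more tolerant variant is to poll the status for a short while, since
# rebalance can take a moment to enter that state. status_fn stands for a
# callable such as lambda: get_rebalance_status(self.mnode, self.volname); the
# wrapper itself is hypothetical and not part of this suite.
import time


def wait_for_rebalance_state(status_fn, expected="in progress",
                             timeout=60, interval=5):
    """Poll status_fn until its aggregate statusStr equals `expected`,
    returning True on success and False after `timeout` seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = status_fn()
        if status and status['aggregate']['statusStr'] == expected:
            return True
        time.sleep(interval)
    return False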
def test_verify_lock_granted_from_2_clients(self): """ - Create disperse volume and mount it to 2 clients` - Create file from 1 client on mount point - Take lock from client 1 => Lock is acquired - Try taking lock from client 2=> Lock is blocked (as already being taken by client 1) - Release lock from client1=> Lock is released - Take lock from client2 - Again try taking lock from client 1 - verify test with once, by disabling eagerlock and other eager lock and once by leaving eager and other eagerlock enabled(by default) """ mpoint = self.mounts[0].mountpoint # Create a file on client 1 cmd = "touch {}/test_file".format(mpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to create file on client 1") # Verifying OCL as ON option = "optimistic-change-log" option_dict = get_volume_options(self.mnode, self.volname, option) self.assertIsNotNone(option_dict, ("Failed to get %s volume option" " for volume %s" % (option, self.volname))) self.assertEqual(option_dict['disperse.optimistic-change-log'], 'on', ("%s is not ON for volume %s" % (option, self.volname))) g.log.info("Succesfully verified %s value for volume %s", option, self.volname) # Repeat the test with eager-lock and other-eager-lock 'on' & 'off' for lock_status in ('on', 'off'): options = { 'disperse.eager-lock': lock_status, 'disperse.other-eager-lock': lock_status } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("failed to set eagerlock and other " "eagerlock value as %s " % lock_status)) g.log.info( "Successfully set eagerlock and other eagerlock value" " to %s", lock_status) # Repeat the test for both the combinations of clients for client_1, client_2 in list( itertools.permutations([ self.mounts[0].client_system, self.mounts[1].client_system ], r=2)): # Get lock to file from one client lock_cmd = ("/usr/bin/env python {} -f {}/" "test_file -t 30".format(self.script, mpoint)) proc = g.run_async(client_1, lock_cmd) time.sleep(5) # As the lock is been acquired by one client, # try to get lock from the other ret, _, _ = g.run(client_2, lock_cmd) self.assertEqual( ret, 1, ("Unexpected: {} acquired the lock " "before been released by {}".format(client_2, client_1))) g.log.info( "Expected : Lock can't be acquired by %s before " "being released by %s", client_2, client_1) # Wait for first client to release the lock. ret, _, _ = proc.async_communicate() self.assertEqual( ret, 0, ("File lock process failed on %s:%s", client_1, mpoint)) # Try taking the lock from other client and releasing it lock_cmd = ("/usr/bin/env python {} -f " "{}/test_file -t 1".format(self.script, mpoint)) ret, _, _ = g.run(client_2, lock_cmd) self.assertEqual(ret, 0, ("Unexpected:{} Can't acquire the lock even " "after its been released by {}".format( client_2, client_1))) g.log.info( "Successful, Lock acquired by %s after being " "released by %s", client_2, client_1)
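# A minimal local illustration (Python 3) of the advisory-lock behaviour the
# test above exercises from two clients: while the first flock() holder owns
# the lock, a second non-blocking attempt on a separate open file description
# fails, and succeeds once the lock is released. This is a stand-alone sketch,
# not the helper script uploaded by the test.
import fcntl
import tempfile

with tempfile.NamedTemporaryFile() as lock_file:
    holder = open(lock_file.name)
    contender = open(lock_file.name)
    fcntl.flock(holder, fcntl.LOCK_EX)                      # first "client" takes the lock
    try:
        fcntl.flock(contender, fcntl.LOCK_EX | fcntl.LOCK_NB)
        raise AssertionError("lock unexpectedly granted while still held")
    except BlockingIOError:
        pass                                                # expected: lock is held
    fcntl.flock(holder, fcntl.LOCK_UN)                      # release
    fcntl.flock(contender, fcntl.LOCK_EX | fcntl.LOCK_NB)   # now granted
    holder.close()
    contender.close()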
def test_write_io_mount_point_resumed_quorum_restored_x3(self): """ - set cluster.quorum-type to auto - start I/O from the mount point - Do IO and check on subvols with two nodes to reboot (do for each subvol) - get files to delete/create for nodes to be offline - delete files from mountpoint - reboot nodes - creating files on nodes while rebooting - validate for rofs - wait for volume processes to be online - creating files on nodes after rebooting - validate IO - Do IO and check on subvols without nodes to reboot (do for each subvol) - get files to delete/create for nodes to be online - delete files from mountpoint - reboot nodes - creating files on online nodes while rebooting other nodes - validate IO - Do IO and check and reboot two nodes on all subvols - get files to delete/create for nodes to be offline - delete files from mountpoint - reboot nodes - creating files on nodes while rebooting - validate for rofs - wait for volume processes to be online - creating files on nodes after rebooting - validate IO """ # pylint: disable=too-many-locals,too-many-statements,too-many-branches # set cluster.quorum-type to auto options = {"cluster.quorum-type": "auto"} g.log.info("setting cluster.quorum-type to auto on volume %s", self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set volume option %s for" "volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # Creating files on client side for mount_obj in self.mounts: g.log.info("Generating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ("python %s create_files -f 30 %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) # Validate IO self.io_validation_complete = False self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Do IO and check on subvols with nodes to reboot subvols_dict = get_subvols(self.mnode, self.volname) for subvol in subvols_dict['volume_subvols']: # define nodes to reboot brick_list = subvol[0:2] nodes_to_reboot = [] for brick in brick_list: node, brick_path = brick.split(':') nodes_to_reboot.append(node) # get files to delete/create for nodes to be offline node, brick_path = brick_list[0].split(':') ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path) self.assertFalse(ret, 'Failed to ls files on %s' % node) file_list = brick_file_list.splitlines() # delete files from mountpoint for mount_obj in self.mounts: g.log.info("Deleting data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ('cd %s/ ; rm -rf %s' % (mount_obj.mountpoint, ' '.join(file_list))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse( ret, 'Failed to rm file on %s' % mount_obj.client_system) g.log.info('Files %s are deleted', file_list) # reboot nodes on subvol and wait while rebooting g.log.info("Rebooting the nodes %s", nodes_to_reboot) ret = reboot_nodes(nodes_to_reboot) self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot) # Creating files on nodes while rebooting self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ("cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list))) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) 
self.all_mounts_procs.append(proc) # Validate IO self.io_validation_complete = False g.log.info("Validating if IO failed with read-only filesystem") ret = is_io_procs_fail_with_rofs(self, self.all_mounts_procs, self.mounts) self.assertTrue(ret, ("Unexpected error and IO successful" " on read-only filesystem")) self.io_validation_complete = True g.log.info("EXPECTED: " "Read-only file system in IO while creating file") # check if nodes are online counter = 0 timeout = 300 _rc = False while counter < timeout: ret, reboot_results = are_nodes_online(nodes_to_reboot) if not ret: g.log.info("Nodes are offline, Retry after 5 seconds ... ") time.sleep(5) counter = counter + 5 else: _rc = True break if not _rc: for node in reboot_results: if reboot_results[node]: g.log.info("Node %s is online", node) else: g.log.error( "Node %s is offline even after " "%d minutes", node, timeout / 60.0) else: g.log.info("All nodes %s are up and running", nodes_to_reboot) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online( self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Creating files on nodes after rebooting self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ("cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list))) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) # Validate IO self.io_validation_complete = False self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Do IO and check on subvols without nodes to reboot subvols_dict = get_subvols(self.mnode, self.volname) for subvol in subvols_dict['volume_subvols']: # define nodes to reboot brick_list = subvol[0:2] nodes_to_reboot = [] for brick in brick_list: node, brick_path = brick.split(':') nodes_to_reboot.append(node) # get files to delete/create for nodes to be online new_subvols_dict = get_subvols(self.mnode, self.volname) subvol_to_operate = new_subvols_dict['volume_subvols'] subvol_to_operate.remove(subvol) brick_list_subvol_online = subvol_to_operate[0] node, brick_path_vol_online = \ brick_list_subvol_online[0].split(':') ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path_vol_online) self.assertFalse(ret, 'Failed to ls files on %s' % node) file_list = brick_file_list.splitlines() # delete files from mountpoint for mount_obj in self.mounts: g.log.info("Deleting data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ('cd %s/ ; rm -rf %s' % (mount_obj.mountpoint, ' '.join(file_list))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse( ret, 'Failed to rm file on %s' % mount_obj.client_system) g.log.info('Files %s are deleted', file_list) # reboot nodes on subvol and wait while rebooting g.log.info("Rebooting the nodes %s", nodes_to_reboot) ret = reboot_nodes(nodes_to_reboot) self.assertTrue(ret, 
'Failed to reboot nodes %s ' % nodes_to_reboot) # Creating files on nodes while rebooting self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ("cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list))) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) # Validate IO self.io_validation_complete = False self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # check if nodes are online counter = 0 timeout = 300 _rc = False while counter < timeout: ret, reboot_results = are_nodes_online(nodes_to_reboot) if not ret: g.log.info("Nodes are offline, Retry after 5 seconds ... ") time.sleep(5) counter = counter + 5 else: _rc = True break if not _rc: for node in reboot_results: if reboot_results[node]: g.log.info("Node %s is online", node) else: g.log.error( "Node %s is offline even after " "%d minutes", node, timeout / 60.0) else: g.log.info("All nodes %s are up and running", nodes_to_reboot) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online( self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Do IO and check and reboot nodes on all subvols subvols_dict = get_subvols(self.mnode, self.volname) nodes_to_reboot = [] file_list_for_all_subvols = [] for subvol in subvols_dict['volume_subvols']: # define nodes to reboot brick_list = subvol[0:2] for brick in brick_list: node, brick_path = brick.split(':') nodes_to_reboot.append(node) # get files to delete/create for nodes to be offline node, brick_path = brick_list[0].split(':') ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path) self.assertFalse(ret, 'Failed to ls files on %s' % node) file_list = brick_file_list.splitlines() file_list_for_all_subvols.append(file_list) # delete files from mountpoint for mount_obj in self.mounts: g.log.info("Deleting data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ('cd %s/ ; rm -rf %s' % (mount_obj.mountpoint, ' '.join(file_list))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, 'Failed to rm file on %s' % node) g.log.info('Files %s are deleted', file_list) # reboot nodes on subvol and wait while rebooting g.log.info("Rebooting the nodes %s", nodes_to_reboot) ret = reboot_nodes(nodes_to_reboot) self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot) # Creating files on nodes while rebooting all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], [] # Create files for 1-st subvol and get all_mounts_procs_1 for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ( "cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[0]))) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs_1.append(proc) 
all_mounts_procs.append(all_mounts_procs_1) # Create files for 2-st subvol and get all_mounts_procs_2 for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ( "cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[1]))) proc2 = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs_2.append(proc2) all_mounts_procs.append(all_mounts_procs_2) for mounts_procs in all_mounts_procs: # Validate IO self.io_validation_complete = False g.log.info("Validating if IO failed with read-only filesystem") ret = is_io_procs_fail_with_rofs(self, mounts_procs, self.mounts) self.assertTrue(ret, ("Unexpected error and IO successful" " on read-only filesystem")) self.io_validation_complete = True g.log.info("EXPECTED: " "Read-only file system in IO while creating file") # check if nodes are online counter = 0 timeout = 300 _rc = False while counter < timeout: ret, reboot_results = are_nodes_online(nodes_to_reboot) if not ret: g.log.info("Nodes are offline, Retry after 5 seconds ... ") time.sleep(5) counter = counter + 5 else: _rc = True break if not _rc: for node in reboot_results: if reboot_results[node]: g.log.info("Node %s is online", node) else: g.log.error("Node %s is offline even after " "%d minutes", node, timeout / 60.0) else: g.log.info("All nodes %s are up and running", nodes_to_reboot) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Creating files on nodes after rebooting all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], [] # Create files for 1-st subvol and get all_mounts_procs_1 for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ( "cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[0]))) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs_1.append(proc) all_mounts_procs.append(all_mounts_procs_1) # Create files for 2-st subvol and get all_mounts_procs_2 for mount_obj in self.mounts: g.log.info("Creating data for %s:%s", mount_obj.client_system, mount_obj.mountpoint) # Creating files cmd = ( "cd %s/ ;" "touch %s" % (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[1]))) proc2 = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs_2.append(proc2) all_mounts_procs.append(all_mounts_procs_2) for mounts_procs in all_mounts_procs: # Validate IO self.io_validation_complete = False self.assertTrue( validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True
def test_dynamic_provisioning_glusterfile_gluster_pod_or_node_failure( self): """Create glusterblock PVC when gluster pod or node is down.""" mount_path = "/mnt" datafile_path = '%s/fake_file_for_%s' % (mount_path, self.id()) # Create secret and storage class self.create_storage_class() # Create PVC pvc_name = self.create_and_wait_for_pvc() # Create app POD with attached volume pod_name = oc_create_tiny_pod_with_volume( self.node, pvc_name, "test-pvc-mount-on-app-pod", mount_path=mount_path) self.addCleanup( wait_for_resource_absence, self.node, 'pod', pod_name) self.addCleanup(oc_delete, self.node, 'pod', pod_name) # Wait for app POD be up and running wait_for_pod_be_ready( self.node, pod_name, timeout=60, wait_step=2) # Run IO in background io_cmd = "oc rsh %s dd if=/dev/urandom of=%s bs=1000K count=900" % ( pod_name, datafile_path) async_io = g.run_async(self.node, io_cmd, "root") # Check for containerized Gluster if self.is_containerized_gluster(): # Pick up one of the hosts which stores PV brick (4+ nodes case) gluster_pod_data = get_gluster_pod_names_by_pvc_name( self.node, pvc_name)[0] # Delete glusterfs POD from chosen host and wait for # spawn of new one oc_delete(self.node, 'pod', gluster_pod_data["pod_name"]) cmd = ("oc get pods -o wide | grep glusterfs | grep %s | " "grep -v Terminating | awk '{print $1}'") % ( gluster_pod_data["pod_hostname"]) for w in Waiter(600, 15): new_gluster_pod_name = self.cmd_run(cmd) if new_gluster_pod_name: break if w.expired: error_msg = "exceeded timeout, new gluster pod not created" g.log.error(error_msg) raise AssertionError(error_msg) g.log.info("new gluster pod name is %s" % new_gluster_pod_name) wait_for_pod_be_ready(self.node, new_gluster_pod_name) else: pvc_hosting_node_ip = get_gluster_host_ips_by_pvc_name( self.node, pvc_name)[0] heketi_nodes = heketi_node_list( self.heketi_client_node, self.heketi_server_url) node_ip_for_reboot = None for heketi_node in heketi_nodes: heketi_node_ip = heketi_node_info( self.heketi_client_node, self.heketi_server_url, heketi_node, json=True)["hostnames"]["storage"][0] if heketi_node_ip == pvc_hosting_node_ip: node_ip_for_reboot = heketi_node_ip break if not node_ip_for_reboot: raise AssertionError( "Gluster node IP %s not matched with heketi node %s" % ( pvc_hosting_node_ip, heketi_node_ip)) node_reboot_by_command(node_ip_for_reboot) # Check that async IO was not interrupted ret, out, err = async_io.async_communicate() self.assertEqual(ret, 0, "IO %s failed on %s" % (io_cmd, self.node))
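# A small stand-alone sketch of the filtering that the `oc get pods ... | grep |
# awk` pipeline above performs: from the wide pod listing, keep glusterfs pods
# scheduled on the affected host that are not Terminating, and take the first
# pod name. The helper name and the sample listing below are made up for
# illustration.
def find_new_gluster_pod(pod_listing, host_name):
    """Return the first non-Terminating glusterfs pod name on host_name."""
    for line in pod_listing.splitlines():
        fields = line.split()
        if (fields and "glusterfs" in fields[0]
                and host_name in line and "Terminating" not in line):
            return fields[0]
    return None


sample = (
    "glusterfs-storage-abc12   1/1   Terminating   0   5d   10.0.0.1   node-1\n"
    "glusterfs-storage-xyz89   0/1   Running       0   10s  10.0.0.1   node-1\n")
assert find_new_gluster_pod(sample, "node-1") == "glusterfs-storage-xyz89"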
def test_fops_ec_brickdown(self): # pylint: disable=too-many-branches,too-many-statements,too-many-locals """ - 1.Start resource consumption tool - 2.Create directory dir1 - 3.Create 5 dir and 5 files in each dir in directory 1 - 4.Rename all file inside dir1 - 5.Truncate at any dir in mountpoint inside dir1 - 6.Create softlink and hardlink of files in mountpoint - 7.chmod, chown, chgrp inside dir1 - 8.Create tiny, small, medium nd large file - 9.Creating files on client side for dir1 - 10.Brick redundant bricks down - 11.Validating IO's and waiting to complete - 12.Creating dir2 - 13.Creating files on client side for dir2 - 14.Bring bricks online - 15.Wait for brick to come online - 16.Check if bricks are online - 17.Monitor heal completion - 18.Validating IO's and waiting to complete """ # Starting resource consumption using top log_file_mem_monitor = '/var/log/glusterfs/mem_usage.log' cmd = ('for i in {1..100};do top -n 1 -b|egrep \ "RES|gluster" & free -h 2>&1 >> %s ; \ sleep 10;done' % (log_file_mem_monitor)) g.log.info(cmd) for server in self.servers: g.run_async(server, cmd) bricks_list = [] # get the bricks from the volume g.log.info("Fetching bricks for the volume : %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) self.assertIsNotNone(bricks_list, "Brick list is empty") g.log.info("Brick List : %s", bricks_list) # Creating dir1 cmd = ('mkdir %s/dir1' % self.mounts[0].mountpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to create dir1") g.log.info("dir1 created successfully for %s", self.mounts[0]) # Create 5 dir and 5 files in each dir at mountpoint on dir1 start, end = 1, 5 for mount_obj in self.mounts: # Number of dir and files to be created. dir_range = ("%s..%s" % (str(start), str(end))) file_range = ("%s..%s" % (str(start), str(end))) # Create dir 1-5 at mountpoint. cmd = ('mkdir %s/dir1/dir{%s};' % (mount_obj.mountpoint, dir_range)) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Directory creation failed") g.log.info("Directory created successfull") # Create files inside each dir. cmd = ('touch %s/dir1/dir{%s}/file{%s};' % (mount_obj.mountpoint, dir_range, file_range)) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "File creation failed") g.log.info("File created successfull") # Increment counter so that at next client dir and files are made # with diff offset. Like at next client dir will be named # dir6, dir7...dir10. Same with files. start += 5 end += 5 # Rename all files inside dir1 at mountpoint on dir1 cmd = ('cd %s/dir1/dir1/; ' 'for FILENAME in *;' 'do mv $FILENAME Unix_$FILENAME; ' 'done;' % self.mounts[0].mountpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to rename file on" "client") g.log.info("Successfully renamed file on client") # Truncate at any dir in mountpoint inside dir1 # start is an offset to be added to dirname to act on # diff files at diff clients. start = 1 for mount_obj in self.mounts: cmd = ('cd %s/dir1/dir%s/; ' 'for FILENAME in *;' 'do echo > $FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Truncate failed") g.log.info("Truncate of files successfull") # Create softlink and hardlink of files in mountpoint. Start is an # offset to be added to dirname to act on diff files at diff clients. 
start = 1 for mount_obj in self.mounts: cmd = ('cd %s/dir1/dir%s; ' 'for FILENAME in *; ' 'do ln -s $FILENAME softlink_$FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Creating Softlinks have failed") g.log.info("Softlink of files have been changed successfully") cmd = ('cd %s/dir1/dir%s; ' 'for FILENAME in *; ' 'do ln $FILENAME hardlink_$FILENAME; ' 'done;' % (mount_obj.mountpoint, str(start + 1))) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Creating Hardlinks have failed") g.log.info("Hardlink of files have been changed successfully") start += 5 # chmod, chown, chgrp inside dir1 # start and end used as offset to access diff files # at diff clients. start, end = 2, 5 for mount_obj in self.mounts: dir_file_range = '%s..%s' % (str(start), str(end)) cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Changing mode of files has failed") g.log.info("Mode of files have been changed successfully") cmd = ('chown root %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Changing owner of files has failed") g.log.info("Owner of files have been changed successfully") cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' % (mount_obj.mountpoint, dir_file_range, dir_file_range)) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Changing group of files has failed") g.log.info("Group of files have been changed successfully") start += 5 end += 5 # Create tiny, small, medium nd large file # at mountpoint. Offset to differ filenames # at diff clients. 
offset = 1 for mount_obj in self.mounts: cmd = 'fallocate -l 100 tiny_file%s.txt' % str(offset) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Fallocate for tiny files failed") g.log.info("Fallocate for tiny files successfully") cmd = 'fallocate -l 20M small_file%s.txt' % str(offset) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Fallocate for small files failed") g.log.info("Fallocate for small files successfully") cmd = 'fallocate -l 200M medium_file%s.txt' % str(offset) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Fallocate for medium files failed") g.log.info("Fallocate for medium files successfully") cmd = 'fallocate -l 1G large_file%s.txt' % str(offset) ret, _, _ = g.run(mount_obj.client_system, cmd) self.assertFalse(ret, "Fallocate for large files failed") g.log.info("Fallocate for large files successfully") offset += 1 # Creating files on client side for dir1 # Write IO all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s/dir1" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Bring down other bricks to max redundancy # Bringing bricks offline ret = bring_bricks_offline(self.volname, bricks_list[2:4]) self.assertTrue(ret, 'Bricks not offline') g.log.info('Bricks are offline successfully') # Validating IO's and waiting to complete g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Creating dir2 cmd = ('mkdir %s/dir2' % self.mounts[0].mountpoint) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Failed to create dir2 ") g.log.info("dir2 created successfully for %s", self.mounts[0]) # Creating files on client side for dir2 # Write IO all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s/dir2" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Bring bricks online list_of_bricks_to_bring_online = bricks_list[2:4] ret = bring_bricks_online(self.mnode, self.volname, list_of_bricks_to_bring_online) self.assertTrue(ret, 'Bricks not brought online') g.log.info('Bricks are online successfully') # Wait for brick to come online g.log.info("Waiting for brick to come online") ret = wait_for_bricks_to_be_online(self.mnode, self.volname) self.assertTrue(ret, "Bricks are not online") g.log.info("EXPECTED : Bricks are online") # Check if bricks are online ret = get_offline_bricks_list(self.mnode, self.volname) self.assertListEqual(ret, [], 'All bricks are not online') g.log.info('All bricks are online') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') g.log.info('Heal has completed successfully') # Validating IO's and 
        # Validate IO and wait for it to complete
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Check that the memory usage log file exists
        g.log.info("Validating log exists")
        ret = file_exists(self.mnode, '/var/log/glusterfs/mem_usage.log')
        self.assertTrue(ret, "Memory log file does not exist")
        g.log.info("Memory log file exists")
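The permission changes in the test above rely on bash brace expansion (dir{2..5}/file{2..5}) with a per-client offset, so each mount modifies a disjoint window of files. Below is a minimal, cluster-free sketch of how those command strings are assembled; build_perm_cmds is a hypothetical helper for illustration only, not part of glustolibs.

def build_perm_cmds(mountpoint, start, end, owner="root", group="root"):
    """Return chmod/chown/chgrp commands for dir{start..end}/file{start..end}."""
    rng = "%d..%d" % (start, end)
    target = "%s/dir1/dir{%s}/file{%s}" % (mountpoint, rng, rng)
    return [
        "chmod 777 %s" % target,
        "chown %s %s" % (owner, target),
        "chgrp %s %s" % (group, target),
    ]


if __name__ == "__main__":
    # Each client gets its own window of dirs/files, offset by 5,
    # mirroring the start/end bookkeeping in the test above.
    start, end = 2, 5
    for client in ("client1", "client2"):
        for cmd in build_perm_cmds("/mnt/glusterfs", start, end):
            print(client, cmd)
        start, end = start + 5, end + 5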
    def test_disperse_removebrick(self):
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        """
        - Write IO
        - Start remove-brick (subvolume decrease)
        - Validate IO
        - Wait for remove-brick (data rebalance) to complete
        - Start IO again and validate it
        """
        # Write IO
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path, count,
                       mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Start remove-brick (subvolume-decrease)
        g.log.info("Start removing bricks from volume")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Log volume info and status after shrinking the volume
        g.log.info("Logging volume info and status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All processes for volume %s are not "
                              "online", self.volname))
        g.log.info("All volume %s processes are now online", self.volname)

        # Validate IO and wait for it to complete
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Start IO on all mounts after the remove-brick completes
        all_mounts_procs = []
        count = 21
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path, count,
                       mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")
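Both IO phases in this test follow the same pattern used throughout these cases: every mount gets a unique --dirname-start-num so clients never collide on directory names, and the process handles are collected for later validation. A hedged sketch of that pattern as a reusable helper, assuming the glusto g.run_async API already used above; start_staggered_io itself is illustrative and not a glustolibs function.

from glusto.core import Glusto as g


def start_staggered_io(mounts, script_path, start=1, step=10):
    """Kick off deep-dir IO on every mount with non-overlapping dir numbers."""
    procs = []
    count = start
    for mount_obj in mounts:
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d --dir-depth 2 --dir-length 10 "
               "--max-num-of-dirs 5 --num-of-files 5 %s"
               % (script_path, count, mount_obj.mountpoint))
        procs.append(g.run_async(mount_obj.client_system, cmd,
                                 user=mount_obj.user))
        count += step
    return procs

The returned handles can then be passed to validate_io_procs(procs, mounts), exactly as the test does after the remove-brick operation.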
def test_data_self_heal_algorithm_diff_default(self): """ Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff' Description: - set the volume option "data-self-heal-algorithm" to value "diff" - create IO - bring down all bricks processes from selected set - modify the data - calculate arequal - bring bricks online - start healing - calculate arequal and compare with arequal before bringing bricks offline and after bringing bricks online """ # pylint: disable=too-many-locals,too-many-statements # Setting options g.log.info('Setting options "data-self-heal-algorithm": "diff"...') options = {"data-self-heal-algorithm": "diff"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options') g.log.info("Option 'data-self-heal-algorithm' is set to 'diff' " "successfully") # Creating files on client side all_mounts_procs = [] g.log.info("Generating data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) # Creating files command = "/usr/bin/env python %s create_files -f 100 %s" % ( self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Bring brick offline g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful', bricks_to_bring_offline) # Modify the data all_mounts_procs = [] g.log.info("Modifying data for %s:%s", self.mounts[0].client_system, self.mounts[0].mountpoint) command = ("/usr/bin/env python %s create_files -f 100 " "--fixed-file-size 1M %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, command, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "IO failed on some of the clients") # Get arequal before getting bricks online g.log.info('Getting arequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal before getting bricks online ' 'is successful') # Bring brick online g.log.info('Bringing bricks %s online...', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful', bricks_to_bring_offline) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying 
volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online", self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get arequal after getting bricks online g.log.info('Getting arequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting arequal after getting bricks online ' 'is successful') # Checking arequals before bringing bricks online # and after bringing bricks online self.assertEqual(sorted(result_before_online), sorted(result_after_online), 'Checksums are not equal') g.log.info('Checksums before bringing bricks online ' 'and after bringing bricks online are equal')
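The final assertion of this test compares the arequal checksums gathered before bringing the bricks online with those gathered after heal; sorting first makes the comparison independent of mount ordering. A plain-Python sketch of that check, with an illustrative helper name:

def arequals_match(before, after):
    """Return (bool, mismatches) for two lists of per-mount checksums."""
    mismatches = [(b, a) for b, a in zip(sorted(before), sorted(after))
                  if b != a]
    return (len(before) == len(after) and not mismatches), mismatches


if __name__ == "__main__":
    ok, diff = arequals_match(["abc123", "def456"], ["def456", "abc123"])
    assert ok and not diff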
# Module-level list that records spawned asynchronous operations so callers
# (or cleanup code) can join them later.
async_obj = []


def run_async(cmd, hostname, raise_on_error=True):
    """Run `cmd` on `hostname` in the background and record the handle.

    `raise_on_error` is accepted for API compatibility but is not used here.
    """
    async_op = g.run_async(host=hostname, command=cmd)
    async_obj.append(async_op)
    return async_op
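A short usage sketch for the helper above; the hostname and command are placeholders, and async_communicate() is the same join call used elsewhere in these tests:

proc = run_async("dd if=/dev/zero of=/mnt/testfile bs=1M count=10",
                 "server1.example.com")
ret, out, err = proc.async_communicate()  # wait for the background dd to finish
assert ret == 0, "background IO failed"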
def test_self_heal(self): """ Description:- - Create files on mount point - Kill one brick from volume - rm -rfv on mount point - bring bricks online - wait for heals - list """ # pylint: disable=too-many-statements # IO on the mount point g.log.info("Starting IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ( "/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 35 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.counter = self.counter + 10 # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks'] # Killing one brick from the volume set g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, ("Failed to bring bricks: %s offline", bricks_to_bring_offline)) g.log.info("Successful in bringing bricks: %s offline", bricks_to_bring_offline) # Validate if bricks are offline g.log.info("Validating if bricks: %s are offline", bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, "Not all the bricks in list: %s are offline" % bricks_to_bring_offline) g.log.info("Successfully validated that bricks: %s are all offline", bricks_to_bring_offline) # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Checking volume status g.log.info( "Logging volume info and Status after bringing bricks " "offline from the volume %s", self.volname) ret = log_volume_info_and_status(self.mnode, self.volname) self.assertTrue(ret, ("Logging volume info and status failed on " "volume %s", self.volname)) g.log.info("Successful in logging volume info and status of volume %s", self.volname) # Removing files from the mount point when one brick is down g.log.info("Removing files from the mount point") mountpoint = self.mounts[0].mountpoint client = self.mounts[0].client_system cmd = "rm -rfv %s/*" % mountpoint ret, _, _ = g.run(client, cmd) if ret != 0: raise ExecutionError("failed to delete the files") # Bringing bricks online g.log.info('Bringing bricks %s online', bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bricks %s are online', bricks_to_bring_offline) # Check if bricks are online g.log.info("Checking bricks are online or not") ret = are_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, 'Bricks %s are not online' % bricks_to_bring_offline) g.log.info('Bricks %s are online', bricks_to_bring_offline) # Monitoring heals on the volume g.log.info("Wait for heal completion...") ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue( ret, "Self heal didn't complete even after waiting " "for 20 minutes.") g.log.info("self-heal is successful after changing the volume type " "from replicated to arbitered volume") # List all files and dirs created g.log.info("List all files and 
directories:") ret = list_all_files_and_dirs_mounts(self.mounts) self.assertTrue(ret, "Failed to list all files and dirs") g.log.info("Listing all files and directories is successful")
def cleanup_mounts(mounts): """Removes all the data from all the mountpoints Args: mounts (list): List of all GlusterMount objs. Returns: bool: True if cleanup is successful on all mounts. False otherwise. """ if isinstance(mounts, GlusterMount): mounts = [mounts] g.log.info("Start cleanup mounts") all_mounts_procs = [] valid_mounts = [] for mount_obj in mounts: g.log.info("Cleaning up data from %s:%s", mount_obj.client_system, mount_obj.mountpoint) if (not mount_obj.mountpoint or (os.path.realpath(os.path.abspath(mount_obj.mountpoint)) == '/')): g.log.error("%s on %s is not a valid mount point", mount_obj.mountpoint, mount_obj.client_system) continue cmd = "rm -rf %s/*" % (mount_obj.mountpoint) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) valid_mounts.append(mount_obj) g.log.info("rm -rf on all clients is complete. Validating deletion now...") # Get cleanup status _rc_rmdir = True for i, proc in enumerate(all_mounts_procs): ret, out, err = proc.async_communicate() if ret != 0 or out or err: g.log.error("Deleting files/dirs Failed on %s:%s", valid_mounts[i].client_system, valid_mounts[i].mountpoint) _rc_rmdir = False else: g.log.info("Deleting files/dirs is successful on %s:%s", valid_mounts[i].client_system, valid_mounts[i].mountpoint) if _rc_rmdir: g.log.info("Successfully deleted files/dirs from all mounts") else: g.log.error("Deleting files/dirs failed on some of the mounts") # Check if mount points are empty ignore_dirs_list = [".trashcan"] ignore_dirs = r"\|".join(ignore_dirs_list) all_mounts_procs = [] for mount_obj in mounts: cmd = ("find %s -mindepth 1 | grep -ve '%s'" % (mount_obj.mountpoint, ignore_dirs)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) # Get cleanup status _rc_lookup = True for i, proc in enumerate(all_mounts_procs): ret, out, err = proc.async_communicate() if ret == 0: g.log.error("Mount %s on %s is still having entries:\n%s", mounts[i].mountpoint, mounts[i].client_system, out) _rc_lookup = False else: g.log.info("Mount %s on %s is cleaned up\n%s", mounts[i].mountpoint, mounts[i].client_system, out) if _rc_lookup: g.log.info("All the mounts are successfully cleaned up") else: g.log.error("Failed to cleanup all mounts") # List mounts entries g.log.info("Listing mounts entries:") list_all_files_and_dirs_mounts(mounts) return _rc_lookup
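cleanup_mounts() decides that a mount point is clean when the find/grep pipeline above exits non-zero, i.e. nothing is left under the mount except the ignored housekeeping directories. A standalone sketch of that command builder; the function name is illustrative:

def build_emptiness_check(mountpoint, ignore_dirs=(".trashcan",)):
    """Return a command whose non-zero exit status means the mount is clean."""
    pattern = r"\|".join(ignore_dirs)
    return "find %s -mindepth 1 | grep -ve '%s'" % (mountpoint, pattern)


if __name__ == "__main__":
    print(build_emptiness_check("/mnt/glusterfs"))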
def test_entry_self_heal_heal_command(self): """ Test Entry-Self-Heal (heal command) Description: - set the volume option "metadata-self-heal": "off" "entry-self-heal": "off" "data-self-heal": "off" - create IO - get areequal before getting bricks offline - set the volume option "self-heal-daemon": "off" - bring down all bricks processes from selected set - get areequal after getting bricks offline and compare with arequal after bringing bricks offline - modify the data - get areequal before getting bricks online - bring bricks online - set the volume option "self-heal-daemon": "on" - check daemons and start healing - check if heal is completed - check for split-brain - get areequal after getting bricks online and compare with arequal before bringing bricks online """ # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off", } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Options " "'metadata-self-heal', " "'entry-self-heal', " "'data-self-heal', " "are set to 'off'") # Start IO on mounts g.log.info("Starting IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s" % (mount_obj.client_system, mount_obj.mountpoint)) cmd = ( "python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-length 2 " "--dir-depth 2 " "--max-num-of-dirs 2 " "--num-of-files 20 %s" % (self.script_upload_path, self.counter, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.counter = self.counter + 10 g.log.info("IO on %s:%s is started successfully" % (mount_obj.client_system, mount_obj.mountpoint)) self.io_validation_complete = False # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts") # Command list to do different operations with data - # create, rename, copy and delete cmd_list = [ "python %s create_files -f 20 %s", "python %s mv -i '.trashcan' %s", "python %s copy --dest-dir new_dir %s", "python %s delete %s", ] for cmd in cmd_list: # Get areequal before getting bricks offline g.log.info('Getting areequal before getting bricks offline...') ret, result_before_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before getting bricks offline ' 'is successful') # Setting options g.log.info('Setting options...') options = { "self-heal-daemon": "off", } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' " "is set to 'off' successfully") # Select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = filter( None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks'])) # Bring brick offline g.log.info('Bringing bricks %s offline...' 
% bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline) ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Bricks %s are not offline' % bricks_to_bring_offline) g.log.info('Bringing bricks %s offline is successful' % bricks_to_bring_offline) # Get areequal after getting bricks offline g.log.info('Getting areequal after getting bricks offline...') ret, result_after_offline = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal after getting bricks offline ' 'is successful') # Checking areequals before bringing bricks offline # and after bringing bricks offline self.assertEqual(result_before_offline, result_after_offline, 'Checksums are not equal') g.log.info('Checksums before bringing bricks offline ' 'and after bringing bricks offline are equal') # Modify the data g.log.info("Start modifying IO on all mounts...") self.all_mounts_procs = [] for mount_obj in self.mounts: g.log.info("Modifying IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = cmd % (self.script_upload_path, mount_obj.mountpoint) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) g.log.info("IO on %s:%s is modified successfully" % (mount_obj.client_system, mount_obj.mountpoint)) self.io_validation_complete = False # Validate IO g.log.info("Wait for IO to complete and validate IO ...") ret = validate_io_procs(self.all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") self.io_validation_complete = True g.log.info("IO is successful on all mounts") # Get areequal before getting bricks online g.log.info('Getting areequal before getting bricks online...') ret, result_before_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal before getting bricks online ' 'is successful') # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(self.mounts) if not ret: raise ExecutionError("Failed to list all files and dirs") g.log.info("Listing all files and directories is successful") # Bring brick online g.log.info('Bringing bricks %s online...' 
% bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue( ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline) g.log.info('Bringing bricks %s online is successful' % bricks_to_bring_offline) # Setting options g.log.info('Setting options...') options = { "self-heal-daemon": "on", } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online", self.volname)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.volname) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online( self.mnode, self.volname) self.assertTrue( ret, ("Volume %s : All process are not online" % self.volname)) g.log.info("Volume %s : All process are online" % self.volname) # Wait for self-heal-daemons to be online g.log.info("Waiting for self-heal-daemons to be online") ret = is_shd_daemonized(self.all_servers) self.assertTrue(ret, "Either No self heal daemon process found") g.log.info("All self-heal-daemons are online") # Start healing ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not started') g.log.info('Healing is started') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Check for split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, 'Volume is in split-brain state') g.log.info('Volume is not in split-brain state') # Get areequal after getting bricks online g.log.info('Getting areequal after getting bricks online...') ret, result_after_online = collect_mounts_arequal(self.mounts) self.assertTrue(ret, 'Failed to get arequal') g.log.info('Getting areequal after getting bricks online ' 'is successful') # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(self.mounts) if not ret: raise ExecutionError("Failed to list all files and dirs") g.log.info("Listing all files and directories is successful") # Checking areequals before bringing bricks online # and after bringing bricks online self.assertEqual(result_before_online, result_after_online, 'Checksums are not equal') g.log.info('Checksums before bringing bricks online ' 'and after bringing bricks online are equal')
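In the test above the self-heal daemon is switched off before taking bricks down and switched back on before triggering heal. set_volume_options() wraps the volume-set CLI; below is a hedged sketch of the same toggle expressed directly against `gluster volume set`, with an illustrative helper name.

from glusto.core import Glusto as g


def set_self_heal_daemon(mnode, volname, enabled):
    """Turn the self-heal daemon on or off for `volname` via the gluster CLI."""
    value = "on" if enabled else "off"
    cmd = "gluster volume set %s self-heal-daemon %s" % (volname, value)
    ret, _, _ = g.run(mnode, cmd)
    return ret == 0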
def run_bonnie(servers, directory_to_run, username="******"): """Run bonnie test suite on the given servers. Args: servers (list): servers in which tests to be run. directory_to_run (list): directory path where tests will run for each server. Kwargs: username (str): username. Defaults to root. Returns: bool: True, if test passes in all servers, False otherwise Example: run_bonnie(["abc.com", "def.com"], ["/mnt/test1", "/mnt/test2"]) """ g.log.info("Running bonnie tests on %s" % ','.join(servers)) rt = True options_for_each_servers = [] # Install bonnie test suite if not installed results = g.run_parallel(servers, "yum list installed bonnie++") for index, server in enumerate(servers): if results[server][0] != 0: ret, out, _ = g.run( server, "yum list installed bonnie++ || " "yum -y install bonnie++") if ret != 0: g.log.error("Failed to install bonnie on %s" % server) return False # Building options for bonnie tests options_list = [] options = "" freemem_command = "free -g | grep Mem: | awk '{ print $2 }'" ret, out, _ = g.run(server, freemem_command) memory = int(out) g.log.info("Memory = %i", memory) options_list.append("-d %s -u %s" % (directory_to_run[index], username)) if memory >= 8: options_list.append("-r 16G -s 16G -n 0 -m TEST -f -b") options = " ".join(options_list) options_for_each_servers.append(options) proc_list = [] for index, server in enumerate(servers): bonnie_command = "bonnie++ %s" % (options_for_each_servers[index]) proc = g.run_async(server, bonnie_command) proc_list.append(proc) for index, proc in enumerate(proc_list): results = proc.async_communicate() if results[0] != 0: g.log.error("Bonnie test failed on server %s" % servers[index]) rt = False for index, server in enumerate(servers): ret, out, _ = g.run(server, "rm -rf %s/Bonnie.*" % directory_to_run[index]) if ret != 0: g.log.error("Failed to remove files from %s" % server) rt = False for server in servers: ret, out, _ = g.run(server, "yum -y remove bonnie++") if ret != 0: g.log.error("Failed to remove bonnie from %s" % server) return False return rt
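A short usage sketch for run_bonnie(), mirroring the docstring example; the host names and scratch directories below are placeholders, and each server gets its own directory:

ok = run_bonnie(["server1.example.com", "server2.example.com"],
                ["/mnt/test1", "/mnt/test2"], username="root")
assert ok, "bonnie++ run failed on at least one server"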
def run_fio(servers, directory_to_run): """Run fio test suite on the given servers. Args: servers (list): servers in which tests to be run. directory_to_run (list): directory path where tests will run for each server. Returns: bool: True, if test passes in all servers, False otherwise Example: run_fio(["abc.com", "def.com"], ["/mnt/test1", "/mnt/test2"]) """ g.log.info("Running fio tests on %s" % ','.join(servers)) rt = True # Installing fio if not installed results = g.run_parallel(servers, "yum list installed fio") for index, server in enumerate(servers): if results[server][0] != 0: ret, out, _ = g.run( server, "yum list installed fio || " "yum -y install fio") if ret != 0: g.log.error("Failed to install fio on %s" % server) return False # building job file for running fio # TODO: parametrizing the fio and to get input values from user job_file = "/tmp/fio_job.ini" cmd = ("echo -e '[global]\nrw=randrw\nio_size=1g\nfsync_on_close=1\n" "size=4g\nbs=64k\nrwmixread=20\nopenfiles=1\nstartdelay=0\n" "ioengine=sync\n[write]\ndirectory=%s\nnrfiles=1\n" "filename_format=fio_file.$jobnum.$filenum\nnumjobs=8' " "> %s" % (directory_to_run[index], job_file)) ret, _, _ = g.run(server, cmd) if ret != 0: g.log.error("Failed to create fio job file") rt = False proc_list = [] for index, server in enumerate(servers): fio_command = "fio %s" % (job_file) proc = g.run_async(server, fio_command) proc_list.append(proc) for index, proc in enumerate(proc_list): results = proc.async_communicate() if results[0] != 0: g.log.error("fio test failed on server %s" % servers[index]) rt = False for index, server in enumerate(servers): ret, out, _ = g.run(server, "rm -rf %s/fio_file.*" % directory_to_run[index]) if ret != 0: g.log.error("Failed to remove files from %s" % server) rt = False for index, server in enumerate(servers): ret, out, _ = g.run(server, "rm -rf %s" % job_file) if ret != 0: g.log.error("Failed to remove job file from %s" % server) rt = False for server in servers: ret, out, _ = g.run(server, "yum -y remove fio") if ret != 0: g.log.error("Failed to remove fio from %s" % server) return False return rt
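The fio job assembled above is a fixed randrw profile written out with an inline echo. The sketch below renders the same job file from a template string, which is one way the TODO about parametrising the fio inputs could be approached; the template constant and function name are illustrative only.

FIO_JOB_TEMPLATE = """[global]
rw=randrw
io_size=1g
fsync_on_close=1
size=4g
bs=64k
rwmixread=20
openfiles=1
startdelay=0
ioengine=sync

[write]
directory={directory}
nrfiles=1
filename_format=fio_file.$jobnum.$filenum
numjobs=8
"""


def render_fio_job(directory):
    """Return the fio job file contents for the given target directory."""
    return FIO_JOB_TEMPLATE.format(directory=directory)


if __name__ == "__main__":
    print(render_fio_job("/mnt/test1"))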
def test_validate_snaps_create(self): """ Creating snapshot using gluster snapshot create <snap1> <vol-name> """ cmd_str = "gluster snapshot create %s %s" % ("snap1", self.volname) ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap1 created successfully for volume %s", self.volname) # Create snapshot of volume using # -- gluster snapshot create <snap2> <vol-name(s)> [description # <description with words and quotes>] desc = 'description "this is a snap with snap2 name and description"' cmd_str = ("gluster snapshot create %s %s %s" % ("snap2", self.volname, desc)) ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap2 created successfully for volume %s", self.volname) # Create one more snapshot of volume using force cmd_str = ("gluster snapshot create %s %s %s" % ("snap3", self.volname, "force")) ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap3 created successfully for volume %s", self.volname) # Create one more snapshot of volume using no-timestamp option cmd_str = ("gluster snapshot create %s %s %s" % ("snap4", self.volname, "no-timestamp")) ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot snap4 created successfully for volume %s", self.volname) # Delete all snaps g.log.info("delete all snapshots present") ret, _, _ = snap_delete_all(self.mnode) self.assertEqual(ret, 0, "Snapshot delete failed.") g.log.info("Successfully deleted all snaps") # Start IO on all mounts. all_mounts_procs = [] count = 1 for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Create 5 snaps while IO is in progress for i in range(0, 5): cmd_str = "gluster snapshot create %s %s %s" % ( "snapy%s" % i, self.volname, "no-timestamp") ret, _, _ = g.run(self.mnode, cmd_str) self.assertEqual(ret, 0, ("Failed to create snapshot for %s" % self.volname)) g.log.info("Snapshot %s created successfully for volume %s", "snapy%s" % i, self.volname) # Validate IO g.log.info("Validating IO's") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all io's") # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Check for no of snaps using snap_list it should be 5 now snap_list = get_snap_list(self.mnode) self.assertEqual(5, len(snap_list), "No of snaps not consistent " "for volume %s" % self.volname) g.log.info("Successfully validated number of snaps.") # Validate all snaps created during IO for i in range(0, 5): self.assertIn("snapy%s" % i, snap_list, "%s snap not " "found " % ("snapy%s" % i)) g.log.info("Successfully validated names of snap")
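get_snap_list() is used above to verify the snapshot count; conceptually it returns the names printed by `gluster snapshot list`. A hedged sketch of that parsing, assuming the CLI prints one snapshot name per line and "No snapshots present" when the list is empty; the glustolibs implementation may differ.

from glusto.core import Glusto as g


def list_snapshots(mnode):
    """Return snapshot names on `mnode`, [] if none, or None on CLI failure."""
    ret, out, _ = g.run(mnode, "gluster snapshot list")
    if ret != 0:
        return None
    names = [line.strip() for line in out.splitlines() if line.strip()]
    return [] if names == ["No snapshots present"] else names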