def test_ec_open_fd(self):
    """
    Test Steps:
    - disable server side heal
    - Create a file
    - Set volume option to implement open FD on file
    - Bring a brick down, say b1
    - Open FD on file
    - Bring brick b1 up
    - write to open FD file
    - Monitor heal
    - Check xattr, ec.version and ec.size of file
    - Check stat of file
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    mountpoint = self.mounts[0].mountpoint

    # Disable server side heal so recovery happens only through the
    # client-side (open-FD) path exercised below.
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to disable server side heal")
    g.log.info("Successfully disabled server side heal")

    # Log Volume Info and Status after disabling server side heal
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))

    # Create a file while all the bricks are up
    cmd = ("cd %s; touch 'file_openfd';" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating a file while all the bricks are UP')

    # Set volume option so reads re-use the already-open FD
    ret = set_volume_options(self.mnode, self.volname,
                             {"performance.read-after-open": "yes"})
    self.assertTrue(ret, 'Failed to set volume {}'
                    ' options'.format(self.volname))
    g.log.info('Successfully set %s volume options', self.volname)

    # Bring one randomly chosen brick of the first subvol offline
    sub_vols = get_subvols(self.mnode, self.volname)
    subvols_list = sub_vols['volume_subvols']
    bricks_list1 = subvols_list[0]
    brick_b1_down = choice(bricks_list1)
    ret = bring_bricks_offline(self.volname, brick_b1_down)
    self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
    g.log.info('Brick %s is offline successfully', brick_b1_down)

    node = self.mounts[0].client_system
    # Open FD on the file while the brick is down (async; writes 'xyz')
    proc = open_file_fd(mountpoint, time=100, client=node)

    # Bring brick b1 online via glusterd restart
    ret = bring_bricks_online(self.mnode, self.volname,
                              [brick_b1_down],
                              'glusterd_restart')
    self.assertTrue(ret, 'Brick {} is not brought '
                    'online'.format(brick_b1_down))
    g.log.info('Brick %s is online successfully', brick_b1_down)

    # Validate peers are connected after the glusterd restart
    ret = self.validate_peers_are_connected()
    self.assertTrue(ret, "Peers are not in connected state after bringing"
                    " an offline brick to online via `glusterd restart`")
    g.log.info("Successfully validated peers are in connected state")

    # Check if write to the open FD completed successfully
    g.log.info('Open FD on file successful')
    ret, _, _ = proc.async_communicate()
    # BUG FIX: the failure message previously read "Write to FD is
    # successful", which is misleading when the assertion fires.
    self.assertEqual(ret, 0, "Write to open FD of the file failed")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    g.log.info('Heal has completed successfully')

    file_openfd = os.path.join(mountpoint, 'file_openfd')
    # Check the data written through the open FD landed in the file
    ret = check_if_pattern_in_file(node, 'xyz', file_openfd)
    self.assertEqual(ret, 0, 'xyz does not exists in file')
    g.log.info('xyz exists in file')

    file_fd = 'file_openfd'
    # Check if EC version is same on all bricks which are up
    ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                       'trusted.ec.version')
    self.assertTrue(ret, "Healing not completed and EC version is "
                    "not updated")
    g.log.info("Healing is completed and EC version is updated")

    # Check if EC size is same on all bricks which are up
    ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                       'trusted.ec.size')
    self.assertTrue(ret, "Healing not completed and EC size is "
                    "not updated")
    g.log.info("Healing is completed and EC size is updated")

    # Check stat of file (accessibility from the mount)
    cmd = "cd %s; du -kh file_openfd" % mountpoint
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('File %s is accessible', file_fd)
def test_heal_client_io_hang(self):
    """
    Verify client-side heal does not hang IO:
    - disable server side heal
    - create files, bring a brick down, write while it is down
    - bring the brick back up and trigger client-side heal by
      appending to and reading the files
    - verify heal completes
    """
    mountpoint = self.mounts[0].mountpoint

    # Disable server side heal so healing is driven purely by client IO
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to disable server side heal")
    g.log.info("Successfully disabled server side heal")

    # Log Volume Info and Status after disabling client side heal
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

    # Create files while all bricks are up.
    # BUG FIX: use `mkdir -p` — the directory is re-created by the later
    # commands as well, and a plain `mkdir` on an existing directory
    # emits spurious errors on every subsequent run.
    cmd = ("cd %s; mkdir -p test; cd test; for i in `seq 1 100` ;"
           "do touch file$i; done" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating files while all the bricks are UP')

    # Bring one brick offline
    ret = bring_bricks_offline(self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    # Start pumping IO from client while the brick is down
    cmd = ("cd %s; mkdir -p test; cd test; for i in `seq 1 100` ;"
           "do dd if=/dev/urandom of=file$i bs=1M "
           "count=5;done" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished writing on files while a brick is DOWN')

    # Bring the brick back online
    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")

    # Verifying all bricks online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "All bricks are not online")

    # Start client side heal by reading/writing files.
    appendcmd = ("cd %s; mkdir -p test; cd test; for i in `seq 1 100` ;"
                 "do dd if=/dev/urandom of=file$i bs=1M "
                 "count=1 oflag=append conv=notrunc;done" % mountpoint)

    readcmd = ("cd %s; mkdir -p test; cd test; for i in `seq 1 100` ;"
               "do dd if=file$i of=/dev/zero bs=1M "
               "count=5;done" % mountpoint)

    ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished append on files after bringing bricks online')

    ret, _, err = g.run(self.mounts[0].client_system, readcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished read on files after bringing bricks online')

    # Check the heal info and completion
    ec_check_heal_comp(self)

    # Log Volume Info and Status after bringing the brick up
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)
def test_heal_full_node_reboot(self):
    """
    - Create IO from mountpoint.
    - Calculate arequal from mount.
    - Delete data from backend from the EC volume.
    - Trigger heal full.
    - Disable Heal.
    - Again Enable and do Heal full.
    - Reboot a Node.
    - Calculate arequal checksum and compare it.
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Create dirs with file
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s" % (
                       self.script_upload_path,
                       mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before deleting the files from brick
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

    # Delete the data from the last brick of every subvol; heal full
    # must reconstruct it from the remaining bricks.
    for subvol in subvols:
        erasure = subvol[-1]
        g.log.info('Clearing ec brick %s', erasure)
        node, brick_path = erasure.split(':')
        ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
        self.assertFalse(ret, err)
        g.log.info('Clearing ec brick %s is successful', erasure)
    # BUG FIX: this message previously said "unsuccessful" although it is
    # only reached when every brick was cleared without error.
    g.log.info('Clearing data from bricks is successful')

    # Trigger heal full
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Disable Heal and Enable Heal Full Again
    # (typo fixes: "Healon" -> "Heal on", "Enbaling" -> "Enabling")
    g.log.info("Disabling Heal on the Servers")
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Disabling Failed")
    g.log.info("Healing is Now Disabled")

    g.log.info("Enabling Heal Now")
    ret = enable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Enabling Heal failed")
    g.log.info("Healing is now enabled")
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Reboot A Node
    g.log.info("Rebooting Node from the Cluster")
    subvols_dict = get_subvols(self.mnode, self.volname)
    nodes_to_reboot = []
    for subvol in subvols_dict['volume_subvols']:
        # Define nodes to reboot (second brick of each subvol)
        brick_list = subvol[1:2]
        for brick in brick_list:
            node, brick_path = brick.split(':')
            if node not in nodes_to_reboot:
                nodes_to_reboot.append(node)

    # Reboot nodes on subvol and wait while rebooting
    g.log.info("Rebooting the nodes %s", nodes_to_reboot)
    ret = reboot_nodes(nodes_to_reboot)
    self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot)

    # Check if nodes are online.
    # BUG FIX: the retry path was dead code — `_rc = True; break` ran
    # unconditionally on the first iteration. Keep polling every 5s
    # until the nodes are up or the timeout elapses.
    counter = 0
    timeout = 700
    _rc = False
    while counter < timeout:
        ret, reboot_results = are_nodes_online(nodes_to_reboot)
        if not ret:
            g.log.info("Nodes are offline, Retry after 5 seconds ... ")
            sleep(5)
            counter = counter + 5
        else:
            _rc = True
            break

    if not _rc:
        for node in reboot_results:
            if not reboot_results[node]:
                g.log.error("Node %s is offline even after "
                            "%d minutes", node, timeout / 60.0)
    g.log.info("All nodes %s are up and running", nodes_to_reboot)

    # Trigger Heal Full (retry once after a short pause if it fails
    # right after the reboot)
    ret = trigger_heal_full(self.mnode, self.volname)
    if not ret:
        sleep(10)
        ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after healing
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_healing = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Comparing arequals.
    # (message fix: this is an EC test — "arbiter processes" was a
    # copy-paste from a replicate test)
    self.assertEqual(result_before_killing_procs, result_after_healing,
                     'Arequals before deleting backend data '
                     'and after healing are not equal')
    g.log.info('Arequals before deleting backend data '
               'and after healing are equal')