def test_gfind_when_node_down(self):
    """
    Verifying the glusterfind functionality when a node is down.

    1. Create a volume
    2. Create a session on the volume
    3. Create various files from mount point
    4. Bring down glusterd on one of the nodes
    5. Perform glusterfind pre
    6. Perform glusterfind post
    7. Check the contents of outfile
    8. Create more files from mountpoint
    9. Reboot one of the nodes
    10. Perform glusterfind pre
    11. Perform glusterfind post
    12. Check the contents of outfile
    """
    # pylint: disable=too-many-statements
    # Create a session for the volume
    ret, _, _ = gfind_create(self.mnode, self.volname, self.session)
    self.assertEqual(ret, 0, ("Unexpected: Creation of a session for the "
                              "volume %s failed" % self.volname))
    g.log.info("Successfully created a session for the volume %s",
               self.volname)

    # Perform glusterfind list to check if session exists
    _, out, _ = gfind_list(self.mnode, volname=self.volname,
                           sessname=self.session)
    self.assertNotEqual(out, "No sessions found.",
                        "Failed to list the glusterfind session")
    g.log.info("Successfully listed the glusterfind session")

    self._perform_io_and_validate_presence_of_files()

    # Wait for changelog to get updated
    sleep(2)

    # Bring one of the nodes down
    self.random_server = choice(self.servers[1:])
    ret = stop_glusterd(self.random_server)
    self.assertTrue(ret, "Failed to stop glusterd on one node.")
    g.log.info("Successfully stopped glusterd on one node.")

    # Wait till glusterd is completely down
    while is_glusterd_running(self.random_server) != 1:
        sleep(2)

    self._perform_glusterfind_pre_and_validate_outfile()

    # Perform glusterfind post for the session
    ret, _, _ = gfind_post(self.mnode, self.volname, self.session)
    self.assertEqual(ret, 0, "Failed to perform glusterfind post")
    g.log.info("Successfully performed glusterfind post")

    # Start glusterd again on the node where it was stopped
    ret = start_glusterd(self.random_server)
    self.assertTrue(ret, "Failed to start glusterd on %s"
                    % self.random_server)
    g.log.info("Successfully started glusterd on node: %s",
               self.random_server)

    # Wait for glusterd to start completely
    ret = wait_for_glusterd_to_start(self.random_server)
    self.assertTrue(ret, "glusterd is not running on %s"
                    % self.random_server)
    g.log.info("glusterd is started and running on %s", self.random_server)

    self._perform_io_and_validate_presence_of_files()

    # Perform IO
    self._perform_io_and_validate_presence_of_files()

    # Wait for changelog to get updated
    sleep(2)

    # Reboot one of the nodes
    self.random_server = choice(self.servers[1:])
    ret = reboot_nodes(self.random_server)
    self.assertTrue(ret, "Failed to reboot the said node.")
    g.log.info("Successfully started reboot process on one node.")

    self._perform_glusterfind_pre_and_validate_outfile()

    # Perform glusterfind post for the session
    ret, _, _ = gfind_post(self.mnode, self.volname, self.session)
    self.assertEqual(ret, 0, "Failed to perform glusterfind post")
    g.log.info("Successfully performed glusterfind post")

    # Gradual sleep backoff till the node has rebooted
    counter = 0
    timeout = 300
    ret = False
    while counter < timeout:
        ret, _ = are_nodes_online(self.random_server)
        if not ret:
            g.log.info("Node is offline, retrying after 5 seconds ...")
            sleep(5)
            counter += 5
        else:
            ret = True
            break
    self.assertTrue(ret, "Node is still offline.")
    g.log.info("Rebooted node is online")

    # Wait for glusterd to start completely
    ret = wait_for_glusterd_to_start(self.random_server)
    self.assertTrue(ret, "glusterd is not running on %s"
                    % self.random_server)
    g.log.info("glusterd is started and running on %s", self.random_server)
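
# The gradual-backoff wait used above (and repeated in the reboot tests
# below) could be factored into a small helper. This is only an illustrative
# sketch, not part of the original tests: the name
# `_wait_for_nodes_to_be_online` is hypothetical, and it assumes the same
# imports the tests already use (glusto's `g`, `sleep` and
# `are_nodes_online(nodes)` returning a (status, per-node result) tuple).
def _wait_for_nodes_to_be_online(nodes, timeout=300, interval=5):
    """Poll are_nodes_online() until all nodes respond or timeout expires.

    Returns True if the nodes came online within `timeout` seconds.
    """
    counter = 0
    while counter < timeout:
        ret, _ = are_nodes_online(nodes)
        if ret:
            return True
        g.log.info("Nodes %s are offline, retrying after %d seconds ...",
                   nodes, interval)
        sleep(interval)
        counter += interval
    return False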
def test_write_io_mount_point_resumed_quorum_restored_x3(self):
    """
    - set cluster.quorum-type to auto
    - start I/O from the mount point
    - Do IO and check on subvols with two nodes to reboot
      (do for each subvol)
    - get files to delete/create for nodes to be offline
    - delete files from mountpoint
    - reboot nodes
    - creating files on nodes while rebooting
    - validate for rofs
    - wait for volume processes to be online
    - creating files on nodes after rebooting
    - validate IO
    - Do IO and check on subvols without nodes to reboot
      (do for each subvol)
    - get files to delete/create for nodes to be online
    - delete files from mountpoint
    - reboot nodes
    - creating files on online nodes while rebooting other nodes
    - validate IO
    - Do IO and check and reboot two nodes on all subvols
    - get files to delete/create for nodes to be offline
    - delete files from mountpoint
    - reboot nodes
    - creating files on nodes while rebooting
    - validate for rofs
    - wait for volume processes to be online
    - creating files on nodes after rebooting
    - validate IO
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    # set cluster.quorum-type to auto
    options = {"cluster.quorum-type": "auto"}
    g.log.info("setting cluster.quorum-type to auto on volume %s",
               self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Creating files
        cmd = "/usr/bin/env python %s create_files -f 30 %s" % (
            self.script_upload_path, mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    self.io_validation_complete = False
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Do IO and check on subvols with nodes to reboot
    subvols_dict = get_subvols(self.mnode, self.volname)
    for subvol in subvols_dict['volume_subvols']:
        # define nodes to reboot
        brick_list = subvol[0:2]
        nodes_to_reboot = []
        for brick in brick_list:
            node, brick_path = brick.split(':')
            nodes_to_reboot.append(node)

        # get files to delete/create for nodes to be offline
        node, brick_path = brick_list[0].split(':')
        ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
        self.assertFalse(ret, 'Failed to ls files on %s' % node)
        file_list = brick_file_list.splitlines()

        # delete files from mountpoint
        for mount_obj in self.mounts:
            g.log.info("Deleting data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            cmd = ('cd %s/ ; rm -rf %s'
                   % (mount_obj.mountpoint, ' '.join(file_list)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(
                ret, 'Failed to rm file on %s' % mount_obj.client_system)
        g.log.info('Files %s are deleted', file_list)

        # reboot nodes on subvol and wait while rebooting
        g.log.info("Rebooting the nodes %s", nodes_to_reboot)
        ret = reboot_nodes(nodes_to_reboot)
        self.assertTrue(ret, 'Failed to reboot nodes %s '
                        % nodes_to_reboot)

        # Creating files on nodes while rebooting
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Creating files
            cmd = ("cd %s/ ;"
                   "touch %s"
                   % (mount_obj.mountpoint, ' '.join(file_list)))

            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.io_validation_complete = False
        g.log.info("Validating if IO failed with read-only filesystem")
        ret = is_io_procs_fail_with_rofs(self, self.all_mounts_procs,
                                         self.mounts)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on read-only filesystem"))
        self.io_validation_complete = True
        g.log.info("EXPECTED: "
                   "Read-only file system in IO while creating file")

        # check if nodes are online
        counter = 0
        timeout = 300
        _rc = False
        while counter < timeout:
            ret, reboot_results = are_nodes_online(nodes_to_reboot)
            if not ret:
                g.log.info("Nodes are offline, Retry after 5 seconds ...")
                time.sleep(5)
                counter = counter + 5
            else:
                _rc = True
                break

        if not _rc:
            for node in reboot_results:
                if reboot_results[node]:
                    g.log.info("Node %s is online", node)
                else:
                    g.log.error("Node %s is offline even after "
                                "%d minutes", node, timeout / 60.0)
        else:
            g.log.info("All nodes %s are up and running", nodes_to_reboot)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Creating files on nodes after rebooting
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Creating files
            cmd = ("cd %s/ ;"
                   "touch %s"
                   % (mount_obj.mountpoint, ' '.join(file_list)))

            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.io_validation_complete = False
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

    # Do IO and check on subvols without nodes to reboot
    subvols_dict = get_subvols(self.mnode, self.volname)
    for subvol in subvols_dict['volume_subvols']:
        # define nodes to reboot
        brick_list = subvol[0:2]
        nodes_to_reboot = []
        for brick in brick_list:
            node, brick_path = brick.split(':')
            nodes_to_reboot.append(node)

        # get files to delete/create for nodes to be online
        new_subvols_dict = get_subvols(self.mnode, self.volname)
        subvol_to_operate = new_subvols_dict['volume_subvols']
        subvol_to_operate.remove(subvol)
        brick_list_subvol_online = subvol_to_operate[0]

        node, brick_path_vol_online = \
            brick_list_subvol_online[0].split(':')
        ret, brick_file_list, _ = g.run(node,
                                        'ls %s' % brick_path_vol_online)
        self.assertFalse(ret, 'Failed to ls files on %s' % node)
        file_list = brick_file_list.splitlines()

        # delete files from mountpoint
        for mount_obj in self.mounts:
            g.log.info("Deleting data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            cmd = ('cd %s/ ; rm -rf %s'
                   % (mount_obj.mountpoint, ' '.join(file_list)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(
                ret, 'Failed to rm file on %s' % mount_obj.client_system)
        g.log.info('Files %s are deleted', file_list)

        # reboot nodes on subvol and wait while rebooting
        g.log.info("Rebooting the nodes %s", nodes_to_reboot)
        ret = reboot_nodes(nodes_to_reboot)
        self.assertTrue(ret, 'Failed to reboot nodes %s '
                        % nodes_to_reboot)

        # Creating files on nodes while rebooting
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Creating files
            cmd = ("cd %s/ ;"
                   "touch %s"
                   % (mount_obj.mountpoint, ' '.join(file_list)))

            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.io_validation_complete = False
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

        # check if nodes are online
        counter = 0
        timeout = 300
        _rc = False
        while counter < timeout:
            ret, reboot_results = are_nodes_online(nodes_to_reboot)
            if not ret:
                g.log.info("Nodes are offline, Retry after 5 seconds ...")
                time.sleep(5)
                counter = counter + 5
            else:
                _rc = True
                break

        if not _rc:
            for node in reboot_results:
                if reboot_results[node]:
                    g.log.info("Node %s is online", node)
                else:
                    g.log.error("Node %s is offline even after "
                                "%d minutes", node, timeout / 60.0)
        else:
            g.log.info("All nodes %s are up and running", nodes_to_reboot)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

    # Do IO and check and reboot nodes on all subvols
    subvols_dict = get_subvols(self.mnode, self.volname)
    nodes_to_reboot = []
    file_list_for_all_subvols = []
    for subvol in subvols_dict['volume_subvols']:
        # define nodes to reboot
        brick_list = subvol[0:2]
        for brick in brick_list:
            node, brick_path = brick.split(':')
            nodes_to_reboot.append(node)

        # get files to delete/create for nodes to be offline
        node, brick_path = brick_list[0].split(':')
        ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
        self.assertFalse(ret, 'Failed to ls files on %s' % node)
        file_list = brick_file_list.splitlines()
        file_list_for_all_subvols.append(file_list)

        # delete files from mountpoint
        for mount_obj in self.mounts:
            g.log.info("Deleting data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            cmd = ('cd %s/ ; rm -rf %s'
                   % (mount_obj.mountpoint, ' '.join(file_list)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(
                ret, 'Failed to rm file on %s' % mount_obj.client_system)
        g.log.info('Files %s are deleted', file_list)

    # reboot nodes on all subvols and wait while rebooting
    g.log.info("Rebooting the nodes %s", nodes_to_reboot)
    ret = reboot_nodes(nodes_to_reboot)
    self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot)

    # Creating files on nodes while rebooting
    all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], []
    # Create files for 1-st subvol and get all_mounts_procs_1
    for mount_obj in self.mounts:
        g.log.info("Creating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Creating files
        cmd = ("cd %s/ ;"
               "touch %s"
               % (mount_obj.mountpoint,
                  ' '.join(file_list_for_all_subvols[0])))

        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs_1.append(proc)
    all_mounts_procs.append(all_mounts_procs_1)

    # Create files for 2-nd subvol and get all_mounts_procs_2
    for mount_obj in self.mounts:
        g.log.info("Creating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Creating files
        cmd = ("cd %s/ ;"
               "touch %s"
               % (mount_obj.mountpoint,
                  ' '.join(file_list_for_all_subvols[1])))

        proc2 = g.run_async(mount_obj.client_system, cmd,
                            user=mount_obj.user)
        all_mounts_procs_2.append(proc2)
    all_mounts_procs.append(all_mounts_procs_2)

    for mounts_procs in all_mounts_procs:
        # Validate IO
        self.io_validation_complete = False
        g.log.info("Validating if IO failed with read-only filesystem")
        ret = is_io_procs_fail_with_rofs(self, mounts_procs, self.mounts)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on read-only filesystem"))
        self.io_validation_complete = True
        g.log.info("EXPECTED: "
                   "Read-only file system in IO while creating file")

    # check if nodes are online
    counter = 0
    timeout = 300
    _rc = False
    while counter < timeout:
        ret, reboot_results = are_nodes_online(nodes_to_reboot)
        if not ret:
            g.log.info("Nodes are offline, Retry after 5 seconds ...")
            time.sleep(5)
            counter = counter + 5
        else:
            _rc = True
            break

    if not _rc:
        for node in reboot_results:
            if reboot_results[node]:
                g.log.info("Node %s is online", node)
            else:
                g.log.error("Node %s is offline even after "
                            "%d minutes", node, timeout / 60.0)
    else:
        g.log.info("All nodes %s are up and running", nodes_to_reboot)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Creating files on nodes after rebooting
    all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], []
    # Create files for 1-st subvol and get all_mounts_procs_1
    for mount_obj in self.mounts:
        g.log.info("Creating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Creating files
        cmd = ("cd %s/ ;"
               "touch %s"
               % (mount_obj.mountpoint,
                  ' '.join(file_list_for_all_subvols[0])))

        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs_1.append(proc)
    all_mounts_procs.append(all_mounts_procs_1)

    # Create files for 2-nd subvol and get all_mounts_procs_2
    for mount_obj in self.mounts:
        g.log.info("Creating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Creating files
        cmd = ("cd %s/ ;"
               "touch %s"
               % (mount_obj.mountpoint,
                  ' '.join(file_list_for_all_subvols[1])))

        proc2 = g.run_async(mount_obj.client_system, cmd,
                            user=mount_obj.user)
        all_mounts_procs_2.append(proc2)
    all_mounts_procs.append(all_mounts_procs_2)

    for mounts_procs in all_mounts_procs:
        # Validate IO
        self.io_validation_complete = False
        self.assertTrue(
            validate_io_procs(mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True
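
# The quorum test above repeats the same "touch a list of files from every
# mount in the background" block several times. A possible refactoring is
# sketched below; it is not part of the original test. The helper name
# `_touch_files_async_on_mounts` is hypothetical, and it assumes glusto's
# `g.run_async` and the mount objects already used by the test.
def _touch_files_async_on_mounts(mounts, file_list):
    """Start `touch <files>` on every mount and return the async procs."""
    procs = []
    for mount_obj in mounts:
        cmd = ("cd %s/ ; touch %s"
               % (mount_obj.mountpoint, ' '.join(file_list)))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        procs.append(proc)
    return procs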
def test_heal_full_node_reboot(self):
    """
    - Create IO from mountpoint.
    - Calculate arequal from mount.
    - Delete data from backend from the EC volume.
    - Trigger heal full.
    - Disable Heal.
    - Again Enable and do Heal full.
    - Reboot a Node.
    - Calculate arequal checksum and compare it.
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Create dirs with files
        g.log.info('Creating dirs with files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s" % (
                       self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before deleting the files from brick
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

    # Delete data from the backend of one erasure brick in each subvol
    for subvol in subvols:
        erasure = subvol[-1]
        g.log.info('Clearing ec brick %s', erasure)
        node, brick_path = erasure.split(':')
        ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
        self.assertFalse(ret, err)
        g.log.info('Clearing ec brick %s is successful', erasure)
    g.log.info('Clearing data from brick is successful')

    # Trigger heal full
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Disable heal, enable it again and trigger heal full once more
    g.log.info("Disabling heal on the servers")
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Disabling heal failed")
    g.log.info("Healing is now disabled")

    g.log.info("Enabling heal now")
    ret = enable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Enabling heal failed")
    g.log.info("Healing is now enabled")

    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Reboot a node (one brick node per subvol)
    g.log.info("Rebooting a node from the cluster")
    subvols_dict = get_subvols(self.mnode, self.volname)
    nodes_to_reboot = []
    for subvol in subvols_dict['volume_subvols']:
        # Define nodes to reboot
        brick_list = subvol[1:2]
        for brick in brick_list:
            node, brick_path = brick.split(':')
            if node not in nodes_to_reboot:
                nodes_to_reboot.append(node)

    # Reboot nodes and wait while rebooting
    g.log.info("Rebooting the nodes %s", nodes_to_reboot)
    ret = reboot_nodes(nodes_to_reboot)
    self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot)

    # Check if nodes are online
    counter = 0
    timeout = 700
    _rc = False
    while counter < timeout:
        ret, reboot_results = are_nodes_online(nodes_to_reboot)
        if not ret:
            g.log.info("Nodes are offline, Retry after 5 seconds ...")
            sleep(5)
            counter = counter + 5
        else:
            _rc = True
            break

    if not _rc:
        for node in reboot_results:
            if not reboot_results[node]:
                g.log.error("Node %s is offline even after "
                            "%d minutes", node, timeout / 60.0)
    else:
        g.log.info("All nodes %s are up and running", nodes_to_reboot)

    # Trigger heal full; retry once in case shd is not yet up after reboot
    ret = trigger_heal_full(self.mnode, self.volname)
    if not ret:
        sleep(10)
        ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full heal.')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after healing
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_healing = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Compare arequals before and after healing
    self.assertEqual(result_before_killing_procs, result_after_healing,
                     'Arequals before deleting the brick data '
                     'and after healing are not equal')
    g.log.info('Arequals before deleting the brick data '
               'and after healing are equal')