def test_offline_brick_status_when_quorum_not_met(self):
    """ Test Brick status when Quorum is not met after glusterd restart.
    1. Create a volume and mount it.
    2. Set the quorum type to 'server'.
    3. Bring some nodes down such that quorum won't be met.
    4. Brick status should be offline in the node which is up.
    5. Restart glusterd in this node.
    6. The brick status still should be offline as quorum isn't met.
    """
    # Turn on server-side quorum for the volume and confirm it applied.
    quorum_option = {'cluster.server-quorum-type': 'server'}
    status = set_volume_options(self.mnode, self.volname, quorum_option)
    self.assertTrue(status,
                    "gluster volume option set of %s to %s failed"
                    % ('cluster.server-quorum-type', 'server'))
    g.log.info("Cluster quorum set to type server.")

    # Collect the volume's bricks before any node goes down.
    all_bricks = get_all_bricks(self.mnode, self.volname)

    # Kill glusterd on every server except the first so that the
    # server quorum can no longer be met.
    status = stop_glusterd(self.servers[1:])
    self.assertTrue(status, "Failed to stop glusterd on specified nodes.")
    g.log.info("Glusterd processes stopped in the desired servers.")

    # With quorum lost, the brick on the surviving node must be offline.
    status = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(status, "Bricks are online")
    g.log.info("Bricks are offline as expected.")

    # Restart glusterd on the node that is still up.
    status = restart_glusterd(self.servers[0])
    self.assertTrue(status, "Failed to restart glusterd on desired node.")
    g.log.info("Glusterd restarted on the desired node.")

    # Make sure the restarted glusterd is back online.
    self.assertTrue(wait_for_glusterd_to_start(self.servers[0]),
                    "Glusterd not up on the desired server.")
    g.log.info("Glusterd is up in the desired server.")

    # Quorum is still unmet, so the brick must remain offline.
    status = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(status, "Bricks are online")
    g.log.info("Bricks are offline as expected.")

    # Bring glusterd back everywhere to restore the cluster.
    status = start_glusterd(self.servers)
    self.assertTrue(status, "Failed to start glusterd on the specified nodes")
    g.log.info("Initiated start of glusterd on all nodes.")

    # Wait until every glusterd instance is running again.
    status = wait_for_glusterd_to_start(self.servers)
    self.assertTrue(status, "Glusterd not up on all nodes.")
    g.log.info("Glusterd is up and running on all nodes.")

    # Finally wait for all the volume processes to come back online.
    status = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                  timeout=600)
    self.assertTrue(status, "All volume processes not up.")
    g.log.info("All volume processes are up.")
def tearDown(self):
    """Restart glusterd on all servers and verify the cluster is
    healthy (glusterd active everywhere and all peers connected).

    Raises:
        ExecutionError: on any failure to restore the cluster state.
    """
    # restart glusterd on all servers
    g.log.info("Restart glusterd on all servers %s", self.servers)
    ret = restart_glusterd(self.servers)
    if not ret:
        raise ExecutionError("Failed to restart glusterd on all "
                             "servers %s" % self.servers)
    g.log.info("Glusterd restart successful on all servers %s",
               self.servers)

    # Check if glusterd is running on all servers(expected: active)
    g.log.info("Check if glusterd is running on all servers %s"
               "(expected: active)", self.servers)
    ret = is_glusterd_running(self.servers)
    if ret != 0:
        raise ExecutionError("Glusterd is not running on all servers "
                             "%s" % self.servers)
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Poll (up to 80 * 2s = 160s) for all peers to reach connected state
    count = 0
    while count < 80:
        ret = self.validate_peers_are_connected()
        if ret:
            g.log.info("All peers are in connected state")
            break
        sleep(2)
        count += 1
    if not ret:
        # Fix: the original raised the misleading success message
        # "All peers are in connected state" on this failure path.
        raise ExecutionError("Peers are not in connected state even "
                             "after 160 seconds")
def tearDown(self):
    """In case of any failure restart glusterd on all servers and
    validate that all peers are back in connected state.

    Raises:
        ExecutionError: if glusterd could not be restarted or peers
            failed to reconnect.
    """
    if not self.test_method_complete:
        # restart glusterd on all servers
        g.log.info("Restart glusterd on all servers %s", self.servers)
        ret = restart_glusterd(self.servers)
        if not ret:
            # Fix: %-format the message — a trailing comma argument to
            # ExecutionError is never interpolated into the text.
            raise ExecutionError("Failed to restart glusterd on all "
                                 "servers %s" % self.servers)
        g.log.info("Successfully restarted glusterd on all servers %s",
                   self.servers)

        # Wait for all the glusterd's to establish communication.
        time.sleep(30)

        # Validate all the peers are in connected state
        g.log.info("Validating all the peers are in Cluster and Connected")
        ret = self.validate_peers_are_connected()
        if not ret:
            raise ExecutionError("Validating Peers to be in Cluster "
                                 "Failed")
        g.log.info("All peers are in connected state")

    # Calling GlusterBaseClass tearDown
    GlusterBaseClass.tearDown.im_func(self)
def scratch_cleanup(cls, error_or_failure_exists):
    """Wipe and rebuild the gluster setup after a test hits an
    execution or assertion error.

    Stops glusterd, removes stale glusterd state (vols/snaps/peers
    and brick roots) on every server, restarts glusterd, re-probes
    the peers and unmounts/cleans the client mountpoints.

    Args:
        error_or_failure_exists (bool): If set True will cleanup setup
            atlast of testcase only if exectution or assertion error in
            teststeps. False will skip this scratch cleanup step.

    Returns (bool): True if setup cleanup is successful.
        False otherwise.
    """
    if error_or_failure_exists:
        ret = stop_glusterd(cls.servers)
        if not ret:
            g.log.error("Failed to stop glusterd")

        # Force-kill any leftover glusterd and remove its stale socket.
        cmd_list = ("pkill pidof glusterd",
                    "rm /var/run/glusterd.socket")
        for server in cls.servers:
            for cmd in cmd_list:
                ret, _, _ = g.run(server, cmd, "root")
                if ret:
                    g.log.error("Failed to stop glusterd")
                    return False

        # Remove every trace of volumes/snapshots/peers and wipe the
        # brick roots on each server.
        for server in cls.servers:
            cmd_list = ("rm -rf /var/lib/glusterd/vols/*",
                        "rm -rf /var/lib/glusterd/snaps/*",
                        "rm -rf /var/lib/glusterd/peers/*",
                        "rm -rf {}/*/*".format(
                            cls.all_servers_info[server]['brick_root']))
            for cmd in cmd_list:
                ret, _, _ = g.run(server, cmd, "root")
                if ret:
                    g.log.error(
                        "failed to cleanup server {}".format(server))
                    return False

        ret = restart_glusterd(cls.servers)
        if not ret:
            g.log.error("Failed to start glusterd")
            return False
        sleep(2)
        ret = wait_for_glusterd_to_start(cls.servers)
        if not ret:
            g.log.error("Failed to bring glusterd up")
            return False
        ret = peer_probe_servers(cls.mnode, cls.servers)
        if not ret:
            g.log.error("Failed to peer probe servers")
            return False

        # Best-effort unmount/cleanup of the client mountpoints.
        for client in cls.clients:
            cmd_list = ("umount /mnt/*", "rm -rf /mnt/*")
            for cmd in cmd_list:
                # Fix: g.run returns (retcode, stdout, stderr); the
                # original tested the whole tuple, which is always
                # truthy, so the error was logged unconditionally.
                ret, _, _ = g.run(client, cmd, "root")
                if ret:
                    g.log.error(
                        "failed to unmount/already unmounted {}".format(
                            client))
    return True
def test_glusterd_services(self):
    """Test restart, stop, start of glusterd """
    # Cycle 1: restart glusterd everywhere and confirm it is active.
    g.log.info("Restart glusterd on all servers %s", self.servers)
    restarted = restart_glusterd(self.servers)
    self.assertTrue(
        restarted,
        ("Failed to restart glusterd on all servers %s", self.servers))
    g.log.info("Successfully restarted glusterd on all servers %s",
               self.servers)

    g.log.info(
        "Check if glusterd is running on all servers %s"
        "(expected: active)", self.servers)
    rcode = is_glusterd_running(self.servers)
    self.assertEqual(
        rcode, 0,
        ("Glusterd is not running on all servers %s", self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Cycle 2: stop glusterd everywhere and confirm nothing is running.
    g.log.info("Stop glusterd on all servers %s", self.servers)
    stopped = stop_glusterd(self.servers)
    self.assertTrue(
        stopped,
        ("Failed to stop glusterd on all servers %s", self.servers))
    g.log.info("Successfully stopped glusterd on all servers %s",
               self.servers)

    g.log.info(
        "Check if glusterd is running on all servers %s"
        "(expected: not running)", self.servers)
    rcode = is_glusterd_running(self.servers)
    self.assertNotEqual(rcode, 0, ("Glusterd is still running on some "
                                   "servers %s", self.servers))
    g.log.info("Glusterd not running on any servers %s as expected.",
               self.servers)

    # Cycle 3: start glusterd again and confirm it is active.
    g.log.info("Start glusterd on all servers %s", self.servers)
    started = start_glusterd(self.servers)
    self.assertTrue(
        started,
        ("Failed to start glusterd on all servers %s", self.servers))
    g.log.info("Successfully started glusterd on all servers %s",
               self.servers)

    g.log.info(
        "Check if glusterd is running on all servers %s"
        "(expected: active)", self.servers)
    rcode = is_glusterd_running(self.servers)
    self.assertEqual(
        rcode, 0,
        ("Glusterd is not running on all servers %s", self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)
def test_glusterd_restart_stop_start(self):
    """Tests glusterd stop, start, restart and validate if all
    peers are in connected state after glusterd restarts.
    """
    # Restart glusterd and verify the service is active everywhere.
    status = restart_glusterd(self.servers)
    self.assertTrue(
        status,
        ("Failed to restart glusterd on all servers %s", self.servers))
    g.log.info("Successfully restarted glusterd on all servers %s",
               self.servers)

    rcode = is_glusterd_running(self.servers)
    self.assertEqual(
        rcode, 0,
        ("Glusterd is not running on all servers %s", self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Stop glusterd and verify no instance is left running.
    status = stop_glusterd(self.servers)
    self.assertTrue(
        status,
        ("Failed to stop glusterd on all servers %s", self.servers))
    g.log.info("Successfully stopped glusterd on all servers %s",
               self.servers)

    rcode = is_glusterd_running(self.servers)
    self.assertNotEqual(rcode, 0, ("Glusterd is still running on some "
                                   "servers %s", self.servers))
    g.log.info("Glusterd not running on any servers %s as expected.",
               self.servers)

    # Start glusterd back and verify it is active again.
    status = start_glusterd(self.servers)
    self.assertTrue(
        status,
        ("Failed to start glusterd on all servers %s", self.servers))
    g.log.info("Successfully started glusterd on all servers %s",
               self.servers)

    rcode = is_glusterd_running(self.servers)
    self.assertEqual(
        rcode, 0,
        ("Glusterd is not running on all servers %s", self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Wait for all the glusterd's to establish communication.
    time.sleep(30)

    # Every peer must be connected once the daemons settle.
    status = self.validate_peers_are_connected()
    self.assertTrue(status, "Validating Peers to be in Cluster Failed")

    self.test_method_complete = True
def test_setting_vol_option_with_max_characters(self):
    """Set a very long auth.allow value (below and then above 4096
    characters) and verify glusterd survives a restart each time."""
    created = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertTrue(created, ("Failed to create "
                              "and start volume %s" % self.volname))

    # Build a long comma-separated allow list: 256 + 7 addresses.
    addresses = ['192.168.122.%d' % octet for octet in range(256)]
    addresses += ['192.168.123.%d' % octet for octet in range(7)]
    ip_list = ','.join(addresses)

    # set auth.allow with <4096 characters and restart the glusterd
    g.log.info("Setting auth.allow with string of length %d for %s",
               len(ip_list), self.volname)
    self.options = {"auth.allow": ip_list}
    applied = set_volume_options(self.mnode, self.volname, self.options)
    self.assertTrue(applied,
                    ("Failed to set auth.allow with string of length"
                     " %d for %s" % (len(ip_list), self.volname)))
    applied = restart_glusterd(self.mnode)
    self.assertTrue(applied,
                    "Failed to restart the glusterd on %s" % self.mnode)

    # set auth.allow with >4096 characters and restart the glusterd
    ip_list = ip_list + ",192.168.123.7"
    self.options = {"auth.allow": ip_list}
    g.log.info("Setting auth.allow with string of length %d for %s",
               len(ip_list), self.volname)
    applied = set_volume_options(self.mnode, self.volname, self.options)
    self.assertTrue(applied,
                    ("Failed to set auth.allow with string of length"
                     " %d for %s" % (len(ip_list), self.volname)))
    applied = restart_glusterd(self.mnode)
    self.assertTrue(applied,
                    "Failed to restart the glusterd on %s" % self.mnode)

    # Poll up to 60 * 2s = 120s for glusterd to report running (0).
    attempt = 0
    while attempt < 60:
        status = is_glusterd_running(self.mnode)
        if not status:
            break
        sleep(2)
        attempt += 1
    self.assertEqual(status, 0,
                     "glusterd is not running on %s" % self.mnode)
def test_setting_vol_option_with_max_characters(self):
    """Set a very long auth.allow value (below and then above 4096
    characters) and verify glusterd comes back after each restart."""
    created = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertTrue(created, ("Failed to create "
                              "and start volume %s" % self.volname))

    # Build a long comma-separated allow list: 256 + 7 addresses.
    addresses = ['192.168.122.%d' % octet for octet in range(256)]
    addresses += ['192.168.123.%d' % octet for octet in range(7)]
    ip_list = ','.join(addresses)

    # set auth.allow with <4096 characters and restart the glusterd
    g.log.info("Setting auth.allow with string of length %d for %s",
               len(ip_list), self.volname)
    self.options = {"auth.allow": ip_list}
    applied = set_volume_options(self.mnode, self.volname, self.options)
    self.assertTrue(applied,
                    ("Failed to set auth.allow with string of length"
                     " %d for %s" % (len(ip_list), self.volname)))
    applied = restart_glusterd(self.mnode)
    self.assertTrue(applied,
                    "Failed to restart the glusterd on %s" % self.mnode)

    # set auth.allow with >4096 characters and restart the glusterd
    ip_list = ip_list + ",192.168.123.7"
    self.options = {"auth.allow": ip_list}
    g.log.info("Setting auth.allow with string of length %d for %s",
               len(ip_list), self.volname)
    applied = set_volume_options(self.mnode, self.volname, self.options)
    self.assertTrue(applied,
                    ("Failed to set auth.allow with string of length"
                     " %d for %s" % (len(ip_list), self.volname)))
    applied = restart_glusterd(self.mnode)
    self.assertTrue(applied,
                    "Failed to restart the glusterd on %s" % self.mnode)

    # Block until glusterd is back up on every server.
    up = wait_for_glusterd_to_start(self.servers)
    self.assertTrue(up, "glusterd is not running on %s" % self.servers)
    g.log.info("Glusterd start on the nodes : %s "
               "succeeded", self.servers)
def test_snap_info_glusterd_restart(self):
    """
    Verify snapshot info before and after glusterd restart

    * Create multiple snapshots
    * Check snapshot info
      - Without using snapname or volname
      - Using snapname
      - Using volname
    * Restart glusterd on all servers
    * Repeat the snapshot info step for all the three scenarios
      mentioned above
    """
    # pylint: disable=too-many-statements
    # Create every snapshot with a description attached.
    for snap_name in self.snapshots:
        ret, _, _ = snap_create(self.mnode, self.volname, snap_name,
                                description='$p3C!@l C#@R@cT#R$')
        self.assertEqual(
            ret, 0, ("Failed to create snapshot for volume %s"
                     % self.volname))
        g.log.info("Snapshot %s created successfully for volume %s",
                   snap_name, self.volname)

    # Perform the snapshot info tests before glusterd restart
    self.snapshot_info()

    # Restart Glusterd on all servers, one node at a time.
    for node in self.servers:
        self.assertTrue(restart_glusterd(node),
                        ("Failed to restart glusterd on node %s" % node))
        g.log.info("Successfully restarted glusterd on node %s", node)

    # Wait for glusterd to be online and validate glusterd running on
    # all server nodes.
    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Unexpected: glusterd not up on one or more of the nodes")
    g.log.info("Glusterd is up and running on all nodes")

    # Check if peers are connected
    self.assertTrue(wait_for_peers_to_connect(self.mnode, self.servers),
                    "Unexpected: Peers are not in connected state")
    g.log.info("Successful: All peers are in connected state")

    # perform the snapshot info tests after glusterd restart
    self.snapshot_info()
def tearDown(self):
    """Restart glusterd on all servers during teardown.

    Raises:
        ExecutionError: if glusterd fails to restart on any server.
    """
    # restart glusterd on all servers
    g.log.info("Restart glusterd on all servers %s", self.servers)
    ret = restart_glusterd(self.servers)
    if not ret:
        # Fix: %-format the message — a trailing comma argument to
        # ExecutionError is never interpolated into the text.
        raise ExecutionError("Failed to restart glusterd on all "
                             "servers %s" % self.servers)
    g.log.info("Successfully restarted glusterd on all servers %s",
               self.servers)

    # Calling GlusterBaseClass tearDown
    GlusterBaseClass.tearDown.im_func(self)
def tearDown(self):
    """Restore glusterd on the node where it was stopped, verify the
    cluster is healthy, then unmount and clean up the volume.

    Raises:
        ExecutionError: on any failure to restore the cluster state.
    """
    # restart glusterd on the stopped server
    g.log.info("Restart glusterd on %s", self.random_server)
    ret = restart_glusterd(self.random_server)
    if not ret:
        raise ExecutionError("Failed to restart glusterd %s"
                             % self.random_server)
    g.log.info("Successfully restarted glusterd on %s",
               self.random_server)

    # Check if glusterd is running on all servers(expected: active)
    g.log.info("Check if glusterd is running on all servers %s"
               "(expected: active)", self.servers)
    ret = is_glusterd_running(self.servers)
    if ret != 0:
        raise ExecutionError("Glusterd is not running on all servers"
                             " %s" % self.servers)
    g.log.info("Glusterd is running on all the servers "
               "%s", self.servers)

    # Poll (up to 80 * 2s = 160s) for every peer to reconnect; the
    # second, unconditional re-validation the original did after this
    # loop was redundant and has been folded in.
    count = 0
    while count < 80:
        ret = self.validate_peers_are_connected()
        if ret:
            break
        sleep(2)
        count += 1
    if not ret:
        # Fix: the original raised the misleading success message
        # "All peers are in connected state" on this failure path.
        raise ExecutionError("Peers are not in connected state even "
                             "after 160 seconds")
    g.log.info("All peers are in connected state")

    # Unmount and cleanup original volume
    g.log.info("Starting to Unmount Volume and Cleanup Volume")
    ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
    if not ret:
        raise ExecutionError("Failed to umount the vol & cleanup Volume")
    g.log.info("Successful in umounting the volume and Cleanup")

    # Calling GlusterBaseClass tearDown
    self.get_super_method(self, 'tearDown')()
def tearDown(self):
    """In case of any failure restart glusterd on all servers and
    verify peers reconnect.

    Raises:
        ExecutionError: if glusterd cannot be restarted or the peers
            do not reconnect.
    """
    if not self.test_method_complete:
        # restart glusterd on all servers
        g.log.info("Restart glusterd on all servers")
        ret = restart_glusterd(self.servers)
        if not ret:
            # Fix: raise ExecutionError instead of self.assertTrue —
            # assertion helpers in tearDown misreport cleanup problems
            # as test failures (other tearDowns here use ExecutionError).
            raise ExecutionError("Failed to restart glusterd on all "
                                 "servers")
        g.log.info("Successfully restarted glusterd on all servers")

        # Wait for all the glusterd's to establish communication.
        time.sleep(30)

        # Validate all the peers are in connected state
        g.log.info("Validating all the peers are in Cluster and Connected")
        ret = self.are_peers_in_connected_state()
        if not ret:
            raise ExecutionError("Validating Peers to be in Cluster "
                                 "Failed")
        g.log.info("All peers are in connected state")

    GlusterBaseClass.tearDown.im_func(self)
def test_snap_info(self):
    """
    1. Create volumes
    2. create multiple snapshots
    3. Check snapshot info for snapshots created
       using snap name, using volume name and without
       using snap name and volume name
    4. restart glusterd
    5. follow step 3
    """
    # pylint: disable=too-many-statements
    # Creating snapshot with description
    g.log.info("Starting to Create snapshot")
    for count in range(0, 2):
        self.snap = "snap%s" % count
        ret, _, _ = snap_create(self.mnode, self.volname,
                                self.snap,
                                description='$p3C!@l C#@R@cT#R$')
        self.assertEqual(
            ret, 0, ("Failed to create snapshot for volume %s"
                     % self.volname))
        g.log.info("Snapshot %s created successfully"
                   " for volume %s", self.snap, self.volname)

    # Snapshot info checks before the glusterd restart.
    self.snapshot_info()

    # Restart Glusterd on all node
    g.log.info("Restarting Glusterd on all node")
    ret = restart_glusterd(self.servers)
    # Fix: messages previously claimed glusterd was being *stopped*.
    self.assertTrue(ret, "Failed to restart glusterd")
    g.log.info("Successfully restarted glusterd on all node")

    # Check Glusterd status (is_glusterd_running returns 0 when active)
    g.log.info("Check glusterd running or not")
    ret = is_glusterd_running(self.servers)
    # Fix: the failure message was inverted ("glusterd running on node")
    # and the success log said "glusterd is not running".
    self.assertEqual(ret, 0,
                     "glusterd is not running on nodes %s" % self.servers)
    g.log.info("glusterd is running on all the nodes")

    # Snapshot info checks must still pass after the restart.
    self.snapshot_info()
def tearDown(self):
    """
    Cleanup the volumes
    """
    # If a test left glusterd down on the second node, bring it back.
    if self.glusterd_is_stopped:
        if not restart_glusterd(self.servers[1]):
            raise ExecutionError("Failed to start glusterd on node: %s"
                                 % self.servers[1])
        if not wait_for_glusterd_to_start(self.servers[1]):
            raise ExecutionError("Glusterd is not yet started on node: %s"
                                 % self.servers[1])

    # Remove every volume in the cluster.
    volumes = get_volume_list(self.mnode)
    if volumes is None:
        raise ExecutionError("Failed to get the volume list")
    for volume in volumes:
        if not cleanup_volume(self.mnode, volume):
            raise ExecutionError("Unable to delete volume %s" % volume)

    # Disable multiplex
    if not set_volume_options(self.mnode, 'all',
                              {'cluster.brick-multiplex': 'disable'}):
        raise ExecutionError("Failed to disable brick mux in cluster")

    # Peer probe detached servers
    for node in nodes_from_pool_list(self.mnode):
        peer_detach(self.mnode, node)
    if not peer_probe_servers(self.mnode, self.servers):
        raise ExecutionError("Failed to probe detached "
                             "servers %s" % self.servers)

    # Calling baseclass tearDown method
    self.get_super_method(self, 'tearDown')()
def tearDown(self):
    """Restore glusterd on the node where it was stopped and verify
    the cluster is healthy again.

    Raises:
        ExecutionError: on any failure to restore the cluster state.
    """
    # restart glusterd on the stopped server
    g.log.info("Restart glusterd on %s", self.random_server)
    ret = restart_glusterd(self.random_server)
    if not ret:
        raise ExecutionError("Failed to restart glusterd %s"
                             % self.random_server)
    g.log.info("Successfully restarted glusterd on %s",
               self.random_server)

    # Check if glusterd is running on all servers(expected: active)
    g.log.info("Check if glusterd is running on all servers %s"
               "(expected: active)", self.servers)
    ret = is_glusterd_running(self.servers)
    if ret != 0:
        raise ExecutionError("Glusterd is not running on all servers"
                             " %s" % self.servers)
    g.log.info("Glusterd is running on all the servers "
               "%s", self.servers)

    # Poll (up to 80 * 2s = 160s) for every peer to reconnect; the
    # second, unconditional re-validation the original did after this
    # loop was redundant and has been folded in.
    count = 0
    while count < 80:
        ret = self.validate_peers_are_connected()
        if ret:
            break
        sleep(2)
        count += 1
    if not ret:
        # Fix: the original raised the misleading success message
        # "All peers are in connected state" on this failure path.
        raise ExecutionError("Peers are not in connected state even "
                             "after 160 seconds")
    g.log.info("All peers are in connected state")
def tearDown(self):
    """Bring glusterd back on the stopped nodes, clean up all volumes,
    re-probe detached peers and remove statedump files."""
    # Restart glusterd on nodes for which it was stopped
    if not restart_glusterd(self.servers[3:5]):
        raise ExecutionError("Failed to restart glusterd on nodes: %s"
                             % self.servers[3:5])

    # Wait for glusterd to be online and validate it's running.
    if not wait_for_glusterd_to_start(self.servers[3:5]):
        raise ExecutionError("Glusterd not up on the servers: %s"
                             % self.servers[3:5])

    # clean up all volumes
    volumes = get_volume_list(self.mnode)
    if volumes is None:
        raise ExecutionError("Failed to get the volume list")
    for volume in volumes:
        if not cleanup_volume(self.mnode, volume):
            raise ExecutionError("Unable to delete volume %s" % volume)
        g.log.info("Volume deleted successfully : %s", volume)

    # Peer probe detached servers
    if not peer_probe_servers(self.mnode, self.servers[1:3]):
        raise ExecutionError("Failed to probe detached "
                             "servers %s" % self.servers[1:3])

    # Remove all the statedump files created in the test
    rcode, _, _ = g.run(self.mnode,
                        "rm -rf /var/run/gluster/glusterdump.*")
    if rcode:
        raise ExecutionError("Failed to clear out the statedump files")

    self.get_super_method(self, 'tearDown')()
def test_root_squash_enable(self):
    """
    Tests to verify Nfs Ganesha rootsquash functionality when glusterd
    service is restarted
    Steps:
    1. Create some files and dirs inside mount point
    2. Set permission as 777 for mount point
    3. Enable root-squash on volume
    4. Create some more files and dirs
    5. Restart glusterd on all the nodes
    6. Try to edit file created in step 1
       It should not allow to edit the file
    7. Try to edit the file created in step 5
       It should allow to edit the file
    """
    # Start IO on mount point (these files are created by root).
    cmd = ("for i in {1..10}; do touch %s/file$i; done"
           % self.mounts[0].mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd,
                        user=self.mounts[0].user)
    self.assertEqual(ret, 0, err)

    # Get stat of all the files/dirs created.
    ret = get_mounts_stat(self.mounts)
    self.assertTrue(ret, "Stat failed on some of the clients")
    g.log.info("Successful in getting stats of files/dirs "
               "from mount point")

    # Set mount point permission to 777
    ret = set_file_permissions(self.mounts[0].client_system,
                               self.mounts[0].mountpoint, 777)
    self.assertTrue(ret, "Failed to set permission for directory")
    g.log.info("Successfully set permissions for directory")

    # Enable root-squash on volume
    ret = set_root_squash(self.servers[0], self.volname)
    self.assertTrue(ret, "Failed to enable root-squash on volume")
    g.log.info("root-squash is enable on the volume")

    # Start IO on mount point again; with root-squash active these
    # files are owned by the squashed (nfsnobody) user.
    cmd = ("for i in {1..10}; do touch %s/Squashfile$i; done"
           % self.mounts[0].mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd,
                        user=self.mounts[0].user)
    self.assertEqual(ret, 0, err)

    # Get stat of all the files/dirs created.
    ret = get_mounts_stat(self.mounts)
    self.assertTrue(ret, "Stat failed on some of the clients")
    # Typo fix: "Successfull" -> "Successful".
    g.log.info("Successful in getting stats of files/dirs "
               "from mount point")

    # Restart glusterd on all servers
    ret = restart_glusterd(self.servers)
    self.assertTrue(ret, ("Failed to restart glusterd on all servers %s",
                          self.servers))
    g.log.info("Successfully restarted glusterd on all servers %s",
               self.servers)

    # Check if glusterd is running on all servers
    ret = is_glusterd_running(self.servers)
    self.assertEqual(ret, 0, ("Failed:Glusterd is not running on all "
                              "servers %s", self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Checking if peer is connected.
    ret = wait_for_peers_to_connect(self.mnode, self.servers)
    self.assertTrue(ret, "Failed:Peer is not in connected state.")
    g.log.info("Peers are in connected state.")

    # Edit file created by root user — root-squash must deny this.
    for mount_obj in self.mounts:
        ret = append_string_to_file(mount_obj.client_system,
                                    "%s/file10" % mount_obj.mountpoint,
                                    'hello')
        self.assertFalse(ret, "Unexpected:nfsnobody user editing file "
                              "created by root user should FAIL")
        g.log.info("Successful:nfsnobody user failed to edit file "
                   "created by root user")

    # Edit the file created by nfsnobody user — this must succeed.
    for mount_obj in self.mounts:
        ret = append_string_to_file(mount_obj.client_system,
                                    "%s/Squashfile5"
                                    % mount_obj.mountpoint,
                                    'hello')
        self.assertTrue(ret, "Unexpected:nfsnobody user failed to edit "
                             "the file created by nfsnobody user")
        g.log.info("Successful:nfsnobody user successfully edited the "
                   "file created by nfsnobody user")
def test_heal_for_conservative_merge_with_two_bricks_blame(self):
    """
    1) Create 1x3 volume and fuse mount the volume
    2) On mount created a dir dir1
    3) Pkill glusterfsd on node n1 (b2 on node2 and b3 and node3 up)
    4) touch f{1..10} on the mountpoint
    5) b2 and b3 xattrs would be blaming b1 as files are created while
       b1 is down
    6) Reset the b3 xattrs to NOT blame b1 by using setattr
    7) Now pkill glusterfsd of b2 on node2
    8) Restart glusterd on node1 to bring up b1
    9) Now bricks b1 online , b2 down, b3 online
    10) touch x{1..10} under dir1 itself
    11) Again reset xattr on node3 of b3 so that it doesn't blame b2,
        as done for b1 in step 6
    12) Do restart glusterd on node2 hosting b2 to bring all bricks
        online
    13) Check for heal info, split-brain and arequal for the bricks
    """
    # pylint: disable=too-many-locals
    # Create dir `dir1/` on mountpont
    path = self.mounts[0].mountpoint + "/dir1"
    ret = mkdir(self.mounts[0].client_system, path, parents=True)
    self.assertTrue(ret, "Directory {} creation failed".format(path))

    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
    brick1, brick2, brick3 = all_bricks

    # Bring first brick offline
    self._bring_brick_offline_and_check(brick1)

    # touch f{1..10} files on the mountpoint
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch f$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Check b2 and b3 xattrs are blaming b1 and are same
    self.assertEqual(self._get_fattr_for_the_brick(brick2),
                     self._get_fattr_for_the_brick(brick3),
                     "Both the bricks xattrs are not blaming "
                     "brick: {}".format(brick1))

    # Reset the xattrs of dir1 on b3 for brick b1
    first_xattr_to_reset = "trusted.afr.{}-client-0".format(self.volname)
    xattr_value = "0x000000000000000000000000"
    host, brick_path = brick3.split(":")
    brick_path = brick_path + "/dir1"
    ret = set_fattr(host, brick_path, first_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Kill brick2 on the node2
    self._bring_brick_offline_and_check(brick2)

    # Restart glusterd on node1 to bring the brick1 online
    self.assertTrue(restart_glusterd([brick1.split(":")[0]]), "Unable to "
                    "restart glusterd")
    # checking for peer status post glusterd restart
    self._check_peers_status()

    # Check if the brick b1 on node1 is online or not
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
    self.assertIn(brick1, online_bricks, "Brick:{} is still offline after "
                  "glusterd restart".format(brick1))

    # Create 10 files under dir1 naming x{1..10}
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch x$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Reset the xattrs from brick3 on to brick2
    second_xattr_to_reset = "trusted.afr.{}-client-1".format(self.volname)
    ret = set_fattr(host, brick_path, second_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Bring brick2 online
    self.assertTrue(restart_glusterd([brick2.split(":")[0]]), "Unable to "
                    "restart glusterd")
    self._check_peers_status()

    # Fix: supply a failure message (the original assert had none).
    self.assertTrue(are_bricks_online(self.mnode, self.volname, [brick2]),
                    "Brick {} is not online".format(brick2))

    # Check are there any files in split-brain and heal completion
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     "Some files are in split brain for "
                     "volume: {}".format(self.volname))
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    "Conservative merge of files failed")

    # Check arequal checksum of all the bricks is same
    ret, arequal_from_the_bricks = collect_bricks_arequal(all_bricks)
    # Fix: these two failure messages were phrased as success messages
    # ("Arequal is collected successfully…" / "Arequal is same…").
    self.assertTrue(ret, "Failed to collect arequal on the bricks in "
                         "the subvol {}".format(all_bricks))
    self.assertEqual(len(set(arequal_from_the_bricks)), 1,
                     "Arequal is not same on all the bricks in the "
                     "subvol")
def test_snap_delete_and_list_glusterd_down(self):
    # pylint: disable=too-many-statements
    """
    Steps:
    1. create a volume
    2. mount volume
    3. create 3 snapshot of that volume
    4. delete snapshot snap1
    5. list all snapshots created
    6. restart glusterd
    7. list all snapshots created
       except snap1
    """
    # Creating snapshot:
    g.log.info("Starting to Create snapshot")
    for snap_count in range(0, 3):
        self.snap = "snap%s" % snap_count
        ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
        self.assertEqual(ret, 0, ("Failed to create snapshot for "
                                  "volume %s" % self.volname))
        g.log.info("Snapshot %s created successfully "
                   "for volume %s", self.snap, self.volname)

    # delete snap1 snapshot
    g.log.info("Starting to Delete snapshot snap1")
    ret, _, _ = snap_delete(self.mnode, "snap1")
    # Fix: the original adjacent string literals joined without a
    # space ("Failed to deletesnapshot snap1").
    self.assertEqual(ret, 0, "Failed to delete "
                             "snapshot snap1")
    g.log.info("Snapshots snap1 deleted Successfully")

    # snapshot list: only 2 of the 3 snapshots should remain.
    g.log.info("Starting to list all snapshots")
    out = get_snap_list(self.mnode)
    self.assertIsNotNone(out, "Failed to list all snapshots")
    self.assertEqual(len(out), 2, "Failed to validate snap list")
    g.log.info("Successfully validated snap list")

    # restart Glusterd
    g.log.info("Restarting Glusterd on all nodes")
    ret = restart_glusterd(self.servers)
    # Fix: add the missing space before the server list.
    self.assertTrue(ret, "Failed to restart glusterd on nodes "
                         "%s" % self.servers)
    g.log.info("Successfully restarted glusterd on nodes"
               " %s", self.servers)

    # check glusterd running: poll up to 80 * 2s = 160s for all the
    # daemons to report active (return code 0).
    g.log.info("Checking glusterd is running or not")
    count = 0
    while count < 80:
        ret = is_glusterd_running(self.servers)
        if ret == 0:
            break
        time.sleep(2)
        count += 1
    self.assertEqual(ret, 0, "Failed to validate glusterd "
                             "running on nodes %s" % self.servers)
    g.log.info("glusterd is running on "
               "nodes %s", self.servers)

    # snapshot list from every node; snap1 must be gone everywhere.
    g.log.info("Starting to list all snapshots")
    for server in self.servers[0:]:
        out = get_snap_list(server)
        # Fix: missing spaces in the concatenated failure messages.
        self.assertIsNotNone(out, "Failed to list snap in node "
                                  "%s" % server)
        self.assertEqual(len(out), 2, "Failed to validate snap list "
                                      "on node %s" % server)
        g.log.info("Successfully validated snap list on node %s", server)
def test_enabling_gluster_debug_mode(self):
    # pylint: disable=too-many-statements
    """
    Testcase:
    1. Stop glusterd.
    2. Change log level to DEBUG in the glusterd systemd unit file.
    3. Archive the old glusterd log.
    4. Reload the daemon units and start glusterd.
    5. Restart glusterd on another node to generate handshake traffic.
    6. Check for debug messages in the glusterd log.
    """
    # Stop glusterd
    ret = stop_glusterd(self.mnode)
    self.assertTrue(ret, "Failed to stop glusterd on %s" % self.mnode)
    g.log.info("Successfully stopped glusterd.")

    # Change log level in /usr/lib/systemd/system/glusterd.service
    # to DEBUG
    glusterd_file = "/usr/lib/systemd/system/glusterd.service"
    ret = find_and_replace_in_file(self.mnode, 'LOG_LEVEL=INFO',
                                   'LOG_LEVEL=DEBUG', glusterd_file)
    self.assertTrue(ret, "Unable to change Log_LEVEL to DEBUG.")

    # Archive old glusterd.log file.
    ret = move_file(self.mnode, '/var/log/glusterfs/glusterd.log',
                    '/var/log/glusterfs/old.log')
    self.assertTrue(ret, "Renaming the glusterd log is failed")
    g.log.info("Successfully renamed glusterd.log file.")

    # Daemon reloading as the unit file of the daemon changed
    ret = daemon_reload(self.mnode)
    # BUG FIX: failure message used to read "Daemon reloaded
    # successfully", which is misleading when the assert fires.
    self.assertTrue(ret, "Failed to reload the daemon on %s" % self.mnode)

    # Start glusterd
    ret = start_glusterd(self.mnode)
    self.assertTrue(ret, "Failed to start glusterd on %s" % self.mnode)
    g.log.info('Successfully to started glusterd.')

    # Wait until glusterd is running: is_glusterd_running() returns 0
    # when the service is up.
    # BUG FIX: the loop used to break when ret was NON-zero (glusterd
    # down), which always burned the full 120s wait in the success case
    # and gave zero retries in the failure case.
    count = 0
    while count < 60:
        ret = is_glusterd_running(self.mnode)
        if not ret:
            break
        sleep(2)
        count += 1
    self.assertEqual(ret, 0, "glusterd is not running on %s" % self.mnode)
    g.log.info('glusterd is running after changing log_level to debug.')

    # Instead of executing commands in loop, if glusterd is restarted in
    # one of the nodes in the cluster the handshake messages
    # will be in debug mode.
    ret = restart_glusterd(self.servers[1])
    # BUG FIX: failure message used to read "restarted successfully".
    self.assertTrue(ret, "Failed to restart glusterd on %s"
                    % self.servers[1])

    # Same inverted-condition fix as above for the second wait loop.
    count = 0
    while count < 60:
        ret = is_glusterd_running(self.mnode)
        if not ret:
            break
        sleep(2)
        count += 1
    self.assertEqual(ret, 0, "glusterd is not running on %s" % self.mnode)
    g.log.info('glusterd is running after changing log_level to debug.')

    # Check glusterd logs for debug messages (' D ' marks DEBUG lines)
    glusterd_log_file = "/var/log/glusterfs/glusterd.log"
    ret = check_if_pattern_in_file(self.mnode, ' D ', glusterd_log_file)
    self.assertEqual(ret, 0, "Debug messages are not present in log.")
    g.log.info("Debug messages are present in log.")
def test_restart_glusterd_after_rebalance(self):
    """
    Verify a completed rebalance is not re-triggered by a glusterd
    restart.

    Steps:
    1. Log volume info/status, expand the volume, wait for all volume
       processes to come online.
    2. Start rebalance and wait for it to complete.
    3. Restart glusterd on all servers and verify it is running.
    4. Verify no rebalance process got re-triggered on any server.
    """
    # Log Volume Info and Status before expanding the volume.
    g.log.info("Logging volume info and Status before expanding volume")
    # BUG FIX: self.volume (the volume config dict) was being passed
    # where the volume *name* is expected, and the return was ignored.
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to log volume info and status of "
                    "volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status of "
               "volume %s", self.volname)

    # Expanding volume by adding bricks to the volume
    g.log.info("Start adding bricks to volume")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
    g.log.info("Volume %s: Expand success", self.volname)

    # Wait for gluster processes to come online
    g.log.info("Wait for gluster processes to come online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=600)
    self.assertTrue(ret, ("Volume %s: one or more volume process are "
                          "not up", self.volname))
    g.log.info("All volume %s processes are online", self.volname)

    # Log Volume Info and Status after expanding the volume
    g.log.info("Logging volume info and Status after expanding volume")
    # Same fix as above: pass the name and check the result.
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to log volume info and status of "
                    "volume %s" % self.volname)

    # Start Rebalance
    g.log.info("Starting rebalance on the volume")
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start rebalance on %s ",
                              self.volname))
    g.log.info("Successfully started rebalance on %s ", self.volname)

    # Wait for rebalance to complete
    g.log.info("Waiting for rebalance to complete")
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                          "%s", self.volname))
    g.log.info("Rebalance is successfully complete on the volume %s",
               self.volname)

    # restart glusterd on all servers
    g.log.info("Restart glusterd on all servers %s", self.servers)
    ret = restart_glusterd(self.servers)
    self.assertTrue(ret, ("Failed to restart glusterd on all servers %s",
                          self.servers))
    g.log.info("Successfully restarted glusterd on all servers %s",
               self.servers)

    # Check if glusterd is running on all servers(expected: active)
    g.log.info("Check if glusterd is running on all servers %s"
               "(expected: active)", self.servers)
    ret = is_glusterd_running(self.servers)
    self.assertEqual(ret, 0, ("Glusterd is not running on all servers %s",
                              self.servers))
    g.log.info("Glusterd is running on all the servers %s", self.servers)

    # Check if rebalance process has started after glusterd restart:
    # a completed rebalance must NOT restart, so pgrep must find nothing.
    g.log.info("Checking if rebalance process has started after "
               "glusterd restart")
    for server in self.servers:
        ret, _, _ = g.run(server, "pgrep rebalance")
        self.assertNotEqual(ret, 0, ("Rebalance process is triggered on "
                                     "%s after glusterd restart", server))
        g.log.info("Rebalance is NOT triggered on %s after glusterd "
                   "restart", server)
def test_glustershd_with_restarting_glusterd(self):
    """
    Test Script to verify the self heal daemon process with restarting
    glusterd and rebooting the server

    * stop all volumes
    * restart glusterd - should not run self heal daemon process
    * start replicated involved volumes
    * single self heal daemon process running
    * restart glusterd
    * self heal daemon pid will change
    * bring down brick and restart glusterd
    * self heal daemon pid will change and its different from previous
    * brought up the brick
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']

    # stop the volume
    g.log.info("Stopping the volume %s", self.volname)
    ret = volume_stop(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to stop volume %s" % self.volname))
    g.log.info("Successfully stopped volume %s", self.volname)

    # check the self heal daemon process after stopping the volume:
    # shd must NOT be running for a stopped volume.
    g.log.info("Verifying the self heal daemon process for "
               "volume %s", self.volname)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is still running "
                           "even after stopping volume %s" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume %s", self.volname)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s",
               nodes)
    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Failed to start glusterd on %s" % self.servers)

    # check the self heal daemon process after restarting glusterd
    # process: restarting glusterd for a stopped volume must not spawn shd.
    g.log.info("Starting to get self-heal daemon process on"
               " nodes %s", nodes)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is running after "
                           "glusterd restart with volume %s in "
                           "stop state" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume and restarting glusterd %s", self.volname)

    # start the volume
    g.log.info("Starting the volume %s", self.volname)
    ret = volume_start(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to start volume %s" % self.volname))
    g.log.info("Volume %s started successfully", self.volname)

    # Verfiy glustershd process releases its parent process
    g.log.info("Checking whether glustershd process is daemonized or not")
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Single self heal daemon process on all nodes %s", nodes)

    # get the self heal daemon pids after starting volume; these serve
    # as the baseline for the pid-change checks below.
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Successful in getting self heal daemon pids")
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info
    # with glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different from "
                          "glustershd server volume file. "
                          "Please check log file for details."))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verfiy glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after starting volume and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_glusterd_restart = pids

    # shd must have been re-spawned by the glusterd restart, so its pid
    # must differ from the baseline.
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_glusterd_restart,
                        ("Self Heal Daemon pids are same after "
                         "restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different before and "
               "after restarting glusterd process")

    # select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    # filter(None, ...) drops empty entries from the concatenated lists.
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # bring bricks offline
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # restart glusterd after brought down the brick
    g.log.info("Restart glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verfiy glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after killing brick and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_killing_brick = pids

    # Each glusterd restart must re-spawn shd, so the pid must differ
    # from the previous (post-restart) snapshot as well.
    self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                        glustershd_pids_after_killing_brick,
                        ("Self Heal Daemon process are same from before "
                         "killing the brick,restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different after killing the "
               "brick, restarting the glusterd process")

    # brought the brick online
    g.log.info("bringing up the bricks : %s online",
               bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to brought the bricks online"))
    g.log.info("Successfully brought the bricks online")

    # check all bricks are online
    g.log.info("Verifying all bricka are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, ("Not all bricks are online"))
    g.log.info("All bricks are online.")
def test_glusterd_rebalance(self):
    '''
    -> Create Volume
    -> Fuse mount the volume
    -> Perform I/O on fuse mount
    -> Add bricks to the volume
    -> Perform rebalance on the volume
    -> While rebalance is in progress,
    -> restart glusterd on all the nodes in the cluster
    '''
    # Kick off deep-directory I/O on every mount asynchronously.
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount in self.mounts:
        g.log.info("Starting IO on %s:%s", mount.client_system,
                   mount.mountpoint)
        io_cmd = ("python %s create_deep_dirs_with_files "
                  "--dirname-start-num %d "
                  "--dir-depth 4 "
                  "--dir-length 6 "
                  "--max-num-of-dirs 3 "
                  "--num-of-files 25 %s" % (self.script_upload_path,
                                            self.counter,
                                            mount.mountpoint))
        async_proc = g.run_async(mount.client_system, io_cmd,
                                 user=mount.user)
        self.all_mounts_procs.append(async_proc)
        self.counter += 10

    # Block until every I/O process finishes successfully.
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Build the brick list for the expansion.
    new_bricks = form_bricks_list_to_add_brick(self.mnode, self.volname,
                                               self.servers,
                                               self.all_servers_info)

    # Grow the volume with the freshly formed bricks.
    ret, _, _ = add_brick(self.mnode, self.volname, new_bricks)
    self.assertEqual(ret, 0, "Failed to add brick to the volume %s"
                     % self.volname)
    g.log.info("Brick added successfully to the volume %s", self.volname)

    # Trigger rebalance on the expanded volume.
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(
        ret, 0, 'Failed to start rebalance on volume %s' % self.volname)
    g.log.info("Rebalance started successfully on volume %s",
               self.volname)

    # Rebalance must still be running when glusterd gets restarted.
    rebalance_status = get_rebalance_status(self.mnode, self.volname)
    if rebalance_status['aggregate']['statusStr'] != 'in progress':
        raise ExecutionError("Rebalance is not in 'in progress' state, "
                             "either rebalance is in compeleted state or"
                             " failed to get rebalance status")

    # Bounce glusterd on every node while rebalance runs.
    ret = restart_glusterd(self.servers)
    self.assertTrue(ret, "Failed to restart glusterd on servers")
    g.log.info("Glusterd restarted successfully on %s", self.servers)

    # Poll until glusterd reports running on all servers (0 == running).
    for _ in range(60):
        ret = is_glusterd_running(self.servers)
        if not ret:
            break
        sleep(2)
    self.assertEqual(ret, 0, "Glusterd is not running on some of the "
                     "servers")
    g.log.info("Glusterd is running on all servers %s", self.servers)
def _validate_snap_status_info(self):
    """Validate snapshot status via plain status, snapname and volname.

    Performs the three status checks shared by the pre- and post-restart
    phases of test_snap_status_glusterd_restart.
    """
    # Check snapshot status of all snapshots (no name filter).
    snap_stat = get_snap_status(self.mnode)
    self.assertIsNotNone(snap_stat, "failed to get snap status")
    snap_count = 0
    for snap in self.snapshots:
        self.assertEqual(snap_stat[snap_count]['name'], snap,
                         "Failed to show snapshot status")
        snap_count += 1
    g.log.info("Successfully checked snapshot status")

    # Check snapshot status using snap name.
    snap_status = get_snap_status_by_snapname(self.mnode,
                                              self.snapshots[0])
    self.assertIsNotNone(snap_status, "failed to get snap status")
    self.assertEqual(snap_status['name'], "%s" % self.snapshots[0],
                     "Failed to show snapshot "
                     "status for %s" % self.snapshots[0])
    g.log.info("Successfully checked snapshot status for %s",
               self.snapshots[0])

    # Check snapshot status using volname.
    ret, snap_vol_status, _ = snap_status_by_volname(self.mnode,
                                                     self.volname)
    # BUG FIX: message had a typo ("statue").
    self.assertEqual(ret, 0, ("Failed to get snapshot status "
                              "by volume name"))
    self.assertIsNotNone(snap_vol_status, "failed to get snap status")
    for snap in self.snapshots:
        self.assertIn(snap, snap_vol_status,
                      "Failed to validate snapshot name")
    g.log.info("Successfully validated snapshot status for %s",
               self.volname)

def test_snap_status_glusterd_restart(self):
    # pylint: disable=too-many-statements, too-many-branches
    """
    Test Case:
    1. Create volume
    2. Create two snapshots with description
    3. Check snapshot status informations with snapname, volume name
       and without snap name/volname.
    4. Restart glusterd on all nodes
    5. Follow step3 again and validate snapshot
    """
    # Creating snapshot with description
    for snap in self.snapshots:
        ret, _, _ = snap_create(self.mnode, self.volname, snap,
                                description='$p3C!@l C#@R@cT#R$')
        self.assertEqual(
            ret, 0, ("Failed to create snapshot for volume %s"
                     % self.volname))
        g.log.info("Snapshot %s created successfully"
                   " for volume %s", snap, self.volname)

    # Validate snapshot status information before glusterd restart.
    # (Shared checks factored into _validate_snap_status_info.)
    self._validate_snap_status_info()

    # Restart Glusterd on all node
    ret = restart_glusterd(self.servers)
    # BUG FIX: messages used to wrongly talk about *stopping* glusterd.
    self.assertTrue(ret, "Failed to restart glusterd")
    g.log.info("Successfully restarted glusterd on all nodes")

    # Check Glusterd status: is_glusterd_running() returns 0 if running.
    ret = is_glusterd_running(self.servers)
    # BUG FIX: assert/log messages were inverted ("glusterd running on
    # node " / "glusterd is not running").
    self.assertEqual(ret, 0, "glusterd is not running on all nodes")
    g.log.info("glusterd is running on all nodes")

    # Validate snapshot status information after glusterd restart.
    self._validate_snap_status_info()
def georep_nonroot_prerequisites(mnode, snodes, group, user, mntbroker_dir,
                                 slavevol):
    """ Setup pre-requisites for mountbroker setup

    Args:
        mnode (str) : Master node on which cmd is to be executed
        snodes (list): List of slave nodes
        group (str): Specifies a group name
        user (str): Specifies a user name
        mntbroker_dir: Mountbroker mount directory
        slavevol (str) The name of the slave volume
    Returns:
        bool: True if all pre-requisite are successful else False
    """
    # Shared storage may already be enabled from an earlier run; only
    # a failure other than "already exists" is treated as fatal.
    g.log.debug("Enable shared-storage")
    ret, _, err = create_shared_storage(mnode)
    if ret:
        if "already exists" not in err:
            g.log.error("Failed to enable shared storage on %s", mnode)
            return False

    # Create the unprivileged group on every slave node.
    g.log.debug("Create new group: %s on all slave nodes", group)
    if not georep_groupadd(snodes, group):
        g.log.error("Creating group: %s on all slave nodes failed", group)
        return False

    # Create the geo-rep user inside that group on every slave node.
    g.log.debug("Create user: %s in group: %s on all slave nodes", user,
                group)
    if not georep_geoaccount(snodes, group, user):
        g.log.error("Creating user: %s in group: %s on all slave nodes "
                    "failed", user, group)
        return False

    # Mountbroker configuration commands run on the first slave only.
    g.log.debug("Setting up mount broker root directory: %s node: %s",
                mntbroker_dir, snodes[0])
    ret, _, _ = georep_mountbroker_setup(snodes[0], group, mntbroker_dir)
    if ret:
        g.log.error("Setting up of mount broker directory failed: %s node: %s",
                    mntbroker_dir, snodes[0])
        return False

    g.log.debug("Add volume: %s and user: %s to mountbroker service",
                slavevol, user)
    ret, _, _ = georep_mountbroker_adduser(snodes[0], slavevol, user)
    if ret:
        g.log.error("Add volume: %s and user: %s to mountbroker "
                    "service failed", slavevol, user)
        return False

    # ret == 0 means the status command ran; the output still has to be
    # scanned for a "not ok" verdict.
    g.log.debug("Checking mountbroker status")
    ret, out, _ = georep_mountbroker_status(snodes[0])
    if not ret:
        if "not ok" in out:
            g.log.error("Mountbroker status not ok")
            return False
    else:
        g.log.error("Mountbroker status command failed")
        return False

    # Restart glusterd so the mountbroker settings take effect.
    g.log.debug("Restart glusterd on all slave nodes")
    if not restart_glusterd(snodes):
        g.log.error("Restarting glusterd failed")
        return False

    g.log.debug("Set passwd for user account on slave")
    if not georep_geoaccount_setpasswd(snodes, group, user,
                                       "geopasswd"):
        g.log.error("Setting password failed on slaves")
        return False

    # Passwordless SSH from master to the first slave for the geo user.
    g.log.debug("Setup passwordless SSH between %s and %s", mnode, snodes[0])
    if not georep_ssh_keygen(mnode):
        g.log.error("ssh keygen is failed on %s", mnode)
        return False
    if not georep_ssh_copyid(mnode, snodes[0], user, "geopasswd"):
        g.log.error("ssh copy-id is failed from %s to %s", mnode, snodes[0])
        return False

    return True
def bring_bricks_online(mnode, volname, bricks_list,
                        bring_bricks_online_methods=None):
    """Bring the bricks specified in the bricks_list online.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.
        bricks_list (list): List of bricks to bring them online.

    Kwargs:
        bring_bricks_online_methods (list): List of methods using which bricks
            will be brought online. The method to bring a brick online is
            randomly selected from the bring_bricks_online_methods list.
            By default all bricks will be brought online with
            ['glusterd_restart', 'volume_start_force'] methods.
            If 'volume_start_force' command is randomly selected then all the
            bricks would be started with the command execution. Hence we break
            from bringing bricks online individually

    Returns:
        bool : True on successfully bringing all bricks online.
            False otherwise
    """
    if bring_bricks_online_methods is None:
        bring_bricks_online_methods = ['glusterd_restart',
                                       'volume_start_force']
    elif not isinstance(bring_bricks_online_methods, list):
        # Allow a single method to be passed as a plain string.
        bring_bricks_online_methods = [bring_bricks_online_methods]
    g.log.info("Bringing bricks '%s' online with '%s'",
               bricks_list, bring_bricks_online_methods)

    _rc = True
    failed_to_bring_online_list = []
    # PERF FIX: the brick-multiplex state is invariant across the loop;
    # it used to be re-queried once per brick.
    brick_mux_enabled = is_brick_mux_enabled(mnode)
    for brick in bricks_list:
        bring_brick_online_method = random.choice(bring_bricks_online_methods)
        if brick_mux_enabled:
            # With brick multiplexing, a single 'volume start force'
            # brings every brick of the volume online.
            bring_bricks_online_command = ("gluster volume start %s force" %
                                           volname)
            ret, _, _ = g.run(mnode, bring_bricks_online_command)
            if ret != 0:
                g.log.error("Unable to start the volume %s with force option",
                            volname)
                _rc = False
                failed_to_bring_online_list.append(brick)
            else:
                g.log.info("Successfully restarted volume %s to bring all "
                           "the bricks '%s' online", volname, bricks_list)
                # BUG FIX: all bricks are already online at this point;
                # previously the volume was force-started again for every
                # remaining brick in the list.
                break
        elif bring_brick_online_method == 'glusterd_restart':
            # Restarting glusterd on the brick's host respawns its brick
            # process.
            brick_node, _ = brick.split(":")
            ret = restart_glusterd(brick_node)
            if not ret:
                g.log.error("Unable to restart glusterd on node %s",
                            brick_node)
                _rc = False
                failed_to_bring_online_list.append(brick)
            else:
                g.log.info("Successfully restarted glusterd on node %s to "
                           "bring back brick %s online", brick_node, brick)
        elif bring_brick_online_method == 'volume_start_force':
            bring_brick_online_command = ("gluster volume start %s force" %
                                          volname)
            ret, _, _ = g.run(mnode, bring_brick_online_command)
            if ret != 0:
                g.log.error("Unable to start the volume %s with force option",
                            volname)
                _rc = False
            else:
                g.log.info("Successfully restarted volume %s to bring all "
                           "the bricks '%s' online", volname, bricks_list)
                # Force-start brings all bricks up at once; stop iterating.
                break
        else:
            g.log.error("Invalid method '%s' to bring brick online",
                        bring_brick_online_method)
            return False

    # BUG FIX: the failure list was collected but never reported.
    if failed_to_bring_online_list:
        g.log.error("Failed to bring bricks '%s' online",
                    failed_to_bring_online_list)

    g.log.info("Waiting for 30 seconds for all the bricks to be online")
    time.sleep(30)
    return _rc
def test_remove_brick(self):
    """
    In this test case:
    1. Trusted storage Pool of 4 nodes
    2. Create a distributed-replicated volumes with 4 bricks
    3. Start the volume
    4. Fuse mount the gluster volume on out of trusted nodes
    5. Create some data file
    6. Start remove-brick operation for one replica pair
    7. Restart glusterd on all nodes
    8. Try to commit the remove-brick operation while rebalance is in
       progress, it should fail
    """
    # pylint: disable=too-many-statements
    # Build a 4-node sub-cluster out of the configured servers.
    my_servers = self.servers[0:4]
    my_server_info = {}
    for server in self.servers[0:4]:
        my_server_info[server] = self.all_servers_info[server]
    for index in range(1, 4):
        ret, _, _ = peer_probe(self.servers[0], self.servers[index])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[index]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[index])

    # Validating whether the peer are connected or not
    # In jenkins This case is failing saying peers are not in connected
    # state, that is reason adding a check whether peers are connected
    # or not
    count = 0
    while count < 30:
        ret = is_peer_connected(self.mnode, my_servers)
        if ret:
            g.log.info("Peers are in connected state")
            break
        sleep(3)
        count = count + 1
    self.assertTrue(ret, "Some peers are not in connected state")

    # Create a 2x2 distributed-replicated volume on the sub-cluster.
    self.volname = "testvol"
    bricks_list = form_bricks_list(self.mnode, self.volname, 4,
                                   my_servers, my_server_info)
    g.log.info("Creating a volume %s ", self.volname)
    kwargs = {}
    kwargs['replica_count'] = 2
    ret = volume_create(self.mnode, self.volname,
                        bricks_list, force=False, **kwargs)
    self.assertEqual(ret[0], 0, ("Unable"
                                 "to create volume %s" % self.volname))
    g.log.info("Volume created successfully %s", self.volname)

    ret, _, _ = volume_start(self.mnode, self.volname, False)
    self.assertEqual(ret, 0, ("Failed to start the "
                              "volume %s", self.volname))

    # Re-fetch the bricks as the volume reports them (ordering matters
    # for the replica-pair selection below).
    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    # Mounting a volume
    ret, _, _ = mount_volume(self.volname, mtype=self.mount_type,
                             mpoint=self.mounts[0].mountpoint,
                             mserver=self.mnode,
                             mclient=self.mounts[0].client_system)
    self.assertEqual(ret, 0, ("Volume %s is not mounted") % self.volname)
    g.log.info("Volume mounted successfully : %s", self.volname)

    self.all_mounts_procs = []
    # Creating files: a 3-level directory tree with dd-generated files,
    # run asynchronously on the client.
    command = ("cd %s/ ; "
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=128k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")

    # Start remove-brick for one replica pair; this kicks off an
    # internal rebalance (data migration) for those bricks.
    remove_brick_list = bricks_list[2:4]
    ret, _, _ = remove_brick(self.mnode, self.volname, remove_brick_list,
                             'start')
    self.assertEqual(ret, 0, "Failed to start remove brick operation")
    g.log.info("Remove bricks operation started successfully")

    g.log.info("Restart glusterd on servers %s", self.servers)
    ret = restart_glusterd(self.servers)
    self.assertTrue(
        ret, ("Failed to restart glusterd on servers %s", self.servers))
    g.log.info("Successfully restarted glusterd on servers %s",
               self.servers)

    # Committing while the remove-brick migration has not completed must
    # be rejected, hence the non-zero return code is the expected result.
    ret, _, _ = remove_brick(self.mnode, self.volname, remove_brick_list,
                             'commit')
    self.assertNotEqual(ret, 0, "Remove brick commit ops should be fail")
    g.log.info("Remove bricks commit operation failure is expected")
def test_validate_optimized_glusterd_handshake(self):
    """
    Test Case:
    1) Create a 3 node cluster
    2) Enable brick-multiplex in the cluster
    3) Create and start 2000 volumes
    4) Stop one of the node in the cluster
    5) Set an option for around 850 volumes in the cluster
    6) Restart glusterd on the previous node
    7) Check the value of the option set earlier, in the restarted node
    """
    # pylint: disable=too-many-locals
    # Enable brick-multiplex
    ret = set_volume_options(self.mnode, 'all',
                             {'cluster.brick-multiplex': 'enable'})
    self.assertTrue(ret, "Failed to enable brick mux on cluster")

    server_info_frm_three_node = {}
    for server in self.servers[:3]:
        server_info_frm_three_node[server] = self.all_servers_info[server]

    # Fetch the available bricks dict
    bricks_dict = get_servers_bricks_dict(self.servers[:3],
                                          server_info_frm_three_node)
    self.assertIsNotNone(bricks_dict, "Failed to get the bricks dict")

    # Using a custom method here because bulk_volume_creation creates
    # huge logging and does unwanted calls, which would slow down the
    # test case and use more memory.
    # Create and start 2000 volumes
    for i in range(2000):
        self.volname = "volume-%d" % i
        bricks_list = []
        j = 0
        for key, value in bricks_dict.items():
            j += 1
            brick = choice(value)
            brick = "{}:{}/{}_brick-{}".format(key, brick,
                                               self.volname, j)
            bricks_list.append(brick)

        kwargs = {'replica_count': 3}
        ret, _, _ = volume_create(self.mnode, self.volname,
                                  bricks_list, False, **kwargs)
        self.assertEqual(ret, 0, "Failed to create volume: %s"
                         % self.volname)

        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume: %s"
                         % self.volname)
    g.log.info("Successfully created and started all the volumes")

    # Stop glusterd on one node
    ret = stop_glusterd(self.servers[1])
    self.assertTrue(ret, "Failed to stop glusterd on node :%s"
                    % self.servers[1])
    self.glusterd_is_stopped = True

    # Set a volume option for 850 volumes (the stopped node will have
    # to sync these on restart).  NOTE: the comment used to say 800.
    option_value = {'network.ping-timeout': 45}
    for i in range(850):
        vol_name = "volume-" + str(i)
        ret = set_volume_options(self.mnode, vol_name, option_value)
        self.assertTrue(ret, "Failed to set volume option")

    # Start glusterd on the previous node
    ret = restart_glusterd(self.servers[1])
    self.assertTrue(ret, "Failed to start glusterd on node: %s"
                    % self.servers[1])

    ret = wait_for_glusterd_to_start(self.servers[1])
    self.assertTrue(ret, "Glusterd is not yet started on the node :%s"
                    % self.servers[1])

    # It might take some time to get the peers into connected state,
    # because of the huge number of volumes to sync.
    # BUG FIX: this used to be an unbounded 'while True' loop that could
    # hang the test run forever; bounded at ~10 minutes.
    for _ in range(600):
        ret = is_peer_connected(self.mnode, self.servers[1:3])
        if ret:
            break
        sleep(1)
    self.assertTrue(ret, "Peers are not in connected state")

    self.glusterd_is_stopped = False

    # Check the volume option set earlier is synced on restarted node
    for i in range(850):
        vol_name = "volume-" + str(i)
        # There might be a race where the restarted node has not synced
        # the volume data yet, so retry with a bounded timeout.
        # BUG FIX: this was also an unbounded 'while True' loop whose
        # trailing assertEqual could only be reached after the break
        # (i.e. it could never fail); bounded retries now fail loudly.
        for _ in range(60):
            ret = get_volume_options(self.servers[1], vol_name,
                                     'network.ping-timeout')
            self.assertTrue(ret, "Failed to get volume option")
            if ret['network.ping-timeout'] == '45':
                break
            sleep(1)
        self.assertEqual(ret['network.ping-timeout'], '45',
                         "Option value not updated in the restarted node")
def test_snap_list_glusterd_restart(self):
    """
    Verify snapshot list before and after glusterd restart

    * Create 3 snapshots of the volume
    * Delete one snapshot
    * List all snapshots created
    * Restart glusterd on all nodes
    * List all snapshots
      All snapshots must be listed except the one that was deleted
    """

    # pylint: disable=too-many-statements
    # Take a snapshot of the volume for every configured snapname
    for snap_name in self.snapshots:
        ret, _, _ = snap_create(self.mnode, self.volname, snap_name)
        self.assertEqual(ret, 0, ("Failed to create snapshot %s for "
                                  "volume %s" % (snap_name, self.volname)))
        g.log.info("Snapshot %s created successfully "
                   "for volume %s", snap_name, self.volname)

    # Fetch the snapshot list and confirm all three names are present
    listed = get_snap_list(self.mnode)
    self.assertIsNotNone(listed, "Failed to list all snapshots")
    self.assertEqual(len(listed), 3, "Failed to validate snap list")
    g.log.info("Successfully validated snap list")
    for snap_name in self.snapshots:
        self.assertIn(
            snap_name, listed, "Failed to validate the snapshot "
            "%s in the snapshot list" % snap_name)
    g.log.info("Successfully validated the presence of snapshots using "
               "snapname")

    # Remove the first snapshot
    ret, _, _ = snap_delete(self.mnode, self.snapshots[0])
    self.assertEqual(ret, 0, ("Failed to delete snapshot %s"
                              % self.snapshots[0]))
    g.log.info("Snapshots %s deleted Successfully", self.snapshots[0])

    # Re-fetch the list: only the two surviving snapshots may remain
    listed = get_snap_list(self.mnode)
    self.assertIsNotNone(listed, "Failed to list all snapshots")
    self.assertEqual(len(listed), 2, "Failed to validate snap list")
    g.log.info("Successfully validated snap list")
    for snap_name in self.snapshots[1:]:
        self.assertIn(
            snap_name, listed, "Failed to validate the snapshot "
            "%s in the snapshot list" % snap_name)
    g.log.info("Successfully validated the presence of snapshots using "
               "snapname")

    # Restart glusterd across every server in the cluster
    ret = restart_glusterd(self.servers)
    self.assertTrue(
        ret, ("Failed to restart glusterd on nodes %s" % self.servers))
    g.log.info("Successfully restarted glusterd on nodes %s", self.servers)

    # Block until the glusterd service is back up on each node
    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Unexpected: glusterd not up on one or more of the nodes")
    g.log.info("Glusterd is up and running on all nodes")

    # Confirm the peers re-established their connections
    self.assertTrue(is_peer_connected(self.mnode, self.servers),
                    "Unexpected: Peers are not in connected state")
    g.log.info("Successful: All peers are in connected state")

    # After the restart every node must report the same two snapshots,
    # i.e. all snapshots except the deleted one
    for node in self.servers:
        listed = get_snap_list(node)
        self.assertIsNotNone(
            listed, "Failed to get the list of snapshots in node %s"
            % node)
        self.assertEqual(
            len(listed), 2,
            "Unexpected: Number of snapshots not consistent in the node %s"
            % node)
        g.log.info("Successfully validated snap list for node %s", node)
        for snap_name in self.snapshots[1:]:
            self.assertIn(
                snap_name, listed, "Failed to validate the snapshot "
                "%s in the snapshot list" % snap_name)
        g.log.info(
            "Successfully validated the presence of snapshots "
            "using snapname for node %s", node)
def _assert_mem_increase_within(self, before, after, limit):
    """Assert per-node glusterd memory growth (after - before) <= limit MB."""
    for idx, current in enumerate(after):
        self.assertTrue(
            current - before[idx] <= limit,
            "Unexpected: Memory consumption"
            " glusterd increased more than the expected"
            " of value")

def test_glusterd_memory_consumption_increase(self):
    """
    Test Case:
    1) Enable brick-multiplex and set max-bricks-per-process to 3 in
       the cluster
    2) Get the glusterd memory consumption
    3) Perform create,start,stop,delete operation for 100 volumes
    4) Check glusterd memory consumption, it should not increase by
       more than 50MB
    5) Repeat steps 3-4 for two more time
    6) Check glusterd memory consumption it should not increase by
       more than 10MB
    """
    # pylint: disable=too-many-locals
    # Restarting glusterd to refresh its memory consumption
    ret = restart_glusterd(self.servers)
    self.assertTrue(ret, "Restarting glusterd failed")

    # Check if glusterd is running after the restart
    ret = wait_for_glusterd_to_start(self.servers)
    self.assertTrue(ret, "Glusterd service is not running post reboot")

    # Enable brick-multiplex, set max-bricks-per-process to 3 in cluster
    for key, value in (('cluster.brick-multiplex', 'enable'),
                       ('cluster.max-bricks-per-process', '3')):
        ret = set_volume_options(self.mnode, 'all', {key: value})
        self.assertTrue(ret, "Failed to set {} to {} "
                        " for the cluster".format(key, value))

    # Get the pid of the glusterd process on every server
    pid_list = []
    for server in self.servers:
        ret, pid, _ = g.run(server, "pidof glusterd")
        self.assertEqual(ret, 0, "Failed to get the pid of glusterd")
        # pidof may print several pids; use the first one so int()
        # doesn't fail on multi-pid output
        pid_list.append(int(pid.split()[0]))

    # Baseline memory consumed by glusterd on all the nodes
    mem_consumed_list = self._memory_consumption_for_all_nodes(pid_list)

    # Iteration 1: 100 volume operations; growth must stay within 50MB
    self._volume_operations_in_loop()
    mem_consumed_list_1 = self._memory_consumption_for_all_nodes(pid_list)
    self._assert_mem_increase_within(mem_consumed_list,
                                     mem_consumed_list_1, 50)

    # Iteration 2: growth over the previous run must stay within 10MB
    self._volume_operations_in_loop()
    mem_consumed_list_2 = self._memory_consumption_for_all_nodes(pid_list)
    self._assert_mem_increase_within(mem_consumed_list_1,
                                     mem_consumed_list_2, 10)

    # Iteration 3: growth over the previous run must stay within 10MB
    self._volume_operations_in_loop()
    mem_consumed_list_3 = self._memory_consumption_for_all_nodes(pid_list)
    self._assert_mem_increase_within(mem_consumed_list_2,
                                     mem_consumed_list_3, 10)