class NvmeFault(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate IO works fine when NVMe fault generated
                            on single or multiple servers with single drive.
    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case.

        Reads the pool/fault parameters from the test yaml, builds the dmg
        command object used for the health scan and enables fault injection
        for the IO phase.
        """
        super().setUp()
        self.no_of_pools = self.params.get("number_of_pools", '/run/pool/*', 1)
        self.capacity = self.params.get(
            "percentage", '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get(
            "count", '/run/faulttests/no_of_drives/*/')

        # dmg is configured with the same "allow_insecure" value as the
        # first server manager.
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # Set to True to generate the NVMe fault during IO
        self.set_faulty_device = True

    def test_nvme_fault(self):
        """Jira ID: DAOS-4722.

        Test Description: Test NVMe disk fault.
        Use Case: Create the large size of pool and start filling up the pool.
                  while IO is in progress remove single disks from
                  single/multiple servers.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_fault,full_regression
        """
        # Create the Pool with Maximum NVMe size
        self.create_pool_max_size(nvme=True)

        # Start the IOR Command and generate the NVMe fault.
        self.start_ior_load(percent=self.capacity)

        # Report through the test logger rather than a bare print().
        self.log.info(
            "pool_percentage_used -- After -- %s",
            self.pool.pool_percentage_used())

        # Check nvme-health command works
        try:
            self.dmg.hostlist = self.hostlist_servers
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            # Include the underlying error so the failure is diagnosable.
            self.fail(
                "dmg storage scan --nvme-health failed {}".format(error))
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """

    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: Create the configured number of pools, then verify that
                  dmg list-pools, device-health and nvme-health work for
                  all of them.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        pool_count = self.params.get("number_of_pools", '/run/pool/*')
        used_percentage = self.params.get(
            "pool_used_percentage", '/run/pool/*')
        used_fraction = used_percentage / 100
        max_sizes = self.get_max_storage_sizes()

        # Split the usable share of the storage evenly between the pools;
        # index 1 feeds the NVMe size and index 0 the SCM size.
        per_pool_nvme = int((max_sizes[1] * used_fraction) / pool_count)
        per_pool_scm = int((max_sizes[0] * used_fraction) / pool_count)

        # Create the requested number of pools, tracking each one in
        # self.pool before it is actually created.
        self.pool = []
        for index in range(pool_count):
            self.log.info("-- Creating pool number = %s", index)
            new_pool = self.get_pool(create=False)
            self.pool.append(new_pool)
            new_pool.scm_size.update(per_pool_scm, "scm_size")
            new_pool.nvme_size.update(per_pool_nvme, "nvme_size")
            new_pool.create()

        # Build the dmg command, reusing the first server's security mode.
        dmg_bin_dir = os.path.join(self.prefix, "bin")
        self.dmg = DmgCommand(dmg_bin_dir)
        self.dmg.get_params(self)
        allow_insecure = self.server_managers[0].get_config_value(
            "allow_insecure")
        self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Select "storage query list-pools"
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")

        # Run the listing against each server in turn
        for server in self.hostlist_servers:
            self.dmg.hostlist = server
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))

            # Every created pool UUID must show up in this server's output
            listing = result.stdout_text
            for created in self.pool:
                uuid = created.uuid.lower()
                if uuid not in listing:
                    self.fail(
                        'Pool uuid {} not found in smd query'.format(uuid))

        # Collect the device IDs from all the servers
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Query the health of each device on each host
        for host in device_ids:
            self.dmg.hostlist = host
            devices = device_ids[host]
            for device in devices:
                try:
                    result = self.dmg.storage_query_device_health(device)
                except CommandFailure as error:
                    self.fail(
                        "dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail(
                        "device {} on host {} is not NORMAL".format(
                            device, host))

        # Finally run the nvme-health scan
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail(
                "dmg storage scan --nvme-health failed {}".format(error))
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """

    @skipForTicket("DAOS-7011")
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: Create the configured number of pools, then verify that
                  dmg list-pools, device-health and nvme-health work for
                  all of them.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        pool_count = self.params.get("number_of_pools", '/run/pool/*')

        # The capacity query is made while the servers are stopped; they
        # are restarted right after.
        self.stop_servers()
        nvme_capacity = self.get_nvme_max_capacity()
        self.start_servers()

        # Each pool gets an equal share of 80% of the reported capacity
        per_pool_nvme = int((nvme_capacity * 0.80) / pool_count)

        # Create the requested number of pools; a pool is only recorded in
        # self.pool once its create() call has returned.
        self.pool = []
        for _ in range(pool_count):
            new_pool = TestPool(self.context, self.get_dmg_command())
            new_pool.get_params(self)
            # SCM size is 10% of NVMe
            scm_size = int(per_pool_nvme * 0.10)
            new_pool.scm_size.update(str(scm_size))
            new_pool.nvme_size.update(str(per_pool_nvme))
            new_pool.create()
            self.pool.append(new_pool)

        # Build the dmg command, reusing the first server's security mode.
        dmg_bin_dir = os.path.join(self.prefix, "bin")
        self.dmg = DmgCommand(dmg_bin_dir)
        self.dmg.get_params(self)
        allow_insecure = self.server_managers[0].get_config_value(
            "allow_insecure")
        self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Select "storage query list-pools"
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")

        # Run the listing against each server in turn
        for server in self.hostlist_servers:
            self.dmg.hostlist = server
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))

            # Every created pool UUID must show up in this server's output
            listing = result.stdout_text
            for created in self.pool:
                uuid = created.uuid.lower()
                if uuid not in listing:
                    self.fail(
                        'Pool uuid {} not found in smd query'.format(uuid))

        # Collect the device IDs from all the servers
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Query the health of each device on each host
        for host in device_ids:
            self.dmg.hostlist = host
            devices = device_ids[host]
            for device in devices:
                try:
                    result = self.dmg.storage_query_device_health(device)
                except CommandFailure as error:
                    self.fail(
                        "dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail(
                        "device {} on host {} is not NORMAL".format(
                            device, host))

        # Finally run the nvme-health scan
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail(
                "dmg storage scan --nvme-health failed {}".format(error))