class NvmeFault(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate IO works fine when NVMe fault generated
                            on single or multiple servers with single drive.
    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case.

        Reads the pool count, pool fill percentage, server count and drive
        count from the test parameters, and prepares a dmg command object
        whose insecure flag is taken from the first server manager's
        "allow_insecure" configuration value.
        """
        super(NvmeFault, self).setUp()
        self.no_of_pools = self.params.get("number_of_pools", '/run/pool/*', 1)
        self.capacity = self.params.get(
            "percentage", '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get(
            "count", '/run/faulttests/no_of_drives/*/')
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        # Set to True to generate the NVMe fault during IO
        self.set_faulty_device = True

    @skipForTicket("DAOS-5497")
    def test_nvme_fault(self):
        """Jira ID: DAOS-4722.

        Test Description: Test NVMe disk fault.
        Use Case: Create the large size of pool and start filling up the pool.
                  While IO is in progress remove single disks from
                  single/multiple servers.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_fault,full_regression
        """
        # Create the Pool with Maximum NVMe size
        self.create_pool_max_size(nvme=True)

        # Start the IOR Command and generate the NVMe fault.
        # NOTE: "precent" is the keyword spelling used by this call in the
        # original code; it is kept as-is.
        self.start_ior_load(precent=self.capacity)

        print("pool_percentage_used -- After -- {}".format(
            self.pool.pool_percentage_used()))

        # Check nvme-health command works against every server host
        self.dmg.hostlist = self.hostlist_servers
        try:
            self.dmg.storage_query_nvme_health()
        except CommandFailure as error:
            # Include the underlying error so the failure is diagnosable
            self.fail("dmg nvme-health failed {}".format(error))
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """

    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This test creates the configured number of pools
                  ("number_of_pools" test parameter) and verifies that the
                  dmg list-pools, device-health and nvme-health commands
                  work for all pools.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')

        # Stop the servers to run SPDK tool to get the server capacity
        self.stop_servers()
        storage = self.get_nvme_max_capacity()
        self.start_servers()

        # Create the pools from 80% of the available storage space
        single_pool_nvme_size = int((storage * 0.80) / no_of_pools)

        # Pools are appended to self.pool as they are created, so any pool
        # created before a failure is still recorded on the test instance.
        self.pool = []
        self._create_pools(no_of_pools, single_pool_nvme_size)

        # Initialize the dmg command
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        self._verify_pools_listed()
        self._verify_device_health()

        # Get the nvme-health from all servers. The device-health loop
        # leaves dmg.hostlist pointing at a single host, so reset it first.
        self.dmg.hostlist = self.hostlist_servers
        try:
            self.dmg.storage_query_nvme_health()
        except CommandFailure as error:
            self.fail("dmg nvme-health failed {}".format(error))

    def _create_pools(self, count, nvme_size):
        """Create ``count`` pools and append each one to ``self.pool``.

        Args:
            count (int): number of pools to create.
            nvme_size (int): NVMe size for each pool; the SCM size of each
                pool is set to 10% of this value.
        """
        # SCM size is 10% of NVMe
        scm_size = int(nvme_size * 0.10)
        for _ in range(count):
            pool = TestPool(self.context, dmg_command=self.get_dmg_command())
            pool.get_params(self)
            pool.scm_size.update('{}'.format(scm_size))
            pool.nvme_size.update('{}'.format(nvme_size))
            pool.create()
            self.pool.append(pool)

    def _verify_pools_listed(self):
        """Run "dmg storage query list-pools" per host and check the pools.

        Fails the test if the dmg command fails on any host in
        ``self.hostlist_servers`` or if the UUID of any pool in
        ``self.pool`` is missing from that host's command output.
        """
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")

        for host in self.hostlist_servers:
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Verify all pools UUID listed as part of query
            for pool in self.pool:
                uuid = pool.uuid.lower()
                if uuid not in result.stdout:
                    self.fail(
                        'Pool uuid {} not found in smd query'.format(uuid))

    def _verify_device_health(self):
        """Check that every device on every server reports a NORMAL state.

        Fails the test if the device-health query fails or if its output
        does not contain 'State:NORMAL' for any device.
        """
        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail(
                        "dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))