class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """

    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: Create the configured number of pools (number_of_pools)
                  sized from the configured percentage of the available
                  storage (pool_used_percentage), then verify that the dmg
                  list-pools, device-health and nvme-health queries work
                  for all pools.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')
        pool_capacity = self.params.get("pool_used_percentage", '/run/pool/*')
        # Divide by a float: with an integer percentage, Python 2 integer
        # division would truncate the ratio to 0 and every pool size to 0.
        pool_capacity = pool_capacity / 100.0
        storage = self.get_max_storage_sizes()

        # Split the requested share of the available storage evenly between
        # the pools; storage[0] feeds the SCM size, storage[1] the NVMe size.
        single_pool_nvme_size = int((storage[1] * pool_capacity) / no_of_pools)
        single_pool_scm_size = int((storage[0] * pool_capacity) / no_of_pools)

        self.pool = []
        # Create the large number of pools. Each pool is appended before it
        # is created so self.pool always references every pool attempted.
        for _pool in range(no_of_pools):
            self.log.info("-- Creating pool number = %s", _pool)
            self.pool.append(self.get_pool(create=False))
            self.pool[-1].scm_size.update(single_pool_scm_size, "scm_size")
            self.pool[-1].nvme_size.update(single_pool_nvme_size, "nvme_size")
            self.pool[-1].create()

        # Initialize the dmg command
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # List all pools
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-pools")
        for host in self.hostlist_servers:
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Verify all pools UUID listed as part of query
            for pool in self.pool:
                if pool.uuid.lower() not in result.stdout_text:
                    self.fail('Pool uuid {} not found in smd query'.format(
                        pool.uuid.lower()))

        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health; every device must report State:NORMAL
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))

        # Get the nvme-health; only the command's success is verified here
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        """Prepare a dmg command preset to 'storage query' on one server."""
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        # All queries in this test target the first server only.
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        """Get the first device UUID listed by 'dmg storage query smd'.

        Returns:
            str: the second token of the first output line that starts with
                "UUID:", or None if no such line is found.

        """
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("smd")
        query_cmd.sub_command_class.devices.value = True
        query_cmd.sub_command_class.pools.value = True
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        for line in result.stdout.splitlines():
            line = line.strip()
            if line.startswith("UUID:"):
                return line.split()[1]
        return None

    def get_checksum_error_value(self, device_id=None):
        """Get the checksum error count from 'dmg storage query blobstore-health'.

        Args:
            device_id (str, optional): UUID of the device to query. The test
                is failed if it is None.

        Returns:
            int: the third token of the first output line that starts with
                "Checksum", or None if no such line is found.

        """
        if device_id is None:
            self.fail("No device id provided")
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("blobstore-health")
        query_cmd.sub_command_class.devuuid.value = "{}".format(device_id)
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        for line in result.stdout.splitlines():
            line = line.strip()
            if line.startswith("Checksum"):
                return int(line.split()[2])
        return None

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Read the checksum error counter of an NVMe
        device, run the daos_test subtest, then verify the counter
        has increased.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        # A missing counter must fail the test explicitly: comparing None
        # with an int either raises (Python 3) or orders arbitrarily
        # (Python 2), which could let the final assertion pass by accident.
        if csum is None:
            self.fail(
                "No checksum error count reported for device {}".format(
                    dev_id))
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        if csum_latest is None:
            self.fail(
                "No checksum error count reported for device {}".format(
                    dev_id))
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """

    @skipForTicket("DAOS-7011")
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This tests will create the 40 number of pools and verify the
                  dmg list-pools, device-health and nvme-health works for all
                  pools.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        pool_count = self.params.get("number_of_pools", '/run/pool/*')

        # The servers are stopped while the NVMe capacity is read and
        # restarted afterwards.
        self.stop_servers()
        nvme_capacity = self.get_nvme_max_capacity()
        self.start_servers()

        # 80% of the reported capacity is shared equally between the pools.
        usable_capacity = nvme_capacity * 0.80
        nvme_per_pool = int(usable_capacity / pool_count)

        self.pool = []
        # Build, size and create each pool, recording it once it exists.
        for _ in range(pool_count):
            new_pool = TestPool(self.context, self.get_dmg_command())
            new_pool.get_params(self)
            # SCM gets 10% of the NVMe size of the pool.
            scm_per_pool = int(nvme_per_pool * 0.10)
            new_pool.scm_size.update(str(scm_per_pool))
            new_pool.nvme_size.update(str(nvme_per_pool))
            new_pool.create()
            self.pool.append(new_pool)

        # Set up the dmg command used for all the queries below.
        dmg_bin_dir = os.path.join(self.prefix, "bin")
        self.dmg = DmgCommand(dmg_bin_dir)
        self.dmg.get_params(self)
        allow_insecure = self.server_managers[0].get_config_value(
            "allow_insecure")
        self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Run "storage query list-pools" against each server in turn.
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")
        for server in self.hostlist_servers:
            self.dmg.hostlist = server
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Every created pool UUID has to appear in the output.
            listing = result.stdout_text
            for created in self.pool:
                if created.uuid.lower() in listing:
                    continue
                self.fail('Pool uuid {} not found in smd query'.format(
                    created.uuid.lower()))

        # Collect the device IDs for the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Query the health of each device; all must report State:NORMAL.
        for server in device_ids:
            self.dmg.hostlist = server
            for device in device_ids[server]:
                try:
                    result = self.dmg.storage_query_device_health(device)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' in result.stdout_text:
                    continue
                self.fail("device {} on host {} is not NORMAL".format(
                    device, server))

        # Finally run the nvme-health scan; it only has to succeed.
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        """Prepare a dmg command preset to 'storage query' on one server."""
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        # All queries in this test target the first server only.
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def _run_dmg_first_device(self):
        """Run the configured dmg command and return the first SMD device.

        The command output is parsed as JSON and the test is failed if the
        command raises or the response carries any host errors.

        Returns:
            dict: the first device entry of the first host that lists any
                SMD devices, or None if no host lists a device.

        """
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        data = json.loads(result.stdout)
        # Truthiness instead of len() so a null host_errors field is
        # treated as "no errors" rather than raising a TypeError.
        if data['host_errors']:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for host_storage in data['host_storage_map'].values():
            devices = host_storage['storage']['smd_info']['devices']
            if devices:
                return devices[0]
        return None

    def get_nvme_device_id(self):
        """Get a device UUID from 'dmg storage query list-devices'.

        Returns:
            str: the uuid of the first device of the first host that lists
                any SMD devices, or None if no device is listed.

        """
        self.dmg.json.value = True
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-devices")
        device = self._run_dmg_first_device()
        return None if device is None else device['uuid']

    def get_checksum_error_value(self, device_id=None):
        """Get the checksum error count from 'dmg storage query device-health'.

        Args:
            device_id (str, optional): UUID of the device to query. The test
                is failed if it is None.

        Returns:
            the health checksum_errors value of the first device of the
                first host that lists any SMD devices, or None if no device
                is listed.

        """
        if device_id is None:
            self.fail("No device id provided")
        self.dmg.json.value = True
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("device-health")
        query_cmd.sub_command_class.uuid.value = device_id
        device = self._run_dmg_first_device()
        return None if device is None else device['health']['checksum_errors']

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Read the checksum error counter of an NVMe
        device, run the daos_test subtest, then verify the counter
        has increased.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        # A missing counter must fail the test explicitly: comparing None
        # with an int either raises (Python 3) or orders arbitrarily
        # (Python 2), which could let the final assertion pass by accident.
        if csum is None:
            self.fail(
                "No checksum error count reported for device {}".format(
                    dev_id))
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        if csum_latest is None:
            self.fail(
                "No checksum error count reported for device {}".format(
                    dev_id))
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")