Ejemplo n.º 1
0
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """Validate the NVMe health monitoring test cases.

    :avocado: recursive
    """

    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: Create the configured number of pools, then verify that
                  the dmg pool listing, device-health and nvme-health
                  queries work with all of those pools present.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        pool_count = self.params.get("number_of_pools", '/run/pool/*')
        # The yaml value is a percentage; convert it to a fraction.
        used_fraction = self.params.get(
            "pool_used_percentage", '/run/pool/*') / 100
        max_sizes = self.get_max_storage_sizes()

        # Split the usable share of the storage evenly across the pools.
        # Index 1 feeds the NVMe size and index 0 the SCM size.
        nvme_per_pool = int((max_sizes[1] * used_fraction) / pool_count)
        scm_per_pool = int((max_sizes[0] * used_fraction) / pool_count)

        # Create every pool with the per-pool SCM/NVMe sizes.
        self.pool = []
        for index in range(pool_count):
            self.log.info("-- Creating pool number = %s", index)
            new_pool = self.get_pool(create=False)
            self.pool.append(new_pool)
            new_pool.scm_size.update(scm_per_pool, "scm_size")
            new_pool.nvme_size.update(nvme_per_pool, "nvme_size")
            new_pool.create()

        # Build the dmg command, reusing the server's insecure setting.
        dmg_bin = os.path.join(self.prefix, "bin")
        self.dmg = DmgCommand(dmg_bin)
        self.dmg.get_params(self)
        allow_insecure = self.server_managers[0].get_config_value(
            "allow_insecure")
        self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Select "storage query list-pools" and run it against each server.
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")
        for server in self.hostlist_servers:
            self.dmg.hostlist = server
            try:
                listing = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Every created pool UUID must appear in the command output.
            for pool in self.pool:
                uuid = pool.uuid.lower()
                if uuid in listing.stdout_text:
                    continue
                self.fail('Pool uuid {} not found in smd query'.format(uuid))

        # Collect the device IDs from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Each device on each host must report the NORMAL state.
        for server in device_ids:
            self.dmg.hostlist = server
            for device in device_ids[server]:
                try:
                    health = self.dmg.storage_query_device_health(device)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' in health.stdout_text:
                    continue
                self.fail("device {} on host {} is not NORMAL".format(
                    device, server))

        # Finally the nvme-health scan itself must succeed.
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
Ejemplo n.º 2
0
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes
    def setUp(self):
        """Prepare a dmg "storage query" command for the first server host.

        The insecure flag is copied from the first server manager's
        "allow_insecure" configuration value.
        """
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        """Get a device UUID from the dmg "storage query smd" output.

        The command is run with both the devices and pools options enabled.

        Returns:
            str: the second whitespace-separated token of the first output
                line starting with "UUID:", or None if there is no such
                line.

        """
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("smd")
        smd_cmd = query_cmd.sub_command_class
        smd_cmd.devices.value = True
        smd_cmd.pools.value = True
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^UUID:", line):
                # Only the first UUID line is of interest
                return line.split()[1]
        return None

    def get_checksum_error_value(self, device_id=None):
        """Get the checksum error count from dmg "blobstore-health".

        Args:
            device_id (str, optional): device UUID to query. The test is
                failed if this is None.

        Returns:
            int: the third whitespace-separated token of the first output
                line starting with "Checksum", or None if there is no such
                line.

        """
        if device_id is None:
            # self.fail() raises, so nothing after it runs in this case
            self.fail("No device id provided")
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("blobstore-health")
        query_cmd.sub_command_class.devuuid.value = "{}".format(device_id)
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^Checksum", line):
                # Only the first Checksum line is of interest
                return int(line.split()[2])
        return None

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Verify that the checksum error counter of an NVMe
                          device is larger after running the daos_test
                          subtest than it was before.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        if csum is None:
            # Without a baseline the later comparison is meaningless (and a
            # TypeError on Python 3), so stop with a clear message instead.
            self.fail(
                "Checksum error count not found for device {}".format(dev_id))
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        if csum_latest is None:
            self.fail(
                "Checksum error count not found for device {}".format(dev_id))
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertGreater(csum_latest, csum,
                           "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
Ejemplo n.º 3
0
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """Validate the NVMe health monitoring test cases.

    :avocado: recursive
    """

    @skipForTicket("DAOS-7011")
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: Create the configured number of pools, then verify that
                  the dmg pool listing, device-health and nvme-health
                  queries work with all of those pools present.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        pool_count = self.params.get("number_of_pools", '/run/pool/*')
        # The servers are stopped while the NVMe max capacity is read (the
        # SPDK tool is run for this) and are restarted afterwards.
        self.stop_servers()
        nvme_capacity = self.get_nvme_max_capacity()
        self.start_servers()

        # Each pool gets an equal share of 80% of the NVMe capacity and an
        # SCM size of 10% of its NVMe size.
        nvme_per_pool = int((nvme_capacity * 0.80) / pool_count)
        scm_size_text = str(int(nvme_per_pool * 0.10))
        nvme_size_text = str(nvme_per_pool)

        # Create the large number of pools.
        self.pool = []
        for _ in range(pool_count):
            new_pool = TestPool(self.context, self.get_dmg_command())
            new_pool.get_params(self)
            new_pool.scm_size.update(scm_size_text)
            new_pool.nvme_size.update(nvme_size_text)
            new_pool.create()
            self.pool.append(new_pool)

        # Build the dmg command, reusing the server's insecure setting.
        dmg_bin = os.path.join(self.prefix, "bin")
        self.dmg = DmgCommand(dmg_bin)
        self.dmg.get_params(self)
        allow_insecure = self.server_managers[0].get_config_value(
            "allow_insecure")
        self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Select "storage query list-pools" and run it against each server.
        self.dmg.set_sub_command("storage")
        storage_cmd = self.dmg.sub_command_class
        storage_cmd.set_sub_command("query")
        storage_cmd.sub_command_class.set_sub_command("list-pools")
        for server in self.hostlist_servers:
            self.dmg.hostlist = server
            try:
                listing = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Every created pool UUID must appear in the command output.
            for pool in self.pool:
                uuid = pool.uuid.lower()
                if uuid in listing.stdout_text:
                    continue
                self.fail('Pool uuid {} not found in smd query'.format(uuid))

        # Collect the device IDs from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Each device on each host must report the NORMAL state.
        for server in device_ids:
            self.dmg.hostlist = server
            for device in device_ids[server]:
                try:
                    health = self.dmg.storage_query_device_health(device)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' in health.stdout_text:
                    continue
                self.fail("device {} on host {} is not NORMAL".format(
                    device, server))

        # Finally the nvme-health scan itself must succeed.
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
Ejemplo n.º 4
0
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """

    # pylint: disable=too-many-instance-attributes
    def setUp(self):
        """Prepare a dmg "storage query" command for the first server host.

        The insecure flag is copied from the first server manager's
        "allow_insecure" configuration value.
        """
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def _run_dmg_json(self):
        """Run the configured dmg command and decode its JSON output.

        The test is failed if the command raises process.CmdError or if the
        decoded output reports any host errors.

        Returns:
            dict: the decoded JSON output of the dmg command.

        """
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))

        data = json.loads(result.stdout)
        # Truthiness also copes with a JSON null, on which len() would raise
        if data['host_errors']:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        return data

    def get_nvme_device_id(self):
        """Get a device UUID from the dmg "list-devices" JSON output.

        Returns:
            str: the uuid of the first device of the first host storage
                entry that lists any devices, or None if no entry does.

        """
        self.dmg.json.value = True
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("list-devices")
        data = self._run_dmg_json()
        for host_storage in data['host_storage_map'].values():
            devices = host_storage['storage']['smd_info']['devices']
            if devices:
                return devices[0]['uuid']
        return None

    def get_checksum_error_value(self, device_id=None):
        """Get the checksum error count from dmg "device-health".

        Args:
            device_id (str, optional): device UUID to query. The test is
                failed if this is None.

        Returns:
            int: the health checksum_errors value of the first device of
                the first host storage entry that lists any devices, or
                None if no entry does.

        """
        if device_id is None:
            # self.fail() raises, so nothing after it runs in this case
            self.fail("No device id provided")
        self.dmg.json.value = True
        query_cmd = self.dmg.sub_command_class.sub_command_class
        query_cmd.set_sub_command("device-health")
        query_cmd.sub_command_class.uuid.value = device_id
        data = self._run_dmg_json()
        # NOTE(review): the first listed device is used without comparing
        # its uuid to device_id; presumably the query only returns the
        # requested device - confirm.
        for host_storage in data['host_storage_map'].values():
            devices = host_storage['storage']['smd_info']['devices']
            if devices:
                return devices[0]['health']['checksum_errors']
        return None

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Verify that the checksum error counter of an NVMe
                          device is larger after running the daos_test
                          subtest than it was before.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        if csum is None:
            # Without a baseline the later comparison is meaningless (and a
            # TypeError on Python 3), so stop with a clear message instead.
            self.fail(
                "Checksum error count not found for device {}".format(dev_id))
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        if csum_latest is None:
            self.fail(
                "Checksum error count not found for device {}".format(dev_id))
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertGreater(csum_latest, csum,
                           "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")