Example #1
    def get_pool_acl_list(self, uuid):
        '''
        Description:
            Get daos pool acl list by dmg get-acl.
        Args:
            uuid: pool uuid number.
        Return:
            pool_permission_list: daos pool acl list.
        '''
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.request.value = "pool"
        dmg.action.value = "get-acl --pool " + uuid
        port = self.params.get("port", "/run/server_config/*")
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers
        ]
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        result = dmg.run()

        pool_permission_list = []
        for line in result.stdout.splitlines():
            if not line.startswith("A:"):
                continue
            elif line.startswith("A::"):
                found_user = re.search(r"A::(.+)@:(.*)", line)
                if found_user:
                    pool_permission_list.append(line)
            elif line.startswith("A:G:"):
                found_group = re.search(r"A:G:(.+)@:(.*)", line)
                if found_group:
                    pool_permission_list.append(line)
        return pool_permission_list
    def update_pool_acl_entry(self, uuid, action, entry):
        '''
        Description:
            Update daos pool acl list by dmg tool.
        Args:
            uuid: pool uuid.
            action: update-acl or delete-acl.
            entry: pool acl entry or principal to be updated.
        Return:
            none.
        '''
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.request.value = "pool"
        if action == "delete":
            dmg.action.value = "delete-acl --pool " + uuid
            dmg.action.value += " --principal " + entry
        elif action == "update":
            dmg.action.value = "update-acl --pool " + uuid
            dmg.action.value += " --entry " + entry
        else:
            self.fail("##update_pool_acl_entry, action: {} is not supported."
                      "\n  supported actions: update, delete.".format(action))
        port = self.params.get("port", "/run/server_config/*")
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers
        ]
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        result = dmg.run()
        self.log.info(
            "At update_pool_acl_entry, dmg.run result=\n%s", result)
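
The two methods above lend themselves to small, pure helpers that are easy to unit test. Below is a minimal, self-contained sketch of the same ACE filtering and action dispatch; the sample output and the entry/principal strings are assumptions based on the A::user@:perm and A:G:group@:perm patterns matched above, not verified dmg output or syntax.

import re

SAMPLE_GET_ACL_OUTPUT = """\
# Entries:
A::someuser@:rw
A:G:somegroup@:r
"""

def filter_acl_entries(stdout):
    # Keep only user (A::...) and group (A:G:...) ACE lines.
    return [line for line in stdout.splitlines()
            if re.search(r"^A::(.+)@:(.*)", line) or
            re.search(r"^A:G:(.+)@:(.*)", line)]

def build_acl_action(uuid, action, entry):
    # Mirror of the update/delete dispatch in update_pool_acl_entry.
    if action == "delete":
        return "delete-acl --pool " + uuid + " --principal " + entry
    if action == "update":
        return "update-acl --pool " + uuid + " --entry " + entry
    raise ValueError("unsupported action: {}".format(action))

print(filter_acl_entries(SAMPLE_GET_ACL_OUTPUT))
# ['A::someuser@:rw', 'A:G:somegroup@:r']
print(build_acl_action("1234-abcd", "update", "A::newuser@:rw"))
# update-acl --pool 1234-abcd --entry A::newuser@:rw
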
Example #3
    def verify_access_point(self, host_port_input, failure_expected=None):
        """Run with given AP and verify the AP in the output.

        Args:
            host_port_input (str): Host:Port or just Host. Supports multiple APs
                that are separated by comma.
            failure_expected (str): Expected error message. Set it to None if
                not expecting any error. Defaults to None.

        Returns:
            list: List of errors.
        """
        errors = []
        check = {}

        check["expected"] = host_port_input.split(",")
        if ":" not in host_port_input:
            # dmg automatically sets 10001 if it's not given in the input.
            check["expected"] = [
                "{}:10001".format(host) for host in check["expected"]
            ]

        # Create a new DmgCommand and set its exit_status_exception to False to
        # make it not raise a TestFailure when the command failed. Then we'll be
        # able to check result.exit_status for our testing purpose.
        dmg = DmgCommand(self.bin)
        dmg.exit_status_exception = False

        try:
            result = dmg.config_generate(access_points=host_port_input)
        except CommandFailure as err:
            errors.append("Unexpected failure! {}".format(err))
            return errors

        if result.exit_status == 0 and failure_expected is None:
            try:
                yaml_data = yaml.safe_load(result.stdout)
                check["actual"] = yaml_data["access_points"]
                if sorted(check["expected"]) != sorted(check["actual"]):
                    errors.append("Unexpected access point: {} != {}".format(
                        check["expected"], check["actual"]))
            except yaml.YAMLError as error:
                errors.append(
                    "Error loading dmg generated config!: {}".format(error))
        elif result.exit_status == 0 and failure_expected is not None:
            errors.append(
                "dmg command passed when expected to fail!: {}".format(result))
        elif result.exit_status != 0 and failure_expected is not None:
            if failure_expected not in result.stderr_text:
                errors.append(
                    "Missing expected error message in failed dmg command!: " +
                    "{}".format(result))
        else:
            errors.append(
                "dmg command failed when expected to pass!: {}".format(result))

        return errors
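
The expected-list handling at the top of verify_access_point can be isolated into a pure helper; a sketch, using the default port 10001 noted in the comment above:

def normalize_access_points(host_port_input, default_port=10001):
    # dmg applies the default port when the input has no ":" (see above).
    hosts = host_port_input.split(",")
    if ":" not in host_port_input:
        hosts = ["{}:{}".format(host, default_port) for host in hosts]
    return sorted(hosts)

print(normalize_access_points("wolf-1,wolf-2"))
# ['wolf-1:10001', 'wolf-2:10001']
print(normalize_access_points("wolf-2:9999,wolf-1:9999"))
# ['wolf-1:9999', 'wolf-2:9999']
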
Example #4
    def setUp(self):
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
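
The two set_sub_command calls compose a nested sub-command chain that renders as "dmg ... storage query ...". A toy illustration of that composition follows; it is not the real CommandWithSubCommand API, just a sketch of the idea:

class ToyCommand(object):
    # Minimal stand-in for a command object with nested sub-commands.
    def __init__(self, name):
        self.name = name
        self.sub_command_class = None

    def set_sub_command(self, name):
        self.sub_command_class = ToyCommand(name)

    def __str__(self):
        parts = [self.name]
        if self.sub_command_class is not None:
            parts.append(str(self.sub_command_class))
        return " ".join(parts)

dmg = ToyCommand("dmg")
dmg.set_sub_command("storage")
dmg.sub_command_class.set_sub_command("query")
print(dmg)  # dmg storage query
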
Example #5
class NvmeFault(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: Validate that IO works fine when an NVMe fault is
                            generated on single or multiple servers with a
                            single drive.
    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(NvmeFault, self).setUp()
        self.no_of_pools = self.params.get("number_of_pools", '/run/pool/*', 1)
        self.capacity = self.params.get("percentage",
                                        '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get("count",
                                            '/run/faulttests/no_of_drives/*/')
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        # Set to True to generate the NVMe fault during IO
        self.set_faulty_device = True

    @skipForTicket("DAOS-5497")
    def test_nvme_fault(self):
        """Jira ID: DAOS-4722.

        Test Description: Test NVMe disk fault.
        Use Case: Create the large size of pool and start filling up the pool.
                  while IO is in progress remove single disks from
                  single/multiple servers.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_fault,full_regression
        """
        # Create the Pool with Maximum NVMe size
        self.create_pool_max_size(nvme=True)

        # Start the IOR Command and generate the NVMe fault.
        self.start_ior_load(precent=self.capacity)

        print("pool_percentage_used -- After -- {}".format(
            self.pool.pool_percentage_used()))

        # Check nvme-health command works
        try:
            self.dmg.hostlist = self.hostlist_servers
            self.dmg.storage_query_nvme_health()
        except CommandFailure as _error:
            self.fail("dmg nvme-health failed")
Example #6
    def test_num_engines(self):
        """Test --num-engines.

        1. Using the NVMe PCI dictionary, find the number of keys, i.e., the
        number of Socket IDs. This determines the maximum number of engines.
        2. Call dmg config generate --num-engines=<1 to max_engine>. Should
        pass.
        3. Call dmg config generate --num-engines=<max_engine + 1> Should fail.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,small
        :avocado: tags=control,config_generate_entries,num_engines
        """
        # Get necessary storage and network info.
        self.prepare_expected_data()

        # Find the maximum number of engines we can use. It's the number of
        # sockets in NVMe. However, it's unclear whether we also need the same
        # number of interfaces, so revisit this step if the max_engine
        # assumption causes issues.
        max_engine = len(self.nvme_socket_to_addrs)
        self.log.info("max_engine threshold = %s", max_engine)

        dmg = DmgCommand(self.bin)
        dmg.exit_status_exception = False
        errors = []

        # Call dmg config generate --num-engines=<1 to max_engine>
        for num_engines in range(1, max_engine + 1):
            result = dmg.config_generate(access_points="wolf-a",
                                         num_engines=num_engines)
            generated_yaml = yaml.safe_load(result.stdout)
            actual_num_engines = len(generated_yaml["engines"])

            # Verify the number of engines.
            if actual_num_engines != num_engines:
                msg = "Unexpected number of engines! Expected = {}; "\
                    "Actual = {}".format(num_engines, actual_num_engines)
                errors.append(msg)

        # Verify that max_engine + 1 fails.
        result = dmg.config_generate(access_points="wolf-a",
                                     num_engines=max_engine + 1)
        if result.exit_status == 0:
            errors.append(
                "Host + invalid num engines succeeded with {}!".format(
                    max_engine + 1))

        self.check_errors(errors)
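
The engine-count verification boils down to loading the generated YAML and measuring the "engines" list; a self-contained sketch with an abbreviated, hypothetical config (real generated configs contain many more fields):

import yaml

SAMPLE_GENERATED_CONFIG = """\
engines:
- fabric_iface: ib0
  provider: ofi+sockets
- fabric_iface: ib1
  provider: ofi+sockets
"""

def count_engines(config_text):
    # Parse the generated server config and count its engines.
    return len(yaml.safe_load(config_text)["engines"])

print(count_engines(SAMPLE_GENERATED_CONFIG))  # 2
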
Example #7
    def get_dmg_command(self, index=0):
        """Get a DmgCommand setup to interact with server manager index.

        Return a DmgCommand object configured with:
            - the "-l" parameter assigned to the server's access point list
            - the "-i" parameter assigned to the server's interactive mode

        This method is intended to be used by tests that want to use dmg to
        create and destroy pools. Pass the object to the TestPool constructor.

        The access point should be passed to -l regardless of the number of
        servers.

        Args:
            index (int, optional): Server index. Defaults to 0.

        Returns:
            DmgCommand: New DmgCommand object.

        """
        if self.server_managers:
            return self.server_managers[index].dmg

        dmg_config_file = self.get_config_file("daos", "dmg")
        dmg_cfg = DmgYamlParameters(dmg_config_file, self.server_group,
                                    DmgTransportCredentials(self.workdir))
        dmg_cfg.hostlist.update(self.hostlist_servers[:1], "dmg.yaml.hostlist")
        return DmgCommand(self.bin, dmg_cfg)
Example #8
    def get_dmg_command(self, index=0):
        """Get a DmgCommand setup to interact with server manager index.

        Return a DmgCommand object configured with:
            - the "-l" parameter assigned to the server's access point list
            - the "-i" parameter assigned to the server's interactive mode

        This method is intended to be used by tests that want to use dmg to
        create and destroy pools. Pass the object to the TestPool constructor.

        The access point should be passed to -l regardless of the number of
        servers.

        Args:
            index (int, optional): Server index. Defaults to 0.

        Returns:
            DmgCommand: New DmgCommand object.

        """
        dmg = DmgCommand(self.bin)
        dmg.hostlist.value = (self.server_managers[index].runner.job.
                              yaml_params.access_points.value)
        dmg.insecure.value = self.server_managers[index].insecure.value
        return dmg
Example #9
    def setUp(self):
        """Set up for test case."""
        super(NvmeFault, self).setUp()
        self.no_of_pools = self.params.get("number_of_pools", '/run/pool/*', 1)
        self.capacity = self.params.get("percentage",
                                        '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get("count",
                                            '/run/faulttests/no_of_drives/*/')
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        # Set to True to generate the NVMe fault during IO
        self.set_faulty_device = True
Example #10
    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "OpenMpi".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration
                file parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"

        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)
Example #11
    def __init__(self, context, log=None, cb_handler=None, dmg_bin_path=None):
        # pylint: disable=unused-argument
        """Initialize a TestPool object.

        Note: 'log' is now a defunct argument and will be removed in the future

        Args:
            context (DaosContext): the daos environment and api context
            log (logging): logging object used to report the pool status
            cb_handler (CallbackHandler, optional): callback object to use with
                the API methods. Defaults to None.
            dmg_bin_path (str, optional): directory where dmg is installed.
                Defaults to None.
        """
        super(TestPool, self).__init__("/run/pool/*", cb_handler)
        self.context = context
        self.uid = os.geteuid()
        self.gid = os.getegid()

        self.mode = BasicParameter(None)
        self.name = BasicParameter(None)  # server group name
        self.svcn = BasicParameter(None)
        self.target_list = BasicParameter(None)
        self.scm_size = BasicParameter(None)
        self.nvme_size = BasicParameter(None)
        # Set USE_API to use API or USE_DMG to use dmg. If it's not set, API is
        # used.
        self.control_method = BasicParameter(self.USE_API, self.USE_API)
        uname = getpass.getuser()
        gname = grp.getgrnam(uname)[0]
        self.username = BasicParameter(uname, uname)
        self.groupname = BasicParameter(gname, gname)

        self.pool = None
        self.uuid = None
        self.info = None
        self.svc_ranks = None
        self.connected = False
        self.dmg = None
        # Required to use dmg. It defines the directory where dmg is
        # installed. Use self.basepath + '/install/bin' in the test.
        self.dmg_bin_path = dmg_bin_path
        if dmg_bin_path is not None:
            # We make dmg a member of this class so the test has more
            # flexibility over the usage of the command.
            self.dmg = DmgCommand(self.dmg_bin_path)
            self.dmg.insecure.value = True
            self.dmg.request.value = "pool"
    def test_dmg_nvme_scan_basic(self):
        """
        JIRA ID: DAOS-2485
        Test Description: Test basic dmg functionality to scan the NVMe
        storage on the system.
        :avocado: tags=all,tiny,pr,dmg,nvme_scan,basic
        """
        # Create dmg command
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.get_params(self)

        # Update hostlist value for dmg command
        port = self.params.get("port", "/run/server_config/*")
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers]
        dmg.hostlist = servers_with_ports

        try:
            dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
Example #13
    def verify_pool_acl_prim_sec_groups(self, pool_acl_list, acl_file, uuid,
                                        svc):
        '''
        Description:
            Verify daos pool acl access with primary and secondary group
            access permissions.
        Args:
            pool_acl_list: pool acl entry list.
            acl_file: acl file to be used.
            uuid: daos pool uuid.
            svc:  daos pool svc.
        Return:
            None.
        '''
        primary_grp_perm = self.params.get(
            "pg_permission", "/run/pool_acl/primary_secondary_group_test/*")[0]
        sec_group = self.params.get(
            "secondary_group_name",
            "/run/pool_acl/primary_secondary_group_test/*")
        sec_group_perm = self.params.get(
            "sg_permission", "/run/pool_acl/primary_secondary_group_test/*")
        sec_group_rw = self.params.get(
            "sg_read_write", "/run/pool_acl/primary_secondary_group_test/*")
        current_group = grp.getgrgid(os.getegid())[0]
        for group in sec_group:
            add_del_user(self.hostlist_clients, "groupadd", group)
        cmd = "usermod -G " + ",".join(sec_group)
        self.log.info("  (8-1)verify_pool_acl_prim_sec_groups, cmd= %s", cmd)
        add_del_user(self.hostlist_clients, cmd, current_group)

        self.log.info(
            "  (8-2)Before update sec_group permission, pool_acl_list= %s",
            pool_acl_list)
        for group, permission in zip(sec_group, sec_group_perm):
            if permission == "none":
                permission = ""
            n_acl = acl_entry("group", group, permission)
            pool_acl_list.append(n_acl)

        self.log.info(
            "  (8-3)After update sec_group permission, pool_acl_list= %s",
            pool_acl_list)
        self.log.info("      pool acl_file= %s", acl_file)
        create_acl_file(acl_file, pool_acl_list)

        # Modify primary-group permission for secondary-group test
        grp_entry = acl_entry("group", current_group, primary_grp_perm)
        new_grp_entry = acl_entry("group", current_group, "")
        self.modify_acl_file_entry(acl_file, grp_entry, new_grp_entry)

        # dmg pool overwrite-acl --pool <uuid> --acl-file <file>
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.request.value = "pool"
        dmg.action.value = "overwrite-acl --pool={} --acl-file={}".\
            format(uuid, acl_file)
        port = self.params.get("port", "/run/server_config/*", 10001)
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers
        ]
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        self.log.info("  (8-4)dmg= %s", dmg)
        result = dmg.run()
        self.log.info("  (8-5)dmg.run() result=\n %s", result)

        # Verify pool read operation
        # daos pool query --pool <uuid>
        self.log.info("  (8-6)Verify pool read by: daos pool query --pool")
        exp_read = sec_group_rw[0]
        self.verify_pool_readwrite(svc, uuid, "read", expect=exp_read)

        # Verify pool write operation
        # daos container create --pool <uuid>
        self.log.info("  (8-7)Verify pool write by: daos container create")
        exp_write = sec_group_rw[1]
        self.verify_pool_readwrite(svc, uuid, "write", expect=exp_write)

        for group in sec_group:
            add_del_user(self.hostlist_clients, "groupdel", group)
    def test_create(self):
        """Test dmg pool create and destroy with various parameters.

        Create a pool and verify that the pool was created by comparing the
        UUID returned from the dmg command against the directory name in
        /mnt/daos

        Destroy the pool and verify that the directory is deleted.

        :avocado: tags=all,pool,full_regression,small,multitarget
        """
        # Create a dmg command object
        dmg = DmgCommand(self.bin)
        dmg.get_params(self)
        dmg.hostlist.update(
            self.server_managers[0].runner.job.yaml_params.access_points.value,
            "dmg.hostlist")

        # Disable raising an exception if the dmg command fails
        dmg.exit_status_exception = False

        # Accumulate a list of pass/fail indicators representing what is
        # expected for each parameter then "and" them to determine the
        # expected result of the test
        expected_for_param = []

        userlist = self.params.get("user", '/run/tests/users/*')
        user = os.getlogin() if userlist[0] == 'valid' else userlist[0]
        expected_for_param.append(userlist[1])

        grouplist = self.params.get("group", '/run/tests/groups/*')
        group = os.getlogin() if grouplist[0] == 'valid' else grouplist[0]
        expected_for_param.append(grouplist[1])

        systemnamelist = self.params.get("systemname",
                                         '/run/tests/systemnames/*')
        system_name = systemnamelist[0]
        expected_for_param.append(systemnamelist[1])

        tgtlistlist = self.params.get("tgt", '/run/tests/tgtlist/*')
        tgtlist = tgtlistlist[0]
        expected_for_param.append(tgtlistlist[1])

        # if any parameter is FAIL then the test should FAIL
        expected_result = RESULT_PASS
        if RESULT_FAIL in expected_for_param:
            expected_result = RESULT_FAIL

        host1 = self.hostlist_servers[0]
        host2 = self.hostlist_servers[1]
        test_destroy = True
        create_result = dmg.pool_create("1GB", user, group, None, tgtlist,
                                        None, system_name)
        if create_result.exit_status == 0:
            if expected_result == RESULT_FAIL:
                self.fail(
                    "Test was expected to fail but it passed at pool create.")
            uuid, _ = get_pool_uuid_service_replicas_from_stdout(
                create_result.stdout)
            if '0' in tgtlist:
                # check_for_pool checks if the uuid directory exists in host1
                exists = check_for_pool.check_for_pool(host1, uuid)
                if exists != 0:
                    self.fail("Pool {0} not found on host {1}.\n".format(
                        uuid, host1))
            if '1' in tgtlist:
                exists = check_for_pool.check_for_pool(host2, uuid)
                if exists != 0:
                    self.fail("Pool {0} not found on host {1}.\n".format(
                        uuid, host2))
        else:
            test_destroy = False
            if expected_result == RESULT_PASS:
                self.fail("Test was expected to pass but it failed at pool " +
                          "create.")

        if test_destroy:
            destroy_result = dmg.pool_destroy(uuid)
            if destroy_result.exit_status == 0:
                if expected_result == RESULT_FAIL:
                    self.fail("Test was expected to fail but it passed at " +
                              "pool create.")
                if '0' in tgtlist:
                    exists = check_for_pool.check_for_pool(host1, uuid)
                    if exists == 0:
                        self.fail(
                            "Pool {0} found on host {1} after destroy.\n".
                            format(uuid, host1))
                if '1' in tgtlist:
                    exists = check_for_pool.check_for_pool(host2, uuid)
                    if exists == 0:
                        self.fail(
                            "Pool {0} found on host {1} after destroy.\n".
                            format(uuid, host2))
            else:
                if expected_result == RESULT_PASS:
                    self.fail("Test was expected to pass but it failed at " +
                              "pool destroy.")
Example #15
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This test creates 40 pools and verifies that the
                  dmg list-pools, device-health and nvme-health commands
                  work for all pools.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')
        pool_capacity = self.params.get("pool_used_percentage", '/run/pool/*')
        pool_capacity = pool_capacity / 100.0
        storage = self.get_max_storage_sizes()

        # Create each pool from the available storage space
        single_pool_nvme_size = int((storage[1] * pool_capacity) / no_of_pools)
        single_pool_scm_size = int((storage[0] * pool_capacity) / no_of_pools)

        self.pool = []
        # Create the Large number of pools
        for _pool in range(no_of_pools):
            self.log.info("-- Creating pool number = %s", _pool)
            self.pool.append(self.get_pool(create=False))
            self.pool[-1].scm_size.update(single_pool_scm_size, "scm_size")
            self.pool[-1].nvme_size.update(single_pool_nvme_size, "nvme_size")
            self.pool[-1].create()

        # initialize the dmg command
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # List all pools
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-pools")
        for host in self.hostlist_servers:
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Verify all pool UUIDs are listed as part of the query
            for pool in self.pool:
                if pool.uuid.lower() not in result.stdout_text:
                    self.fail('Pool uuid {} not found in smd query'.format(
                        pool.uuid.lower()))

        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))

        # Get the nvme-health
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
Example #16
    def test_net_class(self):
        """Test --net-class.

        1. Iterate the interface set and count the number of elements that
        start with "ib". This is the ib_count threshold up to which we can set
        --num-engines with --net-class=infiniband.
        2. Call dmg config generate --net-class=infiniband
        --num-engines=<1 to ib_count> and verify that it works.
        3. In addition, verify provider using the dictionary. i.e., iterate
        "engines" fields and verify "provider" is in the list where key is
        "fabric_iface".
        4. Similarly find eth_count and call dmg config generate
        --net-class=ethernet --num-engines=<1 to eth_count> and verify that it
        works.
        5. As in ib, also verify provider using the dictionary. i.e., iterate
        "engines" fields and verify "provider" is in the list where key is
        "fabric_iface".

        :avocado: tags=all,full_regression
        :avocado: tags=hw,small
        :avocado: tags=control,config_generate_entries,net_class
        """
        # Get necessary storage and network info.
        self.prepare_expected_data()

        # Get ib_count threshold.
        ib_count = 0
        for interface in self.interface_set:
            if interface[:2] == "ib":
                ib_count += 1
        self.log.info("ib_count = %d", ib_count)

        dmg = DmgCommand(self.bin)
        dmg.exit_status_exception = False
        errors = []

        # Call dmg config generate --num-engines=<1 to ib_count>
        # --net-class=infiniband. Should pass.
        for num_engines in range(1, ib_count + 1):
            # dmg config generate should pass.
            result = dmg.config_generate(access_points="wolf-a",
                                         num_engines=num_engines,
                                         net_class="infiniband")

            if result.exit_status != 0:
                msg = "config generate failed with --net-class=infiniband "\
                    "--num-engines = {}!".format(num_engines)
                errors.append(msg)
            else:
                generated_config = yaml.safe_load(result.stdout)
                for engine in generated_config["engines"]:
                    fabric_iface = engine["fabric_iface"]
                    provider = engine["provider"]
                    # Verify fabric_iface field, e.g., ib0 by checking the
                    # dictionary keys.
                    if not self.interface_to_providers[fabric_iface]:
                        errors.append(
                            "Unexpected fabric_iface! {}".format(fabric_iface))
                    elif provider not in \
                        self.interface_to_providers[fabric_iface]:
                        # Now check the provider field, e.g., ofi+sockets by
                        # checking the corresponding list in the dictionary.
                        msg = "Unexpected provider in fabric_iface! provider ="\
                            " {}; fabric_iface = {}".format(
                                provider, fabric_iface)
                        errors.append(msg)

        # Call dmg config generate --num-engines=<ib_count + 1>
        # --net-class=infiniband. Too many engines. Should fail.
        result = dmg.config_generate(access_points="wolf-a",
                                     num_engines=ib_count + 1,
                                     net_class="infiniband")
        if result.exit_status == 0:
            msg = "config generate succeeded with --net-class=infiniband "\
                "num_engines = {}!".format(ib_count + 1)
            errors.append(msg)

        # Get eth_count threshold.
        eth_count = 0
        for interface in self.interface_set:
            if interface[:3] == "eth":
                eth_count += 1
        self.log.info("eth_count = %d", eth_count)

        # Call dmg config generate --num-engines=<1 to eth_count>
        # --net-class=ethernet. Should pass.
        for num_engines in range(1, eth_count + 1):
            # dmg config generate should pass.
            result = dmg.config_generate(access_points="wolf-a",
                                         num_engines=num_engines,
                                         net_class="ethernet")

            if result.exit_status != 0:
                msg = "config generate failed with --net-class=ethernet "\
                    "--num-engines = {}!".format(num_engines)
                errors.append(msg)
            else:
                generated_config = yaml.safe_load(result.stdout)
                for engine in generated_config["engines"]:
                    fabric_iface = engine["fabric_iface"]
                    provider = engine["provider"]
                    # Verify fabric_iface field, e.g., eth0 by checking the
                    # dictionary keys.
                    if not self.interface_to_providers[fabric_iface]:
                        errors.append(
                            "Unexpected fabric_iface! {}".format(fabric_iface))
                    elif provider not in \
                        self.interface_to_providers[fabric_iface]:
                        # Now check the provider field, e.g., ofi+sockets by
                        # checking the corresponding list in the dictionary.
                        msg = "Unexpected provider in fabric_iface! provider ="\
                            " {}; fabric_iface = {}".format(
                                provider, fabric_iface)
                        errors.append(msg)

        # Call dmg config generate --num-engines=<eth_count + 1>
        # --net-class=ethernet. Too many engines. Should fail.
        result = dmg.config_generate(access_points="wolf-a",
                                     num_engines=eth_count + 1,
                                     net_class="ethernet")
        if result.exit_status == 0:
            msg = "config generate succeeded with --net-class=ethernet, "\
                "num_engines = {}!".format(eth_count + 1)
            errors.append(msg)

        self.check_errors(errors)
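
The ib/eth threshold counting above repeats the same prefix scan twice; it can be factored into one helper, sketched below:

def count_interfaces(interfaces, prefix):
    # Count interfaces whose names start with the class prefix ("ib"/"eth").
    return sum(1 for iface in interfaces if iface.startswith(prefix))

interface_set = {"ib0", "ib1", "eth0", "lo"}
print(count_interfaces(interface_set, "ib"))   # 2
print(count_interfaces(interface_set, "eth"))  # 1
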
Example #17
    def test_min_ssds(self):
        """Test --min-ssds.

        1. Iterate the NVMe PCI dictionary and find the key that has the
        shortest list. This would be our min_ssd engine count threshold.
        2. Call dmg config generate --min-ssds=<1 to min_ssd>. Should pass.
        3. Call dmg config generate --min-ssds=<min_ssd + 1>. Should fail.
        4. Call dmg config generate --min-ssds=0. Iterate the engines field and
        verify that there's no bdev_list field.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,small
        :avocado: tags=control,config_generate_entries,min_ssds
        """
        # Get necessary storage and network info.
        self.prepare_expected_data()

        # Iterate the NVMe PCI dictionary and find the socket with the
        # shortest address list. This is our min_ssd engine count threshold.
        min_ssd = min(
            len(addrs) for addrs in self.nvme_socket_to_addrs.values())
        self.log.info("Maximum --min-ssds threshold = %d", min_ssd)

        dmg = DmgCommand(self.bin)
        dmg.exit_status_exception = False

        errors = []

        # Call dmg config generate --min-ssds=<1 to min_ssd>. Should pass.
        for num_ssd in range(1, min_ssd + 1):
            result = dmg.config_generate(access_points="wolf-a",
                                         min_ssds=num_ssd)
            if result.exit_status != 0:
                errors.append(
                    "config generate failed with min_ssd = {}!".format(
                        num_ssd))

        # Call dmg config generate --min-ssds=<min_ssd + 1>. Should fail.
        result = dmg.config_generate(access_points="wolf-a",
                                     min_ssds=min_ssd + 1)
        if result.exit_status == 0:
            errors.append(
                "config generate succeeded with min_ssd + 1 = {}!".format(
                    min_ssd + 1))

        # Call dmg config generate --min-ssds=0
        result = dmg.config_generate(access_points="wolf-a", min_ssds=0)
        generated_yaml = yaml.safe_load(result.stdout)
        # Iterate the engines and verify that there's no bdev_list field.
        engines = generated_yaml["engines"]
        for engine in engines:
            if "bdev_list" in engine:
                errors.append("bdev_list field exists with --min-ssds=0!")

        self.check_errors(errors)
Example #18
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """

    # pylint: disable=too-many-instance-attributes
    def setUp(self):
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        self.dmg.json.value = True
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("list-devices")
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))

        data = json.loads(result.stdout)
        if len(data['host_errors']) > 0:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for v in data['host_storage_map'].values():
            if v['storage']['smd_info']['devices']:
                return v['storage']['smd_info']['devices'][0]['uuid']

    def get_checksum_error_value(self, device_id=None):
        if device_id is None:
            self.fail("No device id provided")
            return
        self.dmg.json.value = True
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("device-health")
        self.dmg.sub_command_class. \
            sub_command_class. \
            sub_command_class.uuid.value = device_id
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))

        data = json.loads(result.stdout)
        if len(data['host_errors']) > 0:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for v in data['host_storage_map'].values():
            if v['storage']['smd_info']['devices']:
                dev = v['storage']['smd_info']['devices'][0]
                return dev['health']['checksum_errors']

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
                          pool/container disconnect/reconnect.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
Example #19
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes
    def setUp(self):
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        self.dmg.sub_command_class.sub_command_class.set_sub_command("smd")
        self.dmg.sub_command_class. \
            sub_command_class.sub_command_class.devices.value = True
        self.dmg.sub_command_class. \
            sub_command_class.sub_command_class.pools.value = True
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        uid = None
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^UUID:", line):
                temp = line.split()
                uid = temp[1]
                break
        return uid

    def get_checksum_error_value(self, device_id=None):
        if device_id is None:
            self.fail("No device id provided")
            return
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("blobstore-health")
        self.dmg.sub_command_class. \
            sub_command_class. \
            sub_command_class.devuuid.value = "{}".format(device_id)
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        csum_count = None
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^Checksum", line):
                temp = line.split()
                csum_count = int(temp[2])
                break
        return csum_count

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
                          pool/container disconnect/reconnect.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        csum = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
Example #20
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "OpenMpi".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration
                file parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"

        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info("<SERVER> Preparing to start daos_server on %s with %s",
                      self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates
        self.manager.job.copy_certificates(get_log_file("daosCA/certs"),
                                           self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(get_log_file("daosCA/certs"),
                                   local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # If a dmg config file is not being used, set transport security
            # here; with a config file it was already configured.
            self.dmg.insecure.update(self.get_config_value("allow_insecure"),
                                     "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update({"plm_rsh_args": "-l root"},
                                            "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "sudo rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info("Cleaning up the following device(s): %s.",
                                  ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]", "then while sudo umount $mount",
                        "do continue", "done", "fi", "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm storage.
                Defaults to None, which uses the configuration file to determine
                if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file to
                determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=40)
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format.

        Args:
            reformat (bool, optional): whether to detect reformat (instead of
                format) readiness. Defaults to False.
        """
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self, host_qty=None):
        """Detect when all the daos_io_servers have started.

        Args:
            host_qty (int): number of servers expected to have been started.

        Raises:
            ServerFailed: if there was an error starting the servers after
                formatting.

        """
        if host_qty is None:
            host_qty = len(self._hosts)
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", host_qty)
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.

        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add("sudo chown -R {0}:{0} {1}".format(
                user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager."""
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info("<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        # Temporarily increasing timeout to avoid CI errors until DAOS-5764 can
        # be further investigated.
        self.dmg.storage_format(timeout=40)

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner."""
        self.log.info("<SERVER> Stopping server %s command",
                      self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append("Error stopping the {} subprocess: {}".format(
                self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed("Failed to stop servers:\n  {}".format(
                "\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a daos_server
                configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name

        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]

        except KeyError:
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))

        return self.get_config_value(setting)

    def get_single_system_state(self):
        """Get the current homogeneous DAOS system state.

        Raises:
            ServerFailed: if a single state for all servers is not detected

        Returns:
            str: the current DAOS system state

        """
        data = self.dmg.system_query()
        if not data:
            # The regex failed to get the rank and state
            raise ServerFailed("Error obtaining {} output: {}".format(
                self.dmg, data))
        try:
            states = list(set([data[rank]["state"] for rank in data]))
        except KeyError:
            raise ServerFailed(
                "Unexpected result from {} - missing 'state' key: {}".format(
                    self.dmg, data))
        if len(states) > 1:
            # Multiple states for different ranks detected
            raise ServerFailed(
                "Multiple system states ({}) detected:\n  {}".format(
                    states, data))
        return states[0]

    def check_system_state(self, valid_states, max_checks=1):
        """Check that the DAOS system state is one of the provided states.

        Fail the test if the current state does not match one of the specified
        valid states.  Optionally the state check can loop multiple times,
        sleeping one second between checks, by increasing the number of maximum
        checks.

        Args:
            valid_states (list): expected DAOS system states as a list of
                lowercase strings
            max_checks (int, optional): number of times to check the state.
                Defaults to 1.

        Raises:
            ServerFailed: if there was an error detecting the server state or
                the detected state did not match one of the valid states

        Returns:
            str: the matching valid detected state

        """
        checks = 0
        daos_state = "????"
        while daos_state not in valid_states and checks < max_checks:
            if checks > 0:
                time.sleep(1)
            daos_state = self.get_single_system_state().lower()
            checks += 1
            self.log.info("System state check (%s): %s", checks, daos_state)
        if daos_state not in valid_states:
            raise ServerFailed(
                "Error checking DAOS state: '{}' is not one of {} after "
                "{} state check(s)!".format(daos_state, valid_states, checks))
        return daos_state
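
    # Minimal usage sketch (hypothetical caller): allow up to 10 checks, one
    # second apart, for every rank to report a stopped state:
    #   state = self.check_system_state(["stopped"], max_checks=10)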

    def system_start(self):
        """Start the DAOS IO servers.

        Raises:
            ServerFailed: if there was an error starting the servers

        """
        self.log.info("Starting DAOS IO servers")
        self.check_system_state(("stopped"))
        self.dmg.system_start()
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error starting DAOS:\n{}".format(
                self.dmg.result))

    def system_stop(self, extra_states=None):
        """Stop the DAOS IO servers.

        Args:
            extra_states (list, optional): a list of DAOS system states in
                addition to "started" and "joined" that are verified prior to
                issuing the stop. Defaults to None.

        Raises:
            ServerFailed: if there was an error stopping the servers

        """
        valid_states = ["started", "joined"]
        if extra_states:
            valid_states.extend(extra_states)
        self.log.info("Stopping DAOS IO servers")
        self.check_system_state(valid_states)
        self.dmg.system_stop(force=True)
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error stopping DAOS:\n{}".format(
                self.dmg.result))

    def get_available_storage(self):
        """Get the available SCM and NVMe storage.

        Raises:
            ServerFailed: if there was an error stopping or restarting the
                servers or scanning the storage

        Returns:
            list: a list of the maximum available SCM and NVMe sizes in bytes

        """
        def get_host_capacity(key, device_names):
            """Get the total storage capacity per host rank.

            Args:
                key (str): the capacity type, e.g. "scm" or "nvme"
                device_names (list): the device names of this capacity type

            Returns:
                dict: a dictionary of total storage capacity per host rank

            """
            host_capacity = {}
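            # The storage_scan data is assumed (sketch) to be shaped like:
            #   {host: {"scm": {dev: {"capacity": "3.2TB", ...}, ...},
            #           "nvme": {dev: {"capacity": "1.6TB", ...}, ...}}}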
            for host in data:
                device_sizes = []
                for device in data[host][key]:
                    if device in device_names:
                        device_sizes.append(
                            human_to_bytes(
                                data[host][key][device]["capacity"]))
                host_capacity[host] = sum(device_sizes)
            return host_capacity

        # Default maximum bytes for SCM and NVMe
        storage = [0, 0]

        using_dcpm = self.manager.job.using_dcpm
        using_nvme = self.manager.job.using_nvme

        if using_dcpm or using_nvme:
            # Stop the DAOS IO servers in order to be able to scan the storage
            self.system_stop()

            # Scan all of the hosts for their SCM and NVMe storage
            self.dmg.hostlist = self._hosts
            data = self.dmg.storage_scan(verbose=True)
            self.dmg.hostlist = self.get_config_value("access_points")
            if self.dmg.result.exit_status != 0:
                raise ServerFailed("Error obtaining DAOS storage:\n{}".format(
                    self.dmg.result))

            # Restart the DAOS IO servers
            self.system_start()

        if using_dcpm:
            # Find the sizes of the configured SCM storage
            scm_devices = [
                os.path.basename(path)
                for path in self.get_config_value("scm_list") if path
            ]
            capacity = get_host_capacity("scm", scm_devices)
            for host in sorted(capacity):
                self.log.info("SCM capacity for %s: %s", host, capacity[host])
            # Use the minimum SCM storage across all servers
            storage[0] = capacity[min(capacity, key=capacity.get)]
        else:
            # Use the assigned scm_size
            scm_size = self.get_config_value("scm_size")
            storage[0] = human_to_bytes("{}GB".format(scm_size))

        if using_nvme:
            # Find the sizes of the configured NVMe storage
            capacity = get_host_capacity("nvme",
                                         self.get_config_value("bdev_list"))
            for host in sorted(capacity):
                self.log.info("NVMe capacity for %s: %s", host, capacity[host])
            # Use the minimum NVMe storage across all servers
            storage[1] = capacity[min(capacity, key=capacity.get)]

        self.log.info(
            "Total available storage:\n  SCM:  %s (%s)\n  NVMe: %s (%s)",
            str(storage[0]), bytes_to_human(storage[0], binary=False),
            str(storage[1]), bytes_to_human(storage[1], binary=False))
        return storage
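
    # Hypothetical usage from a test method, unpacking the returned list:
    #   scm_bytes, nvme_bytes = self.server_managers[0].get_available_storage()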
Example #21
0
    def pool_acl_verification(self, current_user_acl, read, write):
        '''
        Description:
            Daos pool security verification with acl file.
            Steps:
                (1) Set up the dmg tool for creating a pool
                (2) Generate an acl file with permissions
                (3) Create a pool with the acl
                (4) Verify the pool create status
                (5) Get the pool's acl list
                (6) Verify the pool read operation
                (7) Verify the pool write operation
                (8) Clean up the user and destroy the pool
        Args:
            current_user_acl: acl with read/write access credential.
            read: expected read permission.
            write: expected write permission.
        Return:
            pass to continue; fail to report to the test log and stop.
        '''

        # (1)Create daos_shell command
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.get_params(self)
        port = self.params.get("port", "/run/server_config/*", 10001)
        get_acl_file = self.params.get("acl_file", "/run/pool_acl/*",
                                       "acl_test.txt")
        acl_file = os.path.join(self.tmp, get_acl_file)
        num_user = self.params.get("num_user", "/run/pool_acl/*")
        num_group = self.params.get("num_group", "/run/pool_acl/*")
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers
        ]
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        self.log.info("  (1)dmg= %s", dmg)

        # (2)Generate acl file with permissions
        self.log.info("  (2)Generate acl file with user/group permissions")
        permission_list = self.create_pool_acl(num_user, num_group,
                                               current_user_acl, acl_file)

        # (3)Create a pool with acl
        self.log.info("  (3)Create a pool with acl")
        dmg.action_command.acl_file.value = acl_file
        dmg.exit_status_exception = False
        result = dmg.run()

        # (4)Verify the pool create status
        self.log.info("  (4)dmg.run() result=\n%s", result)
        if result.stderr == "":
            uuid, svc = dmg_utils.get_pool_uuid_service_replicas_from_stdout(
                result.stdout)
        else:
            self.fail("##(4)Unable to parse pool uuid and svc.")

        # (5)Get the pool's acl list
        #    dmg pool get-acl --pool <UUID>
        self.log.info("  (5)Get a pool's acl list by: "
                      "dmg pool get-acl --pool --hostlist")
        pool_acl_list = self.get_pool_acl_list(uuid)
        self.log.info("   pool original permission_list: %s", permission_list)
        self.log.info("   pool get_acl  permission_list: %s", pool_acl_list)

        # (6)Verify pool read operation
        #    daos pool query --pool <uuid>
        self.log.info("  (6)Verify pool read by: daos pool query --pool")
        self.verify_pool_readwrite(svc, uuid, "read", expect=read)

        # (7)Verify pool write operation
        #    daos container create --pool <uuid>
        self.log.info("  (7)Verify pool write by: daos container create --pool")
        self.verify_pool_readwrite(svc, uuid, "write", expect=write)

        # (8)Cleanup user and destroy pool
        self.log.info("  (8)Cleanup user and destroy pool")
        self.cleanup_user_group(num_user, num_group)
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.request.value = "pool"
        dmg.action.value = "destroy --pool={}".format(uuid)
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        result = dmg.run()
        return
Example #22
0
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }
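
    # For example, get_environment_value("OFI_INTERFACE") looks up the
    # "fabric_iface" setting in the daos_server yaml configuration.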

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "OpenMpi".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration
                file parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"

        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates
        self.manager.job.copy_certificates(
            get_log_file("daosCA/certs"), self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(
            get_log_file("daosCA/certs"), local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # Configure transport security here; when a dmg config file is
            # used it has already been configured.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update(
                        {"plm_rsh_args": "-l root"}, "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info(
                        "Cleaning up the following device(s): %s.",
                        ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
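                    # Joined with "; ", the list above expands to a single
                    # shell line, e.g. with scm_list = ["/dev/pmem0"]:
                    #   for dev in /dev/pmem0; do mount=$(lsblk $dev -n -o
                    #   MOUNTPOINT); if [ ! -z $mount ]; then while sudo umount
                    #   $mount; do continue; done; fi; sudo wipefs -a $dev; done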
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm storage.
                Defaults to None, which uses the configuration file to determine
                if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file to
                determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format."""
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self):
        """Detect when all the daos_io_servers have started."""
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", len(self._hosts))
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.

        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add(
                "sudo chown -R {0}:{0} {1}".format(user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager."""
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info(
            "<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        self.dmg.storage_format()

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner."""
        self.log.info(
            "<SERVER> Stopping server %s command", self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append(
                "Error stopping the {} subprocess: {}".format(
                    self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed(
                "Failed to stop servers:\n  {}".format("\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a daos_server
                configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name

        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]

        except KeyError:
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))

        return self.get_config_value(setting)
Example #23
0
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This test creates 40 pools and verifies that dmg
                  list-pools, device-health, and nvme-health work for all
                  of them.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')
        # Stop the servers to run the SPDK tool to get the server capacity
        self.stop_servers()
        storage = self.get_nvme_max_capacity()
        self.start_servers()

        # Create the pools from 80% of the available storage space
        single_pool_nvme_size = int((storage * 0.80) / no_of_pools)

        self.pool = []
        # Create the large number of pools
        for _pool in range(no_of_pools):
            pool = TestPool(self.context, dmg_command=self.get_dmg_command())
            pool.get_params(self)
            # SCM size is 10% of NVMe
            pool.scm_size.update('{}'.format(int(single_pool_nvme_size *
                                                 0.10)))
            pool.nvme_size.update('{}'.format(single_pool_nvme_size))
            pool.create()
            self.pool.append(pool)

        # Initialize the dmg command
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # List all pools
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-pools")
        for host in self.hostlist_servers:
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Verify all pool UUIDs are listed as part of the query
            for pool in self.pool:
                if pool.uuid.lower() not in result.stdout:
                    self.fail('Pool uuid {} not found in smd query'.format(
                        pool.uuid.lower()))

        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))

        # Get the nvme-health
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
Example #24
0
class TestPool(TestDaosApiBase):
    """A class for functional testing of DaosPools objects."""
    # Constants to define whether to use API or dmg to create and destroy
    # pool.
    USE_API = "API"
    USE_DMG = "dmg"

    def __init__(self, context, log=None, cb_handler=None, dmg_bin_path=None):
        # pylint: disable=unused-argument
        """Initialize a TestPool object.

        Note: 'log' is now a defunct argument and will be removed in the future

        Args:
            context (DaosContext): the daos context used for the API calls
            log (logging): logging object used to report the pool status
            cb_handler (CallbackHandler, optional): callback object to use with
                the API methods. Defaults to None.
            dmg_bin_path (str, optional): directory where the dmg binary is
                installed; required to create or destroy pools with dmg.
                Defaults to None.
        """
        super(TestPool, self).__init__("/run/pool/*", cb_handler)
        self.context = context
        self.uid = os.geteuid()
        self.gid = os.getegid()

        self.mode = BasicParameter(None)
        self.name = BasicParameter(None)  # server group name
        self.svcn = BasicParameter(None)
        self.target_list = BasicParameter(None)
        self.scm_size = BasicParameter(None)
        self.nvme_size = BasicParameter(None)
        # Set USE_API to use API or USE_DMG to use dmg. If it's not set, API is
        # used.
        self.control_method = BasicParameter(self.USE_API, self.USE_API)
        uname = getpass.getuser()
        gname = grp.getgrnam(uname)[0]
        self.username = BasicParameter(uname, uname)
        self.groupname = BasicParameter(gname, gname)

        self.pool = None
        self.uuid = None
        self.info = None
        self.svc_ranks = None
        self.connected = False
        self.dmg = None
        # Required to use dmg. It defines the directory where dmg is installed.
        # Use self.basepath + '/install/bin' in the test
        self.dmg_bin_path = dmg_bin_path
        if dmg_bin_path is not None:
            # We make dmg as the member of this class because the test would
            # have more flexibility over the usage of the command.
            self.dmg = DmgCommand(self.dmg_bin_path)
            self.dmg.insecure.value = True
            self.dmg.request.value = "pool"

    @fail_on(CommandFailure)
    @fail_on(DaosApiError)
    def create(self):
        """Create a pool with either API or dmg.

        To use dmg, the test needs to set control_method.value to USE_DMG
        prior to calling this method. The recommended way is to specify the
        pool block in yaml. For example,

        pool:
            control_method: dmg

        This tells this method to use dmg. The test also needs to set
        dmg_bin_path through the constructor if dmg is used. For example,

        self.pool = TestPool(self.context,
                             dmg_bin_path=self.basepath + '/install/bin')

        If it wants to use --nsvc option, it needs to set the value to
        svcn.value. Otherwise, 1 is used. If it wants to use --group, it needs
        to set groupname.value. If it wants to use --user, it needs to set
        username.value. If it wants to add other options, directly set it
        to self.dmg.action_command. Refer to the pool_create method in
        dmg_utils.py for more details.

        To test the negative case on create, the test needs to catch
        CommandFailure for dmg and DaosApiError for the API, so more than a
        one-line modification to the test is needed for this purpose alone.
        Currently, pool_svc is the only test that needs this change.
        """
        self.destroy()
        if self.target_list.value is not None:
            self.log.info("Creating a pool on targets %s",
                          self.target_list.value)
        else:
            self.log.info("Creating a pool")
        self.pool = DaosPool(self.context)
        if self.control_method.value == self.USE_API:
            kwargs = {
                "mode": self.mode.value,
                "uid": self.uid,
                "gid": self.gid,
                "scm_size": self.scm_size.value,
                "group": self.name.value
            }
            for key in ("target_list", "svcn", "nvme_size"):
                value = getattr(self, key).value
                if value is not None:
                    kwargs[key] = value
            self._call_method(self.pool.create, kwargs)

            self.svc_ranks = [
                int(self.pool.svc.rl_ranks[index])
                for index in range(self.pool.svc.rl_nr)
            ]
        else:
            if self.dmg is None:
                raise DaosTestError(
                    "self.dmg is None. dmg_bin_path needs to be set through "
                    "the constructor of TestPool to create pool with dmg.")
            # Currently, there is one test that creates the pool over the
            # subset of the server hosts; pool/evict_test. To do so, the test
            # needs to set the rank(s) to target_list.value starting from 0.
            # e.g., if you're using 4 server hosts; wolf-1, wolf-2, wolf-3, and
            # wolf-4, and want to create a pool over the first two hosts;
            # wolf-1 and 2, then set the list [0, 1] to target_list.value.
            # We'll convert it to the comma separated string and set it to dmg.
            # For instance, [0, 1] will result in dmg pool create -r 0,1. If
            # you don't set target_list.value, -r won't be used, in which case
            # the pool is created over all the server hosts.
            if self.target_list.value is None:
                ranks_comma_separated = None
            else:
                # e.g. [0, 1] becomes "0,1"
                ranks_comma_separated = ",".join(
                    str(rank) for rank in self.target_list.value)
            # Call the dmg pool create command
            self.dmg.action.value = "create"
            self.dmg.get_action_command()
            # uid/gid used in API correspond to --user and --group in dmg.
            # group, or self.name.value, used in API is called server group and
            # it's different from the group name passed in to --group. Server
            # group isn't used in dmg. We don't pass it into the command, but
            # we'll still use it to set self.pool.group
            self.dmg.action_command.group.value = self.groupname.value
            self.dmg.action_command.user.value = self.username.value
            self.dmg.action_command.scm_size.value = self.scm_size.value
            self.dmg.action_command.ranks.value = ranks_comma_separated
            self.dmg.action_command.nsvc.value = self.svcn.value
            create_result = self.dmg.run()
            self.log.info("Result stdout = %s", create_result.stdout)
            self.log.info("Result exit status = %s", create_result.exit_status)
            # Get UUID and service replica from the output
            uuid_svc = get_pool_uuid_service_replicas_from_stdout(
                create_result.stdout)
            new_uuid = uuid_svc[0]
            service_replica = uuid_svc[1]
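            # The helper above is assumed to parse dmg stdout resembling
            # (sketch): "... UUID: <uuid>, Service replicas: <ranks>".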

            # 3. Create DaosPool object. The process is similar to the one in
            # DaosPool.create, but there are some modifications
            if self.name.value is None:
                self.pool.group = None
            else:
                self.pool.group = ctypes.create_string_buffer(self.name.value)
            # Modification 1: Use the length of service_replica returned by dmg
            # to calculate rank_t. Note that we assume we always get a single
            # number. I'm not sure if we ever get multiple numbers, but in that
            # case, we need to modify this implementation to create a list out
            # of the multiple numbers possibly separated by comma
            service_replicas = [int(service_replica)]
            rank_t = ctypes.c_uint * len(service_replicas)
            # Modification 2: Use the service_replicas list to generate rank.
            # In DaosPool, we first use some garbage 999999 values and let DAOS
            # set the correct values, but we can't do that here, so we need to
            # set the correct rank value by ourself
            rank = rank_t(*service_replicas)
            rl_ranks = ctypes.POINTER(ctypes.c_uint)(rank)
            # Modification 3: Similar to 1. Use the length of service_replicas
            # list instead of self.svcn.value
            self.pool.svc = daos_cref.RankList(rl_ranks, len(service_replicas))

            # 4. Set the UUID and mark the DaosPool object as attached
            self.pool.set_uuid_str(new_uuid)
            self.pool.attached = 1

        self.uuid = self.pool.get_uuid_str()

    @fail_on(DaosApiError)
    def connect(self, permission=1):
        """Connect to the pool.

        Args:
            permission (int, optional): connect permission. Defaults to 1.

        Returns:
            bool: True if the pool has been connected; False if the pool was
                already connected or the pool is not defined.

        """
        if self.pool and not self.connected:
            kwargs = {"flags": 1 << permission}
            self.log.info(
                "Connecting to pool %s with permission %s (flag: %s)",
                self.uuid, permission, kwargs["flags"])
            self._call_method(self.pool.connect, kwargs)
            self.connected = True
            return True
        return False

    @fail_on(DaosApiError)
    def disconnect(self):
        """Disconnect from connected pool.

        Returns:
            bool: True if the pool has been disconnected; False if the pool was
                already disconnected or the pool is not defined.

        """
        if self.pool and self.connected:
            self.log.info("Disonnecting from pool %s", self.uuid)
            self._call_method(self.pool.disconnect, {})
            self.connected = False
            return True
        return False

    @fail_on(CommandFailure)
    @fail_on(DaosApiError)
    def destroy(self, force=1):
        """Destroy the pool with either API or dmg.

        It uses control_method member previously set, so if you want to use the
        other way for some reason, update it before calling this method.

        Args:
            force (int, optional): force flag. Defaults to 1.

        Returns:
            bool: True if the pool has been destroyed; False if the pool is not
                defined.
        """
        if self.pool:
            self.disconnect()
            self.log.info("Destroying pool %s", self.uuid)
            if self.control_method.value == self.USE_API:
                if self.pool.attached:
                    self._call_method(self.pool.destroy, {"force": force})
            elif self.control_method.value == self.USE_DMG:
                if self.pool.attached:
                    self.dmg.action.value = "destroy"
                    self.dmg.get_action_command()
                    self.dmg.action_command.pool.value = self.uuid
                    self.dmg.action_command.force.value = force
                    self.dmg.run()
            else:
                self.log.error("Cannot destroy pool! Use USE_API or USE_DMG")
                return False
            self.pool = None
            self.uuid = None
            self.info = None
            self.svc_ranks = None
            return True
        return False

    @fail_on(DaosApiError)
    def get_info(self):
        """Query the pool for information.

        Sets the self.info attribute.
        """
        if self.pool:
            self.connect()
            self._call_method(self.pool.pool_query, {})
            self.info = self.pool.pool_info

    def check_pool_info(self,
                        pi_uuid=None,
                        pi_ntargets=None,
                        pi_nnodes=None,
                        pi_ndisabled=None,
                        pi_map_ver=None,
                        pi_leader=None,
                        pi_bits=None):
        # pylint: disable=unused-argument
        """Check the pool info attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            pi_uuid (str, optional): pool uuid. Defaults to None.
            pi_ntargets (int, optional): number of targets. Defaults to None.
            pi_nnodes (int, optional): number of nodes. Defaults to None.
            pi_ndisabled (int, optional): number of disabled. Defaults to None.
            pi_map_ver (int, optional): pool map version. Defaults to None.
            pi_leader (int, optional): pool leader. Defaults to None.
            pi_bits (int, optional): pool bits. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise

        """
        self.get_info()
        checks = [(key, c_uuid_to_str(getattr(self.info, key))
                   if key == "pi_uuid" else getattr(self.info, key), val)
                  for key, val in locals().items()
                  if key != "self" and val is not None]
        return self._check_info(checks)

    def check_pool_space(self,
                         ps_free_min=None,
                         ps_free_max=None,
                         ps_free_mean=None,
                         ps_ntargets=None,
                         ps_padding=None):
        # pylint: disable=unused-argument
        """Check the pool info space attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            ps_free_min (list, optional): minimum free space per device.
                Defaults to None.
            ps_free_max (list, optional): maximum free space per device.
                Defaults to None.
            ps_free_mean (list, optional): mean free space per device.
                Defaults to None.
            ps_ntargets (int, optional): number of targets. Defaults to None.
            ps_padding (int, optional): space padding. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise

        """
        self.get_info()
        checks = []
        for key in ("ps_free_min", "ps_free_max", "ps_free_mean"):
            val = locals()[key]
            if isinstance(val, list):
                for index, item in enumerate(val):
                    checks.append(("{}[{}]".format(key, index),
                                   getattr(self.info.pi_space,
                                           key)[index], item))
        for key in ("ps_ntargets", "ps_padding"):
            val = locals()[key]
            if val is not None:
                checks.append((key, getattr(self.info.pi_space, key), val))
        return self._check_info(checks)

    def check_pool_daos_space(self, s_total=None, s_free=None):
        # pylint: disable=unused-argument
        """Check the pool info daos space attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            s_total (list, optional): total space per device. Defaults to None.
            s_free (list, optional): free space per device. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise

        """
        self.get_info()
        checks = [("{}_{}".format(key, index),
                   getattr(self.info.pi_space.ps_space, key)[index], item)
                  for key, val in locals().items()
                  if key != "self" and val is not None
                  for index, item in enumerate(val)]
        return self._check_info(checks)

    def check_rebuild_status(self,
                             rs_version=None,
                             rs_seconds=None,
                             rs_errno=None,
                             rs_done=None,
                             rs_padding32=None,
                             rs_fail_rank=None,
                             rs_toberb_obj_nr=None,
                             rs_obj_nr=None,
                             rs_rec_nr=None,
                             rs_size=None):
        # pylint: disable=unused-argument
        # pylint: disable=too-many-arguments
        """Check the pool info rebuild attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            rs_version (int, optional): rebuild version. Defaults to None.
            rs_seconds (int, optional): rebuild seconds. Defaults to None.
            rs_errno (int, optional): rebuild error number. Defaults to None.
            rs_done (int, optional): rebuild done flag. Defaults to None.
            rs_padding32 (int, optional): padding. Defaults to None.
            rs_fail_rank (int, optional): rebuild fail target. Defaults to None.
            rs_toberb_obj_nr (int, optional): number of objects to be rebuilt.
                Defaults to None.
            rs_obj_nr (int, optional): number of rebuilt objects.
                Defaults to None.
            rs_rec_nr (int, optional): number of rebuilt records.
                Defaults to None.
            rs_size (int, optional): size of all rebuilt records.
                Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise

        """
        self.get_info()
        checks = [(key, getattr(self.info.pi_rebuild_st, key), val)
                  for key, val in locals().items()
                  if key != "self" and val is not None]
        return self._check_info(checks)

    def rebuild_complete(self):
        """Determine if the pool rebuild is complete.

        Returns:
            bool: True if pool rebuild is complete; False otherwise

        """
        self.display_pool_rebuild_status()
        return self.info.pi_rebuild_st.rs_done == 1

    def wait_for_rebuild(self, to_start, interval=1):
        """Wait for the rebuild to start or end.

        Args:
            to_start (bool): whether to wait for rebuild to start or end
            interval (int): number of seconds to wait in between rebuild
                completion checks
        """
        self.log.info("Waiting for rebuild to %s ...",
                      "start" if to_start else "complete")
        while self.rebuild_complete() == to_start:
            self.log.info("  Rebuild %s ...",
                          "has not yet started" if to_start else "in progress")
            sleep(interval)
        self.log.info("Rebuild %s detected",
                      "start" if to_start else "completion")

    @fail_on(DaosApiError)
    def start_rebuild(self, ranks, daos_log):
        """Kill the specific server ranks using this pool.

        Args:
            ranks (list): a list of daos server ranks (int) to kill
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if the server ranks have been killed and the ranks have
            been excluded from the pool; False if the pool is undefined

        """
        msg = "Killing DAOS ranks {} from server group {}".format(
            ranks, self.name.value)
        self.log.info(msg)
        daos_log.info(msg)
        for rank in ranks:
            server = DaosServer(self.context, self.name.value, rank)
            self._call_method(server.kill, {"force": 1})
        return self.exclude(ranks, daos_log)

    @fail_on(DaosApiError)
    def exclude(self, ranks, daos_log):
        """Manually exclude a rank from this pool.

        Args:
            ranks (list): a list of daos server ranks (int) to exclude
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if the ranks were excluded from the pool; False if the
                pool is undefined

        """
        if self.pool:
            msg = "Excluding server ranks {} from pool {}".format(
                ranks, self.uuid)
            self.log.info(msg)
            daos_log.info(msg)
            self._call_method(self.pool.exclude, {"rank_list": ranks})
            return True
        return False

    def check_files(self, hosts):
        """Check if pool files exist on the specified list of hosts.

        Args:
            hosts (list): list of hosts

        Returns:
            bool: True if the files for this pool exist on each host; False
                otherwise

        """
        return check_pool_files(self.log, hosts, self.uuid.lower())

    def write_file(self, orterun, processes, hostfile, size, timeout=60):
        """Write a file to the pool.

        Args:
            orterun (str): full path to the orterun command
            processes (int): number of processes to launch
            hostfile (str): path to the hostfile listing the clients from
                which to write the file
            size (int): size of the file to create in bytes
            timeout (int, optional): number of seconds before timing out the
                command. Defaults to 60 seconds.

        Returns:
            process.CmdResult: command execution result

        """
        self.log.info("Writing %s bytes to pool %s", size, self.uuid)
        env = {
            "DAOS_POOL": self.uuid,
            "DAOS_SVCL": "1",
            "DAOS_SINGLETON_CLI": "1",
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
        }
        load_mpi("openmpi")
        current_path = os.path.dirname(os.path.abspath(__file__))
        command = "{} --np {} --hostfile {} {} {} testfile".format(
            orterun, processes, hostfile,
            os.path.join(current_path, "write_some_data.py"), size)
        return process.run(command, timeout, True, False, "both", True, env)

    def get_pool_daos_space(self):
        """Get the pool info daos space attributes as a dictionary.

        Returns:
            dict: a dictionary of lists of the daos space attributes

        """
        self.get_info()
        keys = ("s_total", "s_free")
        return {key: getattr(self.info.pi_space.ps_space, key) for key in keys}

    def display_pool_daos_space(self, msg=None):
        """Display the pool info daos space attributes.

        Args:
            msg (str, optional): optional text to include in the output.
                Defaults to None.
        """
        daos_space = self.get_pool_daos_space()
        sizes = [
            "{}[{}]={}".format(key, index, item)
            for key in sorted(daos_space.keys())
            for index, item in enumerate(daos_space[key])
        ]
        self.log.info("Pool %s space%s:\n  %s", self.uuid,
                      " " + msg if isinstance(msg, str) else "",
                      "\n  ".join(sizes))

    def get_pool_rebuild_status(self):
        """Get the pool info rebuild status attributes as a dictionary.

        Returns:
            dict: a dictionary of lists of the rebuild status attributes

        """
        self.get_info()
        keys = ("rs_version", "rs_pad_32", "rs_errno", "rs_done",
                "rs_toberb_obj_nr", "rs_obj_nr", "rs_rec_nr")
        return {key: getattr(self.info.pi_rebuild_st, key) for key in keys}

    def display_pool_rebuild_status(self):
        """Display the pool info rebuild status attributes."""
        status = self.get_pool_rebuild_status()
        self.log.info(
            "Pool rebuild status: %s", ", ".join(
                ["{}={}".format(key, status[key]) for key in sorted(status)]))

    def read_data_during_rebuild(self, container):
        """Read data from the container while rebuild is active.

        Args:
            container (TestContainer): container from which to read data

        Returns:
            bool: True if all the data is read successfully before rebuild
                completes; False otherwise

        """
        container.open()
        self.log.info("Reading objects in container %s during rebuild",
                      self.uuid)

        # Attempt to read all of the data from the container during rebuild
        index = 0
        status = read_incomplete = index < len(container.written_data)
        while not self.rebuild_complete() and read_incomplete:
            try:
                status &= container.written_data[index].read_object(container)
            except DaosTestError as error:
                self.log.error(str(error))
                status = False
            index += 1
            read_incomplete = index < len(container.written_data)

        # Verify that all of the container data was read successfully
        if read_incomplete:
            self.log.error(
                "Rebuild completed before all the written data could be read")
            status = False
        elif not status:
            self.log.error("Errors detected reading data during rebuild")
        return status