def get_pool_acl_list(self, uuid):
    '''
    Description:
        Get daos pool acl list by dmg get-acl.
    Args:
        uuid: pool uuid number.
    Return:
        pool_permission_list: daos pool acl list.
    '''
    # Build "dmg pool get-acl --pool <uuid>" aimed at every server.
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.request.value = "pool"
    dmg.action.value = "get-acl --pool " + uuid
    port = self.params.get("port", "/run/server_config/*")
    servers_with_ports = [
        "{}:{}".format(host, port) for host in self.hostlist_servers]
    dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
    result = dmg.run()

    pool_permission_list = []
    for line in result.stdout.splitlines():
        # User entries look like "A::<user>@:<perms>", group entries like
        # "A:G:<group>@:<perms>"; collect any line matching either form.
        if line.startswith("A::") and re.search(r"A::(.+)@:(.*)", line):
            pool_permission_list.append(line)
        elif line.startswith("A:G:") and re.search(r"A:G:(.+)@:(.*)", line):
            pool_permission_list.append(line)
    return pool_permission_list
def update_pool_acl_entry(self, uuid, action, entry):
    '''
    Description:
        Update daos pool acl list by dmg tool.
    Args:
        uuid: pool uuid.
        action: update-acl or delete-acl.
        entry: pool acl entry or principal to be updated.
    Return:
        none.
    '''
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.request.value = "pool"
    # NOTE: compare strings with "==", not "is" -- identity comparison of
    # string literals only works by accident of CPython interning.
    if action == "delete":
        dmg.action.value = "delete-acl --pool " + uuid
        dmg.action.value += " --principal " + entry
    elif action == "update":
        dmg.action.value = "update-acl --pool " + uuid
        dmg.action.value += " --entry " + entry
    else:
        self.fail("##update_pool_acl_entry, action: {} is not supported."
                  "\n  supported action: update, delete.".format(action))
    port = self.params.get("port", "/run/server_config/*")
    servers_with_ports = [
        "{}:{}".format(host, port) for host in self.hostlist_servers]
    dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
    result = dmg.run()
    self.log.info(
        " At update_pool_acl_entry, dmg.run result=\n %s", result)
def verify_access_point(self, host_port_input, failure_expected=None):
    """Run with given AP and verify the AP in the output.

    Args:
        host_port_input (str): Host:Port or just Host. Supports multiple
            APs that are separated by comma.
        failure_expected (str): Expected error message. Set it to None if
            not expecting any error. Defaults to None.

    Returns:
        list: List of errors.
    """
    errors = []

    check = {}
    check["expected"] = host_port_input.split(",")
    if ":" not in host_port_input:
        # dmg automatically sets 10001 if it's not given in the input.
        check["expected"] = [
            "{}:10001".format(host) for host in check["expected"]]

    # Create a new DmgCommand and set its exit_status_exception to False to
    # make it not raise a TestFailure when the command failed. Then we'll be
    # able to check result.exit_status for our testing purpose.
    dmg = DmgCommand(self.bin)
    dmg.exit_status_exception = False

    try:
        result = dmg.config_generate(access_points=host_port_input)
    except CommandFailure as err:
        errors.append("Unexpected failure! {}".format(err))
        # "result" is unbound when the command raised; the checks below
        # would crash with a NameError, so report the error and stop here.
        return errors

    if result.exit_status == 0 and failure_expected is None:
        try:
            yaml_data = yaml.safe_load(result.stdout)
            check["actual"] = yaml_data["access_points"]
            if sorted(check["expected"]) != sorted(check["actual"]):
                errors.append("Unexpected access point: {} != {}".format(
                    check["expected"], check["actual"]))
        except yaml.YAMLError as error:
            errors.append(
                "Error loading dmg generated config!: {}".format(error))
    elif result.exit_status == 0 and failure_expected is not None:
        errors.append(
            "dmg command passed when expected to fail!: {}".format(result))
    elif result.exit_status != 0 and failure_expected is not None:
        if failure_expected not in result.stderr_text:
            errors.append(
                "Missing expected error message in failed dmg command!: " +
                "{}".format(result))
    else:
        errors.append(
            "dmg command failed when expected to pass!: {}".format(result))

    return errors
def setUp(self):
    """Prepare a dmg command preset to the "storage query" sub-command."""
    super(CSumErrorLog, self).setUp()
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.get_params(self)
    dmg.hostlist = self.hostlist_servers[0]
    allow_insecure = self.server_managers[0].get_config_value(
        "allow_insecure")
    dmg.insecure.update(allow_insecure, "dmg.insecure")
    dmg.set_sub_command("storage")
    dmg.sub_command_class.set_sub_command("query")
    self.dmg = dmg
class NvmeFault(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate IO works fine when NVMe fault
    generated on single or multiple servers with single drive.
    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(NvmeFault, self).setUp()
        self.no_of_pools = self.params.get(
            "number_of_pools", '/run/pool/*', 1)
        self.capacity = self.params.get(
            "percentage", '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get(
            "count", '/run/faulttests/no_of_drives/*/')
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        # Set to True to generate the NVMe fault during IO
        self.set_faulty_device = True

    @skipForTicket("DAOS-5497")
    def test_nvme_fault(self):
        """Jira ID: DAOS-4722.

        Test Description: Test NVMe disk fault.
        Use Case: Create the large size of pool and start filling up the
                  pool. while IO is in progress remove single disks from
                  single/multiple servers.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_fault,full_regression
        """
        # Create the Pool with Maximum NVMe size
        self.create_pool_max_size(nvme=True)

        # Start the IOR Command and generate the NVMe fault.
        self.start_ior_load(precent=self.capacity)

        # Use the test logger instead of print() so the usage shows up in
        # the job log.
        self.log.info(
            "pool_percentage_used -- After -- %s",
            self.pool.pool_percentage_used())

        # Check the nvme-health command works; include the command failure
        # details in the message (the original discarded the exception).
        try:
            self.dmg.hostlist = self.hostlist_servers
            self.dmg.storage_query_nvme_health()
        except CommandFailure as error:
            self.fail("dmg nvme-health failed: {}".format(error))
def test_num_engines(self):
    """Test --num-engines.

    1. Using the NVMe PCI dictionary, find the number of keys. i.e., number
    of Socket IDs. This would determine the maximum number of engines.
    2. Call dmg config generate --num-engines=<1 to max_engine>. Should
    pass.
    3. Call dmg config generate --num-engines=<max_engine + 1> Should fail.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,small
    :avocado: tags=control,config_generate_entries,num_engines
    """
    # Get necessary storage and network info.
    self.prepare_expected_data()

    # Find the maximum number of engines we can use. It's the number of
    # sockets in NVMe. However, I'm not sure if we need to have the same
    # number of interfaces. Go over this step if we have issue with the
    # max_engine assumption.
    max_engine = len(list(self.nvme_socket_to_addrs.keys()))
    self.log.info("max_engine threshold = %s", max_engine)

    dmg = DmgCommand(self.bin)
    dmg.exit_status_exception = False
    errors = []

    # Call dmg config generate --num-engines=<1 to max_engine>
    for num_engines in range(1, max_engine + 1):
        result = dmg.config_generate(
            access_points="wolf-a", num_engines=num_engines)

        # Verify the command passed before parsing its stdout; a failed
        # command would make yaml.safe_load() raise or return garbage.
        if result.exit_status != 0:
            errors.append(
                "config generate failed with --num-engines = {}!".format(
                    num_engines))
            continue

        generated_yaml = yaml.safe_load(result.stdout)
        actual_num_engines = len(generated_yaml["engines"])

        # Verify the number of engine field.
        if actual_num_engines != num_engines:
            msg = "Unexpected number of engine field! Expected = {}; "\
                "Actual = {}".format(num_engines, actual_num_engines)
            errors.append(msg)

    # Verify that max_engine + 1 fails.
    result = dmg.config_generate(
        access_points="wolf-a", num_engines=max_engine + 1)
    if result.exit_status == 0:
        errors.append(
            "Host + invalid num engines succeeded with {}!".format(
                max_engine + 1))

    self.check_errors(errors)
def get_dmg_command(self, index=0):
    """Get a DmgCommand setup to interact with server manager index.

    Return a DmgCommand object configured with:
        - the "-l" parameter assigned to the server's access point list
        - the "-i" parameter assigned to the server's interactive mode

    This method is intended to be used by tests that wants to use dmg to
    create and destroy pool. Pass in the object to TestPool constructor.

    Access point should be passed in to -l regardless of the number of
    servers.

    Args:
        index (int, optional): Server index. Defaults to 0.

    Returns:
        DmgCommand: New DmgCommand object.
    """
    # Prefer the dmg command already attached to a running server manager.
    if self.server_managers:
        return self.server_managers[index].dmg

    # Otherwise build a stand-alone dmg configuration targeting the first
    # server host.
    config_file = self.get_config_file("daos", "dmg")
    credentials = DmgTransportCredentials(self.workdir)
    config = DmgYamlParameters(config_file, self.server_group, credentials)
    config.hostlist.update(self.hostlist_servers[:1], "dmg.yaml.hostlist")
    return DmgCommand(self.bin, config)
def get_dmg_command(self, index=0):
    """Get a DmgCommand setup to interact with server manager index.

    Return a DmgCommand object configured with:
        - the "-l" parameter assigned to the server's access point list
        - the "-i" parameter assigned to the server's interactive mode

    This method is intended to be used by tests that wants to use dmg to
    create and destroy pool. Pass in the object to TestPool constructor.

    Access point should be passed in to -l regardless of the number of
    servers.

    Args:
        index (int, optional): Server index. Defaults to 0.

    Returns:
        DmgCommand: New DmgCommand object.
    """
    manager = self.server_managers[index]
    dmg = DmgCommand(self.bin)
    # Point dmg at the selected server's access points and copy its
    # insecure-mode setting.
    dmg.hostlist.value = (
        manager.runner.job.yaml_params.access_points.value)
    dmg.insecure.value = manager.insecure.value
    return dmg
def setUp(self):
    """Set up for test case."""
    super(NvmeFault, self).setUp()
    # Pull the fault-test parameters from the yaml configuration.
    self.no_of_pools = self.params.get(
        "number_of_pools", '/run/pool/*', 1)
    self.capacity = self.params.get(
        "percentage", '/run/faulttests/pool_capacity/*')
    self.no_of_servers = self.params.get(
        "count", '/run/faulttests/no_of_servers/*/')
    self.no_of_drives = self.params.get(
        "count", '/run/faulttests/no_of_drives/*/')
    # Build the dmg command, matching the server's insecure setting.
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.get_params(self)
    dmg.insecure.update(
        self.server_managers[0].get_config_value("allow_insecure"),
        "dmg.insecure")
    self.dmg = dmg
    # Set to True to generate the NVMe fault during IO
    self.set_faulty_device = True
def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
    """Initialize a DaosServerManager object.

    Args:
        server_command (ServerCommand): server command object
        manager (str, optional): the name of the JobManager class used to
            manage the YamlCommand defined through the "job" attribute.
            Defaults to "Orterun".
        dmg_cfg (DmgYamlParameters, optional): The dmg configuration file
            parameters used to connect to this group of servers.
    """
    super(DaosServerManager, self).__init__(server_command, manager)
    # Run the server through its "start" sub-command.
    self.manager.job.sub_command_override = "start"
    # Dmg command to access this group of servers which will be configured
    # to access the daos_servers when they are started
    self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)
def __init__(self, context, log=None, cb_handler=None, dmg_bin_path=None):
    # pylint: disable=unused-argument
    """Initialize a TestPool object.

    Note: 'log' is now a defunct argument and will be removed in the future

    Args:
        context (DaosContext): [description]
        log (logging): logging object used to report the pool status
        cb_handler (CallbackHandler, optional): callback object to use with
            the API methods. Defaults to None.
        dmg_bin_path (str, optional): directory where the dmg binary is
            installed. When provided, a DmgCommand member is created so the
            pool can be managed through dmg. Defaults to None.
    """
    super(TestPool, self).__init__("/run/pool/*", cb_handler)
    self.context = context
    # Effective uid/gid of the process running the test.
    self.uid = os.geteuid()
    self.gid = os.getegid()
    # Pool creation parameters (populated from the test yaml).
    self.mode = BasicParameter(None)
    self.name = BasicParameter(None)  # server group name
    self.svcn = BasicParameter(None)
    self.target_list = BasicParameter(None)
    self.scm_size = BasicParameter(None)
    self.nvme_size = BasicParameter(None)
    # Set USE_API to use API or USE_DMG to use dmg. If it's not set, API is
    # used.
    self.control_method = BasicParameter(self.USE_API, self.USE_API)
    uname = getpass.getuser()
    gname = grp.getgrnam(uname)[0]
    self.username = BasicParameter(uname, uname)
    self.groupname = BasicParameter(gname, gname)
    # Runtime state populated by create()/connect().
    self.pool = None
    self.uuid = None
    self.info = None
    self.svc_ranks = None
    self.connected = False
    self.dmg = None
    # Required to use dmg. It defined the directory where dmg is installed.
    # Use self.basepath + '/install/bin' in the test
    self.dmg_bin_path = dmg_bin_path
    if dmg_bin_path is not None:
        # We make dmg as the member of this class because the test would
        # have more flexibility over the usage of the command.
        self.dmg = DmgCommand(self.dmg_bin_path)
        self.dmg.insecure.value = True
        self.dmg.request.value = "pool"
def test_dmg_nvme_scan_basic(self):
    """
    JIRA ID: DAOS-2485
    Test Description: Test basic dmg functionality to scan the nvme storage.
    on the system.
    :avocado: tags=all,tiny,pr,dmg,nvme_scan,basic
    """
    # Build the dmg command and load its test parameters.
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.get_params(self)

    # Target every server host on the configured control port.
    port = self.params.get("port", "/run/server_config/*")
    dmg.hostlist = [
        "{}:{}".format(host, port) for host in self.hostlist_servers]

    # Any command error fails the test.
    try:
        dmg.run()
    except process.CmdError as details:
        self.fail("dmg command failed: {}".format(details))
def verify_pool_acl_prim_sec_groups(self, pool_acl_list, acl_file, uuid, svc):
    '''
    Description:
        Verify daos pool acl access with primary and secondary
        groups access permission.
    Args:
        pool_acl_list: pool acl entry list.
        acl_file: acl file to be used.
        uuid: daos pool uuid.
        svc: daos pool svc.
    Return:
        None.
    '''
    # Read the parameters from the primary/secondary group test namespace.
    # (The original code first fetched the same three values from
    # "/run/pool_acl/*" and immediately overwrote them; that dead code has
    # been removed.)
    primary_grp_perm = self.params.get(
        "pg_permission",
        "/run/pool_acl/primary_secondary_group_test/*")[0]
    sec_group = self.params.get(
        "secondary_group_name",
        "/run/pool_acl/primary_secondary_group_test/*")
    sec_group_perm = self.params.get(
        "sg_permission", "/run/pool_acl/primary_secondary_group_test/*")
    sec_group_rw = self.params.get(
        "sg_read_write", "/run/pool_acl/primary_secondary_group_test/*")

    # The current user's primary group name (previously computed twice as
    # both "current_group" and "l_group").
    current_group = grp.getgrgid(os.getegid())[0]

    # Create the secondary groups and add the current user to them.
    for group in sec_group:
        add_del_user(self.hostlist_clients, "groupadd", group)
    cmd = "usermod -G " + ",".join(sec_group)
    self.log.info(" (8-1)verify_pool_acl_prim_sec_groups, cmd= %s", cmd)
    add_del_user(self.hostlist_clients, cmd, current_group)

    self.log.info(
        " (8-2)Before update sec_group permission, pool_acl_list= %s",
        pool_acl_list)

    # Append an acl entry for each secondary group.
    for group, permission in zip(sec_group, sec_group_perm):
        if permission == "none":
            permission = ""
        n_acl = acl_entry("group", group, permission)
        pool_acl_list.append(n_acl)

    self.log.info(
        " (8-3)After update sec_group permission, pool_acl_list= %s",
        pool_acl_list)
    self.log.info(" pool acl_file= %s", acl_file)
    create_acl_file(acl_file, pool_acl_list)

    # Modify primary-group permission for secondary-group test.
    grp_entry = acl_entry("group", current_group, primary_grp_perm)
    new_grp_entry = acl_entry("group", current_group, "")
    self.modify_acl_file_entry(acl_file, grp_entry, new_grp_entry)

    # dmg pool overwrite-acl --pool <uuid> --acl-file <file>
    dmg = DmgCommand(os.path.join(self.prefix, "bin"))
    dmg.request.value = "pool"
    dmg.action.value = "overwrite-acl --pool={} --acl-file={}".format(
        uuid, acl_file)
    port = self.params.get("port", "/run/server_config/*", 10001)
    servers_with_ports = [
        "{}:{}".format(host, port) for host in self.hostlist_servers]
    dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
    self.log.info(" (8-4)dmg= %s", dmg)
    result = dmg.run()
    self.log.info(" (8-5)dmg.run() result=\n %s", result)

    # Verify pool read operation
    # daos pool query --pool <uuid>
    self.log.info(" (8-6)Verify pool read by: daos pool query --pool")
    exp_read = sec_group_rw[0]
    self.verify_pool_readwrite(svc, uuid, "read", expect=exp_read)

    # Verify pool write operation
    # daos continer create --pool <uuid>
    self.log.info(" (8-7)Verify pool write by: daos continer create pool")
    exp_write = sec_group_rw[1]
    self.verify_pool_readwrite(svc, uuid, "write", expect=exp_write)

    # Remove the secondary groups created for this test.
    for group in sec_group:
        add_del_user(self.hostlist_clients, "groupdel", group)
def test_create(self):
    """Test dmg pool create and destroy with various parameters.

    Create a pool and verify that the pool was created by comparing the
    UUID returned from the dmg command against the directory name in
    /mnt/daos

    Destroy the pool and verify that the directory is deleted.

    :avocado: tags=all,pool,full_regression,small,multitarget
    """
    # Create a dmg command object
    dmg = DmgCommand(self.bin)
    dmg.get_params(self)
    dmg.hostlist.update(
        self.server_managers[0].runner.job.yaml_params.access_points.value,
        "dmg.hostlist")

    # Disable raising an exception if the dmg command fails
    dmg.exit_status_exception = False

    # Accumulate a list of pass/fail indicators representing what is
    # expected for each parameter then "and" them to determine the
    # expected result of the test
    expected_for_param = []
    userlist = self.params.get("user", '/run/tests/users/*')
    user = os.getlogin() if userlist[0] == 'valid' else userlist[0]
    expected_for_param.append(userlist[1])

    grouplist = self.params.get("group", '/run/tests/groups/*')
    group = os.getlogin() if grouplist[0] == 'valid' else grouplist[0]
    expected_for_param.append(grouplist[1])

    systemnamelist = self.params.get(
        "systemname", '/run/tests/systemnames/*')
    system_name = systemnamelist[0]
    expected_for_param.append(systemnamelist[1])

    tgtlistlist = self.params.get("tgt", '/run/tests/tgtlist/*')
    tgtlist = tgtlistlist[0]
    expected_for_param.append(tgtlistlist[1])

    # if any parameter is FAIL then the test should FAIL
    expected_result = RESULT_PASS
    if RESULT_FAIL in expected_for_param:
        expected_result = RESULT_FAIL

    host1 = self.hostlist_servers[0]
    host2 = self.hostlist_servers[1]
    test_destroy = True
    create_result = dmg.pool_create(
        "1GB", user, group, None, tgtlist, None, system_name)

    if create_result.exit_status == 0:
        if expected_result == RESULT_FAIL:
            self.fail(
                "Test was expected to fail but it passed at pool create.")
        uuid, _ = get_pool_uuid_service_replicas_from_stdout(
            create_result.stdout)
        if '0' in tgtlist:
            # check_for_pool checks if the uuid directory exists in host1
            exists = check_for_pool.check_for_pool(host1, uuid)
            if exists != 0:
                self.fail("Pool {0} not found on host {1}.\n".format(
                    uuid, host1))
        if '1' in tgtlist:
            exists = check_for_pool.check_for_pool(host2, uuid)
            if exists != 0:
                self.fail("Pool {0} not found on host {1}.\n".format(
                    uuid, host2))
    else:
        test_destroy = False
        if expected_result == RESULT_PASS:
            self.fail("Test was expected to pass but it failed at pool " +
                      "create.")

    if test_destroy:
        destroy_result = dmg.pool_destroy(uuid)
        if destroy_result.exit_status == 0:
            if expected_result == RESULT_FAIL:
                # This is the destroy step; the message must not claim the
                # failure occurred at pool create.
                self.fail("Test was expected to fail but it passed at " +
                          "pool destroy.")
            if '0' in tgtlist:
                exists = check_for_pool.check_for_pool(host1, uuid)
                if exists == 0:
                    self.fail(
                        "Pool {0} found on host {1} after destroy.\n".
                        format(uuid, host1))
            if '1' in tgtlist:
                exists = check_for_pool.check_for_pool(host2, uuid)
                if exists == 0:
                    self.fail(
                        "Pool {0} found on host {1} after destroy.\n".
                        format(uuid, host2))
        else:
            if expected_result == RESULT_PASS:
                self.fail("Test was expected to pass but it failed at " +
                          "pool destroy.")
def test_monitor_for_large_pools(self): """Jira ID: DAOS-4722. Test Description: Test Health monitor for large number of pools. Use Case: This tests will create the 40 number of pools and verify the dmg list-pools, device-health and nvme-health works for all pools. :avocado: tags=all,full_regression :avocado: tags=hw,medium :avocado: tags=nvme :avocado: tags=nvme_health """ # pylint: disable=attribute-defined-outside-init # pylint: disable=too-many-branches no_of_pools = self.params.get("number_of_pools", '/run/pool/*') pool_capacity = self.params.get("pool_used_percentage", '/run/pool/*') pool_capacity = pool_capacity / 100 storage = self.get_max_storage_sizes() #Create the pool from available of storage space single_pool_nvme_size = int((storage[1] * pool_capacity) / no_of_pools) single_pool_scm_size = int((storage[0] * pool_capacity) / no_of_pools) self.pool = [] # Create the Large number of pools for _pool in range(no_of_pools): self.log.info("-- Creating pool number = %s", _pool) self.pool.append(self.get_pool(create=False)) self.pool[-1].scm_size.update(single_pool_scm_size, "scm_size") self.pool[-1].nvme_size.update(single_pool_nvme_size, "nvme_size") self.pool[-1].create() # initialize the dmg command self.dmg = DmgCommand(os.path.join(self.prefix, "bin")) self.dmg.get_params(self) self.dmg.insecure.update( self.server_managers[0].get_config_value("allow_insecure"), "dmg.insecure") # List all pools self.dmg.set_sub_command("storage") self.dmg.sub_command_class.set_sub_command("query") self.dmg.sub_command_class.sub_command_class.set_sub_command( "list-pools") for host in self.hostlist_servers: self.dmg.hostlist = host try: result = self.dmg.run() except CommandFailure as error: self.fail("dmg command failed: {}".format(error)) #Verify all pools UUID listed as part of query for pool in self.pool: if pool.uuid.lower() not in result.stdout_text: self.fail('Pool uuid {} not found in smd query'.format( pool.uuid.lower())) # Get the device ID from all the 
servers. device_ids = get_device_ids(self.dmg, self.hostlist_servers) # Get the device health for host in device_ids: self.dmg.hostlist = host for _dev in device_ids[host]: try: result = self.dmg.storage_query_device_health(_dev) except CommandFailure as error: self.fail("dmg get device states failed {}".format(error)) if 'State:NORMAL' not in result.stdout_text: self.fail("device {} on host {} is not NORMAL".format( _dev, host)) # Get the nvme-health try: self.dmg.storage_scan_nvme_health() except CommandFailure as error: self.fail("dmg storage scan --nvme-health failed {}".format(error))
def _verify_net_class_engines(self, dmg, net_class, max_count):
    """Verify dmg config generate engine counts for one network class.

    Runs "dmg config generate --net-class=<net_class>" with every engine
    count from 1 to max_count (each expected to pass, with every generated
    engine using a known fabric_iface/provider pair) and then with
    max_count + 1 (expected to fail).

    Args:
        dmg (DmgCommand): dmg command with exit_status_exception disabled.
        net_class (str): "infiniband" or "ethernet".
        max_count (int): number of interfaces available for this class.

    Returns:
        list: error messages found during verification.
    """
    errors = []

    # Engine counts within the threshold should pass.
    for num_engines in range(1, max_count + 1):
        result = dmg.config_generate(access_points="wolf-a",
                                     num_engines=num_engines,
                                     net_class=net_class)
        if result.exit_status != 0:
            errors.append(
                "config generate failed with --net-class={} "
                "--num-engines = {}!".format(net_class, num_engines))
            continue

        generated_config = yaml.safe_load(result.stdout)
        for engine in generated_config["engines"]:
            fabric_iface = engine["fabric_iface"]
            provider = engine["provider"]
            # Verify fabric_iface field, e.g., ib0/eth0 by checking the
            # dictionary keys.
            if not self.interface_to_providers[fabric_iface]:
                errors.append(
                    "Unexpected fabric_iface! {}".format(fabric_iface))
            elif provider not in self.interface_to_providers[fabric_iface]:
                # Now check the provider field, e.g., ofi+sockets by
                # checking the corresponding list in the dictionary.
                errors.append(
                    "Unexpected provider in fabric_iface! provider = {}; "
                    "fabric_iface = {}".format(provider, fabric_iface))

    # One more engine than interfaces of this class should fail.
    result = dmg.config_generate(access_points="wolf-a",
                                 num_engines=max_count + 1,
                                 net_class=net_class)
    if result.exit_status == 0:
        errors.append(
            "config generate succeeded with --net-class={} "
            "num_engines = {}!".format(net_class, max_count + 1))

    return errors

def test_net_class(self):
    """Test --net-class.

    1. Iterate the interface set and count the number of elements that
    starts with "ib". This would be our ib_count threshold that we can set
    --num-engines with --net-class=infiniband.
    2. Call dmg config generate --net-class=infiniband
    --num-engines=<1 to ib_count> and verify that it works.
    3. In addition, verify provider using the dictionary. i.e., iterate
    "engines" fields and verify "provider" is in the list where key is
    "fabric_iface".
    4. Similarly find eth_count and call dmg config generate
    --net-class=ethernet --num-engines=<1 to eth_count> and verify that it
    works.
    5. As in ib, also verify provider using the dictionary. i.e., iterate
    "engines" fields and verify "provider" is in the list where key is
    "fabric_iface".

    :avocado: tags=all,full_regression
    :avocado: tags=hw,small
    :avocado: tags=control,config_generate_entries,net_class
    """
    # Get necessary storage and network info.
    self.prepare_expected_data()

    dmg = DmgCommand(self.bin)
    dmg.exit_status_exception = False
    errors = []

    # Get ib_count threshold and verify the infiniband class. The ib and
    # eth verification loops were duplicated; they now share the
    # _verify_net_class_engines helper.
    ib_count = len(
        [interface for interface in self.interface_set
         if interface[:2] == "ib"])
    self.log.info("ib_count = %d", ib_count)
    errors.extend(
        self._verify_net_class_engines(dmg, "infiniband", ib_count))

    # Get eth_count threshold and verify the ethernet class.
    eth_count = len(
        [interface for interface in self.interface_set
         if interface[:3] == "eth"])
    self.log.info("eth_count = %d", eth_count)
    errors.extend(
        self._verify_net_class_engines(dmg, "ethernet", eth_count))

    self.check_errors(errors)
def test_min_ssds(self):
    """Test --min-ssds.

    1. Iterate the NVMe PCI dictionary and find the key that has the
    shortest list. This would be our min_ssd engine count threshold.
    2. Call dmg config generate --min-ssds=<1 to min_ssd>. Should pass.
    3. Call dmg config generate --min-ssds=<min_ssd + 1>. Should fail.
    4. Call dmg config generate --min-ssds=0. Iterate the engines field and
    verify that there's no bdev_list field.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,small
    :avocado: tags=control,config_generate_entries,min_ssds
    """
    # Get necessary storage and network info.
    self.prepare_expected_data()

    # The maximum usable --min-ssds value is the size of the smallest
    # per-socket NVMe address list. (Replaces a manual shortest-list
    # search that then redundantly recomputed the same length.)
    min_ssd = min(
        len(addrs) for addrs in self.nvme_socket_to_addrs.values())
    self.log.info("Maximum --min-ssds threshold = %d", min_ssd)

    dmg = DmgCommand(self.bin)
    dmg.exit_status_exception = False

    errors = []

    # Call dmg config generate --min-ssds=<1 to min_ssd>. Should pass.
    for num_ssd in range(1, min_ssd + 1):
        result = dmg.config_generate(access_points="wolf-a",
                                     min_ssds=num_ssd)
        if result.exit_status != 0:
            errors.append(
                "config generate failed with min_ssd = {}!".format(
                    num_ssd))

    # Call dmg config generate --min_ssds=<min_ssd + 1>. Should fail.
    result = dmg.config_generate(access_points="wolf-a",
                                 min_ssds=min_ssd + 1)
    if result.exit_status == 0:
        errors.append(
            "config generate succeeded with min_ssd + 1 = {}!".format(
                min_ssd + 1))

    # Call dmg config generate --min-ssds=0. Verify the command passed
    # before parsing its stdout, then check no engine has a bdev_list.
    result = dmg.config_generate(access_points="wolf-a", min_ssds=0)
    if result.exit_status != 0:
        errors.append("config generate failed with --min-ssds=0!")
    else:
        generated_yaml = yaml.safe_load(result.stdout)
        for engine in generated_yaml["engines"]:
            if "bdev_list" in engine:
                errors.append("bdev_list field exists with --min-ssds=0!")

    self.check_errors(errors)
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        # Build a dmg command aimed at the first server host, preset to
        # the "storage query" sub-command used by the helpers below.
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        # Run "dmg storage query list-devices" with JSON output and return
        # the uuid of the first device found in host_storage_map, or None
        # when no host reports a device.
        self.dmg.json.value = True
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("list-devices")
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        data = json.loads(result.stdout)
        # Any per-host error makes the whole query a failure.
        if len(data['host_errors']) > 0:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for v in data['host_storage_map'].values():
            if v['storage']['smd_info']['devices']:
                return v['storage']['smd_info']['devices'][0]['uuid']

    def get_checksum_error_value(self, device_id=None):
        # Run "dmg storage query device-health" for the given device uuid
        # and return its checksum_errors counter from the JSON output.
        if device_id is None:
            self.fail("No device id provided")
            return
        self.dmg.json.value = True
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("device-health")
        self.dmg.sub_command_class. \
            sub_command_class. \
            sub_command_class.uuid.value = device_id
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        data = json.loads(result.stdout)
        if len(data['host_errors']) > 0:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for v in data['host_storage_map'].values():
            if v['storage']['smd_info']['devices']:
                dev = v['storage']['smd_info']['devices'][0]
                return dev['health']['checksum_errors']

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
                          pool/container disconnect/reconnect.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        # Baseline checksum error count before fault injection.
        csum = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        # The counter must have grown after running the checksum subtests.
        csum_latest = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum_latest)
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
class CSumErrorLog(DaosCoreBase):
    """
    Test Class Description: This test runs
    daos_test -z (Checksum tests) and verifies
    whether Checksum Error Counters are incremented
    in the NVME device due to checksum fault injection.
    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        """Set up a dmg command configured for 'storage query' requests.

        The dmg command is pointed at the first server host only; every
        query issued by this test runs against that host.
        """
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        # All queries in this test are "dmg storage query ..." commands
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        """Get a device UUID via 'dmg storage query smd'.

        Returns:
            str: UUID taken from the first "UUID:" line of the command
                output, or None if no such line is found.
        """
        self.dmg.sub_command_class.sub_command_class.set_sub_command("smd")
        self.dmg.sub_command_class. \
            sub_command_class.sub_command_class.devices.value = True
        self.dmg.sub_command_class. \
            sub_command_class.sub_command_class.pools.value = True
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        uid = None
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^UUID:", line):
                # Expected line shape: "UUID: <uuid> ..." - take token 1.
                # NOTE(review): relies on the exact dmg text output format;
                # verify against the dmg version in use.
                temp = line.split()
                uid = temp[1]
                break
        return uid

    def get_checksum_error_value(self, device_id=None):
        """Get the checksum error count via 'dmg ... blobstore-health'.

        Args:
            device_id (str): UUID of the device to query. Failing the test
                when it is None.

        Returns:
            int: checksum error count parsed from the first "Checksum" line
                of the command output, or None if no such line is found.
        """
        if device_id is None:
            self.fail("No device id provided")
            return
        self.dmg.sub_command_class. \
            sub_command_class.set_sub_command("blobstore-health")
        self.dmg.sub_command_class. \
            sub_command_class. \
            sub_command_class.devuuid.value = "{}".format(device_id)
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        csum_count = None
        for line in result.stdout.splitlines():
            line = line.strip()
            if re.search("^Checksum", line):
                # Expected line shape: "Checksum Errs: <count>" - token 2.
                # NOTE(review): format-dependent parsing; confirm against the
                # dmg blobstore-health output.
                temp = line.split()
                csum_count = int(temp[2])
                break
        return csum_count

    def test_csum_error_logging(self):
        """
        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
                          pool/container disconnect/reconnect.
        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        # Baseline count before the fault-injection subtests run
        csum = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum)
        DaosCoreBase.run_subtest(self)
        csum_latest = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", csum_latest)
        # The injected checksum faults must have bumped the device counter
        self.assertTrue(csum_latest > csum,
                        "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "Orterun".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration file
                parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"

        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info("<SERVER> Preparing to start daos_server on %s with %s",
                      self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates to the servers and to the local dmg host
        self.manager.job.copy_certificates(get_log_file("daosCA/certs"),
                                           self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(get_log_file("daosCA/certs"),
                                   local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # If using a dmg config file, transport security was
            # already configured.
            self.dmg.insecure.update(self.get_config_value("allow_insecure"),
                                     "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update({"plm_rsh_args": "-l root"},
                                            "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to
                True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "sudo rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info("Cleaning up the following device(s): %s.",
                                  ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm
                storage. Defaults to None, which uses the configuration file
                to determine if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file
                to determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=40)
        # pcmd returns {exit_status: hosts}; anything other than a single
        # all-zero entry means at least one host failed
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format.

        Args:
            reformat (bool, optional): whether to detect reformat (True) or
                format (False) messages. Defaults to False.

        Raises:
            ServerFailed: if there was an error starting the servers.
        """
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self, host_qty=None):
        """Detect when all the daos_io_servers have started.

        Args:
            host_qty (int, optional): number of servers expected to have been
                started. Defaults to None, which uses the number of configured
                hosts.

        Raises:
            ServerFailed: if there was an error starting the servers after
                formatting.
        """
        if host_qty is None:
            # FIX: the original assigned a misspelled local ("hosts_qty")
            # here and referenced it unconditionally below, which raised a
            # NameError whenever a caller actually supplied host_qty.
            host_qty = len(self._hosts)
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", host_qty)
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.
        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add("sudo chown -R {0}:{0} {1}".format(
                user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager.

        Returns:
            bool: True if the servers started successfully.
        """
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info("<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        # Temporarily increasing timeout to avoid CI errors until DAOS-5764
        # can be further investigated.
        self.dmg.storage_format(timeout=40)

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner.

        Raises:
            ServerFailed: if any error occurred while stopping the servers.
        """
        self.log.info("<SERVER> Stopping server %s command",
                      self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append("Error stopping the {} subprocess: {}".format(
                self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed("Failed to stop servers:\n  {}".format(
                "\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a
                daos_server configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name
        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]
        except KeyError:
            # FIX: a dict lookup raises KeyError, not IndexError; the original
            # except clause could never catch the failed lookup.
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))
        return self.get_config_value(setting)

    def get_single_system_state(self):
        """Get the current homogeneous DAOS system state.

        Raises:
            ServerFailed: if a single state for all servers is not detected

        Returns:
            str: the current DAOS system state
        """
        data = self.dmg.system_query()
        if not data:
            # The regex failed to get the rank and state
            raise ServerFailed("Error obtaining {} output: {}".format(
                self.dmg, data))
        try:
            states = list(set([data[rank]["state"] for rank in data]))
        except KeyError:
            raise ServerFailed(
                "Unexpected result from {} - missing 'state' key: {}".format(
                    self.dmg, data))
        if len(states) > 1:
            # Multiple states for different ranks detected
            raise ServerFailed(
                "Multiple system states ({}) detected:\n  {}".format(
                    states, data))
        return states[0]

    def check_system_state(self, valid_states, max_checks=1):
        """Check that the DAOS system state is one of the provided states.

        Fail the test if the current state does not match one of the
        specified valid states.  Optionally the state check can loop multiple
        times, sleeping one second between checks, by increasing the number
        of maximum checks.

        Args:
            valid_states (list): expected DAOS system states as a list of
                lowercase strings
            max_checks (int, optional): number of times to check the state.
                Defaults to 1.

        Raises:
            ServerFailed: if there was an error detecting the server state or
                the detected state did not match one of the valid states

        Returns:
            str: the matching valid detected state
        """
        checks = 0
        daos_state = "????"
        while daos_state not in valid_states and checks < max_checks:
            if checks > 0:
                time.sleep(1)
            # Any ServerFailed from the query propagates directly (the
            # original wrapped this in a no-op try/except that re-raised)
            daos_state = self.get_single_system_state().lower()
            checks += 1
            self.log.info("System state check (%s): %s", checks, daos_state)
        if daos_state not in valid_states:
            raise ServerFailed(
                "Error checking DAOS state, currently neither {} after "
                "{} state check(s)!".format(valid_states, checks))
        return daos_state

    def system_start(self):
        """Start the DAOS IO servers.

        Raises:
            ServerFailed: if there was an error starting the servers
        """
        self.log.info("Starting DAOS IO servers")
        # FIX: the original passed ("stopped") - a plain string, not a tuple -
        # which made the membership check a substring match.
        self.check_system_state(("stopped",))
        self.dmg.system_start()
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error starting DAOS:\n{}".format(
                self.dmg.result))

    def system_stop(self, extra_states=None):
        """Stop the DAOS IO servers.

        Args:
            extra_states (list, optional): a list of DAOS system states in
                addition to "started" and "joined" that are verified prior to
                issuing the stop. Defaults to None.

        Raises:
            ServerFailed: if there was an error stopping the servers
        """
        valid_states = ["started", "joined"]
        if extra_states:
            valid_states.extend(extra_states)
        self.log.info("Stopping DAOS IO servers")
        self.check_system_state(valid_states)
        self.dmg.system_stop(force=True)
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error stopping DAOS:\n{}".format(
                self.dmg.result))

    def get_available_storage(self):
        """Get the available SCM and NVMe storage.

        Raises:
            ServerFailed: if there was an error stopping the servers

        Returns:
            list: a list of the maximum available SCM and NVMe sizes in bytes
        """
        def get_host_capacity(key, device_names):
            """Get the total storage capacity per host rank.

            Args:
                key (str): the capacity type, e.g. "scm" or "nvme"
                device_names (list): the device names of this capacity type

            Returns:
                dict: a dictionary of total storage capacity per host rank
            """
            host_capacity = {}
            for host in data:
                device_sizes = []
                for device in data[host][key]:
                    if device in device_names:
                        device_sizes.append(
                            human_to_bytes(
                                data[host][key][device]["capacity"]))
                host_capacity[host] = sum(device_sizes)
            return host_capacity

        # Default maximum bytes for SCM and NVMe
        storage = [0, 0]

        using_dcpm = self.manager.job.using_dcpm
        using_nvme = self.manager.job.using_nvme

        if using_dcpm or using_nvme:
            # Stop the DAOS IO servers in order to be able to scan the storage
            self.system_stop()

            # Scan all of the hosts for their SCM and NVMe storage
            self.dmg.hostlist = self._hosts
            data = self.dmg.storage_scan(verbose=True)
            self.dmg.hostlist = self.get_config_value("access_points")
            if self.dmg.result.exit_status != 0:
                raise ServerFailed("Error obtaining DAOS storage:\n{}".format(
                    self.dmg.result))

            # Restart the DAOS IO servers
            self.system_start()

        if using_dcpm:
            # Find the sizes of the configured SCM storage
            scm_devices = [
                os.path.basename(path)
                for path in self.get_config_value("scm_list") if path]
            capacity = get_host_capacity("scm", scm_devices)
            for host in sorted(capacity):
                self.log.info("SCM capacity for %s: %s", host, capacity[host])
            # Use the minimum SCM storage across all servers
            storage[0] = capacity[min(capacity, key=capacity.get)]
        else:
            # Use the assigned scm_size
            scm_size = self.get_config_value("scm_size")
            storage[0] = human_to_bytes("{}GB".format(scm_size))

        if using_nvme:
            # Find the sizes of the configured NVMe storage
            capacity = get_host_capacity(
                "nvme", self.get_config_value("bdev_list"))
            for host in sorted(capacity):
                self.log.info("NVMe capacity for %s: %s", host, capacity[host])
            # Use the minimum NVMe storage across all servers
            storage[1] = capacity[min(capacity, key=capacity.get)]

        self.log.info(
            "Total available storage:\n  SCM:  %s (%s)\n  NVMe: %s (%s)",
            str(storage[0]), bytes_to_human(storage[0], binary=False),
            str(storage[1]), bytes_to_human(storage[1], binary=False))
        return storage
    def pool_acl_verification(self, current_user_acl, read, write):
        '''
        Description:
            Daos pool security verification with acl file.
            Steps:
                (1)Setup dmg tool for creating a pool
                (2)Generate acl file with permissions
                (3)Create a pool with acl
                (4)Verify the pool create status
                (5)Get the pool's acl list
                (6)Verify pool read operation
                (7)Verify pool write operation
                (8)Cleanup user and destroy pool
        Args:
            current_user_acl: acl with read write access credential.
            read: expecting read permission.
            write: expecting write permission.
        Return:
            pass to continue.
            fail to report the testlog and stop.
        '''
        # (1)Create daos_shell command
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.get_params(self)
        port = self.params.get("port", "/run/server_config/*", 10001)
        get_acl_file = self.params.get(
            "acl_file", "/run/pool_acl/*", "acl_test.txt")
        acl_file = os.path.join(self.tmp, get_acl_file)
        num_user = self.params.get("num_user", "/run/pool_acl/*")
        num_group = self.params.get("num_group", "/run/pool_acl/*")
        # Build "host:port" entries for every configured server host
        servers_with_ports = [
            "{}:{}".format(host, port) for host in self.hostlist_servers]
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        self.log.info("  (1)dmg= %s", dmg)
        # (2)Generate acl file with permissions
        self.log.info("  (2)Generate acl file with user/group permissions")
        permission_list = self.create_pool_acl(num_user,
                                               num_group,
                                               current_user_acl,
                                               acl_file)
        # (3)Create a pool with acl
        self.log.info("  (3)Create a pool with acl")
        dmg.action_command.acl_file.value = acl_file
        # Disable the exception on non-zero exit so the result can be
        # inspected below instead of raising immediately
        dmg.exit_status_exception = False
        result = dmg.run()
        # (4)Verify the pool create status
        self.log.info("  (4)dmg.run() result=\n%s", result)
        # NOTE(review): success is detected by an empty stderr rather than
        # result.exit_status - confirm stderr is reliably empty on success
        # for this dmg version.
        if result.stderr == "":
            uuid, svc = dmg_utils.get_pool_uuid_service_replicas_from_stdout(
                result.stdout)
        else:
            self.fail("##(4)Unable to parse pool uuid and svc.")
        # (5)Get the pool's acl list
        #    dmg pool get-acl --pool <UUID>
        self.log.info("  (5)Get a pool's acl list by: "
                      "dmg pool get-acl --pool --hostlist")
        pool_acl_list = self.get_pool_acl_list(uuid)
        self.log.info("   pool original permission_list: %s", permission_list)
        self.log.info("   pool get_acl  permission_list: %s", pool_acl_list)
        # (6)Verify pool read operation
        #    daos pool query --pool <uuid>
        self.log.info("  (6)Verify pool read by: daos pool query --pool")
        self.verify_pool_readwrite(svc, uuid, "read", expect=read)
        # (7)Verify pool write operation
        #    daos continer create --pool <uuid>
        self.log.info("  (7)Verify pool write by: daos continer create --pool")
        self.verify_pool_readwrite(svc, uuid, "write", expect=write)
        # (8)Cleanup user and destroy pool
        self.log.info("  (8)Cleanup user and destroy pool")
        self.cleanup_user_group(num_user, num_group)
        # A fresh DmgCommand is used for destroy so the create-specific
        # parameters set above do not leak into the destroy invocation
        dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        dmg.request.value = "pool"
        dmg.action.value = "destroy --pool={}".format(uuid)
        dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist")
        result = dmg.run()
        return
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "Orterun".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration file
                parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"

        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates to the servers and to the local dmg host
        self.manager.job.copy_certificates(
            get_log_file("daosCA/certs"), self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(
            get_log_file("daosCA/certs"), local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # If using a dmg config file, transport security was
            # already configured.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update(
                        {"plm_rsh_args": "-l root"}, "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to
                True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info(
                        "Cleaning up the following device(s): %s.",
                        ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm
                storage. Defaults to None, which uses the configuration file
                to determine if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file
                to determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        # pcmd returns {exit_status: hosts}; anything other than a single
        # all-zero entry means at least one host failed
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format.

        Args:
            reformat (bool, optional): whether to detect reformat (True) or
                format (False) messages. Defaults to False.

        Raises:
            ServerFailed: if there was an error starting the servers.
        """
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self):
        """Detect when all the daos_io_servers have started.

        Raises:
            ServerFailed: if there was an error starting the servers after
                formatting.
        """
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", len(self._hosts))
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.
        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add(
                "sudo chown -R {0}:{0} {1}".format(user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager.

        Returns:
            bool: True if the servers started successfully.
        """
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info(
            "<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        self.dmg.storage_format()

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner.

        Raises:
            ServerFailed: if any error occurred while stopping the servers.
        """
        self.log.info(
            "<SERVER> Stopping server %s command", self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append(
                "Error stopping the {} subprocess: {}".format(
                    self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed(
                "Failed to stop servers:\n  {}".format("\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a
                daos_server configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name
        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]
        except KeyError:
            # FIX: a dict lookup raises KeyError, not IndexError; the original
            # except clause could never catch the failed lookup.
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))
        return self.get_config_value(setting)
def test_monitor_for_large_pools(self): """Jira ID: DAOS-4722. Test Description: Test Health monitor for large number of pools. Use Case: This tests will create the 40 number of pools and verify the dmg list-pools, device-health and nvme-health works for all pools. :avocado: tags=all,hw,medium,nvme,ib2,full_regression :avocado: tags=nvme_health """ # pylint: disable=attribute-defined-outside-init # pylint: disable=too-many-branches no_of_pools = self.params.get("number_of_pools", '/run/pool/*') #Stop the servers to run SPDK too to get the server capacity self.stop_servers() storage = self.get_nvme_max_capacity() self.start_servers() #Create the pool from 80% of available of storage space single_pool_nvme_size = int((storage * 0.80) / no_of_pools) self.pool = [] #Create the Large number of pools for _pool in range(no_of_pools): pool = TestPool(self.context, dmg_command=self.get_dmg_command()) pool.get_params(self) #SCM size is 10% of NVMe pool.scm_size.update('{}'.format(int(single_pool_nvme_size * 0.10))) pool.nvme_size.update('{}'.format(single_pool_nvme_size)) pool.create() self.pool.append(pool) #initialize the dmg command self.dmg = DmgCommand(os.path.join(self.prefix, "bin")) self.dmg.get_params(self) self.dmg.insecure.update( self.server_managers[0].get_config_value("allow_insecure"), "dmg.insecure") #List all pools self.dmg.set_sub_command("storage") self.dmg.sub_command_class.set_sub_command("query") self.dmg.sub_command_class.sub_command_class.\ set_sub_command("list-pools") for host in self.hostlist_servers: self.dmg.hostlist = host try: result = self.dmg.run() except CommandFailure as error: self.fail("dmg command failed: {}".format(error)) #Verify all pools UUID listed as part of query for pool in self.pool: if pool.uuid.lower() not in result.stdout: self.fail('Pool uuid {} not found in smd query'.format( pool.uuid.lower())) # Get the device ID from all the servers. 
device_ids = get_device_ids(self.dmg, self.hostlist_servers) # Get the device health for host in device_ids: self.dmg.hostlist = host for _dev in device_ids[host]: try: result = self.dmg.storage_query_device_health(_dev) except CommandFailure as error: self.fail("dmg get device states failed {}".format(error)) if 'State:NORMAL' not in result.stdout: self.fail("device {} on host {} is not NORMAL".format( _dev, host)) # Get the nvme-health try: self.dmg.storage_scan_nvme_health() except CommandFailure as error: self.fail("dmg storage scan --nvme-health failed {}".format(error))
class TestPool(TestDaosApiBase):
    """A class for functional testing of DaosPools objects."""

    # Constants to define whether to use API or dmg to create and destroy
    # pool.
    USE_API = "API"
    USE_DMG = "dmg"

    def __init__(self, context, log=None, cb_handler=None, dmg_bin_path=None):
        # pylint: disable=unused-argument
        """Initialize a TestPool object.

        Note: 'log' is now a defunct argument and will be removed in the
        future

        Args:
            context (DaosContext): [description]
            log (logging): logging object used to report the pool status
            cb_handler (CallbackHandler, optional): callback object to use
                with the API methods. Defaults to None.
            dmg_bin_path (str, optional): directory where dmg is installed.
                Required to create/destroy pools with dmg. Defaults to None.
        """
        super(TestPool, self).__init__("/run/pool/*", cb_handler)
        self.context = context
        self.uid = os.geteuid()
        self.gid = os.getegid()

        self.mode = BasicParameter(None)
        self.name = BasicParameter(None)  # server group name
        self.svcn = BasicParameter(None)
        self.target_list = BasicParameter(None)
        self.scm_size = BasicParameter(None)
        self.nvme_size = BasicParameter(None)
        # Set USE_API to use API or USE_DMG to use dmg. If it's not set, API
        # is used.
        self.control_method = BasicParameter(self.USE_API, self.USE_API)
        uname = getpass.getuser()
        gname = grp.getgrnam(uname)[0]
        self.username = BasicParameter(uname, uname)
        self.groupname = BasicParameter(gname, gname)

        self.pool = None
        self.uuid = None
        self.info = None
        self.svc_ranks = None
        self.connected = False

        self.dmg = None
        # Required to use dmg. It defined the directory where dmg is
        # installed. Use self.basepath + '/install/bin' in the test
        self.dmg_bin_path = dmg_bin_path
        if dmg_bin_path is not None:
            # We make dmg as the member of this class because the test would
            # have more flexibility over the usage of the command.
            self.dmg = DmgCommand(self.dmg_bin_path)
            self.dmg.insecure.value = True
            self.dmg.request.value = "pool"

    @fail_on(CommandFailure)
    @fail_on(DaosApiError)
    def create(self):
        """Create a pool with either API or dmg.

        To use dmg, the test needs to set control_method.value to USE_DMG
        prior to calling this method. The recommended way is to specify the
        pool block in yaml. For example,

            pool:
                control_method: dmg

        This tells this method to use dmg. The test also needs to set
        dmg_bin_path through the constructor if dmg is used. For example,

            self.pool = TestPool(
                self.context, dmg_bin_path=self.basepath + '/install/bin')

        If it wants to use --nsvc option, it needs to set the value to
        svcn.value. Otherwise, 1 is used. If it wants to use --group, it
        needs to set groupname.value. If it wants to use --user, it needs to
        set username.value. If it wants to add other options, directly set it
        to self.dmg.action_command. Refer dmg_utils.py pool_create method for
        more details.

        To test the negative case on create, the test needs to catch
        CommandFailure for dmg and DaosApiError for API. Thus, we need to
        make more than one line modification to the test only for this
        purpose. Currently, pool_svc is the only test that needs this change.
        """
        self.destroy()
        if self.target_list.value is not None:
            self.log.info(
                "Creating a pool on targets %s", self.target_list.value)
        else:
            self.log.info("Creating a pool")
        self.pool = DaosPool(self.context)
        if self.control_method.value == self.USE_API:
            kwargs = {
                "mode": self.mode.value,
                "uid": self.uid,
                "gid": self.gid,
                "scm_size": self.scm_size.value,
                "group": self.name.value}
            # Only pass the optional arguments that were actually set
            for key in ("target_list", "svcn", "nvme_size"):
                value = getattr(self, key).value
                if value is not None:
                    kwargs[key] = value
            self._call_method(self.pool.create, kwargs)
            self.svc_ranks = [
                int(self.pool.svc.rl_ranks[index])
                for index in range(self.pool.svc.rl_nr)]
        else:
            if self.dmg is None:
                raise DaosTestError(
                    "self.dmg is None. dmg_bin_path needs to be set through "
                    "the constructor of TestPool to create pool with dmg.")
            # Currently, there is one test that creates the pool over the
            # subset of the server hosts; pool/evict_test. To do so, the test
            # needs to set the rank(s) to target_list.value starting from 0.
            # e.g., if you're using 4 server hosts; wolf-1, wolf-2, wolf-3,
            # and wolf-4, and want to create a pool over the first two hosts;
            # wolf-1 and 2, then set the list [0, 1] to target_list.value.
            # We'll convert it to the comma separated string and set it to
            # dmg. For instance, [0, 1] will result in dmg pool create
            # -r 0,1. If you don't set target_list.value, -r won't be used,
            # in which case the pool is created over all the server hosts.
            if self.target_list.value is None:
                ranks_comma_separated = None
            else:
                # Idiomatic replacement for the original manual join loop;
                # produces the identical comma separated string
                ranks_comma_separated = ",".join(
                    str(rank) for rank in self.target_list.value)

            # Call the dmg pool create command
            self.dmg.action.value = "create"
            self.dmg.get_action_command()
            # uid/gid used in API correspond to --user and --group in dmg.
            # group, or self.name.value, used in API is called server group
            # and it's different from the group name passed in to --group.
            # Server group isn't used in dmg. We don't pass it into the
            # command, but we'll still use it to set self.pool.group
            self.dmg.action_command.group.value = self.groupname.value
            self.dmg.action_command.user.value = self.username.value
            self.dmg.action_command.scm_size.value = self.scm_size.value
            self.dmg.action_command.ranks.value = ranks_comma_separated
            self.dmg.action_command.nsvc.value = self.svcn.value
            create_result = self.dmg.run()
            self.log.info("Result stdout = %s", create_result.stdout)
            self.log.info(
                "Result exit status = %s", create_result.exit_status)

            # Get UUID and service replica from the output
            uuid_svc = get_pool_uuid_service_replicas_from_stdout(
                create_result.stdout)
            new_uuid = uuid_svc[0]
            service_replica = uuid_svc[1]

            # 3. Create DaosPool object. The process is similar to the one in
            # DaosPool.create, but there are some modifications
            if self.name.value is None:
                self.pool.group = None
            else:
                self.pool.group = ctypes.create_string_buffer(self.name.value)
            # Modification 1: Use the length of service_replica returned by
            # dmg to calculate rank_t. Note that we assume we always get a
            # single number. I'm not sure if we ever get multiple numbers,
            # but in that case, we need to modify this implementation to
            # create a list out of the multiple numbers possibly separated by
            # comma
            service_replicas = [int(service_replica)]
            rank_t = ctypes.c_uint * len(service_replicas)
            # Modification 2: Use the service_replicas list to generate rank.
            # In DaosPool, we first use some garbage 999999 values and let
            # DAOS set the correct values, but we can't do that here, so we
            # need to set the correct rank value by ourself
            rank = rank_t(*service_replicas)
            rl_ranks = ctypes.POINTER(ctypes.c_uint)(rank)
            # Modification 3: Similar to 1. Use the length of the
            # service_replicas list instead of self.svcn.value
            self.pool.svc = daos_cref.RankList(
                rl_ranks, len(service_replicas))

            # 4. Set UUID and attached to the DaosPool object
            self.pool.set_uuid_str(new_uuid)
            self.pool.attached = 1

        self.uuid = self.pool.get_uuid_str()

    @fail_on(DaosApiError)
    def connect(self, permission=1):
        """Connect to the pool.

        Args:
            permission (int, optional): connect permission. Defaults to 1.

        Returns:
            bool: True if the pool has been connected; False if the pool was
                already connected or the pool is not defined.

        """
        if self.pool and not self.connected:
            kwargs = {"flags": 1 << permission}
            self.log.info(
                "Connecting to pool %s with permission %s (flag: %s)",
                self.uuid, permission, kwargs["flags"])
            self._call_method(self.pool.connect, kwargs)
            self.connected = True
            return True
        return False

    @fail_on(DaosApiError)
    def disconnect(self):
        """Disconnect from connected pool.

        Returns:
            bool: True if the pool has been disconnected; False if the pool
                was already disconnected or the pool is not defined.

        """
        if self.pool and self.connected:
            # Fixed typo in the log message ("Disonnecting")
            self.log.info("Disconnecting from pool %s", self.uuid)
            self._call_method(self.pool.disconnect, {})
            self.connected = False
            return True
        return False

    @fail_on(CommandFailure)
    @fail_on(DaosApiError)
    def destroy(self, force=1):
        """Destroy the pool with either API or dmg.

        It uses control_method member previously set, so if you want to use
        the other way for some reason, update it before calling this method.

        Args:
            force (int, optional): force flag. Defaults to 1.

        Returns:
            bool: True if the pool has been destroyed; False if the pool is
                not defined.

        """
        if self.pool:
            self.disconnect()
            self.log.info("Destroying pool %s", self.uuid)

            if self.control_method.value == self.USE_API:
                if self.pool.attached:
                    self._call_method(self.pool.destroy, {"force": force})
            elif self.control_method.value == self.USE_DMG:
                if self.pool.attached:
                    self.dmg.action.value = "destroy"
                    self.dmg.get_action_command()
                    self.dmg.action_command.pool.value = self.uuid
                    self.dmg.action_command.force.value = force
                    self.dmg.run()
            else:
                self.log.error("Cannot destroy pool! Use USE_API or USE_DMG")
                return False

            self.pool = None
            self.uuid = None
            self.info = None
            self.svc_ranks = None
            return True
        return False

    @fail_on(DaosApiError)
    def get_info(self):
        """Query the pool for information.

        Sets the self.info attribute.
        """
        if self.pool:
            self.connect()
            self._call_method(self.pool.pool_query, {})
            self.info = self.pool.pool_info

    def check_pool_info(self, pi_uuid=None, pi_ntargets=None, pi_nnodes=None,
                        pi_ndisabled=None, pi_map_ver=None, pi_leader=None,
                        pi_bits=None):
        # pylint: disable=unused-argument
        """Check the pool info attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            pi_uuid (str, optional): pool uuid. Defaults to None.
            pi_ntargets (int, optional): number of targets. Defaults to None.
            pi_nnodes (int, optional): number of nodes. Defaults to None.
            pi_ndisabled (int, optional): number of disabled. Defaults to
                None.
            pi_map_ver (int, optional): pool map version. Defaults to None.
            pi_leader (int, optional): pool leader. Defaults to None.
            pi_bits (int, optional): pool bits. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all
                the specified values match; False otherwise

        """
        self.get_info()
        # NOTE: locals() must be read before any new local names are bound;
        # only the keyword arguments (plus 'self', filtered out) exist here
        checks = [
            (key,
             c_uuid_to_str(getattr(self.info, key))
             if key == "pi_uuid" else getattr(self.info, key),
             val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def check_pool_space(self, ps_free_min=None, ps_free_max=None,
                         ps_free_mean=None, ps_ntargets=None,
                         ps_padding=None):
        # pylint: disable=unused-argument
        """Check the pool info space attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            ps_free_min (list, optional): minimum free space per device.
                Defaults to None.
            ps_free_max (list, optional): maximum free space per device.
                Defaults to None.
            ps_free_mean (list, optional): mean free space per device.
                Defaults to None.
            ps_ntargets (int, optional): number of targets. Defaults to None.
            ps_padding (int, optional): space padding. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all
                the specified values match; False otherwise

        """
        self.get_info()
        checks = []
        for key in ("ps_free_min", "ps_free_max", "ps_free_mean"):
            val = locals()[key]
            if isinstance(val, list):
                # Fix: the original iterated 'for index, item in val', which
                # fails for a list of plain numbers; enumerate() provides the
                # intended (index, item) pairs (matching
                # check_pool_daos_space)
                for index, item in enumerate(val):
                    checks.append((
                        "{}[{}]".format(key, index),
                        getattr(self.info.pi_space, key)[index],
                        item))
        for key in ("ps_ntargets", "ps_padding"):
            val = locals()[key]
            if val is not None:
                # Fix: list.append takes a single argument; the original
                # passed three, which raises TypeError. Append the tuple.
                checks.append((key, getattr(self.info.pi_space, key), val))
        return self._check_info(checks)

    def check_pool_daos_space(self, s_total=None, s_free=None):
        # pylint: disable=unused-argument
        """Check the pool info daos space attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            s_total (list, optional): total space per device. Defaults to
                None.
            s_free (list, optional): free space per device. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all
                the specified values match; False otherwise

        """
        self.get_info()
        checks = [
            ("{}_{}".format(key, index),
             getattr(self.info.pi_space.ps_space, key)[index],
             item)
            for key, val in locals().items()
            if key != "self" and val is not None
            for index, item in enumerate(val)]
        return self._check_info(checks)

    def check_rebuild_status(self, rs_version=None, rs_seconds=None,
                             rs_errno=None, rs_done=None, rs_padding32=None,
                             rs_fail_rank=None, rs_toberb_obj_nr=None,
                             rs_obj_nr=None, rs_rec_nr=None, rs_size=None):
        # pylint: disable=unused-argument
        # pylint: disable=too-many-arguments
        """Check the pool info rebuild attributes.

        Note:
            Arguments may also be provided as a string with a number preceded
            by '<', '<=', '>', or '>=' for other comparisons besides the
            default '=='.

        Args:
            rs_version (int, optional): rebuild version. Defaults to None.
            rs_seconds (int, optional): rebuild seconds. Defaults to None.
            rs_errno (int, optional): rebuild error number. Defaults to None.
            rs_done (int, optional): rebuild done flag. Defaults to None.
            rs_padding32 (int, optional): padding. Defaults to None.
            rs_fail_rank (int, optional): rebuild fail target. Defaults to
                None.
            rs_toberb_obj_nr (int, optional): number of objects to be
                rebuilt. Defaults to None.
            rs_obj_nr (int, optional): number of rebuilt objects. Defaults to
                None.
            rs_rec_nr (int, optional): number of rebuilt records. Defaults to
                None.
            rs_size (int, optional): size of all rebuilt records.

        Returns:
            bool: True if at least one expected value is specified and all
                the specified values match; False otherwise

        """
        self.get_info()
        checks = [
            (key, getattr(self.info.pi_rebuild_st, key), val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def rebuild_complete(self):
        """Determine if the pool rebuild is complete.

        Returns:
            bool: True if pool rebuild is complete; False otherwise

        """
        self.display_pool_rebuild_status()
        return self.info.pi_rebuild_st.rs_done == 1

    def wait_for_rebuild(self, to_start, interval=1):
        """Wait for the rebuild to start or end.

        Args:
            to_start (bool): whether to wait for rebuild to start or end
            interval (int): number of seconds to wait in between rebuild
                completion checks
        """
        self.log.info(
            "Waiting for rebuild to %s ...",
            "start" if to_start else "complete")
        while self.rebuild_complete() == to_start:
            self.log.info(
                "  Rebuild %s ...",
                "has not yet started" if to_start else "in progress")
            sleep(interval)
        self.log.info(
            "Rebuild %s detected", "start" if to_start else "completion")

    @fail_on(DaosApiError)
    def start_rebuild(self, ranks, daos_log):
        """Kill the specific server ranks using this pool.

        Args:
            ranks (list): a list of daos server ranks (int) to kill
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if the server ranks have been killed and the ranks
                have been excluded from the pool; False if the pool is
                undefined

        """
        msg = "Killing DAOS ranks {} from server group {}".format(
            ranks, self.name.value)
        self.log.info(msg)
        daos_log.info(msg)
        for rank in ranks:
            server = DaosServer(self.context, self.name.value, rank)
            self._call_method(server.kill, {"force": 1})
        return self.exclude(ranks, daos_log)

    @fail_on(DaosApiError)
    def exclude(self, ranks, daos_log):
        """Manually exclude a rank from this pool.

        Args:
            ranks (list): a list daos server ranks (int) to exclude
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if the ranks were excluded from the pool; False if the
                pool is undefined

        """
        if self.pool:
            msg = "Excluding server ranks {} from pool {}".format(
                ranks, self.uuid)
            self.log.info(msg)
            daos_log.info(msg)
            self._call_method(self.pool.exclude, {"rank_list": ranks})
            return True
        return False

    def check_files(self, hosts):
        """Check if pool files exist on the specified list of hosts.

        Args:
            hosts (list): list of hosts

        Returns:
            bool: True if the files for this pool exist on each host; False
                otherwise

        """
        return check_pool_files(self.log, hosts, self.uuid.lower())

    def write_file(self, orterun, processes, hostfile, size, timeout=60):
        """Write a file to the pool.

        Args:
            orterun (str): full path to the orterun command
            processes (int): number of processes to launch
            hostfile (str): hostfile defining the clients from which to write
                the file
            size (int): size of the file to create in bytes
            timeout (int, optional): number of seconds before timing out the
                command. Defaults to 60 seconds.

        Returns:
            process.CmdResult: command execution result

        """
        self.log.info("Writing %s bytes to pool %s", size, self.uuid)
        env = {
            "DAOS_POOL": self.uuid,
            "DAOS_SVCL": "1",
            "DAOS_SINGLETON_CLI": "1",
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
        }
        load_mpi("openmpi")
        current_path = os.path.dirname(os.path.abspath(__file__))
        command = "{} --np {} --hostfile {} {} {} testfile".format(
            orterun, processes, hostfile,
            os.path.join(current_path, "write_some_data.py"), size)
        return process.run(command, timeout, True, False, "both", True, env)

    def get_pool_daos_space(self):
        """Get the pool info daos space attributes as a dictionary.

        Returns:
            dict: a dictionary of lists of the daos space attributes

        """
        self.get_info()
        keys = ("s_total", "s_free")
        return {key: getattr(self.info.pi_space.ps_space, key)
                for key in keys}

    def display_pool_daos_space(self, msg=None):
        """Display the pool info daos space attributes.

        Args:
            msg (str, optional): optional text to include in the output.
                Defaults to None.
        """
        daos_space = self.get_pool_daos_space()
        sizes = [
            "{}[{}]={}".format(key, index, item)
            for key in sorted(daos_space.keys())
            for index, item in enumerate(daos_space[key])]
        self.log.info(
            "Pool %s space%s:\n  %s", self.uuid,
            " " + msg if isinstance(msg, str) else "", "\n  ".join(sizes))

    def get_pool_rebuild_status(self):
        """Get the pool info rebuild status attributes as a dictionary.

        Returns:
            dict: a dictionary of lists of the rebuild status attributes

        """
        self.get_info()
        # NOTE(review): "rs_pad_32" differs from the "rs_padding32" name used
        # by check_rebuild_status - confirm which matches the rebuild status
        # structure field
        keys = (
            "rs_version", "rs_pad_32", "rs_errno", "rs_done",
            "rs_toberb_obj_nr", "rs_obj_nr", "rs_rec_nr")
        return {key: getattr(self.info.pi_rebuild_st, key) for key in keys}

    def display_pool_rebuild_status(self):
        """Display the pool info rebuild status attributes."""
        status = self.get_pool_rebuild_status()
        self.log.info(
            "Pool rebuild status: %s",
            ", ".join(
                ["{}={}".format(key, status[key]) for key in sorted(status)]))

    def read_data_during_rebuild(self, container):
        """Read data from the container while rebuild is active.

        Args:
            container (TestContainer): container from which to read data

        Returns:
            bool: True if all the data is read successfully before rebuild
                completes; False otherwise

        """
        container.open()
        self.log.info(
            "Reading objects in container %s during rebuild", self.uuid)

        # Attempt to read all of the data from the container during rebuild
        index = 0
        status = read_incomplete = index < len(container.written_data)
        while not self.rebuild_complete() and read_incomplete:
            try:
                status &= container.written_data[index].read_object(container)
            except DaosTestError as error:
                self.log.error(str(error))
                status = False
            index += 1
            read_incomplete = index < len(container.written_data)

        # Verify that all of the container data was read successfully
        if read_incomplete:
            self.log.error(
                "Rebuild completed before all the written data could be read")
            status = False
        elif not status:
            self.log.error("Errors detected reading data during rebuild")
        return status