def get_single_system_state(self):
    """Get the current homogeneous DAOS system state.

    Raises:
        ServerFailed: if a single state for all servers is not detected

    Returns:
        str: the current DAOS system state

    """
    data = self.get_current_state()
    if not data:
        # The regex failed to get the rank and state
        raise ServerFailed(
            "Error obtaining {} output: {}".format(self.dmg, data))
    try:
        states = list(set([data[rank]["state"] for rank in data]))
    except KeyError as error:
        raise ServerFailed(
            "Unexpected result from {} - missing 'state' key: {}".format(
                self.dmg, data)) from error
    if len(states) > 1:
        # Multiple states for different ranks detected
        raise ServerFailed(
            "Multiple system states ({}) detected:\n  {}".format(
                states, data))
    return states[0]
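
# Hypothetical usage sketch (the `manager` instance and `run_io_workload`
# helper are assumed, not part of this module): a test could gate a workload
# on the system having settled into a single state:
#
#   state = manager.get_single_system_state()
#   if state.lower() == "joined":
#       run_io_workload()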
def stop_random_rank(self, daos_log, force=False, exclude_ranks=None):
    """Kill/Stop a random server rank that is expected to be running.

    Args:
        daos_log (DaosLog): object for logging messages
        force (bool, optional): whether to use --force option to dmg system
            stop. Defaults to False.
        exclude_ranks (list, optional): ranks to exclude from the random
            selection. Defaults to None.

    Raises:
        avocado.core.exceptions.TestFail: if there is an issue stopping the
            server ranks.
        ServerFailed: if there are no available ranks to stop.

    """
    # Exclude non-running ranks
    rank_state = self.get_expected_states()
    candidate_ranks = []
    for rank, state in rank_state.items():
        for running_state in self._states["running"]:
            if running_state in state:
                candidate_ranks.append(rank)
                break

    # Exclude specified ranks
    for rank in exclude_ranks or []:
        if rank in candidate_ranks:
            candidate_ranks.remove(rank)

    if len(candidate_ranks) < 1:
        raise ServerFailed("No available candidate ranks to stop.")

    # Stop a random rank
    random_rank = random.choice(candidate_ranks)  # nosec
    return self.stop_ranks([random_rank], daos_log=daos_log, force=force)
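
# Hypothetical usage sketch: excluding rank 0 (e.g. to keep an initial
# management-service replica up) before force-stopping a random running rank;
# `manager` and `daos_log` are assumed test fixtures:
#
#   manager.stop_random_rank(daos_log, force=True, exclude_ranks=[0])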
def check_rank_state(self, rank, valid_state, max_checks=1):
    """Check the state of a single rank in the DAOS system.

    Args:
        rank (int): daos rank whose state needs to be checked
        valid_state (str): expected state for the rank
        max_checks (int, optional): number of times to check the state.
            Defaults to 1.

    Raises:
        ServerFailed: if there was an error obtaining the data for the daos
            system query

    Returns:
        bool: True if there is a match for the checked state; False
            otherwise.

    """
    checks = 0
    while checks < max_checks:
        if checks > 0:
            time.sleep(1)
        data = self.get_current_state()
        if not data:
            # The regex failed to get the rank and state
            raise ServerFailed(
                "Error obtaining {} output: {}".format(self.dmg, data))
        checks += 1
        if data[rank]["state"] == valid_state:
            return True
    return False
def get_environment_value(self, name):
    """Get the server config value associated with the env variable name.

    Args:
        name (str): environment variable name for which to get a
            daos_server configuration value

    Raises:
        ServerFailed: Unable to find a daos_server configuration value for
            the specified environment variable name

    Returns:
        str: the daos_server configuration value for the specified
            environment variable name

    """
    try:
        setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]
    except KeyError as error:
        # A dict lookup raises KeyError, not IndexError
        raise ServerFailed(
            "Unknown server config setting mapping for the {} environment "
            "variable!".format(name)) from error
    return self.get_config_value(setting)
def stop(self):
    """Stop the server through the runner."""
    self.log.info(
        "<SERVER> Stopping server %s command", self.manager.command)

    # Maintain a running list of errors detected trying to stop
    messages = []

    # Stop the subprocess running the job manager command
    try:
        super().stop()
    except CommandFailure as error:
        messages.append(
            "Error stopping the {} subprocess: {}".format(
                self.manager.command, error))

    # Kill any leftover processes that may not have been stopped correctly
    self.manager.kill()

    if self.manager.job.using_nvme:
        # Reset the storage
        try:
            self.reset_storage()
        except ServerFailed as error:
            messages.append(str(error))

        # Make sure the mount directory belongs to non-root user
        self.set_scm_mount_ownership()

    # Report any errors after all stop actions have been attempted
    if messages:
        raise ServerFailed(
            "Failed to stop servers:\n  {}".format("\n  ".join(messages)))
def detect_engine_start(self, hosts_qty=None):
    """Detect when all the engines have started.

    Args:
        hosts_qty (int, optional): number of servers expected to have been
            started. Defaults to None, which uses the number of configured
            hosts.

    Raises:
        ServerFailed: if there was an error starting the servers after
            formatting.

    """
    if hosts_qty is None:
        hosts_qty = len(self._hosts)
    if self.detect_start_via_dmg:
        self.log.info(
            "<SERVER> Waiting for the daos_engine to start via dmg system "
            "query")
        self.manager.job.update_pattern("dmg", hosts_qty)
        started = self.get_detected_engine_count(self.manager.process)
    else:
        self.log.info("<SERVER> Waiting for the daos_engine to start")
        self.manager.job.update_pattern("normal", hosts_qty)
        started = self.manager.check_subprocess_status(self.manager.process)
    if not started:
        self.manager.kill()
        raise ServerFailed("Failed to start servers after format")

    # Update the dmg command host list to work with pool create/destroy
    self._prepare_dmg_hostlist()

    # Define the expected states for each rank
    self._expected_states = self.get_current_state()
def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
    """Prepare the server storage.

    Args:
        user (str): username
        using_dcpm (bool, optional): override option to prepare scm
            storage. Defaults to None, which uses the configuration file to
            determine if scm storage should be formatted.
        using_nvme (bool, optional): override option to prepare nvme
            storage. Defaults to None, which uses the configuration file to
            determine if nvme storage should be formatted.

    Raises:
        ServerFailed: if there was an error preparing the storage

    """
    cmd = DaosServerCommand(self.manager.job.command_path)
    cmd.sudo = False
    cmd.debug.value = False
    cmd.set_sub_command("storage")
    cmd.sub_command_class.set_sub_command("prepare")
    cmd.sub_command_class.sub_command_class.target_user.value = user
    cmd.sub_command_class.sub_command_class.force.value = True

    # Use the configuration file settings if no overrides specified
    if using_dcpm is None:
        using_dcpm = self.manager.job.using_dcpm
    if using_nvme is None:
        using_nvme = self.manager.job.using_nvme

    if using_dcpm and not using_nvme:
        cmd.sub_command_class.sub_command_class.scm_only.value = True
    elif not using_dcpm and using_nvme:
        cmd.sub_command_class.sub_command_class.nvme_only.value = True

    self.log.info("Preparing DAOS server storage: %s", str(cmd))
    results = run_pcmd(
        self._hosts, str(cmd), timeout=self.storage_prepare_timeout.value)

    # gratuitously lifted from pcmd() and get_current_state()
    result = {}
    stdouts = ""
    for res in results:
        stdouts += '\n'.join(res["stdout"] + [''])
        if res["exit_status"] not in result:
            result[res["exit_status"]] = NodeSet()
        result[res["exit_status"]].add(res["hosts"])

    if len(result) > 1 or 0 not in result or \
            (using_dcpm and
             "No SCM modules detected; skipping operation" in stdouts):
        dev_type = "nvme"
        if using_dcpm and using_nvme:
            dev_type = "dcpm & nvme"
        elif using_dcpm:
            dev_type = "dcpm"
        pcmd(self._hosts, "sudo -n ipmctl show -v -dimm")
        pcmd(self._hosts, "ndctl list")
        raise ServerFailed("Error preparing {} storage".format(dev_type))
def system_start(self):
    """Start the DAOS I/O Engines.

    Raises:
        ServerFailed: if there was an error starting the servers

    """
    self.log.info("Starting DAOS I/O Engines")
    # Note the trailing comma: ("stopped") is just a string, not a tuple
    self.check_system_state(("stopped",))
    self.dmg.system_start()
    if self.dmg.result.exit_status != 0:
        raise ServerFailed(
            "Error starting DAOS:\n{}".format(self.dmg.result))
def system_stop(self, extra_states=None):
    """Stop the DAOS I/O Engines.

    Args:
        extra_states (list, optional): a list of DAOS system states in
            addition to "started" and "joined" that are verified prior to
            issuing the stop. Defaults to None.

    Raises:
        ServerFailed: if there was an error stopping the servers

    """
    valid_states = ["started", "joined"]
    if extra_states:
        valid_states.extend(extra_states)
    self.log.info("Stopping DAOS I/O Engines")
    self.check_system_state(valid_states)
    self.dmg.system_stop(force=True)
    if self.dmg.result.exit_status != 0:
        raise ServerFailed(
            "Error stopping DAOS:\n{}".format(self.dmg.result))
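
# Hypothetical usage sketch: a test that expects some engines to already be
# flagged as errored can widen the pre-stop state check via extra_states so
# the stop is still issued (`manager` is an assumed instance):
#
#   manager.system_stop(extra_states=["errored"])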
def detect_format_ready(self, reformat=False):
    """Detect when all the daos_servers are ready for storage format.

    Args:
        reformat (bool, optional): whether to detect reformat (True) or
            format (False) messages. Defaults to False.

    Raises:
        ServerFailed: if there was an error starting the servers.

    """
    f_type = "format" if not reformat else "reformat"
    self.log.info("<SERVER> Waiting for servers to be ready for %s", f_type)
    self.manager.job.update_pattern(f_type, len(self._hosts))
    try:
        self.manager.run()
    except CommandFailure as error:
        self.manager.kill()
        raise ServerFailed(
            "Failed to start servers before format: {}".format(
                error)) from error
def check_system_state(self, valid_states, max_checks=1):
    """Check that the DAOS system state is one of the provided states.

    Fail the test if the current state does not match one of the specified
    valid states. Optionally the state check can loop multiple times,
    sleeping one second between checks, by increasing the number of maximum
    checks.

    Args:
        valid_states (list): expected DAOS system states as a list of
            lowercase strings
        max_checks (int, optional): number of times to check the state.
            Defaults to 1.

    Raises:
        ServerFailed: if there was an error detecting the server state or
            the detected state did not match one of the valid states

    Returns:
        str: the matching valid detected state

    """
    checks = 0
    daos_state = "????"
    while daos_state not in valid_states and checks < max_checks:
        if checks > 0:
            time.sleep(1)
        # Any ServerFailed from the query propagates to the caller
        daos_state = self.get_single_system_state().lower()
        checks += 1
        self.log.info("System state check (%s): %s", checks, daos_state)
    if daos_state not in valid_states:
        raise ServerFailed(
            "Error checking DAOS state: {} does not match any of the valid "
            "states {} after {} state check(s)!".format(
                daos_state, valid_states, checks))
    return daos_state
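
# Hypothetical usage sketch: poll for up to ~30 seconds (30 checks, one
# second apart) for the system to settle into a stopped-like state;
# `manager` is an assumed instance:
#
#   state = manager.check_system_state(["stopped", "errored"], max_checks=30)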
def reset_storage(self):
    """Reset the server storage.

    Raises:
        ServerFailed: if there was an error resetting the storage

    """
    cmd = DaosServerCommand(self.manager.job.command_path)
    cmd.sudo = False
    cmd.debug.value = False
    cmd.set_sub_command("storage")
    cmd.sub_command_class.set_sub_command("prepare")
    cmd.sub_command_class.sub_command_class.nvme_only.value = True
    cmd.sub_command_class.sub_command_class.reset.value = True
    cmd.sub_command_class.sub_command_class.force.value = True

    # Use the VMD option when resetting storage if it was prepared with
    # VMD. Use .get() to avoid a KeyError when the variable is not set.
    if "True" in os.environ.get("DAOS_ENABLE_VMD", ""):
        cmd.sub_command_class.sub_command_class.enable_vmd.value = True

    self.log.info("Resetting DAOS server storage: %s", str(cmd))
    result = pcmd(self._hosts, str(cmd), timeout=120)
    if len(result) > 1 or 0 not in result:
        raise ServerFailed("Error resetting NVMe storage")
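
# Hypothetical usage sketch: the VMD reset path is toggled solely by the
# DAOS_ENABLE_VMD environment variable, which a launcher might export before
# the run (setting it in-process here is only for illustration):
#
#   os.environ["DAOS_ENABLE_VMD"] = "True"
#   manager.reset_storage()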
def autosize_pool_params(self, size, tier_ratio, scm_size, nvme_size,
                         min_targets=1, quantity=1):
    """Update any pool size parameter ending in a %.

    Use the current NVMe and SCM storage sizes to assign values to the
    size, scm_size, and/or nvme_size dmg pool create arguments which end in
    "%". The numerical part of these arguments will be used to assign a
    value that is X% of the available storage capacity. The updated size
    and nvme_size arguments will be assigned values that are multiples of
    1GiB times the number of targets assigned to each server engine. If
    needed the number of targets will be reduced (to not exceed
    min_targets) in order to support the requested size. An optional number
    of expected pools (quantity) can also be specified to divide the
    available storage capacity.

    Note: depending upon the inputs this method may return dmg pool create
    parameter combinations that are not supported, e.g. tier_ratio +
    nvme_size. This is intended to allow testing of these combinations.

    Args:
        size (object): the str, int, or None value for the dmg pool create
            size parameter.
        tier_ratio (object): the int or None value for the dmg pool create
            tier_ratio parameter.
        scm_size (object): the str, int, or None value for the dmg pool
            create scm_size parameter.
        nvme_size (object): the str, int, or None value for the dmg pool
            create nvme_size parameter.
        min_targets (int, optional): the minimum number of targets per
            engine that can be configured. Defaults to 1.
        quantity (int, optional): number of pools to account for in the
            size calculations. The pool size returned is only for a single
            pool. Defaults to 1.

    Raises:
        ServerFailed: if there was an error obtaining auto-sized TestPool
            parameters.
        AutosizeCancel: if a valid pool parameter size could not be
            obtained

    Returns:
        dict: the parameters for a TestPool object.

    """
    # Adjust any pool size parameter by the requested percentage
    params = {"tier_ratio": tier_ratio}
    adjusted = {"size": size, "scm_size": scm_size, "nvme_size": nvme_size}
    keys = [
        key for key in ("size", "scm_size", "nvme_size")
        if adjusted[key] is not None and str(adjusted[key]).endswith("%")]
    if keys:
        # Verify the minimum number of targets configured per engine
        targets = min(self.manager.job.get_engine_values("targets"))
        if targets < min_targets:
            raise ServerFailed(
                "Minimum target quantity ({}) exceeds current target "
                "quantity ({})".format(min_targets, targets))

        self.log.info("-" * 100)
        pool_msg = "{} pool{}".format(quantity, "s" if quantity > 1 else "")
        self.log.info(
            "Autosizing TestPool parameters ending with a \"%%\" for %s:",
            pool_msg)
        for key in ("size", "scm_size", "nvme_size"):
            self.log.info(
                "  - %-9s : %s (%s)", key, adjusted[key], key in keys)

        # Determine the largest SCM and NVMe pool sizes that can be used
        # with this server configuration with an optionally applied ratio.
        try:
            available_storage = self.get_available_storage()
        except ServerFailed as error:
            raise ServerFailed(
                "Error obtaining available storage") from error

        # Determine the SCM and NVMe size limits for the size and
        # tier_ratio arguments for the total number of engines
        if tier_ratio is None:
            # Use the default value if not provided
            tier_ratio = 6
        engine_qty = len(self.manager.job.engine_params) * len(self._hosts)
        available_storage["size"] = min(
            engine_qty * available_storage["nvme"],
            (engine_qty * available_storage["scm"]) / float(tier_ratio / 100)
        )
        available_storage["tier_ratio"] = \
            available_storage["size"] * float(tier_ratio / 100)
        self.log.info(
            "Largest storage size available for %s engines with a %.2f%% "
            "tier_ratio:", engine_qty, tier_ratio)
        self.log.info(
            "  - NVME     : %s",
            get_display_size(available_storage["size"]))
        self.log.info(
            "  - SCM      : %s",
            get_display_size(available_storage["tier_ratio"]))
        self.log.info(
            "  - COMBINED : %s",
            get_display_size(
                available_storage["size"] + available_storage["tier_ratio"]))

        # Apply any requested percentages to the pool parameters
        available = {
            "size": {"size": available_storage["size"], "type": "NVMe"},
            "scm_size": {"size": available_storage["scm"], "type": "SCM"},
            "nvme_size": {"size": available_storage["nvme"], "type": "NVMe"}
        }
        self.log.info("Adjusted pool sizes for %s:", pool_msg)
        for key in keys:
            try:
                ratio = int(str(adjusted[key]).replace("%", ""))
            except ValueError as error:
                # int() of a non-numeric string raises ValueError
                raise ServerFailed(
                    "Invalid '{}' format: {}".format(
                        key, adjusted[key])) from error
            adjusted[key] = \
                (available[key]["size"] * float(ratio / 100)) / quantity
            self.log.info(
                "  - %-9s : %-4s storage adjusted by %.2f%%: %s",
                key, available[key]["type"], ratio,
                get_display_size(adjusted[key]))

        # Display the pool size increment value for each size argument
        increment = {
            "size": human_to_bytes("1GiB"),
            "scm_size": human_to_bytes("16MiB"),
            "nvme_size": human_to_bytes("1GiB")}
        self.log.info("Increment sizes per target:")
        for key in keys:
            self.log.info(
                "  - %-9s : %s", key, get_display_size(increment[key]))

        # Adjust the size to use a SCM/NVMe target multiplier
        self.log.info("Pool sizes adjusted to fit by increment sizes:")
        adjusted_targets = targets
        for key in keys:
            multiplier = math.floor(adjusted[key] / increment[key])
            params[key] = multiplier * increment[key]
            self.log.info(
                "  - %-9s : %s * %s = %s",
                key, multiplier, increment[key],
                get_display_size(params[key]))
            if multiplier < adjusted_targets:
                adjusted_targets = multiplier
                if adjusted_targets < min_targets:
                    raise AutosizeCancel(
                        "Unable to autosize the {} pool parameter due to "
                        "exceeding the minimum of {} targets: {}".format(
                            key, min_targets, adjusted_targets))
            if key == "size":
                tier_ratio_size = params[key] * float(tier_ratio / 100)
                self.log.info(
                    "  - %-9s : %.2f%% tier_ratio = %s",
                    key, tier_ratio, get_display_size(tier_ratio_size))
                params[key] += tier_ratio_size
                self.log.info(
                    "  - %-9s : NVMe + SCM = %s",
                    key, get_display_size(params[key]))
            params[key] = bytes_to_human(params[key], binary=True)

        # Reboot the servers if a reduced number of targets is required
        if adjusted_targets < targets:
            self.log.info(
                "Updating targets per server engine: %s -> %s",
                targets, adjusted_targets)
            self.set_config_value("targets", adjusted_targets)
            self.stop()
            self.start()

        self.log.info("-" * 100)

    return params
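
# Hypothetical usage sketch: request 80% of the available NVMe-limited
# capacity split across two pools; the returned size values are
# human-readable byte strings suitable for dmg pool create (`manager` is an
# assumed instance, and the exact sizes depend on the cluster):
#
#   params = manager.autosize_pool_params(
#       size="80%", tier_ratio=None, scm_size=None, nvme_size=None,
#       quantity=2)
#   # params resembles {"tier_ratio": None, "size": "<N GiB as str>"}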