class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command."""

    def __init__(self, daosbinpath, runnerpath, timeout=300):
        """Create a ServerManager object.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)            # ServerCommand param
        self.insecure = BasicParameter(None, True)         # ServerCommand param
        self.recreate = BasicParameter(None, True)         # ServerCommand param
        self.sudo = BasicParameter(None, False)            # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)   # ServerCommand param
        self.report_uri = BasicParameter(None)             # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)                 # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list, this sizes the runner (process count
        and host file) and tells the server job which hosts it runs on.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        host_count = len(self._hosts)
        self.runner.processes.value = host_count
        self.runner.hostfile.value = write_host_file(
            self._hosts, workdir, slots)
        self.runner.job.server_cnt = host_count
        self.runner.job.server_list = self._hosts

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters to their respective ServerCommand
        and Orterun class parameters.

        Args:
            test (Test): avocado Test object
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]

        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)

        job = self.runner.job
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name in ("sudo", "srv_timeout"):
                    # These two are stored on the job as plain attributes,
                    # not as parameter objects.
                    # NOTE(review): the timeout is stored under the name
                    # "srv_timeout" - confirm the job reads that attribute.
                    setattr(job, name, value)
                else:
                    getattr(job, name).value = value
            if name in server_start_params:
                getattr(job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))
        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares any SCM/NVMe storage, launches the servers
        and, when SCM or NVMe storage is configured, formats the storage and
        waits for the servers to come up again.

        Args:
            yamlfile: server configuration handed to the job's set_config()

        Raises:
            ServerFailed: if the servers fail to start, either before or
                after the storage format

        Returns:
            bool: True if the servers started

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        yaml_params = self.runner.job.yaml_params

        # Prepare SCM storage in servers
        if yaml_params.is_scm():
            storage_prep_flag = "dcpm"
            self.log.info("Performing SCM storage prepare in <format> mode")
        else:
            storage_prep_flag = "ram"

        # Prepare nvme storage in servers
        if yaml_params.is_nvme():
            # The flag is always "dcpm" or "ram" at this point, so this yields
            # "dcpm_nvme" or "ram_nvme".
            storage_prep_flag += "_nvme"
            self.log.info("Performing NVMe storage prepare in <format> mode")

            # Make sure log file has been created for ownership change
            lfile = yaml_params.server_params[-1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

        if storage_prep_flag != "ram":
            storage_prepare(self._hosts, "root", storage_prep_flag)
            self.runner.mca.value = {"plm_rsh_args": "-l root"}

        try:
            self.run()
        except CommandFailure as details:
            self.log.info("<SERVER> Exception occurred: %s", str(details))
            # Kill the subprocess, anything that might have started
            self.kill()
            raise ServerFailed(
                "Failed to start server in {} mode.".format(
                    self.runner.job.mode))

        if self.runner.job.yaml_params.is_nvme() or \
                self.runner.job.yaml_params.is_scm():
            # Setup the hostlist to pass to dmg command
            servers_with_ports = [
                "{}:{}".format(host, self.runner.job.yaml_params.port)
                for host in self._hosts
            ]

            # Format storage and wait for server to change ownership
            self.log.info("Formatting hosts: <%s>", self._hosts)
            storage_format(self.daosbinpath, ",".join(servers_with_ports))
            self.runner.job.mode = "normal"
            try:
                self.runner.job.check_subprocess_status(self.runner.process)
            except CommandFailure as error:
                # A failed restart after the format is a failed start; do not
                # report success to the caller.
                self.log.info("Failed to start after format: %s", str(error))
                raise ServerFailed(
                    "Failed to start servers after format: {}".format(error))

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe storage the servers are killed, the storage is reset and
        the /mnt/daos* mounts are handed back to the current user; otherwise
        the runner is simply stopped.

        Raises:
            ServerFailed: if the runner fails to stop the servers

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            storage_reset(self._hosts)
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.
        """
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            # KILL needs the same privileges as INT: servers launched as root
            # cannot be signalled by an unprivileged pkill.
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers."""
        last_server = self.runner.job.yaml_params.server_params[-1]
        scm_mount = last_server.scm_mount
        scm_list = last_server.scm_list.value

        clean_cmds = [
            "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
        ]

        if self.runner.job.yaml_params.is_nvme():
            clean_cmds.append(
                "sudo rm -rf {0}; sudo umount {0}".format(scm_mount))

        # scm_mount can be /mnt/daos0 or /mnt/daos1 for two daos_server
        # instances. Presently, not supported in DAOS. The for loop needs
        # to be updated in future to handle it. Single instance pmem
        # device should work now.
        if self.runner.job.yaml_params.is_scm():
            # Tolerate a missing device list instead of failing on None.
            for device in scm_list or []:
                clean_cmds.append(
                    "sudo umount {}; sudo wipefs -a {}".format(
                        scm_mount, device))

        self.log.info("Cleanup of %s directory.", str(scm_mount))
        pcmd(self._hosts, "; ".join(clean_cmds), False)
class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command."""

    def __init__(self, daosbinpath, runnerpath, timeout=300):
        """Create a ServerManager object.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)            # ServerCommand param
        self.insecure = BasicParameter(None, True)         # ServerCommand param
        self.recreate = BasicParameter(None, False)        # ServerCommand param
        self.sudo = BasicParameter(None, False)            # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)   # ServerCommand param
        self.report_uri = BasicParameter(None)             # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)                 # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list, this sizes the runner (process count
        and host file) and tells the server job which hosts it runs on.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        self.runner.processes.value = len(self._hosts)
        self.runner.hostfile.value = write_host_file(
            self._hosts, workdir, slots)
        self.runner.job.server_list = self._hosts

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters to their respective ServerCommand
        and Orterun class parameters.

        Args:
            test (Test): avocado Test object
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]

        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)

        job = self.runner.job
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name == "sudo":
                    setattr(job, name, value)
                elif name == "srv_timeout":
                    # The manager's srv_timeout is the job's plain "timeout".
                    setattr(job, "timeout", value)
                else:
                    getattr(job, name).value = value
            if name in server_start_params:
                getattr(job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

        # Run daos_server with test variant specific log file names if specified
        self.runner.job.yaml_params.update_log_files(
            getattr(test, "control_log"),
            getattr(test, "helper_log"),
            getattr(test, "server_log"))

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))

        # Temporary display debug mount information
        separator = "=" * 80
        self.log.info("%s", separator)
        pcmd(self._hosts, "df -h -t tmpfs", True, None, None)
        self.log.info("%s", separator)

        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares any SCM/NVMe storage, launches the servers
        in "format" mode, formats the storage and then waits for the servers
        to come up in "normal" mode.

        Args:
            yamlfile: server configuration handed to the job's set_config()

        Raises:
            ServerFailed: if the servers fail to start, either before or
                after the storage format

        Returns:
            bool: True if the servers started

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        yaml_params = self.runner.job.yaml_params

        # Prepare SCM storage in servers
        if yaml_params.is_scm():
            storage_prep_flag = "dcpm"
            self.log.info("Performing SCM storage prepare in <format> mode")
        else:
            storage_prep_flag = "ram"

        # Prepare nvme storage in servers
        if yaml_params.is_nvme():
            # The flag is always "dcpm" or "ram" at this point, so this yields
            # "dcpm_nvme" or "ram_nvme".
            storage_prep_flag += "_nvme"
            self.log.info("Performing NVMe storage prepare in <format> mode")

            # Make sure log file has been created for ownership change
            lfile = yaml_params.server_params[-1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

        if storage_prep_flag != "ram":
            self.storage_prepare(getpass.getuser(), storage_prep_flag)
            self.runner.mca.update(
                {"plm_rsh_args": "-l root"}, "orterun.mca", True)

        # Start the server and wait for each host to require a SCM format
        self.runner.job.mode = "format"
        try:
            self.run()
        except CommandFailure as error:
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

        # Format storage and wait for server to change ownership
        self.log.info("Formatting hosts: <%s>", self._hosts)
        servers_with_ports = [
            "{}:{}".format(host, self.runner.job.yaml_params.port)
            for host in self._hosts
        ]
        storage_format(self.daosbinpath, ",".join(servers_with_ports))

        # Wait for all the daos_io_servers to start
        self.runner.job.mode = "normal"
        if not self.runner.job.check_subprocess_status(self.runner.process):
            raise ServerFailed("Failed to start servers after format")

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe storage the servers are killed, the storage is reset and
        the /mnt/daos* mounts are handed back to the current user; otherwise
        the runner is simply stopped.

        Raises:
            ServerFailed: if the runner fails to stop the servers or the
                NVMe storage reset fails

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            self.storage_reset()
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.
        """
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            # KILL needs the same privileges as INT: servers launched as root
            # cannot be signalled by an unprivileged pkill.
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers.

        Builds one shell command per distinct clean-up step across all the
        configured servers and runs them on every host in a single pcmd call.
        """
        clean_cmds = []

        def add_once(command):
            """Queue a command unless an identical one is already queued."""
            if command not in clean_cmds:
                clean_cmds.append(command)

        for server_params in self.runner.job.yaml_params.server_params:
            scm_mount = server_params.scm_mount.value
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            add_once("rm -fr {}/*".format(scm_mount))

            # Dismount the scm mount point
            add_once(
                "while sudo umount {}; do continue; done".format(scm_mount))

            if self.runner.job.yaml_params.is_scm():
                scm_list = server_params.scm_list.value
                if isinstance(scm_list, list):
                    self.log.info(
                        "Cleaning up the following device(s): %s.",
                        ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
                    add_once("; ".join(cmd_list))

        pcmd(self._hosts, "; ".join(clean_cmds), True)

    def storage_prepare(self, user, device_type):
        """Prepare server's storage using the DAOS server's yaml settings file.

        Args:
            user (str): username for file permissions
            device_type (str): storage type - one of "dcpm", "dcpm_nvme",
                "ram_nvme" or "nvme"

        Raises:
            ServerFailed: if the device type is not recognized or the server
                failed to prepare storage

        """
        # Get the daos_server from the install path. Useful for testing
        # with daos built binaries.
        daos_srv_bin = os.path.join(self.daosbinpath, "daos_server")

        # (device selection flag, extra arguments) for each storage type.  An
        # empty selection flag leaves the choice of devices to daos_server.
        hugepages = " --hugepages=4096"
        prepare_options = {
            "dcpm": ("-s", ""),
            "dcpm_nvme": ("", hugepages),
            "ram_nvme": ("-n", hugepages),
            "nvme": ("-n", hugepages),
        }
        if device_type not in prepare_options:
            raise ServerFailed("Invalid device type")
        dev_param, device_args = prepare_options[device_type]

        cmd = "{} storage prepare {} -u \"{}\" {} -f".format(
            daos_srv_bin, dev_param, user, device_args)
        result = pcmd(self._hosts, cmd, timeout=120)
        # Success means the only entry in the result is the one for 0
        # (presumably the exit status shared by every host).
        if len(result) > 1 or 0 not in result:
            raise ServerFailed(
                "Error preparing {} storage".format(device_type))

    def storage_reset(self):
        """Reset the servers' storage.

        NOTE: Don't enhance this method to reset SCM. SCM will not be in a
        useful state for running next tests.

        Raises:
            ServerFailed: if server failed to reset storage

        """
        daos_srv_bin = os.path.join(self.daosbinpath, "daos_server")
        cmd = "{} storage prepare -n --reset -f".format(daos_srv_bin)
        result = pcmd(self._hosts, cmd)
        # Success means the only entry in the result is the one for 0
        # (presumably the exit status shared by every host).
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")
class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command."""

    # pylint: disable=pylint-no-self-use

    def __init__(self, daosbinpath, runnerpath, attach="/tmp", timeout=300):
        """Create a ServerManager object.

        Creating the object also sets CRT_ATTACH_INFO_PATH in this process's
        environment to the attach path.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            attach (str, optional): Directory used as the attach info path
                (CRT_ATTACH_INFO_PATH). Defaults to "/tmp".
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(
            DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Set server environment
        os.environ["CRT_ATTACH_INFO_PATH"] = attach

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)            # ServerCommand param
        self.attach = BasicParameter(None, attach)         # ServerCommand param
        self.insecure = BasicParameter(None, True)         # ServerCommand param
        self.recreate = BasicParameter(None, True)         # ServerCommand param
        self.sudo = BasicParameter(None, False)            # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)   # ServerCommand param
        self.report_uri = BasicParameter(None)             # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)                 # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list, this sizes the runner (process count
        and host file) and tells the server job how many servers there are.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        host_count = len(self._hosts)
        self.runner.processes.value = host_count
        self.runner.hostfile.value = write_host_file(
            self._hosts, workdir, slots)
        self.runner.job.server_cnt = host_count

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters respectively to the server
        command and the orterun command.

        Args:
            test (Test): avocado Test object
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["attach", "insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]

        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)

        job = self.runner.job
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name in ("sudo", "srv_timeout"):
                    # These two are stored on the job as plain attributes,
                    # not as parameter objects.
                    # NOTE(review): the timeout is stored under the name
                    # "srv_timeout" - confirm the job reads that attribute.
                    setattr(job, name, value)
                else:
                    getattr(job, name).value = value
            if name in server_start_params:
                getattr(job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))
        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares any NVMe storage, launches the servers
        and, when SCM or NVMe storage is configured, formats the storage and
        waits for the servers to come up again.

        Args:
            yamlfile: server configuration handed to the job's set_config()

        Raises:
            ServerFailed: if the servers fail to start, either before or
                after the storage format

        Returns:
            bool: True if the servers started

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        yaml_params = self.runner.job.yaml_params

        # Prepare nvme storage in servers
        if yaml_params.is_nvme():
            self.log.info("Performing nvme storage prepare in <format> mode")
            storage_prepare(self._hosts, "root")
            self.runner.mca.value = {"plm_rsh_args": "-l root"}

        # Make sure log file has been created for ownership change
        lfile = yaml_params.server_params[-1].log_file.value
        if lfile is not None:
            self.log.info("Creating log file")
            cmd_touch_log = "touch {}".format(lfile)
            pcmd(self._hosts, cmd_touch_log, False)

        # Change ownership of attach info directory
        chmod_attach = "chmod 777 -R {}".format(self.attach.value)
        pcmd(self._hosts, chmod_attach, False)

        try:
            self.run()
        except CommandFailure as details:
            self.log.info("<SERVER> Exception occurred: %s", str(details))
            # Kill the subprocess, anything that might have started
            self.kill()
            raise ServerFailed(
                "Failed to start server in {} mode.".format(
                    self.runner.job.mode))

        if self.runner.job.yaml_params.is_nvme() or \
                self.runner.job.yaml_params.is_scm():
            # Setup the hostlist to pass to dmg command
            servers_with_ports = [
                "{}:{}".format(host, self.runner.job.yaml_params.port)
                for host in self._hosts]

            # Format storage and wait for server to change ownership
            self.log.info("Formatting hosts: <%s>", self._hosts)
            storage_format(self.daosbinpath, ",".join(servers_with_ports))
            self.runner.job.mode = "normal"
            try:
                self.runner.job.check_subprocess_status(self.runner.process)
            except CommandFailure as error:
                # A failed restart after the format is a failed start; do not
                # report success to the caller.
                self.log.info("Failed to start after format: %s", str(error))
                raise ServerFailed(
                    "Failed to start servers after format: {}".format(error))

        # Change ownership shared attach info file
        chmod_cmds = "sudo chmod 777 {}/daos_server.attach_info_tmp".format(
            self.attach.value)
        pcmd(self._hosts, chmod_cmds, False)

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe storage the servers are killed, the storage is reset and
        the /mnt/daos* mounts are handed back to the current user; otherwise
        the runner is simply stopped.

        Raises:
            ServerFailed: if the runner fails to stop the servers

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            storage_reset(self._hosts)
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.
        """
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            # KILL needs the same privileges as INT: servers launched as root
            # cannot be signalled by an unprivileged pkill.
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers.

        The contents of /mnt/daos are always removed; with NVMe or SCM
        storage the directory itself is also removed and unmounted.
        """
        clean_cmds = [
            "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
        ]

        yaml_params = self.runner.job.yaml_params
        if yaml_params.is_nvme() or yaml_params.is_scm():
            clean_cmds.append("sudo rm -rf /mnt/daos; sudo umount /mnt/daos")

        self.log.info("Cleanup of /mnt/daos directory.")
        pcmd(self._hosts, "; ".join(clean_cmds), False)