コード例 #1
0
class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command.

    The daos_server job (DaosServer) is wrapped in an Orterun runner.  The
    hosts the servers run on are assigned through the ``hosts`` property.
    """

    def __init__(self, daosbinpath, runnerpath, timeout=300):
        """Create a ServerManager object.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)  # ServerCommand param
        self.insecure = BasicParameter(None, True)  # ServerCommand param
        self.recreate = BasicParameter(None, True)  # ServerCommand param
        self.sudo = BasicParameter(None, False)  # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)  # ServerCommand param
        self.report_uri = BasicParameter(None)  # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)  # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list this also sizes the runner (process
        count and hostfile) and tells the server job which hosts it uses.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        self.runner.processes.value = len(self._hosts)
        self.runner.hostfile.value = write_host_file(self._hosts, workdir,
                                                     slots)
        self.runner.job.server_cnt = len(self._hosts)
        self.runner.job.server_list = self._hosts

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters to their respective ServerCommand
        and Orterun class parameters.

        Args:
            test (Test): avocado Test object
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]
        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name in ("sudo", "srv_timeout"):
                    # These two are set on the job as plain attributes,
                    # replacing whatever is there, not through ``.value``.
                    setattr(self.runner.job, name, value)
                else:
                    getattr(self.runner.job, name).value = value
            if name in server_start_params:
                getattr(self.runner.job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))
        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares SCM/NVMe storage when the server yaml
        requests it, launches the servers and, for SCM or NVMe
        configurations, formats the storage afterwards.

        Args:
            yamlfile (str): server configuration file passed to the job's
                set_config() method

        Raises:
            ServerFailed: if the runner raises a CommandFailure while
                starting the servers

        Returns:
            bool: always True when no exception is raised

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        # Prepare SCM storage in servers
        if self.runner.job.yaml_params.is_scm():
            storage_prep_flag = "dcpm"
            self.log.info("Performing SCM storage prepare in <format> mode")
        else:
            storage_prep_flag = "ram"

        # Prepare nvme storage in servers
        if self.runner.job.yaml_params.is_nvme():
            # The flag is always "dcpm" or "ram" at this point, so the result
            # is either "dcpm_nvme" or "ram_nvme".
            storage_prep_flag += "_nvme"
            self.log.info("Performing NVMe storage prepare in <format> mode")
            # Make sure log file has been created for ownership change
            lfile = self.runner.job.yaml_params.server_params[
                -1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

        # Plain ram storage needs no prepare step; everything else is
        # prepared as root and the runner is told to log in as root.
        if storage_prep_flag != "ram":
            storage_prepare(self._hosts, "root", storage_prep_flag)
            self.runner.mca.value = {"plm_rsh_args": "-l root"}

        try:
            self.run()
        except CommandFailure as details:
            self.log.info("<SERVER> Exception occurred: %s", str(details))
            # Kill the subprocess, anything that might have started
            self.kill()
            raise ServerFailed("Failed to start server in {} mode.".format(
                self.runner.job.mode))

        if self.runner.job.yaml_params.is_nvme() or \
           self.runner.job.yaml_params.is_scm():
            # Setup the hostlist to pass to dmg command
            servers_with_ports = [
                "{}:{}".format(host, self.runner.job.yaml_params.port)
                for host in self._hosts
            ]

            # Format storage and wait for server to change ownership
            self.log.info("Formatting hosts: <%s>", self._hosts)
            storage_format(self.daosbinpath, ",".join(servers_with_ports))
            self.runner.job.mode = "normal"
            try:
                self.runner.job.check_subprocess_status(self.runner.process)
            except CommandFailure as error:
                # Best effort: a failure here is only logged, not raised
                self.log.info("Failed to start after format: %s", str(error))

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe configured the servers are killed, the storage is reset and
        the mount ownership is handed back to the current user.  Otherwise
        the runner is asked to stop.

        Raises:
            ServerFailed: if the runner raises a CommandFailure while
                stopping

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            storage_reset(self._hosts)
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.

        """
        # Both signals are sent with sudo: start() may launch the servers as
        # root, and an unprivileged KILL would leave those processes running.
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers."""
        # NOTE(review): scm_mount is used without ``.value`` (unlike
        # scm_list), so the commands rely on its string form - confirm.
        scm_mount = self.runner.job.yaml_params.server_params[-1].scm_mount
        scm_list = self.runner.job.yaml_params.server_params[-1].scm_list.value
        clean_cmds = [
            "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
        ]
        if self.runner.job.yaml_params.is_nvme():
            clean_cmds.append("sudo rm -rf {0};  \
                               sudo umount {0}".format(scm_mount))
        # scm_mount can be /mnt/daos0 or /mnt/daos1 for two daos_server
        # instances. Presently, not supported in DAOS. The for loop needs
        # to be updated in future to handle it. Single instance pmem
        # device should work now.
        if self.runner.job.yaml_params.is_scm():
            for value in scm_list:
                clean_cmds.append("sudo umount {}; \
                                   sudo wipefs -a {}".format(scm_mount, value))
        self.log.info("Cleanup of %s directory.", str(scm_mount))
        pcmd(self._hosts, "; ".join(clean_cmds), False)
コード例 #2
0
ファイル: server_utils.py プロジェクト: shingchuang/daos
class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command.

    The daos_server job (DaosServer) is wrapped in an Orterun runner.  The
    hosts the servers run on are assigned through the ``hosts`` property.
    """

    def __init__(self, daosbinpath, runnerpath, timeout=300):
        """Create a ServerManager object.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)  # ServerCommand param
        self.insecure = BasicParameter(None, True)  # ServerCommand param
        self.recreate = BasicParameter(None, False)  # ServerCommand param
        self.sudo = BasicParameter(None, False)  # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)  # ServerCommand param
        self.report_uri = BasicParameter(None)  # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)  # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list this also sizes the runner (process
        count and hostfile) and tells the server job which hosts it uses.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        self.runner.processes.value = len(self._hosts)
        self.runner.hostfile.value = write_host_file(self._hosts, workdir,
                                                     slots)
        self.runner.job.server_list = self._hosts

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters to their respective ServerCommand
        and Orterun class parameters.

        Args:
            test (Test): avocado Test object; its control_log, helper_log and
                server_log attributes are read as well
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]
        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name == "sudo":
                    setattr(self.runner.job, name, value)
                elif name == "srv_timeout":
                    # The manager's srv_timeout is the job's "timeout"
                    setattr(self.runner.job, "timeout", value)
                else:
                    getattr(self.runner.job, name).value = value
            if name in server_start_params:
                getattr(self.runner.job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

        # Run daos_server with test variant specific log file names if specified
        self.runner.job.yaml_params.update_log_files(
            test.control_log, test.helper_log, test.server_log)

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))

        # Temporary display debug mount information
        self.log.info("%s", "=" * 80)
        pcmd(self._hosts, "df -h -t tmpfs", True, None, None)
        self.log.info("%s", "=" * 80)

        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares SCM/NVMe storage when the server yaml
        requests it, launches the servers in "format" mode, formats the
        storage and then waits for the servers in "normal" mode.

        Args:
            yamlfile (str): server configuration file passed to the job's
                set_config() method

        Raises:
            ServerFailed: if the servers fail to start before the format, or
                the subprocess status check fails after the format

        Returns:
            bool: always True when no exception is raised

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        # Prepare SCM storage in servers
        if self.runner.job.yaml_params.is_scm():
            storage_prep_flag = "dcpm"
            self.log.info("Performing SCM storage prepare in <format> mode")
        else:
            storage_prep_flag = "ram"

        # Prepare nvme storage in servers
        if self.runner.job.yaml_params.is_nvme():
            # The flag is always "dcpm" or "ram" at this point, so the result
            # is either "dcpm_nvme" or "ram_nvme".
            storage_prep_flag += "_nvme"
            self.log.info("Performing NVMe storage prepare in <format> mode")
            # Make sure log file has been created for ownership change
            lfile = self.runner.job.yaml_params.server_params[
                -1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

        # Plain ram storage needs no prepare step
        if storage_prep_flag != "ram":
            self.storage_prepare(getpass.getuser(), storage_prep_flag)
            self.runner.mca.update({"plm_rsh_args": "-l root"}, "orterun.mca",
                                   True)

        # Start the server and wait for each host to require a SCM format
        self.runner.job.mode = "format"
        try:
            self.run()
        except CommandFailure as error:
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

        # Format storage and wait for server to change ownership
        self.log.info("Formatting hosts: <%s>", self._hosts)
        servers_with_ports = [
            "{}:{}".format(host, self.runner.job.yaml_params.port)
            for host in self._hosts
        ]
        storage_format(self.daosbinpath, ",".join(servers_with_ports))

        # Wait for all the daos_io_servers to start
        self.runner.job.mode = "normal"
        if not self.runner.job.check_subprocess_status(self.runner.process):
            raise ServerFailed("Failed to start servers after format")

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe configured the servers are killed, the storage is reset and
        the mount ownership is handed back to the current user.  Otherwise
        the runner is asked to stop.

        Raises:
            ServerFailed: if the runner raises a CommandFailure while
                stopping, or the NVMe storage reset fails

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            self.storage_reset()
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.

        """
        # Both signals are sent with sudo: start() may launch the servers as
        # root, and an unprivileged KILL would leave those processes running.
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers.

        Builds one de-duplicated list of shell commands covering every
        server_params entry and runs it on all hosts in a single call.
        """
        clean_cmds = []
        for server_params in self.runner.job.yaml_params.server_params:
            scm_mount = server_params.scm_mount.value
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.runner.job.yaml_params.is_scm():
                scm_list = server_params.scm_list.value
                if isinstance(scm_list, list):
                    self.log.info("Cleaning up the following device(s): %s.",
                                  ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]", "then while sudo umount $mount",
                        "do continue", "done", "fi", "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), True)

    def storage_prepare(self, user, device_type):
        """Prepare server's storage using the DAOS server's yaml settings file.

        Args:
            user (str): username for file permissions
            device_type (str): storage type - one of "dcpm", "dcpm_nvme",
                "ram_nvme" or "nvme"

        Raises:
            ServerFailed: if the device type is invalid or the server failed
                to prepare storage

        """
        # Get the daos_server from the install path. Useful for testing
        # with daos built binaries.
        dev_param = ""
        device_args = ""
        daos_srv_bin = os.path.join(self.daosbinpath, "daos_server")
        if device_type == "dcpm":
            dev_param = "-s"
        elif device_type == "dcpm_nvme":
            # No device selector is passed for the combined case
            device_args = " --hugepages=4096"
        elif device_type in ("ram_nvme", "nvme"):
            dev_param = "-n"
            device_args = " --hugepages=4096"
        else:
            raise ServerFailed("Invalid device type")
        cmd = "{} storage prepare {} -u \"{}\" {} -f".format(
            daos_srv_bin, dev_param, user, device_args)
        result = pcmd(self._hosts, cmd, timeout=120)
        # Anything other than a single entry keyed by 0 is treated as failure
        if len(result) > 1 or 0 not in result:
            raise ServerFailed(
                "Error preparing {} storage".format(device_type))

    def storage_reset(self):
        """Reset the servers' storage.

        NOTE: Don't enhance this method to reset SCM. SCM will not be in a
        useful state for running next tests.

        Raises:
            ServerFailed: if server failed to reset storage

        """
        daos_srv_bin = os.path.join(self.daosbinpath, "daos_server")
        cmd = "{} storage prepare -n --reset -f".format(daos_srv_bin)
        result = pcmd(self._hosts, cmd)
        # Anything other than a single entry keyed by 0 is treated as failure
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")
コード例 #3
0
class ServerManager(ExecutableCommand):
    """Defines object to manage server functions and launch server command.

    The daos_server job (DaosServer) is wrapped in an Orterun runner.  The
    hosts the servers run on are assigned through the ``hosts`` property.
    """
    # pylint: disable=pylint-no-self-use

    def __init__(self, daosbinpath, runnerpath, attach="/tmp", timeout=300):
        """Create a ServerManager object.

        Note that this also sets the CRT_ATTACH_INFO_PATH environment
        variable of the current process to ``attach``.

        Args:
            daosbinpath (str): Path to daos bin
            runnerpath (str): Path to Orterun binary.
            attach (str, optional): Attach info path. Defaults to "/tmp".
            timeout (int, optional): Time for the server to start.
                Defaults to 300.
        """
        super(ServerManager, self).__init__("/run/server_manager/*", "", "")

        self.daosbinpath = daosbinpath
        self._hosts = None

        # Setup orterun command defaults
        self.runner = Orterun(
            DaosServer(self.daosbinpath), runnerpath, True)

        # Setup server command defaults
        self.runner.job.action.value = "start"
        self.runner.job.get_action_command()

        # Set server environment
        os.environ["CRT_ATTACH_INFO_PATH"] = attach

        # Parameters that user can specify in the test yaml to modify behavior.
        self.debug = BasicParameter(None, True)       # ServerCommand param
        self.attach = BasicParameter(None, attach)    # ServerCommand param
        self.insecure = BasicParameter(None, True)    # ServerCommand param
        self.recreate = BasicParameter(None, True)    # ServerCommand param
        self.sudo = BasicParameter(None, False)       # ServerCommand param
        self.srv_timeout = BasicParameter(None, timeout)   # ServerCommand param
        self.report_uri = BasicParameter(None)             # Orterun param
        self.enable_recovery = BasicParameter(None, True)  # Orterun param
        self.export = BasicParameter(None)                 # Orterun param

    @property
    def hosts(self):
        """Hosts attribute getter."""
        return self._hosts

    @hosts.setter
    def hosts(self, value):
        """Hosts attribute setter.

        Besides storing the host list this also sizes the runner (process
        count and hostfile) and sets the server count on the job.

        Args:
            value (tuple): (list of hosts, workdir, slots)
        """
        self._hosts, workdir, slots = value
        self.runner.processes.value = len(self._hosts)
        self.runner.hostfile.value = write_host_file(
            self._hosts, workdir, slots)
        self.runner.job.server_cnt = len(self._hosts)

    def get_params(self, test):
        """Get values from the yaml file.

        Assign the ServerManager parameters to their respective server
        command and orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        server_params = ["debug", "sudo", "srv_timeout"]
        server_start_params = ["attach", "insecure", "recreate"]
        runner_params = ["enable_recovery", "export", "report_uri"]
        super(ServerManager, self).get_params(test)
        self.runner.job.yaml_params.get_params(test)
        self.runner.get_params(test)
        for name in self.get_param_names():
            if name in server_params:
                value = getattr(self, name).value
                if name in ("sudo", "srv_timeout"):
                    # These two are set on the job as plain attributes,
                    # replacing whatever is there, not through ``.value``.
                    setattr(self.runner.job, name, value)
                else:
                    getattr(self.runner.job, name).value = value
            if name in server_start_params:
                getattr(self.runner.job.action_command, name).value = \
                    getattr(self, name).value
            if name in runner_params:
                getattr(self.runner, name).value = getattr(self, name).value

    def run(self):
        """Execute the runner subprocess.

        Returns:
            the value returned by the runner's run() method

        """
        self.log.info("Start CMD>>> %s", str(self.runner))
        return self.runner.run()

    def start(self, yamlfile):
        """Start the server through the runner.

        Cleans the hosts, prepares NVMe storage when the server yaml requests
        it, launches the servers and, for SCM or NVMe configurations, formats
        the storage afterwards.

        Args:
            yamlfile (str): server configuration file passed to the job's
                set_config() method

        Raises:
            ServerFailed: if the runner raises a CommandFailure while
                starting the servers

        Returns:
            bool: always True when no exception is raised

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        # Prepare nvme storage in servers
        if self.runner.job.yaml_params.is_nvme():
            self.log.info("Performing nvme storage prepare in <format> mode")
            storage_prepare(self._hosts, "root")
            self.runner.mca.value = {"plm_rsh_args": "-l root"}

            # Make sure log file has been created for ownership change
            lfile = self.runner.job.yaml_params.server_params[-1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

            # Change ownership of attach info directory
            chmod_attach = "chmod 777 -R {}".format(self.attach.value)
            pcmd(self._hosts, chmod_attach, False)

        try:
            self.run()
        except CommandFailure as details:
            self.log.info("<SERVER> Exception occurred: %s", str(details))
            # Kill the subprocess, anything that might have started
            self.kill()
            raise ServerFailed(
                "Failed to start server in {} mode.".format(
                    self.runner.job.mode))

        if self.runner.job.yaml_params.is_nvme() or \
           self.runner.job.yaml_params.is_scm():
            # Setup the hostlist to pass to dmg command
            servers_with_ports = [
                "{}:{}".format(host, self.runner.job.yaml_params.port)
                for host in self._hosts]

            # Format storage and wait for server to change ownership
            self.log.info("Formatting hosts: <%s>", self._hosts)
            storage_format(self.daosbinpath, ",".join(servers_with_ports))
            self.runner.job.mode = "normal"
            try:
                self.runner.job.check_subprocess_status(self.runner.process)
            except CommandFailure as error:
                # Best effort: a failure here is only logged, not raised
                self.log.info("Failed to start after format: %s", str(error))

            # Change ownership shared attach info file
            chmod_cmds = "sudo chmod 777 {}/daos_server.attach_info_tmp".format(
                self.attach.value)
            pcmd(self._hosts, chmod_cmds, False)

        return True

    def stop(self):
        """Stop the server through the runner.

        With NVMe configured the servers are killed, the storage is reset and
        the mount ownership is handed back to the current user.  Otherwise
        the runner is asked to stop.

        Raises:
            ServerFailed: if the runner raises a CommandFailure while
                stopping

        """
        self.log.info("Stopping servers")
        if self.runner.job.yaml_params.is_nvme():
            self.kill()
            storage_reset(self._hosts)
            # Make sure the mount directory belongs to non-root user
            self.log.info("Changing ownership of mount to non-root user")
            cmd = "sudo chown -R {0}:{0} /mnt/daos*".format(getpass.getuser())
            pcmd(self._hosts, cmd, False)
        else:
            try:
                self.runner.stop()
            except CommandFailure as error:
                raise ServerFailed("Failed to stop servers:{}".format(error))

    def server_clean(self):
        """Prepare the hosts before starting daos server."""
        # Kill any daos servers running on the hosts
        self.kill()
        # Clean up any files that exist on the hosts
        self.clean_files()

    def kill(self):
        """Forcibly kill any daos server processes running on hosts.

        Sometimes stop doesn't get everything.  Really whack everything
        with this.

        """
        # Both signals are sent with sudo: start() may launch the servers as
        # root, and an unprivileged KILL would leave those processes running.
        kill_cmds = [
            "sudo pkill '(daos_server|daos_io_server)' --signal INT",
            "sleep 5",
            "sudo pkill '(daos_server|daos_io_server)' --signal KILL",
        ]
        self.log.info("Killing any server processes")
        pcmd(self._hosts, "; ".join(kill_cmds), False, None, None)

    def clean_files(self):
        """Clean the tmpfs on the servers.

        The contents of /mnt/daos are always removed; with NVMe or SCM
        configured the directory itself is removed and unmounted as well.
        """
        clean_cmds = [
            "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
        ]

        if self.runner.job.yaml_params.is_nvme() or \
           self.runner.job.yaml_params.is_scm():
            clean_cmds.append("sudo rm -rf /mnt/daos; sudo umount /mnt/daos")

        self.log.info("Cleanup of /mnt/daos directory.")
        pcmd(self._hosts, "; ".join(clean_cmds), False)