Example #1
    def _run_process(self):
        """Run the command as a foreground process.

        Raises:
            CommandFailure: if there is an error running the command

        """
        command = str(self)
        try:
            # Block until the command is complete or times out
            return run_command(command,
                               self.timeout,
                               self.verbose,
                               self.exit_status_exception,
                               "combined",
                               env=self.env)

        except DaosTestError as error:
            # Command failed or possibly timed out
            raise CommandFailure(error) from error
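
The pattern above wraps the low-level DaosTestError in a CommandFailure so callers only need to handle one exception type. A minimal, self-contained sketch of the same wrapping-and-chaining idea, using stand-in exception classes rather than the real DAOS ones:

class RunError(Exception):
    """Stand-in for a low-level execution error such as DaosTestError."""

class CommandFailure(Exception):
    """Stand-in for the test framework's CommandFailure."""

def run_command(command):
    """Pretend to run a command that always fails."""
    raise RunError("'{}' returned a non-zero exit status".format(command))

def run_process(command):
    """Wrap the low-level error in the command-specific exception."""
    try:
        return run_command(command)
    except RunError as error:
        # 'from error' keeps the original traceback attached as __cause__
        raise CommandFailure(error) from error

try:
    run_process("daos pool list")
except CommandFailure as error:
    print("caught: {} (caused by {!r})".format(error, error.__cause__))
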
Example #2
    def parse_output(self, stdout, regex_method):
        """Parse output using findall() with supplied 'regex_method' as pattern.

        Args:
            stdout (str): output to parse
            regex_method (str): name of the method regex to use

        Raises:
            CommandFailure: if there is an error finding the method's regex
                pattern.

        Returns:
            list: a list of strings obtained from the method's output parsed
                through its regex

        """
        if regex_method not in self.METHOD_REGEX:
            raise CommandFailure(
                "No pattern regex defined for '{}()'".format(regex_method))
        return re.findall(self.METHOD_REGEX[regex_method], stdout)
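
To make the lookup concrete, here is a small stand-alone sketch of a METHOD_REGEX-style table driving re.findall(); the pattern and the sample output are invented for illustration and are not taken from the real daos tool:

import re

# Hypothetical pattern table; the real METHOD_REGEX lives on the command class
METHOD_REGEX = {
    "container_create": r"[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}",
}

def parse_output(stdout, regex_method):
    """Return all matches of the named pattern, or fail if it is unknown."""
    if regex_method not in METHOD_REGEX:
        raise ValueError(
            "No pattern regex defined for '{}()'".format(regex_method))
    return re.findall(METHOD_REGEX[regex_method], stdout)

sample = "created container 12345678-abcd-abcd-abcd-1234567890ab\n"
print(parse_output(sample, "container_create"))
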
Example #3
    def _report_unit_command(self, command):
        """Run the systemctl command and report the log data on an error.

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if there is an issue running the command

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            return self._run_unit_command(command)
        except CommandFailure as error:
            self.log.info(error)
            self.display_log_data(
                self.get_log_data(self._hosts, self.timestamps[command]))
            raise CommandFailure(error) from error
Example #4
    def __init__(self, command, manager="Orterun"):
        """Create a SubprocessManager object.

        Args:
            command (YamlCommand): command to manage as a subprocess
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "Orterun".
        """
        self.log = getLogger(__name__)

        # Define the JobManager class used to manage the command as a subprocess
        try:
            manager_module = import_module("job_manager_utils")
            manager_class = getattr(manager_module, manager)
        except (ImportError, AttributeError) as error:
            raise CommandFailure("Invalid '{}' job manager class: {}".format(
                manager, error)) from error
        self.manager = manager_class(command, subprocess=True)

        # Define the list of hosts that will execute the daos command
        self._hosts = []
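
The dynamic lookup of the manager class is plain importlib plus getattr. A runnable sketch of the same pattern, resolving a class from the standard library instead of the DAOS job_manager_utils module:

from importlib import import_module

def load_class(module_name, class_name):
    """Resolve a class by name, turning lookup problems into one error."""
    try:
        module = import_module(module_name)
        return getattr(module, class_name)
    except (ImportError, AttributeError) as error:
        raise RuntimeError("Invalid '{}' class: {}".format(
            class_name, error)) from error

counter_class = load_class("collections", "Counter")
print(counter_class("daos"))

# A bad name surfaces as a single, descriptive error
try:
    load_class("collections", "NoSuchManager")
except RuntimeError as error:
    print(error)
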
Example #5
    def start(self):
        """Start the daos command.

        Raises:
            CommandFailure: if the daos command fails to start

        """
        # Create the yaml file for the daos command
        self.manager.job.temporary_file_hosts = self._hosts
        self.manager.job.create_yaml_file()

        # Start the daos command
        try:
            self.manager.run()
        except CommandFailure as error:
            # Kill the subprocess and anything that might have started
            self.manager.kill()
            raise CommandFailure("Failed to start {}.".format(
                str(self.manager.job))) from error
        finally:
            # Define the expected states for each rank
            self._expected_states = self.get_current_state()
Example #6
    def __init__(self, job, subprocess=False):
        """Create a Orterun object.

        Args:
            job (ExecutableCommand): command object to manage.
            subprocess (bool, optional): whether the command is run as a
                subprocess. Defaults to False.

        Raises:
            CommandFailure: if the openmpi module cannot be loaded
        """
        if not load_mpi("openmpi"):
            raise CommandFailure("Failed to load openmpi")

        path = os.path.dirname(find_executable("orterun"))
        super(Orterun, self).__init__("/run/orterun/*", "orterun", job, path,
                                      subprocess)

        # Default mca values to avoid queue pair errors
        mca_default = {
            "btl_openib_warn_default_gid_prefix": "0",
            "btl": "tcp,self",
            "oob": "tcp",
            "pml": "ob1",
            "btl_tcp_if_include": "eth0",
        }

        self.hostfile = FormattedParameter("--hostfile {}", None)
        self.processes = FormattedParameter("--np {}", 1)
        self.display_map = FormattedParameter("--display-map", False)
        self.map_by = FormattedParameter("--map-by {}", "node")
        self.export = FormattedParameter("-x {}", None)
        self.enable_recovery = FormattedParameter("--enable-recovery", True)
        self.report_uri = FormattedParameter("--report-uri {}", None)
        self.allow_run_as_root = FormattedParameter("--allow-run-as-root",
                                                    None)
        self.mca = FormattedParameter("--mca {}", mca_default)
        self.pprnode = FormattedParameter("--map-by ppr:{}:node", None)
        self.tag_output = FormattedParameter("--tag-output", True)
        self.ompi_server = FormattedParameter("--ompi-server {}", None)
        self.working_dir = FormattedParameter("-wdir {}", None)
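
For context, each FormattedParameter pairs a command-line template with a default value; how the mca dict ends up on the orterun command line is framework-specific. The sketch below shows one plausible rendering of such a dict as repeated --mca options, purely for illustration (it is not the FormattedParameter implementation):

# Hypothetical rendering of MCA defaults as repeated "--mca" options
mca_default = {
    "btl_openib_warn_default_gid_prefix": "0",
    "btl": "tcp,self",
    "oob": "tcp",
    "pml": "ob1",
    "btl_tcp_if_include": "eth0",
}

def render_mca(values):
    """Expand a dict of MCA settings into orterun-style arguments."""
    args = []
    for key in sorted(values):
        args.extend(["--mca", key, values[key]])
    return args

print(" ".join(["orterun"] + render_mca(mca_default) + ["--np", "1", "hostname"]))
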
Example #7
    def stop(self):
        """Stop the subprocess command.

        Raises:
            CommandFailure: if unable to stop

        """
        if self._process is not None:
            # Send a SIGTERM to stop the subprocess; if it is still running
            # after 5 seconds, send a SIGKILL and report an error
            signal_list = [signal.SIGTERM, signal.SIGKILL]

            # Turn off verbosity to keep the logs clean as the server stops
            self._process.verbose = False

            # Send signals while the process is still running
            state = None
            while self._process.poll() is None and signal_list:
                signal_to_send = signal_list.pop(0)
                msg = "before sending signal {}".format(signal_to_send)
                state = self.get_subprocess_state(msg)
                self.log.info(
                    "Sending signal %s to %s (state=%s)", str(signal_to_send),
                    self._command, str(state))
                self._process.send_signal(signal_to_send)
                if signal_list:
                    time.sleep(5)

            if not signal_list:
                if state and (len(state) > 1 or state[0] not in ("D", "Z")):
                    # Indicate an error if the process required a SIGKILL and
                    # either multiple processes were still found running or the
                    # parent process was in any state except uninterruptible
                    # sleep (D) or zombie (Z).
                    raise CommandFailure("Error stopping '{}'".format(self))

            self.log.info("%s stopped successfully", self.command)
            self._process = None
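
The escalation from SIGTERM to SIGKILL is independent of the DAOS plumbing and can be tried on any POSIX host. A self-contained sketch using a throwaway sleep child process (the 5-second grace period matches the method above):

import signal
import subprocess
import time

# Throwaway child process standing in for the managed subprocess
process = subprocess.Popen(["sleep", "60"])
signal_list = [signal.SIGTERM, signal.SIGKILL]

while process.poll() is None and signal_list:
    signal_to_send = signal_list.pop(0)
    print("Sending signal {} to pid {}".format(signal_to_send, process.pid))
    process.send_signal(signal_to_send)
    if signal_list:
        # Grace period before escalating to the next signal
        time.sleep(5)

# Give the last signal a moment to take effect before the final check
time.sleep(1)
if process.poll() is None:
    raise RuntimeError("Error stopping pid {}".format(process.pid))
print("pid {} exited with {}".format(process.pid, process.wait()))
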
Example #8
    def _execute_command(self,
                         command,
                         fail_on_err=True,
                         display_output=True,
                         hosts=None):
        """Execute the command on all client hosts.

        Optionally fail the test if the command returns a non-zero return code.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                the command returns a non-zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.
            hosts (list, optional): hosts on which to run the command. Defaults
                to None, in which case self.hostlist_clients is used.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on at
                least one of the client hosts

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        if hosts is None:
            hosts = self.hostlist_clients
        result = pcmd(hosts, command, verbose=display_output, timeout=300)
        if 0 not in result and fail_on_err:
            hosts = [
                str(nodes) for code, nodes in list(result.items()) if code != 0
            ]
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(",".join(hosts))))
        return result
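
pcmd() returns a dict mapping each exit code to the NodeSet of hosts that produced it; the check above raises only when no host returned 0. A small sketch of that check with plain strings standing in for NodeSet objects:

def check_result(command, result, fail_on_err=True):
    """Raise if no host returned 0 and fail_on_err is set."""
    if 0 not in result and fail_on_err:
        failed = ",".join(
            str(nodes) for code, nodes in result.items() if code != 0)
        raise RuntimeError("Error running '{}' on the following hosts: {}".format(
            command, failed))
    return result

check_result("uname -r", {0: "node[1-4]"})       # every host succeeded
try:
    check_result("uname -r", {1: "node1", 127: "node[2-4]"})
except RuntimeError as error:
    print(error)
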
Example #9
    def run(self):
        """Run the daos_racer command remotely.

        Raises:
            CommandFailure: if there is an error running the command

        """
        # Run daos_racer on the specified host
        self.log.info(
            "Running %s on %s with %s timeout", str(self), self.host,
            "no" if self.clush_timeout.value is None else "a {}s".format(
                self.clush_timeout.value))
        return_codes = pcmd([self.host], str(self), True,
                            self.clush_timeout.value)
        if 0 not in return_codes or len(return_codes) > 1:
            # Kill the daos_racer process if the remote command timed out
            if 255 in return_codes:
                self.log.info("Stopping timed out daos_racer process on %s",
                              self.host)
                pcmd([self.host], "pkill daos_racer", True)

            raise CommandFailure("Error running '{}'".format(self._command))

        self.log.info("Test passed!")
Example #10
def run_ior_loop(manager, uuids, tmpdir_base):
    """IOR run for each UUID provided.

    Args:
        manager (str): mpi job manager command
        uuids (list): list of container UUIDs
        tmpdir_base (str): base directory for the mpi orte_tmpdir_base mca parameter

    Raises:
        CommandFailure: if ior fails in one or more of the loops

    Returns:
        list: a list of CmdResults from each ior command run

    """
    results = []
    errors = []
    for index, cont_uuid in enumerate(uuids, 1):
        manager.job.dfs_cont.update(cont_uuid, "ior.cont_uuid")

        # Create a unique temporary directory for the manager command
        tmp_dir = mkdtemp(dir=tmpdir_base)
        manager.tmpdir_base.update(tmp_dir, "tmpdir_base")

        try:
            results.append(manager.run())
        except CommandFailure as error:
            ior_mode = "read" if "-r" in manager.job.flags.value else "write"
            errors.append(
                "IOR {} Loop {}/{} failed for container {}: {}".format(
                    ior_mode, index, len(uuids), cont_uuid, error))
        finally:
            # Remove the unique temporary directory and its contents to avoid conflicts
            shutil.rmtree(tmp_dir, ignore_errors=True)

    if errors:
        raise CommandFailure("IOR failed in {}/{} loops: {}".format(
            len(errors), len(uuids), "\n".join(errors)))
    return results
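
The loop collects failures instead of raising on the first one, so a single bad container does not hide the rest. The same collect-then-raise pattern in a generic, runnable form:

def run_all(tasks):
    """Run every task, then report all failures in a single exception."""
    results, errors = [], []
    for index, task in enumerate(tasks, 1):
        try:
            results.append(task())
        except Exception as error:  # broad on purpose: demo only
            errors.append("Task {}/{} failed: {}".format(index, len(tasks), error))
    if errors:
        raise RuntimeError("{}/{} tasks failed:\n{}".format(
            len(errors), len(tasks), "\n".join(errors)))
    return results

try:
    run_all([lambda: 1, lambda: 1 // 0, lambda: 3])
except RuntimeError as error:
    print(error)
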
Example #11
    def stop(self):
        """Stop dfuse.

        Try to stop dfuse. First try a clean unmount with fusermount; if that
        fails, fall back to pkill to see if that works. The outcome is judged
        on the fusermount result, because needing pkill means dfuse itself has
        not worked correctly.

        Finally, try to remove the mount point, which should succeed.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info("Stopping dfuse at %s on %s", self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse after the first unmount attempt fails
                if self.running_hosts and counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if self.running_hosts and counter > 0:
                    pcmd(self.running_hosts,
                         self.get_umount_command(counter > 1),
                         expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append("Error stopping dfuse on {}".format(
                    self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                error_list.append(error)

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
Example #12
    def update_config_file_from_file(self, dst_hosts, test_dir,
                                     generated_yaml):
        """Update config file and object.

        Create and place the new config file in /etc/daos/daos_server.yml
        Then update SCM-related data in engine_params so that those disks will
        be wiped.

        Args:
            dst_hosts (list): Destination server hostnames to place the new
                config file.
            test_dir (str): Directory where the server config data from
                generated_yaml will be written.
            generated_yaml (YAMLObject): New server config data.

        """
        # Create a temporary file in test_dir and write the generated config.
        temp_file_path = os.path.join(test_dir, "temp_server.yml")
        try:
            with open(temp_file_path, 'w') as write_file:
                yaml.dump(generated_yaml, write_file, default_flow_style=False)
        except Exception as error:
            raise CommandFailure("Error writing the yaml file! {}: {}".format(
                temp_file_path, error)) from error

        # Copy the config from temp dir to /etc/daos of the server node.
        default_server_config = get_default_config_file("server")
        try:
            distribute_files(dst_hosts,
                             temp_file_path,
                             default_server_config,
                             verbose=False,
                             sudo=True)
        except DaosTestError as error:
            raise CommandFailure(
                "ERROR: Copying yaml configuration file to {}: "
                "{}".format(dst_hosts, error)) from error

        # Before restarting daos_server, we need to clear SCM. Unmount the mount
        # point, wipefs the disks, etc. This clearing step is built into the
        # server start steps. It'll look at the engine_params of the
        # server_manager and clear the SCM set there, so we need to overwrite
        # it with the values from the generated config before starting.
        self.log.info("Resetting engine_params")
        self.manager.job.yaml.engine_params = []
        engines = generated_yaml["engines"]
        for i, engine in enumerate(engines):
            self.log.info("engine %d", i)
            for storage_tier in engine["storage"]:
                if storage_tier["class"] != "dcpm":
                    continue

                self.log.info("scm_mount = %s", storage_tier["scm_mount"])
                self.log.info("class = %s", storage_tier["class"])
                self.log.info("scm_list = %s", storage_tier["scm_list"])

                per_engine_yaml_parameters = (
                    DaosServerYamlParameters.PerEngineYamlParameters(i))
                per_engine_yaml_parameters.scm_mount.update(
                    storage_tier["scm_mount"])
                per_engine_yaml_parameters.scm_class.update(
                    storage_tier["class"])
                per_engine_yaml_parameters.scm_size.update(None)
                per_engine_yaml_parameters.scm_list.update(
                    storage_tier["scm_list"])
                per_engine_yaml_parameters.reset_yaml_data_updated()

                self.manager.job.yaml.engine_params.append(
                    per_engine_yaml_parameters)
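
Writing the generated config is ordinary PyYAML serialization plus error wrapping. A stand-alone sketch with a made-up config dict (requires the yaml package; the real code then pushes the file to /etc/daos on the servers):

import os
import tempfile
import yaml

# Made-up config data shaped roughly like the generated server yaml
generated_yaml = {
    "engines": [
        {"storage": [{"class": "dcpm",
                      "scm_mount": "/mnt/daos0",
                      "scm_list": ["/dev/pmem0"]}]},
    ],
}

test_dir = tempfile.mkdtemp()
temp_file_path = os.path.join(test_dir, "temp_server.yml")
try:
    with open(temp_file_path, "w") as write_file:
        yaml.dump(generated_yaml, write_file, default_flow_style=False)
except Exception as error:
    raise RuntimeError("Error writing the yaml file! {}: {}".format(
        temp_file_path, error)) from error

with open(temp_file_path) as read_file:
    print(read_file.read())
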
Example #13
    def create(self, uuid=None, con_in=None, acl_file=None):
        """Create a container.

        Args:
            uuid (str, optional): container uuid. Defaults to None.
            con_in (optional): to be defined. Defaults to None.
            acl_file (str, optional): path of the ACL file. Defaults to None.

        Raises:
            CommandFailure: if the daos container create output is missing the
                new container uuid
        """
        self.destroy()
        if not self.silent.value:
            self.log.info("Creating a container with pool handle %s",
                          self.pool.pool.handle.value)
        self.container = DaosContainer(self.pool.context)

        if self.control_method.value == self.USE_API:
            # Create a container with the API method
            kwargs = {"poh": self.pool.pool.handle}
            if uuid is not None:
                kwargs["con_uuid"] = uuid

            # Refer to daos_api for setting input params for DaosContainer.
            if con_in is not None:
                cop = self.input_params.get_con_create_params()
                cop.type = con_in[0]
                cop.enable_chksum = con_in[1]
                cop.srv_verify = con_in[2]
                cop.chksum_type = con_in[3]
                cop.chunk_size = con_in[4]
                kwargs["con_prop"] = cop

            self._call_method(self.container.create, kwargs)

        elif self.control_method.value == self.USE_DAOS and self.daos:
            # Disconnect the pool if connected
            self.pool.disconnect()

            # Create a container with the daos command
            kwargs = {
                "pool": self.pool.uuid,
                "sys_name": self.pool.name.value,
                "cont": uuid,
                "path": self.path.value,
                "cont_type": self.type.value,
                "oclass": self.oclass.value,
                "chunk_size": self.chunk_size.value,
                "properties": self.properties.value,
                "acl_file": acl_file,
            }

            self._log_method("daos.container_create", kwargs)
            try:
                uuid = self.daos.container_create(
                    **kwargs)["response"]["container_uuid"]
            except KeyError as error:
                raise CommandFailure(
                    "Error: Unexpected daos container create output"
                ) from error
            # Populate the empty DaosContainer object with the properties of the
            # container created with daos container create.
            self.container.uuid = str_to_c_uuid(uuid)
            self.container.attached = 1
            self.container.poh = self.pool.pool.handle

        elif self.control_method.value == self.USE_DAOS:
            self.log.error("Error: Undefined daos command")

        else:
            self.log.error("Error: Undefined control_method: %s",
                           self.control_method.value)

        self.uuid = self.container.get_uuid_str()
        if not self.silent.value:
            self.log.info("  Container created with uuid %s", self.uuid)
Example #14
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            dict: log output per host

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            command.append("--until=\"{}\"".format(until))
        self.log.info("Gathering log data on %s: %s", str(hosts),
                      " ".join(command))

        # Gather the log information per host
        task = run_task(hosts, " ".join(command), timeout)

        # Create a dictionary of hosts for each unique return code
        results = {code: hosts for code, hosts in task.iter_retcodes()}

        # Determine if the command completed successfully across all the hosts
        status = len(results) == 1 and 0 in results

        # Determine if any commands timed out
        timed_out = [str(hosts) for hosts in task.iter_keys_timeout()]
        if timed_out:
            status = False
        if not status:
            self.log.info("  Errors detected running \"%s\":", command)

        # List any hosts that timed out
        if timed_out:
            self.log.info("    %s: timeout detected after %s seconds",
                          str(NodeSet.fromlist(timed_out)), timeout)

        # Display/return the command output
        log_data = {}
        for code in sorted(results):
            # Get the command output from the hosts with this return code
            output_data = list(task.iter_buffers(results[code]))
            if not output_data:
                output_data = [["<NONE>", results[code]]]

            for output_buffer, output_hosts in output_data:
                node_set = NodeSet.fromlist(output_hosts)
                lines = str(output_buffer).splitlines()

                if status:
                    # Add the successful output from each node to the dictionary
                    log_data[node_set] = lines
                else:
                    # Display all of the results in the case of an error
                    if len(lines) > 1:
                        self.log.info("    %s: rc=%s, output:", node_set, code)
                        for line in lines:
                            self.log.info("      %s", line)
                    else:
                        self.log.info("    %s: rc=%s, output: %s", node_set,
                                      code, output_buffer)

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data
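
The {return_code: hosts} grouping built from task.iter_retcodes() is what drives the overall status check. A plain-data sketch of the same grouping and success test, with tuples standing in for the ClusterShell task results:

from collections import defaultdict

# (host, return_code) pairs standing in for ClusterShell's task results
per_host = [("node1", 0), ("node2", 0), ("node3", 1)]

grouped = defaultdict(list)
for host, code in per_host:
    grouped[code].append(host)
results = {code: ",".join(hosts) for code, hosts in grouped.items()}

# The command succeeded overall only if every host returned 0
status = len(results) == 1 and 0 in results
print(results, "status:", status)
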
Example #15
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            list: a list of dictionaries including:
                "hosts": <NodeSet() of hosts with this data>
                "data": <journalctl output>

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            command.append("--until=\"{}\"".format(until))
        self.log.info(
            "Gathering log data on %s: %s", str(hosts), " ".join(command))

        # Gather the log information per host
        results = run_pcmd(hosts, " ".join(command), False, timeout, None)

        # Determine if the command completed successfully without a timeout
        status = True
        for result in results:
            if result["interrupted"]:
                self.log.info("  Errors detected running \"%s\":", command)
                self.log.info(
                    "    %s: timeout detected after %s seconds",
                    str(result["hosts"]), timeout)
                status = False
            elif result["exit_status"] != 0:
                self.log.info("  Errors detected running \"%s\":", command)
                status = False
            if not status:
                break

        # Display/return the command output
        log_data = []
        for result in results:
            if result["exit_status"] == 0 and not result["interrupted"]:
                # Add the successful output from each node to the list
                log_data.append(
                    {"hosts": result["hosts"], "data": result["stdout"]})
            else:
                # Display all of the results in the case of an error
                if len(result["stdout"]) > 1:
                    self.log.info(
                        "    %s: rc=%s, output:",
                        str(result["hosts"]), result["exit_status"])
                    for line in result["stdout"]:
                        self.log.info("      %s", line)
                else:
                    self.log.info(
                        "    %s: rc=%s, output: %s",
                        str(result["hosts"]), result["exit_status"],
                        result["stdout"][0])

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data