Ejemplo n.º 1
0
    def __init__(self, job):
        """Create a Systemctl job manager for the specified job.

        Args:
            job (SubProcessCommand): command object to manage. Its
                service_name attribute selects the systemctl service.
        """
        super(Systemctl, self).__init__("/run/systemctl/*", "", job)
        self.job = job

        # Build the systemctl command that targets the job's service
        systemctl = SystemctlCommand()
        self._systemctl = systemctl
        systemctl.service.value = self.job.service_name

        # Timestamps recorded for service events; each is None until set
        events = (
            "enable", "disable", "start", "running", "verified", "stop",
            "restart",
        )
        self.timestamps = dict.fromkeys(events)
Ejemplo n.º 2
0
class Systemctl(JobManager):
    """A class for the systemctl job manager command."""
    def __init__(self, job):
        """Create a Systemctl job manager for the specified job.

        Args:
            job (SubProcessCommand): command object to manage. Its
                service_name attribute selects the systemctl service.
        """
        super(Systemctl, self).__init__("/run/systemctl/*", "", job)
        self.job = job

        # Build the systemctl command that targets the job's service
        systemctl = SystemctlCommand()
        self._systemctl = systemctl
        systemctl.service.value = self.job.service_name

        # Timestamps recorded for service events; each is None until set
        events = (
            "enable", "disable", "start", "running", "verified", "stop",
            "restart",
        )
        self.timestamps = dict.fromkeys(events)

    @property
    def hosts(self):
        """Get the list of hosts associated with this command."""
        return list(self._hosts) if self._hosts else None

    def __str__(self):
        """Return the command with all of its defined parameters as a string.

        Returns:
            str: the command with all the defined parameters

        """
        return self._systemctl.__str__()

    def run(self):
        """Start the job's service via the systemctl command.

        Enable the service, start the service, and report the status of the
        service.  If an error occurs with any of these commands also display
        the journalctl output for the service.

        Raises:
            CommandFailure: if unable to enable or start the service

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        # Start the daos_server.service
        self.service_enable()
        result = self.service_start()
        # result = self.service_status()

        # Determine if the command has launched correctly using its
        # check_subprocess_status() method.
        if not self.check_subprocess_status(None):
            msg = "Command '{}' did not launch correctly".format(self)
            self.log.error(msg)
            raise CommandFailure(msg)

        return result

    def stop(self):
        """Stop the job's service via the systemctl command.

        Stop the service, disable the service, and report the status of the
        service.  If an error occurs with any of these commands also display
        the journalctl output for the service.

        Raises:
            CommandFailure: if unable to stop or disable the service

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self.service_stop()
        return self.service_disable()

    def wait(self):
        """Wait for the sub process to complete."""
        raise NotImplementedError()

    def check_subprocess_status(self, sub_process):
        """Verify command status when called in a subprocess.

        Args:
            sub_process (process.SubProcess): subprocess used to run the command

        Returns:
            bool: whether or not the command progress has been detected

        """
        return self.check_logs(self.job.pattern, self.timestamps["start"],
                               None, self.job.pattern_count,
                               self.job.pattern_timeout.value)

    def assign_hosts(self, hosts, path=None, slots=None):
        """Assign the hosts on which the service is managed.

        The hosts are stored as a NodeSet built from the specified list.

        Args:
            hosts (list): list of hosts to use with the command
            path (str, optional): path to use when specifying the hosts through
                a hostfile. Defaults to None. Not used.
            slots (int, optional): number of slots per host to specify in the
                optional hostfile. Defaults to None. Not used.
        """
        node_set = NodeSet.fromlist(hosts)
        self._hosts = node_set

    def assign_environment(self, env_vars, append=False):
        """Assign or add environment variables to the command.

        Args:
            env_vars (EnvironmentVariables): the environment variables to use
                assign or add to the command
            append (bool): whether to assign (False) or append (True) the
                specified environment variables
        """
        pass

    def assign_environment_default(self, env_vars):
        """Assign the default environment variables for the command.

        Args:
            env_vars (EnvironmentVariables): the environment variables to
                assign as the default
        """
        pass

    def get_subprocess_state(self, message=None):
        """Display the state of the subprocess.

        Args:
            message (str, optional): additional text to include in output.
                Defaults to None.

        Returns:
            list: a list of states for the process found. Any active remote
                processes will be indicated by a 'R' state at the end of the
                list.

        """
        state = None
        remote_state = self._get_remote_process_state(message)
        if remote_state:
            state = [remote_state]
        return state

    def _run_unit_command(self, command):
        """Run the systemctl unit command on the assigned hosts.

        The time at which the command is issued is stored in
        self.timestamps under the unit command name.

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if the command times out or yields a non-zero
                return code on any host

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self._systemctl.unit_command.value = command
        issued = datetime.now()
        self.timestamps[command] = issued.strftime("%Y-%m-%d %H:%M:%S")
        result = pcmd(self._hosts, self.__str__(), self.verbose, self.timeout)

        # A 255 return code key is reported as a timeout
        if 255 in result:
            timed_out_hosts = NodeSet.fromlist(result[255])
            raise CommandFailure(
                "Timeout detected running '{}' with a {}s timeout on {}".
                format(self.__str__(), self.timeout, timed_out_hosts))

        # Success requires a zero return code as the only return code
        all_passed = 0 in result and len(result) <= 1
        if not all_passed:
            failed = [
                host
                for code, code_hosts in result.items() if code != 0
                for host in code_hosts
            ]
            raise CommandFailure("Error occurred running '{}' on {}".format(
                self.__str__(), NodeSet.fromlist(failed)))

        return result

    def _report_unit_command(self, command):
        """Run the systemctl command and report the log data on an error.

        If the unit command fails, the journalctl output generated since the
        command was issued is displayed before the failure is re-raised.  A
        failure to gather that journalctl output is logged but does not
        replace the original unit command failure.

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if there is an issue running the command

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            return self._run_unit_command(command)
        except CommandFailure as error:
            self.log.info(error)
            try:
                self.display_log_data(
                    self.get_log_data(self._hosts, self.timestamps[command]))
            except CommandFailure as log_error:
                # get_log_data() raises when journalctl fails; do not let
                # that hide the unit command failure reported to the caller.
                self.log.info(log_error)
            raise CommandFailure(error)

    def service_enable(self):
        """Enable the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to enable

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("enable")

    def service_disable(self):
        """Disable the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to disable

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("disable")

    def service_start(self):
        """Start the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to start

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("start")

    def service_stop(self):
        """Stop the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to stop

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("stop")

    def service_status(self):
        """Get the status of the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to get the status

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("status")

    def service_running(self):
        """Determine if the job's service is active via the systemctl command.

        The 'systemctl is-active <service>' command will return a string
        indicating one of the following states:
            active, inactive, activating, deactivating, failed, unknown
        If the <service> is "active" or "activating" on every host that
        responded, and no host timed out, return True.

        The detected states (and any timed out hosts) are logged.

        Returns:
            bool: True if the service is running, False otherwise

        """
        status = True
        states = {}
        valid_states = ["active", "activating"]
        self._systemctl.unit_command.value = "is-active"
        task = run_task(self._hosts, self.__str__(), self.timeout)

        # Group the hosts by the state reported in their command output
        for output, nodelist in task.iter_buffers():
            output = str(output)
            nodeset = NodeSet.fromlist(nodelist)
            status &= output in valid_states
            if output not in states:
                states[output] = NodeSet()
            states[output].add(nodeset)

        if self.timeout and task.num_timeout() > 0:
            states["timeout"] = NodeSet.fromlist(task.iter_keys_timeout())
            # A host that did not answer cannot be reported as running
            status = False

        data = ["=".join([key, str(states[key])]) for key in sorted(states)]
        self.log.info("  Detected %s states: %s",
                      self._systemctl.service.value, ", ".join(data))
        return status

    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Raises:
            CommandFailure: if the journalctl command timed out or did not
                yield a zero return code on every host. The per-host output
                is logged before the exception is raised.

        Returns:
            dict: log output lines (list of str) keyed by the NodeSet of hosts
                that produced them

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        journalctl = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            journalctl.append("--until=\"{}\"".format(until))
        command = " ".join(journalctl)
        self.log.info("Gathering log data on %s: %s", str(hosts), command)

        # Gather the log information per host
        task = run_task(hosts, command, timeout)

        # Create a dictionary of hosts for each unique return code.  Distinct
        # loop names are used so the 'hosts' argument is never rebound.
        results = {code: nodes for code, nodes in task.iter_retcodes()}

        # Determine if the command completed successfully across all the hosts
        status = len(results) == 1 and 0 in results

        # Determine if any commands timed out
        timed_out = [str(node) for node in task.iter_keys_timeout()]
        if timed_out:
            status = False
        if not status:
            self.log.info("  Errors detected running \"%s\":", command)

        # List any hosts that timed out
        if timed_out:
            self.log.info("    %s: timeout detected after %s seconds",
                          str(NodeSet.fromlist(timed_out)), timeout)

        # Display/return the command output
        log_data = {}
        for code in sorted(results):
            # Get the command output from the hosts with this return code
            output_data = list(task.iter_buffers(results[code]))
            if not output_data:
                # No output was captured for the hosts with this return code
                output_data = [["<NONE>", results[code]]]

            for output_buffer, output_hosts in output_data:
                node_set = NodeSet.fromlist(output_hosts)
                lines = str(output_buffer).splitlines()

                if status:
                    # Add the successful output from each node to the dictionary
                    log_data[node_set] = lines
                elif len(lines) > 1:
                    # Display all of the results in the case of an error
                    self.log.info("    %s: rc=%s, output:", node_set, code)
                    for line in lines:
                        self.log.info("      %s", line)
                else:
                    self.log.info("    %s: rc=%s, output: %s", node_set,
                                  code, output_buffer)

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data

    def display_log_data(self, log_data):
        """Display the journalctl log data.

        Args:
            log_data (dict): dictionary of journalctl log output.
        """
        self.log.info("Journalctl output:")
        for line in self.str_log_data(log_data).split("\n"):
            self.log.info(line)

    @staticmethod
    def str_log_data(log_data):
        """Get the journalctl log data as a string.

        Args:
            log_data (dict): dictionary of journalctl log output.

        Returns:
            str: the journalctl log data

        """
        data = []
        for node_set in sorted(log_data):
            data.append("  {}:".format(node_set))
            for line in log_data[node_set]:
                data.append("    {}".format(line))
        return "\n".join(data)

    def check_logs(self, pattern, since, until, quantity=1, timeout=60):
        """Check the command logs on each host for a specified string.

        Args:
            pattern (str): regular expression to search for in the logs
            since (str): search log entries from this date.
            until (str, optional): search log entries up to this date. Defaults
                to None, in which case it is not utilized.
            quantity (int, optional): number of times to expect the search
                pattern per host. Defaults to 1.
            timeout (int, optional): maximum number of seconds to wait to detect
                the specified pattern. Defaults to 60.

        Returns:
            bool: whether or not the search string was found in the logs on each
                host

        """
        self.log.info("Searching for '%s' in '%s' output on %s", pattern,
                      self._systemctl, self._hosts)

        log_data = None
        detected = 0
        complete = False
        timed_out = False
        start = time.time()

        # Search for patterns in the subprocess output until:
        #   - the expected number of pattern matches are detected (success)
        #   - the time out is reached (failure)
        #   - the service is no longer running (failure)
        while not complete and not timed_out and self.service_running():
            detected = 0
            log_data = self.get_log_data(self._hosts, since, until, timeout)
            for node_set in sorted(log_data):
                match = re.findall(pattern, "\n".join(log_data[node_set]))
                detected += len(match) if match else 0

            complete = detected == quantity
            timed_out = time.time() - start > timeout

            if complete:
                self.timestamps["running"] = datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")

        # Summarize results
        msg = "{}/{} '{}' messages detected in".format(detected, quantity,
                                                       pattern)
        runtime = "{}/{} seconds".format(time.time() - start, timeout)

        if not complete:
            # Report the error / timeout
            reason = "ERROR detected"
            details = ""
            if timed_out:
                reason = "TIMEOUT detected, exceeded {} seconds".format(
                    timeout)
                runtime = "{} seconds".format(time.time() - start)
            if log_data:
                details = ":\n{}".format(self.str_log_data(log_data))
            self.log.info("%s - %s %s%s", reason, msg, runtime, details)
            if timed_out:
                self.log.debug(
                    "If needed the %s second timeout can be adjusted via "
                    "the 'pattern_timeout' test yaml parameter under %s",
                    timeout, self.namespace)
        else:
            # Report the successful start
            # self.display_log_data(log_data)
            self.log.info("%s subprocess startup detected - %s %s",
                          self._command, msg, runtime)

        return complete

    def dump_logs(self, hosts=None):
        """Display the journalctl log data since detecting server start.

        Args:
            hosts (list, optional): list of hosts from which to display the
                journalctl log data. Defaults to None which will log the
                journalctl log data from all of the hosts.
        """
        timestamp = None
        if self.timestamps["running"]:
            timestamp = self.timestamps["running"]
        elif self.timestamps["verified"]:
            timestamp = self.timestamps["verified"]
        if timestamp:
            if hosts is None:
                hosts = self._hosts
            self.display_log_data(self.get_log_data(hosts, timestamp))