def __init__(self, job):
    """Initialize a Systemctl job manager for the specified job.

    Args:
        job (SubProcessCommand): command object to manage.
    """
    super(Systemctl, self).__init__("/run/systemctl/*", "", job)
    self.job = job

    # systemctl command object pointed at the job's service name
    self._systemctl = SystemctlCommand()
    self._systemctl.service.value = self.job.service_name

    # One timestamp slot per tracked action; every slot starts out unset
    tracked_actions = (
        "enable", "disable", "start", "running", "verified", "stop",
        "restart",
    )
    self.timestamps = dict.fromkeys(tracked_actions)
class Systemctl(JobManager):
    """A class for the systemctl job manager command."""

    def __init__(self, job):
        """Create a Systemctl object.

        Args:
            job (SubProcessCommand): command object to manage.
        """
        super(Systemctl, self).__init__("/run/systemctl/*", "", job)
        self.job = job
        self._systemctl = SystemctlCommand()
        self._systemctl.service.value = self.job.service_name

        # Timestamp strings ("%Y-%m-%d %H:%M:%S") recorded when a systemctl
        # unit command is issued (see _run_unit_command) or when the service
        # start is detected (see check_logs). None until recorded.
        self.timestamps = {
            "enable": None,
            "disable": None,
            "start": None,
            "running": None,
            "verified": None,
            "stop": None,
            "restart": None,
        }

    @property
    def hosts(self):
        """Get the list of hosts associated with this command."""
        return list(self._hosts) if self._hosts else None

    def __str__(self):
        """Return the command with all of its defined parameters as a string.

        Returns:
            str: the command with all the defined parameters

        """
        return str(self._systemctl)

    def run(self):
        """Start the job's service via the systemctl command.

        Enable the service, start the service, and verify the start through
        check_subprocess_status(). If an error occurs with the enable or
        start commands also display the journalctl output for the service.

        Raises:
            CommandFailure: if unable to enable or start the service, or if
                the service start is not detected

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self.service_enable()
        result = self.service_start()

        # Determine if the command has launched correctly using its
        # check_subprocess_status() method.
        if not self.check_subprocess_status(None):
            msg = "Command '{}' did not launch correctly".format(self)
            self.log.error(msg)
            raise CommandFailure(msg)

        return result

    def stop(self):
        """Stop the job's service via the systemctl command.

        Stop the service and then disable the service. If an error occurs
        with either of these commands also display the journalctl output for
        the service.

        Raises:
            CommandFailure: if unable to stop or disable the service

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self.service_stop()
        return self.service_disable()

    def wait(self):
        """Wait for the sub process to complete.

        Raises:
            NotImplementedError: always; waiting is not supported here

        """
        raise NotImplementedError()

    def check_subprocess_status(self, sub_process):
        """Verify command status when called in a subprocess.

        The job's pattern is searched for in the service log output gathered
        since the last "start" timestamp.

        Args:
            sub_process (process.SubProcess): subprocess used to run the
                command. Not used.

        Returns:
            bool: whether or not the command progress has been detected

        """
        return self.check_logs(
            self.job.pattern, self.timestamps["start"], None,
            self.job.pattern_count, self.job.pattern_timeout.value)

    def assign_hosts(self, hosts, path=None, slots=None):
        """Assign the hosts to use with the command.

        Args:
            hosts (list): list of hosts on which to run the command
            path (str, optional): path to use when specifying the hosts
                through a hostfile. Defaults to None. Not used.
            slots (int, optional): number of slots per host to specify in the
                optional hostfile. Defaults to None. Not used.
        """
        self._hosts = NodeSet.fromlist(hosts)

    def assign_environment(self, env_vars, append=False):
        """Assign or add environment variables to the command.

        This implementation intentionally does nothing.

        Args:
            env_vars (EnvironmentVariables): the environment variables to use
                assign or add to the command
            append (bool): whether to assign (False) or append (True) the
                specified environment variables
        """

    def assign_environment_default(self, env_vars):
        """Assign the default environment variables for the command.

        This implementation intentionally does nothing.

        Args:
            env_vars (EnvironmentVariables): the environment variables to
                assign as the default
        """

    def get_subprocess_state(self, message=None):
        """Display the state of the subprocess.

        Args:
            message (str, optional): additional text to include in output.
                Defaults to None.

        Returns:
            list: a single-item list containing the remote process state, or
                None if no remote state was reported.

        """
        remote_state = self._get_remote_process_state(message)
        return [remote_state] if remote_state else None

    def _run_unit_command(self, command):
        """Run the systemctl command.

        The time at which the command is issued is stored in
        self.timestamps[command].

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if there is an issue running the command

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self._systemctl.unit_command.value = command
        self.timestamps[command] = datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        result = pcmd(self._hosts, str(self), self.verbose, self.timeout)

        # A 255 return code is reported as a timeout
        if 255 in result:
            raise CommandFailure(
                "Timeout detected running '{}' with a {}s timeout on {}".
                format(str(self), self.timeout,
                       NodeSet.fromlist(result[255])))

        # Any return code other than 0 on any host is a failure
        if 0 not in result or len(result) > 1:
            failed = []
            for return_code, nodes in result.items():
                if return_code != 0:
                    failed.extend(nodes)
            raise CommandFailure(
                "Error occurred running '{}' on {}".format(
                    str(self), NodeSet.fromlist(failed)))

        return result

    def _report_unit_command(self, command):
        """Run the systemctl command and report the log data on an error.

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if there is an issue running the command

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            return self._run_unit_command(command)
        except CommandFailure as error:
            self.log.info(error)
            # Displaying the journalctl output is best effort; a failure to
            # collect it must not replace the systemctl error being reported.
            try:
                self.display_log_data(
                    self.get_log_data(
                        self._hosts, self.timestamps[command]))
            except CommandFailure as log_error:
                self.log.info(log_error)
            raise CommandFailure(error)

    def service_enable(self):
        """Enable the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to enable

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("enable")

    def service_disable(self):
        """Disable the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to disable

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("disable")

    def service_start(self):
        """Start the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to start

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("start")

    def service_stop(self):
        """Stop the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to stop

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("stop")

    def service_status(self):
        """Get the status of the job's service via the systemctl command.

        Raises:
            CommandFailure: if unable to get the status

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        return self._report_unit_command("status")

    def service_running(self):
        """Determine if the job's service is active via the systemctl command.

        The 'systemctl is-active <service>' command will return a string
        indicating one of the following states:
            active, inactive, activating, deactivating, failed, unknown
        If the <service> is "active" or "activating" return True. Any host
        for which the command timed out is treated as not running.

        Returns:
            bool: True if the service is running, False otherwise

        """
        status = True
        states = {}
        valid_states = ["active", "activating"]
        self._systemctl.unit_command.value = "is-active"
        task = run_task(self._hosts, str(self), self.timeout)

        # Group the hosts by the state each one reported
        for output, nodelist in task.iter_buffers():
            output = str(output)
            nodeset = NodeSet.fromlist(nodelist)
            status &= output in valid_states
            if output not in states:
                states[output] = NodeSet()
            states[output].add(nodeset)

        # Timed out hosts report no state, so they must fail the check here
        if self.timeout and task.num_timeout() > 0:
            states["timeout"] = NodeSet.fromlist(task.iter_keys_timeout())
            status = False

        data = ["=".join([key, str(states[key])]) for key in sorted(states)]
        self.log.info(
            " Detected %s states: %s",
            self._systemctl.service.value, ", ".join(data))
        return status

    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively
            the strings "yesterday", "today", "tomorrow" are understood, which
            refer to 00:00:00 of the day before the current day, the current
            day, or the day after the current day, respectively. "now" refers
            to the current time. Finally, relative times may be specified,
            prefixed with "-" or "+", referring to times before or after the
            current time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Raises:
            CommandFailure: if the journalctl command fails or times out on
                any host

        Returns:
            dict: log output lines keyed by the NodeSet of hosts that
                produced that output

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        command = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            command.append("--until=\"{}\"".format(until))
        command_str = " ".join(command)
        self.log.info(
            "Gathering log data on %s: %s", str(hosts), command_str)

        # Gather the log information per host
        task = run_task(hosts, command_str, timeout)

        # Create a dictionary of hosts for each unique return code
        results = dict(task.iter_retcodes())

        # Determine if the command completed successfully across all the hosts
        status = len(results) == 1 and 0 in results

        # Determine if any commands timed out
        timed_out = [str(host) for host in task.iter_keys_timeout()]
        if timed_out:
            status = False
        if not status:
            self.log.info(" Errors detected running \"%s\":", command_str)

        # List any hosts that timed out
        if timed_out:
            self.log.info(
                " %s: timeout detected after %s seconds",
                str(NodeSet.fromlist(timed_out)), timeout)

        # Display/return the command output
        log_data = {}
        for code in sorted(results):
            # Get the command output from the hosts with this return code
            output_data = list(task.iter_buffers(results[code]))
            if not output_data:
                output_data = [["<NONE>", results[code]]]

            for output_buffer, output_hosts in output_data:
                node_set = NodeSet.fromlist(output_hosts)
                lines = str(output_buffer).splitlines()

                if status:
                    # Add the successful output from each node to the
                    # dictionary
                    log_data[node_set] = lines
                elif len(lines) > 1:
                    # Display all of the results in the case of an error
                    self.log.info(" %s: rc=%s, output:", node_set, code)
                    for line in lines:
                        self.log.info(" %s", line)
                else:
                    self.log.info(
                        " %s: rc=%s, output: %s",
                        node_set, code, output_buffer)

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data

    def display_log_data(self, log_data):
        """Display the journalctl log data.

        Args:
            log_data (dict): dictionary of journalctl log output.
        """
        self.log.info("Journalctl output:")
        for line in self.str_log_data(log_data).split("\n"):
            self.log.info(line)

    @staticmethod
    def str_log_data(log_data):
        """Get the journalctl log data as a string.

        Args:
            log_data (dict): dictionary of journalctl log output.

        Returns:
            str: the journalctl log data

        """
        data = []
        for node_set in sorted(log_data):
            data.append(" {}:".format(node_set))
            for line in log_data[node_set]:
                data.append(" {}".format(line))
        return "\n".join(data)

    def check_logs(self, pattern, since, until, quantity=1, timeout=60):
        """Check the command logs on each host for a specified string.

        Args:
            pattern (str): regular expression to search for in the logs
            since (str): search log entries from this date.
            until (str, optional): search log entries up to this date.
                Defaults to None, in which case it is not utilized.
            quantity (int, optional): number of pattern matches to expect.
                Matches are summed over every node set in the gathered log
                data and the sum must equal this value. Defaults to 1.
            timeout (int, optional): maximum number of seconds to wait to
                detect the specified pattern; also used as the timeout for
                each log gathering command. Defaults to 60.

        Raises:
            CommandFailure: if gathering the log data fails

        Returns:
            bool: whether or not the expected number of pattern matches was
                found in the logs

        """
        self.log.info(
            "Searching for '%s' in '%s' output on %s",
            pattern, self._systemctl, self._hosts)

        log_data = None
        detected = 0
        complete = False
        timed_out = False
        start = time.time()

        # Search for patterns in the subprocess output until:
        #   - the expected number of pattern matches are detected (success)
        #   - the time out is reached (failure)
        #   - the service is no longer running (failure)
        while not complete and not timed_out and self.service_running():
            detected = 0
            log_data = self.get_log_data(self._hosts, since, until, timeout)
            for node_set in sorted(log_data):
                matches = re.findall(pattern, "\n".join(log_data[node_set]))
                detected += len(matches)

            complete = detected == quantity
            timed_out = time.time() - start > timeout

            if complete:
                self.timestamps["running"] = datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")

        # Summarize results
        msg = "{}/{} '{}' messages detected in".format(
            detected, quantity, pattern)
        runtime = "{}/{} seconds".format(time.time() - start, timeout)

        if not complete:
            # Report the error / timeout
            reason = "ERROR detected"
            details = ""
            if timed_out:
                reason = "TIMEOUT detected, exceeded {} seconds".format(
                    timeout)
                runtime = "{} seconds".format(time.time() - start)
            if log_data:
                details = ":\n{}".format(self.str_log_data(log_data))
            self.log.info("%s - %s %s%s", reason, msg, runtime, details)
            if timed_out:
                self.log.debug(
                    "If needed the %s second timeout can be adjusted via "
                    "the 'pattern_timeout' test yaml parameter under %s",
                    timeout, self.namespace)
        else:
            # Report the successful start
            self.log.info(
                "%s subprocess startup detected - %s %s",
                self._command, msg, runtime)

        return complete

    def dump_logs(self, hosts=None):
        """Display the journalctl log data since detecting server start.

        The "running" timestamp is used when set, otherwise the "verified"
        timestamp. Nothing is displayed when neither is set.

        Args:
            hosts (list, optional): list of hosts from which to display the
                journalctl log data. Defaults to None which will log the
                journalctl log data from all of the hosts.
        """
        timestamp = self.timestamps["running"] or self.timestamps["verified"]
        if timestamp:
            if hosts is None:
                hosts = self._hosts
            self.display_log_data(self.get_log_data(hosts, timestamp))