def _run_process(self):
    """Run the command as a foreground process.

    Raises:
        CommandFailure: if there is an error running the command

    """
    command = str(self)
    try:
        # Block until the command is complete or times out
        return run_command(
            command, self.timeout, self.verbose, self.exit_status_exception,
            "combined", env=self.env)

    except DaosTestError as error:
        # Command failed or possibly timed out
        raise CommandFailure(error) from error
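# Usage sketch (hypothetical caller): _run_process() is the internal engine
# behind the public run() method, so test code normally calls run() and
# handles CommandFailure. The names below are illustrative, not from this
# file:
#
#     try:
#         result = cmd.run()           # blocks until completion or timeout
#         log.info("rc=%s", result.exit_status)
#     except CommandFailure as error:
#         log.error("Command failed or timed out: %s", error)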
def parse_output(self, stdout, regex_method):
    """Parse output using findall() with supplied 'regex_method' as pattern.

    Args:
        stdout (str): output to parse
        regex_method (str): name of the method regex to use

    Raises:
        CommandFailure: if there is an error finding the method's regex
            pattern.

    Returns:
        list: a list of strings obtained from the method's output parsed
            through its regex

    """
    if regex_method not in self.METHOD_REGEX:
        raise CommandFailure(
            "No pattern regex defined for '{}()'".format(regex_method))
    return re.findall(self.METHOD_REGEX[regex_method], stdout)
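# Example sketch for parse_output(). The METHOD_REGEX key and pattern below
# are hypothetical; each command class defines its own class-level mapping
# of method names to regex patterns:
#
#     class ExampleCommand(ExecutableCommand):
#         METHOD_REGEX = {
#             "create": r"([0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})",
#         }
#
#     uuids = command.parse_output(stdout, "create")  # list of UUID strings
#     command.parse_output(stdout, "destroy")         # raises CommandFailure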
def _report_unit_command(self, command):
    """Run the systemctl command and report the log data on an error.

    Args:
        command (str): systemctl unit command

    Raises:
        CommandFailure: if there is an issue running the command

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded each return code.

    """
    try:
        return self._run_unit_command(command)

    except CommandFailure as error:
        self.log.info(error)
        self.display_log_data(
            self.get_log_data(self._hosts, self.timestamps[command]))
        raise CommandFailure(error) from error
def __init__(self, command, manager="Orterun"):
    """Create a SubprocessManager object.

    Args:
        command (YamlCommand): command to manage as a subprocess
        manager (str, optional): the name of the JobManager class used to
            manage the YamlCommand defined through the "job" attribute.
            Defaults to "Orterun".

    Raises:
        CommandFailure: if the specified job manager class is not valid

    """
    self.log = getLogger(__name__)

    # Define the JobManager class used to manage the command as a subprocess
    try:
        manager_module = import_module("job_manager_utils")
        manager_class = getattr(manager_module, manager)
    except (ImportError, AttributeError) as error:
        raise CommandFailure(
            "Invalid '{}' job manager class: {}".format(
                manager, error)) from error
    self.manager = manager_class(command, subprocess=True)

    # Define the list of hosts that will execute the daos command
    self._hosts = []
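# Construction sketch: the manager name is resolved dynamically from the
# job_manager_utils module, so any JobManager class defined there can be
# selected by string. Values below are illustrative:
#
#     manager = SubprocessManager(daos_server_cmd, manager="Orterun")
#     manager = SubprocessManager(daos_server_cmd, manager="NoSuchClass")
#     # -> CommandFailure: Invalid 'NoSuchClass' job manager class: ...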
def start(self):
    """Start the daos command.

    Raises:
        CommandFailure: if the daos command fails to start

    """
    # Create the yaml file for the daos command
    self.manager.job.temporary_file_hosts = self._hosts
    self.manager.job.create_yaml_file()

    # Start the daos command
    try:
        self.manager.run()
    except CommandFailure as error:
        # Kill the subprocess, anything that might have started
        self.manager.kill()
        raise CommandFailure(
            "Failed to start {}.".format(str(self.manager.job))) from error
    finally:
        # Define the expected states for each rank
        self._expected_states = self.get_current_state()
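# Usage sketch (hypothetical test flow): hosts must be assigned before
# start() so the generated yaml file and the expected rank states cover
# every host. Because get_current_state() runs in the finally block, the
# expected states are captured even when manager.run() fails:
#
#     manager._hosts = ["host1", "host2"]   # normally set via a setter
#     manager.start()                       # raises CommandFailure on error
#     states = manager._expected_states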
def __init__(self, job, subprocess=False):
    """Create an Orterun object.

    Args:
        job (ExecutableCommand): command object to manage.
        subprocess (bool, optional): whether the command is run as a
            subprocess. Defaults to False.

    Raises:
        CommandFailure: if the openmpi MPI stack cannot be loaded

    """
    if not load_mpi("openmpi"):
        raise CommandFailure("Failed to load openmpi")

    path = os.path.dirname(find_executable("orterun"))
    super(Orterun, self).__init__(
        "/run/orterun/*", "orterun", job, path, subprocess)

    # Default mca values to avoid queue pair errors
    mca_default = {
        "btl_openib_warn_default_gid_prefix": "0",
        "btl": "tcp,self",
        "oob": "tcp",
        "pml": "ob1",
        "btl_tcp_if_include": "eth0",
    }

    self.hostfile = FormattedParameter("--hostfile {}", None)
    self.processes = FormattedParameter("--np {}", 1)
    self.display_map = FormattedParameter("--display-map", False)
    self.map_by = FormattedParameter("--map-by {}", "node")
    self.export = FormattedParameter("-x {}", None)
    self.enable_recovery = FormattedParameter("--enable-recovery", True)
    self.report_uri = FormattedParameter("--report-uri {}", None)
    self.allow_run_as_root = FormattedParameter("--allow-run-as-root", None)
    self.mca = FormattedParameter("--mca {}", mca_default)
    self.pprnode = FormattedParameter("--map-by ppr:{}:node", None)
    self.tag_output = FormattedParameter("--tag-output", True)
    self.ompi_server = FormattedParameter("--ompi-server {}", None)
    self.working_dir = FormattedParameter("-wdir {}", None)
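# Command-line sketch: each FormattedParameter only renders its flag when it
# has a value, so with the defaults above an orterun command line would look
# roughly like the following (the exact rendering, in particular how the
# dict-valued mca parameter expands, depends on FormattedParameter):
#
#     orterun --np 1 --map-by node --enable-recovery --tag-output \
#         --mca btl tcp,self --mca oob tcp --mca pml ob1 ... <job command>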
def stop(self):
    """Stop the subprocess command.

    Raises:
        CommandFailure: if unable to stop

    """
    if self._process is not None:
        # Send a SIGTERM to stop the subprocess and, if it is still
        # running after 5 seconds, send a SIGKILL and report an error
        signal_list = [signal.SIGTERM, signal.SIGKILL]

        # Turn off verbosity to keep the logs clean as the server stops
        self._process.verbose = False

        # Send signals while the process is still running
        state = None
        while self._process.poll() is None and signal_list:
            signal_to_send = signal_list.pop(0)
            msg = "before sending signal {}".format(signal_to_send)
            state = self.get_subprocess_state(msg)
            self.log.info(
                "Sending signal %s to %s (state=%s)",
                str(signal_to_send), self._command, str(state))
            self._process.send_signal(signal_to_send)
            if signal_list:
                time.sleep(5)

        if not signal_list:
            if state and (len(state) > 1 or state[0] not in ("D", "Z")):
                # Indicate an error if the process required a SIGKILL and
                # either multiple processes were still found running or the
                # parent process was in any state except uninterruptible
                # sleep (D) or zombie (Z).
                raise CommandFailure("Error stopping '{}'".format(self))

        self.log.info("%s stopped successfully", self.command)
        self._process = None
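# Shutdown sketch: the escalation order is SIGTERM, a 5 second grace period,
# then SIGKILL. A failure is only reported when the SIGKILL was actually
# needed and the process list still looks unhealthy afterwards:
#
#     server.stop()   # no-op when self._process is None
#     # A parent process left in state "D" (uninterruptible sleep) or "Z"
#     # (zombie) after SIGKILL is tolerated; anything else, or multiple
#     # surviving processes, raises CommandFailure.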
def _execute_command(self, command, fail_on_err=True, display_output=True,
                     hosts=None):
    """Execute the command on all client hosts.

    Optionally verify if the command returns a non-zero return code.

    Args:
        command (str): the command to execute on the client hosts
        fail_on_err (bool, optional): whether or not to fail the test if the
            command returns a non-zero return code. Defaults to True.
        display_output (bool, optional): whether or not to display output.
            Defaults to True.
        hosts (list, optional): hosts on which to run the command. Defaults
            to None, in which case self.hostlist_clients is used.

    Raises:
        CommandFailure: if 'fail_on_err' is set and the command fails on at
            least one of the client hosts

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded each return code.

    """
    if hosts is None:
        hosts = self.hostlist_clients
    result = pcmd(hosts, command, verbose=display_output, timeout=300)
    if 0 not in result and fail_on_err:
        hosts = [
            str(nodes) for code, nodes in list(result.items()) if code != 0
        ]
        raise CommandFailure(
            "Error running '{}' on the following hosts: {}".format(
                command, NodeSet(",".join(hosts))))
    return result
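# Result-handling sketch: pcmd() returns a {return_code: NodeSet} mapping,
# and CommandFailure is raised only when the key 0 is entirely absent, i.e.
# when no host succeeded (and fail_on_err is True). Hypothetical values:
#
#     result = self._execute_command("mkdir -p /tmp/daos_test")
#     # {0: NodeSet("client[1-8]")}                -> success
#     # {0: NodeSet("client[1-4]"), 1: ...}       -> returned, no exception
#     # {1: NodeSet("client[1-8]")}                -> CommandFailure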
def run(self):
    """Run the daos_racer command remotely.

    Raises:
        CommandFailure: if there is an error running the command

    """
    # Run daos_racer on the specified host
    self.log.info(
        "Running %s on %s with %s timeout", str(self), self.host,
        "no" if self.clush_timeout.value is None
        else "a {}s".format(self.clush_timeout.value))
    return_codes = pcmd([self.host], str(self), True, self.clush_timeout.value)
    if 0 not in return_codes or len(return_codes) > 1:
        # Kill the daos_racer process if the remote command timed out
        if 255 in return_codes:
            self.log.info(
                "Stopping timed out daos_racer process on %s", self.host)
            pcmd([self.host], "pkill daos_racer", True)
        raise CommandFailure("Error running '{}'".format(self._command))
    self.log.info("Test passed!")
def run_ior_loop(manager, uuids, tmpdir_base):
    """Run IOR once for each container UUID provided.

    Args:
        manager (str): mpi job manager command
        uuids (list): list of container UUIDs
        tmpdir_base (str): base directory for the mpi orte_tmpdir_base mca
            parameter

    Raises:
        CommandFailure: if any of the IOR runs fail

    Returns:
        list: a list of CmdResults from each ior command run

    """
    results = []
    errors = []
    for index, cont_uuid in enumerate(uuids):
        manager.job.dfs_cont.update(cont_uuid, "ior.cont_uuid")

        # Create a unique temporary directory for the manager command
        tmp_dir = mkdtemp(dir=tmpdir_base)
        manager.tmpdir_base.update(tmp_dir, "tmpdir_base")

        try:
            results.append(manager.run())
        except CommandFailure as error:
            ior_mode = "read" if "-r" in manager.job.flags.value else "write"
            errors.append(
                "IOR {} Loop {}/{} failed for container {}: {}".format(
                    ior_mode, index, len(uuids), cont_uuid, error))
        finally:
            # Remove the unique temporary directory and its contents to
            # avoid conflicts
            shutil.rmtree(tmp_dir, ignore_errors=True)

    if errors:
        raise CommandFailure(
            "IOR failed in {}/{} loops: {}".format(
                len(errors), len(uuids), "\n".join(errors)))
    return results
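# Usage sketch (hypothetical objects and parameter names): a typical test
# runs a write pass and then a read pass over the same set of containers by
# flipping the ior flags between loops:
#
#     manager.job.flags.update("-w", "ior.flags")      # write pass
#     run_ior_loop(manager, cont_uuids, test_tmp_dir)
#     manager.job.flags.update("-r", "ior.flags")      # read pass
#     results = run_ior_loop(manager, cont_uuids, test_tmp_dir)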
def stop(self):
    """Stop dfuse.

    Try to stop dfuse. Try once nicely by using fusermount, then if that
    fails try to pkill it to see if that works. Abort based on the result
    of the fusermount, as if pkill is necessary then dfuse itself has not
    worked correctly.

    Finally, try to remove the mount point, which itself should succeed.

    Raises:
        CommandFailure: if dfuse fails to stop

    """
    # Include all hosts when stopping to ensure all mount points in any
    # state are properly removed
    self.running_hosts.add(NodeSet.fromlist(self.hosts))

    self.log.info(
        "Stopping dfuse at %s on %s", self.mount_dir.value,
        self.running_hosts)

    if self.mount_dir.value and self.running_hosts:
        error_list = []

        # Loop until all fuseblk mounted devices are unmounted
        counter = 0
        while self.running_hosts and counter < 3:
            # Attempt to kill dfuse after the first unmount attempt fails
            if self.running_hosts and counter > 1:
                kill_command = "pkill dfuse --signal KILL"
                pcmd(self.running_hosts, kill_command, timeout=30)

            # Attempt to unmount any fuseblk mounted devices after detection
            if self.running_hosts and counter > 0:
                pcmd(
                    self.running_hosts,
                    self.get_umount_command(counter > 1), expect_rc=None)
                time.sleep(2)

            # Detect which hosts have fuseblk mounted devices and remove any
            # hosts which no longer have the dfuse mount point mounted
            state = self.check_mount_state(self.running_hosts)
            for host in state["unmounted"].union(state["nodirectory"]):
                self.running_hosts.remove(host)

            # Increment the loop counter
            counter += 1

        if self.running_hosts:
            error_list.append(
                "Error stopping dfuse on {}".format(self.running_hosts))

        # Remove mount points
        try:
            self.remove_mount_point()
        except CommandFailure as error:
            error_list.append(error)

        # Report any errors
        if error_list:
            raise CommandFailure("\n".join(error_list))

    elif self.mount_dir.value is None:
        self.log.info("No dfuse mount directory defined - nothing to stop")

    else:
        self.log.info("No hosts running dfuse - nothing to stop")
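# Teardown sketch: the loop above makes up to three passes over
# running_hosts, escalating each time:
#
#     pass 0: check mount state only
#     pass 1: unmount via get_umount_command(False), then recheck
#     pass 2: pkill dfuse with SIGKILL, forced unmount via
#             get_umount_command(True), then recheck
#
# Hosts drop out of the loop as soon as their mount point is gone; anything
# still mounted after pass 2 is reported through CommandFailure.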
def update_config_file_from_file(self, dst_hosts, test_dir, generated_yaml):
    """Update the config file and object.

    Create and place the new config file in /etc/daos/daos_server.yml, then
    update the SCM-related data in engine_params so that those disks will
    be wiped.

    Args:
        dst_hosts (list): destination server hostnames on which to place
            the new config file.
        test_dir (str): directory where the server config data from
            generated_yaml will be written.
        generated_yaml (YAMLObject): new server config data.

    Raises:
        CommandFailure: if there is an error writing or distributing the
            yaml configuration file

    """
    # Create a temporary file in test_dir and write the generated config.
    temp_file_path = os.path.join(test_dir, "temp_server.yml")
    try:
        with open(temp_file_path, 'w') as write_file:
            yaml.dump(generated_yaml, write_file, default_flow_style=False)
    except Exception as error:
        raise CommandFailure(
            "Error writing the yaml file! {}: {}".format(
                temp_file_path, error)) from error

    # Copy the config from the temp dir to /etc/daos on the server nodes.
    default_server_config = get_default_config_file("server")
    try:
        distribute_files(
            dst_hosts, temp_file_path, default_server_config, verbose=False,
            sudo=True)
    except DaosTestError as error:
        raise CommandFailure(
            "ERROR: Copying yaml configuration file to {}: {}".format(
                dst_hosts, error)) from error

    # Before restarting daos_server we need to clear the SCM: unmount the
    # mount point, wipefs the disks, etc. This clearing step is built into
    # the server start steps. It looks at the engine_params of the
    # server_manager and clears the SCM set there, so we need to overwrite
    # it with the values from the generated config before starting.
    self.log.info("Resetting engine_params")
    self.manager.job.yaml.engine_params = []
    engines = generated_yaml["engines"]
    for i, engine in enumerate(engines):
        self.log.info("engine %d", i)
        for storage_tier in engine["storage"]:
            if storage_tier["class"] != "dcpm":
                continue
            self.log.info("scm_mount = %s", storage_tier["scm_mount"])
            self.log.info("class = %s", storage_tier["class"])
            self.log.info("scm_list = %s", storage_tier["scm_list"])

            per_engine_yaml_parameters = \
                DaosServerYamlParameters.PerEngineYamlParameters(i)
            per_engine_yaml_parameters.scm_mount.update(
                storage_tier["scm_mount"])
            per_engine_yaml_parameters.scm_class.update(
                storage_tier["class"])
            per_engine_yaml_parameters.scm_size.update(None)
            per_engine_yaml_parameters.scm_list.update(
                storage_tier["scm_list"])
            per_engine_yaml_parameters.reset_yaml_data_updated()

            self.manager.job.yaml.engine_params.append(
                per_engine_yaml_parameters)
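# Flow sketch (hypothetical values): after distributing the generated config
# the in-memory engine_params mirror only the dcpm storage tiers, so the
# server start logic wipes exactly the SCM devices named in the new config:
#
#     self.update_config_file_from_file(
#         ["server1", "server2"], self.test_dir, generated_yaml)
#     # /etc/daos/daos_server.yml now matches generated_yaml, and
#     # self.manager.job.yaml.engine_params holds one entry per dcpm tier
#     # of each engine.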
def create(self, uuid=None, con_in=None, acl_file=None):
    """Create a container.

    Args:
        uuid (str, optional): container uuid. Defaults to None.
        con_in (optional): to be defined. Defaults to None.
        acl_file (str, optional): path of the ACL file. Defaults to None.

    Raises:
        CommandFailure: if the daos container create output is unexpected

    """
    self.destroy()
    if not self.silent.value:
        self.log.info(
            "Creating a container with pool handle %s",
            self.pool.pool.handle.value)
    self.container = DaosContainer(self.pool.context)

    if self.control_method.value == self.USE_API:
        # Create a container with the API method
        kwargs = {"poh": self.pool.pool.handle}
        if uuid is not None:
            kwargs["con_uuid"] = uuid

        # Refer to daos_api for setting input params for DaosContainer.
        if con_in is not None:
            cop = self.input_params.get_con_create_params()
            cop.type = con_in[0]
            cop.enable_chksum = con_in[1]
            cop.srv_verify = con_in[2]
            cop.chksum_type = con_in[3]
            cop.chunk_size = con_in[4]
            kwargs["con_prop"] = cop

        self._call_method(self.container.create, kwargs)

    elif self.control_method.value == self.USE_DAOS and self.daos:
        # Disconnect the pool if connected
        self.pool.disconnect()

        # Create a container with the daos command
        kwargs = {
            "pool": self.pool.uuid,
            "sys_name": self.pool.name.value,
            "cont": uuid,
            "path": self.path.value,
            "cont_type": self.type.value,
            "oclass": self.oclass.value,
            "chunk_size": self.chunk_size.value,
            "properties": self.properties.value,
            "acl_file": acl_file,
        }
        self._log_method("daos.container_create", kwargs)
        try:
            uuid = self.daos.container_create(
                **kwargs)["response"]["container_uuid"]
        except KeyError as error:
            raise CommandFailure(
                "Error: Unexpected daos container create output") from error

        # Populate the empty DaosContainer object with the properties of
        # the container created with daos container create.
        self.container.uuid = str_to_c_uuid(uuid)
        self.container.attached = 1
        self.container.poh = self.pool.pool.handle

    elif self.control_method.value == self.USE_DAOS:
        self.log.error("Error: Undefined daos command")

    else:
        self.log.error(
            "Error: Undefined control_method: %s",
            self.control_method.value)

    self.uuid = self.container.get_uuid_str()
    if not self.silent.value:
        self.log.info("  Container created with uuid %s", self.uuid)
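# Usage sketch (hypothetical test objects and con_in values): the
# control_method parameter selects which branch above is taken:
#
#     container.control_method.update(container.USE_DAOS)
#     container.create()                  # runs: daos container create ...
#
#     container.control_method.update(container.USE_API)
#     container.create(con_in=("POSIX", 1, 1, 4, 16384))
#     # con_in tuple maps positionally to type, enable_chksum, srv_verify,
#     # chksum_type, and chunk_size on the container create parameters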
def get_log_data(self, hosts, since, until=None, timeout=60):
    """Gather log output for the command running on each host.

    Note (from the journalctl man page):
        Date specifications should be of the format "2012-10-30 18:17:16".
        If the time part is omitted, "00:00:00" is assumed. If only the
        seconds component is omitted, ":00" is assumed. If the date
        component is omitted, the current day is assumed. Alternatively the
        strings "yesterday", "today", "tomorrow" are understood, which
        refer to 00:00:00 of the day before the current day, the current
        day, or the day after the current day, respectively. "now" refers
        to the current time. Finally, relative times may be specified,
        prefixed with "-" or "+", referring to times before or after the
        current time, respectively.

    Args:
        hosts (list): list of hosts from which to gather log data.
        since (str): show log entries from this date.
        until (str, optional): show log entries up to this date. Defaults
            to None, in which case it is not utilized.
        timeout (int, optional): timeout for issuing the command. Defaults
            to 60 seconds.

    Raises:
        CommandFailure: if there is an error gathering the log data

    Returns:
        dict: log output per host

    """
    # Setup the journalctl command to capture all unit activity from the
    # specified start date to now or a specified end date
    #   --output=json?
    command = [
        "sudo",
        "journalctl",
        "--unit={}".format(self._systemctl.service.value),
        "--since=\"{}\"".format(since),
    ]
    if until:
        command.append("--until=\"{}\"".format(until))
    self.log.info(
        "Gathering log data on %s: %s", str(hosts), " ".join(command))

    # Gather the log information per host
    task = run_task(hosts, " ".join(command), timeout)

    # Create a dictionary of hosts for each unique return code
    results = {code: hosts for code, hosts in task.iter_retcodes()}

    # Determine if the command completed successfully across all the hosts
    status = len(results) == 1 and 0 in results

    # Determine if any commands timed out
    timed_out = [str(hosts) for hosts in task.iter_keys_timeout()]
    if timed_out:
        status = False
    if not status:
        self.log.info("  Errors detected running \"%s\":", command)

    # List any hosts that timed out
    if timed_out:
        self.log.info(
            "    %s: timeout detected after %s seconds",
            str(NodeSet.fromlist(timed_out)), timeout)

    # Display/return the command output
    log_data = {}
    for code in sorted(results):
        # Get the command output from the hosts with this return code
        output_data = list(task.iter_buffers(results[code]))
        if not output_data:
            output_data = [["<NONE>", results[code]]]
        for output_buffer, output_hosts in output_data:
            node_set = NodeSet.fromlist(output_hosts)
            lines = str(output_buffer).splitlines()
            if status:
                # Add the successful output from each node to the dictionary
                log_data[node_set] = lines
            else:
                # Display all of the results in the case of an error
                if len(lines) > 1:
                    self.log.info("    %s: rc=%s, output:", node_set, code)
                    for line in lines:
                        self.log.info("      %s", line)
                else:
                    self.log.info(
                        "    %s: rc=%s, output: %s",
                        node_set, code, output_buffer)

    # Report any errors through an exception
    if not status:
        raise CommandFailure(
            "Error(s) detected gathering {} log data on {}".format(
                self._systemctl.service.value, NodeSet.fromlist(hosts)))

    # Return the successful command output per set of hosts
    return log_data
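# Invocation sketch: the journalctl date handling described in the docstring
# accepts both absolute and relative windows (values hypothetical):
#
#     data = self.get_log_data(hosts, "2012-10-30 18:17:16")  # since -> now
#     data = self.get_log_data(hosts, "-10min", "now")        # relative span
#     # -> {NodeSet("host[1-2]"): ["line1", "line2", ...], ...}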
def get_log_data(self, hosts, since, until=None, timeout=60):
    """Gather log output for the command running on each host.

    Note (from the journalctl man page):
        Date specifications should be of the format "2012-10-30 18:17:16".
        If the time part is omitted, "00:00:00" is assumed. If only the
        seconds component is omitted, ":00" is assumed. If the date
        component is omitted, the current day is assumed. Alternatively the
        strings "yesterday", "today", "tomorrow" are understood, which
        refer to 00:00:00 of the day before the current day, the current
        day, or the day after the current day, respectively. "now" refers
        to the current time. Finally, relative times may be specified,
        prefixed with "-" or "+", referring to times before or after the
        current time, respectively.

    Args:
        hosts (list): list of hosts from which to gather log data.
        since (str): show log entries from this date.
        until (str, optional): show log entries up to this date. Defaults
            to None, in which case it is not utilized.
        timeout (int, optional): timeout for issuing the command. Defaults
            to 60 seconds.

    Raises:
        CommandFailure: if there is an error gathering the log data

    Returns:
        list: a list of dictionaries including:
            "hosts": <NodeSet() of hosts with this data>
            "data": <journalctl output>

    """
    # Setup the journalctl command to capture all unit activity from the
    # specified start date to now or a specified end date
    #   --output=json?
    command = [
        "sudo",
        "journalctl",
        "--unit={}".format(self._systemctl.service.value),
        "--since=\"{}\"".format(since),
    ]
    if until:
        command.append("--until=\"{}\"".format(until))
    self.log.info(
        "Gathering log data on %s: %s", str(hosts), " ".join(command))

    # Gather the log information per host
    results = run_pcmd(hosts, " ".join(command), False, timeout, None)

    # Determine if the command completed successfully without a timeout
    status = True
    for result in results:
        if result["interrupted"]:
            self.log.info("  Errors detected running \"%s\":", command)
            self.log.info(
                "    %s: timeout detected after %s seconds",
                str(result["hosts"]), timeout)
            status = False
        elif result["exit_status"] != 0:
            self.log.info("  Errors detected running \"%s\":", command)
            status = False
        if not status:
            break

    # Display/return the command output
    log_data = []
    for result in results:
        if result["exit_status"] == 0 and not result["interrupted"]:
            # Add the successful output from each node to the list
            log_data.append(
                {"hosts": result["hosts"], "data": result["stdout"]})
        else:
            # Display all of the results in the case of an error
            if len(result["stdout"]) > 1:
                self.log.info(
                    "    %s: rc=%s, output:",
                    str(result["hosts"]), result["exit_status"])
                for line in result["stdout"]:
                    self.log.info("      %s", line)
            else:
                self.log.info(
                    "    %s: rc=%s, output: %s",
                    str(result["hosts"]), result["exit_status"],
                    result["stdout"][0])

    # Report any errors through an exception
    if not status:
        raise CommandFailure(
            "Error(s) detected gathering {} log data on {}".format(
                self._systemctl.service.value, NodeSet.fromlist(hosts)))

    # Return the successful command output per set of hosts
    return log_data