コード例 #1
0
ファイル: job_manager_utils.py プロジェクト: liw/daos
    def _run_unit_command(self, command):
        """Issue a systemctl unit command on the hosts and verify the outcome.

        The unit command is stored in the systemctl command object, the time
        at which it is issued is recorded in self.timestamps, and the resulting
        command string is run on self._hosts through pcmd().

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if a 255 return code is reported (handled as a
                timeout) or if any host yields a non-zero return code

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self._systemctl.unit_command.value = command
        self.timestamps[command] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return_codes = pcmd(self._hosts, str(self), self.verbose, self.timeout)

        # A return code of 255 is reported as a timeout of the command
        if 255 in return_codes:
            timed_out = NodeSet.fromlist(return_codes[255])
            raise CommandFailure(
                "Timeout detected running '{}' with a {}s timeout on {}".
                format(str(self), self.timeout, timed_out))

        # Success means zero is the one and only return code reported
        if len(return_codes) != 1 or 0 not in return_codes:
            failed_hosts = [
                host
                for code, hosts in return_codes.items() if code != 0
                for host in hosts
            ]
            raise CommandFailure("Error occurred running '{}' on {}".format(
                str(self), NodeSet.fromlist(failed_hosts)))

        return return_codes
コード例 #2
0
ファイル: dfuse_utils.py プロジェクト: liw/daos
    def create_mount_point(self):
        """Create the dfuse mount directory on the hosts that do not have it.

        Raises:
            CommandFailure: if no mount point is defined or the directory
                could not be created on one or more hosts

        """
        mount_dir = self.mount_dir.value
        if mount_dir is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        # Only hosts reported under "nodirectory" need the directory created
        missing_hosts = self.check_mount_state()["nodirectory"]
        if not missing_hosts:
            return

        return_codes = pcmd(
            missing_hosts, "mkdir -p {}".format(mount_dir), timeout=30)
        if len(return_codes) == 1 and 0 in return_codes:
            return

        # Report every host that yielded a non-zero return code
        failed = ",".join(
            str(nodes) for code, nodes in return_codes.items() if code != 0)
        raise CommandFailure(
            "Error creating the {} dfuse mount point on the "
            "following hosts: {}".format(mount_dir, NodeSet(failed)))
コード例 #3
0
ファイル: server_utils.py プロジェクト: liw/daos
    def update_config_file_from_file(self, dst_hosts, test_dir, generated_yaml):
        """Install a generated server config and rebuild the engine params.

        The generated config is dumped to a temporary yaml file in test_dir
        and distributed to the path returned by
        get_default_config_file("server") on dst_hosts.  The engine_params of
        the server manager's yaml object are then rebuilt from the "dcpm"
        storage tiers of the generated config so that those disks will be
        wiped.

        Args:
            dst_hosts (list): Destination server hostnames to place the new config file.
            test_dir (str): Directory where the server config data from
                generated_yaml will be written.
            generated_yaml (YAMLObject): New server config data.

        Raises:
            CommandFailure: if the temporary yaml file cannot be written or
                cannot be copied to the destination hosts

        """
        # Dump the generated config into a scratch file inside test_dir.
        scratch_path = os.path.join(test_dir, "temp_server.yml")
        try:
            with open(scratch_path, 'w') as scratch_file:
                yaml.dump(generated_yaml, scratch_file, default_flow_style=False)
        except Exception as error:
            message = "Error writing the yaml file! {}: {}".format(scratch_path, error)
            raise CommandFailure(message) from error

        # Push the scratch file to the default server config path on each host.
        destination = get_default_config_file("server")
        try:
            distribute_files(dst_hosts, scratch_path, destination, verbose=False, sudo=True)
        except DaosTestError as error:
            message = "ERROR: Copying yaml configuration file to {}: {}".format(dst_hosts, error)
            raise CommandFailure(message) from error

        # The SCM clearing done as part of the server start steps (unmount,
        # wipefs, etc.) works from the engine_params of the server manager, so
        # they are replaced here with the values from the generated config.
        self.log.info("Resetting engine_params")
        yaml_config = self.manager.job.yaml
        yaml_config.engine_params = []
        for index, engine in enumerate(generated_yaml["engines"]):
            self.log.info("engine %d", index)
            for tier in engine["storage"]:
                # Only "dcpm" storage tiers contribute an engine entry
                if tier["class"] == "dcpm":
                    self.log.info("scm_mount = %s", tier["scm_mount"])
                    self.log.info("class = %s", tier["class"])
                    self.log.info("scm_list = %s", tier["scm_list"])

                    engine_yaml = DaosServerYamlParameters.PerEngineYamlParameters(index)
                    engine_yaml.scm_mount.update(tier["scm_mount"])
                    engine_yaml.scm_class.update(tier["class"])
                    engine_yaml.scm_size.update(None)
                    engine_yaml.scm_list.update(tier["scm_list"])
                    engine_yaml.reset_yaml_data_updated()

                    yaml_config.engine_params.append(engine_yaml)
コード例 #4
0
ファイル: dfuse_utils.py プロジェクト: liw/daos
    def run(self, check=True, bind_cores=None):
        # pylint: disable=arguments-differ
        """Start dfuse on the hosts and optionally confirm it is running.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
            bind_cores (str): List of CPU cores to pass to taskset

        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # Without a log file definition the dfuse logs would not be captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        if 'D_LOG_MASK' not in self.env:
            self.env['D_LOG_MASK'] = 'INFO'

        # Make sure the mount directory exists before starting dfuse
        self.create_mount_point()

        # Assemble: <env exports>[taskset -c <cores> ]<dfuse command>
        pieces = [self.env.get_export_str()]
        if bind_cores:
            pieces.append('taskset -c {} '.format(bind_cores))
        pieces.append(str(self))
        cmd = "".join(pieces)
        self.log.info("Command is '%s'", cmd)
        return_codes = pcmd(self.hosts, cmd, timeout=30)

        # Hosts returning zero are tracked as running; whatever remains failed
        if 0 in return_codes:
            self.running_hosts.add(return_codes.pop(0))

        if return_codes:
            failed = ",".join(str(nodes) for nodes in return_codes.values())
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    NodeSet(failed)))

        if not check:
            return

        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode so it should be possible to start using
        # it immediately after the command returns.  Allow two grace periods
        # before performing the final check that may raise.
        grace_periods = (
            (2, 'Waiting two seconds for dfuse to start'),
            (5, 'Waiting five seconds for dfuse to start'),
        )
        for delay, message in grace_periods:
            if self.check_running(fail_on_error=False):
                return
            self.log.info(message)
            time.sleep(delay)
        self.check_running()
コード例 #5
0
ファイル: dfuse_utils.py プロジェクト: liw/daos
    def remove_mount_point(self, fail=True):
        """Remove the dfuse mount directory from the hosts.

        A plain rmdir is attempted first and is expected to succeed.  On any
        host where it does not, rm -rf is run as a fallback, but an error is
        still raised (when fail is set) because the rmdir did not succeed.

        Args:
            fail (bool, optional): whether to raise an exception when the
                directory could not be removed with rmdir. Defaults to True.

        Raises:
            CommandFailure: In case of error deleting directory

        """
        mount_dir = self.mount_dir.value
        if mount_dir is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(
            self.hosts, mount_dir, directory=True)
        if not dir_exists:
            self.log.info("No %s dfuse mount point directory found on %s",
                          mount_dir, self.hosts)
            return

        target_nodes = list(self.hosts)
        if clean_nodes:
            # NOTE(review): list.remove() drops one element equal to
            # clean_nodes; confirm this matches what check_file_exists returns.
            target_nodes.remove(clean_nodes)

        self.log.info("Removing the %s dfuse mount point on %s",
                      mount_dir, target_nodes)

        # First attempt: rmdir only succeeds on an empty directory
        rmdir_codes = pcmd(target_nodes, "rmdir {}".format(mount_dir), timeout=30)
        if len(rmdir_codes) == 1 and 0 in rmdir_codes:
            return

        failed_nodes = NodeSet(",".join(
            str(nodes) for code, nodes in rmdir_codes.items() if code != 0))

        # Fallback: force the removal on the hosts where rmdir failed
        rm_codes = pcmd(failed_nodes, "rm -rf {}".format(mount_dir), timeout=30)
        if not fail:
            return

        if len(rm_codes) > 1 or 0 not in rm_codes:
            error_hosts = NodeSet(",".join(
                str(nodes) for code, nodes in rm_codes.items() if code != 0))
            raise CommandFailure(
                "Error removing the {} dfuse mount point with rm on "
                "the following hosts: {}".format(mount_dir, error_hosts))

        # The rm fallback worked, but the rmdir failure is still an error
        raise CommandFailure(
            "Error removing the {} dfuse mount point with rmdir on the "
            "following hosts: {}".format(mount_dir, failed_nodes))
コード例 #6
0
    def get_aggregate_total(self, processes):
        """Get the total bytes expected to be written by ior.

        The total is the product of the number of processes, the block size,
        the segment count and, for object classes whose name does not start
        with "S", the first number found in the object class name.

        Args:
            processes (int): number of processes running the ior command

        Returns:
            int: total number of bytes written

        Raises:
            CommandFailure: if there is an error obtaining the aggregate total

        """
        # Exponent applied to 1024 for each supported size suffix
        power = {"k": 1, "m": 2, "g": 3, "t": 4}
        total = processes
        for name in ("block_size", "segment_count"):
            item = getattr(self, name).value
            if not item:
                # Unset values do not contribute to the total
                continue

            # Split the value around its first non-digit character, e.g.
            # "16m" -> ["16", "m", ""]
            sub_item = re.split(r"([^\d])", str(item))
            if int(sub_item[0]) <= 0:
                raise CommandFailure(
                    "Error obtaining the IOR aggregate total from the {}: "
                    "value: {}, split: {}".format(name, item, sub_item))

            total *= int(sub_item[0])
            if len(sub_item) > 1:
                key = sub_item[1].lower()
                if key not in power:
                    raise CommandFailure(
                        "Error obtaining the IOR aggregate total from "
                        "the {} - bad key: value: {}, split: {}, "
                        "key: {}".format(name, item, sub_item, key))
                total *= 1024**power[key]

        # Account for any replicas, except for the ones with no replication
        # i.e all object classes starting with "S". Eg: S1,S2,...,SX.
        # An undefined (None) object class or one without any number in its
        # name is handled as a single replica, leaving the total unchanged.
        oclass = self.dfs_oclass.value
        if oclass and not oclass.startswith("S"):
            numbers = re.findall(r"\d+", oclass)
            if numbers:
                total *= int(numbers[0])

        return total
コード例 #7
0
ファイル: job_manager_utils.py プロジェクト: liw/daos
    def run(self):
        """Enable and start the job's service via the systemctl command.

        After the service has been enabled and started, the launch is verified
        with the check_subprocess_status() method.

        Raises:
            CommandFailure: if unable to enable or start the service, or if
                the launch check fails

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        self.service_enable()
        start_result = self.service_start()

        # A passing launch check means the start result can be handed back
        if self.check_subprocess_status(None):
            return start_result

        message = "Command '{}' did not launch correctly".format(self)
        self.log.error(message)
        raise CommandFailure(message)
コード例 #8
0
ファイル: dfuse_utils.py プロジェクト: liw/daos
    def check_running(self, fail_on_error=True):
        """Check dfuse is running on the hosts where it was started.

        The mount state of self.running_hosts is queried and any host reported
        as "unmounted" or "nodirectory" is handled as not running dfuse.

        Args:
            fail_on_error (bool, optional): should an exception be raised if an
                error is detected. Defaults to True.

        Raises:
            CommandFailure: raised if dfuse is found not running on any expected
                nodes and fail_on_error is set.

        Returns:
            bool: whether or not dfuse is running

        """
        state = self.check_mount_state(self.running_hosts)
        if not (state["unmounted"] or state["nodirectory"]):
            return True

        # Log the combined set of hosts in either failing state
        self.log.error("Error: dfuse not running on %s",
                       str(state["unmounted"].union(state["nodirectory"])))
        if fail_on_error:
            raise CommandFailure("dfuse not running")
        return False
コード例 #9
0
    def _execute_cmd(self, cmd):
        """Run a command on the dfuse hosts and fail the test on an error.

        The command is considered failed when none of the hosts yield a zero
        return code.

        Args:
            cmd (str): Command to run

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            return_codes = pcmd(self.dfuse_hosts, cmd, verbose=True, timeout=30)
            if 0 not in return_codes:
                failed = ",".join(
                    str(nodes) for code, nodes in return_codes.items()
                    if code != 0)
                raise CommandFailure("Error running '{}' on the following "
                                     "hosts: {}".format(cmd, NodeSet(failed)))
        except CommandFailure as error:
            # Any command failure is logged and reported as a test failure
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return return_codes
コード例 #10
0
def get_dmg_network_information(dmg_network_scan):
    """Build the network device list from the dmg network scan output.

    One NetworkDevice is created for every interface of every host listed in
    each "HostFabrics" entry of the scan response.

    Args:
        dmg_network_scan (dict): the dmg network scan json command output

    Raises:
        CommandFailure: if there was an error processing the dmg network scan output

    Returns:
        list: a list of NetworkDevice objects identifying the network devices on each host

    """
    network_devices = []

    try:
        fabrics = dmg_network_scan["response"]["HostFabrics"]
        for fabric in fabrics.values():
            # Only the text before the first ':' of the HostSet is used
            host_names = fabric["HostSet"].split(":")[0]
            for host in NodeSet(host_names):
                network_devices.extend(
                    NetworkDevice(
                        host, interface["Device"], None, 1,
                        interface["Provider"], interface["NumaNode"])
                    for interface in fabric["HostFabric"]["Interfaces"])
    except KeyError as error:
        raise CommandFailure(
            f"Error processing dmg network scan json output: {dmg_network_scan}"
        ) from error

    return network_devices
コード例 #11
0
    def _run_process(self):
        """Run the command in the foreground, locally or on the remote hosts.

        Without any hosts defined the parent class implementation runs the
        command; otherwise it is run on the hosts through pcmd().

        Raises:
            CommandFailure: if there is an error running the command

        """
        run_locally = self._hosts is None
        self.log.debug("Running: %s", str(self))

        if run_locally:
            super()._run_process()
            return

        # Remote run: success means zero is the only return code reported
        return_codes = pcmd(self._hosts, str(self))
        if len(return_codes) == 1 and 0 in return_codes:
            return

        failures = ", ".join(
            "{}: rc={}".format(nodes, code)
            for code, nodes in return_codes.items() if code != 0)
        raise CommandFailure(
            "Error running fio on the following hosts: {}".format(failures))
コード例 #12
0
ファイル: command_utils_base.py プロジェクト: liw/daos
    def create_yaml(self, filename=None):
        """Write the parameter values to a yaml file when they have changed.

        Nothing is written unless is_yaml_data_updated() reports an update.
        After a successful write the updated state is reset.

        Args:
            filename (str, optional): the yaml file to generate with the
                parameters. Defaults to None, which uses self.filename.

        Raises:
            CommandFailure: if there is an error creating the yaml file

        Returns:
            bool: whether or not an updated yaml file was created

        """
        updated = self.is_yaml_data_updated()
        if not updated:
            return updated

        target = self.filename if filename is None else filename
        yaml_data = self.get_yaml_data()
        self.log.info("Writing yaml configuration file %s", target)
        try:
            with open(target, 'w') as yaml_file:
                yaml.dump(yaml_data, yaml_file, default_flow_style=False)
        except Exception as error:
            message = "Error writing the yaml file {}: {}".format(target, error)
            raise CommandFailure(message) from error

        # The file now reflects the parameters, so clear the updated state
        self.reset_yaml_data_updated()
        return updated
コード例 #13
0
def get_device_ids(dmg, servers):
    """Get the NVMe Device ID from servers.

    Runs the dmg "storage query list-devices" command against each server and
    collects the text that follows every "UUID:" tag in its output.

    Args:
        dmg: DmgCommand class instance.
        servers (list): list of server hosts.

    Raises:
        CommandFailure: if the dmg command fails on any of the servers

    Returns:
        devices (dictionary): Device UUID for servers.

    """
    devices = {}
    dmg.set_sub_command("storage")
    dmg.sub_command_class.set_sub_command("query")
    dmg.sub_command_class.sub_command_class.set_sub_command("list-devices")
    for host in servers:
        dmg.hostlist = host
        try:
            result = dmg.run()
        except CommandFailure as _error:
            raise CommandFailure("dmg list-devices failed with error {}".format(_error)) from _error

        # Match on the same "UUID:" tag that is used for the split so a line
        # that merely mentions "UUID" cannot raise an IndexError.
        devices[host] = [
            line.split('UUID:')[1].split(' ')[0]
            for line in result.stdout_text.split('\n')
            if 'UUID:' in line
        ]
    return devices
コード例 #14
0
ファイル: agent_utils.py プロジェクト: liw/daos
    def stop(self):
        """Stop the agent through the job manager.

        Every stop action is attempted before any error is reported: the
        parent class stop is called first and the manager's kill() is always
        called afterwards.

        Raises:
            CommandFailure: if there was an error stopping the agents.

        """
        self.log.info("<AGENT> Stopping agent %s command",
                      self.manager.command)

        # Errors are collected so the remaining stop actions still run
        failures = []
        try:
            super().stop()
        except CommandFailure as error:
            failures.append("Error stopping the {} subprocess: {}".format(
                self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.manager.kill()

        if not failures:
            return
        raise CommandFailure("Failed to stop agents:\n  {}".format(
            "\n  ".join(failures)))
コード例 #15
0
ファイル: find.py プロジェクト: liw/daos
 def _run_cmd(self, cmd):
     """Run a command on the client hosts.

     Args:
         cmd (str): command to run

     Raises:
         CommandFailure: if none of the hosts yield a zero return code

     """
     return_codes = general_utils.pcmd(self.hostlist_clients, cmd, timeout=180)
     if 0 in return_codes:
         return

     # Every reported return code is non-zero; list the hosts involved
     failed = ",".join(
         str(nodes) for code, nodes in return_codes.items() if code != 0)
     raise CommandFailure(
         "Error running '{}' on the following hosts: {}".format(
             cmd, NodeSet(failed)))
コード例 #16
0
ファイル: command_utils_base.py プロジェクト: liw/daos
 def update_params(self, **params):
     """Update each of the provided parameter name and value pairs.

     Args:
         params (dict): parameter names mapped to their new values

     Raises:
         CommandFailure: if an AttributeError occurs while updating a
             parameter, e.g. the name is not an attribute of this object

     """
     for key in params:
         try:
             parameter = getattr(self, key)
             parameter.update(params[key], key)
         except AttributeError as error:
             message = "Unknown parameter: {}".format(key)
             raise CommandFailure(message) from error
コード例 #17
0
ファイル: test_utils_pool.py プロジェクト: liw/daos
    def set_query_data(self, show_enabled=False, show_disabled=False):
        """Execute dmg pool query and store the results in self.query_data.

        When a pool_query_timeout is defined a failing query is retried until
        it succeeds or the timeout expires.  Without a timeout the first
        failure is raised.

        Args:
            show_enabled (bool, optional): Display enabled ranks.
            show_disabled (bool, optional): Display disabled ranks.

        Raises:
            CommandFailure: if the query fails and no timeout is defined or
                the timeout expired

        Only supported with the dmg control method.
        """
        self.query_data = {}
        if not self.pool:
            return
        if not self.dmg:
            self.log.error("Error: Undefined dmg command")
            return

        # A deadline only exists when a query timeout has been configured
        timeout = self.pool_query_timeout.value
        deadline = None
        if timeout is not None:
            self.log.info(
                "Waiting for pool %s query to be responsive with a %s "
                "second timeout", self.identifier, timeout)
            deadline = time() + timeout

        while True:
            try:
                self.query_data = self.dmg.pool_query(
                    self.identifier, show_enabled, show_disabled)
                return
            except CommandFailure as error:
                if deadline is None:
                    raise CommandFailure(error) from error

                self.log.info(
                    "Pool %s query still non-responsive: %s",
                    self.identifier, str(error))
                if time() > deadline:
                    raise CommandFailure(
                        "TIMEOUT detected after {} seconds while "
                        "waiting for pool {} query response. This "
                        "timeout can be adjusted via the "
                        "'pool/pool_query_timeout' test yaml "
                        "parameter.".format(
                            self.pool_query_timeout.value,
                            self.identifier)) from error
コード例 #18
0
ファイル: test_utils_pool.py プロジェクト: liw/daos
    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        When the scm_size or nvme_size value ends in "%" the pool parameters
        are autosized through the server manager selected by server_index and
        the autosized values are cached in the test params.  When a pool label
        is defined it is replaced by one obtained from the label generator.

        Args:
            test (Test): avocado Test object

        Raises:
            CommandFailure: if a label is defined without a LabelGenerator

        """
        super().get_params(test)

        def is_percentage(value):
            """Return True for a defined value whose string ends in '%'."""
            return value is not None and str(value).endswith("%")

        # Autosize when either size parameter is expressed as a percentage
        if is_percentage(self.scm_size.value) or is_percentage(self.nvme_size.value):
            index = self.server_index.value
            try:
                params = test.server_managers[index].autosize_pool_params(
                    size=None,
                    tier_ratio=None,
                    scm_size=self.scm_size.value,
                    nvme_size=self.nvme_size.value,
                    min_targets=self.min_targets.value,
                    quantity=self.quantity.value)
            except ServerFailed as error:
                test.fail(
                    "Failure autosizing pool parameters: {}".format(error))
            except AutosizeCancel as error:
                test.cancel(error)

            # Apply each autosized value to the matching pool parameter
            for name, value in params.items():
                test_pool_param = getattr(self, name)
                test_pool_param.update(value, name)

                # Cache the autosized value so we do not calculate it again
                # pylint: disable=protected-access
                cache_id = (name, self.namespace, test_pool_param._default)
                test.params._cache[cache_id] = value

        # Nothing more to do unless pool labels are in use
        if self.label.value is None:
            return

        if not isinstance(self.label_generator, LabelGenerator):
            raise CommandFailure(
                "Unable to create a unique pool label; Undefined label_generator"
            )
        self.label.update(self.label_generator.get_label(self.label.value))
コード例 #19
0
    def assert_on_exception(self, out_queue=None):
        """Raise an exception if the queue holds an entry.

        The first queue entry is read, placed back on the queue, and then
        raised wrapped in a CommandFailure.  An empty queue is a no-op.

        Args:
            out_queue (queue, optional): queue to check. Defaults to None,
                which uses self.out_queue.

        Raises:
            CommandFailure: if the queue is not empty

        """
        pending = self.out_queue if out_queue is None else out_queue
        if pending.empty():
            return

        # Put the entry back so it remains available after this check
        exc = pending.get(block=False)
        pending.put(exc)
        raise CommandFailure(exc)
コード例 #20
0
ファイル: job_manager_utils.py プロジェクト: liw/daos
    def _report_unit_command(self, command):
        """Run the systemctl command and display the log data on an error.

        On a failure the error is logged and the log data collected from the
        hosts, starting from the timestamp recorded for the command, is
        displayed before the failure is raised again.

        Args:
            command (str): systemctl unit command

        Raises:
            CommandFailure: if there is an issue running the command

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            result = self._run_unit_command(command)
        except CommandFailure as error:
            self.log.info(error)
            log_data = self.get_log_data(self._hosts, self.timestamps[command])
            self.display_log_data(log_data)
            raise CommandFailure(error) from error
        return result
コード例 #21
0
    def stop(self, pool=None):
        """Stop the ior command when the job manager was run as a subprocess.

        Nothing is done unless the manager was run as a subprocess.

        Args:
            pool (TestPool, optional): if provided the pool space will be displayed after
                attempting to stop the ior command. Defaults to None.

        Raises:
            CommandFailure: if there is an error stopping the ior subprocess

        """
        if not self.manager.run_as_subprocess:
            return

        failure = None
        try:
            self.manager.stop()
        except CommandFailure as error:
            failure = "IOR Failed: {}".format(error)
        finally:
            # The pool space is displayed whether or not the stop succeeded
            if pool:
                self.display_pool_space(pool)

        if failure:
            raise CommandFailure(failure)
コード例 #22
0
ファイル: daos_racer_utils.py プロジェクト: liw/daos
    def run(self):
        """Run the daos_racer command on the remote host.

        Any return code other than a lone zero is an error.  When a 255 return
        code is reported the command is handled as timed out and a pkill of
        daos_racer is issued on the host before the error is raised.

        Raises:
            CommandFailure: if there is an error running the command

        """
        timeout = self.clush_timeout.value
        timeout_text = "no" if timeout is None else "a {}s".format(timeout)
        self.log.info(
            "Running %s on %s with %s timeout", str(self), self.host,
            timeout_text)

        return_codes = pcmd([self.host], str(self), True, timeout)
        if len(return_codes) == 1 and 0 in return_codes:
            self.log.info("Test passed!")
            return

        # Kill the daos_racer process if the remote command timed out
        if 255 in return_codes:
            self.log.info("Stopping timed out daos_racer process on %s",
                          self.host)
            pcmd([self.host], "pkill daos_racer", True)

        raise CommandFailure("Error running '{}'".format(self._command))
コード例 #23
0
    def _execute_command(self,
                         command,
                         fail_on_err=True,
                         display_output=True,
                         hosts=None):
        """Execute the command on the client hosts or on the given hosts.

        The command is only considered failed when none of the hosts yield a
        zero return code.

        Args:
            command (str): the command to execute on the hosts
            fail_on_err (bool, optional): whether or not to raise an exception
                if the command fails. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.
            hosts (object, optional): hosts on which to run the command.
                Defaults to None, which uses self.hostlist_clients.

        Raises:
            CommandFailure: if 'fail_on_err' is set and no host yields a zero
                return code

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        target_hosts = self.hostlist_clients if hosts is None else hosts
        result = pcmd(target_hosts, command, verbose=display_output, timeout=300)
        if fail_on_err and 0 not in result:
            failed = ",".join(
                str(nodes) for code, nodes in result.items() if code != 0)
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(failed)))
        return result
コード例 #24
0
def run_ior_loop(manager, uuids, tmpdir_base):
    """Run the IOR command once for every container UUID.

    Each run is pointed at the next container and is given its own freshly
    created temporary directory, which is removed again once the run finishes.
    A failing run does not stop the loop; all failures are reported together
    after the last container has been processed.

    Args:
        manager (object): mpi job manager used to run ior; its job.dfs_cont
            and tmpdir_base parameters are updated before each run() call
        uuids (list): list of container UUIDs
        tmpdir_base (str): directory in which the per-run temporary
            directories (the mpi orte_tmpdir_base mca parameter) are created

    Raises:
        CommandFailure: if at least one of the ior runs failed

    Returns:
        list: the value returned by manager.run() for each ior command run

    """
    outputs = []
    failures = []
    for loop, uuid in enumerate(uuids):
        manager.job.dfs_cont.update(uuid, "ior.cont_uuid")

        # Every run gets a private scratch directory for the manager command
        scratch_dir = mkdtemp(dir=tmpdir_base)
        manager.tmpdir_base.update(scratch_dir, "tmpdir_base")

        try:
            outputs.append(manager.run())
        except CommandFailure as error:
            mode = "write"
            if "-r" in manager.job.flags.value:
                mode = "read"
            message = "IOR {} Loop {}/{} failed for container {}: {}".format(
                mode, loop, len(uuids), uuid, error)
            failures.append(message)
        finally:
            # Always drop the scratch directory so later runs cannot conflict
            shutil.rmtree(scratch_dir, ignore_errors=True)

    if not failures:
        return outputs

    raise CommandFailure(
        "IOR failed in {}/{} loops: {}".format(
            len(failures), len(uuids), "\n".join(failures)))
コード例 #25
0
ファイル: parallel_io.py プロジェクト: liw/daos
    def test_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to mount dfuse and verify multiple
            containers using fio.
        Use cases:
            Mount dfuse using pool uuid.
            Create multiple containers under that dfuse mount point.
            Check those containers are accessible from that mount point.
            Perform io to those containers using FIO
            Delete one of the containers
            Check if dfuse is still running. If not, fail the test and exit.
            Otherwise, try accessing the deleted container.
            This should fail.
            Check dfuse again.
        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=daosio,tx,dfuse
        :avocado: tags=parallelio
        """
        # get the number of containers to create from the test params
        self.cont_count = self.params.get("cont_count", '/run/container/*')

        threads = []

        # Create a pool and start dfuse.
        self.create_pool()
        self.start_dfuse(self.hostlist_clients, self.pool[0], None)
        # create multiple containers
        self.add_container_qty(self.cont_count, self.pool[0])

        # check if all the created containers can be accessed and perform
        # io on each container using fio in parallel
        for cont in self.container:
            dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
            cmd = "ls -a {}".format(dfuse_cont_dir)
            try:
                # execute bash cmds
                ret_code = general_utils.pcmd(self.hostlist_clients,
                                              cmd,
                                              timeout=30)
                # The listing is read-only, so it has to succeed on every
                # host: any return code other than 0 is a failure even if
                # some other host succeeded.
                if 0 not in ret_code or len(ret_code) > 1:
                    error_hosts = NodeSet(",".join([
                        str(node_set)
                        for code, node_set in ret_code.items()
                        if code != 0
                    ]))
                    raise CommandFailure("Error running '{}' on the following "
                                         "hosts: {}".format(cmd, error_hosts))
            # report error if any command fails
            except CommandFailure as error:
                self.log.error("ParallelIo Test Failed: %s", str(error))
                self.fail("Test was expected to pass but " "it failed.\n")
            # run fio on this container in the background
            thread = threading.Thread(target=self.execute_fio,
                                      args=(dfuse_cont_dir, False))
            threads.append(thread)
            thread.start()

        # wait for all fio jobs to be finished
        for job in threads:
            job.join()

        # destroy first container; remember its uuid first because the
        # container object should not be relied upon once it is destroyed
        container_to_destroy = self.container[0].uuid
        self.container[0].destroy(1)

        # check dfuse if it is running fine
        self.dfuse.check_running()

        # try accessing destroyed container, it should fail
        try:
            self.execute_fio(
                self.dfuse.mount_dir.value + "/" + container_to_destroy, False)
        except CommandFailure:
            self.log.info("This run is expected to fail")

            # check dfuse is still running after attempting to access deleted
            # container.
            self.dfuse.check_running()
        else:
            # Report the uuid saved before the destroy rather than reading it
            # back from the destroyed container object.
            self.fail("Fio was able to access destroyed container: {}".format(
                container_to_destroy))
コード例 #26
0
ファイル: config_generate_run.py プロジェクト: liw/daos
    def test_config_generate_run(self):
        """Run daos_server with generated server config file.

        1. Start daos_server.
        2. Call dmg config generate with different parameters.
        3. Parse the generated output as yaml.
        4. Stop daos_server.
        5. Write the generated config via self.test_dir to the server nodes
        (/etc/daos).
        6. Restart daos_server.
        7. Restart the agents so that the teardown checks pass.

        See yaml for the test cases.

        Note: When running locally, use 50 sec timeout in
        DaosServerCommand.__init__()

        :avocado: tags=all,full_regression
        :avocado: tags=hw,small
        :avocado: tags=control,config_generate_entries,config_generate_run
        """
        num_engines = self.params.get(
            "num_engines", "/run/config_generate_params/*/")
        min_ssds = self.params.get(
            "min_ssds", "/run/config_generate_params/*/")
        net_class = self.params.get(
            "net_class", "/run/config_generate_params/*/")

        # Call dmg config generate. AP is always the first server host.
        server_host = self.hostlist_servers[0]
        result = self.get_dmg_command().config_generate(
            access_points=server_host, num_engines=num_engines,
            min_ssds=min_ssds, net_class=net_class)

        try:
            generated_yaml = yaml.safe_load(result.stdout)
        except yaml.YAMLError as error:
            # Chain explicitly so the parser error is reported as the cause of
            # the failure instead of as an unrelated secondary exception.
            raise CommandFailure(
                "Error loading dmg generated config!") from error

        # Stop and restart daos_server. self.start_server_managers() has the
        # server startup check built into it, so if there's something wrong,
        # it'll throw an error.
        self.log.info("Stopping servers")
        self.stop_servers()

        # Create a new server config from generated_yaml and update SCM-related
        # data in engine_params so that the cleanup before the server start
        # works.
        self.log.info("Copy config to /etc/daos and update engine_params")
        self.server_managers[0].update_config_file_from_file(
            self.hostlist_servers, self.test_dir, generated_yaml)

        # Start server with the generated config.
        self.log.info("Restarting server with the generated config")
        try:
            agent_force = self.start_server_managers(force=True)
        except ServerFailed as error:
            self.fail("Restarting server failed! {}".format(error))

        # We don't need agent for this test. However, when we stop the server,
        # agent is also stopped. Then the harness checks that the agent is
        # running during the teardown. If agent isn't running at that point, it
        # would cause an error, so start it here.
        self.log.info("Restarting agents")
        self.start_agent_managers(force=agent_force)
コード例 #27
0
    def test_bashcmd(self):
        """Jira ID: DAOS-3508.

        Test Description:
            Mount dfuse at a separate mount point for each pool/container
            combination and run a series of basic bash commands against it.

        Use cases:
            The following bash commands are exercised by this test:
            mkdir, touch, ls, chmod, rm, dd, stat, cp, cmp, mv, rmdir.
              Create a directory.
              Create a file under that directory.
              List the created file.
              Remove the file.
              Write a file to the dfuse mounted location using dd.
              List the written file to verify it was created.
              Verify the created file has the expected size.
              Copy the file
              Compare the copied file with the original to verify the
              content is the same.
              Remove copied file.
              Rename file
              Verify renamed file exists using list.
              Verify dfuse support for '.'
              Verify dfuse support for '..'
              Remove renamed file
              Remove a directory

        :avocado: tags=all,daily_regression,pr
        :avocado: tags=hw,small
        :avocado: tags=dfuse
        :avocado: tags=bashcmd
        """
        bashcmd_path = '/run/bashcmd/*'
        dir_name = self.params.get("dirname", bashcmd_path)
        file_name1 = self.params.get("filename1", bashcmd_path)
        file_name2 = self.params.get("filename2", bashcmd_path)
        dd_count = self.params.get("dd_count", bashcmd_path)
        dd_blocksize = self.params.get("dd_blocksize", bashcmd_path)
        pool_count = self.params.get("pool_count", '/run/pool/*')
        cont_count = self.params.get("cont_count", '/run/container/*')

        # A new pool is added for every iteration of the outer loop.
        for _ in range(pool_count):
            self.add_pool(connect=False)
            # Repeat the command sequence for each container in this pool.
            for cont_index in range(cont_count):
                self.add_container(self.pool)
                self.start_dfuse(
                    self.hostlist_clients, self.pool, self.container,
                    "/tmp/{}_daos_dfuse{}".format(self.pool.uuid, cont_index))

                work_dir = os.path.join(self.dfuse.mount_dir.value, dir_name)
                first_file = os.path.join(work_dir, file_name1)
                second_file = os.path.join(work_dir, file_name2)

                # Commands to run, in order, on the dfuse mount.
                commands = [
                    "mkdir -p {}".format(work_dir),
                    "touch {}".format(first_file),
                    "ls -a {}".format(first_file),
                    "rm {}".format(first_file),
                    "dd if=/dev/zero of={} count={} bs={}".format(
                        first_file, dd_count, dd_blocksize),
                    "ls -al {}".format(first_file),
                    "filesize=$(stat -c%s '{}');\
                            if (( filesize != {}*{} )); then exit 1;\
                            fi".format(first_file, dd_count, dd_blocksize),
                    "cp -r {} {}".format(first_file, second_file),
                    "cmp --silent {} {}".format(first_file, second_file),
                    "rm {}".format(second_file),
                    "mv {} {}".format(first_file, second_file),
                    "ls -al {}".format(second_file),
                    "ls -al {}/.".format(work_dir),
                    "ls -al {}/..".format(work_dir),
                    "rm {}".format(second_file),
                    "rmdir {}".format(work_dir),
                ]
                for cmd in commands:
                    try:
                        ret_code = general_utils.pcmd(
                            self.hostlist_clients, cmd, timeout=30)
                        # NOTE(review): a command only counts as failed when
                        # no host returned 0, so a failure on a subset of the
                        # hosts passes - confirm this is intended.
                        if 0 in ret_code:
                            continue
                        failed = [
                            str(node_set)
                            for code, node_set in ret_code.items()
                            if code != 0
                        ]
                        raise CommandFailure(
                            "Error running '{}' on the following hosts: {}"
                            .format(cmd, NodeSet(",".join(failed))))
                    except CommandFailure as error:
                        # Any failing command fails the whole test.
                        self.log.error("BashCmd Test Failed: %s", str(error))
                        self.fail("Test was expected to pass but it failed.\n")

                # Tear down the mount and the container used for this pass.
                self.stop_dfuse()
                self.container.destroy()
            # The pool is destroyed once all of its containers are done.
            self.pool.destroy()
コード例 #28
0
ファイル: dfuse_utils.py プロジェクト: liw/daos
    def stop(self):
        """Stop dfuse.

        Make up to three passes over the hosts that may still have dfuse
        mounted:
            1. only detect which hosts still have the mount point mounted
            2. issue the unmount command, then detect again
            3. pkill dfuse, issue the unmount command again (requested via
               get_umount_command(True)), then detect again

        Hosts that still have dfuse mounted after the last pass are reported as
        an error, as if pkill is necessary then dfuse itself has not worked
        correctly.

        Finally, try and remove the mount point, and that itself should work.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info("Stopping dfuse at %s on %s", self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse after the first unmount failed
                if counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if counter > 0:
                    # NOTE(review): expect_rc=None presumably disables the
                    # return code check in pcmd() - confirm.
                    pcmd(self.running_hosts,
                         self.get_umount_command(counter > 1),
                         expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append("Error stopping dfuse on {}".format(
                    self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                # Store the message, not the exception object: str.join()
                # below raises a TypeError for any non-string item.
                error_list.append(str(error))

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
コード例 #29
0
ファイル: daos_build.py プロジェクト: liw/daos
    def test_daos_build(self):
        """Jira ID: DAOS-8937.

        Test Description:
            This test builds DAOS on a dfuse filesystem.
        Use cases:
            Create Pool
            Create Posix container
            Mount dfuse
            Checkout and build DAOS sources.
        :avocado: tags=all,full_regression
        :avocado: tags=vm
        :avocado: tags=daosio,dfuse
        :avocado: tags=dfusedaosbuild
        """
        # Create the pool and container that will back the dfuse mount.
        self.add_pool(connect=False)
        self.add_container(self.pool)

        daos_cmd = self.get_daos_command()

        cache_mode = self.params.get('name', '/run/dfuse/*')
        intercept = self.params.get(
            'use_intercept', '/run/intercept/*', default=False)

        # Cache lifetime used whenever caching is enabled, and the default
        # build timeout in minutes.
        cache_time = '30m'
        build_time = 15

        # Container attributes to set, paired by position with the values
        # selected for the requested cache mode.
        attr_names = (
            'dfuse-data-cache',
            'dfuse-attr-time',
            'dfuse-dentry-time',
            'dfuse-ndentry-time',
        )
        attr_values = ()
        if cache_mode in ('writeback', 'writethrough'):
            attr_values = ('on', cache_time, cache_time, cache_time)
            if cache_mode == 'writethrough' and intercept:
                build_time = 120
        elif cache_mode == 'metadata':
            attr_values = ('off', cache_time, cache_time, cache_time)
        elif cache_mode == 'nocache':
            build_time = 210
            attr_values = ('off', '0', '0', '0')
        else:
            self.fail('Invalid cache_mode: {}'.format(cache_mode))

        for attr_name, attr_value in zip(attr_names, attr_values):
            daos_cmd.container_set_attr(pool=self.pool.uuid,
                                        cont=self.container.uuid,
                                        attr=attr_name,
                                        val=attr_value)

        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        mount_dir = self.dfuse.mount_dir.value
        build_dir = os.path.join(mount_dir, 'daos')
        venv_dir = os.path.join(mount_dir, 'venv')

        # Environment exported in front of every remote command.
        remote_env = OrderedDict()
        remote_env['PATH'] = '{}:$PATH'.format(os.path.join(venv_dir, 'bin'))
        remote_env['VIRTUAL_ENV'] = venv_dir

        if intercept:
            remote_env['LD_PRELOAD'] = os.path.join(
                self.prefix, 'lib64', 'libioil.so')
            remote_env['D_LOG_FILE'] = '/var/tmp/daos_testing/daos-il.log'
            remote_env['DD_MASK'] = 'all'
            remote_env['DD_SUBSYS'] = 'all'
            remote_env['D_LOG_MASK'] = 'INFO'

        preload_cmd = ';'.join(
            'export {}={}'.format(name, value)
            for name, value in remote_env.items())

        # The dependency build is run with --jobs 50 for speed/coverage; the
        # daos build itself does not yet work that way, so it is run without
        # the --jobs option.
        cmds = [
            'python3 -m venv {}/venv'.format(mount_dir),
            'git clone https://github.com/daos-stack/daos.git {}'.format(
                build_dir),
            'git -C {} submodule init'.format(build_dir),
            'git -C {} submodule update'.format(build_dir),
            'python3 -m pip install pip --upgrade',
            'python3 -m pip install -r {}/requirements.txt'.format(build_dir),
            'scons -C {} --jobs 50 build --build-deps=yes --deps-only'.format(
                build_dir),
            'scons -C {} build'.format(build_dir),
        ]
        for cmd in cmds:
            command = '{};{}'.format(preload_cmd, cmd)
            # Use a 10 minute timeout for most commands, but vary the build
            # timeout based on the dfuse mode.
            timeout = build_time * 60 if cmd.startswith('scons') else 10 * 60
            try:
                ret_code = general_utils.pcmd(
                    self.hostlist_clients, command, timeout=timeout)
                # NOTE(review): the step passes as soon as any host returned
                # 0; failures on only some hosts are not detected - confirm
                # this is intended.
                if 0 not in ret_code:
                    self.log.info(ret_code)
                    raise CommandFailure("Error running '{}'".format(cmd))
            except CommandFailure as error:
                self.log.error('BuildDaos Test Failed: %s', str(error))
                self.fail(
                    'Unable to build daos over dfuse in mode {}.\n'.format(
                        cache_mode))
コード例 #30
0
ファイル: job_manager_utils.py プロジェクト: liw/daos
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Raises:
            CommandFailure: if the command timed out or returned a non-zero
                exit status on any of the hosts

        Returns:
            list: a list of dictionaries including:
                "hosts": <NodeSet() of hosts with this data>
                "data": <journalctl output>

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = self.get_journalctl_command(since, until)
        self.log.info("Gathering log data on %s: %s", str(hosts), command)

        # Gather the log information per host
        results = run_pcmd(hosts, command, False, timeout, None)

        # Determine if the command completed successfully without a timeout.
        # Only the first problem is announced here; the per-host details are
        # logged by the loop below.
        status = True
        for result in results:
            if result["interrupted"]:
                self.log.info("  Errors detected running \"%s\":", command)
                self.log.info("    %s: timeout detected after %s seconds",
                              str(result["hosts"]), timeout)
                status = False
            elif result["exit_status"] != 0:
                self.log.info("  Errors detected running \"%s\":", command)
                status = False
            if not status:
                break

        # Display/return the command output
        log_data = []
        for result in results:
            if result["exit_status"] == 0 and not result["interrupted"]:
                # Add the successful output from each node to the dictionary
                log_data.append({
                    "hosts": result["hosts"],
                    "data": result["stdout"]
                })
                continue

            # Display all of the results in the case of an error
            stdout = result["stdout"]
            if len(stdout) > 1:
                self.log.info("    %s: rc=%s, output:",
                              str(result["hosts"]), result["exit_status"])
                for line in stdout:
                    self.log.info("      %s", line)
            else:
                # A failed or timed out command may not have produced any
                # output; indexing an empty list here would raise IndexError
                # and hide the CommandFailure raised below.
                self.log.info("    %s: rc=%s, output: %s",
                              str(result["hosts"]), result["exit_status"],
                              stdout[0] if stdout else "")

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data