Example #1
0
    def debug_numa_node(self, pci_addr_heads):
        """Dump numa_node debug info: /sys search plus hwloc-ls output.

        Args:
            pci_addr_heads (list): PCI address heads to look up in /sys.
        """
        for head in pci_addr_heads:
            self.log.debug("----- Search PCI Addr Head %s in /sys -----", head)
            find_cmd = "find /sys -name \"{}\"".format(head)
            run_pcmd(hosts=self.hostlist_servers, command=find_cmd, verbose=True)

        # hwloc-ls --whole-io --verbose offers another way to map a PCI
        # address to its NUMA node. Its output contains sections like:
        #
        # Bridge Host->PCI L#9 (P#2 buses=0000:[80-81])
        #     Bridge PCI->PCI (P#524320 busid=0000:80:02.0 id=8086:2f04
        #     class=0604(PCI_B) buses=0000:[81-81])
        #         PCI 8086:2701 (P#528384 busid=0000:81:00.0 class=0108(NVMExp)
        #         PCISlot=801)
        #
        # Here the PCI address is 0000:81:00.0 and the enclosing NUMA node
        # section identifies the socket. This is far more cumbersome than
        # reading numa_node directly, so it is only called for debugging.
        self.log.debug("----- Show PCI Address in hwloc-ls -----")
        pcmd(hosts=self.hostlist_servers,
             command="hwloc-ls --whole-io --verbose")
Example #2
0
    def get_device_info(self):
        """Get the available device names, their numa nodes, and their domains.

        Populates self.interfaces as a dict of
        {interface: {"numa": <int>, "domain": <str>}}. Default values come
        from the ib* device listing; Mellanox entries are then corrected from
        'mst status' output. Fails the test on non-homogeneous hosts, on
        unexpected parsing errors, or when no ib* interfaces are found.
        """
        self.interfaces = {}
        # List the network devices under /sys/class/net on all server hosts.
        command = "ls -1 {}".format(os.path.join(os.path.sep, "sys", "class", "net"))
        results = run_pcmd(self.hostlist_servers, command)
        # run_pcmd groups hosts with identical output; more than one group
        # means the hosts do not report the same device list.
        if len(results) != 1:
            self.fail("Error obtaining interfaces - non-homogeneous config")
        try:
            # Find any ib* device in the listing and initially use default numa and domain values
            # NOTE(review): the defaults assume numa node == device index and
            # an hfi1 (Omni-Path) domain; overwritten below for Mellanox.
            for index, interface in enumerate(re.findall(r"ib\d", "\n".join(results[0]["stdout"]))):
                self.interfaces[interface] = {"numa": index, "domain": "hfi1_{}".format(index)}
        except (IndexError, KeyError) as error:
            self.log.error("Error obtaining interfaces: %s", str(error))
            self.fail("Error obtaining interfaces - unexpected error")

        # Update interface domain and NUMA node settings for Mellanox devices through mst output:
        #   DEVICE_TYPE        MST   PCI       RDMA     NET       NUMA
        #   ConnectX6(rev:0)   NA    86:00.0   mlx5_1   net-ib1   1
        #   ConnectX6(rev:0)   NA    37:00.0   mlx5_0   net-ib0   0
        command = "sudo mst status -v"
        results = run_pcmd(self.hostlist_servers, command)
        try:
            # Only parse when mst ran successfully, i.e. the hosts actually
            # have Mellanox tools/hardware; otherwise keep the defaults.
            if results[0]["exit_status"] == 0:
                # Capture groups: (RDMA domain, net interface, NUMA node).
                regex = r"(mlx\d_\d)\s+net-(ib\d)\s+(\d)"
                for match in re.findall(regex, "\n".join(results[0]["stdout"])):
                    self.interfaces[match[1]]["numa"] = int(match[2])
                    self.interfaces[match[1]]["domain"] = match[0]
        except (IndexError, KeyError, ValueError) as error:
            self.log.error("Error obtaining interfaces: %s", str(error))
            self.fail("Error obtaining interfaces - unexpected error")

        if not self.interfaces:
            self.fail("No ib* interfaces found!")
Example #3
0
    def test_cpu_usage(self):
        """
        JIRA ID: DAOS-4826

        Test Description: Test CPU usage of formatted and idle engine.

        :avocado: tags=all,full_regression
        :avocado: tags=server
        :avocado: tags=cpu_usage
        """
        # Get PID of daos_engine with ps.
        ps_engine = r"ps -C daos_engine -o %\p"
        pid_found = False
        # At this point, daos_engine should be started, but do the repetitive
        # calls just in case.
        for _ in range(5):
            results = run_pcmd(hosts=self.hostlist_servers, command=ps_engine)
            for result in results:
                self.log.info("ps output = %s", "\n".join(result["stdout"]))
                # The last stdout line is the PID when a process matched;
                # otherwise it is still the "PID" header row.
                pid = result["stdout"][-1]
                self.log.info("PID = %s", pid)
                if "PID" not in pid:
                    pid_found = True
            if pid_found:
                break
            time.sleep(5)
        if not pid_found:
            self.fail("daos_engine PID couldn't be obtained!")

        # Poll up to 10 times until a usage value is read and is below 100%.
        for _ in range(10):
            # Get (instantaneous) CPU usage of the PID with top.
            top_pid = "top -p {} -b -n 1".format(pid)
            usage = -1
            results = run_pcmd(hosts=self.hostlist_servers, command=top_pid)
            for result in results:
                # The process row is the last line of top's batch output.
                process_row = result["stdout"][-1]
                self.log.info("Process row = %s", process_row)
                values = process_row.split()
                self.log.info("Values = %s", values)
                if len(values) < 9:
                    self.fail("{} returned invalid output!".format(top_pid))
                # NOTE(review): column 9 is assumed to be %CPU in the default
                # top field layout - confirm against the target distro's top.
                usage = values[8]
                self.log.info("CPU Usage = %s", usage)
            if usage != -1 and float(usage) < 100:
                break
            time.sleep(2)

        self.assertTrue(usage != -1,
                        "daos_engine CPU usage couldn't be obtained!")
        self.assertTrue(
            float(usage) < 100, "CPU usage is above 100%: {}%".format(usage))
Example #4
0
 def dump_attachinfo(self):
     """Collect attach info by running daos_agent's dump-attachinfo command."""
     job = self.manager.job
     job.set_sub_command("dump-attachinfo")
     job.sudo = True
     # Keep only the stdout lines from the first (common) result group.
     results = run_pcmd(self.hosts, str(job))
     self.attachinfo = results[0]["stdout"]
     self.log.info("Agent attachinfo: %s", self.attachinfo)
Example #5
0
    def get_current_state(self):
        """Get the current state of the daos_server ranks.

        Returns:
            dict: dictionary of server rank keys, each referencing a dictionary
                of information containing at least the following information:
                    {"host": <>, "uuid": <>, "state": <>}
                This will be empty if there was error obtaining the dmg system
                query output.

        """
        # Map each host to its rank (ranks follow host-list order).
        ranks = {}
        for rank, host in enumerate(self._hosts):
            ranks[host] = rank

        if self._verify_socket_dir:
            command = "prep {}".format(self.manager.job.command)
        else:
            command = "systemctl is-active {}".format(
                self.manager.job.service_name)

        data = {}
        for result in run_pcmd(self._hosts, command, 30):
            # expecting single line output from run_pcmd
            state = result["stdout"][-1]
            for node in result["hosts"]:
                data[ranks[node]] = {"host": node, "uuid": "-", "state": state}
        return data
Example #6
0
    def get_cpu_usage(self, pid, usage_limit):
        """Monitor CPU usage and return if it gets below usage_limit.

        Polls top up to 10 times (2 seconds apart) and returns as soon as an
        instantaneous CPU usage value below usage_limit is observed.

        Args:
            pid (str): daos_engine PID.
            usage_limit (int): Limit that we want daos_engine to use.

        Returns:
            str: daos_engine CPU usage as reported by top, or -1 (int) if no
                usage value could be obtained.
        """
        # The PID never changes between iterations, so build the loop-invariant
        # top command once instead of re-formatting it every poll.
        top_pid = "top -p {} -b -n 1".format(pid)
        for _ in range(10):
            # Reset each attempt so a failed poll isn't mistaken for success.
            usage = -1
            results = run_pcmd(hosts=self.hostlist_servers, command=top_pid)
            for result in results:
                # The process row is the last line of top's batch output.
                process_row = result["stdout"][-1]
                self.log.info("Process row = %s", process_row)
                values = process_row.split()
                self.log.info("Values = %s", values)
                if len(values) < 9:
                    self.fail("{} returned invalid output!".format(top_pid))
                # Column 9 of top's default batch layout is %CPU.
                usage = values[8]
                self.log.info("CPU Usage = %s", usage)
            if usage != -1 and float(usage) < usage_limit:
                break
            time.sleep(2)
        return usage
Example #7
0
def run_metrics_check(self, logging=True, prefix=None):
    """Monitor telemetry data.

    Args:
        self (obj): soak obj
        logging (bool): If True; output is logged to file
        prefix (str): add prefix to name; ie initial or final
    """
    if not self.params.get("enable_telemetry", "/run/*"):
        return

    engine_count = self.server_managers[0].get_config_value("engines_per_host")
    destination = self.outputsoakdir
    for engine in range(engine_count):
        # Per-engine CSV file name; prefix (if given) replaces the pass label.
        if prefix:
            name = prefix + "_metrics_{}.csv".format(engine)
        else:
            name = "pass" + str(self.loop) + "_metrics_{}.csv".format(engine)
        results = run_pcmd(
            hosts=self.hostlist_servers,
            command="sudo daos_metrics -S {} --csv".format(engine),
            verbose=(not logging),
            timeout=60)
        if not logging:
            continue
        # Write one log file per result group, tagged with its host set.
        for result in results:
            log_name = name + "-" + str(result["hosts"])
            write_logfile(result["stdout"], log_name, destination)
Example #8
0
    def service_running(self):
        """Determine if the job's service is active via the systemctl command.

        The 'systemctl is-active <service>' command will return a string
        indicating one of the following states:
            active, inactive, activating, deactivating, failed, unknown
        If the <service> is "active" or "activating" return True.

        Returns:
            bool: True if the service is running, False otherwise

        """
        status = True
        states = {}
        valid_states = ["active", "activating"]
        self._systemctl.unit_command.value = "is-active"
        # This object's string form is the full systemctl command to run.
        results = run_pcmd(self._hosts, str(self), False, self.timeout, None)
        for result in results:
            if result["interrupted"]:
                # A timed-out host counts as not running.
                states["timeout"] = result["hosts"]
                status = False
            else:
                # Single-line state output; group hosts by reported state.
                output = result["stdout"][-1]
                if output not in states:
                    states[output] = NodeSet()
                states[output].add(result["hosts"])
                status &= output in valid_states
        data = ["=".join([key, str(states[key])]) for key in sorted(states)]
        self.log.info("  Detected %s states: %s",
                      self._systemctl.service.value, ", ".join(data))
        return status
Example #9
0
    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Runs 'daos_server storage prepare' on all hosts and raises if any host
        fails or, when SCM is in use, reports that no SCM modules were found.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm storage.
                Defaults to None, which uses the configuration file to determine
                if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file to
                determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage

        """
        # Build 'daos_server storage prepare --target-user=<user> --force'.
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        # Restrict preparation to a single storage type when only one is in
        # use; with both (or neither) enabled no restriction flag is set.
        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        results = run_pcmd(self._hosts, str(cmd), timeout=self.storage_prepare_timeout.value)

        # gratuitously lifted from pcmd() and get_current_state()
        # Collapse per-host results into {exit_status: NodeSet} while also
        # accumulating all stdout for the SCM-detection check below.
        result = {}
        stdouts = ""
        for res in results:
            stdouts += '\n'.join(res["stdout"] + [''])
            if res["exit_status"] not in result:
                result[res["exit_status"]] = NodeSet()
            result[res["exit_status"]].add(res["hosts"])

        # Fail when statuses were mixed, when no host returned success, or
        # when SCM was requested but no modules were detected.
        if len(result) > 1 or 0 not in result or \
           (using_dcpm and "No SCM modules detected; skipping operation" in stdouts):
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            # Dump DIMM and namespace info to aid debugging before failing.
            pcmd(self._hosts, "sudo -n ipmctl show -v -dimm")
            pcmd(self._hosts, "ndctl list ")
            raise ServerFailed("Error preparing {} storage".format(dev_type))
Example #10
0
    def test_core_files(self):
        """Test to verify core file creation.

        This test will send a signal 6 to a random daos_engine process so
        that it will create a core file, allowing the core file collection code
        in launch.py to be tested.

        This test can be run in any CI stage: vm, small, medium, large

        :avocado: tags=all
        :avocado: tags=harness,harness_advanced_test,core_files
        :avocado: tags=test_core_files
        """
        # Pick a random server host and look up its daos_engine pid
        host = choice(self.server_managers[0].hosts) #nosec
        self.log.info("Obtaining pid of the daos_engine process on %s", host)
        pid = None
        result = run_pcmd([host], "pgrep --list-full daos_engine", 20)
        for entry in result:
            # Take the first "<pid> <command>" match from this result group.
            matches = findall(
                r"(\d+)\s+[A-Za-z0-9/]+", "\n".join(entry["stdout"]))
            if matches:
                pid = matches[0]
                break
        if pid is None:
            self.fail(
                "Error obtaining pid of the daos_engine process on "
                "{}".format(host))
        self.log.info("Found pid %s", pid)

        # Send a signal 6 to its daos_engine process
        self.log.info("Sending a signal 6 to %s", pid)
        result = run_pcmd([host], "sudo kill -6 {}".format(pid))
        if len(result) > 1 or result[0]["exit_status"] != 0:
            self.fail("Error sending a signal 6 to {} on {}".format(pid, host))

        # Display the journalctl log for the process that was sent the signal
        self.server_managers[0].manager.dump_logs([host])

        # Simplify resolving the host name to rank by marking all ranks as
        # expected to be either running or errored (sent a signal 6)
        self.server_managers[0].update_expected_states(
            None, ["Joined", "Errored"])
Example #11
0
    def verify_ssd_sockets(self, storage_dict):
        """Main test component.

        Verify every NVMe device's reported socket ID against the numa_node
        value read from /sys on the first server host.

        Args:
            storage_dict (dict): Dictionary under "storage"

        Returns:
            list: List of errors.
        """
        errors = []
        pci_addr_heads = []

        for nvme_device in storage_dict["nvme_devices"]:
            cmd_socket_id = nvme_device["socket_id"]

            # The first two fields of the PCI address form the bus head used
            # to locate the numa_node file under /sys.
            head = ":".join(nvme_device["pci_addr"].split(":")[:2])
            pci_addr_heads.append(head)
            numa_node_path = "/sys/class/pci_bus/{}/device/numa_node".format(
                head)

            # Call cat on the server host, not necessarily the local test host.
            results = run_pcmd(hosts=[self.hostlist_servers[0]],
                               command="cat {}".format(numa_node_path))

            for result in results:
                # Compare the file content against the reported socket ID.
                fs_socket_id = result["stdout"][-1]
                if fs_socket_id != str(cmd_socket_id):
                    errors.append(
                        "Unexpected socket ID! Cmd: {}; FS: {}".format(
                            cmd_socket_id, fs_socket_id))

        if errors:
            # Since we're dealing with system files and we don't have access to
            # them in CI, we need some debugging info when the test fails to
            # better understand the result.
            self.debug_numa_node(pci_addr_heads)

        return errors
Example #12
0
    def test_nvme_io_stats(self):
        """Jira ID: DAOS-4722.

        Test Description:
            Purpose of this test is to run IO test and check when NVME_IO_STATS
            enabled in config, it generates the different statistics.

        Use case:
            Run ior and it will print the NVMe IO stats to control plane log
            file.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
        """
        # run ior
        self.run_ior_with_pool()

        # Get the NVMe IO statistics from server control_log file.
        cmd = 'cat {}'.format(get_log_file(self.control_log))
        results = run_pcmd(self.hostlist_servers, cmd)
        for result in results:
            if result["exit_status"] == 1:
                self.fail("Failed to run cmd {} on {}".format(
                    cmd, result["hosts"]))

            # Verify statistics are increasing for IO
            # Collect the log lines for each of the 8 targets, tgt[0]..tgt[7].
            target_stats = []
            for _tmp in range(8):
                target_stats.append([
                    line for line in result["stdout"]
                    if "tgt[{}]".format(_tmp) in line
                ])
            # NOTE(review): NVME_STATS is a module-level list of stat-name
            # patterns defined outside this snippet - verify its contents.
            for stats in NVME_STATS:
                for _tgt in range(len(target_stats)):
                    # First numeric value of the stat field in the earliest
                    # matching log line for this target.
                    first_stats = re.findall(r'\d+', [
                        x for x in target_stats[_tgt][0].split()
                        if re.search(stats, x)
                    ][0])[0]
                    # Same stat field in the latest matching log line.
                    last_stats = re.findall(r'\d+', [
                        x for x in target_stats[_tgt][-1].split()
                        if re.search(stats, x)
                    ][0])[0]
                    # Last statistic should be higher than initial statistics
                    if int(first_stats) >= int(last_stats):
                        self.fail(
                            "Failed: Stats {} for target {} did not increased "
                            "First_stat={} < Last_stat={}".format(
                                stats, _tgt, first_stats, last_stats))
Example #13
0
File: ec_utils.py Project: liw/daos
    def stop_job_managers(self):
        """Cleanup dfuse in case of test failure.

        Force-kills dfuse and unmounts its mount point on all client hosts,
        and stops the parent class's job managers when a cleanup command
        fails on any node.

        Returns:
            list: error messages for any cleanup commands that failed.
        """
        error_list = []
        dfuse_cleanup_cmd = [
            "pkill dfuse --signal KILL",
            "fusermount3 -uz {}".format(self.dfuse.mount_dir.value)
        ]

        for cmd in dfuse_cleanup_cmd:
            results = run_pcmd(self.hostlist_clients, cmd)
            for result in results:
                if result["exit_status"] != 0:
                    # Bug fix: list.append() takes a single argument, so the
                    # message must be formatted here - the original passed the
                    # format args as extra append() arguments (TypeError).
                    error_list.append(
                        "Errors detected during cleanup cmd {} on node "
                        "{}".format(cmd, str(result["hosts"])))
                    error_list.extend(super().stop_job_managers())
        return error_list
Example #14
0
    def test_cpu_usage(self):
        """
        JIRA ID: DAOS-4826

        Test Description: Test CPU usage of formatted and idle engine.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,small
        :avocado: tags=server,cpu_usage
        """
        # Locate the daos_engine PID via ps; the engine should already be up,
        # but retry a few times in case it is still starting.
        ps_engine = r"ps -C daos_engine -o %\p"
        pid_found = False
        for _ in range(5):
            for result in run_pcmd(
                    hosts=self.hostlist_servers, command=ps_engine):
                self.log.info("ps output = %s", "\n".join(result["stdout"]))
                # Last stdout line is a PID when a process matched; otherwise
                # it is still the "PID" header row.
                pid = result["stdout"][-1]
                self.log.info("PID = %s", pid)
                pid_found = pid_found or "PID" not in pid
            if pid_found:
                break
            time.sleep(5)
        if not pid_found:
            self.fail("daos_engine PID couldn't be obtained!")

        # Get and verify CPU usage.
        usage_limit = self.params.get("usage_limit", '/run/*')
        usage = self.get_cpu_usage(pid=pid, usage_limit=usage_limit)
        self.verify_usage(usage=usage, usage_limit=usage_limit)

        # Create a pool, container, and run IOR. IO will invoke CPU usage by
        # daos_engine.
        self.run_ior_with_pool()

        # Verify that the CPU usage goes down after IO.
        usage = self.get_cpu_usage(pid=pid, usage_limit=usage_limit)
        self.verify_usage(usage=usage, usage_limit=usage_limit)
Example #15
0
    def verify_storage_scam_scm(self, storage_dict):
        """Main test component.

        Verify each SCM namespace exists under /dev and that the numa_node
        value in /sys matches the NUMA node reported by the storage scan.

        Args:
            storage_dict (dict): Dictionary under "storage"

        Returns:
            list: List of errors.
        """
        errors = []
        RC_SUCCESS = 0

        for scm_namespace in storage_dict["scm_namespaces"]:
            pmem_name = scm_namespace["blockdev"]

            # The namespace block device must exist under /dev on every host.
            lscmd = "{} {}".format("ls", os.path.join("/dev", pmem_name))
            # pcmd returns a dictionary keyed by return code.
            if RC_SUCCESS not in pcmd(
                    hosts=self.hostlist_servers, command=lscmd):
                errors.append("{} didn't exist under /dev!".format(pmem_name))

            # Read the numa_node file for this block device from /sys.
            numa_node_path = "/sys/class/block/{}/device/numa_node".format(
                pmem_name)
            command = "cat {}".format(numa_node_path)
            out_list = run_pcmd(hosts=self.hostlist_servers, command=command)

            # Both values are compared as strings; the /sys content is the
            # reference and the scan-reported numa_node is under test.
            expected_numa_node = out_list[0]["stdout"][0]
            actual_numa_node = str(scm_namespace["numa_node"])
            if expected_numa_node != actual_numa_node:
                errors.append(
                    "Unexpected Socket ID! Expected: {}, Actual: {}".format(
                        expected_numa_node, actual_numa_node))

        return errors
Example #16
0
    def get_daos_metrics(self, verbose=False, timeout=60):
        """Get daos_metrics for the server.

        Args:
            verbose (bool, optional): pass verbose to run_pcmd. Defaults to False.
            timeout (int, optional): pass timeout to each execution of run_pcmd. Defaults to 60.

        Returns:
            list: list of pcmd results for each host. See general_utils.run_pcmd for details.
                [
                    general_utils.run_pcmd(), # engine 0
                    general_utils.run_pcmd()  # engine 1
                ]

        """
        # Default to a single engine when the config value is unset/falsy.
        engine_total = self.get_config_value("engines_per_host") or 1
        exe = os.path.join(self.manager.job.command_path, "daos_metrics")
        # Run daos_metrics once per engine index and collect each result set.
        return [
            run_pcmd(
                hosts=self._hosts, verbose=verbose, timeout=timeout,
                command="sudo {} -S {} --csv".format(exe, engine))
            for engine in range(engine_total)
        ]
Example #17
0
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            list: a list of dictionaries including:
                "hosts": <NodeSet() of hosts with this data>
                "data": <journalctl output>

        Raises:
            CommandFailure: if any host timed out or returned a non-zero exit
                status while gathering the log data.

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = self.get_journalctl_command(since, until)
        self.log.info("Gathering log data on %s: %s", str(hosts), command)

        # Gather the log information per host
        results = run_pcmd(hosts, command, False, timeout, None)

        # Determine if the command completed successfully without a timeout
        status = True
        for result in results:
            if result["interrupted"]:
                self.log.info("  Errors detected running \"%s\":", command)
                self.log.info("    %s: timeout detected after %s seconds",
                              str(result["hosts"]), timeout)
                status = False
            elif result["exit_status"] != 0:
                self.log.info("  Errors detected running \"%s\":", command)
                status = False
            # Stop checking at the first failure; remaining results are still
            # displayed by the loop below.
            if not status:
                break

        # Display/return the command output
        log_data = []
        for result in results:
            if result["exit_status"] == 0 and not result["interrupted"]:
                # Add the successful output from each node to the dictionary
                log_data.append({
                    "hosts": result["hosts"],
                    "data": result["stdout"]
                })
            else:
                # Display all of the results in the case of an error
                if len(result["stdout"]) > 1:
                    self.log.info("    %s: rc=%s, output:",
                                  str(result["hosts"]), result["exit_status"])
                    for line in result["stdout"]:
                        self.log.info("      %s", line)
                else:
                    # NOTE(review): assumes at least one stdout line exists for
                    # a failed command - an empty stdout would raise IndexError.
                    self.log.info("    %s: rc=%s, output: %s",
                                  str(result["hosts"]), result["exit_status"],
                                  result["stdout"][0])

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data
Example #18
0
File: version.py Project: liw/daos
    def test_version(self):
        """Verify version number for dmg, daos, daos_server, and daos_agent against RPM.

        :avocado: tags=all,full_regression
        :avocado: tags=vm
        :avocado: tags=control
        :avocado: tags=version_number
        """
        errors = []

        # Bug fix: pre-initialize every version so that a failed lookup (which
        # only appends to errors) cannot raise NameError in the comparison
        # loop at the end of the test.
        rpm_version = None
        dmg_version = None
        daos_version = None
        daos_agent_version = None
        daos_server_version = None

        # Get RPM version.
        rpm_command = "rpm -qa|grep daos-server"
        output = run_pcmd(hosts=self.hostlist_servers, command=rpm_command)
        self.log.info("RPM output = %s", output)
        stdout = output[0]["stdout"][0]
        self.log.info("RPM stdout = %s", stdout)
        result = re.findall(r"daos-server-[tests-|tests_openmpi-]*([\d.]+)", stdout)
        if not result:
            errors.append("RPM version is not in the output! {}".format(output))
        else:
            rpm_version = result[0]
            self.log.info("RPM version = %s", rpm_version)

        # Get dmg version.
        dmg_cmd = self.get_dmg_command()
        output = dmg_cmd.version().stdout.decode("utf-8")

        # Verify that "dmg version" is in the output.
        if "dmg version" not in output:
            errors.append("dmg version is not in the output! {}".format(output))

        result = re.findall(r"dmg version ([\d.]+)", output)
        if not result:
            errors.append("Failed to obtain dmg version! {}".format(output))
        else:
            dmg_version = result[0]
            self.log.info("dmg version = %s", dmg_version)

        # Get daos version.
        daos_cmd = self.get_daos_command()
        output = daos_cmd.version().stdout.decode("utf-8")

        # Verify that "daos version" is in the output.
        if "daos version" not in output:
            errors.append("daos version is not in the output! {}".format(output))

        result = re.findall(r"daos version ([\d.]+)", output)
        if not result:
            errors.append("Failed to obtain daos version! {}".format(output))
        else:
            daos_version = result[0]
            self.log.info("daos version = %s", daos_version)

        # Get daos_agent version.
        daos_agent_cmd = "daos_agent version"
        output = run_pcmd(hosts=self.hostlist_servers, command=daos_agent_cmd)
        stdout = output[0]["stdout"][0]

        # Verify that "DAOS Agent" is in the output.
        if "DAOS Agent" not in stdout:
            errors.append("DAOS Agent is not in the output! {}".format(stdout))

        result = re.findall(r"DAOS Agent v([\d.]+)", stdout)
        if not result:
            errors.append("Failed to obtain daos_agent version! {}".format(output))
        else:
            daos_agent_version = result[0]
            self.log.info("daos_agent version = %s", daos_agent_version)

        # Get daos_server version
        daos_server_cmd = "daos_server version"
        output = run_pcmd(hosts=self.hostlist_servers, command=daos_server_cmd)
        stdout = output[0]["stdout"][0]

        # Verify that "DAOS Control Server" is in the output.
        if "DAOS Control Server" not in stdout:
            errors.append("DAOS Control Server is not in the output! {}".format(stdout))

        result = re.findall(r"DAOS Control Server v([\d.]+)", stdout)
        if not result:
            errors.append("Failed to obtain daos_server version! {}".format(output))
        else:
            daos_server_version = result[0]
            self.log.info("daos_server version = %s", daos_server_version)

        # Verify the tool versions against the RPM.
        tool_versions = [
            ("dmg", dmg_version),
            ("daos", daos_version),
            ("daos_agent", daos_agent_version),
            ("daos_server", daos_server_version)
        ]

        for tool, version in tool_versions:
            if version is None or rpm_version is None:
                # The failed lookup was already recorded in errors above;
                # there is nothing meaningful to compare.
                continue
            if version != rpm_version:
                msg = "Unexpected version! {} = {}, RPM = {}".format(
                    tool, version, rpm_version)
                errors.append(msg)

        if errors:
            self.fail("\n---- Errors detected! ----\n{}".format("\n".join(errors)))
Example #19
0
    def test_ec_truncate(self):
        """Jira ID: DAOS-7328.

        Test Description:
            Verify the truncate on EC object class works fine over fuse.

        Use Cases:
            Create the container with EC class
            Create the data file with verify pattern over Fuse
            Truncate the file and increase the size
            Verify the data content and file size
            Truncate the file and reduce the size to original
            Verify the data content and file size

        :avocado: tags=all,full_regression
        :avocado: tags=hw,large,ib2
        :avocado: tags=ec,ec_fio,ec_posix
        :avocado: tags=ec_truncate
        """
        truncate_size = int(self.params.get("truncate_size", '/run/fio/*'))
        fname = self.params.get("names", '/run/fio/*')

        # Write the file using Fio
        self.execute_fio(stop_dfuse=False)

        # Get the fuse file name.
        testfile = "{}.0.0".format(
            os.path.join(self.dfuse.mount_dir.value, fname[0]))
        original_fs = int(self.fio_cmd._jobs['test'].size.value)

        # Read and verify the original data.
        self.fio_cmd._jobs['test'].rw = 'read'
        self.fio_cmd.run()

        # Get the file stats and confirm size
        file_size = get_remote_file_size(self.hostlist_clients[0], testfile)
        self.assertEqual(original_fs, file_size)

        # Truncate the original file which will extend the size of file.
        result = run_pcmd(self.hostlist_clients,
                          "truncate -s {} {}".format(truncate_size, testfile))
        # Fail on any non-zero exit status on any client host. The previous
        # check only caught exit status exactly 1 in the first result group,
        # so other failure codes (or a failure on another host) passed
        # silently.
        if any(item["exit_status"] != 0 for item in result):
            self.fail("Failed to truncate file {}".format(testfile))

        # Verify the file size is extended.
        file_size = get_remote_file_size(self.hostlist_clients[0], testfile)
        self.assertEqual(truncate_size, file_size)

        # Read and verify the data after truncate.
        self.fio_cmd.run()

        # Truncate the original file and shrink to original size.
        result = run_pcmd(self.hostlist_clients,
                          "truncate -s {} {}".format(original_fs, testfile))
        if any(item["exit_status"] != 0 for item in result):
            self.fail("Failed to truncate file {}".format(testfile))

        # Verify the file size is shrink to original.
        file_size = get_remote_file_size(self.hostlist_clients[0], testfile)
        self.assertEqual(original_fs, file_size)

        # Read and verify the data after truncate.
        self.fio_cmd.run()
Example #20
0
    def test_stat_parameters(self):
        """JIRA ID: DAOS-3769

        Create files of 1M, 10M, 100M, 500M, and verify the size and creation
        time.

        :avocado: tags=all,full_regression
        :avocado: tags=small
        :avocado: tags=dfuse
        :avocado: tags=stat_parameters
        """
        block_sizes = self.params.get("block_sizes", "/run/*")
        error_list = []

        self.add_pool(connect=False)
        self.add_container(pool=self.pool)

        # Use a 1-based counter to build a unique test file suffix per block
        # size (replaces the manual i = 1 / i += 1 bookkeeping).
        for count, block_size in enumerate(block_sizes, start=1):
            self.log.info("Block Size = %s", block_size)
            self.ior_cmd.block_size.update(block_size)

            # 1. Verify creation time.
            test_file_suffix = "_{}".format(count)

            # Run ior command.
            try:
                self.run_ior_with_pool(timeout=200,
                                       stop_dfuse=False,
                                       create_pool=False,
                                       create_cont=False,
                                       test_file_suffix=test_file_suffix)
            except TestFail:
                self.log.info("ior command failed!")

            # Get current epoch from the client node.
            output = run_pcmd(hosts=self.hostlist_clients, command="date +%s")
            stdout = output[0]["stdout"]
            self.log.info("date stdout = %s", stdout)
            current_epoch = stdout[-1]

            # Get epoch of the created file. (technically %Z is for last status
            # change. %W is file birth, but it returns 0.)
            # As in date command, run stat command in the client node.
            stat_command = "stat -c%Z {}".format(self.ior_cmd.test_file.value)
            output = run_pcmd(hosts=self.hostlist_clients,
                              command=stat_command)
            stdout = output[0]["stdout"]
            self.log.info("stat stdout = %s", stdout)
            creation_epoch = stdout[-1]

            # Calculate the epoch difference between the creation time and the
            # value in the file metadata. They're usually 2 sec apart.
            # Compare the absolute difference: the file is created before the
            # date command runs, so creation - current is normally <= 0 and a
            # signed "creation - current > 10" check could never trigger.
            creation_epoch_int = int(creation_epoch)
            current_epoch_int = int(current_epoch)
            diff_epoch = abs(current_epoch_int - creation_epoch_int)
            if diff_epoch > 10:
                msg = "Unexpected creation time! Expected = {}; Actual = {}"
                error_list.append(
                    msg.format(current_epoch_int, creation_epoch_int))

            # 2. Verify file size.
            # Get file size.
            file_size = get_remote_file_size(self.hostlist_clients[0],
                                             self.ior_cmd.test_file.value)

            # Adjust the file size and verify that it matches the expected
            # size. block_size is e.g. "500M"; strip the unit suffix.
            expected_size = block_size[:-1]
            # Obtained size is in byte, so convert it to MB. True division is
            # deliberate: a partial-MB mismatch must not be masked by flooring.
            file_size_adjusted = file_size / 1024 / 1024
            if int(expected_size) != file_size_adjusted:
                msg = "Unexpected file size! Expected = {}; Actual = {}"
                error_list.append(
                    msg.format(int(expected_size), file_size_adjusted))

        if error_list:
            self.fail("\n----- Errors detected! -----\n{}".format(
                "\n".join(error_list)))