Example #1
0
    def get_port_cnt(self, hosts, port_counter):
        """Collect one InfiniBand port counter for every interface in self.interfaces.

        For each interface the counter file
        /sys/class/infiniband/<domain>/ports/1/counters/<port_counter> is first
        checked with check_file_exists(); self.fail() is called if that check
        reports a failure. The file is then read with "cat" through
        get_host_data().

        Args:
            hosts (list): list of hosts
            port_counter (str): name of the port counter file to read

        Returns:
            dict: the collected data, keyed by interface, then host, then the
                port number (always 1), then port_counter

        """
        results = {}
        for interface in self.interfaces:
            # Build the sysfs path of the counter for this interface's domain
            domain = self.interfaces[interface]["domain"]
            counter_path = os.path.join(
                os.sep, "sys", "class", "infiniband", domain, "ports", "1", "counters",
                port_counter)

            # The counter file has to be present before it can be read
            exists_result = check_file_exists(hosts, counter_path)
            if not exists_result[0]:
                self.fail("{}: {} not found".format(exists_result[1], counter_path))

            collected = get_host_data(
                hosts, "cat {}".format(counter_path), "{} port_counter".format(interface),
                "Error obtaining {} info".format(port_counter), 20)

            # Expand each group of hosts so that every host gets its own entry
            per_host = {}
            for entry in collected:
                per_host.update(
                    {host: {1: {port_counter: entry["data"]}} for host in entry["hosts"]})
            results[interface] = per_host
        return results
Example #2
0
def run_event_check(self, since, until):
    """Run a check on specific events in journalctl.

    The event patterns are read from the "events" test parameter and each one
    is applied as a regular expression to every line of the kernel and
    daos_server journal entries collected from the server hosts for the
    requested time window.

    Args:
        self (obj): soak obj
        since (str): start of the time window, passed to journalctl --since
        until (str): end of the time window, passed to journalctl --until

    Returns:
        list: any system log lines that matched one of the events; a line
            matching several events is listed once per matching event

    """
    events_found = []
    # to do: currently all events are from - t kernel;
    # when systemctl is enabled add daos events
    events = self.params.get("events", "/run/*")
    if not events:
        return events_found

    # check events on all nodes
    hosts = list(set(self.hostlist_servers))
    command = ("sudo /usr/bin/journalctl --system -t kernel -t "
               "daos_server --since=\"{}\" --until=\"{}\"".format(
                   since, until))
    err = "Error gathering system log events"

    # The command does not depend on the event being searched for, so the
    # log is collected from the hosts only once and reused for every event.
    log_lines = []
    for output in get_host_data(hosts, command, "journalctl", err):
        log_lines.extend(output["data"].splitlines())

    for event in events:
        matcher = re.compile(r"{}".format(event))
        matched = [line for line in log_lines if matcher.search(str(line))]
        events_found.extend(matched)
        # Report the count for this event only, not a running total
        self.log.info(
            "Found %s instances of %s in system log from %s through %s",
            len(matched), event, since, until)
    return events_found
Example #3
0
    def get_log_info(self, hosts, dev, env_state, log_file):
        """Check the start of a daos log file for the OFI_INTERFACE device in use.

        The first 50 lines of the log file (or of its ".old" copy when one
        exists) are searched for the "Using client provided OFI_INTERFACE"
        message naming the device.

        Args:
            hosts (list): list of hosts
            dev (str): device name expected in the log message; it is inserted
                into the search regular expression as is
            env_state (bool): whether the OFI_INTERFACE env variable was set
            log_file (str): log file to verify

        Returns:
            bool: True if exactly one message was found and env_state is set,
                or if env_state is not set and the number of messages found is
                anything other than one; False otherwise.

        """
        # anticipate log switch
        head_cmd = (
            "if [ -f {0}.old ]; then head -50 {0}.old; else head -50 {0};"
            "fi".format(log_file))
        regex = r"Using\s+client\s+provided\s+OFI_INTERFACE:\s+{}".format(dev)

        # NOTE(review): the count is replaced, not accumulated, on each pass so
        # only the last entry returned by get_host_data() is reflected here --
        # confirm this is intended when more than one entry is returned.
        matches = 0
        for entry in get_host_data(hosts, head_cmd, log_file, "Error getting log data."):
            matches = len(re.findall(regex, entry["data"]))
        self.log.info(
            "Found %s instances of client setting up OFI_INTERFACE=%s",
            matches, dev)

        # Verify: a single message is expected only when the variable is set
        return bool(env_state) == (matches == 1)
Example #4
0
    def get_port_cnt(self, hosts, dev, port_counter):
        """Read an InfiniBand port counter of a device.

        The counter file /sys/class/infiniband/<dev>/ports/1/counters/<port_counter>
        is first checked with check_file_exists(); self.fail() is called if that
        check reports a failure. The file is then read with "cat" through
        get_host_data().

        Args:
            hosts (list): list of hosts
            dev (str): device to get counter information for
            port_counter (str): port counter to get information from

        Returns:
            list: the "data" value of each entry returned by get_host_data()

        """
        counter_path = os.path.join(
            "/sys/class/infiniband/{}".format(dev), "ports/1/counters", port_counter)

        # The counter file has to be present before it can be read
        exists_result = check_file_exists(hosts, counter_path)
        if not exists_result[0]:
            self.fail("{}: {} not found".format(exists_result[1], counter_path))

        collected = get_host_data(
            hosts, "cat {}".format(counter_path), "port_counter",
            "Error obtaining {} info".format(port_counter), 20)

        counter_values = []
        for entry in collected:
            counter_values.append(entry["data"])
        return counter_values
Example #5
0
    def get_host_nvme_data(self):
        """Collect the NVMe device sizes, in bytes, reported by lsblk on self.hosts.

        The lsblk output is filtered down to the lines containing "nvme" and the
        command is run through get_host_data() using self.timeout.

        Returns:
            the value returned by get_host_data(); presumably the data found
            for each group of hosts -- verify against get_host_data()

        """
        return get_host_data(
            self.hosts,
            "lsblk -b -o SIZE,NAME | grep nvme",
            "NVMe",
            "No NVMe drives bound to the kernel driver detected",
            self.timeout)
Example #6
0
    def get_host_mem_data(self):
        """Collect the total memory, in bytes, reported by free on self.hosts.

        Only the first number of the "Mem:" line of the "free -b" output is
        kept. The command is run through get_host_data() using self.timeout.

        Returns:
            the value returned by get_host_data(); presumably the data found
            for each group of hosts -- verify against get_host_data()

        """
        return get_host_data(
            self.hosts,
            r"free -b | sed -En 's/Mem:\s+([0-9]+).*/\1/p'",
            "memory",
            "Error obtaining total memory size",
            self.timeout)
Example #7
0
    def get_host_scm_data(self):
        """Get the total SCM capacity in bytes for each host.

        The capacity is taken from the "Capacity=" line of the
        "ipmctl show -units B -memoryresources" output. The command is run
        through get_host_data() on self.hosts using self.timeout.

        Returns:
            the value returned by get_host_data(); presumably the data found
            for each group of hosts -- verify against get_host_data()

        """
        cmd_list = [
            "sudo -n ipmctl show -units B -memoryresources",
            # Capture the whole run of digits. The "+" has to follow the
            # bracket expression: placed inside it, only a single character
            # (a digit or a literal "+") would be captured.
            r"sed -En 's/^Capacity\=([0-9]+).*/\1/p'",
        ]
        cmd = " | ".join(cmd_list)
        text = "SCM"
        error = "No SCM devices detected"
        return get_host_data(self.hosts, cmd, text, error, self.timeout)
Example #8
0
    def get_superblock_info(self, sp_file, sp_value):
        """Extract one value from the superblock file on each host.

        The file is read with "cat" and filtered with sed so that only the
        value following "<sp_value>:" is printed. The command is run through
        get_host_data() on self.dmg.hostlist.

        Args:
            sp_file (str): path passed to "cat"; described by the original
                documentation as the scm mount path -- presumably the
                superblock file itself, verify against the callers
            sp_value (str): superblock file value to extract.
                i.e. version, uuid, system, rank, validrank, ms

        Returns:
            the value returned by get_host_data(); presumably the data found
            for each group of hosts -- verify against get_host_data()

        """
        # Match "<sp_value>: <value>" at the start of a line, capturing the value
        value_regex = r"^{}:\s+([_a-z0-9-]+).*".format(sp_value)
        return get_host_data(
            self.dmg.hostlist,
            r"cat {} | sed -En 's/{}/\1 /gp'".format(sp_file, value_regex),
            "superblock",
            "Error obtaining superblock info: {}".format(sp_value),
            20)