Example 1
    def test_cpu_usage(self):
        """
        JIRA ID: DAOS-4826
        Test Description: Test CPU usage of formatted and idle engine.
        :avocado: tags=all,full_regression
        :avocado: tags=server
        :avocado: tags=cpu_usage
        """
        # Get PID of daos_engine with ps.
        ps_engine = r"ps -C daos_engine -o %\p"
        pid_found = False
        # At this point, daos_engine should be started, but do the repetitive
        # calls just in case.
        for _ in range(5):
            task = run_task(hosts=self.hostlist_servers, command=ps_engine)
            for output, _ in task.iter_buffers():
                self.log.info("ps output = %s", output)
                pid = str(output).splitlines()[-1]
                self.log.info("PID = %s", pid)
                if "PID" not in pid:
                    pid_found = True
            if pid_found:
                break
            time.sleep(5)
        if not pid_found:
            self.fail("daos_engine PID couldn't be obtained!")

        for _ in range(10):
            # Get (instantaneous) CPU usage of the PID with top.
            top_pid = "top -p {} -b -n 1".format(pid)
            usage = -1
            task = run_task(hosts=self.hostlist_servers, command=top_pid)
            for output, _ in task.iter_buffers():
                process_row = str(output).splitlines()[-1]
                self.log.info("Process row = %s", process_row)
                values = process_row.split()
                self.log.info("Values = %s", values)
                if len(values) < 9:
                    self.fail("{} returned invalid output!".format(top_pid))
                usage = values[8]
                self.log.info("CPU Usage = %s", usage)
            if usage != -1 and float(usage) < 100:
                break
            time.sleep(2)

        self.assertTrue(usage != -1,
                        "daos_engine CPU usage couldn't be obtained!")
        self.assertTrue(
            float(usage) < 100, "CPU usage is above 100%: {}%".format(usage))
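
The run_task helper used throughout these examples wraps ClusterShell's task engine. Below is a minimal sketch of the equivalent direct ClusterShell calls, assuming ClusterShell is installed and passwordless ssh to the hosts; the host names are hypothetical placeholders and this is not the DAOS test-framework wrapper itself.

from ClusterShell.NodeSet import NodeSet
from ClusterShell.Task import task_self

# Run a command on remote hosts.
task = task_self()
task.run("ps -C daos_engine -o %p", nodes="wolf-[1-2]", timeout=30)

# iter_buffers() groups identical output; each item is (buffer, node_list).
for output, nodes in task.iter_buffers():
    print("{}: {}".format(NodeSet.fromlist(nodes), str(output)))

# iter_retcodes() groups hosts by exit code.
for retcode, nodes in task.iter_retcodes():
    print("rc={} on {}".format(retcode, NodeSet.fromlist(nodes)))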
Example 2
def get_interface_ib_name(hosts, interface, verbose=True):
    """Get the InfiniBand name of this network interface on each host.

    Args:
        hosts (NodeSet): hosts on which to detect the InfiniBand name
        interface (str): interface for which to obtain the InfiniBand name
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of InfiniBand name keys and NodeSet values on which they were detected

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    command = f"ls -1 {os.path.join(net_path, interface, 'device', 'infiniband')}"
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of IB names with a NodeSet of hosts on which each was detected
    ib_names = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            ib_name_list = []
            for line in output:
                match = re.findall(r"([A-Za-z0-9;_+]+)", line.decode("utf-8"))
                if len(match) == 1:
                    ib_name_list.append(match[0])
            if ib_name_list:
                ib_names[",".join(ib_name_list)] = NodeSet.fromlist(nodelist)

    return ib_names
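
A hypothetical caller might use the returned dictionary to confirm that every host reports the same InfiniBand name. This is a sketch only, assuming get_interface_ib_name is in scope; the host and interface names are placeholders.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
ib_names = get_interface_ib_name(hosts, "ib0")

# A single key covering all hosts means the name is consistent across the cluster.
for name, nodeset in ib_names.items():
    print("InfiniBand name {} detected on {}".format(name, nodeset))
if len(ib_names) != 1:
    print("Warning: inconsistent InfiniBand names across {}".format(hosts))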
Example 3
    def service_running(self):
        """Determine if the job's service is active via the systemctl command.

        The 'systemctl is-active <service>' command will return a string
        indicating one of the following states:
            active, inactive, activating, deactivating, failed, unknown
        If the <service> is "active" or "activating" return True.

        Returns:
            bool: True if the service is running, False otherwise

        """
        status = True
        states = {}
        valid_states = ["active", "activating"]
        self._systemctl.unit_command.value = "is-active"
        task = run_task(self._hosts, self.__str__(), self.timeout)
        for output, nodelist in task.iter_buffers():
            output = str(output)
            nodeset = NodeSet.fromlist(nodelist)
            status &= output in valid_states
            if output not in states:
                states[output] = NodeSet()
            states[output].add(nodeset)
        if self.timeout and task.num_timeout() > 0:
            nodeset = NodeSet.fromlist(task.iter_keys_timeout())
            states["timeout"] = nodeset
        data = ["=".join([key, str(states[key])]) for key in sorted(states)]
        self.log.info("  Detected %s states: %s",
                      self._systemctl.service.value, ", ".join(data))
        return status
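
A caller typically polls this method while waiting for the service to start. The sketch below is illustrative only; the retry count, delay, and the job_manager name are assumptions, not part of the class above.

import time

def wait_for_service(job_manager, retries=10, delay=5):
    """Poll service_running() until it reports True or retries run out."""
    for _ in range(retries):
        if job_manager.service_running():
            return True
        time.sleep(delay)
    return False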
Example 4
    def get_nvme_readlink(self):
        """Get NVMe readlink from servers.

        Returns:
            tuple: Two dictionaries:
                - Dictionary of server mapping with disk ID and size, e.g.
                  'wolf-A': {'nvme2n1': '1600321314816'}.
                - Dictionary of server readlink pci mapping with disk ID, e.g.
                  'wolf-A': {'0000:da:00.0': 'nvme9n1'}.
        """
        nvme_lsblk = self.get_nvme_lsblk()
        nvme_readlink = {}

        # Create the dictionary for NVMe readlink.
        for server, items in nvme_lsblk.items():
            tmp_dict = {}
            for drive in items:
                cmd = ('readlink /sys/block/{}/device/device'
                       .format(drive.split()[0]))
                task = run_task([server], cmd)
                for _rc_code, _node in task.iter_retcodes():
                    if _rc_code == 1:
                        print("Failed to readlink on {}".format(_node))
                        raise ValueError
                # Get the readlink output for each drive
                for buf, _node in task.iter_buffers():
                    output = str(buf).split('\n')
                tmp_dict[output[0].split('/')[-1]] = drive.split()[0]
            nvme_readlink[server] = tmp_dict

        return nvme_lsblk, nvme_readlink
Example 5
    def get_nvme_lsblk(self):
        """Get NVMe size using lsblk from servers.

        Returns:
            dict: Dictionary of server mapping with disk ID and size
                  'wolf-A': {'nvme2n1': '1600321314816'}.
        """
        nvme_data = {}

        task = run_task(self.hostlist_servers, "lsblk -b /dev/nvme*n*")
        for _rc_code, _node in task.iter_retcodes():
            if _rc_code == 1:
                print("Failed to lsblk on {}".format(_node))
                raise ValueError
        # Get the drive size from each daos_io_server
        for buf, nodelist in task.iter_buffers():
            for node in nodelist:
                disk_data = {}
                output = str(buf).split('\n')
                for _tmp in output[1:]:
                    if 'nvme' in _tmp:
                        disk_data[_tmp.split()[0]] = _tmp.split()[3]
                nvme_data['{}'.format(node)] = disk_data

        return nvme_data
Example 6
    def debug_numa_node(self, pci_addr_heads):
        """Debug numa_node file by searching it in /sys and call hwloc-ls.

        Args:
            pci_addr_heads (list): List of PCI address head.
        """
        for pci_addr_head in pci_addr_heads:
            self.log.debug("----- Search PCI Addr Head %s in /sys -----",
                           pci_addr_head)
            task = run_task(
                hosts=self.hostlist_servers,
                command="find /sys -name \"{}\"".format(pci_addr_head))
            for output, _ in task.iter_buffers():
                self.log.debug(output)

        # Another way to obtain the Socket ID is to use hwloc-ls --whole-io
        # --verbose. It contains something like:

        # Bridge Host->PCI L#9 (P#2 buses=0000:[80-81])
        #     Bridge PCI->PCI (P#524320 busid=0000:80:02.0 id=8086:2f04
        #     class=0604(PCI_B) buses=0000:[81-81])
        #         PCI 8086:2701 (P#528384 busid=0000:81:00.0 class=0108(NVMExp)
        #         PCISlot=801)

        # In this case, the PCI address was 0000:81:00.0. We can figure out
        # which NUMA node section these lines are in. This approach is clearly
        # much more cumbersome than reading the numa_node, so it's called here
        # mainly for debugging purposes.
        self.log.debug("----- Show PCI Address in hwloc-ls -----")
        pcmd(hosts=self.hostlist_servers,
             command="hwloc-ls --whole-io --verbose")
Example 7
def get_interface_numa_node(hosts, interface, verbose=True):
    """Get the NUMA node ID of this network interface on each host.

    Args:
        hosts (NodeSet): hosts on which to detect the NUMA node
        interface (str): interface for which to obtain the NUMA node
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of NUMA node ID keys and NodeSet values on which they were detected

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    command = f"cat {os.path.join(net_path, interface, 'device', 'numa_node')}"
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of NUMA node IDs with a NodeSet of hosts on which each was detected
    numa_nodes = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [line.decode("utf-8") for line in output]
            nodeset = NodeSet.fromlist(nodelist)
            for line in output_lines:
                try:
                    numa_node = int(line.strip())
                except ValueError:
                    continue
                if numa_node not in numa_nodes:
                    numa_nodes[numa_node] = NodeSet()
                numa_nodes[numa_node].update(nodeset)

    return numa_nodes
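
The returned dictionary can be reduced to a single NUMA node when all hosts agree. A sketch, assuming get_interface_numa_node is in scope; the host and interface names are placeholders.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
numa_nodes = get_interface_numa_node(hosts, "ib0")

if len(numa_nodes) == 1:
    print("ib0 is on NUMA node {} on all hosts".format(next(iter(numa_nodes))))
else:
    for node_id, nodeset in numa_nodes.items():
        print("NUMA node {} on {}".format(node_id, nodeset))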
Example 8
def update_config_cmdlist(args):
    """Create the command lines to update slurmd.conf file.

    Args:
        args (Namespace): Commandline arguments

    Returns:
        int: status of executing the command lines that update the config file

    """
    all_nodes = NodeSet("{},{}".format(str(args.control), str(args.nodes)))
    if not args.sudo:
        sudo = ""
    else:
        sudo = "sudo"
    # Copy the slurm*example.conf files to /etc/slurm/
    if execute_cluster_cmds(all_nodes, COPY_LIST, args.sudo) > 0:
        exit(1)

    cmd_list = [
        "sed -i -e 's/ControlMachine=linux0/ControlMachine={}/g' {}".format(
            args.control, SLURM_CONF),
        "sed -i -e 's/ClusterName=linux/ClusterName=ci_cluster/g' {}".format(
            SLURM_CONF),
        "sed -i -e 's/SlurmUser=slurm/SlurmUser={}/g' {}".format(
            args.user, SLURM_CONF),
        "sed -i -e 's/NodeName/#NodeName/g' {}".format(SLURM_CONF),
    ]

    # This info needs to be gathered from every node that can run a slurm job
    command = r"lscpu | grep -E '(Socket|Core|Thread)\(s\)'"
    task = run_task(all_nodes, command)
    for output, nodes in task.iter_buffers():
        info = {
            data[0]: data[1]
            for data in re.findall(r"(Socket|Core|Thread).*:\s+(\d+)",
                                   str(output)) if len(data) > 1
        }

        if "Socket" not in info or "Core" not in info or "Thread" not in info:
            # Did not find values for socket/core/thread, so do not
            # include this node in the config file
            continue
        cmd_list.append("echo \"NodeName={0} Sockets={1} CoresPerSocket={2} "
                        "ThreadsPerCore={3}\" |{4} tee -a {5}".format(
                            NodeSet.fromlist(nodes), info["Socket"],
                            info["Core"], info["Thread"], sudo, SLURM_CONF))

    #
    cmd_list.append("echo \"PartitionName= {} Nodes={} Default=YES "
                    "MaxTime=INFINITE State=UP\" |{} tee -a {}".format(
                        args.partition, args.nodes, sudo, SLURM_CONF))

    return execute_cluster_cmds(all_nodes, cmd_list, args.sudo)
Example 9
    def test_scan_ssd(self):
        """
        JIRA ID: DAOS-3584

        Test Description: Verify NVMe NUMA socket values.

        :avocado: tags=all,small,full_regression,hw,control,ssd_socket
        """
        # Call dmg storage scan --verbose and get the PCI addresses.
        data = self.get_dmg_command().storage_scan(verbose=True)
        pci_addrs = data[self.hostlist_servers[0]]["nvme"].keys()
        self.log.info("Testing PCI addresses: %s", pci_addrs)

        pci_addr_heads = []
        errors = []

        # For every PCI address, verify its Socket ID against its NUMA socket
        # ID.
        for pci_addr in pci_addrs:
            # Get the PCI Address Head and construct the path to numa_node.
            cmd_socket_id = data[self.hostlist_servers[0]]["nvme"][pci_addr]\
                ["socket"]
            pci_addr_values = pci_addr.split(":")
            pci_addr_head = "{}:{}".format(pci_addr_values[0],
                                           pci_addr_values[1])
            pci_addr_heads.append(pci_addr_head)
            numa_node_path = "/sys/class/pci_bus/{}/device/numa_node".format(
                pci_addr_head)

            # Call cat on the server host, not necessarily the local test host.
            task = run_task(hosts=[self.hostlist_servers[0]],
                            command="cat {}".format(numa_node_path))

            # Obtain the numa_node content.
            fs_socket_id = ""
            for output, _ in task.iter_buffers():
                fs_socket_id = str(output).splitlines()[-1]

            # Test that the content is expected.
            if fs_socket_id != cmd_socket_id:
                errors.append("Unexpected socket ID! Cmd: {}; FS: {}".format(
                    cmd_socket_id, fs_socket_id))

        if errors:
            # Since we're dealing with system files and we don't have access to
            # them in CI, we need some debugging info when the test fails to
            # better understand the result.
            self.debug_numa_node(pci_addr_heads)
            self.fail("Error found!\n{}".format("\n".join(errors)))
Example 10
    def test_nvme_io_stats(self):
        """Jira ID: DAOS-4722.

        Test Description:
            Purpose of this test is to run IO test and check when NVME_IO_STATS
            enabled in config, it generates the different statistics.

        Use case:
            Run ior and it will print the NVMe IO stats to control plane log
            file.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
        """
        # run ior
        self.run_ior_with_pool()

        # Get the NVMe IO statistics from the server control_log file.
        cmd = 'cat {}'.format(get_log_file(self.control_log))
        task = run_task(self.hostlist_servers, cmd)
        for _rc_code, _node in task.iter_retcodes():
            if _rc_code == 1:
                self.fail("Failed to run cmd {} on {}".format(cmd, _node))
        for buf, _nodes in task.iter_buffers():
            output_list = str(buf).split('\n')

        # Verify statistics are increasing for IO
        target_stats = []
        for _tmp in range(8):
            target_stats.append(
                [s for s in output_list if "tgt[{}]".format(_tmp) in s])
        for stats in NVME_STATS:
            for _tgt in range(len(target_stats)):
                first_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][0].split()
                    if re.search(stats, x)
                ][0])[0]
                last_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][-1].split()
                    if re.search(stats, x)
                ][0])[0]
                # The last statistic should be higher than the initial statistic
                if int(first_stats) >= int(last_stats):
                    self.fail(
                        'Failed: Stats {} for target {} did not increase'
                        ' First_stat={} < Last_stat={}'.format(
                            stats, _tgt, first_stats, last_stats))
Example 11
def get_interface_speeds(hosts, interface, verbose=True):
    """Get the speeds of this network interface on each host.

    Args:
        hosts (NodeSet): hosts on which to detect the interface speed
        interface (str): interface for which to obtain the speed
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface speed keys and NodeSet values on which they were detected

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    command = f"cat {os.path.join(net_path, interface, 'speed')}"
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interface speeds with a NodeSet of hosts on which each was detected
    interface_speeds = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [line.decode("utf-8") for line in output]
            nodeset = NodeSet.fromlist(nodelist)
            for line in output_lines:
                try:
                    speed = int(line.strip())
                except IOError as io_error:
                    # KVM/Qemu/libvirt returns an EINVAL
                    if io_error.errno == errno.EINVAL:
                        speed = 1000
                except ValueError:
                    # Any line not containing a speed (integer)
                    continue
                if speed not in interface_speeds:
                    interface_speeds[speed] = NodeSet()
                interface_speeds[speed].update(nodeset)

    return interface_speeds
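
A caller might use the speed-to-NodeSet mapping to pick the slowest speed reported anywhere, since that is the safe value to assume cluster-wide. A sketch with hypothetical host and interface names, assuming get_interface_speeds is in scope.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
speeds = get_interface_speeds(hosts, "eth0")

for speed, nodeset in speeds.items():
    print("eth0 speed {} Mb/s on {}".format(speed, nodeset))
if speeds:
    print("Assuming {} Mb/s for eth0 across {}".format(min(speeds), hosts))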
Example 12
    def test_cpu_usage(self):
        # pylint: disable=bad-continuation
        """
        JIRA ID: DAOS-4826
        Test Description: Test CPU usage of formatted and idle daos_io_server.
        :avocado: tags=all,hw,server,small,full_regression,cpu_usage
        """
        ps_get_cpu = r"ps -C daos_io_server -o %\cpu"

        prev_usage = 1
        usage = 1
        time.sleep(5)
        for _ in range(10):
            time.sleep(5)
            task = run_task(hosts=self.hostlist_servers, command=ps_get_cpu)
            # Sample output.
            # %CPU
            # 1798
            for output, _ in task.iter_buffers():
                usage = str(output).splitlines()[-1]
                self.log.info("CPU usage = %s", usage)
            # Check if daos_io_server has started.
            if usage == "%CPU":
                continue

            usage = int(usage)
            if usage == 0:
                break
            diff = usage - prev_usage
            diff_p = (float(abs(diff)) / prev_usage) * 100

            # Check if the CPU usage is stable; the change was less than 10%.
            if diff_p <= float(10):
                break
            prev_usage = usage

        self.assertTrue(
            usage != "%CPU", "daos_io_server CPU usage couldn't be obtained!")
        self.assertTrue(
            usage < 100, "CPU usage is above 100%: {}%".format(usage))
Example 13
def get_active_network_interfaces(hosts, verbose=True):
    """Get all active network interfaces on the hosts.

    Args:
        hosts (NodeSet): hosts on which to find active interfaces
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys and NodeSet values on which they were found

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    operstate = os.path.join(net_path, "*", "operstate")
    command = " | ".join([
        f"grep -l 'up' {operstate}", "grep -Ev '/(lo|bonding_masters)/'",
        "sort"
    ])
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of active interfaces with a NodeSet of hosts on which each was found
    active_interfaces = {}
    for output, nodelist in task.iter_buffers():
        output_lines = [line.decode("utf-8") for line in output]
        nodeset = NodeSet.fromlist(nodelist)
        for line in output_lines:
            try:
                interface = line.split("/")[-2]
                if interface not in active_interfaces:
                    active_interfaces[interface] = NodeSet()
                active_interfaces[interface].update(nodeset)
            except IndexError:
                pass

    return active_interfaces
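
A sketch of combining this with get_interface_speeds to keep only the interfaces active on every requested host; the selection logic is an assumption, not part of the utilities above, and the host names are placeholders.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
active = get_active_network_interfaces(hosts)

# Keep only interfaces reported as active on all of the requested hosts.
common = [iface for iface, nodeset in active.items() if nodeset == hosts]
for iface in common:
    print("{}: speeds {}".format(iface, get_interface_speeds(hosts, iface)))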
Example 14
def get_ofi_info(hosts, supported=None, verbose=True):
    """Get the OFI provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers which, if provided, limits the
            inclusion to only those providers specified. Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys, each with a dictionary value mapping a comma-separated
            string of providers to a NodeSet of hosts on which the providers were detected.

    """
    task = run_task(hosts, "fi_info", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces with a list of provider lists and a NodeSet of hosts on
    # which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [
                line.decode("utf-8").rstrip(os.linesep) for line in output
            ]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the provider and domain pairings. The fi_info output reports these on
            # separate lines, so when processing the re matches, ensure each domain is preceded
            # by a provider.
            interface_providers = {}
            data = re.findall(r"(provider|domain):\s+([A-Za-z0-9;_+]+)",
                              "\n".join(output_lines))
            while data:
                provider = list(data.pop(0))
                if provider[0] == "provider" and data[0][0] == "domain":
                    provider.pop(0)
                    domain = list(data.pop(0))
                    domain.pop(0)

                    # A provider and domain must be specified
                    if not provider or not domain:
                        continue

                    # Add 'ofi+' to the provider
                    provider = ["+".join(["ofi", item]) for item in provider]

                    # Only include supported providers if a supported list is provided
                    if supported and provider[0] not in supported:
                        continue

                    if domain[0] not in interface_providers:
                        interface_providers[domain[0]] = set()
                    interface_providers[domain[0]].update(provider)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
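
A sketch showing how the nested provider dictionary might be consumed to report, per interface, which providers were detected and whether they cover every host. The host names and the supported list are placeholders.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
ofi_info = get_ofi_info(hosts, supported=["ofi+tcp", "ofi+verbs"])

for interface, provider_map in ofi_info.items():
    for providers, nodeset in provider_map.items():
        coverage = "all hosts" if nodeset == hosts else str(nodeset)
        print("{}: {} ({})".format(interface, providers, coverage))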
Example 15
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            dict: log output per host

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            command.append("--until=\"{}\"".format(until))
        self.log.info("Gathering log data on %s: %s", str(hosts),
                      " ".join(command))

        # Gather the log information per host
        task = run_task(hosts, " ".join(command), timeout)

        # Create a dictionary of hosts for each unique return code
        results = {code: hosts for code, hosts in task.iter_retcodes()}

        # Determine if the command completed successfully across all the hosts
        status = len(results) == 1 and 0 in results

        # Determine if any commands timed out
        timed_out = [str(hosts) for hosts in task.iter_keys_timeout()]
        if timed_out:
            status = False
        if not status:
            self.log.info("  Errors detected running \"%s\":", command)

        # List any hosts that timed out
        if timed_out:
            self.log.info("    %s: timeout detected after %s seconds",
                          str(NodeSet.fromlist(timed_out)), timeout)

        # Display/return the command output
        log_data = {}
        for code in sorted(results):
            # Get the command output from the hosts with this return code
            output_data = list(task.iter_buffers(results[code]))
            if not output_data:
                output_data = [["<NONE>", results[code]]]

            for output_buffer, output_hosts in output_data:
                node_set = NodeSet.fromlist(output_hosts)
                lines = str(output_buffer).splitlines()

                if status:
                    # Add the successful output from each node to the dictionary
                    log_data[node_set] = lines
                else:
                    # Display all of the results in the case of an error
                    if len(lines) > 1:
                        self.log.info("    %s: rc=%s, output:", node_set, code)
                        for line in lines:
                            self.log.info("      %s", line)
                    else:
                        self.log.info("    %s: rc=%s, output: %s", node_set,
                                      code, output_buffer)

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data
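
A hypothetical caller could use get_log_data to scan recent journal entries for an error pattern. The relative date strings follow the journalctl formats quoted in the docstring; 'manager' stands for an instance of the class defining this method and the search pattern is illustrative.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-2]")  # hypothetical hosts
log_data = manager.get_log_data(hosts, since="-10min", until="now", timeout=60)

for nodeset, lines in log_data.items():
    errors = [line for line in lines if "ERR" in line]
    if errors:
        print("{}: {} error line(s) found".format(nodeset, len(errors)))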
Example 16
    def get_host_data(self, command, text, error):
        """Get the data requested for each host using the specified command.

        Args:
            command (str): command used to obtain the data on each server
            text (str): data identification string
            error (str): data error string

        Returns:
            dict: a dictionary of data values for each NodeSet key

        """
        # Find the data for each specified servers
        self.log.info("  Obtaining %s data on %s", text, self.hosts)
        task = run_task(self.hosts, command, self.timeout)
        host_data = {}

        # Create a list of NodeSets with the same return code
        data = {code: hosts for code, hosts in task.iter_retcodes()}

        # Multiple return codes or a single non-zero return code
        # indicate at least one error obtaining the data
        if len(data) > 1 or 0 not in data:
            # Report the errors
            messages = []
            for code, hosts in data.items():
                if code != 0:
                    output_data = list(task.iter_buffers(hosts))
                    if len(output_data) == 0:
                        messages.append("{}: rc={}, command=\"{}\"".format(
                            NodeSet.fromlist(hosts), code, command))
                    else:
                        for output, o_hosts in output_data:
                            lines = str(output).splitlines()
                            info = "rc={}{}".format(
                                code, ", {}".format(output) if len(lines) < 2
                                else "\n  {}".format("\n  ".join(lines)))
                            messages.append("{}: {}".format(
                                NodeSet.fromlist(o_hosts), info))
            self.log.error("    %s on the following hosts:\n      %s", error,
                           "\n      ".join(messages))

            # Return an error data set for all of the hosts
            host_data = {NodeSet.fromlist(self.hosts): DATA_ERROR}

        else:
            # The command completed successfully on all servers.
            for output, hosts in task.iter_buffers(data[0]):
                # Find the maximum size of all the devices reported by
                # this group of hosts, as only one needs to meet the minimum
                nodes = NodeSet.fromlist(hosts)
                try:
                    # The assumption here is that each line of command output
                    # will begin with a number and that for the purposes of
                    # checking this requirement the maximum of these numbers is
                    # needed
                    int_host_values = [
                        int(line.split()[0])
                        for line in str(output).splitlines()
                    ]
                    host_data[nodes] = max(int_host_values)

                except (IndexError, ValueError):
                    # Log the error
                    self.log.error(
                        "    %s: Unable to obtain the maximum %s size due to "
                        "unexpected output:\n      %s", nodes, text,
                        "\n      ".join(str(output).splitlines()))

                    # Return an error data set for all of the hosts
                    host_data = {NodeSet.fromlist(hosts): DATA_ERROR}
                    break

        return host_data
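
A sketch of how this method might be driven with a command whose first column is numeric, since the method takes the maximum of the leading integers on each output line. The command, mount point, and 'info' instance name are assumptions.

# 'info' stands for an instance of the class defining get_host_data.
command = "df --output=avail -B1 /mnt/daos | tail -n +2"  # hypothetical mount point
host_data = info.get_host_data(command, "available space", "df failed")

for nodeset, value in host_data.items():
    print("{}: maximum value {}".format(nodeset, value))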
Example 17
def get_ucx_info(hosts, supported=None, verbose=True):
    """Get the UCX provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers which, if provided, limits the
            inclusion to only those providers specified. Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys, each with a dictionary value mapping a comma-separated
            string of providers to a NodeSet of hosts on which the providers were detected.

    """
    task = run_task(hosts, "ucx_info -d", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces with a list of provider lists and a NodeSet of hosts on
    # which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [
                line.decode("utf-8").rstrip(os.linesep) for line in output
            ]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the transport and device pairings. The ucx_info output reports these
            # on separate lines, so when processing the re matches, ensure each device is preceded
            # by a transport.
            interface_providers = {}
            data = re.findall(r"(Transport|Device):\s+([A-Za-z0-9;_+]+)",
                              "\n".join(output_lines))
            while data:
                transport = list(data.pop(0))
                if transport[0] == "Transport" and data[0][0] == "Device":
                    transport.pop(0)
                    device = list(data.pop(0))
                    device.pop(0)

                    # A transport and device must be specified
                    if not transport or not device:
                        continue

                    # Add 'ucx+' to the provider and replace 'mlx[0-9]' with 'x'
                    transport = [
                        "+".join(["ucx",
                                  re.sub(r"mlx[0-9]+", "x", item)])
                        for item in transport
                    ]

                    # Only include supported providers if a supported list is provided
                    if supported and transport[0] not in supported:
                        continue

                    if device[0] not in interface_providers:
                        interface_providers[device[0]] = set()
                    interface_providers[device[0]].update(transport)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
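
Since get_ofi_info and get_ucx_info return the same shape of dictionary, a caller might merge them per interface. A sketch only; the merge logic is an assumption and the host names are placeholders.

from ClusterShell.NodeSet import NodeSet

hosts = NodeSet("wolf-[1-4]")  # hypothetical hosts
all_providers = {}
for info in (get_ofi_info(hosts), get_ucx_info(hosts)):
    for interface, provider_map in info.items():
        merged = all_providers.setdefault(interface, {})
        for providers, nodeset in provider_map.items():
            merged.setdefault(providers, NodeSet()).update(nodeset)

for interface, provider_map in all_providers.items():
    print("{}: {}".format(interface, ", ".join(provider_map)))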