def get_port_cnt(self, hosts, port_counter):
    """Collect one InfiniBand port counter for every interface on every host.

    For each entry in self.interfaces the counter is read from
    /sys/class/infiniband/<domain>/ports/1/counters/<port_counter>, where
    <domain> is that interface's "domain" value.

    Args:
        hosts (list): list of hosts
        port_counter (str): name of the counter file to read

    Returns:
        dict: nested dictionary keyed by interface name, then by host, then
            by the port number 1, then by port_counter; the innermost value
            is the data get_host_data() reported for that host

    """
    results = {}
    for name in self.interfaces:
        domain = self.interfaces[name]["domain"]
        path = os.path.join(
            os.sep, "sys", "class", "infiniband", domain, "ports", "1",
            "counters", port_counter)

        # The counter file has to exist on the hosts before it is read;
        # otherwise the problem is reported through self.fail()
        exists = check_file_exists(hosts, path)
        if not exists[0]:
            self.fail("{}: {} not found".format(exists[1], path))

        groups = get_host_data(
            hosts,
            "cat {}".format(path),
            "{} port_counter".format(name),
            "Error obtaining {} info".format(port_counter),
            20)

        # Each group carries the hosts that share the same data; expand it
        # so every host gets its own entry
        results[name] = {
            host: {1: {port_counter: group["data"]}}
            for group in groups
            for host in group["hosts"]
        }
    return results
def run_event_check(self, since, until):
    """Run a check on specific events in journalctl.

    The events to look for are read from the "events" test parameter
    (namespace "/run/*"); each one is used as a regular expression that is
    searched for in every line of the collected system log.

    Args:
        self (obj): soak obj
        since: start of the time window, passed to journalctl --since
        until: end of the time window, passed to journalctl --until

    Returns:
        list: the system log lines that matched any event, ordered by event
            and then by the order the lines were collected; empty if no
            events are configured

    """
    events_found = []
    # to do: currently all events are from - t kernel;
    # when systemctl is enabled add daos events
    events = self.params.get("events", "/run/*")
    if not events:
        return events_found

    # check events on all nodes
    hosts = list(set(self.hostlist_servers))
    command = ("sudo /usr/bin/journalctl --system -t kernel -t "
               "daos_server --since=\"{}\" --until=\"{}\"".format(
                   since, until))
    err = "Error gathering system log events"

    # The command does not depend on the event being searched for, so the
    # log is collected from the hosts once instead of once per event
    log_lines = []
    for output in get_host_data(hosts, command, "journalctl", err):
        log_lines.extend(output["data"].splitlines())

    for event in events:
        pattern = re.compile(r"{}".format(event))
        # Count per event so the message reports this event's matches only,
        # not a running total across all events
        matches = [line for line in log_lines if pattern.search(str(line))]
        events_found.extend(matches)
        self.log.info(
            "Found %s instances of %s in system log from %s through %s",
            len(matches), event, since, until)
    return events_found
def get_log_info(self, hosts, dev, env_state, log_file):
    """Get information from daos.log file to verify device used.

    The first 50 lines of the log (or of <log_file>.old when that file
    exists) are searched for the "Using client provided OFI_INTERFACE"
    message naming dev. Every group of hosts returned by get_host_data()
    is checked; a single group that violates the expectation makes the
    overall result False.

    Args:
        hosts (list): list of hosts
        dev (str): device to get counter information for
        env_state (bool): set state for OFI_INTERFACE env variable
        log_file (str): log file to verify

    Returns:
        bool: status of whether correct device was used. With env_state
            set, True only if the message was found exactly once in every
            group; with env_state unset, True only if no group shows
            exactly one instance.

    """
    # anticipate log switch
    cmd = ("if [ -f {0}.old ]; then head -50 {0}.old; else head -50 {0};"
           "fi").format(log_file)
    err = "Error getting log data."
    # The device name is matched literally, so escape any regex characters
    pattern = r"Using\s+client\s+provided\s+OFI_INTERFACE:\s+{}".format(
        re.escape(dev))

    # Keep one count per host group so that no group's result is discarded
    counts = []
    for host_data in get_host_data(hosts, cmd, log_file, err):
        detected = len(re.findall(pattern, host_data["data"]))
        self.log.info(
            "Found %s instances of client setting up OFI_INTERFACE=%s",
            detected, dev)
        counts.append(detected)
    if not counts:
        # No log data at all is treated as nothing being detected
        counts.append(0)

    # Verify
    if env_state:
        return all(count == 1 for count in counts)
    return all(count != 1 for count in counts)
def get_port_cnt(self, hosts, dev, port_counter):
    """Read one port counter of an InfiniBand device on the given hosts.

    The counter is read from
    /sys/class/infiniband/<dev>/ports/1/counters/<port_counter>.

    Args:
        hosts (list): list of hosts
        dev (str): device to get counter information for
        port_counter (str): port counter to get information from

    Returns:
        list: the "data" value of each entry returned by get_host_data()

    """
    counter_path = os.path.join(
        "/sys/class/infiniband/{}".format(dev),
        "ports/1/counters",
        port_counter)

    # Check that the counter file exists on the hosts; a missing file is
    # reported through self.fail()
    exists = check_file_exists(hosts, counter_path)
    if not exists[0]:
        self.fail("{}: {} not found".format(exists[1], counter_path))

    entries = get_host_data(
        hosts,
        "cat {}".format(counter_path),
        "port_counter",
        "Error obtaining {} info".format(port_counter),
        20)

    values = []
    for entry in entries:
        values.append(entry["data"])
    return values
def get_host_nvme_data(self):
    """Get the NVMe block device sizes in bytes for each host.

    Runs lsblk with sizes in bytes on self.hosts and keeps the SIZE,NAME
    lines that contain "nvme".

    Returns:
        the result of get_host_data() for this command, unchanged
        (NOTE(review): previously documented as a dictionary of data values
        for each NodeSet key - confirm against get_host_data)

    """
    return get_host_data(
        self.hosts,
        "lsblk -b -o SIZE,NAME | grep nvme",
        "NVMe",
        "No NVMe drives bound to the kernel driver detected",
        self.timeout)
def get_host_mem_data(self):
    """Get the total non-swap memory in bytes for each host.

    Runs "free -b" on self.hosts and prints the first number that follows
    "Mem:" on its line.

    Returns:
        the result of get_host_data() for this command, unchanged
        (NOTE(review): previously documented as a dictionary of data values
        for each NodeSet key - confirm against get_host_data)

    """
    return get_host_data(
        self.hosts,
        r"free -b | sed -En 's/Mem:\s+([0-9]+).*/\1/p'",
        "memory",
        "Error obtaining total memory size",
        self.timeout)
def get_host_scm_data(self):
    """Get the total SCM capacity in bytes for each host.

    Runs "ipmctl show -units B -memoryresources" through sudo on self.hosts
    and prints the digits that follow "Capacity=" at the start of a line.

    Returns:
        the result of get_host_data() for this command, unchanged
        (NOTE(review): previously documented as a dictionary of data values
        for each NodeSet key - confirm against get_host_data)

    """
    cmd_list = [
        "sudo -n ipmctl show -units B -memoryresources",
        # The quantifier must sit outside the bracket expression:
        # "[0-9+]" matches a single digit or "+" character, which would
        # capture only the first digit of the capacity
        r"sed -En 's/^Capacity\=([0-9]+).*/\1/p'",
    ]
    cmd = " | ".join(cmd_list)
    text = "SCM"
    error = "No SCM devices detected"
    return get_host_data(self.hosts, cmd, text, error, self.timeout)
def get_superblock_info(self, sp_file, sp_value):
    """Get one value from the superblock file on each host.

    The file is printed with cat on the hosts in self.dmg.hostlist and
    filtered with sed so that only the value following "<sp_value>:" at the
    start of a line is printed.

    Args:
        sp_file (str): scm mount path.
        sp_value (str): superblock file value to extract.
            i.e. version, uuid, system, rank, validrank, ms

    Returns:
        the result of get_host_data() for this command, unchanged
        (NOTE(review): previously documented as a dictionary of data values
        for each NodeSet key - confirm against get_host_data)

    """
    value_regex = r"^{}:\s+([_a-z0-9-]+).*".format(sp_value)
    pipeline = [
        "cat {}".format(sp_file),
        r"sed -En 's/{}/\1 /gp'".format(value_regex),
    ]
    return get_host_data(
        self.dmg.hostlist,
        " | ".join(pipeline),
        "superblock",
        "Error obtaining superblock info: {}".format(sp_value),
        20)