def verify_socket_directory(self, user, hosts):
    """Ensure the domain socket directory exists and belongs to the user.

    The check only runs when self.yaml is a YamlParameters object. If the
    directory check fails on any host, the directory is created on those
    hosts (with sudo) and its ownership is handed to the user.

    Args:
        user (str): user to verify has ownership of the directory
        hosts (list): list of hosts on which to verify the directory exists

    Raises:
        CommandFailure: if the socket directory does not exist or is not
            owned by the user and could not be created

    """
    if not isinstance(self.yaml, YamlParameters):
        return

    socket_dir = self.get_user_file()
    self.log.info(
        "Verifying %s socket directory: %s", self.command, socket_dir)

    found, missing_on = check_file_exists(hosts, socket_dir, user)
    if found:
        return

    self.log.info(
        "%s: creating socket directory %s for user %s on %s",
        self.command, socket_dir, user, missing_on)
    try:
        create_directory(missing_on, socket_dir, sudo=True)
        change_file_owner(missing_on, socket_dir, user, user, sudo=True)
    except DaosTestError as error:
        message = (
            "{}: error setting up missing socket directory {} for "
            "user {} on {}:\n{}".format(
                self.command, socket_dir, user, missing_on, error))
        raise CommandFailure(message)
def create_mount_point(self):
    """Create the dfuse mount directory on each host where it is missing.

    Raises:
        CommandFailure: if no mount point is specified or the directory
            could not be created on one or more hosts

    """
    mount_dir = self.mount_dir.value
    if mount_dir is None:
        raise CommandFailure(
            "Mount point not specified, check test yaml file")

    _, missing_nodes = check_file_exists(
        self.hosts, mount_dir, directory=True)
    if len(missing_nodes) == 0:
        # The directory check reported no hosts to create it on
        return

    results = pcmd(missing_nodes, "mkdir -p {}".format(mount_dir), timeout=30)

    # Success means every host reported the same return code and it was 0
    if len(results) == 1 and 0 in results:
        return

    failed = []
    for code, node_set in results.items():
        if code != 0:
            failed.append(str(node_set))
    error_hosts = NodeSet(",".join(failed))
    raise CommandFailure(
        "Error creating the {} dfuse mount point on the following "
        "hosts: {}".format(mount_dir, error_hosts))
def get_port_cnt(self, hosts, port_counter):
    """Collect a port counter for every interface on every host.

    Args:
        hosts (list): list of hosts
        port_counter (str): port counter information to collect

    Returns:
        dict: a dictionary of the requested port data for each interface
            on each host, laid out as
            {interface: {host: {1: {port_counter: data}}}}

    """
    port_info = {}
    for name in self.interfaces:
        # Counter file for port 1 of this interface's infiniband domain
        domain = self.interfaces[name]["domain"]
        counter_file = os.path.join(
            os.sep, "sys", "class", "infiniband", domain,
            "ports", "1", "counters", port_counter)

        found, missing_on = check_file_exists(hosts, counter_file)
        if not found:
            self.fail("{}: {} not found".format(missing_on, counter_file))

        command = "cat {}".format(counter_file)
        text = "{} port_counter".format(name)
        error = "Error obtaining {} info".format(port_counter)
        all_host_data = get_host_data(hosts, command, text, error, 20)

        host_counters = {}
        port_info[name] = host_counters
        for entry in all_host_data:
            for host in list(entry["hosts"]):
                host_counters[host] = {1: {port_counter: entry["data"]}}

    return port_info
def get_port_cnt(self, hosts, dev, port_counter):
    """Read one port counter of an infiniband device on the given hosts.

    Args:
        hosts (list): list of hosts
        dev (str): device to get counter information for
        port_counter (str): port counter to get information from

    Returns:
        list: a list of the data common to each unique NodeSet of hosts

    """
    counter_path = os.path.join(
        "/sys/class/infiniband/{}".format(dev),
        "ports/1/counters",
        port_counter)

    # Check that the counter file exists on the hosts
    found, missing_on = check_file_exists(hosts, counter_path)
    if not found:
        self.fail("{}: {} not found".format(missing_on, counter_path))

    all_host_data = get_host_data(
        hosts,
        "cat {}".format(counter_path),
        "port_counter",
        "Error obtaining {} info".format(port_counter),
        20)

    counter_values = []
    for entry in all_host_data:
        counter_values.append(entry["data"])
    return counter_values
def remove_mount_point(self, fail=True):
    """Remove dfuse directory.

    Try once with a simple rmdir which should succeed, if this does not then
    try again with rm -rf, but still raise an error.

    Args:
        fail (bool, optional): whether to raise a CommandFailure when the
            rmdir (or the follow-up rm -rf) does not succeed on every host.
            A missing mount point setting always raises. Defaults to True.

    Raises:
        CommandFailure: In case of error deleting directory

    """
    # raise exception if mount point not specified
    if self.mount_dir.value is None:
        raise CommandFailure(
            "Mount point not specified, check test yaml file")

    dir_exists, clean_nodes = check_file_exists(
        self.hosts, self.mount_dir.value, directory=True)
    if not dir_exists:
        self.log.info(
            "No %s dfuse mount point directory found on %s",
            self.mount_dir.value, self.hosts)
        return

    # Skip any hosts reported as not having the directory.  list.remove()
    # with the whole collection would look for a single equal element (and
    # raise ValueError when there is none), so filter host by host instead.
    target_nodes = list(self.hosts)
    if clean_nodes:
        target_nodes = [
            host for host in target_nodes if host not in clean_nodes]

    self.log.info(
        "Removing the %s dfuse mount point on %s",
        self.mount_dir.value, target_nodes)

    cmd = "rmdir {}".format(self.mount_dir.value)
    ret_code = pcmd(target_nodes, cmd, timeout=30)
    if len(ret_code) == 1 and 0 in ret_code:
        # rmdir succeeded everywhere
        return

    failed_nodes = NodeSet(",".join([
        str(node_set) for code, node_set in list(ret_code.items())
        if code != 0
    ]))

    # Fall back to a forced removal on the hosts where rmdir failed
    cmd = "rm -rf {}".format(self.mount_dir.value)
    ret_code = pcmd(failed_nodes, cmd, timeout=30)
    if len(ret_code) > 1 or 0 not in ret_code:
        error_hosts = NodeSet(",".join([
            str(node_set) for code, node_set in list(ret_code.items())
            if code != 0
        ]))
        if fail:
            raise CommandFailure(
                "Error removing the {} dfuse mount point with rm on "
                "the following hosts: {}".format(
                    self.mount_dir.value, error_hosts))

    # The rmdir failure is still reported even if rm -rf cleaned up
    if fail:
        raise CommandFailure(
            "Error removing the {} dfuse mount point with rmdir on the "
            "following hosts: {}".format(self.mount_dir.value, failed_nodes))
def verify_socket_directory(self, user):
    """Check that the domain socket directory exists and is owned by user.

    The check is skipped when no hosts are set or when the manager's job
    has no "yaml" attribute.

    Args:
        user (str): user to verify has ownership of the directory

    Raises:
        CommandFailure: if the socket directory does not exist or is not
            owned by the user

    """
    if not self._hosts or not hasattr(self.manager.job, "yaml"):
        return

    socket_dir = self.get_user_file()
    found, missing_on = check_file_exists(self._hosts, socket_dir, user)
    if found:
        return

    message = "{}: Server missing socket directory {} for user {}".format(
        missing_on, socket_dir, user)
    raise CommandFailure(message)
def test_super_block_version_basic(self):
    """JIRA ID: DAOS-3648.

    Test Description:
        Basic test to verify that superblock file is versioned.

    :avocado: tags=all,tiny,pr,ds_versioning,basic
    """
    # The superblock file is expected directly under the scm_mount dir
    superblock = os.path.join(
        self.server_managers[0].get_config_value("scm_mount"), "superblock")

    found, missing_on = check_file_exists(self.hostlist_servers, superblock)
    if not found:
        self.fail("{}: {} not found".format(missing_on, superblock))

    # grep exits non-zero on any host whose file lacks the word 'version'
    grep_cmd = 'cat {} | grep -F "version"'.format(superblock)
    results = pcmd(self.hostlist_servers, grep_cmd, timeout=20)

    # Success means a single return code of 0 shared by all the hosts
    passed_everywhere = len(results) == 1 and 0 in results
    if not passed_everywhere:
        self.fail(
            "Was not able to find version in {} file".format(superblock))
def remove_mount_point(self):
    """Remove dfuse directory.

    Raises:
        CommandFailure: In case of error deleting directory

    """
    # raise exception if mount point not specified
    if self.mount_dir.value is None:
        raise CommandFailure(
            "Mount point not specified, check test yaml file")

    dir_exists, _ = general_utils.check_file_exists(
        self.hosts, self.mount_dir.value, directory=True)
    if dir_exists:
        cmd = "rm -rf {}".format(self.mount_dir.value)
        ret_code = general_utils.pcmd(self.hosts, cmd, timeout=30)

        # ret_code maps each return code to the hosts that reported it.
        # Checking only "0 not in ret_code" misses partial failures, where
        # some hosts succeed (key 0 present) and others do not, so also
        # require that 0 is the only return code seen.
        if len(ret_code) > 1 or 0 not in ret_code:
            error_hosts = NodeSet(
                ",".join(
                    [str(node_set) for code, node_set in ret_code.items()
                     if code != 0]))
            raise CommandFailure(
                "Error removing the {} dfuse mount point on the following "
                "hosts: {}".format(self.mount_dir.value, error_hosts))
def run_agent(test, server_list, client_list=None):
    """Start daos agents on the specified hosts.

    Make sure the environment is setup for the security agent and then
    launches it on the compute nodes.

    This is temporary; presuming the agent will daemonize at some point and
    can be started/killed more appropriately.

    Args:
        test (Test): provides the tmp directory (test.tmp) for the agent
            yaml file and the install prefix (test.prefix) of daos_agent
        server_list (list): nodes acting as server nodes in the test
        client_list (list, optional): nodes acting as client nodes in the
            test.  Defaults to None.

    Raises:
        AgentFailed: if there is an error starting the daos agents

    Returns:
        dict: the ssh subprocess session for each client, keyed by client

    """
    sessions = {}
    user = getpass.getuser()

    # if empty client list, 'test' is effectively client
    client_list = include_local_host(client_list)

    # Create the DAOS Agent configuration yaml file to pass
    # with daos_agent -o <FILE_NAME>
    agent_yaml = os.path.join(test.tmp, "daos_agent.yml")
    agent_config = DaosAgentConfig()
    agent_config.get_params(test)
    agent_config.hostlist.value = client_list
    access_point = ":".join((server_list[0], str(agent_config.port)))
    agent_config.access_points.value = access_point.split()
    agent_config.create_yaml(agent_yaml)

    # Verify the domain socket directory is present and owned by this user
    file_checks = (
        ("Server", server_list, "/var/run/daos_server"),
        ("Client", client_list, "/var/run/daos_agent"),
    )
    for host_type, host_list, directory in file_checks:
        status, nodeset = check_file_exists(host_list, directory, user)
        if not status:
            raise AgentFailed(
                "{}: {} missing directory {} for user {}.".format(
                    nodeset, host_type, directory, user))

    # launch the agent
    daos_agent_bin = os.path.join(test.prefix, "bin", "daos_agent")
    daos_agent_cmd = " ".join((daos_agent_bin, "-o", agent_yaml))
    print("<AGENT> Agent command: ", daos_agent_cmd)

    for client in client_list:
        # Run the same command that was logged above so the generated yaml
        # file is actually passed to the agent.
        sessions[client] = subprocess.Popen(
            [
                "ssh", client, "-o ConnectTimeout=10",
                "{} -i".format(daos_agent_cmd)
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)

    # double check agent launched successfully
    timeout = 15
    started_clients = []
    for client in client_list:
        print("<AGENT> Starting agent on {}".format(client))

        # Make the session's stdout non-blocking so the loop can poll it
        file_desc = sessions[client].stdout.fileno()
        flags = fcntl.fcntl(file_desc, fcntl.F_GETFL)
        fcntl.fcntl(file_desc, fcntl.F_SETFL, flags | os.O_NONBLOCK)

        start_time = time.time()
        pattern = "Using logfile"
        expected_data = ""

        # poll() returns None only while the ssh session is still running;
        # any exit code, including 0, means it has finished.
        while sessions[client].poll() is None:
            if time.time() - start_time > timeout:
                print("<AGENT>: {}".format(expected_data))
                raise AgentFailed("DAOS Agent didn't start!  Agent reported:\n"
                                  "{}before we gave up waiting for it to "
                                  "start".format(expected_data))
            output = ""
            try:
                output = sessions[client].stdout.read()
            except IOError as excpn:
                # EAGAIN only means no output is available yet
                if excpn.errno != errno.EAGAIN:
                    raise AgentFailed(
                        "Error in starting daos_agent: {0}".format(str(excpn)))
                time.sleep(1)
                continue
            expected_data += output

            match = re.findall(pattern, output)
            if match:
                print("<AGENT> agent started on node {} in {} seconds".format(
                    client, time.time() - start_time))
                break

        if sessions[client].returncode is not None:
            print("<AGENT> uh-oh, in agent startup, the ssh that started the "
                  "agent on {} has exited with {}.\nStopping agents on "
                  "{}".format(client, sessions[client].returncode,
                              started_clients))
            # kill the ones we started
            stop_agent(sessions, started_clients)
            raise AgentFailed("Failed to start agent on {}".format(client))

        # Remember this client so a later failure can stop its agent
        started_clients.append(client)

    return sessions
def run_agent(basepath, server_list, client_list=None):
    """Start daos agents on the specified hosts.

    Make sure the environment is setup for the security agent and then
    launches it on the compute nodes.

    This is temporary; presuming the agent will daemonize at some point and
    can be started/killed more appropriately.

    Args:
        basepath (str): root directory for DAOS repo or installation; the
            .build_vars.json file is read from this directory
        server_list (list): nodes acting as server nodes in the test
        client_list (list, optional): nodes acting as client nodes in the
            test.  Defaults to None.

    Raises:
        AgentFailed: if there is an error starting the daos agents

    Returns:
        dict: the ssh subprocess session for each client, keyed by client

    """
    sessions = {}
    user = getpass.getuser()

    # if empty client list, 'self' is effectively client
    if client_list is None:
        client_list = [socket.gethostname().split('.', 1)[0]]

    # Verify the domain socket directory is present and owned by this user
    file_checks = (
        ("Server", server_list, "/var/run/daos_server"),
        ("Client", client_list, "/var/run/daos_agent"),
    )
    for host_type, host_list, directory in file_checks:
        status, nodeset = check_file_exists(host_list, directory, user)
        if not status:
            raise AgentFailed(
                "{}: {} missing directory {} for user {}.".format(
                    nodeset, host_type, directory, user))

    # launch the agent
    with open(os.path.join(basepath, ".build_vars.json")) as json_vars:
        build_vars = json.load(json_vars)
    daos_agent_bin = os.path.join(build_vars["PREFIX"], "bin", "daos_agent")

    for client in client_list:
        sessions[client] = subprocess.Popen(
            [
                "ssh", client, "-o ConnectTimeout=10",
                "{} -i".format(daos_agent_bin)
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)

    # double check agent launched successfully
    timeout = 15
    started_clients = []
    for client in client_list:
        # Make the session's stdout non-blocking so the loop can poll it
        file_desc = sessions[client].stdout.fileno()
        flags = fcntl.fcntl(file_desc, fcntl.F_GETFL)
        fcntl.fcntl(file_desc, fcntl.F_SETFL, flags | os.O_NONBLOCK)

        start_time = time.time()
        pattern = "Using logfile"
        expected_data = ""

        # poll() returns None only while the ssh session is still running;
        # any exit code, including 0, means it has finished.
        while sessions[client].poll() is None:
            if time.time() - start_time > timeout:
                print("<AGENT>: {}".format(expected_data))
                raise AgentFailed("DAOS Agent didn't start!  Agent reported:\n"
                                  "{}before we gave up waiting for it to "
                                  "start".format(expected_data))
            output = ""
            try:
                output = sessions[client].stdout.read()
            except IOError as excpn:
                # EAGAIN only means no output is available yet
                if excpn.errno != errno.EAGAIN:
                    raise AgentFailed(
                        "Error in starting daos_agent: {0}".format(str(excpn)))
                time.sleep(1)
                continue
            expected_data += output

            match = re.findall(pattern, output)
            if match:
                print("<AGENT> agent started on node {} in {} seconds".format(
                    client, time.time() - start_time))
                break

        if sessions[client].returncode is not None:
            print("<AGENT> uh-oh, in agent startup, the ssh that started the "
                  "agent on {} has exited with {}.\nStopping agents on "
                  "{}".format(client, sessions[client].returncode,
                              started_clients))
            # kill the ones we started
            stop_agent(sessions, started_clients)
            raise AgentFailed("Failed to start agent on {}".format(client))

        # Remember this client so a later failure can stop its agent
        started_clients.append(client)

    return sessions