Exemple #1
0
    def verify_socket_directory(self, user, hosts):
        """Ensure the domain socket directory exists for the specified user.

        The check only runs when self.yaml is a YamlParameters object.  When
        check_file_exists() reports a failure, the directory is created with
        sudo on the reported nodes and its ownership is assigned to the user.

        Args:
            user (str): user expected to own the socket directory
            hosts (list): hosts on which the socket directory is required

        Raises:
            CommandFailure: if the missing socket directory could not be
                created or its ownership could not be changed

        """
        if not isinstance(self.yaml, YamlParameters):
            # Nothing to verify without a yaml parameters object
            return

        socket_dir = self.get_user_file()
        self.log.info(
            "Verifying %s socket directory: %s", self.command, socket_dir)

        exists, missing_on = check_file_exists(hosts, socket_dir, user)
        if exists:
            return

        self.log.info(
            "%s: creating socket directory %s for user %s on %s",
            self.command, socket_dir, user, missing_on)
        try:
            create_directory(missing_on, socket_dir, sudo=True)
            change_file_owner(missing_on, socket_dir, user, user, sudo=True)
        except DaosTestError as error:
            # Report setup problems with the exception type callers expect
            details = (self.command, socket_dir, user, missing_on, error)
            raise CommandFailure(
                "{}: error setting up missing socket directory {} for "
                "user {} on {}:\n{}".format(*details))
Exemple #2
0
    def create_mount_point(self):
        """Create the dfuse mount directory on the hosts that do not have it.

        Raises:
            CommandFailure: if no mount point is specified or the directory
                could not be created on one or more hosts

        """
        mount_point = self.mount_dir.value
        if mount_point is None:
            # A mount point is mandatory
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        _, nodes_to_create = check_file_exists(
            self.hosts, mount_point, directory=True)
        if not len(nodes_to_create):
            return

        results = pcmd(
            nodes_to_create, "mkdir -p {}".format(mount_point), timeout=30)

        # Success means a single entry keyed by return code 0
        if len(results) <= 1 and 0 in results:
            return

        failed = [
            str(node_set) for code, node_set in results.items() if code != 0]
        error_hosts = NodeSet(",".join(failed))
        raise CommandFailure(
            "Error creating the {} dfuse mount point on the following "
            "hosts: {}".format(mount_point, error_hosts))
Exemple #3
0
    def get_port_cnt(self, hosts, port_counter):
        """Collect one infiniband port counter for every known interface.

        For each entry in self.interfaces, the counter file located under
        /sys/class/infiniband/<domain>/ports/1/counters is checked with
        check_file_exists() and then read on the hosts with cat.

        Args:
            hosts (list): list of hosts
            port_counter (str): name of the port counter to collect

        Returns:
            dict: the collected data keyed by interface, then host, then the
                port number (always 1), then the port counter name

        """
        collected = {}
        for interface in self.interfaces:
            domain = self.interfaces[interface]["domain"]
            counter_file = os.path.join(
                os.sep, "sys", "class", "infiniband", domain, "ports", "1",
                "counters", port_counter)

            # Fail the test if the counter file check does not pass
            found, missing = check_file_exists(hosts, counter_file)
            if not found:
                self.fail("{}: {} not found".format(missing, counter_file))

            command = "cat {}".format(counter_file)
            description = "{} port_counter".format(interface)
            error_text = "Error obtaining {} info".format(port_counter)
            collected[interface] = {
                host: {1: {port_counter: entry["data"]}}
                for entry in get_host_data(
                    hosts, command, description, error_text, 20)
                for host in list(entry["hosts"])
            }
        return collected
Exemple #4
0
    def get_port_cnt(self, hosts, dev, port_counter):
        """Read a single infiniband port counter for one device on the hosts.

        Args:
            hosts (list): list of hosts
            dev (str): device to get counter information for
            port_counter (str): port counter to get information from

        Returns:
            list: the "data" entry of each item returned by get_host_data()

        """
        counter_path = os.path.join(
            "/sys/class/infiniband/{}".format(dev), "ports/1/counters",
            port_counter)

        # Fail the test if the counter file check does not pass on the hosts
        found, missing = check_file_exists(hosts, counter_path)
        if not found:
            self.fail("{}: {} not found".format(missing, counter_path))

        all_host_data = get_host_data(
            hosts,
            "cat {}".format(counter_path),
            "port_counter",
            "Error obtaining {} info".format(port_counter),
            20)

        counter_values = []
        for entry in all_host_data:
            counter_values.append(entry["data"])
        return counter_values
Exemple #5
0
    def remove_mount_point(self, fail=True):
        """Remove dfuse directory.

        Try once with a simple rmdir which should succeed, if this does not then
        try again with rm -rf, but still raise an error.

        Args:
            fail (bool, optional): whether to raise an exception when the
                directory could not be removed with rmdir. Defaults to True.

        Raises:
            CommandFailure: if the mount point is not specified or, when fail
                is True, the directory could not be removed with a simple
                rmdir

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(self.hosts,
                                                    self.mount_dir.value,
                                                    directory=True)
        if not dir_exists:
            self.log.info("No %s dfuse mount point directory found on %s",
                          self.mount_dir.value, self.hosts)
            return

        # Exclude any hosts reported as not having the directory.  The second
        # value returned by check_file_exists() is a collection of nodes, so
        # filter by membership instead of list.remove(), which expects a
        # single element of the list and would raise a ValueError here.
        target_nodes = list(self.hosts)
        if clean_nodes:
            target_nodes = [
                host for host in target_nodes if host not in clean_nodes]

        self.log.info("Removing the %s dfuse mount point on %s",
                      self.mount_dir.value, target_nodes)

        cmd = "rmdir {}".format(self.mount_dir.value)
        ret_code = pcmd(target_nodes, cmd, timeout=30)
        if len(ret_code) == 1 and 0 in ret_code:
            return

        failed_nodes = NodeSet(",".join([
            str(node_set) for code, node_set in ret_code.items()
            if code != 0
        ]))

        # rmdir failed on some hosts; force the removal on just those hosts
        cmd = "rm -rf {}".format(self.mount_dir.value)
        ret_code = pcmd(failed_nodes, cmd, timeout=30)
        if len(ret_code) > 1 or 0 not in ret_code:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in ret_code.items()
                if code != 0
            ]))
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rm on "
                    "the following hosts: {}".format(
                        self.mount_dir.value, error_hosts))

        # The forced removal worked (or failures are being ignored), but the
        # rmdir failure is still reported unless the caller opted out
        if fail:
            raise CommandFailure(
                "Error removing the {} dfuse mount point with rmdir on the "
                "following hosts: {}".format(self.mount_dir.value,
                                             failed_nodes))
Exemple #6
0
    def verify_socket_directory(self, user):
        """Check the domain socket directory for the specified user.

        Nothing is checked unless hosts are defined and the managed job has a
        "yaml" attribute.

        Args:
            user (str): user passed to check_file_exists() for the directory

        Raises:
            CommandFailure: if check_file_exists() reports that the socket
                directory check failed

        """
        if not self._hosts or not hasattr(self.manager.job, "yaml"):
            # No hosts or no yaml-configured job: nothing to verify
            return

        socket_dir = self.get_user_file()
        exists, missing_on = check_file_exists(self._hosts, socket_dir, user)
        if exists:
            return

        raise CommandFailure(
            "{}: Server missing socket directory {} for user {}".format(
                missing_on, socket_dir, user))
    def test_super_block_version_basic(self):
        """JIRA ID: DAOS-3648.

        Test Description:
            Verify the superblock file exists under the scm_mount directory on
            the servers and contains the text 'version'.

        :avocado: tags=all,tiny,pr,ds_versioning,basic
        """
        # Locate the superblock file under the scm_mount directory
        superblock = os.path.join(
            self.server_managers[0].get_config_value("scm_mount"),
            "superblock")
        found, missing = check_file_exists(self.hostlist_servers, superblock)
        if not found:
            self.fail("{}: {} not found".format(missing, superblock))

        # Search the file for 'version' on each of the server hosts
        grep_cmd = "cat {} | grep -F \"version\"".format(superblock)
        exit_codes = pcmd(self.hostlist_servers, grep_cmd, timeout=20)

        # Passing requires that the only return code reported is 0
        all_passed = len(exit_codes) <= 1 and 0 in exit_codes
        if not all_passed:
            self.fail(
                "Was not able to find version in {} file".format(superblock))
Exemple #8
0
    def remove_mount_point(self):
        """Remove the dfuse directory from the hosts.

        The directory is removed with rm -rf only when check_file_exists()
        reports that it exists.

        Raises:
            CommandFailure: if the mount point is not specified or the
                directory could not be removed on one or more hosts

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, _ = general_utils.check_file_exists(
            self.hosts, self.mount_dir.value, directory=True)
        if dir_exists:
            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = general_utils.pcmd(self.hosts, cmd, timeout=30)
            # The results are keyed by return code, so a partial failure has
            # both a 0 entry and a non-zero entry.  Checking only for the
            # absence of 0 would silently ignore the hosts that failed.
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in ret_code.items()
                         if code != 0]))
                raise CommandFailure(
                    "Error removing the {} dfuse mount point on the following "
                    "hosts: {}".format(self.mount_dir.value, error_hosts))
Exemple #9
0
def run_agent(test, server_list, client_list=None):
    """Start daos agents on the specified hosts.

    Creates the agent configuration yaml file, verifies the domain socket
    directories for the current user, launches daos_agent through ssh on each
    client and then waits for each agent to report that it has started.

    This is temporary; presuming the agent will daemonize at some point and
    can be started and killed more appropriately.

    Args:
        test (Test): provides the tmp directory, the install prefix and the
            parameters for the agent configuration
        server_list (list): nodes acting as server nodes in the test
        client_list (list, optional): nodes acting as client nodes in the
            test. Defaults to None.

    Raises:
        AgentFailed: if there is an error starting the daos agents

    Returns:
        dict: the ssh subprocess session for each client, keyed by client

    """
    sessions = {}
    user = getpass.getuser()

    # if empty client list, 'test' is effectively client
    client_list = include_local_host(client_list)

    # Create the DAOS Agent configuration yaml file to pass
    # with daos_agent -o <FILE_NAME>
    agent_yaml = os.path.join(test.tmp, "daos_agent.yml")
    agent_config = DaosAgentConfig()
    agent_config.get_params(test)
    agent_config.hostlist.value = client_list

    access_point = ":".join((server_list[0], str(agent_config.port)))
    agent_config.access_points.value = access_point.split()

    agent_config.create_yaml(agent_yaml)

    # Verify the domain socket directory is present and owned by this user
    file_checks = (
        ("Server", server_list, "/var/run/daos_server"),
        ("Client", client_list, "/var/run/daos_agent"),
    )
    for host_type, host_list, directory in file_checks:
        status, nodeset = check_file_exists(host_list, directory, user)
        if not status:
            raise AgentFailed(
                "{}: {} missing directory {} for user {}.".format(
                    nodeset, host_type, directory, user))

    # launch the agent
    daos_agent_bin = os.path.join(test.prefix, "bin", "daos_agent")
    daos_agent_cmd = " ".join((daos_agent_bin, "-o", agent_yaml))
    print("<AGENT> Agent command: ", daos_agent_cmd)

    for client in client_list:
        # Run the full command (not just the binary) so that the yaml file
        # created above is actually passed to the agent
        sessions[client] = subprocess.Popen(
            ["ssh", client, "-o ConnectTimeout=10",
             "{} -i".format(daos_agent_cmd)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)

    # double check agent launched successfully
    timeout = 15
    pattern = "Using logfile"
    started_clients = []
    for client in client_list:
        print("<AGENT> Starting agent on {}".format(client))
        session = sessions[client]

        # Read the ssh output without blocking so the timeout can be enforced
        file_desc = session.stdout.fileno()
        flags = fcntl.fcntl(file_desc, fcntl.F_GETFL)
        fcntl.fcntl(file_desc, fcntl.F_SETFL, flags | os.O_NONBLOCK)
        start_time = time.time()
        expected_data = ""

        # poll() returns None only while the ssh process is still running; any
        # exit code (including 0) means the agent is no longer running
        while session.poll() is None:
            if time.time() - start_time > timeout:
                print("<AGENT>: {}".format(expected_data))
                raise AgentFailed("DAOS Agent didn't start! Agent reported:\n"
                                  "{}before we gave up waiting for it to "
                                  "start".format(expected_data))
            output = ""
            try:
                output = session.stdout.read()
            except IOError as excpn:
                # EAGAIN just means no output is available yet
                if excpn.errno != errno.EAGAIN:
                    raise AgentFailed(
                        "Error in starting daos_agent: {0}".format(str(excpn)))
                time.sleep(1)
                continue
            expected_data += output

            match = re.findall(pattern, output)
            if match:
                print("<AGENT> agent started on node {} in {} seconds".format(
                    client,
                    time.time() - start_time))
                # Remember this client so it can be stopped if a later client
                # fails to start
                started_clients.append(client)
                break

        if session.returncode is not None:
            print("<AGENT> uh-oh, in agent startup, the ssh that started the "
                  "agent on {} has exited with {}.\nStopping agents on "
                  "{}".format(client, session.returncode,
                              started_clients))
            # kill the ones we started
            stop_agent(sessions, started_clients)
            raise AgentFailed("Failed to start agent on {}".format(client))

    return sessions
Exemple #10
0
def run_agent(basepath, server_list, client_list=None):
    """Start daos agents on the specified hosts.

    Verifies the domain socket directories for the current user, launches
    daos_agent through ssh on each client and then waits for each agent to
    report that it has started.

    This is temporary; presuming the agent will daemonize at some point and
    can be started and killed more appropriately.

    Args:
        basepath (str): root directory for DAOS repo or installation; the
            .build_vars.json file is read from this directory
        server_list (list): nodes acting as server nodes in the test
        client_list (list, optional): nodes acting as client nodes in the test.
            Defaults to None, in which case the local host is used.

    Raises:
        AgentFailed: if there is an error starting the daos agents

    Returns:
        dict: the ssh subprocess session for each client, keyed by client

    """
    sessions = {}
    user = getpass.getuser()

    # if empty client list, 'self' is effectively client
    if client_list is None:
        client_list = [socket.gethostname().split('.', 1)[0]]

    # Verify the domain socket directory is present and owned by this user
    file_checks = (
        ("Server", server_list, "/var/run/daos_server"),
        ("Client", client_list, "/var/run/daos_agent"),
    )
    for host_type, host_list, directory in file_checks:
        status, nodeset = check_file_exists(host_list, directory, user)
        if not status:
            raise AgentFailed(
                "{}: {} missing directory {} for user {}.".format(
                    nodeset, host_type, directory, user))

    # launch the agent
    with open(os.path.join(basepath, ".build_vars.json")) as json_vars:
        build_vars = json.load(json_vars)
    daos_agent_bin = os.path.join(build_vars["PREFIX"], "bin", "daos_agent")

    for client in client_list:
        sessions[client] = subprocess.Popen(
            ["ssh", client, "-o ConnectTimeout=10",
             "{} -i".format(daos_agent_bin)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)

    # double check agent launched successfully
    timeout = 15
    pattern = "Using logfile"
    started_clients = []
    for client in client_list:
        session = sessions[client]

        # Read the ssh output without blocking so the timeout can be enforced
        file_desc = session.stdout.fileno()
        flags = fcntl.fcntl(file_desc, fcntl.F_GETFL)
        fcntl.fcntl(file_desc, fcntl.F_SETFL, flags | os.O_NONBLOCK)
        start_time = time.time()
        expected_data = ""

        # poll() returns None only while the ssh process is still running; any
        # exit code (including 0) means the agent is no longer running
        while session.poll() is None:
            if time.time() - start_time > timeout:
                print("<AGENT>: {}".format(expected_data))
                raise AgentFailed("DAOS Agent didn't start!  Agent reported:\n"
                                  "{}before we gave up waiting for it to "
                                  "start".format(expected_data))
            output = ""
            try:
                output = session.stdout.read()
            except IOError as excpn:
                # EAGAIN just means no output is available yet
                if excpn.errno != errno.EAGAIN:
                    raise AgentFailed(
                        "Error in starting daos_agent: {0}".format(str(excpn)))
                time.sleep(1)
                continue
            expected_data += output

            match = re.findall(pattern, output)
            if match:
                print("<AGENT> agent started on node {} in {} seconds".format(
                    client,
                    time.time() - start_time))
                # Remember this client so it can be stopped if a later client
                # fails to start
                started_clients.append(client)
                break

        if session.returncode is not None:
            print("<AGENT> uh-oh, in agent startup, the ssh that started the "
                  "agent on {} has exited with {}.\nStopping agents on "
                  "{}".format(client, session.returncode,
                              started_clients))
            # kill the ones we started
            stop_agent(sessions, started_clients)
            raise AgentFailed("Failed to start agent on {}".format(client))

    return sessions