Example #1
    def start_dfuse(self, hosts, pool, container):
        """Create a DfuseCommand object and use it to start Dfuse.

        Args:
            hosts (list): list of hosts on which to start Dfuse
            pool (TestPool): pool to use with Dfuse
            container (TestContainer): container to use with Dfuse
        """
        self.dfuse = Dfuse(hosts, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.")
Example #2
    def start_dfuse(self, hosts, pool=None, container=None, mount_dir=None):
        """Create a DfuseCommand object and use it to start Dfuse.

        Args:
            hosts (list): list of hosts on which to start Dfuse
            pool (TestPool, optional): pool to use with Dfuse
            container (TestContainer, optional): container to use with Dfuse
            mount_dir (str, optional): updated mount dir name. Defaults to None.
        """
        self.dfuse = Dfuse(hosts, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params
        if mount_dir:
            self.dfuse.mount_dir.update(mount_dir)
        if pool:
            self.dfuse.set_dfuse_params(pool)
        if container:
            self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.")
Example #3
    def start_dfuse(self, pool):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (TestPool): pool to use with dfuse
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)
        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        # create dfuse mount point
        cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
        params = self.srun_params
        params["export"] = "all"
        params["ntasks-per-node"] = 1
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients), cmd,
                                  params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Dfuse mountpoint {} not created>>".format(
                    self.dfuse.mount_dir.value))
        cmd = self.dfuse.__str__()
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients), cmd,
                                  params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Dfuse failed to start>>")
Example #4
    def _start_dfuse(self, pool, container):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (TestPool): pool to use with dfuse
            container (TestContainer): container to mount with dfuse
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.dfuse_hosts, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")
Example #5
    def start_dfuse(self, pool=None):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (TestPool, optional): pool to use if dfuse is to be started
                with the pool uuid option. Defaults to None.
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        if pool:
            self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")
Example #6
    def start_dfuse(self, count):
        """Create a DfuseCommand object to start dfuse.

        Args:
            count (int): container index
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.mount_dir.update("/tmp/" + self.pool.uuid + "_daos_dfuse" +
                                    str(count))
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")
Example #7
    def start_dfuse(self, pool):
        """Create dfuse start command line for slurm.

        Args:
            pool (TestPool): pool to use with dfuse

        Returns:
            tuple: a Dfuse object and the list of dfuse commands to add to
                the job script
        """
        # Get Dfuse params
        dfuse = Dfuse(self.hostlist_clients, self.tmp)
        dfuse.get_params(self)
        # update dfuse params; mountpoint for each container
        unique = get_random_string(5, self.used)
        self.used.append(unique)
        mount_dir = dfuse.mount_dir.value + unique
        dfuse.mount_dir.update(mount_dir)
        dfuse.set_dfuse_params(pool)
        dfuse.set_dfuse_cont_param(self.get_container(pool))

        dfuse_start_cmds = [
            "mkdir -p {}".format(dfuse.mount_dir.value),
            "{}".format(dfuse.__str__()),
            "df -h {}".format(dfuse.mount_dir.value)
        ]
        return dfuse, dfuse_start_cmds
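A sketch of how the returned tuple might be consumed while assembling a slurm batch script; job_cmds and the placeholder application command are illustrative names, not taken from the example.

    # Hypothetical caller (sketch): fold the dfuse start commands into the
    # list of lines that will become the slurm job script.
    dfuse, dfuse_start_cmds = self.start_dfuse(pool)
    job_cmds = []
    job_cmds.extend(dfuse_start_cmds)            # mkdir, dfuse mount, df -h
    job_cmds.append("my_app --path {}".format(   # placeholder application
        dfuse.mount_dir.value))
    # The dfuse object is kept so the mount can be stopped/cleaned up later.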
Example #8
    def start_dfuse(self, pool):
        """Create dfuse start command line for slurm.

        Args:
            pool (TestPool): pool to use with dfuse

        Returns:
            tuple: a Dfuse object and the list of dfuse commands to add to
                the job script
        """
        commands = []
        # Get Dfuse params
        dfuse = Dfuse(self.hostlist_clients, self.tmp)
        dfuse.get_params(self)
        # update dfuse params; mountpoint for each container
        unique = get_random_string(5, self.used)
        self.used.append(unique)
        mount_dir = dfuse.mount_dir.value + unique
        dfuse.mount_dir.update(mount_dir)
        dfuse.set_dfuse_params(pool)
        dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
        # create dfuse mount point
        commands.append(slurm_utils.srun_str(
            hosts=None,
            cmd="mkdir -p {}".format(dfuse.mount_dir.value),
            srun_params=None))
        commands.append(slurm_utils.srun_str(
            hosts=None,
            cmd="{}".format(dfuse.__str__()),
            srun_params=None))
        commands.append("sleep 10")
        commands.append(slurm_utils.srun_str(
            hosts=None,
            cmd="df -h {}".format(dfuse.mount_dir.value),
            srun_params=None))
        return dfuse, commands
Example #9
def start_dfuse(self,
                pool,
                container,
                nodesperjob,
                resource_mgr=None,
                name=None,
                job_spec=None):
    """Create dfuse start command line for slurm.

    Args:
        self (obj): soak test obj
        pool (TestPool): pool to use with dfuse
        container (TestContainer): container to use with dfuse
        nodesperjob (int): number of nodes per job
        resource_mgr (str, optional): resource manager ("SLURM"). Defaults
            to None.
        name (str, optional): job name used in the dfuse log file name.
            Defaults to None.
        job_spec (str, optional): dfuse param namespace under /run. Defaults
            to None.

    Returns:
        tuple: a Dfuse object and the list of dfuse commands to add to the
            job script
    """
    # Get Dfuse params
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.namespace = os.path.join(os.sep, "run", job_spec, "dfuse", "*")
    dfuse.get_params(self)
    # update dfuse params; mountpoint for each container
    unique = get_random_string(5, self.used)
    self.used.append(unique)
    mount_dir = dfuse.mount_dir.value + unique
    dfuse.mount_dir.update(mount_dir)
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(container)
    dfuse_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + name + "_${SLURM_JOB_NODELIST}_"
        "" + "${SLURM_JOB_ID}_" + "daos_dfuse_" + unique)
    dfuse_env = "export D_LOG_MASK=ERR;export D_LOG_FILE={}".format(dfuse_log)
    dfuse_start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        "clush -S -w $SLURM_JOB_NODELIST \"cd {};{};{}\"".format(
            dfuse.mount_dir.value, dfuse_env, dfuse.__str__()),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]
    if resource_mgr == "SLURM":
        cmds = []
        for cmd in dfuse_start_cmds:
            if cmd.startswith("clush") or cmd.startswith("sleep"):
                cmds.append(cmd)
            else:
                cmds.append(get_srun_cmd(cmd, nodesperjob))
        dfuse_start_cmds = cmds
    return dfuse, dfuse_start_cmds
Example #10
    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        #self.container = TestContainer(self.pool)
        #self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)
        try:
            container = subprocess.Popen(cmd,
                                         stdout=subprocess.PIPE,
                                         shell=True)
            (output, err) = container.communicate()
            self.log.info("Container created with UUID %s", output.split()[3])

        except subprocess.CalledProcessError as err:
            self.fail("Container create failed:{}".format(err))

        return output.split()[3]
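Indexing output.split()[3] is brittle if the "daos cont create" output format changes. A slightly more defensive alternative, shown as a sketch only, is to extract the UUID with a regular expression; it assumes 'output' is the decoded stdout string of the create command.

import re

def parse_cont_uuid(output):
    """Return the first UUID found in 'daos cont create' output, or None.

    Hypothetical helper: 'output' is assumed to be the decoded stdout string.
    """
    match = re.search(
        r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-"
        r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}", output)
    return match.group(0) if match else None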
Example #11
    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Unable to launch Dfuse.\n")
Example #12
    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")
Example #13
def start_dfuse(self, pool, nodesperjob, resource_mgr=None):
    """Create dfuse start command line for slurm.

    Args:
        self (obj): soak test obj
        pool (TestPool): pool to use with dfuse
        nodesperjob (int): number of nodes per job
        resource_mgr (str, optional): resource manager ("SLURM"). Defaults
            to None.

    Returns:
        tuple: a Dfuse object and the list of dfuse commands to add to the
            job script
    """
    # Get Dfuse params
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)
    # update dfuse params; mountpoint for each container
    unique = get_random_string(5, self.used)
    self.used.append(unique)
    add_containers(self, pool)
    mount_dir = dfuse.mount_dir.value + unique
    dfuse.mount_dir.update(mount_dir)
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.container[-1])

    dfuse_start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        "clush -w $SLURM_JOB_NODELIST \"cd {};{}\"".format(
            dfuse.mount_dir.value, dfuse.__str__()),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]
    if resource_mgr == "SLURM":
        cmds = []
        for cmd in dfuse_start_cmds:
            if cmd.startswith("clush") or cmd.startswith("sleep"):
                cmds.append(cmd)
            else:
                cmds.append(get_srun_cmd(cmd, nodesperjob))
        dfuse_start_cmds = cmds
    return dfuse, dfuse_start_cmds
Example #14
class RootContainerTest(TestWithServers):
    """Base Dfuse Container check test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RootContainerTest object."""
        super(RootContainerTest, self).__init__(*args, **kwargs)
        self.pool = []
        self.container = []
        self.tmp_file_count = self.params.get(
            "tmp_file_count", '/run/container/*')
        self.cont_count = self.params.get(
            "cont_count", '/run/container/*')
        self.tmp_file_size = self.params.get(
            "tmp_file_size", '/run/container/*')
        self.tmp_file_name = self.params.get(
            "tmp_file_name", '/run/container/*')
        # device where the pools and containers are created
        self.device = "scm"

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(RootContainerTest, self).setUp()
        self.dfuse = None
        self.dfuse_hosts = None

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(RootContainerTest, self).tearDown()

    def _create_pool(self):
        """Create a TestPool object to use with ior.
        """
        # Get the pool params
        pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        pool.get_params(self)
        # Create a pool
        pool.create()
        self.pool.append(pool)
        return pool

    def _create_cont(self, pool, path=None):
        """Create a TestContainer object to be used to create container.

           Args:
               pool (TestPool): pool object
               path (str): Unified namespace path for container
        """
        # Get container params
        container = TestContainer(pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)
        if path is not None:
            container.path.update(path)
        # create container
        container.create()
        self.container.append(container)
        return container

    def _start_dfuse(self, pool, container):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (TestPool): pool to use with dfuse
            container (TestContainer): container to mount with dfuse
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.dfuse_hosts, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_rootcontainer(self):
        """Jira ID: DAOS-3782.

        Test Description:
            Purpose of this test is to try and create a container and
            mount it over dfuse and use it as a root container and create
            subcontainers underneath it and insert several files and see
            if they can be accessed using ls and cd. Verify the pool size
            reflects the space occupied by container. Try to remove the
            files and containers and see the space is reclaimed.
            Test the above procedure with 100 sub containers.
            Test the above procedure with 5 pools and 50 containers
            spread across the pools.
        :avocado: tags=all,hw,small,full_regression,container
        :avocado: tags=rootcontainer
        """

        # Create a pool and start dfuse.
        pool = self._create_pool()
        container = self._create_cont(pool)
        self.dfuse_hosts = self.agent_managers[0].hosts
        # mount fuse
        self._start_dfuse(pool, container)
        # Create another container and add it as sub container under
        # root container
        sub_container = str(self.dfuse.mount_dir.value + "/cont0")
        container = self._create_cont(pool, path=sub_container)
        # Insert files into root container
        self.insert_files_and_verify("")
        # Insert files into sub container
        self.insert_files_and_verify("cont0")
        # Create 100 subcontainers and verify the temp files
        self.verify_create_delete_containers(pool, 100)
        self.verify_multi_pool_containers()

    def verify_multi_pool_containers(self):
        """Create several pools and containers and mount it
           under the root container and verify they're
           accessible.
        """
        pool_count = self.params.get("pool_count", "/run/pool/*")
        for i in range(pool_count):
            pool = self._create_pool()
            for j in range(self.cont_count):
                cont_name = "/cont_{}{}".format(i, j)
                sub_cont = str(self.dfuse.mount_dir.value + cont_name)
                self._create_cont(pool=pool, path=sub_cont)
                self.insert_files_and_verify(cont_name)

    def verify_create_delete_containers(self, pool, cont_count):
        """Create multiple containers and multiple multi-mb files
           in each of them and verify the space usage.
           Destroy half of the containers and verify the space
           usage is reclaimed.

           Args:
               cont_count (int): Number of containers to be created.
        """
        self.log.info("Verifying multiple container create delete")
        pool_space_before = pool.get_pool_free_space(self.device)
        self.log.info("Pool space before = %s", pool_space_before)
        for i in range(cont_count):
            sub_cont = str(self.dfuse.mount_dir.value + "/cont{}".format(i+1))
            self._create_cont(pool, path=sub_cont)
            self.insert_files_and_verify("cont{}".format(i+1))
        expected = pool_space_before - \
                   cont_count * self.tmp_file_count * self.tmp_file_size
        pool_space_after = pool.get_pool_free_space(self.device)
        self.log.info("Pool space <= Expected")
        self.log.info("%s <= %s", pool_space_after, expected)
        self.assertTrue(pool_space_after <= expected)
        self.log.info("Destroying half of the containers = %s",
                      cont_count//2)
        for i in range(cont_count // 2):
            self.container[-1].destroy(1)
            self.container.pop()
        expected = pool_space_after + \
                   ((cont_count // 2) * self.tmp_file_count *\
                    self.tmp_file_size)
        pool_space_after_cont_destroy = \
                   pool.get_pool_free_space(self.device)
        self.log.info("After container destroy")
        self.log.info("Free Pool space >= Expected")
        self.log.info("%s >= %s", pool_space_after_cont_destroy, expected)
        self.assertTrue(pool_space_after_cont_destroy >= expected)

    def insert_files_and_verify(self, container_name):
        """ Insert files into the specific container and verify
            they're navigable and accessible.

        Args:
            container_name: Name of the POSIX Container
            file_name_prefix: Prefix of the file name that will be created
            no_of_files: Number of files to be created iteratively

        Return:
            None
        """
        cont_dir = self.dfuse.mount_dir.value
        if container_name:
            cont_dir = "{}/{}".format(cont_dir, container_name)

        cmds = []
        ls_cmds = []

        for i in range(self.tmp_file_count):
            # Create 40 MB files
            file_name = "{}{}".format(self.tmp_file_name, i+1)
            cmd = "head -c {} /dev/urandom > {}/{}".format(
                self.tmp_file_size, cont_dir, file_name)
            ls_cmds.append("ls {}".format(file_name))
            cmds.append(cmd)
        self._execute_cmd(";".join(cmds))

        cmds = []
        # Run ls to verify the temp files are actually created
        cmds = ["cd {}".format(cont_dir)]
        cmds.extend(ls_cmds)
        self._execute_cmd(";".join(cmds))

    def _execute_cmd(self, cmd):
        """Execute command on the host clients

           Args:
               cmd (str): Command to run
        """

        try:
            # execute bash cmds
            ret = pcmd(
                self.dfuse_hosts, cmd, verbose=True, timeout=30)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in
                         ret.items() if code != 0]))
                raise CommandFailure(
                    "Error running '{}' on the following "
                    "hosts: {}".format(cmd, error_hosts))

        # report error if any command fails
        except CommandFailure as error:
            self.log.error("DfuseSparseFile Test Failed: %s",
                           str(error))
            self.fail("Test was expected to pass but "
                      "it failed.\n")
        return ret
Example #15
class DfuseTestBase(TestWithServers):
    """Runs HDF5 vol test suites.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a TestWithServers object."""
        super(DfuseTestBase, self).__init__(*args, **kwargs)
        self.dfuse = None

    def stop_job_managers(self):
        """Stop the test job manager followed by dfuse.

        Returns:
            list: a list of errors raised stopping the job managers and dfuse

        """
        error_list = super(DfuseTestBase, self).stop_job_managers()
        try:
            self.stop_dfuse()
        except CommandFailure as error:
            error_list.append("Error stopping dfuse: {}".format(error))
        return error_list

    def start_dfuse(self, hosts, pool=None, container=None, mount_dir=None):
        """Create a DfuseCommand object and use it to start Dfuse.

        Args:
            hosts (list): list of hosts on which to start Dfuse
            pool (TestPool, optional): pool to use with Dfuse
            container (TestContainer, optional): container to use with Dfuse
            mount_dir (str, optional): updated mount dir name. Defaults to None.
        """
        self.dfuse = Dfuse(hosts, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params
        if mount_dir:
            self.dfuse.mount_dir.update(mount_dir)
        if pool:
            self.dfuse.set_dfuse_params(pool)
        if container:
            self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.")

    def stop_dfuse(self):
        """Stop Dfuse and unset the DfuseCommand object."""
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
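A sketch of a test class built on DfuseTestBase; the add_pool/add_container helpers and execute_cmd are assumptions about the surrounding framework, not shown in the example.

class MyDfuseTest(DfuseTestBase):
    """Hypothetical test that mounts dfuse and lists the mount point."""

    def test_mount(self):
        """Mount dfuse for a new pool/container and run ls (sketch only)."""
        # add_pool()/add_container() stand in for whatever helpers the real
        # framework provides to create a pool and a container.
        pool = self.add_pool()
        container = self.add_container(pool)
        self.start_dfuse(self.hostlist_clients, pool, container)
        self.execute_cmd("ls -l {}".format(self.dfuse.mount_dir.value))
        # stop_dfuse() is also invoked from stop_job_managers() at teardown.
        self.stop_dfuse()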
Example #16
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        #self.container = TestContainer(self.pool)
        #self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)
        try:
            container = subprocess.Popen(cmd,
                                         stdout=subprocess.PIPE,
                                         shell=True)
            (output, err) = container.communicate()
            self.log.info("Container created with UUID %s", output.split()[3])

        except subprocess.CalledProcessError as err:
            self.fail("Container create failed:{}".format(err))

        return output.split()[3]

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str): path to the interception library. Shall be used
                             only for POSIX through DFUSE.
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            if self.ior_cmd.transfer_size.value == "256B":
                self.cancelForTicket("DAOS-3449")
            self.start_dfuse()
            self.ior_cmd.test_file.update(self.dfuse.mount_dir.value +
                                          "/testfile")

        out = self.run_ior(self.get_job_manager_command(), self.processes,
                           intercept)

        return out

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the MPI job manager command object for running IOR

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DAOS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): MPI job manager command object
            processes (int): number of host processes
            intercept (str, optional): path to the interception library.
                Defaults to None.
        """
        env = self.ior_cmd.get_default_env(str(manager), self.tmp,
                                           self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe, otherwise it is checked against SCM
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
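A sketch of a test method that drives the IorTestBase helpers above; only methods defined in the example are used, but the test name is illustrative.

    def test_ior_basic(self):
        """Hypothetical caller: run IOR and verify pool usage (sketch only)."""
        # Create the pool up front so its free space can be captured before
        # the IOR run; run_ior_with_pool() would otherwise create it lazily.
        self.create_pool()
        original_pool_info = self.pool.pool.pool_query()
        self.run_ior_with_pool()
        self.verify_pool_size(original_pool_info, self.processes)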
Example #17
class ParallelIo(FioBase):
    """Base Parallel IO test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ParallelIo object."""
        super(ParallelIo, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.cont_count = None
        self.container = []

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(ParallelIo, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(ParallelIo, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)

        # create container
        container.create()
        self.container.append(container)

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to mount dfuse and verify multiple
            containers using fio.
        Use cases:
            Mount dfuse using pool uuid.
            Create multiple containers under that dfuse mount point.
            Check those containers are accessible from that mount point.
            Perform io to those containers using FIO
            Delete one of the containers
            Check if dfuse is still running. If not, fail the test and exit.
            Otherwise, try accessing the deleted container.
            This should fail.
            Check dfuse again.
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,parallelio
        """
        # get test params for cont and pool count
        self.cont_count = self.params.get("cont_count", '/run/container/*')

        threads = []

        # Create a pool and start dfuse.
        self.create_pool()
        self.start_dfuse()
        # create multiple containers in parallel
        cont_threads = []
        for _ in range(self.cont_count):
            cont_thread = threading.Thread(target=self.create_cont)
            cont_threads.append(cont_thread)
        # start container create job
        for cont_job in cont_threads:
            cont_job.start()
        # wait for container create to finish
        for cont_job in cont_threads:
            cont_job.join()

        # check if all the created containers can be accessed and perform
        # io on each container using fio in parallel
        for _, cont in enumerate(self.container):
            dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
            cmd = u"ls -a {}".format(dfuse_cont_dir)
            try:
                # execute bash cmds
                ret_code = general_utils.pcmd(
                    self.hostlist_clients, cmd, timeout=30)
                if 0 not in ret_code:
                    error_hosts = NodeSet(
                        ",".join(
                            [str(node_set) for code, node_set in
                             ret_code.items() if code != 0]))
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))
            # report error if any command fails
            except CommandFailure as error:
                self.log.error("ParallelIo Test Failed: %s",
                               str(error))
                self.fail("Test was expected to pass but "
                          "it failed.\n")
            # run fio on all containers
            thread = threading.Thread(target=self.execute_fio, args=(
                self.dfuse.mount_dir.value + "/" + cont.uuid, False))
            threads.append(thread)
            thread.start()

        # wait for all fio jobs to be finished
        for job in threads:
            job.join()

        # destroy first container
        container_to_destroy = self.container[0].uuid
        self.container[0].destroy(1)

        # check dfuse if it is running fine
        self.dfuse.check_running()

        # try accessing destroyed container, it should fail
        try:
            self.execute_fio(self.dfuse.mount_dir.value + "/" + \
                container_to_destroy, False)
            self.fail("Fio was able to access destroyed container: {}".\
                format(self.container[0].uuid))
        except CommandFailure as error:
            self.log.info("This run is expected to fail")

        # check dfuse is still running after attempting to access the
        # deleted container
        self.dfuse.check_running()
Example #18
class SoakTestBase(TestWithServers):
    # pylint: disable=too-many-public-methods
    """Execute DAOS Soak test cases.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a SoakBase object."""
        super(SoakTestBase, self).__init__(*args, **kwargs)
        self.failed_job_id_list = None
        self.test_log_dir = None
        self.exclude_slurm_nodes = None
        self.loop = None
        self.log_dir = None
        self.outputsoakdir = None
        self.test_name = None
        self.local_pass_dir = None
        self.dfuse = None
        self.test_timeout = None
        self.end_time = None
        self.job_timeout = None
        self.nodesperjob = None
        self.task_list = None
        self.soak_results = None
        self.srun_params = None
        self.pool = None
        self.container = None
        self.test_iteration = None
        self.h_list = None
        self.harasser_joblist = None
        self.harasser_results = None
        self.harasser_timeout = None
        self.all_failed_jobs = None
        self.username = None

    def setUp(self):
        """Define test setup to be done."""
        self.log.info("<<setUp Started>> at %s", time.ctime())
        # The daos agents are started by the job scripts, not the harness
        self.setup_start_servers = True
        self.setup_start_agents = False
        super(SoakTestBase, self).setUp()
        self.username = getuser()
        # Initialize loop param for all tests
        self.loop = 1
        self.exclude_slurm_nodes = []
        # Setup logging directories for soak logfiles
        # self.output dir is an avocado directory .../data/
        self.log_dir = self.params.get("logdir", "/run/*")
        self.outputsoakdir = self.outputdir + "/soak"
        # Create the remote log directories on all client nodes
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        # Fail if slurm partition daos_client is not defined
        if not self.client_partition:
            raise SoakTestError(
                "<<FAILED: Partition is not correctly setup for daos "
                "slurm partition>>")
        # Check if the server nodes are in the client list;
        # this will happen when only one partition is specified
        for host_server in self.hostlist_servers:
            if host_server in self.hostlist_clients:
                self.hostlist_clients.remove(host_server)
                self.exclude_slurm_nodes.append(host_server)
        self.log.info("<<Updated hostlist_clients %s >>",
                      self.hostlist_clients)
        if not self.hostlist_clients:
            self.fail("There are no nodes that are client only;"
                      "check if the partition also contains server nodes")

        # Include test node for log cleanup; remove from client list
        local_host_list = include_local_host(None)
        self.exclude_slurm_nodes.extend(local_host_list)

        # Start an agent on the test control host to enable API calls for
        # reserved pool and containers.  The test control host should be the
        # last host in the hostlist_clients list.
        agent_groups = {self.server_group: local_host_list}
        self.start_agents(agent_groups)

    def pre_tear_down(self):
        """Tear down any test-specific steps prior to running tearDown().

        Returns:
            list: a list of error strings to report after all tear down
            steps have been attempted

        """
        errors = []
        # clear out any jobs in squeue;
        if self.failed_job_id_list:
            self.log.info("<<Cancel jobs in queue with ids %s >>",
                          self.failed_job_id_list)
            status = process.system("scancel --partition {} -u {}".format(
                self.client_partition, self.username))
            if status > 0:
                errors.append("Failed to cancel jobs {}".format(
                    self.failed_job_id_list))
        if self.all_failed_jobs:
            errors.append("SOAK FAILED: The following jobs failed {} ".format(
                " ,".join(str(j_id) for j_id in self.all_failed_jobs)))

        # One last attempt to copy any logfiles from client nodes
        try:
            self.get_remote_logs()
        except SoakTestError as error:
            self.log.info("Remote copy failed with %s", error)
        # daos_agent is always started on this node when start agent is false
        if not self.setup_start_agents:
            self.hostlist_clients = [socket.gethostname().split('.', 1)[0]]
        return errors

    def tearDown(self):
        """Define tearDown and clear any left over jobs in squeue."""
        # Perform any test-specific tear down steps and collect any
        # reported errors
        self.log.info("<<tearDown Started>> at %s", time.ctime())
        super(SoakTestBase, self).tearDown()

    def job_done(self, args):
        """Call this function when a job is done.

        Args:
            args (dict): "handle" -- the job ID;
                "state" -- string indicating the job completion status
        """
        self.soak_results[args["handle"]] = args["state"]

    def add_pools(self, pool_names):
        """Create a list of pools that the various tests use for storage.

        Args:
            pool_names: list of pool namespaces from yaml file
                        /run/<test_params>/poollist/*
        """
        for pool_name in pool_names:
            path = "".join(["/run/", pool_name, "/*"])
            # Create a pool and add it to the overall list of pools
            self.pool.append(
                TestPool(self.context,
                         self.log,
                         dmg_command=self.get_dmg_command()))
            self.pool[-1].namespace = path
            self.pool[-1].get_params(self)
            self.pool[-1].create()
            self.log.info("Valid Pool UUID is %s", self.pool[-1].uuid)

    def get_remote_logs(self):
        """Copy files from remote dir to local dir.

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        # copy the files from the remote
        # TO-DO: change scp
        this_host = socket.gethostname()
        rsync_str = "rsync -avtr --min-size=1B"
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "bash -c \"{0} {1} {2}:{1}/.. && rm -rf {1}/*\"".format(
                rsync_str, self.test_log_dir, this_host), self.srun_params)
        if result.exit_status == 0:
            cmd = "cp -R -p {0}/ \'{1}\'; rm -rf {0}/*".format(
                self.test_log_dir, self.outputsoakdir)
            try:
                result = process.run(cmd, shell=True, timeout=30)
            except process.CmdError as error:
                raise SoakTestError("<<FAILED: Soak remote logfiles not copied"
                                    "to avocado data dir {} - check /tmp/soak "
                                    "on nodes {}>>".format(
                                        error, self.hostlist_clients))
        else:
            raise SoakTestError("<<FAILED: Soak remote logfiles not copied "
                                "from clients>>: {}".format(
                                    self.hostlist_clients))

    def is_harasser(self, harasser):
        """Check if harasser is defined in yaml.

        Args:
            harasser (str): name of the harasser to check

        Returns: bool

        """
        return self.h_list and harasser in self.h_list

    def launch_harassers(self, harassers, pools):
        """Launch any harasser tests if defined in yaml.

        Args:
            harassers (list): list of harassers to launch
            pools (list): list of TestPool objects

        """
        job = None
        # Launch harasser after one complete pass
        for harasser in harassers:
            if harasser == "rebuild":
                method = self.launch_rebuild
                ranks = self.params.get("ranks_to_kill",
                                        "/run/" + harasser + "/*")
                param_list = (ranks, pools)
                name = "REBUILD"
            if harasser in "snapshot":
                method = self.launch_snapshot
                param_list = ()
                name = "SNAPSHOT"
            else:
                raise SoakTestError(
                    "<<FAILED: Harasser {} is not supported. ".format(
                        harasser))
            job = threading.Thread(target=method, args=param_list, name=name)
            self.harasser_joblist.append(job)

        # start all harassers
        for job in self.harasser_joblist:
            job.start()

    def harasser_completion(self, timeout):
        """Complete harasser jobs.

        Args:
            timeout (int): timeout in secs

        Returns:
            bool: status

        """
        status = True
        for job in self.harasser_joblist:
            job.join(timeout)
        for job in self.harasser_joblist:
            if job.is_alive():
                self.log.error("<< HARASSER is alive %s FAILED to join>> ",
                               job.name)
                status &= False
        # Check if the completed jobs passed
        for harasser, harasser_status in self.harasser_results.items():
            if not harasser_status:
                self.log.error("<< HARASSER %s FAILED>> ", harasser)
                status = False
        self.harasser_joblist = []
        return status

    def launch_rebuild(self, ranks, pools):
        """Launch the rebuild process.

        Args:
            ranks (list): Server ranks to kill
            pools (list): list of TestPool obj

        """
        self.log.info("<<Launch Rebuild>> at %s", time.ctime())
        status = True
        for pool in pools:
            # Kill the server
            try:
                pool.start_rebuild(ranks, self.d_log)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Rebuild failed to start", exc_info=error)
                status &= False
                break
            # Wait for rebuild to start
            try:
                pool.wait_for_rebuild(True)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Rebuild failed waiting to start",
                               exc_info=error)
                status &= False
                break
            # Wait for rebuild to complete
            try:
                pool.wait_for_rebuild(False)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Rebuild failed waiting to finish",
                               exc_info=error)
                status &= False
                break
        with H_LOCK:
            self.harasser_results["REBUILD"] = status

    def launch_snapshot(self):
        """Create a basic snapshot of the reserved pool."""
        self.log.info("<<Launch Snapshot>> at %s", time.ctime())
        status = True
        # Create container
        container = TestContainer(self.pool[0])
        container.namespace = "/run/container_reserved/*"
        container.get_params(self)
        container.create()
        container.open()
        obj_cls = self.params.get("object_class", '/run/container_reserved/*')

        # write data to object
        data_pattern = get_random_string(500)
        datasize = len(data_pattern) + 1
        dkey = "dkey"
        akey = "akey"
        tx_handle = container.container.get_new_tx()
        obj = container.container.write_an_obj(data_pattern,
                                               datasize,
                                               dkey,
                                               akey,
                                               obj_cls=obj_cls,
                                               txn=tx_handle)
        container.container.commit_tx(tx_handle)
        obj.close()
        # Take a snapshot of the container
        snapshot = DaosSnapshot(self.context)
        try:
            snapshot.create(container.container.coh, tx_handle)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Snapshot failed", exc_info=error)
            status &= False
        if status:
            self.log.info("Snapshot Created")
            # write more data to object
            data_pattern2 = get_random_string(500)
            datasize2 = len(data_pattern2) + 1
            dkey = "dkey"
            akey = "akey"
            obj2 = container.container.write_an_obj(data_pattern2,
                                                    datasize2,
                                                    dkey,
                                                    akey,
                                                    obj_cls=obj_cls)
            obj2.close()
            self.log.info("Wrote additional data to container")
            # open the snapshot and read the data
            obj.open()
            snap_handle = snapshot.open(container.container.coh)
            try:
                data_pattern3 = container.container.read_an_obj(
                    datasize, dkey, akey, obj, txn=snap_handle.value)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Error when retrieving the snapshot data %s",
                               error)
                status &= False
            if status:
                # Compare the snapshot to the original written data.
                if data_pattern3.value != data_pattern:
                    self.log.error("Snapshot data miscompere")
                    status &= False
        # Destroy the snapshot
        try:
            snapshot.destroy(container.container.coh)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Failed to destroy snapshot %s", error)
            status &= False
        # cleanup
        container.close()
        container.destroy()
        with H_LOCK:
            self.harasser_results["SNAPSHOT"] = status

    def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
        """Create an IOR cmdline to run in slurm batch.

        Args:
            job_spec (str): ior job in yaml to run
            pool (TestPool): pool to use with ior
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            list: a list of [cmdline, log_name] pairs

        """
        commands = []

        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/*"
        # IOR job specs with a list of parameters; update each value
        api_list = self.params.get("api", ior_params + "*")
        tsize_list = self.params.get("transfer_size", ior_params + "*")
        bsize_list = self.params.get("block_size", ior_params + "*")
        oclass_list = self.params.get("daos_oclass", ior_params + "*")
        # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
        if self.is_harasser("rebuild"):
            oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
        # update IOR cmdline for each additional IOR obj
        for api in api_list:
            for b_size in bsize_list:
                for t_size in tsize_list:
                    for o_type in oclass_list:
                        ior_cmd = IorCommand()
                        ior_cmd.namespace = ior_params
                        ior_cmd.get_params(self)
                        if iteration is not None and iteration < 0:
                            ior_cmd.repetitions.update(1000000)
                        if self.job_timeout is not None:
                            ior_cmd.max_duration.update(self.job_timeout)
                        else:
                            ior_cmd.max_duration.update(10)
                        ior_cmd.api.update(api)
                        ior_cmd.block_size.update(b_size)
                        ior_cmd.transfer_size.update(t_size)
                        ior_cmd.daos_oclass.update(o_type)
                        ior_cmd.set_daos_params(self.server_group, pool)
                        # srun cmdline
                        nprocs = nodesperjob * ppn
                        env = ior_cmd.get_default_env("srun")
                        if ior_cmd.api.value == "MPIIO":
                            env["DAOS_CONT"] = ior_cmd.daos_cont.value
                        cmd = Srun(ior_cmd)
                        cmd.setup_command(env, None, nprocs)
                        cmd.ntasks_per_node.update(ppn)
                        log_name = "{}_{}_{}_{}".format(
                            api, b_size, t_size, o_type)
                        commands.append([cmd.__str__(), log_name])
                        self.log.info("<<IOR cmdline>>: %s \n",
                                      commands[-1].__str__())
        return commands

    def create_dfuse_cont(self, pool):
        """Create a TestContainer object to be used to create container.

        Args:
            pool (TestPool): pool in which to create the container

        Returns:
            str: container uuid

        """
        # TO-DO: use daos tool when available
        # This method assumes that the daos agent is running on the test node
        cmd = "daos cont create --pool={} --svc={} --type=POSIX".format(
            pool.uuid, ":".join([str(item) for item in pool.svc_ranks]))
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Dfuse container failed {}>>".format(error))
        self.log.info("Dfuse Container UUID = %s", result.stdout.split()[3])
        return result.stdout.split()[3]
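
    # Hedged sketch (not part of the original method): the positional split
    # above can be made more defensive with the regex used by
    # FioBase._create_cont later in this file, assuming the daos output
    # contains "created container <uuid>":
    #
    #   match = re.findall(r"created\s+container\s+([0-9a-f-]+)", result.stdout)
    #   if not match:
    #       raise SoakTestError("<<FAILED: could not parse container UUID>>")
    #   return match[0]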

    def start_dfuse(self, pool):
        """Create a DfuseCommand object to start dfuse.

        Args:

            pool (obj):   TestPool obj
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)
        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        # create dfuse mount point
        cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
        params = self.srun_params
        params["export"] = "all"
        params["ntasks-per-node"] = 1
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients), cmd,
                                  params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Dfuse mountpoint {} not created>>".format(
                    self.dfuse.mount_dir.value))
        cmd = self.dfuse.__str__()
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients), cmd,
                                  params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Dfuse failed to start>>")

    def create_fio_cmdline(self, job_spec, pool):
        """Create the FOI commandline.

        Args:

            job_spec (str): fio job in yaml to run
            pool (obj):   TestPool obj
            ppn(int): number of tasks to run on each node

        Returns:
            cmd(list): list of cmdlines

        """
        commands = []

        fio_namespace = "/run/{}".format(job_spec)
        # test params
        bs_list = self.params.get("blocksize", fio_namespace + "/soak/*")
        size_list = self.params.get("size", fio_namespace + "/soak/*")
        rw_list = self.params.get("rw", fio_namespace + "/soak/*")
        # Get the parameters for Fio
        fio_cmd = FioCommand()
        fio_cmd.namespace = "{}/*".format(fio_namespace)
        fio_cmd.get_params(self)
        for blocksize in bs_list:
            for size in size_list:
                for rw in rw_list:
                    # update fio params
                    fio_cmd.update("global", "blocksize", blocksize,
                                   "fio --name=global --blocksize")
                    fio_cmd.update("global", "size", size,
                                   "fio --name=global --size")
                    fio_cmd.update("global", "rw", rw,
                                   "fio --name=global --rw")
                    # start dfuse if api is POSIX
                    if fio_cmd.api.value == "POSIX":
                        # Connect to the pool, create container
                        # and then start dfuse
                        self.start_dfuse(pool)
                        fio_cmd.update("global", "directory",
                                       self.dfuse.mount_dir.value,
                                       "fio --name=global --directory")
                    # fio command
                    log_name = "{}_{}_{}".format(blocksize, size, rw)
                    commands.append([fio_cmd.__str__(), log_name])
                    self.log.info("<<FIO cmdline>>: %s \n", commands[-1])
        return commands

    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Create a slurm batch script that will execute a list of cmdlines.

        Args:
            commands(list): commandlines and cmd specific log_name
            job(str): the job name that will be defined in the slurm script
            ppn(int): number of tasks to run on each node
            nodesperjob(int): number of nodes per job

        Returns:
            script_list: list of slurm batch scripts

        """
        self.log.info("<<Build Script>> at %s", time.ctime())
        script_list = []

        # Start the daos_agent in the batch script for now
        # TO-DO:  daos_agents start with systemd
        agent_launch_cmds = [
            "mkdir -p {}".format(os.environ.get("DAOS_TEST_LOG_DIR"))
        ]
        agent_launch_cmds.append(" ".join(
            [str(self.agent_managers[0].manager.job), "&"]))

        # Create the sbatch script for each cmdline
        used = []
        for cmd, log_name in commands:
            output = os.path.join(
                self.test_log_dir, "%N_" + self.test_name + "_" + job +
                "_%j_%t_" + str(ppn * nodesperjob) + "_" + log_name + "_")
            error = os.path.join(
                self.test_log_dir,
                "%N_" + self.test_name + "_" + job + "_%j_%t_" +
                str(ppn * nodesperjob) + "_" + log_name + "_ERROR_")
            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
                "error": str(error)
            }
            # include the cluster specific params
            sbatch.update(self.srun_params)
            unique = get_random_string(5, used)
            script = slurm_utils.write_slurm_script(self.test_log_dir, job,
                                                    output, nodesperjob,
                                                    agent_launch_cmds + [cmd],
                                                    unique, sbatch)
            script_list.append(script)
            used.append(unique)
        return script_list
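
    # Informational note on the output/error patterns above: "%N", "%j" and
    # "%t" are standard slurm filename placeholders for the node name, the job
    # ID and the task ID, so each task on each node gets its own stdout and
    # stderr file whose name also encodes the job, the total task count and
    # the per-cmdline log_name.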

    def job_setup(self, job, pool):
        """Create the cmdline needed to launch job.

        Args:
            job(str): single job from test params list of jobs to run
            pool (obj): TestPool obj

        Returns:
            job_cmdlist: list of cmdlines that can be launched
                         by the specified job manager

        """
        job_cmdlist = []
        commands = []
        scripts = []
        nodesperjob = []
        self.log.info("<<Job_Setup %s >> at %s", self.test_name, time.ctime())
        for npj in self.nodesperjob:
            # nodesperjob = -1 indicates to use all nodes in client hostlist
            if npj < 0:
                npj = len(self.hostlist_clients)
            if len(self.hostlist_clients) / npj < 1:
                raise SoakTestError(
                    "<<FAILED: There are only {} client nodes for this job. "
                    "Job requires {}".format(len(self.hostlist_clients), npj))
            nodesperjob.append(npj)
        if "ior" in job:
            for npj in nodesperjob:
                for ppn in self.task_list:
                    commands = self.create_ior_cmdline(job, pool, ppn, npj)
                    # scripts are single cmdline
                    scripts = self.build_job_script(commands, job, ppn, npj)
                    job_cmdlist.extend(scripts)
        elif "fio" in job:
            commands = self.create_fio_cmdline(job, pool)
            # scripts are single cmdline
            scripts = self.build_job_script(commands, job, 1, 1)
            job_cmdlist.extend(scripts)
        else:
            raise SoakTestError("<<FAILED: Job {} is not supported. ".format(
                self.job))
        return job_cmdlist

    def job_startup(self, job_cmdlist):
        """Submit job batch script.

        Args:
            job_cmdlist (list): list of jobs to execute
        Returns:
            job_id_list: IDs of each job submitted to slurm.

        """
        self.log.info("<<Job Startup - %s >> at %s", self.test_name,
                      time.ctime())
        job_id_list = []
        # before submitting the jobs to the queue, check the job timeout;
        if time.time() > self.end_time:
            self.log.info("<< SOAK test timeout in Job Startup>>")
            return job_id_list
        # job_cmdlist is a list of batch script files
        for script in job_cmdlist:
            try:
                job_id = slurm_utils.run_slurm_script(str(script))
            except slurm_utils.SlurmFailed as error:
                self.log.error(error)
                # Force the test to exit with failure
                job_id = None
            if job_id:
                self.log.info("<<Job %s started with %s >> at %s", job_id,
                              script, time.ctime())
                slurm_utils.register_for_job_results(job_id,
                                                     self,
                                                     maxwait=self.test_timeout)
                # keep a list of the job_id's
                job_id_list.append(int(job_id))
            else:
                # one of the jobs failed to queue; exit on first fail for now.
                err_msg = "Slurm failed to submit job for {}".format(script)
                job_id_list = []
                raise SoakTestError("<<FAILED:  Soak {}: {}>>".format(
                    self.test_name, err_msg))
        return job_id_list

    def job_completion(self, job_id_list):
        """Wait for job completion and cleanup.

        Args:
            job_id_list: IDs of each job submitted to slurm
        Returns:
            failed_job_id_list: IDs of each job that failed in slurm

        """
        self.log.info("<<Job Completion - %s >> at %s", self.test_name,
                      time.ctime())
        # If there is nothing to do; exit
        if job_id_list:
            # wait for all the jobs to finish
            while len(self.soak_results) < len(job_id_list):
                # wait for the jobs to complete.
                # enter tearDown before hitting the avocado timeout
                if time.time() > self.end_time:
                    self.log.info("<< SOAK test timeout in Job Completion>>")
                    break
                time.sleep(5)
            # check for job COMPLETED and remove it from the job queue
            for job, result in self.soak_results.items():
                # The queue can include a status of "COMPLETING"
                if result == "COMPLETED":
                    job_id_list.remove(int(job))
                else:
                    self.log.info("<< Job %s failed with status %s>>", job,
                                  result)
            if job_id_list:
                self.log.info("<<Cancel jobs in queue with id's %s >>",
                              job_id_list)
                for job in job_id_list:
                    status = slurm_utils.cancel_jobs(int(job))
                    if status == 0:
                        self.log.info("<<Job %s successfully cancelled>>", job)
                    else:
                        self.log.info("<<Job %s could not be killed>>", job)
            # gather all the logfiles for this pass and cleanup test nodes
            try:
                self.get_remote_logs()
            except SoakTestError as error:
                self.log.info("Remote copy failed with %s", error)
            self.soak_results = {}
        return job_id_list

    def execute_jobs(self, jobs, pools):
        """Execute the overall soak test.

        Args:
            jobs (list): list of jobs to run during this pass
            pools (list): list of TestPool obj - self.pool[1:]

        Raise:
            SoakTestError

        """
        cmdlist = []
        # Create the remote log directories from new loop/pass
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "mkdir -p {}".format(self.test_log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: logfile directory not"
                                "created on clients>>: {}".format(
                                    self.hostlist_clients))
        # Create local log directory
        os.makedirs(self.local_pass_dir)
        # Setup cmdlines for job with specified pool
        if len(pools) < len(jobs):
            raise SoakTestError(
                "<<FAILED: There are not enough pools to run this test>>")
        for index, job in enumerate(jobs):
            cmdlist.extend(self.job_setup(job, pools[index]))
        # Gather the job_ids
        job_id_list = self.job_startup(cmdlist)
        # Initialize the failed_job_list to job_list so that any
        # unexpected failures will clear the squeue in tearDown
        self.failed_job_id_list = job_id_list
        # launch harassers if defined and enabled
        if self.h_list and self.loop > 1:
            self.log.info("<<Harassers are enabled>>")
            self.launch_harassers(self.h_list, pools)
            if not self.harasser_completion(self.harasser_timeout):
                raise SoakTestError("<<FAILED: Harassers failed ")
            # rebuild can only run once for now
            if self.is_harasser("rebuild"):
                self.h_list.remove("rebuild")
        # Wait for jobs to finish and cancel/kill jobs if necessary
        self.failed_job_id_list = self.job_completion(job_id_list)
        # Log the failing job ID
        if self.failed_job_id_list:
            self.log.info(
                "<<FAILED: The following jobs failed %s >>",
                (" ,".join(str(j_id) for j_id in self.failed_job_id_list)))
            # accumulate failing job IDs
            self.all_failed_jobs.extend(self.failed_job_id_list)

    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.harasser_joblist = []
        self.harasser_results = {}
        test_to = self.params.get("test_timeout", test_param)
        self.job_timeout = self.params.get("job_timeout", test_param)
        self.harasser_timeout = self.params.get("harasser_timeout", test_param)
        self.test_name = self.params.get("name", test_param)
        self.nodesperjob = self.params.get("nodesperjob", test_param)
        self.test_iteration = self.params.get("iteration", test_param)
        self.task_list = self.params.get("taskspernode", test_param + "*")
        self.h_list = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        pool_list = self.params.get("poollist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        if self.is_harasser("rebuild"):
            obj_class = "_".join([
                "OC",
                str(self.params.get("daos_oclass", "/run/rebuild/*")[0])
            ])
        else:
            obj_class = self.params.get("object_class",
                                        "/run/container_reserved/*")
        slurm_reservation = self.params.get("reservation",
                                            "/run/srun_params/*")
        # Srun params
        self.srun_params = {}
        if self.client_partition is not None:
            self.srun_params["partition"] = self.client_partition
        if slurm_reservation is not None:
            self.srun_params["reservation"] = slurm_reservation
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        self.add_pools(["pool_reserved"])
        self.pool[0].connect()
        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        self.container = TestContainer(self.pool[0])
        self.container.namespace = "/run/container_reserved/*"
        self.container.get_params(self)
        self.container.create()
        self.container.write_objects(rank, obj_class)
        self.all_failed_jobs = []
        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Soak directories not removed"
                                "from clients>>: {}".format(
                                    self.hostlist_clients))
        # cleanup test_node /tmp/soak
        cmd = "rm -rf {}".format(self.log_dir)
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory on testnode not removed {}>>".format(
                    error))
        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<Soak1 PASS %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            # Create all specified pools
            self.add_pools(pool_list)
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            try:
                self.execute_jobs(job_list, self.pool[1:])
            except SoakTestError as error:
                self.fail(error)
            errors = self.destroy_pools(self.pool[1:])
            # remove the test pools from self.pool; preserving reserved pool
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            self.assertEqual(len(errors), 0, "\n".join(errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<PASS %s completed in %s >>", self.loop,
                          DDHHMMSS_format(loop_time))
            # # if the time left is less than a loop, exit now
            # if end_time - time.time() < loop_time:
            #     break
            self.loop += 1
        # TO-DO: use IOR
        self.assertTrue(
            self.container.read_objects(),
            "Data verification error on reserved pool"
            "after SOAK completed")
        # gather the daos logs from the client nodes
        self.log.info("<<<SOAK TOTAL TEST TIME = %s>>>",
                      DDHHMMSS_format(time.time() - start_time))
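
# Hedged sketch (assumption, not the DAOS implementation): the soak loop above
# logs times through a DDHHMMSS_format helper. A minimal stand-in that renders
# elapsed seconds as "D:HH:MM:SS" could look like the following; the real
# helper in the test utilities may format differently.
def ddhhmmss_format_sketch(seconds):
    """Render elapsed seconds as D:HH:MM:SS (illustrative only)."""
    seconds = int(seconds)
    days, rem = divmod(seconds, 24 * 3600)
    hours, rem = divmod(rem, 3600)
    minutes, secs = divmod(rem, 60)
    return "{}:{:02d}:{:02d}:{:02d}".format(days, hours, minutes, secs)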
Example #19
0
class DfuseContainerCheck(TestWithServers):
    """Base Dfuse Container check test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a DfuseContainerCheck object."""
        super(DfuseContainerCheck, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.pool = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(DfuseContainerCheck, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(DfuseContainerCheck, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run(False)
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_dfusecontainercheck(self):
        """Jira ID: DAOS-3635.

        Test Description:
            Purpose of this test is to try and mount different container types
            to dfuse and check the behavior.
        Use cases:
            Create pool
            Create container of type default
            Try to mount to dfuse and check the behaviour.
            Create container of type POSIX.
            Try to mount to dfuse and check the behaviour.
        :avocado: tags=all,small,full_regression,dfusecontainercheck
        """
        # get test params for cont and pool count
        cont_types = self.params.get("cont_types", '/run/container/*')

        # Create a pool and start dfuse.
        self.create_pool()

        for cont_type in cont_types:
            # Get container params
            self.container = TestContainer(self.pool,
                                           daos_command=DaosCommand(self.bin))
            self.container.get_params(self)
            # create container
            if cont_type == "POSIX":
                self.container.type.update(cont_type)
            self.container.create()
            try:
                # mount fuse
                self.start_dfuse()
                # check if fuse got mounted
                self.dfuse.check_running()
                # fail the test if fuse mounts with non-posix type container
                if cont_type == "":
                    self.fail(
                        "Non-Posix type container got mounted over dfuse")
            except CommandFailure as error:
                # expected to throw CommandFailure exception for non-posix type
                # container
                if cont_type == "":
                    self.log.info(
                        "Expected behaviour: Default container type is "
                        "expected to fail on dfuse mount: %s", str(error))
                # fail the test if exception is caught for POSIX type container
                elif cont_type == "POSIX":
                    self.log.error(
                        "Posix Container dfuse mount failed: %s", str(error))
                    self.fail("Posix container type was expected to mount "
                              "over dfuse")
            # stop fuse and container for next iteration
            if not cont_type == "":
                self.dfuse.stop()
            self.container.destroy(1)
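
# Summary sketch (illustrative restatement of the checks above, not code from
# the test): dfuse mountability is treated as a simple function of the
# container type that was created.
EXPECTED_DFUSE_MOUNT = {
    "": False,        # default (non-POSIX) container must not mount over dfuse
    "POSIX": True,    # POSIX container is expected to mount successfully
}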
Example #20
0
class FioBase(TestWithServers):
    """Base fio class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a FioBase object."""
        super(FioBase, self).__init__(*args, **kwargs)
        self.fio_cmd = None
        self.processes = None
        self.manager = None
        self.dfuse = None
        self.daos_cmd = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()

        # Start the servers and agents
        super(FioBase, self).setUp()

        # initialise daos_cmd
        self.daos_cmd = DaosCommand(self.bin)

        # Get the parameters for Fio
        self.fio_cmd = FioCommand()
        self.fio_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/fio/client_processes/*')
        self.manager = self.params.get("manager", '/run/fio/*', "MPICH")

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(FioBase, self).tearDown()

    def _create_pool(self):
        """Create a pool and execute Fio."""
        # Get the pool params
        # pylint: disable=attribute-defined-outside-init
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a container.

        Returns:
            str: UUID of the created container

        """
        cont_type = self.params.get("type", "/run/container/*")
        result = self.daos_cmd.container_create(pool=self.pool.uuid,
                                                svc=self.pool.svc_ranks,
                                                cont_type=cont_type)

        # Extract the container UUID from the daos container create output
        cont_uuid = re.findall(r"created\s+container\s+([0-9a-f-]+)",
                               result.stdout)
        if not cont_uuid:
            self.fail("Error obtaining the container uuid from: {}".format(
                result.stdout))
        return cont_uuid[0]

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_fio(self):
        """Runner method for Fio."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self._create_pool()

        # start dfuse if api is POSIX
        if self.fio_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            self.fio_cmd.update("global", "directory",
                                self.dfuse.mount_dir.value,
                                "fio --name=global --directory")

        # Run Fio
        self.fio_cmd.hosts = self.hostlist_clients
        self.fio_cmd.run()

        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
Example #21
0
class MdtestBase(TestWithServers):
    """Base mdtest class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a MdtestBase object."""
        super(MdtestBase, self).__init__(*args, **kwargs)
        self.mdtest_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(MdtestBase, self).setUp()

        # Get the parameters for Mdtest
        self.mdtest_cmd = MdtestCommand()
        self.mdtest_cmd.get_params(self)
        self.processes = self.params.get("np",
                                         '/run/mdtest/client_processes/*')
        self.manager = self.params.get("manager", '/run/mdtest/*', "MPICH")

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.mdtest_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(MdtestBase, self).tearDown()

    def _create_pool(self):
        """Create a pool and execute Mdtest."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        #self.container = TestContainer(self.pool)
        #self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.mdtest_cmd.dfs_pool_uuid.value,
            self.mdtest_cmd.dfs_svcl.value)
        try:
            container = subprocess.Popen(cmd,
                                         stdout=subprocess.PIPE,
                                         shell=True)
            (output, err) = container.communicate()
            self.log.info("Container created with UUID %s", output.split()[3])

        except subprocess.CalledProcessError as err:
            self.fail("Container create failed:{}".format(err))

        return output.split()[3]
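
    # Caveat (informational, no change to the test): subprocess.Popen with
    # communicate() does not raise CalledProcessError on a non-zero exit, so
    # the except clause above mainly guards Popen setup failures. A hedged
    # sketch of an explicit status check, if one were wanted:
    #
    #   if container.returncode != 0:
    #       self.fail("Container create failed with rc={}".format(
    #           container.returncode))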

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, self.basepath)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_mdtest(self):
        """Runner method for Mdtest."""

        # Create a pool if one does not already exist
        if self.pool is None:
            self._create_pool()
        # set Mdtest params
        self.mdtest_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.mdtest_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value)

        # Run Mdtest
        self.run_mdtest(self.get_job_manager_command(self.manager),
                        self.processes)

    def get_job_manager_command(self, manager):
        """Get the MPI job manager command for Mdtest.

        Returns:
            JobManager: the object for the mpi job manager command

        """
        # Initialize MpioUtils if mdtest needs to be run using mpich
        if manager == "MPICH":
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
            path = os.path.join(mpio_util.mpichinstall, "bin")
            return Mpirun(self.mdtest_cmd, path)

        path = os.path.join(self.ompi_prefix, "bin")
        return Orterun(self.mdtest_cmd, path)

    def run_mdtest(self, manager, processes):
        """Run the Mdtest command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
        """
        env = self.mdtest_cmd.get_default_env(str(manager), self.tmp,
                                              self.client_log)
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            manager.run()
        except CommandFailure as error:
            self.log.error("Mdtest Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
Example #22
0
class ParallelIo(FioBase, IorTestBase):
    """Base Parallel IO test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a ParallelIo object."""
        super(ParallelIo, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.cont_count = None
        self.pool_count = None
        self.statvfs_info_initial = None
        self.statvfs_before_cont_destroy = None
        self.statvfs_after_cont_destroy = None
        self.pool = []
        self.container = []

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(ParallelIo, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(ParallelIo, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        pool.get_params(self)

        # Create a pool
        pool.create()
        self.pool.append(pool)

    # pylint: disable=arguments-differ
    def create_cont(self, pool):
        """Create a TestContainer object to be used to create container.

          Args:
            pool (TestPool): TestPool object type for which container
                             needs to be created
        """
        # Get container params
        container = TestContainer(pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)

        # create container
        container.create()
        self.container.append(container)

    def start_dfuse(self, pool=None):
        """Create a DfuseCommand object to start dfuse.

          Args:
            pool (TestPool): Test pool object if dfuse is intended to be
                             started using pool uuid option.
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        if pool:
            self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def stat_bfree(self, path):
        """Get stat bfree

          Args:
            path (str): path to get free block size of.
          Returns:
            integer value of stat free blocks
        """
        cmd = [
            "ssh", "{}@{}".format(getuser(), self.hostlist_clients[0]),
            "stat -c%a -f {}".format(path)
        ]
        try:
            result = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as err:
            self.fail("Get free block size method failed with: {}".format(err))

        return int(result)
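
    # Aside (assumption for illustration): "stat -c%a -f <path>" reports the
    # free blocks available to unprivileged users, the same figure Python
    # exposes locally as os.statvfs(path).f_bavail, e.g.:
    #
    #   import os
    #   free_blocks = os.statvfs(path).f_bavail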

    def statvfs_pool(self, path):
        """Method to obtain free space using statvfs

          Args:
            path (str): path for which free space needs to be obtained for.

          Returns:
            List containing free space info for each pool supplied in pool_obj.
        """
        statvfs_list = []
        for _, pool in enumerate(self.pool):
            dfuse_pool_dir = str(path + "/" + pool.uuid)
            statvfs_info = self.stat_bfree(dfuse_pool_dir)
            statvfs_list.append(statvfs_info)
            self.log.info("Statvfs List Output: %s", statvfs_list)

        return statvfs_list

    def verify_aggregation(self, reduced_space, count):
        """Verify if expected space is returned for each pool after containers
           were destroyed. If not, wait for 60 secs and check again. Wait 4
           times, otherwise exit the test with a failure.

          Args:
            reduced_space: expected space to be returned
            count (int): index of the pool in the recorded statvfs lists
        """
        counter = 1
        while (self.statvfs_after_cont_destroy[count] <
               self.statvfs_before_cont_destroy[count] + reduced_space):
            # try to wait for 4 x 60 secs for aggregation to be completed
            # or else exit the test with a failure.
            if counter > 4:
                self.log.info("Free space before io: %s",
                              self.statvfs_info_initial)
                self.log.info("Free space after io: %s",
                              self.statvfs_before_cont_destroy)
                self.log.info("Free space at test termination: %s",
                              self.statvfs_after_cont_destroy)
                self.fail("Aggregation did not complete as expected")
            time.sleep(60)
            self.statvfs_after_cont_destroy = self.statvfs_pool(
                self.dfuse.mount_dir.value)
            counter += 1

    def test_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to mount dfuse and verify multiple
            containers using fio.
        Use cases:
            Mount dfuse using pool uuid.
            Create multiple containers under that dfuse mount point.
            Check those containers are accessible from that mount point.
            Perform io to those containers using FIO
            Delete one of the containers
            Check if dfuse is still running. If not, fail the test and exit.
            Otherwise, try accessing the deleted container.
            This should fail.
            Check dfuse again.
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,parallelio
        """
        # get test params for cont and pool count
        self.cont_count = self.params.get("cont_count", '/run/container/*')

        threads = []

        # Create a pool and start dfuse.
        self.create_pool()
        self.start_dfuse(self.pool[0])
        # create multiple containers
        for _ in range(self.cont_count):
            self.create_cont(self.pool[0])

        # check if all the created containers can be accessed and perform
        # io on each container using fio in parallel
        for _, cont in enumerate(self.container):
            dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
            cmd = u"ls -a {}".format(dfuse_cont_dir)
            try:
                # execute bash cmds
                ret_code = general_utils.pcmd(self.hostlist_clients,
                                              cmd,
                                              timeout=30)
                if 0 not in ret_code:
                    error_hosts = NodeSet(",".join([
                        str(node_set) for code, node_set in ret_code.items()
                        if code != 0
                    ]))
                    raise CommandFailure("Error running '{}' on the following "
                                         "hosts: {}".format(cmd, error_hosts))
            # report error if any command fails
            except CommandFailure as error:
                self.log.error("ParallelIo Test Failed: %s", str(error))
                self.fail("Test was expected to pass but " "it failed.\n")
            # run fio on all containers
            thread = threading.Thread(target=self.execute_fio,
                                      args=(self.dfuse.mount_dir.value + "/" +
                                            cont.uuid, False))
            threads.append(thread)
            thread.start()

        # wait for all fio jobs to be finished
        for job in threads:
            job.join()

        # destroy first container
        container_to_destroy = self.container[0].uuid
        self.container[0].destroy(1)

        # check dfuse if it is running fine
        self.dfuse.check_running()

        # try accessing destroyed container, it should fail
        try:
            self.execute_fio(
                self.dfuse.mount_dir.value + "/" + container_to_destroy, False)
            self.fail("Fio was able to access destroyed container: {}".format(
                self.container[0].uuid))
        except CommandFailure as error:
            self.log.info("This run is expected to fail: %s", str(error))

            # check dfuse is still running after attempting to access deleted
            # container.
            self.dfuse.check_running()

    def test_multipool_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to verify aggregation across multiple
            pools and containers.
        Use cases:
            Create 10 pools
            Create 10 containers under each pool.
            Record statvfs free space for each pool.
            Perform parallel io to each pool without deleting the file
            after write.
            Record free space using statvfs after write.
            Delete half of the containers from each pool.
            Calculate the expected amount of data to be deleted when
            containers are destroyed.
            Record free space after container destroy.
            Loop until either the all space is returned back after aggregation
            completion or exit the loop after trying for 240 secs of wait and
            fail the test.

        :avocado: tags=all,hw,daosio,medium,ib2,full_regression
        :avocado: tags=multipoolparallelio
        """
        # test params
        threads = []
        pool_threads = []
        cont_threads = []
        self.pool_count = self.params.get("pool_count", '/run/pool/*')
        self.cont_count = self.params.get("cont_count", '/run/container/*')
        processes = self.params.get("np", '/run/ior/client_processes/*')

        # Create pools in parallel.
        for _ in range(self.pool_count):
            pool_thread = threading.Thread(target=self.create_pool)
            pool_threads.append(pool_thread)
            pool_thread.start()
        # wait for the pool creates to finish
        for pool_job in pool_threads:
            pool_job.join()

        # start dfuse using --svc option only.
        self.start_dfuse()

        # record free space using statvfs before any data is written.
        self.statvfs_info_initial = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Create 10 containers for each pool. Container create process cannot
        # be parallelised as different container create could complete at
        # different times and get appended in the self.container variable in
        # unorderly manner, causing problems during the write process.
        for _, pool in enumerate(self.pool):
            for _ in range(self.cont_count):
                self.create_cont(pool)

        # Try to access each dfuse mounted container using ls. Once it is
        # accessed successfully, go ahead and perform io on that location
        # using ior. This process of performing io is done in parallel for
        # all containers using threads.
        for pool_count, pool in enumerate(self.pool):
            dfuse_pool_dir = str(self.dfuse.mount_dir.value + "/" + pool.uuid)
            for counter in range(self.cont_count):
                cont_num = (pool_count * self.cont_count) + counter
                dfuse_cont_dir = str(dfuse_pool_dir + "/" +
                                     self.container[cont_num].uuid)
                cmd = u"###ls -a {}".format(dfuse_cont_dir)
                self.execute_cmd(cmd)

                # run ior on all containers
                test_file = dfuse_cont_dir + "/testfile"
                self.ior_cmd.test_file.update(test_file)
                self.ior_cmd.set_daos_params(self.server_group, pool,
                                             self.container[cont_num].uuid)
                thread = threading.Thread(
                    target=self.run_ior,
                    args=(self.get_ior_job_manager_command(), processes, None,
                          False))
                threads.append(thread)
                thread.start()

        # wait for all ior jobs to be finished
        for job in threads:
            job.join()

        # Record free space after io
        self.statvfs_before_cont_destroy = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Destroy half of the containers from each pool
        pfinal = 0
        for count in range(self.cont_count):
            pinitial = pfinal
            pfinal = pinitial + (self.cont_count // 2)
            del self.container[pinitial:pfinal]

        for cont in self.container:
            cont_thread = threading.Thread(target=cont.destroy)
            cont_threads.append(cont_thread)
            cont_thread.start()

        for destroy_job in cont_threads:
            destroy_job.join()

        # Record free space after container destroy.
        self.statvfs_after_cont_destroy = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Calculate the expected space to be returned after containers
        # are destroyed.
        reduced_space = (self.cont_count *
                         int(self.ior_cmd.block_size.value)) / 2

        # Verify if expected space is returned for each pool after containers
        # were destroyed. If not, wait for 60 secs and check again. Wait 4
        # times, otherwise exit the test with a failure.
        for count in range(self.pool_count):
            thread = threading.Thread(target=self.verify_aggregation,
                                      args=(reduced_space, count))
            threads.append(thread)
            thread.start()

        for job in threads:
            job.join()
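
# Worked example (hypothetical numbers, not from the test yaml): with
# cont_count = 10 containers per pool and an IOR block_size of 1 GiB written
# into each container, destroying half of the containers should eventually
# return (10 * 1 GiB) / 2 = 5 GiB per pool, which is the threshold that
# verify_aggregation() above polls for.
cont_count = 10                                  # hypothetical value
block_size = 1 << 30                             # hypothetical 1 GiB
reduced_space = (cont_count * block_size) / 2    # 5 GiB expected back per pool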
Example #23
0
class BashCmd(TestWithServers):
    """Base BashCmd test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a BashCmd object."""
        super(BashCmd, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.file_name1 = None
        self.file_name2 = None
        self.dir_name = None
        self.pool_count = None
        self.cont_count = None

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(BashCmd, self).setUp()

        # Get the parameters for BashCmd
        self.dir_name = self.params.get("dirname", '/run/bashcmd/*')
        self.file_name1 = self.params.get("filename1", '/run/bashcmd/*')
        self.file_name2 = self.params.get("filename2", '/run/bashcmd/*')
        self.dd_count = self.params.get("dd_count", '/run/bashcmd/*')
        self.dd_blocksize = self.params.get("dd_blocksize", '/run/bashcmd/*')

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(BashCmd, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(self.pool,
                                       daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def start_dfuse(self, count):
        """Create a DfuseCommand object to start dfuse.

           Args:
             count(int): container index
        """

        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.mount_dir.update("/tmp/" + self.pool.uuid + "_daos_dfuse" +
                                    str(count))
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_bashcmd(self):
        """Jira ID: DAOS-3508.

        Test Description:
            Purpose of this test is to mount different mount points of dfuse
            for different container and pool sizes and perform basic bash
            commands.
        Use cases:
            Following list of bash commands have been incorporated
            as part of this test: mkdir, touch, ls, chmod, rm, dd, stat,
            cp, cmp, mv, rmdir.
              Create a directory.
              Create a file under that directory.
              List the created file.
              Remove the file.
              Write a file to the dfuse mounted location using dd.
              List the written file to verify it was created.
              Verify the file created is of right size as desired.
              Copy the file
              Compare the copied file with original to verify the
              content is same.
              Remove copied file.
              Rename file
              Verify renamed file exist using list.
              Remove a directory
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,bashcmd
        """
        self.cont_count = self.params.get("cont_count", '/run/container/*')
        self.pool_count = self.params.get("pool_count", '/run/pool/*')

        # Create the specified number of pools.
        for _ in range(self.pool_count):
            self.create_pool()
            # perform test for multiple containers.
            for count in range(self.cont_count):
                self.create_cont()
                self.start_dfuse(count)
                abs_dir_path = os.path.join(self.dfuse.mount_dir.value,
                                            self.dir_name)
                abs_file_path1 = os.path.join(abs_dir_path, self.file_name1)
                abs_file_path2 = os.path.join(abs_dir_path, self.file_name2)
                # list of commands to be executed.
                commands = [
                    u"mkdir -p {}".format(abs_dir_path),
                    u"touch {}".format(abs_file_path1),
                    u"ls -a {}".format(abs_file_path1),
                    u"rm {}".format(abs_file_path1),
                    u"dd if=/dev/zero of={} count={} bs={}".format(
                        abs_file_path1, self.dd_count, self.dd_blocksize),
                    u"ls -al {}".format(abs_file_path1),
                    u"filesize=$(stat -c%s '{}');\
                            if (( filesize != {}*{} )); then exit 1;\
                            fi".format(abs_file_path1, self.dd_count,
                                       self.dd_blocksize),
                    u"cp -r {} {}".format(abs_file_path1, abs_file_path2),
                    u"cmp --silent {} {}".format(abs_file_path1,
                                                 abs_file_path2),
                    u"rm {}".format(abs_file_path2), u"mv {} {}".format(
                        abs_file_path1,
                        abs_file_path2), u"ls -al {}".format(abs_file_path2),
                    u"rm {}".format(abs_file_path2),
                    u"rmdir {}".format(abs_dir_path)
                ]
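                # Informational note on the stat-based check above: dd reading
                # from /dev/zero writes dd_count full blocks of dd_blocksize
                # bytes, so the shell test passes only when
                # filesize == dd_count * dd_blocksize.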
                for cmd in commands:
                    try:
                        # execute bash cmds
                        ret_code = general_utils.pcmd(self.hostlist_clients,
                                                      cmd,
                                                      timeout=30)
                        if 0 not in ret_code:
                            error_hosts = NodeSet(",".join([
                                str(node_set)
                                for code, node_set in ret_code.items()
                                if code != 0
                            ]))
                            raise CommandFailure(
                                "Error running '{}' on the following "
                                "hosts: {}".format(cmd, error_hosts))
                    # report error if any command fails
                    except CommandFailure as error:
                        self.log.error("BashCmd Test Failed: %s", str(error))
                        self.fail("Test was expected to pass but "
                                  "it failed.\n")

                # stop dfuse
                self.dfuse.stop()
                # destroy container
                self.container.destroy()
            # destroy pool
            self.pool.destroy()
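
# Hedged sketch (illustrative helper, not part of the test classes): both
# ParallelIo and BashCmd above fold the general_utils.pcmd() return dict into
# a set of failing hosts before raising CommandFailure. The pattern on its own:
def failing_hosts_sketch(ret_code):
    """Return a comma-separated string of host sets whose exit code is non-zero."""
    return ",".join(
        str(node_set) for code, node_set in ret_code.items() if code != 0)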
Example #24
0
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)
        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(self.pool,
                                       daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self,
                          intercept=None,
                          test_file_suffix="",
                          test_file="daos:testFile"):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.

        Returns:
            CmdResult: result of the ior command execution

        """
        self.update_ior_cmd_with_pool()
        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # The pool connect and container create are handled by
            # update_ior_cmd_with_pool above (see DAOS-3355)
            if self.ior_cmd.transfer_size.value == "256B":
                return "Skipping the case for transfer_size=256B"
            self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Always create a container
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        self.pool.connect()
        self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager command object for IOR

        """
        # Initialize MpioUtils if IOR is running in MPIIO, DAOS, POSIX or
        # DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        return Mpirun(self.ior_cmd, mpitype="mpich")

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command object
            processes (int): number of host processes
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str): path to the interception library. Shall be used
                             only for POSIX through DFUSE.
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run them in parallel:
        #   Job1 uses all but the last client with dfuse + the interception
        #   library.
        #   Job2 uses only the last client with dfuse alone.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1, results,
                                intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2, results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts on which ior will run
            job_num (int): assigned job number
            results (dict): dictionary in which to store the ior metrics
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: thread object that runs ior when started

        """
        hostfile = write_host_file.write_host_file(clients, self.workdir,
                                                   self.hostfile_clients_slots)
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[hostfile,
                  len(clients), results, job_num, intercept])
        return job

    def run_multiple_ior(self,
                         hostfile,
                         num_clients,
                         results,
                         job_num,
                         intercept=None):
        # pylint: disable=too-many-arguments
        """Run the IOR command.

        Args:
            hostfile (str): path of the hostfile for this job
            num_clients (int): number of client hosts used by this job
            results (dict): dictionary in which to store the ior metrics
            job_num (int): assigned job number
            intercept (str, optional): path to the interception library.
                Defaults to None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * num_clients
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, hostfile, procs)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K the pool size is verified against
        # NVMe free space, otherwise against SCM free space
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
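
A test built on this IorTestBase variant typically only defines a test method and lets the helpers handle pool, container and dfuse setup. The sketch below is hypothetical: the module name ior_test_base and the avocado tags are assumptions, not part of the class above.

# Hypothetical test module; "ior_test_base" and the tag value are assumptions.
from ior_test_base import IorTestBase


class IorSmoke(IorTestBase):
    """Run a single IOR pass through the base class helpers.

    :avocado: recursive
    """

    def test_ior_smoke(self):
        """Run IOR against a freshly created pool and container.

        :avocado: tags=ior
        """
        # Creates the pool and container, starts dfuse when the api is POSIX,
        # runs IOR and stops dfuse afterwards.
        self.run_ior_with_pool()
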
Beispiel #25
0
class MdtestBase(TestWithServers):
    """Base mdtest class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a MdtestBase object."""
        super(MdtestBase, self).__init__(*args, **kwargs)
        self.mdtest_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.daos_cmd = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(MdtestBase, self).setUp()

        # initialize daos_cmd
        self.daos_cmd = DaosCommand(self.bin)

        # Get the parameters for Mdtest
        self.mdtest_cmd = MdtestCommand()
        self.mdtest_cmd.get_params(self)
        self.processes = self.params.get("np",
                                         '/run/mdtest/client_processes/*')
        self.manager = self.params.get("manager", '/run/mdtest/*', "MPICH")

        self.log.info('Clients %s', self.hostlist_clients)
        self.log.info('Servers %s', self.hostlist_servers)

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(MdtestBase, self).tearDown()

    def create_pool(self):
        """Create a pool and execute Mdtest."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a container.

        Returns:
            str: UUID of the created container

        """
        cont_type = self.params.get("type", "/run/container/*")
        result = self.daos_cmd.container_create(pool=self.pool.uuid,
                                                svc=self.pool.svc_ranks,
                                                cont_type=cont_type)

        # Extract the container UUID from the daos container create output
        cont_uuid = re.findall(r"created\s+container\s+([0-9a-f-]+)",
                               result.stdout)
        if not cont_uuid:
            self.fail("Error obtaining the container uuid from: {}".format(
                result.stdout))
        return cont_uuid[0]

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params

        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           self.dfuse.hosts,
                           exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_mdtest(self):
        """Runner method for Mdtest."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # set Mdtest params
        self.mdtest_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.mdtest_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value)

        # Run Mdtest
        self.run_mdtest(self.get_mdtest_job_manager_command(self.manager),
                        self.processes)
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None

    def get_mdtest_job_manager_command(self, manager):
        """Get the MPI job manager command for Mdtest.

        Args:
            manager (str): job manager type; "MPICH" selects Mpirun, any
                other value selects Orterun

        Returns:
            JobManager: the object for the mpi job manager command

        """
        # Initialize MpioUtils if mdtest needs to be run using mpich
        if manager == "MPICH":
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
            return Mpirun(self.mdtest_cmd, mpitype="mpich")

        return Orterun(self.mdtest_cmd)

    def run_mdtest(self, manager, processes):
        """Run the Mdtest command.

        Args:
            manager (JobManager): mpi job manager command object
            processes (int): number of host processes
        """
        env = self.mdtest_cmd.get_default_env(str(manager), self.client_log)
        manager.assign_hosts(self.hostlist_clients, self.workdir,
                             self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)
        try:
            self.pool.display_pool_daos_space()
            manager.run()
        except CommandFailure as error:
            self.log.error("Mdtest Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()
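
Usage follows the same pattern as the IOR base class: a subclass calls execute_mdtest and the pool, container and dfuse setup happen inside the helper. A hypothetical sketch follows; the module name and tag value are assumptions.

# Hypothetical test module; "mdtest_test_base" and the tag value are
# assumptions.
from mdtest_test_base import MdtestBase


class MdtestSmoke(MdtestBase):
    """Run mdtest through the base class helpers.

    :avocado: recursive
    """

    def test_mdtest_smoke(self):
        """Run mdtest with the parameters taken from the test yaml.

        :avocado: tags=mdtest
        """
        # Creates the pool, starts dfuse for the POSIX api and runs mdtest.
        self.execute_mdtest()
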
Beispiel #26
0
class FioBase(TestWithServers):
    """Base fio class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a FioBase object."""
        super(FioBase, self).__init__(*args, **kwargs)
        self.fio_cmd = None
        self.processes = None
        self.manager = None
        self.dfuse = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()

        # Start the servers and agents
        super(FioBase, self).setUp()

        # Remove the runner node from hostlist_clients; it is not used as a
        # fio client node.
        self.hostlist_clients = self.hostlist_clients[:-1]

        # Get the parameters for Fio
        self.fio_cmd = FioCommand()
        self.fio_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/fio/client_processes/*')
        self.manager = self.params.get("manager", '/run/fio/*', "MPICH")

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(FioBase, self).tearDown()

    def _create_pool(self):
        """Create a pool and execute Fio."""
        # Get the pool params
        # pylint: disable=attribute-defined-outside-init
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        # self.container = TestContainer(self.pool)
        # self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.pool.uuid, ":".join(
                [str(item) for item in self.pool.svc_ranks]))
        try:
            container = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                         shell=True)
            (output, _) = container.communicate()
            # Popen never raises CalledProcessError, so check the exit code
            # and the expected output explicitly
            if container.returncode != 0:
                self.fail("Container create failed with exit code {}".format(
                    container.returncode))
            cont_uuid = output.split()[3]
            self.log.info("Container created with UUID %s", cont_uuid)
        except (OSError, IndexError) as err:
            self.fail("Container create failed: {}".format(err))

        return cont_uuid

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, self.basepath)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse), str(
                               NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_fio(self):
        """Runner method for Fio."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self._create_pool()

        # start dfuse if api is POSIX
        if self.fio_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            self.fio_cmd.update(
                "global", "directory", self.dfuse.mount_dir.value,
                "fio --name=global --directory")

        # Run Fio
        self.fio_cmd.hosts = self.hostlist_clients
        self.fio_cmd.run()
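
FioBase is used the same way: a subclass calls execute_fio, which creates the pool, starts dfuse for the POSIX api and runs fio on the clients. A hypothetical sketch follows; the module name and tag value are assumptions.

# Hypothetical test module; "fio_test_base" and the tag value are assumptions.
from fio_test_base import FioBase


class FioSmoke(FioBase):
    """Run fio against a dfuse mount point.

    :avocado: recursive
    """

    def test_fio_smoke(self):
        """Run fio with the parameters taken from the test yaml.

        :avocado: tags=fio
        """
        self.execute_fio()
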
Beispiel #27
0
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): if True, create the pool and, when
                create_cont is also True, the container before running ior;
                otherwise reuse the existing ones. Defaults to True.
            create_cont (bool, optional): create a new container. Defaults to
                True.
            stop_dfuse (bool, optional): stop dfuse after the ior command
                finishes. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager command object for IOR

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail("Exiting Test: Inappropriate operation type \
                      for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command object
            processes (int): number of host processes
            intercept (str, optional): path to the interception library.
                Defaults to None.
            display_space (bool, optional): display the pool space before and
                after running ior. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)

        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop IOR process.
        Args:
            manager (str): mpi job manager command
        """
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", self.mpirun.__str__())

        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()


    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str): path to the interception library. Shall be used
                             only for POSIX through DFUSE.
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run them in parallel:
        #   Job1 uses all but the last client with dfuse + the interception
        #   library.
        #   Job2 uses only the last client with dfuse alone.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1,
                                results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2,
                                results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: thread object that runs ior when started

        """
        job = threading.Thread(target=self.run_multiple_ior, args=[
            clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults to
                None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K the pool size is verified against
        # NVMe free space, otherwise against SCM free space
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd

          Args:
            cmd (str): String command to be executed
            fail_on_err (bool): Boolean for whether to fail the test if command
                                execution returns non zero return code.
            display_output (bool): Boolean for whether to display output.

          Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                  values indicating which hosts yielded the return code.
        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output, timeout=300)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in
                         ret.items() if code != 0]))
                if fail_on_err:
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))

        # report error if any command fails
        except CommandFailure as error:
            self.log.error("DfuseSparseFile Test Failed: %s",
                           str(error))
            self.fail("Test was expected to pass but "
                      "it failed.\n")
        return ret
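
The create_pool/create_cont/stop_dfuse keywords of this variant allow a test to reuse the same pool, container and dfuse mount across runs. Below is a hypothetical sketch of that flow; the module name, tag value and the df command are illustrative assumptions.

# Hypothetical test module; "ior_test_base", the tag and the df command are
# assumptions used only to illustrate the helper keywords.
from ior_test_base import IorTestBase


class IorReuseResources(IorTestBase):
    """Reuse the pool, container and dfuse mount across IOR runs.

    :avocado: recursive
    """

    def test_ior_reuse(self):
        """Run IOR twice against the same pool, container and dfuse mount.

        :avocado: tags=ior
        """
        # First pass: create pool and container, run IOR, keep dfuse mounted.
        self.run_ior_with_pool(stop_dfuse=False)

        # Inspect the dfuse mount point on every client while it is mounted.
        self.execute_cmd("df -h {}".format(self.dfuse.mount_dir.value))

        # Second pass: reuse the existing pool, container and dfuse mount,
        # letting the helper unmount dfuse when it finishes.
        self.run_ior_with_pool(create_pool=False, create_cont=False)
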
Beispiel #28
0
class DfuseTestBase(TestWithServers):
    """Runs HDF5 vol test suites.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a TestWithServers object."""
        super(DfuseTestBase, self).__init__(*args, **kwargs)
        self.dfuse = None

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.stop_dfuse()
        finally:
            # Stop the servers and agents
            super(DfuseTestBase, self).tearDown()

    def start_dfuse(self, hosts, pool, container):
        """Create a DfuseCommand object and use it to start Dfuse.

        Args:
            hosts (list): list of hosts on which to start Dfuse
            pool (TestPool): pool to use with Dfuse
            container (TestContainer): container to use with Dfuse
        """
        self.dfuse = Dfuse(hosts, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.")

    def stop_dfuse(self):
        """Stop Dfuse and unset the DfuseCommand object."""
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
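
A test derived from DfuseTestBase supplies its own pool and container and hands them to start_dfuse. The sketch below mirrors the pool and container helpers used by the other base classes in this document; the import paths and tag value are assumptions.

# Hypothetical test module; the import paths mirror the helpers used above
# and are assumptions, not part of DfuseTestBase itself.
from dfuse_test_base import DfuseTestBase
from test_utils_pool import TestPool
from test_utils_container import TestContainer
from daos_utils import DaosCommand


class DfuseMountCheck(DfuseTestBase):
    """Mount dfuse on the clients and unmount it cleanly.

    :avocado: recursive
    """

    def test_dfuse_mount(self):
        """Create a pool and container, then mount and unmount dfuse.

        :avocado: tags=dfuse
        """
        # Create a pool and container the same way the IOR/Mdtest bases do.
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)
        self.pool.create()

        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)
        self.container.create()

        # Mount dfuse on all clients, then stop it explicitly (tearDown
        # would also stop it through stop_dfuse).
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)
        self.stop_dfuse()
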