Example #1
    def run_ior_collect_error(self, results, job_num, file_name, clients):
        """Run IOR command and store error in results.

        Args:
            results (dict): A dictionary object to store the ior metrics.
            job_num (int): Assigned job number.
            file_name (str): File name used for self.ior_cmd.test_file.
            clients (list): Client hostnames to run IOR from.
        """
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(
            group=self.server_group, pool=self.pool, cont_uuid=self.container.uuid)
        testfile = os.path.join("/", file_name)
        ior_cmd.test_file.update(testfile)

        manager = get_job_manager(
            test=self, class_name="Mpirun", job=ior_cmd, subprocess=self.subprocess,
            mpi_type="mpich")
        manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
        ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        manager.ppn.update(ppn, 'mpirun.ppn')
        manager.processes.update(None, 'mpirun.np')

        try:
            ior_output = manager.run()
            results[job_num] = [True]
            # For debugging.
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
            # We'll verify the error message.
            results[job_num].append(ior_output.stderr_text)
        except CommandFailure as error:
            results[job_num] = [False, "IOR failed: {}".format(error)]
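
For reference, the results-dictionary convention used above can be shown with a small, self-contained sketch: each job stores a list whose first element is a pass/fail flag, followed by metrics or an error message, and the caller inspects the dictionary after all jobs have joined. The worker name and job layout below are illustrative only, not part of the DAOS test framework.

import threading


def fake_ior_job(results, job_num, should_fail):
    """Record results the same way run_ior_collect_error does (illustration only)."""
    try:
        if should_fail:
            raise RuntimeError("simulated IOR failure")
        results[job_num] = [True]
        results[job_num].append("stderr text placeholder")
    except RuntimeError as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]


results = {}
threads = [threading.Thread(target=fake_ior_job, args=(results, num, num == 2))
           for num in (1, 2)]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
for job_num in sorted(results):
    passed = results[job_num][0]
    print("job {}: {}".format(job_num, "PASS" if passed else results[job_num][1]))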
Example #2
    def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
        """Create an IOR cmdline to run in slurm batch.

        Args:

            job_spec (str):   ior job in yaml to run
            pool (obj):       TestPool obj
            ppn(int):         number of tasks to run on each node
            nodesperjob(int): number of nodes per job

        Returns:
            list: a list of [ior cmdline string, log file name] pairs

        """
        commands = []

        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/*"
        # IOR job specs with a list of parameters; update each value
        api_list = self.params.get("api", ior_params + "*")
        tsize_list = self.params.get("transfer_size", ior_params + "*")
        bsize_list = self.params.get("block_size", ior_params + "*")
        oclass_list = self.params.get("daos_oclass", ior_params + "*")
        # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
        if self.is_harasser("rebuild"):
            oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
        # update IOR cmdline for each additional IOR obj
        for api in api_list:
            for b_size in bsize_list:
                for t_size in tsize_list:
                    for o_type in oclass_list:
                        ior_cmd = IorCommand()
                        ior_cmd.namespace = ior_params
                        ior_cmd.get_params(self)
                        if iteration is not None and iteration < 0:
                            ior_cmd.repetitions.update(1000000)
                        if self.job_timeout is not None:
                            ior_cmd.max_duration.update(self.job_timeout)
                        else:
                            ior_cmd.max_duration.update(10)
                        ior_cmd.api.update(api)
                        ior_cmd.block_size.update(b_size)
                        ior_cmd.transfer_size.update(t_size)
                        ior_cmd.daos_oclass.update(o_type)
                        ior_cmd.set_daos_params(self.server_group, pool)
                        # srun cmdline
                        nprocs = nodesperjob * ppn
                        env = ior_cmd.get_default_env("srun")
                        if ior_cmd.api.value == "MPIIO":
                            env["DAOS_CONT"] = ior_cmd.daos_cont.value
                        cmd = Srun(ior_cmd)
                        cmd.assign_processes(nprocs)
                        cmd.assign_environment(env, True)
                        cmd.ntasks_per_node.update(ppn)
                        log_name = "{}_{}_{}_{}".format(
                            api, b_size, t_size, o_type)
                        commands.append([str(cmd), log_name])
                        self.log.info("<<IOR cmdline>>: %s \n", commands[-1])
        return commands
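
Note that the method returns a list of [cmdline, log_name] pairs rather than a single string. How those pairs are consumed depends on the surrounding slurm soak framework; purely as an illustration, a toy consumer could write each pair into its own shell script (the commands list and file layout below are made up):

import os
import tempfile

# Hypothetical output of create_ior_cmdline(): [cmdline, log_name] pairs.
commands = [
    ["srun -n 8 ior -a DFS -b 1M -t 256K", "DFS_1M_256K_SX"],
    ["srun -n 8 ior -a MPIIO -b 32M -t 1M", "MPIIO_32M_1M_RP_2GX"],
]

script_dir = tempfile.mkdtemp(prefix="ior_scripts_")
for cmdline, log_name in commands:
    script_path = os.path.join(script_dir, "{}.sh".format(log_name))
    with open(script_path, "w") as script:
        script.write("#!/bin/bash\n")
        script.write("{} > {}.log 2>&1\n".format(cmdline, log_name))
    print("wrote", script_path)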
Example #3
    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute 2 IOR threads in parallel.

        One thread is run with the interception library (IL) and one without.

        Args:
            results (dict): Dictionary to store the IOR results that gets
                printed in the IOR output.
            intercept (str): Path to the interception library. Shall be used
                only for POSIX through DFUSE.
            with_clients (list): List of clients that use IL.
            without_clients (list): List of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create the
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update IOR params with the pool and container params
        ior_cmd1.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        # start dfuse for POSIX api. This is specific to interception library
        # test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run in parallel.
        thread1 = self.create_ior_thread(ior_cmd1, with_clients, 1, results,
                                         intercept)
        thread2 = self.create_ior_thread(ior_cmd2, without_clients, 2, results,
                                         None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()

        # Basic verification of the thread results
        status = True
        for key in sorted(results):
            if not results[key].pop(0):
                self.log.error("IOR Thread %d: %s", key, results[key][0])
                status = False
            if len(results[key]) != 2:
                self.log.error(
                    "IOR Thread %d: expecting 2 results; %d found: %s", key,
                    len(results[key]), results[key])
                status = False
        if not status:
            self.fail("At least one IOR thread failed!")
Example #4
    def create_ior_cmdline(self, job_params, job_spec, pool):
        """Create an IOR cmdline to run in slurm batch.

        Args:
            job_params (str): job params from yaml file
            job_spec (str): specific ior job to run
            pool (obj):   TestPool obj

        Returns:
            list: a list of srun cmdline strings

        """
        command = []
        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/"

        ior_cmd = IorCommand()
        ior_cmd.namespace = ior_params
        ior_cmd.get_params(self)
        if iteration is not None and iteration < 0:
            ior_cmd.repetitions.update(1000000)
        ior_cmd.max_duration.update(self.params.get("time", job_params + '*'))
        # IOR job specs with a list of parameters; update each value
        #   transfer_size
        #   block_size
        #   daos object class
        tsize_list = ior_cmd.transfer_size.value
        bsize_list = ior_cmd.block_size.value
        oclass_list = ior_cmd.daos_oclass.value
        for b_size in bsize_list:
            ior_cmd.block_size.update(b_size)
            for o_type in oclass_list:
                ior_cmd.daos_oclass.update(o_type)
                for t_size in tsize_list:
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.set_daos_params(self.server_group, pool)
                    # export the user environment to test node
                    exports = ["ALL"]
                    if ior_cmd.api.value == "MPIIO":
                        env = {
                            "CRT_ATTACH_INFO_PATH": os.path.join(
                                self.basepath, "install/tmp"),
                            "DAOS_POOL": str(ior_cmd.daos_pool.value),
                            "MPI_LIB": "\"\"",
                            "DAOS_SVCL": str(ior_cmd.daos_svcl.value),
                            "DAOS_SINGLETON_CLI": 1,
                            "FI_PSM2_DISCONNECT": 1
                        }
                        exports.extend(
                            ["{}={}".format(
                                key, val) for key, val in env.items()])
                    cmd = "srun -l --mpi=pmi2 --export={} {}".format(
                        ",".join(exports), ior_cmd)
                    command.append(cmd)
                    self.log.debug("<<IOR cmdline >>: %s \n", cmd)
        return command
Example #5
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test : Mpich not installed on :"
                      " {}".format(self.hostfile_clients[0]))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.daos_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}"
                       .format(oclass,
                               api,
                               test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.daos_cont.update(container_info
                                     ["{}{}{}".format(oclass,
                                                      api,
                                                      test[2])])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #6
    def run_ior_report_error(self, results, job_num, file_name, pool,
                             container, namespace):
        """Run IOR command and store the results to results dictionary.

        Create a new IorCommand object instead of using the one in IorTestBase because
        we'll run a test that runs multiple IOR processes at the same time.

        Args:
            results (dict): A dictionary object to store the ior metrics.
            job_num (int): Assigned job number.
            file_name (str): File name used for self.ior_cmd.test_file.
            pool (TestPool): Pool to run IOR.
            container (TestContainer): Container to run IOR.
            namespace (str): Config namespace used to create the IorCommand;
                it selects parameters such as the object class.
        """
        # Update the object class depending on the test case.
        ior_cmd = IorCommand(namespace=namespace)
        ior_cmd.get_params(self)

        # Standard IOR prep sequence.
        ior_cmd.set_daos_params(self.server_group, pool, container.uuid)
        testfile = os.path.join("/", file_name)
        ior_cmd.test_file.update(testfile)

        manager = get_job_manager(test=self,
                                  class_name="Mpirun",
                                  job=ior_cmd,
                                  subprocess=self.subprocess,
                                  mpi_type="mpich")
        manager.assign_hosts(self.hostlist_clients, self.workdir,
                             self.hostfile_clients_slots)
        ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        manager.ppn.update(ppn, 'mpirun.ppn')
        manager.processes.update(None, 'mpirun.np')

        # Run the command.
        try:
            self.log.info("--- IOR command %d start ---", job_num)
            ior_output = manager.run()
            results[job_num] = [True]
            # For debugging.
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
            # Command worked, but append the error message if any.
            results[job_num].append(ior_output.stderr_text)
            self.log.info("--- IOR command %d end ---", job_num)
        except CommandFailure as error:
            self.log.info("--- IOR command %d failed ---", job_num)
            results[job_num] = [False, "IOR failed: {}".format(error)]
Example #7
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.

        Args:
            pool (TestPool): Pool to run IOR command on.
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}

        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}"
                       .format(oclass,
                               api,
                               test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        job_manager = get_job_manager(self, "Mpirun", ior_cmd, mpi_type="mpich")
        key = "{}{}{}".format(oclass, api, test[2])
        job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(job_manager))
        job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        job_manager.assign_processes(processes)
        job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #8
    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break
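
The background thread writes once and then keeps re-reading until a run fails or the main program exits. A self-contained sketch of that shape, with a stop event added so it can terminate cleanly and with job_manager.run() replaced by a stand-in, might look like this:

import queue
import threading
import time


def run_stub(step):
    """Stand-in for job_manager.run(); raise RuntimeError to simulate a failure."""
    time.sleep(0.01)


def background_io(results, stop_event):
    """Write once, then keep 'reading' until told to stop or a run fails."""
    try:
        run_stub("write")
    except RuntimeError:
        results.put("FAIL")
        return
    while not stop_event.is_set():
        try:
            run_stub("read")
        except RuntimeError:
            results.put("FAIL")
            break


results = queue.Queue()
stop_event = threading.Event()
thread = threading.Thread(target=background_io, args=(results, stop_event))
thread.start()
time.sleep(0.1)        # the main test work would happen here
stop_event.set()
thread.join()
print("background IO failed" if not results.empty() else "background IO ok")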
Example #9
    def ior_bg_thread(self):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        """

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        job_manager = get_job_manager(self,
                                      "Mpirun",
                                      ior_bg_cmd,
                                      mpi_type="mpich")

        # create container
        container = self.get_container(self.pool)

        job_manager.job.dfs_cont.update(container.uuid)
        env = ior_bg_cmd.get_default_env(str(job_manager))
        job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        job_manager.assign_processes(1)
        job_manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            job_manager.run()
        except (CommandFailure, TestFail) as _error:
            self.test_result.append("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                job_manager.run()
            except (CommandFailure, TestFail) as _error:
                break
Example #10
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """This method calls job manager for IOR command
        invocation.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)
        if "-w" in flags:
            self.container_info["{}{}{}"
                                .format(oclass,
                                        api,
                                        test[0])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[0])])
        manager.job.dfs_cont.update(self.container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #11
    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute 2 IOR threads in parallel. One thread with interception
        library (IL) and one without.

        Args:
            results (dict): Dictionary to store the IOR results that gets
                printed in the IOR output.
            intercept (str): Path to the interception library. Shall be used
                only for POSIX through DFUSE.
            with_clients (list): List of clients that use IL.
            without_clients (list): List of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create the
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update IOR params with the pool and container params
        ior_cmd1.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

        # start dfuse for POSIX api. This is specific to interception library
        # test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run in parallel.
        thread1 = self.create_ior_thread(
            ior_cmd1, with_clients, 1, results, intercept)
        thread2 = self.create_ior_thread(
            ior_cmd2, without_clients, 2, results, None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()
Example #12
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:

        job_spec (str):   ior job in yaml to run
        pool (obj):       TestPool obj
        ppn(int):         number of tasks to run on each node
        nodesperjob(int): number of nodes per job

    Returns:
        list: a list of [sbatch command list, log file name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    mpi_module = self.params.get(
        "mpi_module", "/run/", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("dfs_oclass", ior_params + "*")
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # check if capable of doing rebuild; if yes then dfs_oclass = RP_*GX
    if is_harasser(self, "rebuild"):
        oclass_list = self.params.get("dfs_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.dfs_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    ior_cmd.set_daos_params(self.server_group, pool)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, nodesperjob, "SLURM")
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    sbatch_cmds.append("exit $status")
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
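
Here each returned entry is a list of sbatch script lines plus a log name. The framework presumably turns these into slurm batch scripts elsewhere; as a rough, hypothetical illustration of that step (paths, directives, and command lines invented for the example):

import os
import tempfile

# Hypothetical sbatch command lines, shaped like the ones generated above.
sbatch_cmds = [
    "module load -q mpi/mpich-x86_64",
    "srun -n 16 --ntasks-per-node=8 ior -a DFS -b 1M -t 256K",
    "status=$?",
    "exit $status",
]
log_name = "DFS_1M_256K_SX"

script_path = os.path.join(tempfile.mkdtemp(prefix="sbatch_"), log_name + ".sh")
with open(script_path, "w") as script:
    script.write("#!/bin/bash\n")
    script.write("#SBATCH --output={}.out\n".format(log_name))
    for line in sbatch_cmds:
        script.write(line + "\n")
print("sbatch script written to", script_path)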
Example #13
class IorTestBase(DfuseTestBase):
    # pylint: disable=too-many-ancestors
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super().__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.container = None
        self.ior_timeout = None
        self.ppn = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)
        self.ior_timeout = self.params.get("ior_timeout", '/run/ior/*', None)

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params and create a pool
        self.add_pool(connect=False)

    def create_cont(self):
        """Create a TestContainer object to be used to create container.

        """
        # Get container params
        self.container = TestContainer(self.pool,
                                       daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # update container oclass
        if self.ior_cmd.dfs_oclass:
            self.container.oclass.update(self.ior_cmd.dfs_oclass.value)

        # create container
        self.container.create()

    def display_pool_space(self, pool=None):
        """Display the current pool space.

        If the TestPool object has a DmgCommand object assigned, also display
        the free pool space per target.

        Args:
            pool (TestPool, optional): The pool for which to display space.
                    Default is self.pool.
        """
        if not pool:
            pool = self.pool

        pool.display_pool_daos_space()
        if pool.dmg:
            pool.set_query_data()

    def run_ior_with_pool(self,
                          intercept=None,
                          test_file_suffix="",
                          test_file="daos:/testFile",
                          create_pool=True,
                          create_cont=True,
                          stop_dfuse=True,
                          plugin_path=None,
                          timeout=None,
                          fail_on_warning=False,
                          mount_dir=None,
                          out_queue=None,
                          env=None):
        # pylint: disable=too-many-arguments
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:/testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If it is true, create pool and
                container else just run the ior. Defaults to True.
            create_cont (bool, optional): Create new container. Default is True
            stop_dfuse (bool, optional): Stop dfuse after ior command is
                finished. Default is True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable dfuse (xattr) working directory which is
                needed to run vol connector for DAOS. Default is None.
            timeout (int, optional): command timeout. Defaults to None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            mount_dir (str, optional): Create specific mount point
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None
            env (EnvironmentVariables, optional): Pass the environment to be
                used when calling run_ior. Defaults to None

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            # add a substring in case of HDF5-VOL
            if plugin_path:
                sub_dir = get_random_string(5)
                mount_dir = os.path.join(mount_dir, sub_dir)
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self.start_dfuse(self.hostlist_clients, self.pool,
                                 self.container, mount_dir)

        # setup test file for POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))
        job_manager = self.get_ior_job_manager_command()
        job_manager.timeout = timeout
        try:
            out = self.run_ior(job_manager,
                               self.processes,
                               intercept,
                               plugin_path=plugin_path,
                               fail_on_warning=fail_on_warning,
                               out_queue=out_queue,
                               env=env)
        finally:
            if stop_dfuse:
                self.stop_dfuse()

        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool.

        Args:
          create_cont (bool, optional): create a container. Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self, custom_ior_cmd=None):
        """Get the MPI job manager command for IOR.

        Args:
            custom_ior_cmd (IorCommand): Custom IorCommand instance to create
            job_manager with.

        Returns:
            str: the path for the mpi job manager command

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if custom_ior_cmd:
            self.job_manager = Mpirun(custom_ior_cmd, self.subprocess, "mpich")
        else:
            self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

        return self.job_manager

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status."""
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail("Exiting Test: Inappropriate operation type \
                      for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.job_manager.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self,
                manager,
                processes,
                intercept=None,
                display_space=True,
                plugin_path=None,
                fail_on_warning=False,
                pool=None,
                out_queue=None,
                env=None):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to interception library.
            display_space (bool, optional): Whether to display the pool
                space. Defaults to True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable dfuse (xattr) working directory which is
                needed to run vol connector for DAOS. Default is None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            pool (TestPool, optional): The pool for which to display space.
                Default is self.pool.
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None.
            env (EnvironmentVariables, optional): Environment to be used
             when running ior. Defaults to None
        """
        if not env:
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env['LD_PRELOAD'] = intercept
            env['D_LOG_MASK'] = 'INFO'
            if env.get('D_IL_REPORT', None) is None:
                env['D_IL_REPORT'] = '1'

            #env['D_LOG_MASK'] = 'INFO,IL=DEBUG'
            #env['DD_MASK'] = 'all'
            #env['DD_SUBSYS'] = 'all'
        if plugin_path:
            env["HDF5_VOL_CONNECTOR"] = "daos"
            env["HDF5_PLUGIN_PATH"] = str(plugin_path)
            manager.working_dir.value = self.dfuse.mount_dir.value
        manager.assign_hosts(self.hostlist_clients, self.workdir,
                             self.hostfile_clients_slots)
        if self.ppn is None:
            manager.assign_processes(processes)
        else:
            manager.ppn.update(self.ppn, 'mpirun.ppn')
            manager.processes.update(None, 'mpirun.np')

        manager.assign_environment(env)

        if not pool:
            pool = self.pool

        try:
            if display_space:
                self.display_pool_space(pool)
            out = manager.run()

            if self.subprocess:
                return out

            if fail_on_warning:
                report_warning = self.fail
            else:
                report_warning = self.log.warning

            for line in out.stdout_text.splitlines():
                if 'WARNING' in line:
                    report_warning("IOR command issued warnings.")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            # Queue is used when we use a thread to call
            # ior thread (eg: thread1 --> thread2 --> ior)
            if out_queue is not None:
                out_queue.put("IOR Failed")
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.display_pool_space(pool)

    def stop_ior(self):
        """Stop IOR process.

        Args:
            manager (str): mpi job manager command
        """
        self.log.info("<IOR> Stopping in-progress IOR command: %s",
                      str(self.job_manager))

        try:
            out = self.job_manager.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.display_pool_space()

    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute 2 IOR threads in parallel.

        One thread is run with the interception library (IL) and one without.

        Args:
            results (dict): Dictionary to store the IOR results that gets
                printed in the IOR output.
            intercept (str): Path to the interception library. Shall be used
                only for POSIX through DFUSE.
            with_clients (list): List of clients that use IL.
            without_clients (list): List of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create the
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update IOR params with the pool and container params
        ior_cmd1.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        # start dfuse for POSIX api. This is specific to interception library
        # test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run in parallel.
        thread1 = self.create_ior_thread(ior_cmd1, with_clients, 1, results,
                                         intercept)
        thread2 = self.create_ior_thread(ior_cmd2, without_clients, 2, results,
                                         None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()

        # Basic verification of the thread results
        status = True
        for key in sorted(results):
            if not results[key].pop(0):
                self.log.error("IOR Thread %d: %s", key, results[key][0])
                status = False
            if len(results[key]) != 2:
                self.log.error(
                    "IOR Thread %d: expecting 2 results; %d found: %s", key,
                    len(results[key]), results[key])
                status = False
        if not status:
            self.fail("At least one IOR thread failed!")

    def create_ior_thread(self,
                          ior_command,
                          clients,
                          job_num,
                          results,
                          intercept=None):
        """Create a new thread for ior run.

        Args:
            ior_command (IorCommand): IOR command instance.
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (path): Path to interception library

        Returns:
            Thread: the created thread object (not yet started)
        """
        job = threading.Thread(
            target=self.run_custom_ior_cmd,
            args=[ior_command, clients, results, job_num, intercept])
        return job

    def run_custom_ior_cmd(self,
                           ior_command,
                           clients,
                           results,
                           job_num,
                           intercept=None):
        """Run customized IOR command, not self.ior_cmd.

        Expected to be used with a threaded code where multiple IOR commands are
        executed in parallel.

        Display pool space before running it for a reference.

        Args:
            ior_command (IorCommand): Custom IOR command instance.
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults to
                None.
        """
        self.log.info("--- IOR Thread %d: Start ---", job_num)
        tsize = ior_command.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        ior_command.test_file.update(testfile)

        # Get the custom job manager that's associated with this thread.
        manager = self.get_ior_job_manager_command(custom_ior_cmd=ior_command)

        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = ior_command.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(clients, self.workdir,
                             self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)

        self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
        self.display_pool_space()
        try:
            ior_output = manager.run()
            results[job_num] = [True]
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        except CommandFailure as error:
            results[job_num] = [False, "IOR failed: {}".format(error)]
        finally:
            self.display_pool_space()

        self.log.info("--- IOR Thread %d: End ---", job_num)

    def run_ior_multiple_variants(self, obj_class, apis, transfer_block_size,
                                  flags, mount_dir):
        """Run multiple ior commands with various different combination
           of ior input params.

        Args:
            obj_class(list): List of different object classes
            apis(list): list of different apis
            transfer_block_size(list): list of different transfer sizes
                                       and block sizes. eg: [1M, 32M]
                                       1M is transfer size and 32M is
                                       block size in the above example.
            flags(list): list of ior flags
            mount_dir(str): dfuse mount directory
        """
        results = []

        for oclass in obj_class:
            self.ior_cmd.dfs_oclass.update(oclass)
            for api in apis:
                if api == "HDF5-VOL":
                    self.ior_cmd.api.update("HDF5")
                    hdf5_plugin_path = self.params.get("plugin_path",
                                                       '/run/hdf5_vol/*')
                    flags_w_k = " ".join([flags[0]] + ["-k"])
                    self.ior_cmd.flags.update(flags_w_k, "ior.flags")
                else:
                    # run tests for different variants
                    self.ior_cmd.flags.update(flags[0], "ior.flags")
                    hdf5_plugin_path = None
                    self.ior_cmd.api.update(api)
                for test in transfer_block_size:
                    # update transfer and block size
                    self.ior_cmd.transfer_size.update(test[0])
                    self.ior_cmd.block_size.update(test[1])
                    # run ior
                    try:
                        self.run_ior_with_pool(plugin_path=hdf5_plugin_path,
                                               timeout=self.ior_timeout,
                                               mount_dir=mount_dir)
                        results.append(["PASS", str(self.ior_cmd)])
                    except CommandFailure:
                        results.append(["FAIL", str(self.ior_cmd)])
        return results

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K, size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K, size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = (
            original_pool_info.pi_space.ps_space.s_free[storage_index]
            - current_pool_info.pi_space.ps_space.s_free[storage_index])
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, command, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            # Execute the bash command on each client host
            result = self._execute_command(command, fail_on_err,
                                           display_output)

        except CommandFailure as error:
            # Report an error if any command fails
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

        return result

    def _execute_command(self,
                         command,
                         fail_on_err=True,
                         display_output=True,
                         hosts=None):
        """Execute the command on all client hosts.

        Optionally verify if the command returns a non zero return code.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on at
                least one of the client hosts

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        if hosts is None:
            hosts = self.hostlist_clients
        result = pcmd(hosts, command, verbose=display_output, timeout=300)
        if 0 not in result and fail_on_err:
            hosts = [
                str(nodes) for code, nodes in list(result.items()) if code != 0
            ]
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(",".join(hosts))))
        return result
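
One detail worth calling out in run_ior above is the fail_on_warning handling: the method picks either self.fail or self.log.warning as the handler and then scans the IOR output for 'WARNING' lines. The same choose-a-reporting-callable pattern is shown below in isolation; the exception class, logger name, and sample output are invented for the sketch.

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("ior_sketch")


class WarningFound(Exception):
    """Raised when warnings must be treated as fatal."""


def scan_output(stdout_text, fail_on_warning):
    """Scan command output for WARNING lines, as run_ior does."""
    if fail_on_warning:
        def report_warning(msg):
            raise WarningFound(msg)
    else:
        report_warning = log.warning

    for line in stdout_text.splitlines():
        if "WARNING" in line:
            report_warning("IOR command issued warnings.")


sample = "ior starting\nWARNING: stonewall hit\nior done\n"
scan_output(sample, fail_on_warning=False)   # logs a warning and continues
try:
    scan_output(sample, fail_on_warning=True)
except WarningFound as err:
    print("would fail the test:", err)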
Example #14
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If it is true, create pool and
                container else just run the ior. Defaults to True.
            create_cont (bool, optional): Create new container. Default is True
            stop_dfuse (bool, optional): Stop dfuse after ior command is
                finished. Default is True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            str: the path for the mpi job manager command

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail("Exiting Test: Inappropriate operation type \
                      for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to interception library. Defaults
                to None.
            display_space (bool, optional): whether to display the pool space
                before and after running IOR. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)

        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop IOR process.
        Args:
            manager (str): mpi job manager command
        """
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", self.mpirun.__str__())

        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()


    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str): path to the interception library. Shall be used
                             only for POSIX through DFUSE.
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run in parallel.
        # Job1 will have 3 client set up to use dfuse + interception
        # library
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1,
                                results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2,
                                results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: a threading.Thread object set up to run run_multiple_ior

        """
        job = threading.Thread(target=self.run_multiple_ior, args=[
            clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults to
                None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size will be verified against
        # NVMe free space; otherwise it will be checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; size verification will be done "
                "against NVMe free space")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; size verification will be done "
                "against SCM free space")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd

          Args:
            cmd (str): String command to be executed
            fail_on_err (bool): Boolean for whether to fail the test if command
                                execution returns non zero return code.
            display_output (bool): Boolean for whether to display output.

          Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                  values indicating which hosts yielded the return code.
        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output, timeout=300)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in
                         ret.items() if code != 0]))
                if fail_on_err:
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))

        # report error if any command fails
        except CommandFailure as error:
            self.log.error("execute_cmd failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return ret
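
A standalone sketch of the return-code handling used in execute_cmd above. The
dictionary shape (exit code mapped to the hosts that returned it) mirrors what
general_utils.pcmd returns, but the host values here are plain strings and the
data is made up for illustration.

def hosts_with_errors(ret):
    """Reduce a pcmd-style {exit_code: hosts} dict to the failing host sets."""
    return [hosts for code, hosts in ret.items() if code != 0]

# Hypothetical pcmd result: two hosts succeeded, one returned exit code 1.
example_ret = {0: "wolf-[1-2]", 1: "wolf-3"}
if 0 not in example_ret:
    # Mirrors execute_cmd above: it only raises when no host returned 0.
    raise RuntimeError("Error running cmd on: {}".format(
        ",".join(str(hosts) for hosts in hosts_with_errors(example_ret))))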
Example #15
0
    def test_metadata_server_restart(self):
        """JIRA ID: DAOS-1512.

        Test Description:
            This test will verify 2000 small-size IOR containers after a server
            restart. The test writes IOR data in 5 different threads for faster
            execution time. Each thread creates 400 containers (8 bytes each)
            in the same pool. Restart the servers, read the IOR container files
            written previously, and validate data integrity by using the IOR
            options "-R -G 1".

        Use Cases:
            ?

        :avocado: tags=all,full_regression
        :avocado: tags=hw,large
        :avocado: tags=server,metadata,metadata_ior,nvme
        """
        self.create_pool()
        files_per_thread = 400
        total_ior_threads = 5

        processes = self.params.get("slots", "/run/ior/clientslots/*")

        list_of_uuid_lists = [[
            str(uuid.uuid4()) for _ in range(files_per_thread)
        ] for _ in range(total_ior_threads)]

        # Setup the thread manager
        thread_manager = ThreadManager(run_ior_loop, self.timeout - 30)

        # Launch threads to run IOR to write data, restart the agents and
        # servers, and then run IOR to read the data
        for operation in ("write", "read"):
            # Create the IOR threads
            for index in range(total_ior_threads):
                # Define the arguments for the run_ior_loop method
                ior_cmd = IorCommand()
                ior_cmd.get_params(self)
                ior_cmd.set_daos_params(self.server_group, self.pool)
                ior_cmd.flags.value = self.params.get(
                    "F", "/run/ior/ior{}flags/".format(operation))

                # Define the job manager for the IOR command
                self.ior_managers.append(Orterun(ior_cmd))
                env = ior_cmd.get_default_env(str(self.ior_managers[-1]))
                self.ior_managers[-1].assign_hosts(self.hostlist_clients,
                                                   self.workdir, None)
                self.ior_managers[-1].assign_processes(processes)
                self.ior_managers[-1].assign_environment(env)
                self.ior_managers[-1].verbose = False

                # Add a thread for these IOR arguments
                thread_manager.add(manager=self.ior_managers[-1],
                                   uuids=list_of_uuid_lists[index],
                                   tmpdir_base=self.test_dir)
                self.log.info("Created %s thread %s with container uuids %s",
                              operation, index, list_of_uuid_lists[index])

            # Launch the IOR threads
            self.log.info("Launching %d IOR %s threads", thread_manager.qty,
                          operation)
            failed_thread_count = thread_manager.check_run()
            if failed_thread_count > 0:
                msg = "{} FAILED IOR {} Thread(s)".format(
                    failed_thread_count, operation)
                self.d_log.error(msg)
                self.fail(msg)

            # Restart the agents and servers after the write / before the read
            if operation == "write":
                # Stop the agents
                errors = self.stop_agents()
                self.assertEqual(
                    len(errors), 0,
                    "Error stopping agents:\n  {}".format("\n  ".join(errors)))

                # Restart the servers w/o formatting the storage
                errors = self.restart_servers()
                self.assertEqual(
                    len(errors), 0, "Error stopping servers:\n  {}".format(
                        "\n  ".join(errors)))

                # Start the agents
                self.start_agent_managers()

        self.log.info("Test passed")
Example #16
0
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        #self.container = TestContainer(self.pool)
        #self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)
        try:
            container = subprocess.Popen(cmd,
                                         stdout=subprocess.PIPE,
                                         shell=True)
            (output, err) = container.communicate()
            if container.returncode != 0:
                self.fail("Container create failed: {}".format(err))
            self.log.info("Container created with UUID %s", output.split()[3])

        except (OSError, IndexError) as err:
            self.fail("Container create failed: {}".format(err))

        return output.split()[3]

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                be used only for POSIX through DFUSE. Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            if self.ior_cmd.transfer_size.value == "256B":
                self.cancelForTicket("DAOS-3449")
            self.start_dfuse()
            self.ior_cmd.test_file.update(self.dfuse.mount_dir.value +
                                          "/testfile")

        out = self.run_ior(self.get_job_manager_command(), self.processes,
                           intercept)

        return out

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpirun job manager for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, DAOS, or POSIX mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to interception library. Defaults
                to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.tmp,
                                           self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size will be verified against
        # NVMe free space; otherwise it will be checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; size verification will be done "
                "against NVMe free space")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; size verification will be done "
                "against SCM free space")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
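
A framework-free sketch of the comparison that verify_pool_size performs. The
expected value is computed here as block_size times the process count, which is
only an assumption about what IorCommand.get_aggregate_total returns for a
single-segment run; the numbers are hypothetical.

def verify_free_space_delta(free_before, free_after, block_size, processes):
    """Fail if the pool consumed less space than the IOR job should have written."""
    actual = free_before - free_after
    expected = block_size * processes  # assumed single-segment aggregate
    if actual < expected:
        raise AssertionError(
            "Pool free size did not match: actual={}, expected={}".format(
                actual, expected))

# 1 GiB free before, 768 MiB free after, 16 MiB blocks, 16 processes
verify_free_space_delta(1 << 30, 768 << 20, 16 << 20, 16)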
Example #17
0
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)
        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(self.pool,
                                       daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self,
                          intercept=None,
                          test_file_suffix="",
                          test_file="daos:testFile"):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.

        Returns:
            CmdResult: result of the ior command execution

        """
        self.update_ior_cmd_with_pool()
        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create the container, and then start dfuse
            if self.ior_cmd.transfer_size.value == "256B":
                return "Skipping the case for transfer_size=256B"
            self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Always create a container
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        self.pool.connect()
        self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpirun job manager for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, DAOS, POSIX, or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        return Mpirun(self.ior_cmd, mpitype="mpich")

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to interception library. Defaults
                to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str): path to the interception library. Shall be used
                             only for POSIX through DFUSE.
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run in parallel.
        # Job1 will have 3 client set up to use dfuse + interception
        # library
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1, results,
                                intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2, results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): client hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: a threading.Thread object set up to run run_multiple_ior

        """
        hostfile = write_host_file.write_host_file(clients, self.workdir,
                                                   self.hostfile_clients_slots)
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[hostfile,
                  len(clients), results, job_num, intercept])
        return job

    def run_multiple_ior(self,
                         hostfile,
                         num_clients,
                         results,
                         job_num,
                         intercept=None):
        # pylint: disable=too-many-arguments
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str): path to interception library.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * num_clients
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, hostfile, procs)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size will be verified against
        # NVMe free space; otherwise it will be checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; size verification will be done "
                "against NVMe free space")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; size verification will be done "
                "against SCM free space")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
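
The parallel jobs in this example share a single ior_cmd object, so each thread
updates the shared parameters under a lock and scales its process count to the
slice of clients it owns. A minimal sketch of that pattern with a hypothetical
worker and no DAOS calls:

import threading

LOCK = threading.Lock()
TOTAL_PROCESSES = 32
ALL_CLIENTS = ["client-1", "client-2", "client-3", "client-4"]

def scaled_processes(clients):
    """Scale the total process count down to the clients this job uses."""
    return (TOTAL_PROCESSES // len(ALL_CLIENTS)) * len(clients)

def run_job(clients, job_num, results):
    with LOCK:
        # the real test updates the shared ior_cmd test_file and manager here
        procs = scaled_processes(clients)
    results[job_num] = procs  # the real job stores IOR metrics instead

results = {}
job1 = threading.Thread(target=run_job, args=(ALL_CLIENTS[:-1], 1, results))
job2 = threading.Thread(target=run_job, args=([ALL_CLIENTS[-1]], 2, results))
job1.start()
job2.start()
job1.join()
job2.join()
assert results == {1: 24, 2: 8}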
Example #18
0
    def test_metadata_server_restart(self):
        """JIRA ID: DAOS-1512.

        Test Description:
            This test will verify 2000 small-size IOR containers after a server
            restart. The test writes IOR data in 5 different threads for faster
            execution time. Each thread creates 400 containers (8 bytes each)
            in the same pool. Restart the servers, read the IOR container files
            written previously, and validate data integrity by using the IOR
            options "-R -G 1".

        Use Cases:
            ?

        :avocado: tags=metadata,metadata_ior,nvme,small
        """
        files_per_thread = 400
        total_ior_threads = 5
        self.out_queue = Queue.Queue()

        processes = self.params.get("slots", "/run/ior/clientslots/*")

        list_of_uuid_lists = [[
            str(uuid.uuid4()) for _ in range(files_per_thread)
        ] for _ in range(total_ior_threads)]

        # Launch threads to run IOR to write data, restart the agents and
        # servers, and then run IOR to read the data
        for operation in ("write", "read"):
            # Create the IOR threads
            threads = []
            for index in range(total_ior_threads):
                # Define the arguments for the ior_runner_thread method
                ior_cmd = IorCommand()
                ior_cmd.get_params(self)
                ior_cmd.set_daos_params(self.server_group, self.pool)
                ior_cmd.flags.value = self.params.get(
                    "F", "/run/ior/ior{}flags/".format(operation))

                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(target=ior_runner_thread,
                                     kwargs={
                                         "ior_cmd": ior_cmd,
                                         "uuids": list_of_uuid_lists[index],
                                         "mgr": self.orterun,
                                         "attach": self.tmp,
                                         "hostfile": self.hostfile_clients,
                                         "procs": processes,
                                         "results": self.out_queue
                                     }))

                self.log.info("Creatied %s thread %s with container uuids %s",
                              operation, index, list_of_uuid_lists[index])

            # Launch the IOR threads
            if self.thread_control(threads, operation) == "FAIL":
                self.d_log.error("IOR {} Thread FAIL".format(operation))
                self.fail("IOR {} Thread FAIL".format(operation))

            # Restart the agents and servers after the write / before the read
            if operation == "write":
                # Stop the agents and servers
                if self.agent_sessions:
                    stop_agent(self.agent_sessions, self.hostlist_clients)
                stop_server(hosts=self.hostlist_servers)

                # Start the agents
                self.agent_sessions = run_agent(self.basepath,
                                                self.hostlist_clients,
                                                self.hostlist_servers)

                # Start the servers
                run_server(self.hostfile_servers,
                           self.server_group,
                           self.basepath,
                           clean=False)
Example #19
0
    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(TestPool(self.context, self.log))
            self.pool[-1].get_params(self)

        # Get ior params
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)

        # Cancel any tests with tickets already assigned
        if rank == 1 or rank == 2:
            self.cancelForTicket("DAOS-2434")

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append({
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                })
                rebuild_checks.append({
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                })

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            ior_cmd.flags.update("-v -w -W -G 1 -k", "ior.flags")
            ior_cmd.daos_destroy.update(False, "ior.daos_destroy")
            ior_cmd.set_daos_params(self.server_group, self.pool[0])
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, ior_cmd.daos_cont.value)
            self.run_ior(loop_id, ior_cmd)

            # Exclude the first rank from the first pool to initiate rebuild
            self.pool[0].start_rebuild(self.server_group, rank, self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            new_containers = self.add_containers_during_rebuild(
                loop_id, cont_qty, self.pool[0], self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of the created containers exists by opening it
            for index, container in enumerate(new_containers):
                count = "{}/{}".format(index + 1, len(new_containers))
                if not self.access_container(loop_id, container, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index, container in enumerate(new_containers):
                container.destroy()

            # Read the data from the container created before rebuild
            self.log.info("%s: Running IOR on pool %s to verify container %s",
                          loop_id, self.pool[0].uuid, ior_cmd.daos_cont.value)
            ior_cmd.flags.update("-v -r -R -G 1 -E", "ior.flags")
            ior_cmd.daos_destroy.update(True, "ior.daos_destroy")
            self.run_ior(loop_id, ior_cmd)

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info("%s: Loop %s", loop_id,
                          "passed" if errors[loop] == 0 else "failed")

        self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
Example #20
0
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def run_ior_with_pool(self):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            ior_flags (str, optional): ior flags. Defaults to None.
            object_class (str, optional): daos object class. Defaults to None.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()

        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # Run IOR
        self.run_ior(self.get_job_manager_command(), self.processes)

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpirun job manager for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DAOS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
        """
        env = self.ior_cmd.get_default_env(
            str(manager), self.tmp, self.client_log)
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            manager.run()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size will be verified against
        # NVMe free space; otherwise it will be checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; size verification will be done "
                "against NVMe free space")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; size verification will be done "
                "against SCM free space")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
Example #21
0
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        self (obj): soak test object
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool object
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [sbatch_cmds, log_name] entries, one per IOR variant

    """
    commands = []
    ior_params = os.path.join(os.sep, "run", job_spec, "*")
    ior_timeout = self.params.get("job_timeout", ior_params, 10)
    mpi_module = self.params.get("mpi_module",
                                 "/run/*",
                                 default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params)
    tsize_list = self.params.get("transfer_size", ior_params)
    bsize_list = self.params.get("block_size", ior_params)
    oclass_list = self.params.get("dfs_oclass", ior_params)
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    # Cancel for ticket DAOS-6095
                    if (api in ["HDF5-VOL", "HDF5", "POSIX"] and t_size == "4k"
                            and o_type in ["RP_2G1", 'RP_2GX']):
                        self.add_cancel_ticket(
                            "DAOS-6095",
                            "IOR -a {} with -t {} and -o {}".format(
                                api, t_size, o_type))
                        continue
                    # Cancel for ticket DAOS-6308
                    if api == "MPIIO" and o_type == "RP_2GX":
                        self.add_cancel_ticket(
                            "DAOS-6308",
                            "IOR -a {} with -o {}".format(api, o_type))
                        continue
                    if api in ["HDF5-VOL", "HDF5", "POSIX"] and ppn > 16:
                        continue
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    ior_cmd.max_duration.update(ior_timeout)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    if api in ["HDF5-VOL", "POSIX"]:
                        ior_cmd.dfs_oclass.update(None)
                        ior_cmd.dfs_dir_oclass.update(None)
                    else:
                        ior_cmd.dfs_oclass.update(o_type)
                        ior_cmd.dfs_dir_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(os.path.join("/", "testfile"))
                    add_containers(self, pool, o_type)
                    ior_cmd.set_daos_params(self.server_group, pool,
                                            self.container[-1].uuid)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    log_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(
                        job_spec, api, b_size, t_size, o_type,
                        nodesperjob * ppn, nodesperjob, ppn)
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self,
                            pool,
                            self.container[-1],
                            nodesperjob,
                            "SLURM",
                            name=log_name,
                            job_spec=job_spec)
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR {} cmdlines>>:".format(api))
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
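
The four nested loops above walk the cartesian product of the parameter lists
read from the yaml file. The same matrix can be expressed with
itertools.product; the parameter values below are made up for illustration:

import itertools

api_list = ["DFS", "MPIIO", "POSIX"]
bsize_list = ["1G"]
tsize_list = ["4k", "1m"]
oclass_list = ["SX", "RP_2GX"]

for api, b_size, t_size, o_type in itertools.product(
        api_list, bsize_list, tsize_list, oclass_list):
    # one IOR command line is generated per combination, as in
    # create_ior_cmdline above
    log_name = "{}_{}_{}_{}".format(api, b_size, t_size, o_type)
    print(log_name)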
Example #22
0
class ServerFillUp(IorTestBase):
    # pylint: disable=too-many-ancestors,too-many-instance-attributes
    """Class to fill up the servers based on pool percentage given.

    It will get the drives listed in yaml file and find the maximum capacity of
    the pool which will be created.
    IOR block size will be calculated as part of function based on percentage
    of pool needs to fill up.
    """

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super().__init__(*args, **kwargs)
        self.capacity = 1
        self.no_of_servers = 1
        self.no_of_drives = 1
        self.pool = None
        self.dmg = None
        self.set_faulty_device = False
        self.set_online_rebuild = False
        self.scm_fill = False
        self.nvme_fill = False
        self.ior_matrix = None
        self.ior_local_cmd = None
        self.result = []
        self.fail_on_warning = False
        self.rank_to_kill = []
        self.pool_exclude = {}
        self.nvme_local_cont = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()
        self.hostfile_clients = None
        self.ior_local_cmd = IorCommand()
        self.ior_local_cmd.get_params(self)
        self.ior_default_flags = self.ior_local_cmd.flags.value
        self.ior_scm_xfersize = self.params.get("transfer_size",
                                                '/run/ior/transfersize_blocksize/*', '2048')
        self.ior_read_flags = self.params.get("read_flags", '/run/ior/iorflags/*', '-r -R -k -G 1')
        self.ior_nvme_xfersize = self.params.get("nvme_transfer_size",
                                                 '/run/ior/transfersize_blocksize/*', '16777216')
        # Get the number of daos_engine
        self.engines = self.server_managers[0].manager.job.yaml.engine_params
        self.dmg_command = self.get_dmg_command()

    def create_container(self):
        """Create the container """
        self.nvme_local_cont = self.get_container(self.pool, create=False)

        # update container oclass
        if self.ior_local_cmd.dfs_oclass:
            self.nvme_local_cont.oclass.update(self.ior_local_cmd.dfs_oclass.value)

        self.nvme_local_cont.create()

    def start_ior_thread(self, create_cont, operation):
        """Start IOR write/read threads and wait until all threads are finished.

        Args:
            create_cont (bool): whether to create a new container.
            operation (str):
                Write/WriteRead: Write or Write/Read based on the IOR
                    parameters in the yaml file.
                Auto_Write/Auto_Read: calculate the IOR block size based on the
                    requested storage percentage to be filled.
        """
        # IOR flag can Write/Read based on test yaml
        self.ior_local_cmd.flags.value = self.ior_default_flags

        # Calculate the block size based on server % to fill up.
        if 'Auto' in operation:
            block_size = self.calculate_ior_block_size()
            self.ior_local_cmd.block_size.update('{}'.format(block_size))

        # For IOR Read operation update the read only flag from yaml file.
        if 'Auto_Read' in operation or operation == "Read":
            create_cont = False
            self.ior_local_cmd.flags.value = self.ior_read_flags

        self.ior_local_cmd.set_daos_params(self.server_group, self.pool)
        self.ior_local_cmd.test_file.update('/testfile')

        # Created new container or use the existing container for reading
        if create_cont:
            self.create_container()
        self.ior_local_cmd.dfs_cont.update(self.nvme_local_cont.uuid)

        # Define the job manager for the IOR command
        job_manager_main = get_job_manager(self, "Mpirun", self.ior_local_cmd, mpi_type="mpich")
        env = self.ior_local_cmd.get_default_env(str(job_manager_main))
        job_manager_main.assign_hosts(self.hostlist_clients, self.workdir, None)
        job_manager_main.assign_environment(env, True)
        job_manager_main.assign_processes(self.params.get("np", '/run/ior/client_processes/*'))

        # run IOR Command
        try:
            output = job_manager_main.run()
            self.ior_matrix = IorCommand.get_ior_metrics(output)

            for line in output.stdout_text.splitlines():
                if 'WARNING' in line and self.fail_on_warning:
                    self.result.append("FAIL-IOR command issued warnings.")
        except (CommandFailure, TestFail) as error:
            self.result.append("FAIL - {}".format(error))

    def calculate_ior_block_size(self):
        """Calculate IOR Block size to fill up the Server.

        Returns:
            block_size(int): IOR Block size

        """
        if self.scm_fill:
            free_space = self.pool.get_pool_daos_space()["s_total"][0]
            self.ior_local_cmd.transfer_size.value = self.ior_scm_xfersize
        elif self.nvme_fill:
            free_space = self.pool.get_pool_daos_space()["s_total"][1]
            self.ior_local_cmd.transfer_size.value = self.ior_nvme_xfersize
        else:
            self.fail('Provide storage type (SCM/NVMe) to be filled')

        # Get the block size based on the capacity to be filled. For example,
        # if the free space is 100G and 50% of the capacity is to be filled:
        # (107374182400 / 100) * 50 gives 50% (50G) of space to be filled.
        _tmp_block_size = ((free_space/100)*self.capacity)

        # Check the IOR object class to calculate the correct block size.
        _replica = re.findall(r'_(.+?)G', self.ior_local_cmd.dfs_oclass.value)

        # Object class with no replica or EC suffix (e.g. SX): _tmp_block_size does not change.
        if not _replica:
            pass

        # For an EC object class, calculate the block size based on the number of data + parity
        # targets, then derive the write size for the data targets only. For example, to fill
        # 10% of a 100G pool with EC_4P1GX: the data target fill size is 8G, which writes 8G of
        # data plus 2G of parity, i.e. 10G in total (10% of the 100G pool).
        elif 'P' in _replica[0]:
            replica_server = re.findall(r'\d+', _replica[0])[0]
            parity_count = re.findall(r'\d+', _replica[0])[1]
            _tmp_block_size = int(_tmp_block_size / (int(replica_server) + int(parity_count)))
            _tmp_block_size = int(_tmp_block_size) * int(replica_server)

        # Replicated object class: divide by the replica count.
        else:
            _tmp_block_size = int(_tmp_block_size / int(_replica[0]))

        # Finally, divide the total size by the number of IOR processes.
        _tmp_block_size = int(_tmp_block_size) / self.processes

        # Round the final IOR block size down to a multiple of the transfer size.
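        # Worked example (illustrative numbers only): with _tmp_block_size = 40000000 and a
        # 16777216 (16 MiB) transfer size, int(40000000 / 16777216) * 16777216 = 33554432,
        # i.e. the size is rounded down to the nearest multiple of the transfer size.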
        block_size = (int(_tmp_block_size / int(self.ior_local_cmd.transfer_size.value)) * int(
            self.ior_local_cmd.transfer_size.value))

        return block_size
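
    # A minimal sketch (not part of the original class): a standalone version of the EC branch
    # of the calculation above. The helper name and its parameters are illustrative assumptions,
    # useful only for checking the arithmetic in isolation.
    @staticmethod
    def _ec_block_size_sketch(free_space, percent, data_count, parity_count, processes, xfer_size):
        """Return an IOR block size for an EC object class (illustrative sketch only)."""
        # Space to fill, e.g. 10% of a 100G pool.
        fill_size = (free_space / 100) * percent
        # Split across data + parity cells, then write only the data portion;
        # parity is generated on top of the written data.
        fill_size = int(fill_size / (data_count + parity_count)) * data_count
        # Divide among the IOR processes and round down to a multiple of the transfer size.
        per_process = int(fill_size) / processes
        return int(per_process / xfer_size) * xfer_size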

    def set_device_faulty(self, server, disk_id):
        """Set the devices to Faulty and wait for rebuild to complete.

        Args:
            server (string): server hostname where it generate the NVMe fault.
            disk_id (string): NVMe disk ID where it will be changed to faulty.
        """
        self.dmg.hostlist = server
        self.dmg.storage_set_faulty(disk_id)
        result = self.dmg.storage_query_device_health(disk_id)
        # Check if device state changed to EVICTED.
        if 'State:EVICTED' not in result.stdout_text:
            self.fail("device State {} on host {} suppose to be EVICTED".format(disk_id, server))

        # Wait for rebuild to start
        self.pool.wait_for_rebuild(True)
        # Wait for rebuild to complete
        self.pool.wait_for_rebuild(False)

    def set_device_faulty_loop(self):
        """Set devices to Faulty one by one and wait for rebuild to complete."""
        # Get the device ids from all servers and try to eject the disks
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)
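        # device_ids maps each server hostname to its list of NVMe device IDs,
        # e.g. {'host-1': ['uuid-a', 'uuid-b']} (shape inferred from the indexing below).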

        # no_of_servers and no_of_drives can be set from the test yaml.
        # 1 server, 1 drive = remove a single drive from a single server.
        for num in range(0, self.no_of_servers):
            server = self.hostlist_servers[num]
            for disk_id in range(0, self.no_of_drives):
                self.set_device_faulty(server, device_ids[server][disk_id])

    def get_max_storage_sizes(self):
        """Get the maximum pool sizes for the current server configuration.

        Returns:
            list: a list of the maximum SCM and NVMe size

        """
        try:
            sizes_dict = self.server_managers[0].get_available_storage()
            sizes = [sizes_dict["scm"], sizes_dict["nvme"]]
        except (ServerFailed, KeyError) as error:
            self.fail(error)

        # Return 96% of the storage space since the full 100% cannot be used for pool creation.
        for index, size in enumerate(sizes):
            sizes[index] = int(size * 0.96)

        return sizes

    def create_pool_max_size(self, scm=False, nvme=False):
        """Create a single pool with Maximum NVMe/SCM size available.

        Args:
            scm (bool): Whether to create the pool with the maximum SCM size.
            nvme (bool): Whether to create the pool with the maximum NVMe size.

        Note: Method to fill up the server. It gets the maximum available storage space and
              creates the pool. Replace with dmg options in the future when they are available.
        """
        # Create a pool
        self.add_pool(create=False)

        if nvme or scm:
            sizes = self.get_max_storage_sizes()

        # If nvme is True, use the maximum NVMe size from the servers
        if nvme:
            self.pool.nvme_size.update('{}'.format(sizes[1]))

        # If scm is True, use the maximum SCM size from the servers
        if scm:
            self.pool.scm_size.update('{}'.format(sizes[0]))

        # Create the Pool
        self.pool.create()

    def kill_rank_thread(self, rank):
        """
        Server rank kill thread function

        Args:
            rank: Rank number to kill the daos server
        """
        self.server_managers[0].stop_ranks([rank], self.d_log, force=True)

    def exclude_target_thread(self, rank, target):
        """
        Target kill thread function

        Args:
            rank(int): Rank number to kill the target from
            target(str): target number or range of targets to kill
        """
        self.dmg_command.pool_exclude(self.pool.uuid, rank, str(target))

    def start_ior_load(self, storage='NVMe', operation="WriteRead",
                       percent=1, create_cont=True):
        """Fill up the server either SCM or NVMe.

        Fill up based on percent amount given using IOR.

        Args:
            storage (str): SCM or NVMe; by default NVMe is filled.
            operation (str): Write/Read operation.
            percent (int): percentage of storage to be filled.
            create_cont (bool): Whether to create a new container for IOR.
        """
        kill_rank_job = []
        kill_target_job = []
        self.result.clear()
        self.capacity = percent
        # Fill up NVMe by default
        self.nvme_fill = 'NVMe' in storage
        self.scm_fill = 'SCM' in storage

        # Create the IOR thread
        job = threading.Thread(target=self.start_ior_thread, kwargs={"create_cont": create_cont,
                                                                     "operation": operation})
        # Launch the IOR thread
        job.start()

        # Set the NVMe device faulty if requested
        if self.set_faulty_device:
            time.sleep(60)
            # Set the device faulty
            self.set_device_faulty_loop()

        # Kill the server rank while IOR is in progress
        if self.set_online_rebuild:
            time.sleep(30)
            # Kill the server ranks in background threads
            for _id, _rank in enumerate(self.rank_to_kill):
                kill_rank_job.append(threading.Thread(target=self.kill_rank_thread,
                                                      kwargs={"rank": _rank}))
                kill_rank_job[_id].start()

            # Exclude the targets from the ranks in background threads
            for _id, (key, value) in enumerate(self.pool_exclude.items()):
                kill_target_job.append(threading.Thread(target=self.exclude_target_thread,
                                                        kwargs={"rank": key,
                                                                "target": value}))
                kill_target_job[_id].start()

            # Wait for the rank kill threads to finish
            for _kill_rank in kill_rank_job:
                _kill_rank.join()

            # Wait for the target exclude threads to finish
            for _kill_tgt in kill_target_job:
                _kill_tgt.join()

        # Wait for the IOR thread to finish
        job.join()

        # Fail the test if any IOR run failed
        for test_result in self.result:
            if "FAIL" in test_result:
                self.fail(test_result)
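
    # A minimal usage sketch (not in the original source): how a test built on the helpers
    # above might fill 10% of NVMe while killing a rank during the IOR write. The method name
    # and the hard-coded rank/percent values are illustrative assumptions; in the real tests
    # these values come from the yaml parameters.
    def test_nvme_fill_online_rebuild_sketch(self):
        """Illustrative sketch only; not part of the original test class."""
        self.create_pool_max_size(nvme=True)
        # Kill rank 1 in the background once IOR has been running for a while.
        self.set_online_rebuild = True
        self.rank_to_kill = [1]
        self.pool_exclude = {}
        self.start_ior_load(storage='NVMe', operation='Write', percent=10)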
Beispiel #23
0
    def test_metadata_server_restart(self):
        """JIRA ID: DAOS-1512.

        Test Description:
            This test will verify 2000 small-size IOR containers after server
            restart. The test will write IOR data in 5 different threads for
            faster execution; each thread creates 400 containers (8-byte
            writes) in the same pool. Restart the servers, read the IOR
            container files written previously, and validate data integrity
            using the IOR options "-R -G 1".

        Use Cases:
            ?

        :avocado: tags=metadata,metadata_ior,nvme,large
        """
        files_per_thread = 400
        total_ior_threads = 5
        self.out_queue = queue.Queue()

        processes = self.params.get("slots", "/run/ior/clientslots/*")

        list_of_uuid_lists = [
            [str(uuid.uuid4()) for _ in range(files_per_thread)]
            for _ in range(total_ior_threads)]

        # Launch threads to run IOR to write data, restart the agents and
        # servers, and then run IOR to read the data
        for operation in ("write", "read"):
            # Create the IOR threads
            threads = []
            for index in range(total_ior_threads):
                # Define the arguments for the ior_runner_thread method
                ior_cmd = IorCommand()
                ior_cmd.get_params(self)
                ior_cmd.set_daos_params(self.server_group, self.pool)
                ior_cmd.flags.value = self.params.get(
                    "F", "/run/ior/ior{}flags/".format(operation))

                # Define the job manager for the IOR command
                manager = Orterun(ior_cmd)
                env = ior_cmd.get_default_env(str(manager))
                manager.assign_hosts(self.hostlist_clients, self.workdir, None)
                manager.assign_processes(processes)
                manager.assign_environment(env)

                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(
                        target=ior_runner_thread,
                        kwargs={
                            "manager": manager,
                            "uuids": list_of_uuid_lists[index],
                            "results": self.out_queue}))

                self.log.info(
                    "Creatied %s thread %s with container uuids %s", operation,
                    index, list_of_uuid_lists[index])

            # Launch the IOR threads
            if self.thread_control(threads, operation) == "FAIL":
                self.d_log.error("IOR {} Thread FAIL".format(operation))
                self.fail("IOR {} Thread FAIL".format(operation))

            # Restart the agents and servers after the write / before the read
            if operation == "write":
                # Stop the agents
                errors = self.stop_agents()
                self.assertEqual(
                    len(errors), 0,
                    "Error stopping agents:\n  {}".format("\n  ".join(errors)))

                # Stop the servers
                errors = self.stop_servers()
                self.assertEqual(
                    len(errors), 0,
                    "Error stopping servers:\n  {}".format("\n  ".join(errors)))

                # Start the agents
                self.start_agent_managers()

                # Start the servers
                self.start_server_managers()
Beispiel #24
0
    def ior_runner_thread(self, results):
        """Start threads and wait until all threads are finished.

        Destroy the container at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR parameter combinations and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the IOR command for this parameter combination
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            container_info["{}{}{}"
                           .format(oclass,
                                   api,
                                   test[0])] = str(uuid.uuid4())

            # Define the job manager for the IOR command
            manager = Mpirun(ior_cmd, mpitype="mpich")
            manager.job.dfs_cont.update(
                container_info["{}{}{}".format(oclass, api, test[0])])
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env, True)

            # run IOR Command
            try:
                manager.run()
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the containers created by this thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
            cmd.sub_command_class.sub_command_class.svc.value = self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = container_info[key]

            try:
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")