def run_ior_collect_error(self, results, job_num, file_name, clients):
    """Run IOR command and store error in results.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        clients (list): Client hostnames to run IOR from.
    """
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(
        group=self.server_group, pool=self.pool,
        cont_uuid=self.container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd,
        subprocess=self.subprocess, mpi_type="mpich")
    manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    try:
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # We'll verify the error message.
        results[job_num].append(ior_output.stderr_text)
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]
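# Usage sketch (an assumption for illustration, not part of the source):
# run two run_ior_collect_error() jobs in parallel threads against
# different client subsets, then check the shared results dict. The job
# numbers, file names, and client slicing are hypothetical.
import threading

def run_two_ior_jobs_example(self):
    results = {}
    threads = []
    for job_num, clients in ((1, self.hostlist_clients[:1]),
                             (2, self.hostlist_clients[1:])):
        threads.append(threading.Thread(
            target=self.run_ior_collect_error,
            args=(results, job_num, "testfile_{}".format(job_num), clients)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    for job_num, result in results.items():
        if not result[0]:
            self.fail("IOR job {} failed: {}".format(job_num, result[1]))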
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [cmdline, log_name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("daos_oclass", ior_params + "*")
    # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
    if self.is_harasser("rebuild"):
        oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.daos_oclass.update(o_type)
                    ior_cmd.set_daos_params(self.server_group, pool)
                    # srun cmdline
                    nprocs = nodesperjob * ppn
                    env = ior_cmd.get_default_env("srun")
                    if ior_cmd.api.value == "MPIIO":
                        env["DAOS_CONT"] = ior_cmd.daos_cont.value
                    cmd = Srun(ior_cmd)
                    cmd.assign_processes(nprocs)
                    cmd.assign_environment(env, True)
                    cmd.ntasks_per_node.update(ppn)
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([str(cmd), log_name])
                    self.log.info("<<IOR cmdline>>: %s", commands[-1])
    return commands
def run_ior_threads_il(self, results, intercept, with_clients,
                       without_clients):
    """Execute two IOR threads in parallel.

    One thread is run with the interception library (IL) and one without.

    Args:
        results (dict): Dictionary to store the IOR results that get
            printed in the IOR output.
        intercept (str): Path to the interception library. Shall be used
            only for POSIX through DFUSE.
        with_clients (list): List of clients that use IL.
        without_clients (list): List of clients that don't use IL.
    """
    # We can't use the shared self.ior_cmd, so we need to create an
    # IorCommand object for each thread.
    ior_cmd1 = IorCommand()
    ior_cmd1.get_params(self)
    # Update IOR params with the pool and container params
    ior_cmd1.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    ior_cmd2 = IorCommand()
    ior_cmd2.get_params(self)
    ior_cmd2.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    # Start dfuse for the POSIX api. This is specific to the interception
    # library test requirements.
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # Create two threads and run them in parallel.
    thread1 = self.create_ior_thread(
        ior_cmd1, with_clients, 1, results, intercept)
    thread2 = self.create_ior_thread(
        ior_cmd2, without_clients, 2, results, None)

    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()

    self.stop_dfuse()

    # Basic verification of the thread results
    status = True
    for key in sorted(results):
        if not results[key].pop(0):
            self.log.error("IOR Thread %d: %s", key, results[key][0])
            status = False
        if len(results[key]) != 2:
            self.log.error(
                "IOR Thread %d: expecting 2 results; %d found: %s",
                key, len(results[key]), results[key])
            status = False
    if not status:
        self.fail("At least one IOR thread failed!")
def create_ior_cmdline(self, job_params, job_spec, pool):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_params (str): job params from yaml file
        job_spec (str): specific ior job to run
        pool (obj): TestPool obj

    Returns:
        list: a list of IOR cmdline strings

    """
    command = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/"

    ior_cmd = IorCommand()
    ior_cmd.namespace = ior_params
    ior_cmd.get_params(self)
    if iteration is not None and iteration < 0:
        ior_cmd.repetitions.update(1000000)
    ior_cmd.max_duration.update(self.params.get("time", job_params + '*'))
    # IOR job specs with a list of parameters; update each value:
    #   transfer_size, block_size, daos object class
    tsize_list = ior_cmd.transfer_size.value
    bsize_list = ior_cmd.block_size.value
    oclass_list = ior_cmd.daos_oclass.value
    for b_size in bsize_list:
        ior_cmd.block_size.update(b_size)
        for o_type in oclass_list:
            ior_cmd.daos_oclass.update(o_type)
            for t_size in tsize_list:
                ior_cmd.transfer_size.update(t_size)
                ior_cmd.set_daos_params(self.server_group, pool)
                # export the user environment to the test node
                exports = ["ALL"]
                if ior_cmd.api.value == "MPIIO":
                    env = {
                        "CRT_ATTACH_INFO_PATH": os.path.join(
                            self.basepath, "install/tmp"),
                        "DAOS_POOL": str(ior_cmd.daos_pool.value),
                        "MPI_LIB": "\"\"",
                        "DAOS_SVCL": str(ior_cmd.daos_svcl.value),
                        "DAOS_SINGLETON_CLI": 1,
                        "FI_PSM2_DISCONNECT": 1,
                    }
                    exports.extend(
                        ["{}={}".format(key, val)
                         for key, val in env.items()])
                cmd = "srun -l --mpi=pmi2 --export={} {}".format(
                    ",".join(exports), ior_cmd)
                command.append(cmd)
                self.log.debug("<<IOR cmdline>>: %s", cmd)
    return command
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run an IOR command; used as a thread target.

    Failures are reported through the results queue.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed on: {}".format(
            self.hostfile_clients[0]))
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.daos_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)
    container_info["{}{}{}".format(oclass, api, test[2])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.daos_cont.update(
        container_info["{}{}{}".format(oclass, api, test[2])])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def run_ior_report_error(self, results, job_num, file_name, pool, container,
                         namespace):
    """Run IOR command and store the results in the results dictionary.

    Create a new IorCommand object instead of using the one in IorTestBase
    because we'll run a test that runs multiple IOR processes at the same
    time.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        pool (TestPool): Pool to run IOR.
        container (TestContainer): Container to run IOR.
        namespace (str): IOR parameter namespace in the test yaml; selects
            the object class and other parameters per test case.
    """
    # Update the object class depending on the test case.
    ior_cmd = IorCommand(namespace=namespace)
    ior_cmd.get_params(self)

    # Standard IOR prep sequence.
    ior_cmd.set_daos_params(self.server_group, pool, container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd,
        subprocess=self.subprocess, mpi_type="mpich")
    manager.assign_hosts(
        self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    # Run the command.
    try:
        self.log.info("--- IOR command %d start ---", job_num)
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # Command worked, but append the error message if any.
        results[job_num].append(ior_output.stderr_text)
        self.log.info("--- IOR command %d end ---", job_num)
    except CommandFailure as error:
        self.log.info("--- IOR command %d failed ---", job_num)
        results[job_num] = [False, "IOR failed: {}".format(error)]
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run an IOR command; used as a thread target.

    Failures are reported through the results queue.

    Args:
        pool (TestPool): Pool to run IOR command on.
        oclass (str): IOR object class
        api (str): IOR API
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)
    container_info["{}{}{}".format(oclass, api, test[2])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    job_manager = get_job_manager(self, "Mpirun", ior_cmd, mpi_type="mpich")
    key = "{}{}{}".format(oclass, api, test[2])
    job_manager.job.dfs_cont.update(container_info[key])
    env = ior_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(processes)
    job_manager.assign_environment(env, True)

    # run IOR Command
    try:
        job_manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
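# Usage sketch (an assumption): ior_thread() reports failures through the
# queue, so a caller can fan out one thread per (oclass, api, test)
# combination and drain the queue afterwards. The parameter values below
# are hypothetical; test[2]/test[3] carry the transfer and block sizes.
import queue
import threading

def run_ior_threads_example(self, pool):
    results = queue.Queue()
    threads = []
    for oclass, api, test in [("SX", "DFS", [0, 0, "1M", "32M"])]:
        threads.append(threading.Thread(
            target=self.ior_thread,
            args=(pool, oclass, api, test, "-w -W", results)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    while not results.empty():
        if results.get() == "FAIL":
            self.fail("An IOR thread reported a failure")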
def ior_bg_thread(self, results):
    """Start an IOR background thread.

    Writes a small data set and keeps reading it in a loop until it fails
    or the main program exits.

    Args:
        results (queue): queue for returning thread results
    """
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Define the IOR Command and use the parameters from the yaml file.
    ior_bg_cmd = IorCommand()
    ior_bg_cmd.get_params(self)
    ior_bg_cmd.set_daos_params(self.server_group, self.pool)
    ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    ior_bg_cmd.api.update(self.ior_cmd.api.value)
    ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
    ior_bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
    self.create_cont()
    self.job_manager.job.dfs_cont.update(self.container.uuid)
    env = ior_bg_cmd.get_default_env(str(self.job_manager))
    self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    self.job_manager.assign_processes(1)
    self.job_manager.assign_environment(env, True)
    self.log.info('----Run IOR in Background-------')

    # run IOR Write Command
    try:
        self.job_manager.run()
    except (CommandFailure, TestFail) as _error:
        results.put("FAIL")
        return

    # run IOR Read Command in a loop
    ior_bg_cmd.flags.update(self.ior_read_flags)
    while True:
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            break
def ior_bg_thread(self):
    """Start an IOR background thread.

    Writes a small data set and keeps reading it in a loop until it fails
    or the main program exits.
    """
    # Define the IOR Command and use the parameters from the yaml file.
    ior_bg_cmd = IorCommand()
    ior_bg_cmd.get_params(self)
    ior_bg_cmd.set_daos_params(self.server_group, self.pool)
    ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    ior_bg_cmd.api.update(self.ior_cmd.api.value)
    ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
    ior_bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    job_manager = get_job_manager(
        self, "Mpirun", ior_bg_cmd, mpi_type="mpich")

    # create container
    container = self.get_container(self.pool)

    job_manager.job.dfs_cont.update(container.uuid)
    env = ior_bg_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(1)
    job_manager.assign_environment(env, True)
    self.log.info('----Run IOR in Background-------')

    # run IOR Write Command
    try:
        job_manager.run()
    except (CommandFailure, TestFail) as _error:
        self.test_result.append("FAIL")
        return

    # run IOR Read Command in a loop
    ior_bg_cmd.flags.update(self.ior_read_flags)
    while True:
        try:
            job_manager.run()
        except (CommandFailure, TestFail) as _error:
            break
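# Launch sketch (an assumption): because ior_bg_thread() loops until it
# fails, run it as a daemon thread so it dies with the test, and check
# self.test_result after the foreground workload completes.
import threading

def start_ior_bg_example(self):
    bg_thread = threading.Thread(target=self.ior_bg_thread, daemon=True)
    bg_thread.start()
    # ... run the foreground test workload here ...
    if "FAIL" in self.test_result:
        self.fail("Background IOR reported a failure")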
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Call the job manager for IOR command invocation.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR API
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[0])
    ior_cmd.block_size.update(test[1])
    ior_cmd.flags.update(flags)
    if "-w" in flags:
        self.container_info["{}{}{}".format(
            oclass, api, test[0])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    key = "".join([oclass, api, str(test[0])])
    manager.job.dfs_cont.update(self.container_info[key])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def run_ior_threads_il(self, results, intercept, with_clients,
                       without_clients):
    """Execute two IOR threads in parallel.

    One thread runs with the interception library (IL) and one without.

    Args:
        results (dict): Dictionary to store the IOR results that get
            printed in the IOR output.
        intercept (str): Path to the interception library. Shall be used
            only for POSIX through DFUSE.
        with_clients (list): List of clients that use IL.
        without_clients (list): List of clients that don't use IL.
    """
    # We can't use the shared self.ior_cmd, so we need to create an
    # IorCommand object for each thread.
    ior_cmd1 = IorCommand()
    ior_cmd1.get_params(self)
    # Update IOR params with the pool and container params
    ior_cmd1.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    ior_cmd2 = IorCommand()
    ior_cmd2.get_params(self)
    ior_cmd2.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    # Start dfuse for the POSIX api. This is specific to the interception
    # library test requirements.
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # Create two threads and run them in parallel.
    thread1 = self.create_ior_thread(
        ior_cmd1, with_clients, 1, results, intercept)
    thread2 = self.create_ior_thread(
        ior_cmd2, without_clients, 2, results, None)

    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()

    self.stop_dfuse()
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [sbatch_cmds, log_name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    mpi_module = self.params.get(
        "mpi_module", "/run/", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("dfs_oclass", ior_params + "*")
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # check if capable of doing rebuild; if yes then dfs_oclass = RP_*GX
    if is_harasser(self, "rebuild"):
        oclass_list = self.params.get("dfs_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.dfs_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    ior_cmd.set_daos_params(self.server_group, pool)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, nodesperjob, "SLURM")
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if the api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    sbatch_cmds.append("exit $status")
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
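# Consumer sketch (an assumption, not from the source tree): each entry
# returned by create_ior_cmdline() is a [sbatch_cmds, log_name] pair, so a
# caller could materialize one batch script per entry. The directory
# handling and file naming here are hypothetical.
import os

def write_sbatch_scripts_example(commands, script_dir):
    script_paths = []
    for sbatch_cmds, log_name in commands:
        path = os.path.join(script_dir, "{}.sh".format(log_name))
        with open(path, "w") as script:
            script.write("#!/bin/bash\n")
            script.write("\n".join(sbatch_cmds) + "\n")
        script_paths.append(path)
    return script_paths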
class IorTestBase(DfuseTestBase):
    # pylint: disable=too-many-ancestors
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super().__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.container = None
        self.ior_timeout = None
        self.ppn = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)
        self.ior_timeout = self.params.get("ior_timeout", '/run/ior/*', None)

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params and create a pool
        self.add_pool(connect=False)

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # update container oclass
        if self.ior_cmd.dfs_oclass:
            self.container.oclass.update(self.ior_cmd.dfs_oclass.value)

        # create container
        self.container.create()

    def display_pool_space(self, pool=None):
        """Display the current pool space.

        If the TestPool object has a DmgCommand object assigned, also
        display the free pool space per target.

        Args:
            pool (TestPool, optional): The pool for which to display space.
                Default is self.pool.
        """
        if not pool:
            pool = self.pool
        pool.display_pool_daos_space()
        if pool.dmg:
            pool.set_query_data()

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:/testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True, plugin_path=None,
                          timeout=None, fail_on_warning=False, mount_dir=None,
                          out_queue=None, env=None):
        # pylint: disable=too-many-arguments
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:/testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If true, create the pool and
                container; otherwise just run ior. Defaults to True.
            create_cont (bool, optional): Create a new container.
                Default is True.
            stop_dfuse (bool, optional): Stop dfuse after the ior command is
                finished. Default is True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable the dfuse (xattr) working directory which is
                needed to run the vol connector for DAOS. Default is None.
            timeout (int, optional): command timeout. Defaults to None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            mount_dir (str, optional): Create a specific mount point.
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None.
            env (EnvironmentVariables, optional): Environment to be used
                when calling run_ior. Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            # add a substring in case of HDF5-VOL
            if plugin_path:
                sub_dir = get_random_string(5)
                mount_dir = os.path.join(mount_dir, sub_dir)
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self.start_dfuse(
                    self.hostlist_clients, self.pool, self.container,
                    mount_dir)

        # setup test file for POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))
        job_manager = self.get_ior_job_manager_command()
        job_manager.timeout = timeout
        try:
            out = self.run_ior(
                job_manager, self.processes, intercept,
                plugin_path=plugin_path, fail_on_warning=fail_on_warning,
                out_queue=out_queue, env=env)
        finally:
            if stop_dfuse:
                self.stop_dfuse()

        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool.

        Args:
            create_cont (bool, optional): create a container.
                Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

    def get_ior_job_manager_command(self, custom_ior_cmd=None):
        """Get the MPI job manager command for IOR.

        Args:
            custom_ior_cmd (IorCommand): Custom IorCommand instance to create
                the job manager with.

        Returns:
            Mpirun: the mpi job manager object

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if custom_ior_cmd:
            self.job_manager = Mpirun(custom_ior_cmd, self.subprocess, "mpich")
        else:
            self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

        return self.job_manager

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status."""
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail(
                "Exiting Test: Inappropriate operation type for subprocess "
                "status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.job_manager.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True,
                plugin_path=None, fail_on_warning=False, pool=None,
                out_queue=None, env=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command.
            processes (int): number of host processes.
            intercept (str, optional): path to interception library.
            display_space (bool, optional): Whether to display the pool
                space. Defaults to True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable the dfuse (xattr) working directory which is
                needed to run the vol connector for DAOS. Default is None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            pool (TestPool, optional): The pool for which to display space.
                Default is self.pool.
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None.
            env (EnvironmentVariables, optional): Environment to be used
                when running ior. Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        if not env:
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env['LD_PRELOAD'] = intercept
            env['D_LOG_MASK'] = 'INFO'
            if env.get('D_IL_REPORT', None) is None:
                env['D_IL_REPORT'] = '1'
            # env['D_LOG_MASK'] = 'INFO,IL=DEBUG'
            # env['DD_MASK'] = 'all'
            # env['DD_SUBSYS'] = 'all'
        if plugin_path:
            env["HDF5_VOL_CONNECTOR"] = "daos"
            env["HDF5_PLUGIN_PATH"] = str(plugin_path)
            manager.working_dir.value = self.dfuse.mount_dir.value
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        if self.ppn is None:
            manager.assign_processes(processes)
        else:
            manager.ppn.update(self.ppn, 'mpirun.ppn')
            manager.processes.update(None, 'mpirun.np')

        manager.assign_environment(env)

        if not pool:
            pool = self.pool

        try:
            if display_space:
                self.display_pool_space(pool)
            out = manager.run()

            if self.subprocess:
                return out

            if fail_on_warning:
                report_warning = self.fail
            else:
                report_warning = self.log.warning

            for line in out.stdout_text.splitlines():
                if 'WARNING' in line:
                    report_warning("IOR command issued warnings.")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            # The queue is used when a thread calls another ior thread
            # (eg: thread1 --> thread2 --> ior)
            if out_queue is not None:
                out_queue.put("IOR Failed")
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.display_pool_space(pool)

    def stop_ior(self):
        """Stop the in-progress IOR command."""
        self.log.info("<IOR> Stopping in-progress IOR command: %s",
                      str(self.job_manager))
        try:
            out = self.job_manager.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.display_pool_space()

    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute two IOR threads in parallel.

        One thread is run with the interception library (IL) and one
        without.

        Args:
            results (dict): Dictionary to store the IOR results that get
                printed in the IOR output.
            intercept (str): Path to the interception library. Shall be
                used only for POSIX through DFUSE.
            with_clients (list): List of clients that use IL.
            without_clients (list): List of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create an
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update IOR params with the pool and container params
        ior_cmd1.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

        # Start dfuse for the POSIX api. This is specific to the
        # interception library test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run them in parallel.
        thread1 = self.create_ior_thread(
            ior_cmd1, with_clients, 1, results, intercept)
        thread2 = self.create_ior_thread(
            ior_cmd2, without_clients, 2, results, None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()

        # Basic verification of the thread results
        status = True
        for key in sorted(results):
            if not results[key].pop(0):
                self.log.error("IOR Thread %d: %s", key, results[key][0])
                status = False
            if len(results[key]) != 2:
                self.log.error(
                    "IOR Thread %d: expecting 2 results; %d found: %s",
                    key, len(results[key]), results[key])
                status = False
        if not status:
            self.fail("At least one IOR thread failed!")

    def create_ior_thread(self, ior_command, clients, job_num, results,
                          intercept=None):
        """Create a new thread for an ior run.

        Args:
            ior_command (IorCommand): IOR command instance.
            clients (list): hosts on which to run ior.
            job_num (int): Assigned job number.
            results (dict): A dictionary object to store the ior metrics.
            intercept (str, optional): Path to interception library.

        Returns:
            Thread: the thread object; call start() to run it.

        """
        job = threading.Thread(
            target=self.run_custom_ior_cmd,
            args=[ior_command, clients, results, job_num, intercept])
        return job

    def run_custom_ior_cmd(self, ior_command, clients, results, job_num,
                           intercept=None):
        """Run a customized IOR command, not self.ior_cmd.

        Expected to be used with threaded code where multiple IOR commands
        are executed in parallel.

        Displays pool space before running it for a reference.

        Args:
            ior_command (IorCommand): Custom IOR command instance.
            clients (list): hosts on which to run ior.
            results (dict): A dictionary object to store the ior metrics.
            job_num (int): Assigned job number.
            intercept (str, optional): path to interception library.
                Defaults to None.
        """
        self.log.info("--- IOR Thread %d: Start ---", job_num)
        tsize = ior_command.transfer_size.value
        testfile = os.path.join(
            self.dfuse.mount_dir.value, "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        ior_command.test_file.update(testfile)

        # Get the custom job manager that's associated with this thread.
        manager = self.get_ior_job_manager_command(custom_ior_cmd=ior_command)

        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = ior_command.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)

        self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
        self.display_pool_space()
        try:
            ior_output = manager.run()
            results[job_num] = [True]
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        except CommandFailure as error:
            results[job_num] = [False, "IOR failed: {}".format(error)]
        finally:
            self.display_pool_space()
        self.log.info("--- IOR Thread %d: End ---", job_num)

    def run_ior_multiple_variants(self, obj_class, apis, transfer_block_size,
                                  flags, mount_dir):
        """Run multiple ior commands with various combinations of ior input params.

        Args:
            obj_class (list): List of different object classes.
            apis (list): list of different apis.
            transfer_block_size (list): list of different transfer sizes
                and block sizes. eg: [1M, 32M], where 1M is the transfer
                size and 32M is the block size.
            flags (list): list of ior flags.
            mount_dir (str): dfuse mount directory.

        Returns:
            list: a list of [status, command] pairs, where status is
                "PASS" or "FAIL".

        """
        results = []

        for oclass in obj_class:
            self.ior_cmd.dfs_oclass.update(oclass)
            for api in apis:
                if api == "HDF5-VOL":
                    self.ior_cmd.api.update("HDF5")
                    hdf5_plugin_path = self.params.get(
                        "plugin_path", '/run/hdf5_vol/*')
                    flags_w_k = " ".join([flags[0]] + ["-k"])
                    self.ior_cmd.flags.update(flags_w_k, "ior.flags")
                else:
                    # run tests for different variants
                    self.ior_cmd.flags.update(flags[0], "ior.flags")
                    hdf5_plugin_path = None
                    self.ior_cmd.api.update(api)
                for test in transfer_block_size:
                    # update transfer and block size
                    self.ior_cmd.transfer_size.update(test[0])
                    self.ior_cmd.block_size.update(test[1])
                    # run ior
                    try:
                        self.run_ior_with_pool(
                            plugin_path=hdf5_plugin_path,
                            timeout=self.ior_timeout, mount_dir=mount_dir)
                        results.append(["PASS", str(self.ior_cmd)])
                    except CommandFailure:
                        results.append(["FAIL", str(self.ior_cmd)])
        return results

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR.
            processes (int): number of processes.
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with the "
                "NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with the "
                "SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool free size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, command, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            command (str): the command to execute on the client hosts.
            fail_on_err (bool, optional): whether or not to fail the test if
                the command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display
                output. Defaults to True.

        Returns:
            dict: a dictionary of return code keys and accompanying NodeSet
                values indicating which hosts yielded each return code.

        """
        try:
            # Execute the bash command on each client host
            result = self._execute_command(
                command, fail_on_err, display_output)
        except CommandFailure as error:
            # Report an error if any command fails
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return result

    def _execute_command(self, command, fail_on_err=True, display_output=True,
                         hosts=None):
        """Execute the command on all client hosts.

        Optionally verify if the command returns a non zero return code.

        Args:
            command (str): the command to execute on the client hosts.
            fail_on_err (bool, optional): whether or not to fail the test if
                the command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display
                output. Defaults to True.
            hosts (list, optional): hosts on which to run the command.
                Defaults to self.hostlist_clients.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on
                at least one of the client hosts.

        Returns:
            dict: a dictionary of return code keys and accompanying NodeSet
                values indicating which hosts yielded each return code.

        """
        if hosts is None:
            hosts = self.hostlist_clients
        result = pcmd(hosts, command, verbose=display_output, timeout=300)
        if 0 not in result and fail_on_err:
            hosts = [
                str(nodes) for code, nodes in list(result.items())
                if code != 0
            ]
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(",".join(hosts))))
        return result
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for the run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If true, create the pool and
                container; otherwise just run ior. Defaults to True.
            create_cont (bool, optional): Create a new container.
                Default is True.
            stop_dfuse (bool, optional): Stop dfuse after the ior command is
                finished. Default is True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(
            self.get_ior_job_manager_command(), self.processes, intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool.

        Args:
            create_cont (bool, optional): create a container.
                Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager object

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status."""
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail(
                "Exiting Test: Inappropriate operation type for subprocess "
                "status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command.
            processes (int): number of host processes.
            intercept (str, optional): path to interception library.
            display_space (bool, optional): Whether to display the pool
                space. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)
        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop the in-progress IOR command."""
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", str(self.mpirun))
        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute two parallel ior jobs with and without the IL.

        Args:
            results (dict): A dictionary object to store the ior metrics.
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE.
                Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to the interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run them in parallel.
        # Job1 will have 3 clients set up to use dfuse + interception
        # library.
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(
            self.hostlist_clients[:-1], 1, results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2, results, None)

        job1.start()
        # Since the same ior_cmd is used to trigger the MPIRUN with
        # different parameters, pause for 2 seconds to avoid data
        # collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for an ior run.

        Args:
            clients (list): hosts on which to run ior.
            job_num (int): Assigned job number.
            results (dict): A dictionary object to store the ior metrics.
            intercept (str, optional): Path to interception library.

        Returns:
            Thread: the thread object; call start() to run it.

        """
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior.
            results (dict): A dictionary object to store the ior metrics.
            job_num (int): Assigned job number.
            intercept (str, optional): path to interception library.
                Defaults to None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(
            self.dfuse.mount_dir.value, "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR.
            processes (int): number of processes.
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with the "
                "NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with the "
                "SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool free size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            cmd (str): the command to execute on the client hosts.
            fail_on_err (bool, optional): whether to fail the test if the
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether to display output.
                Defaults to True.

        Returns:
            dict: a dictionary of return code keys and accompanying NodeSet
                values indicating which hosts yielded each return code.

        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output,
                timeout=300)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in ret.items()
                         if code != 0]))
                if fail_on_err:
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))
        # report an error if any command fails
        except CommandFailure as error:
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return ret
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        This test will verify 2000 IOR small size containers after a
        server restart. The test will write IOR in 5 different threads for
        faster execution time. Each thread will create 400 (8 byte)
        containers in the same pool. Restart the servers, read the IOR
        container files written previously, and validate data integrity by
        using the IOR options "-R -G 1".

    Use Cases:
        ?

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=server,metadata,metadata_ior,nvme
    """
    self.create_pool()
    files_per_thread = 400
    total_ior_threads = 5

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    list_of_uuid_lists = [
        [str(uuid.uuid4()) for _ in range(files_per_thread)]
        for _ in range(total_ior_threads)]

    # Setup the thread manager
    thread_manager = ThreadManager(run_ior_loop, self.timeout - 30)

    # Launch threads to run IOR to write data, restart the agents and
    # servers, and then run IOR to read the data
    for operation in ("write", "read"):
        # Create the IOR threads
        for index in range(total_ior_threads):
            # Define the arguments for the run_ior_loop method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get(
                "F", "/run/ior/ior{}flags/".format(operation))

            # Define the job manager for the IOR command
            self.ior_managers.append(Orterun(ior_cmd))
            env = ior_cmd.get_default_env(str(self.ior_managers[-1]))
            self.ior_managers[-1].assign_hosts(
                self.hostlist_clients, self.workdir, None)
            self.ior_managers[-1].assign_processes(processes)
            self.ior_managers[-1].assign_environment(env)
            self.ior_managers[-1].verbose = False

            # Add a thread for these IOR arguments
            thread_manager.add(
                manager=self.ior_managers[-1],
                uuids=list_of_uuid_lists[index],
                tmpdir_base=self.test_dir)
            self.log.info(
                "Created %s thread %s with container uuids %s",
                operation, index, list_of_uuid_lists[index])

        # Launch the IOR threads
        self.log.info(
            "Launching %d IOR %s threads", thread_manager.qty, operation)
        failed_thread_count = thread_manager.check_run()
        if failed_thread_count > 0:
            msg = "{} FAILED IOR {} Thread(s)".format(
                failed_thread_count, operation)
            self.d_log.error(msg)
            self.fail(msg)

        # Restart the agents and servers after the write / before the read
        if operation == "write":
            # Stop the agents
            errors = self.stop_agents()
            self.assertEqual(
                len(errors), 0,
                "Error stopping agents:\n  {}".format("\n  ".join(errors)))

            # Restart the servers w/o formatting the storage
            errors = self.restart_servers()
            self.assertEqual(
                len(errors), 0,
                "Error stopping servers:\n  {}".format("\n  ".join(errors)))

            # Start the agents
            self.start_agent_managers()

    self.log.info("Test passed")
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved, run IOR for POSIX
        # with a single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a container of POSIX type.

        Returns:
            str: the UUID of the created container

        """
        # TO-DO: Enable container creation using the TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        # self.container = TestContainer(self.pool)
        # self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create a container of POSIX type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)
        try:
            container = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True)
            (output, err) = container.communicate()
            self.log.info("Container created with UUID %s", output.split()[3])
        except subprocess.CalledProcessError as err:
            self.fail("Container create failed: {}".format(err))

        return output.split()[3]

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment the two lines below once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            if self.ior_cmd.transfer_size.value == "256B":
                self.cancelForTicket("DAOS-3449")
            self.start_dfuse()
            self.ior_cmd.test_file.update(
                self.dfuse.mount_dir.value + "/testfile")

        out = self.run_ior(
            self.get_job_manager_command(), self.processes, intercept)
        return out

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager object

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DAOS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command.
            processes (int): number of host processes.
            intercept (str, optional): path to interception library.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(
            str(manager), self.tmp, self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR.
            processes (int): number of processes.
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with the "
                "NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with the "
                "SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool free size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with a single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

        # The lock is needed for the run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # Create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile"):
        """Execute ior with the parameters read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
        Returns:
            CmdResult: result of the ior command execution

        """
        self.update_ior_cmd_with_pool()

        # Start dfuse if the api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # DAOS-3449: skip the case for a 256B transfer size
            if self.ior_cmd.transfer_size.value == "256B":
                return "Skipping the case for transfer_size=256B"
            self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self):
        """Update ior_cmd with the pool and container parameters."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Always create a container; creating it through the API (rather
        # than passing a raw uuid and pool handle to IOR) is required to
        # enable the checksum feature.
        self.pool.connect()
        self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager command object

        """
        # Initialize MpioUtils if IOR is running in MPIIO, DAOS, POSIX or
        # DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        return Mpirun(self.ior_cmd, mpitype="mpich")

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to the interception library.
                Defaults to None.
        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute two IOR jobs in parallel against the same pool.

        Args:
            results (dict): dictionary to store the IOR metrics per job
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # Start dfuse for the POSIX api. This is specific to the
        # interception library test requirements.
        self._start_dfuse()

        # Create two jobs and run them in parallel.
        # Job1 will have three clients set up to use dfuse + the
        # interception library.
        # Job2 will have one client set up to use only dfuse.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1,
                                results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2,
                                results, None)

        job1.start()
        # Since the same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pause for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for an ior run.

        Args:
            clients (list): client hostnames to run ior against
            job_num (int): assigned job number
            results (dict): dictionary to store the ior metrics
            intercept (str, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: thread that runs ior through run_multiple_ior

        """
        hostfile = write_host_file.write_host_file(
            clients, self.workdir, self.hostfile_clients_slots)
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[hostfile, len(clients), results, job_num, intercept])
        return job

    def run_multiple_ior(self, hostfile, num_clients, results, job_num,
                         intercept=None):
        # pylint: disable=too-many-arguments
        """Run the IOR command from one thread of a multi-job run.

        Args:
            hostfile (str): path to the hostfile listing the client hosts
            num_clients (int): number of client hosts in the hostfile
            results (dict): dictionary to store the ior metrics
            job_num (int): assigned job number
            intercept (str, optional): path to the interception library.
                Defaults to None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * num_clients
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, hostfile, procs)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K the pool size is verified against
        # NVMe free space; otherwise it is checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
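# A hypothetical test sketching how the parallel helpers above might be
# driven; the class name, the interception-library path and the log call are
# assumptions, not part of the framework. Job numbers in the results dict
# follow the job_num convention used by run_multiple_ior().
class IorInterceptMultiClient(IorTestBase):
    """Run IOR with and without the interception library in parallel.

    :avocado: recursive
    """

    def test_ior_intercept_multi_client(self):
        """Sketch: job 1 uses dfuse + IL, job 2 uses dfuse only."""
        results = {}
        intercept = os.path.join(self.prefix, "lib64", "libioil.so")
        self.run_multiple_ior_with_pool(results, intercept)
        for job_num in sorted(results):
            self.log.info("Job %d metrics: %s", job_num, results[job_num])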
    def test_metadata_server_restart(self):
        """JIRA ID: DAOS-1512.

        Test Description:
            This test will verify 2000 IOR small size container after server
            restart. Test will write IOR in 5 different threads for faster
            execution time. Each thread will create 400 (8 byte) containers
            to the same pool. Restart the servers, read the IOR container
            files written previously and validate data integrity by using
            the IOR options "-R -G 1".

        Use Cases:
            ?

        :avocado: tags=metadata,metadata_ior,nvme,small
        """
        files_per_thread = 400
        total_ior_threads = 5
        self.out_queue = Queue.Queue()

        processes = self.params.get("slots", "/run/ior/clientslots/*")

        list_of_uuid_lists = [
            [str(uuid.uuid4()) for _ in range(files_per_thread)]
            for _ in range(total_ior_threads)]

        # Launch threads to run IOR to write data, restart the agents and
        # servers, and then run IOR to read the data
        for operation in ("write", "read"):
            # Create the IOR threads
            threads = []
            for index in range(total_ior_threads):
                # Define the arguments for the ior_runner_thread method
                ior_cmd = IorCommand()
                ior_cmd.get_params(self)
                ior_cmd.set_daos_params(self.server_group, self.pool)
                ior_cmd.flags.value = self.params.get(
                    "F", "/run/ior/ior{}flags/".format(operation))

                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(
                        target=ior_runner_thread,
                        kwargs={
                            "ior_cmd": ior_cmd,
                            "uuids": list_of_uuid_lists[index],
                            "mgr": self.orterun,
                            "attach": self.tmp,
                            "hostfile": self.hostfile_clients,
                            "procs": processes,
                            "results": self.out_queue}))

                self.log.info("Created %s thread %s with container uuids %s",
                              operation, index, list_of_uuid_lists[index])

            # Launch the IOR threads
            if self.thread_control(threads, operation) == "FAIL":
                self.d_log.error("IOR {} Thread FAIL".format(operation))
                self.fail("IOR {} Thread FAIL".format(operation))

            # Restart the agents and servers after the write / before the read
            if operation == "write":
                # Stop the agents and servers
                if self.agent_sessions:
                    stop_agent(self.agent_sessions, self.hostlist_clients)
                stop_server(hosts=self.hostlist_servers)

                # Start the agents
                self.agent_sessions = run_agent(
                    self.basepath, self.hostlist_clients,
                    self.hostlist_servers)

                # Start the servers
                run_server(self.hostfile_servers, self.server_group,
                           self.basepath, clean=False)
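# For reference, a write/read flag pairing consistent with the "-R -G 1"
# verification described in the docstring above. These values are
# illustrative; the actual flags come from the per-operation
# "/run/ior/ior{operation}flags/" yaml entries queried in the loop.
ior_write_flags = "-v -w -W -G 1 -k"  # write with data checking, keep files
ior_read_flags = "-v -r -R -G 1"      # read back and verify the contents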
    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2. Add 1 container to the first pool
            configured with 3 replicas. Populate the container with 1GB of
            objects. Exclude a server that has shards of this object and
            verify that rebuild is initiated. While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available). Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records. Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with
            sufficient numbers of rebuild targets and no available rebuild
            targets.

        :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(TestPool(self.context, self.log))
            self.pool[-1].get_params(self)

        # Get ior params
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)

        # Cancel any tests with tickets already assigned
        if rank in (1, 2):
            self.cancelForTicket("DAOS-2434")

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append({
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                })
                rebuild_checks.append({
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                })

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank "
                "{}".format(rank))

            # Create a container with 1GB of data in the first pool
            ior_cmd.flags.update("-v -w -W -G 1 -k", "ior.flags")
            ior_cmd.daos_destroy.update(False, "ior.daos_destroy")
            ior_cmd.set_daos_params(self.server_group, self.pool[0])
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, ior_cmd.daos_cont.value)
            self.run_ior(loop_id, ior_cmd)

            # Exclude the first rank from the first pool to initiate rebuild
            self.pool[0].start_rebuild(self.server_group, rank, self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            new_containers = self.add_containers_during_rebuild(
                loop_id, cont_qty, self.pool[0], self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")
rebuild") # Verify that each of created containers exist by openning them for index, container in enumerate(new_containers): count = "{}/{}".format(index + 1, len(new_containers)) if not self.access_container(loop_id, container, count): errors[loop] += 1 # Destroy the containers created during rebuild for index, container in enumerate(new_containers): container.destroy() # Read the data from the container created before rebuild self.log.info("%s: Running IOR on pool %s to verify container %s", loop_id, self.pool[0].uuid, ior_cmd.daos_cont.value) ior_cmd.flags.update("-v -r -R -G 1 -E", "ior.flags") ior_cmd.daos_destroy.update(True, "ior.daos_destroy") self.run_ior(loop_id, ior_cmd) # Destroy the pools for pool in self.pool: pool.destroy(1) self.log.info("%s: Loop %s", loop_id, "passed" if errors[loop] == 0 else "failed") self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def run_ior_with_pool(self):
        """Execute ior with the parameters read from the yaml file."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # Run IOR
        self.run_ior(self.get_job_manager_command(), self.processes)

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager command object

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DAOS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS"]:
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command
            processes (int): number of host processes
        """
        env = self.ior_cmd.get_default_env(
            str(manager), self.tmp, self.client_log)
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            manager.run()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K the pool size is verified against
        # NVMe free space; otherwise it is checked against SCM free space.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
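# A hypothetical minimal test built on the base class above; the class name
# and tags are illustrative. run_ior_with_pool() creates the pool on first
# use and then launches IOR through the MPI job manager.
class IorSmoke(IorTestBase):
    """Run a single IOR pass with the yaml-provided parameters.

    :avocado: recursive
    """

    def test_ior_smoke(self):
        """Sketch: one IOR pass driven entirely by the yaml params.

        :avocado: tags=ior,iorsmoke
        """
        self.run_ior_with_pool()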
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch_cmds, log_name] pairs

    """
    commands = []
    ior_params = os.path.join(os.sep, "run", job_spec, "*")
    ior_timeout = self.params.get("job_timeout", ior_params, 10)
    mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params)
    tsize_list = self.params.get("transfer_size", ior_params)
    bsize_list = self.params.get("block_size", ior_params)
    oclass_list = self.params.get("dfs_oclass", ior_params)
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # Update the IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    # Cancel for ticket DAOS-6095
                    if (api in ["HDF5-VOL", "HDF5", "POSIX"]
                            and t_size == "4k"
                            and o_type in ["RP_2G1", 'RP_2GX']):
                        self.add_cancel_ticket(
                            "DAOS-6095",
                            "IOR -a {} with -t {} and -o {}".format(
                                api, t_size, o_type))
                        continue
                    # Cancel for ticket DAOS-6308
                    if api == "MPIIO" and o_type == "RP_2GX":
                        self.add_cancel_ticket(
                            "DAOS-6308",
                            "IOR -a {} with -o {}".format(api, o_type))
                        continue
                    if api in ["HDF5-VOL", "HDF5", "POSIX"] and ppn > 16:
                        continue
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    ior_cmd.max_duration.update(ior_timeout)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    if api in ["HDF5-VOL", "POSIX"]:
                        ior_cmd.dfs_oclass.update(None)
                        ior_cmd.dfs_dir_oclass.update(None)
                    else:
                        ior_cmd.dfs_oclass.update(o_type)
                        ior_cmd.dfs_dir_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    add_containers(self, pool, o_type)
                    ior_cmd.set_daos_params(
                        self.server_group, pool, self.container[-1].uuid)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # Include the dfuse cmdlines
                    log_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(
                        job_spec, api, b_size, t_size, o_type,
                        nodesperjob * ppn, nodesperjob, ppn)
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, self.container[-1], nodesperjob,
                            "SLURM", name=log_name, job_spec=job_spec)
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # Add the envs if the api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
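# A hypothetical call to the cmdline builder above from a soak test context;
# the job spec name and node/task counts are illustrative. Each returned
# entry pairs a list of sbatch script lines with a log-name stem.
for sbatch_lines, log_name in create_ior_cmdline(
        self, "ior_smoke", self.pool, ppn=16, nodesperjob=2):
    self.log.info("Generated %s:", log_name)
    for line in sbatch_lines:
        self.log.info("  %s", line)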
class ServerFillUp(IorTestBase):
    # pylint: disable=too-many-ancestors,too-many-instance-attributes
    """Class to fill up the servers based on the pool percentage given.

    It will get the drives listed in the yaml file and find the maximum
    capacity of the pool to be created. The IOR block size is calculated
    in calculate_ior_block_size() based on the percentage of the pool to
    fill up.
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ServerFillUp object."""
        super().__init__(*args, **kwargs)
        self.capacity = 1
        self.no_of_servers = 1
        self.no_of_drives = 1
        self.pool = None
        self.dmg = None
        self.set_faulty_device = False
        self.set_online_rebuild = False
        self.scm_fill = False
        self.nvme_fill = False
        self.ior_matrix = None
        self.ior_local_cmd = None
        self.result = []
        self.fail_on_warning = False
        self.rank_to_kill = []
        self.pool_exclude = {}
        self.nvme_local_cont = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()
        self.hostfile_clients = None
        self.ior_local_cmd = IorCommand()
        self.ior_local_cmd.get_params(self)
        self.ior_default_flags = self.ior_local_cmd.flags.value
        self.ior_scm_xfersize = self.params.get(
            "transfer_size", '/run/ior/transfersize_blocksize/*', '2048')
        self.ior_read_flags = self.params.get(
            "read_flags", '/run/ior/iorflags/*', '-r -R -k -G 1')
        self.ior_nvme_xfersize = self.params.get(
            "nvme_transfer_size", '/run/ior/transfersize_blocksize/*',
            '16777216')
        # Get the number of daos_engine
        self.engines = self.server_managers[0].manager.job.yaml.engine_params
        self.dmg_command = self.get_dmg_command()

    def create_container(self):
        """Create the container."""
        self.nvme_local_cont = self.get_container(self.pool, create=False)

        # Update the container oclass
        if self.ior_local_cmd.dfs_oclass:
            self.nvme_local_cont.oclass.update(
                self.ior_local_cmd.dfs_oclass.value)
        self.nvme_local_cont.create()

    def start_ior_thread(self, create_cont, operation):
        """Start IOR write/read threads and wait until all threads finish.

        Args:
            create_cont (bool): whether to create a new container.
            operation (str):
                Write/WriteRead: write, or write and read, based on the IOR
                    parameters in the yaml file.
                Auto_Write/Auto_Read: calculate the IOR block size based on
                    the requested storage percentage to fill.
        """
        # The IOR flags can be Write/Read based on the test yaml
        self.ior_local_cmd.flags.value = self.ior_default_flags

        # Calculate the block size based on the server percentage to fill up.
        if 'Auto' in operation:
            block_size = self.calculate_ior_block_size()
            self.ior_local_cmd.block_size.update('{}'.format(block_size))

        # For an IOR read operation, update the read-only flags from the
        # yaml file.
        if 'Auto_Read' in operation or operation == "Read":
            create_cont = False
            self.ior_local_cmd.flags.value = self.ior_read_flags

        self.ior_local_cmd.set_daos_params(self.server_group, self.pool)
        self.ior_local_cmd.test_file.update('/testfile')

        # Create a new container, or use the existing container for reading
        if create_cont:
            self.create_container()
        self.ior_local_cmd.dfs_cont.update(self.nvme_local_cont.uuid)

        # Define the job manager for the IOR command
        job_manager_main = get_job_manager(
            self, "Mpirun", self.ior_local_cmd, mpi_type="mpich")
        env = self.ior_local_cmd.get_default_env(str(job_manager_main))
        job_manager_main.assign_hosts(
            self.hostlist_clients, self.workdir, None)
        job_manager_main.assign_environment(env, True)
        job_manager_main.assign_processes(
            self.params.get("np", '/run/ior/client_processes/*'))

        # Run the IOR command
        try:
            output = job_manager_main.run()
            self.ior_matrix = IorCommand.get_ior_metrics(output)

            for line in output.stdout_text.splitlines():
                if 'WARNING' in line and self.fail_on_warning:
                    self.result.append("FAIL-IOR command issued warnings.")
        except (CommandFailure, TestFail) as error:
            self.result.append("FAIL - {}".format(error))

    def calculate_ior_block_size(self):
        """Calculate the IOR block size needed to fill up the server.

        Returns:
            block_size (int): IOR block size

        """
        if self.scm_fill:
            free_space = self.pool.get_pool_daos_space()["s_total"][0]
            self.ior_local_cmd.transfer_size.value = self.ior_scm_xfersize
        elif self.nvme_fill:
            free_space = self.pool.get_pool_daos_space()["s_total"][1]
            self.ior_local_cmd.transfer_size.value = self.ior_nvme_xfersize
        else:
            self.fail('Provide storage type (SCM/NVMe) to be filled')

        # Get the block size based on the capacity to be filled. For example,
        # if nvme_free_space is 100G and 50% of the capacity is to be filled:
        # (107374182400 / 100) * 50 gives 50% (50G) of space to be filled.
        _tmp_block_size = (free_space / 100) * self.capacity

        # Check the IOR object type to calculate the correct block size.
        _replica = re.findall(r'_(.+?)G', self.ior_local_cmd.dfs_oclass.value)

        # For object classes with no replica or EC suffix,
        # _tmp_block_size is unchanged.
        if not _replica:
            pass

        # If it is an EC object, calculate the tmp block size based on the
        # number of data + parity targets, then calculate the write data size
        # for the total number of data targets.
        # For example: to fill 10% of a 100G pool with EC_4P1GX, the data
        # target fill size is 8G, which writes 8G of data and 2G of parity,
        # so 10G in total (10% of the 100G pool size).
        elif 'P' in _replica[0]:
            replica_server = re.findall(r'\d+', _replica[0])[0]
            parity_count = re.findall(r'\d+', _replica[0])[1]
            _tmp_block_size = int(
                _tmp_block_size / (int(replica_server) + int(parity_count)))
            _tmp_block_size = int(_tmp_block_size) * int(replica_server)

        # This is a replica-type object class
        else:
            _tmp_block_size = int(_tmp_block_size / int(_replica[0]))

        # Finally, divide the total size by the number of IOR processes
        _tmp_block_size = int(_tmp_block_size) / self.processes

        # Round the final IOR block size down to a multiple of the transfer
        # size.
        block_size = (
            int(_tmp_block_size / int(self.ior_local_cmd.transfer_size.value))
            * int(self.ior_local_cmd.transfer_size.value))

        return block_size

    def set_device_faulty(self, server, disk_id):
        """Set a device to faulty and wait for rebuild to complete.

        Args:
            server (str): server hostname on which to generate the NVMe
                fault.
            disk_id (str): NVMe disk ID of the device to set faulty.
""" self.dmg.hostlist = server self.dmg.storage_set_faulty(disk_id) result = self.dmg.storage_query_device_health(disk_id) # Check if device state changed to EVICTED. if 'State:EVICTED' not in result.stdout_text: self.fail("device State {} on host {} suppose to be EVICTED".format(disk_id, server)) # Wait for rebuild to start self.pool.wait_for_rebuild(True) # Wait for rebuild to complete self.pool.wait_for_rebuild(False) def set_device_faulty_loop(self): """Set devices to Faulty one by one and wait for rebuild to complete.""" # Get the device ids from all servers and try to eject the disks device_ids = get_device_ids(self.dmg, self.hostlist_servers) # no_of_servers and no_of_drives can be set from test yaml. 1 Server, 1 Drive = Remove # single drive from single server for num in range(0, self.no_of_servers): server = self.hostlist_servers[num] for disk_id in range(0, self.no_of_drives): self.set_device_faulty(server, device_ids[server][disk_id]) def get_max_storage_sizes(self): """Get the maximum pool sizes for the current server configuration. Returns: list: a list of the maximum SCM and NVMe size """ try: sizes_dict = self.server_managers[0].get_available_storage() sizes = [sizes_dict["scm"], sizes_dict["nvme"]] except (ServerFailed, KeyError) as error: self.fail(error) # Return the 96% of storage space as it won't be used 100% for pool creation. for index, _size in enumerate(sizes): sizes[index] = int(sizes[index] * 0.96) return sizes def create_pool_max_size(self, scm=False, nvme=False): """Create a single pool with Maximum NVMe/SCM size available. Args: scm (bool): To create the pool with max SCM size or not. nvme (bool): To create the pool with max NVMe size or not. Note: Method to Fill up the server. It will get the maximum Storage space and create the pool. Replace with dmg options in future when it's available. """ # Create a pool self.add_pool(create=False) if nvme or scm: sizes = self.get_max_storage_sizes() # If NVMe is True get the max NVMe size from servers if nvme: self.pool.nvme_size.update('{}'.format(sizes[1])) # If SCM is True get the max SCM size from servers if scm: self.pool.scm_size.update('{}'.format(sizes[0])) # Create the Pool self.pool.create() def kill_rank_thread(self, rank): """ Server rank kill thread function Args: rank: Rank number to kill the daos server """ self.server_managers[0].stop_ranks([rank], self.d_log, force=True) def exclude_target_thread(self, rank, target): """ Target kill thread function Args: rank(int): Rank number to kill the target from target(str): target number or range of targets to kill """ self.dmg_command.pool_exclude(self.pool.uuid, rank, str(target)) def start_ior_load(self, storage='NVMe', operation="WriteRead", percent=1, create_cont=True): """Fill up the server either SCM or NVMe. Fill up based on percent amount given using IOR. Args: storage (string): SCM or NVMe, by default it will fill NVMe. 
            operation (str): Write/Read operation
            percent (int): percentage of the storage to be filled
            create_cont (bool): whether to create a new container for IOR
        """
        kill_rank_job = []
        kill_target_job = []
        self.result.clear()
        self.capacity = percent
        # Fill up NVMe by default
        self.nvme_fill = 'NVMe' in storage
        self.scm_fill = 'SCM' in storage

        # Create the IOR thread
        job = threading.Thread(target=self.start_ior_thread,
                               kwargs={"create_cont": create_cont,
                                       "operation": operation})
        # Launch the IOR thread
        job.start()

        # Set an NVMe device faulty if requested
        if self.set_faulty_device:
            time.sleep(60)
            # Set the device faulty
            self.set_device_faulty_loop()

        # Kill the server rank while IOR is in progress
        if self.set_online_rebuild:
            time.sleep(30)
            # Kill the server ranks in BG threads
            for _id, _rank in enumerate(self.rank_to_kill):
                kill_rank_job.append(
                    threading.Thread(target=self.kill_rank_thread,
                                     kwargs={"rank": _rank}))
                kill_rank_job[_id].start()

            # Exclude the targets from the ranks in BG threads
            for _id, (key, value) in enumerate(self.pool_exclude.items()):
                kill_target_job.append(
                    threading.Thread(target=self.exclude_target_thread,
                                     kwargs={"rank": key, "target": value}))
                kill_target_job[_id].start()

        # Wait for the rank kill threads to finish
        for _kill_rank in kill_rank_job:
            _kill_rank.join()

        # Wait for the target exclude threads to finish
        for _kill_tgt in kill_target_job:
            _kill_tgt.join()

        # Wait for the IOR thread to finish
        job.join()

        # Verify whether any IOR run failed
        for test_result in self.result:
            if "FAIL" in test_result:
                self.fail(test_result)
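# Worked example (illustrative numbers, not framework code) of the EC branch
# of calculate_ior_block_size() above, using the EC_4P1GX object class
# (4 data + 1 parity targets), a 100 GiB pool, a 10% fill target, 16 IOR
# processes and a 1 MiB transfer size.
free_space = 100 * 2**30
capacity = 10                            # percent of the pool to fill
data_targets, parity_targets = 4, 1      # parsed from "EC_4P1GX"
tmp = (free_space / 100) * capacity      # 10 GiB in total to consume
tmp = int(tmp / (data_targets + parity_targets)) * data_targets  # 8 GiB data
processes = 16
transfer_size = 2**20
tmp = int(tmp) / processes               # per-process share: 512 MiB
block_size = int(tmp / transfer_size) * transfer_size
assert block_size == 512 * 2**20         # rounded to a transfer-size multiple

# And a hypothetical sketch of driving the fill-up class itself: fill 30% of
# NVMe while killing rank 2 mid-write to exercise online rebuild. The class
# name and rank number are assumptions.
class NvmeOnlineRebuildFill(ServerFillUp):
    """Fill NVMe to 30% while a rank is killed in the background.

    :avocado: recursive
    """

    def test_nvme_online_rebuild_fill(self):
        """Sketch: create a max-size NVMe pool, then write with a rank kill."""
        self.create_pool_max_size(nvme=True)
        self.set_online_rebuild = True
        self.rank_to_kill = [2]
        self.start_ior_load(storage='NVMe', operation='Auto_Write',
                            percent=30)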
    def test_metadata_server_restart(self):
        """JIRA ID: DAOS-1512.

        Test Description:
            This test will verify 2000 IOR small size container after server
            restart. Test will write IOR in 5 different threads for faster
            execution time. Each thread will create 400 (8 byte) containers
            to the same pool. Restart the servers, read the IOR container
            files written previously and validate data integrity by using
            the IOR options "-R -G 1".

        Use Cases:
            ?

        :avocado: tags=metadata,metadata_ior,nvme,large
        """
        files_per_thread = 400
        total_ior_threads = 5
        self.out_queue = queue.Queue()

        processes = self.params.get("slots", "/run/ior/clientslots/*")

        list_of_uuid_lists = [
            [str(uuid.uuid4()) for _ in range(files_per_thread)]
            for _ in range(total_ior_threads)]

        # Launch threads to run IOR to write data, restart the agents and
        # servers, and then run IOR to read the data
        for operation in ("write", "read"):
            # Create the IOR threads
            threads = []
            for index in range(total_ior_threads):
                # Define the arguments for the ior_runner_thread method
                ior_cmd = IorCommand()
                ior_cmd.get_params(self)
                ior_cmd.set_daos_params(self.server_group, self.pool)
                ior_cmd.flags.value = self.params.get(
                    "F", "/run/ior/ior{}flags/".format(operation))

                # Define the job manager for the IOR command
                manager = Orterun(ior_cmd)
                env = ior_cmd.get_default_env(str(manager))
                manager.assign_hosts(
                    self.hostlist_clients, self.workdir, None)
                manager.assign_processes(processes)
                manager.assign_environment(env)

                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(
                        target=ior_runner_thread,
                        kwargs={
                            "manager": manager,
                            "uuids": list_of_uuid_lists[index],
                            "results": self.out_queue}))

                self.log.info(
                    "Created %s thread %s with container uuids %s",
                    operation, index, list_of_uuid_lists[index])

            # Launch the IOR threads
            if self.thread_control(threads, operation) == "FAIL":
                self.d_log.error("IOR {} Thread FAIL".format(operation))
                self.fail("IOR {} Thread FAIL".format(operation))

            # Restart the agents and servers after the write / before the read
            if operation == "write":
                # Stop the agents
                errors = self.stop_agents()
                self.assertEqual(
                    len(errors), 0,
                    "Error stopping agents:\n {}".format("\n ".join(errors)))

                # Stop the servers
                errors = self.stop_servers()
                self.assertEqual(
                    len(errors), 0,
                    "Error stopping servers:\n {}".format("\n ".join(errors)))

                # Start the agents
                self.start_agent_managers()

                # Start the servers
                self.start_server_managers()
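# A minimal sketch (not the framework's thread_control implementation) of
# how the out_queue populated by the IOR threads might be drained to decide
# pass/fail once an operation completes.
def _drain_results(out_queue):
    """Return "FAIL" if any queued thread result contains a failure marker."""
    failed = False
    while not out_queue.empty():
        if "FAIL" in str(out_queue.get()):
            failed = True
    return "FAIL" if failed else "PASS"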
    def ior_runner_thread(self, results):
        """Run IOR in sequence over the different parameter combinations.

        Destroy the containers at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR parameter combinations and run
        # each in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            container_info["{}{}{}".format(oclass, api, test[0])] = \
                str(uuid.uuid4())

            # Define the job manager for the IOR command
            manager = Mpirun(ior_cmd, mpitype="mpich")
            manager.job.dfs_cont.update(
                container_info["{}{}{}".format(oclass, api, test[0])])
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env, True)

            # Run the IOR command
            try:
                manager.run()
            except CommandFailure:
                results.put("FAIL")

        # Destroy the containers created by the thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = \
                self.pool.uuid
            cmd.sub_command_class.sub_command_class.svc.value = \
                self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]
            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure:
                results.put("FAIL")
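# If the DaosCommand wrapper in use exposes a container_destroy() helper
# (an assumption to verify against the local daos_utils module), the cleanup
# loop above could avoid setting sub-command values by hand and calling the
# protected _get_result(). A sketch:
#
#     daos_cmd = DaosCommand(os.path.join(self.prefix, "bin"))
#     for cont_uuid in container_info.values():
#         try:
#             daos_cmd.container_destroy(
#                 pool=self.pool.uuid, svc=self.pool.svc_ranks,
#                 cont=cont_uuid)
#         except CommandFailure:
#             results.put("FAIL")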