def start_dfuse(self, hosts, pool, container):
    """Launch Dfuse on the given hosts for a specific pool and container.

    A new Dfuse object is stored in self.dfuse, loaded with the test
    parameters, configured with the pool and container and then run.  A
    CommandFailure raised by the run is logged and turned into a test
    failure.

    Args:
        hosts (list): hosts on which Dfuse is started
        pool (TestPool): pool passed to set_dfuse_params()
        container (TestContainer): container passed to
            set_dfuse_cont_param()
    """
    self.dfuse = dfuse = Dfuse(hosts, self.tmp)
    dfuse.get_params(self)

    # Apply the pool, container and export settings to the new command
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(container)
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), str(NodeSet.fromlist(dfuse.hosts)), exc_info=error)
        self.fail("Test was expected to pass but it failed.")
def start_dfuse(self, hosts, pool=None, container=None, mount_dir=None):
    """Launch Dfuse on the given hosts with optional overrides.

    A new Dfuse object is stored in self.dfuse and loaded with the test
    parameters.  Each optional argument is only applied when it is truthy.
    A CommandFailure raised by the run is logged and turned into a test
    failure.

    Args:
        hosts (list): hosts on which Dfuse is started
        pool (TestPool, optional): pool passed to set_dfuse_params().
            Defaults to None.
        container (TestContainer, optional): container passed to
            set_dfuse_cont_param(). Defaults to None.
        mount_dir (str, optional): replacement mount directory.
            Defaults to None.
    """
    self.dfuse = dfuse = Dfuse(hosts, self.tmp)
    dfuse.get_params(self)

    # Only override what the caller actually supplied
    if mount_dir:
        dfuse.mount_dir.update(mount_dir)
    if pool:
        dfuse.set_dfuse_params(pool)
    if container:
        dfuse.set_dfuse_cont_param(container)
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), str(NodeSet.fromlist(dfuse.hosts)), exc_info=error)
        self.fail("Test was expected to pass but it failed.")
def start_dfuse(self, pool):
    """Start Dfuse on the client hosts through srun.

    A new Dfuse object is stored in self.dfuse and configured with the pool
    and a container obtained from create_dfuse_cont().  The mount point is
    created with one srun call and the Dfuse command is launched with a
    second one.

    Args:
        pool (obj): TestPool obj

    Raises:
        SoakTestError: if either srun call reports a positive exit status

    """
    self.dfuse = dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # Configure the command for the pool and a freshly created container
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    # Step 1: create the dfuse mount point on the clients
    mkdir_cmd = "mkdir -p {}".format(dfuse.mount_dir.value)
    # NOTE: this is the same dict object as self.srun_params, so the two
    # entries set below are also visible through self.srun_params.
    params = self.srun_params
    params["export"] = "all"
    params["ntasks-per-node"] = 1
    mkdir_result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), mkdir_cmd, params)
    if mkdir_result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Dfuse mountpoint {} not created>>".format(
                dfuse.mount_dir.value))

    # Step 2: launch dfuse itself
    start_result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), str(dfuse), params)
    if start_result.exit_status > 0:
        raise SoakTestError("<<FAILED: Dfuse failed to start>>")
def _start_dfuse(self, pool, container):
    """Launch Dfuse on self.dfuse_hosts for the given pool and container.

    The new Dfuse object is stored in self.dfuse.  A CommandFailure raised
    by the run is logged and turned into a test failure.

    Args:
        pool: pool passed to set_dfuse_params()
        container: container passed to set_dfuse_cont_param()
    """
    self.dfuse = dfuse = Dfuse(self.dfuse_hosts, self.tmp)
    dfuse.get_params(self)

    # Apply the pool, container and export settings
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(container)
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), dfuse.hosts, exc_info=error)
        self.fail("Test was expected to pass but it failed.\n")
def start_dfuse(self, pool=None):
    """Launch Dfuse on the client hosts, optionally bound to a pool.

    The new Dfuse object is stored in self.dfuse.  A CommandFailure raised
    by the run is logged and turned into a test failure.

    Args:
        pool (TestPool, optional): pool passed to set_dfuse_params() when
            it is truthy. Defaults to None.
    """
    self.dfuse = dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # The pool is only applied when one was supplied
    if pool:
        dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), dfuse.hosts, exc_info=error)
        self.fail("Test was expected to pass but it failed.\n")
def start_dfuse(self, count):
    """Launch Dfuse for self.pool/self.container on an indexed mount point.

    The mount directory is built from "/tmp/", the pool UUID, the
    "_daos_dfuse" suffix and the given index.  The new Dfuse object is
    stored in self.dfuse.  A CommandFailure raised by the run is logged and
    turned into a test failure.

    Args:
        count (int): container index appended to the mount directory name
    """
    self.dfuse = dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # Give every index its own mount point
    mount_point = "/tmp/" + self.pool.uuid + "_daos_dfuse" + str(count)
    dfuse.mount_dir.update(mount_point)

    dfuse.set_dfuse_params(self.pool)
    dfuse.set_dfuse_cont_param(self.container)
    dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), dfuse.hosts, exc_info=error)
        self.fail("Test was expected to pass but it failed.\n")
def start_dfuse(self, pool):
    """Build the commands that start Dfuse from a slurm job script.

    Nothing is executed here: a Dfuse object is configured with a unique
    mount directory, the pool and a container from get_container(), and the
    matching shell commands are returned.

    Args:
        pool (obj): TestPool obj

    Returns:
        tuple: (dfuse, cmds) where dfuse is the configured Dfuse object and
            cmds is the list of command strings to add to the job script

    """
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # Append a not-yet-used random suffix so each mount point is distinct
    suffix = get_random_string(5, self.used)
    self.used.append(suffix)
    dfuse.mount_dir.update(dfuse.mount_dir.value + suffix)

    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.get_container(pool))

    mount_point = dfuse.mount_dir.value
    start_cmds = [
        "mkdir -p {}".format(mount_point),
        "{}".format(str(dfuse)),
        "df -h {}".format(mount_point),
    ]
    return dfuse, start_cmds
def start_dfuse(self, pool):
    """Build the srun commands that start Dfuse from a slurm job script.

    Nothing is executed here: a Dfuse object is configured with a unique
    mount directory, the pool and a container from create_dfuse_cont(), and
    the matching command strings are returned.

    Args:
        pool (obj): TestPool obj

    Returns:
        tuple: (dfuse, cmds) where dfuse is the configured Dfuse object and
            cmds is the list of command strings to add to the job script

    """
    def as_srun(command):
        """Format a command through slurm_utils.srun_str()."""
        return slurm_utils.srun_str(
            hosts=None, cmd=command, srun_params=None)

    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # Append a not-yet-used random suffix so each mount point is distinct
    suffix = get_random_string(5, self.used)
    self.used.append(suffix)
    dfuse.mount_dir.update(dfuse.mount_dir.value + suffix)

    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))

    # mkdir, start dfuse, wait, then list the mount with df
    commands = [
        as_srun("mkdir -p {}".format(dfuse.mount_dir.value)),
        as_srun("{}".format(str(dfuse))),
        "sleep 10",
        as_srun("df -h {}".format(dfuse.mount_dir.value)),
    ]
    return dfuse, commands
def start_dfuse(self, pool, container, nodesperjob, resource_mgr=None,
                name=None, job_spec=None):
    """Build the commands that start Dfuse from a job script.

    Nothing is executed here: a Dfuse object is configured from the
    job-specific namespace with a unique mount directory, and the matching
    command strings are returned.

    Args:
        self (obj): soak obj
        pool (obj): TestPool obj
        container: container passed to set_dfuse_cont_param()
        nodesperjob: value forwarded to get_srun_cmd()
        resource_mgr (str, optional): when equal to "SLURM" the commands
            that do not start with "clush" or "sleep" are wrapped with
            get_srun_cmd(). Defaults to None.
        name (str, optional): job name used in the dfuse log file name.
            Defaults to None.
        job_spec (str, optional): job spec used to build the parameter
            namespace. Defaults to None.

    Returns:
        tuple: (dfuse, cmds) where dfuse is the configured Dfuse object and
            cmds is the list of command strings to add to the job script

    """
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    # The namespace has to be in place before the params are read
    dfuse.namespace = os.path.join(os.sep, "run", job_spec, "dfuse", "*")
    dfuse.get_params(self)

    # Append a not-yet-used random suffix so each mount point is distinct
    suffix = get_random_string(5, self.used)
    self.used.append(suffix)
    dfuse.mount_dir.update(dfuse.mount_dir.value + suffix)

    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(container)

    # Log file name carries the slurm node list and job id shell variables
    log_name = (
        self.test_name + "_" + name + "_${SLURM_JOB_NODELIST}_"
        + "${SLURM_JOB_ID}_" + "daos_dfuse_" + suffix)
    log_path = os.path.join(self.test_log_dir, log_name)
    env_exports = "export D_LOG_MASK=ERR;export D_LOG_FILE={}".format(
        log_path)

    start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        'clush -S -w $SLURM_JOB_NODELIST "cd {};{};{}"'.format(
            dfuse.mount_dir.value, env_exports, str(dfuse)),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]

    if resource_mgr == "SLURM":
        # clush and sleep run as they are; everything else goes via srun
        start_cmds = [
            entry if entry.startswith(("clush", "sleep"))
            else get_srun_cmd(entry, nodesperjob)
            for entry in start_cmds
        ]
    return dfuse, start_cmds
def create_cont(self):
    """Create a POSIX container with the daos command line tool.

    The command is run through the shell with the default Dfuse environment
    prepended.  The container UUID is taken from the fourth
    whitespace-separated field of the command output.  The test is failed
    if the command cannot be started, exits with a non-zero status or
    prints fewer than four fields.

    Returns:
        the UUID field parsed from the command output

    """
    # TODO: create the container through a TestContainer object once
    # DAOS-3355 is resolved.
    env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()

    # command to create container of posix type
    cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
        self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)

    try:
        container = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, shell=True)
        output, _ = container.communicate()
    except (OSError, subprocess.CalledProcessError) as error:
        self.fail("Container create failed:{}".format(error))

    # Popen/communicate never raise on a non-zero exit status, so the
    # status has to be checked explicitly.
    if container.returncode != 0:
        self.fail("Container create failed:{}".format(
            "'{}' exited with status {}".format(cmd, container.returncode)))

    fields = output.split()
    if len(fields) < 4:
        self.fail("Container create failed:{}".format(
            "unexpected output {!r}".format(output)))

    uuid = fields[3]
    self.log.info("Container created with UUID %s", uuid)
    return uuid
def _start_dfuse(self):
    """Launch Dfuse on the client hosts for self.pool and a new container.

    The new Dfuse object is stored in self.dfuse and configured with
    self.pool and the container returned by _create_cont().  A
    CommandFailure raised by the run is logged and turned into a test
    failure.
    """
    # Get Dfuse params
    self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
    self.dfuse.get_params(self)

    # update dfuse params
    self.dfuse.set_dfuse_params(self.pool)
    self.dfuse.set_dfuse_cont_param(self._create_cont())

    try:
        # start dfuse
        self.dfuse.run()
    except CommandFailure as error:
        # The hosts are a list, so build the NodeSet with fromlist();
        # passing a list to the NodeSet constructor would raise here and
        # hide the original Dfuse failure.
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(self.dfuse), str(NodeSet.fromlist(self.dfuse.hosts)),
            exc_info=error)
        self.fail("Unable to launch Dfuse.\n")
def start_dfuse(self):
    """Launch Dfuse on the client hosts for self.pool and a new container.

    The new Dfuse object is stored in self.dfuse and configured with
    self.pool and the value returned by create_cont().  A CommandFailure
    raised by the run is logged and turned into a test failure.
    """
    self.dfuse = dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
    dfuse.get_params(self)

    # Bind the command to the pool and to a freshly created container
    dfuse.set_dfuse_params(self.pool)
    dfuse.set_dfuse_cont_param(self.create_cont())

    try:
        dfuse.run()
    except CommandFailure as error:
        self.log.error(
            "Dfuse command %s failed on hosts %s",
            str(dfuse), str(NodeSet.fromlist(dfuse.hosts)), exc_info=error)
        self.fail("Test was expected to pass but it failed.\n")
def start_dfuse(self, pool, nodesperjob, resource_mgr=None):
    """Build the commands that start Dfuse from a job script.

    Nothing is executed here: a Dfuse object is configured with a unique
    mount directory, the pool and the last entry of self.container (after
    add_containers() has been called), and the matching command strings
    are returned.

    Args:
        self (obj): soak obj
        pool (obj): TestPool obj
        nodesperjob: value forwarded to get_srun_cmd()
        resource_mgr (str, optional): when equal to "SLURM" the commands
            that do not start with "clush" or "sleep" are wrapped with
            get_srun_cmd(). Defaults to None.

    Returns:
        tuple: (dfuse, cmds) where dfuse is the configured Dfuse object and
            cmds is the list of command strings to add to the job script

    """
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)

    # Append a not-yet-used random suffix so each mount point is distinct
    suffix = get_random_string(5, self.used)
    self.used.append(suffix)
    add_containers(self, pool)
    dfuse.mount_dir.update(dfuse.mount_dir.value + suffix)

    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.container[-1])

    start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        'clush -w $SLURM_JOB_NODELIST "cd {};{}"'.format(
            dfuse.mount_dir.value, str(dfuse)),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]

    if resource_mgr == "SLURM":
        # clush and sleep run as they are; everything else goes via srun
        start_cmds = [
            entry if entry.startswith(("clush", "sleep"))
            else get_srun_cmd(entry, nodesperjob)
            for entry in start_cmds
        ]
    return dfuse, start_cmds
class RootContainerTest(TestWithServers):
    """Base Dfuse Container check test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RootContainerTest object."""
        super(RootContainerTest, self).__init__(*args, **kwargs)
        # Every pool/container created by the helpers below is recorded here
        self.pool = []
        self.container = []
        self.tmp_file_count = self.params.get(
            "tmp_file_count", '/run/container/*')
        self.cont_count = self.params.get(
            "cont_count", '/run/container/*')
        self.tmp_file_size = self.params.get(
            "tmp_file_size", '/run/container/*')
        self.tmp_file_name = self.params.get(
            "tmp_file_name", '/run/container/*')
        # device where the pools and containers are created
        self.device = "scm"

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(RootContainerTest, self).setUp()
        self.dfuse = None
        self.dfuse_hosts = None

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents even if stopping dfuse failed
            super(RootContainerTest, self).tearDown()

    def _create_pool(self):
        """Create a pool and record it in self.pool.

        Returns:
            TestPool: the newly created pool

        """
        # Get the pool params
        pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        pool.get_params(self)

        # Create a pool
        pool.create()
        self.pool.append(pool)
        return pool

    def _create_cont(self, pool, path=None):
        """Create a container and record it in self.container.

        Args:
            pool (TestPool): pool object
            path (str, optional): Unified namespace path for container.
                Defaults to None.

        Returns:
            TestContainer: the newly created container

        """
        # Get container params
        container = TestContainer(pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)

        if path is not None:
            container.path.update(path)

        # create container
        container.create()
        self.container.append(container)
        return container

    def _start_dfuse(self, pool, container):
        """Create a DfuseCommand object and start dfuse on self.dfuse_hosts.

        Args:
            pool (TestPool): pool passed to set_dfuse_params()
            container (TestContainer): container to mount dfuse

        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.dfuse_hosts, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse), self.dfuse.hosts, exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_rootcontainer(self):
        """Jira ID: DAOS-3782.

        Test Description:
            Purpose of this test is to try and create a container and
            mount it over dfuse and use it as a root container and create
            subcontainers underneath it and insert several files and see
            if they can be accessed using ls and cd. Verify the pool size
            reflects the space occupied by container. Try to remove the
            files and containers and see the space is reclaimed.
            Test the above procedure with 100 sub containers.
            Test the above procedure with 5 pools and 50 containers
            spread across the pools.
        :avocado: tags=all,hw,small,full_regression,container
        :avocado: tags=rootcontainer
        """
        # Create a pool and start dfuse.
        pool = self._create_pool()
        container = self._create_cont(pool)
        self.dfuse_hosts = self.agent_managers[0].hosts

        # mount fuse
        self._start_dfuse(pool, container)

        # Create another container and add it as sub container under
        # root container
        sub_container = str(self.dfuse.mount_dir.value + "/cont0")
        self._create_cont(pool, path=sub_container)

        # Insert files into root container
        self.insert_files_and_verify("")

        # Insert files into sub container
        self.insert_files_and_verify("cont0")

        # Create 100 subcontainer and verify the temp files
        self.verify_create_delete_containers(pool, 100)
        self.verify_multi_pool_containers()

    def verify_multi_pool_containers(self):
        """Create pools with sub containers under the root container.

        The number of pools comes from the "pool_count" test parameter and
        each pool gets self.cont_count containers.  Files are inserted in
        every container and verified to be accessible.
        """
        pool_count = self.params.get("pool_count", "/run/pool/*")
        for i in range(pool_count):
            pool = self._create_pool()
            for j in range(self.cont_count):
                cont_name = "/cont_{}{}".format(i, j)
                sub_cont = str(self.dfuse.mount_dir.value + cont_name)
                self._create_cont(pool=pool, path=sub_cont)
                self.insert_files_and_verify(cont_name)

    def verify_create_delete_containers(self, pool, cont_count):
        """Verify pool space usage while creating and destroying containers.

        Creates the containers with files in each of them and checks that
        the free space dropped by at least the size of the written files.
        Then destroys half of the containers and checks that at least the
        matching amount of space has been reclaimed.

        Args:
            pool (TestPool): pool in which the containers are created
            cont_count (int): Number of containers to be created.

        """
        self.log.info("Verifying multiple container create delete")
        pool_space_before = pool.get_pool_free_space(self.device)
        self.log.info("Pool space before = %s", pool_space_before)

        for i in range(cont_count):
            sub_cont = str(self.dfuse.mount_dir.value + "/cont{}".format(i+1))
            self._create_cont(pool, path=sub_cont)
            self.insert_files_and_verify("cont{}".format(i+1))

        expected = pool_space_before - \
            cont_count * self.tmp_file_count * self.tmp_file_size
        pool_space_after = pool.get_pool_free_space(self.device)
        self.log.info("Pool space <= Expected")
        self.log.info("%s <= %s", pool_space_after, expected)
        self.assertTrue(pool_space_after <= expected)

        self.log.info("Destroying half of the containers = %s", cont_count//2)
        # The most recently created containers are destroyed first
        for _ in range(cont_count // 2):
            self.container[-1].destroy(1)
            self.container.pop()

        expected = pool_space_after + \
            ((cont_count // 2) * self.tmp_file_count * self.tmp_file_size)
        pool_space_after_cont_destroy = \
            pool.get_pool_free_space(self.device)
        self.log.info("After container destroy")
        self.log.info("Free Pool space >= Expected")
        self.log.info("%s >= %s", pool_space_after_cont_destroy, expected)
        self.assertTrue(pool_space_after_cont_destroy >= expected)

    def insert_files_and_verify(self, container_name):
        """Insert files into a container and verify they can be listed.

        self.tmp_file_count files of self.tmp_file_size bytes each are
        written below the dfuse mount point and then listed with ls.

        Args:
            container_name (str): name of the POSIX container directory
                below the dfuse mount point; an empty string selects the
                mount point itself

        """
        cont_dir = self.dfuse.mount_dir.value
        if container_name:
            cont_dir = "{}/{}".format(cont_dir, container_name)

        cmds = []
        ls_cmds = []

        for i in range(self.tmp_file_count):
            # Each file is filled with tmp_file_size bytes of random data
            file_name = "{}{}".format(self.tmp_file_name, i+1)
            cmd = "head -c {} /dev/urandom > {}/{}".format(
                self.tmp_file_size, cont_dir, file_name)
            ls_cmds.append("ls {}".format(file_name))
            cmds.append(cmd)
        self._execute_cmd(";".join(cmds))

        # Run ls to verify the temp files are actually created
        cmds = ["cd {}".format(cont_dir)]
        cmds.extend(ls_cmds)
        self._execute_cmd(";".join(cmds))

    def _execute_cmd(self, cmd):
        """Execute a command on the dfuse hosts and fail on any error.

        The test is failed when any host reports a non-zero return code or
        when no host reports a zero return code.

        Args:
            cmd (str): Command to run

        Returns:
            dict: the pcmd() result, keyed by return code

        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.dfuse_hosts, cmd, verbose=True, timeout=30)
            # A zero entry alone is not enough: other hosts may have failed
            failed_hosts = [
                str(node_set) for code, node_set in ret.items()
                if code != 0]
            if failed_hosts or 0 not in ret:
                error_hosts = NodeSet(",".join(failed_hosts))
                raise CommandFailure(
                    "Error running '{}' on the following "
                    "hosts: {}".format(cmd, error_hosts))
        # report error if any command fails
        except CommandFailure as error:
            self.log.error("RootContainerTest Test Failed: %s",
                           str(error))
            self.fail("Test was expected to pass but "
                      "it failed.\n")
        return ret
class DfuseTestBase(TestWithServers):
    """Base class for tests that start and stop Dfuse.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a DfuseTestBase object."""
        super(DfuseTestBase, self).__init__(*args, **kwargs)
        self.dfuse = None

    def stop_job_managers(self):
        """Stop the test job manager followed by dfuse.

        Returns:
            list: the list returned by the parent class, with an error
                message appended if stopping dfuse raised a CommandFailure

        """
        errors = super(DfuseTestBase, self).stop_job_managers()
        try:
            self.stop_dfuse()
        except CommandFailure as error:
            errors.append("Error stopping dfuse: {}".format(error))
        return errors

    def start_dfuse(self, hosts, pool=None, container=None, mount_dir=None):
        """Launch Dfuse on the given hosts with optional overrides.

        A new Dfuse object is stored in self.dfuse and loaded with the test
        parameters.  Each optional argument is only applied when it is
        truthy.  A CommandFailure raised by the run is logged and turned
        into a test failure.

        Args:
            hosts (list): hosts on which Dfuse is started
            pool (TestPool, optional): pool passed to set_dfuse_params().
                Defaults to None.
            container (TestContainer, optional): container passed to
                set_dfuse_cont_param(). Defaults to None.
            mount_dir (str, optional): replacement mount directory.
                Defaults to None.
        """
        self.dfuse = dfuse = Dfuse(hosts, self.tmp)
        dfuse.get_params(self)

        # Only override what the caller actually supplied
        if mount_dir:
            dfuse.mount_dir.update(mount_dir)
        if pool:
            dfuse.set_dfuse_params(pool)
        if container:
            dfuse.set_dfuse_cont_param(container)
        dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s",
                str(dfuse), str(NodeSet.fromlist(dfuse.hosts)),
                exc_info=error)
            self.fail("Test was expected to pass but it failed.")

    def stop_dfuse(self):
        """Stop Dfuse, if one was started, and forget the Dfuse object."""
        if not self.dfuse:
            return
        self.dfuse.stop()
        self.dfuse = None
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            # Drop the reference to the Dfuse object
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a POSIX container with the daos command line tool.

        The command is run through the shell with the default Dfuse
        environment prepended.  The container UUID is taken from the fourth
        whitespace-separated field of the command output.  The test is
        failed if the command cannot be started, exits with a non-zero
        status or prints fewer than four fields.

        Returns:
            the UUID field parsed from the command output

        """
        # TODO: create the container through a TestContainer object once
        # DAOS-3355 is resolved.
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()

        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.ior_cmd.daos_pool.value, self.ior_cmd.daos_svcl.value)

        try:
            container = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True)
            output, _ = container.communicate()
        except (OSError, subprocess.CalledProcessError) as error:
            self.fail("Container create failed:{}".format(error))

        # Popen/communicate never raise on a non-zero exit status, so the
        # status has to be checked explicitly.
        if container.returncode != 0:
            self.fail("Container create failed:{}".format(
                "'{}' exited with status {}".format(
                    cmd, container.returncode)))

        fields = output.split()
        if len(fields) < 4:
            self.fail("Container create failed:{}".format(
                "unexpected output {!r}".format(output)))

        uuid = fields[3]
        self.log.info("Container created with UUID %s", uuid)
        return uuid

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.

        The Dfuse object is stored in self.dfuse and configured with
        self.pool and the value returned by create_cont().
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, True)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None):
        """Execute ior against self.pool, creating the pool if needed.

        When the IOR api is POSIX, dfuse is started first and the IOR test
        file is placed below the dfuse mount point.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE.
                Defaults to None.

        Returns:
            the value returned by run_ior()

        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Update IOR params with the pool
        self.ior_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            if self.ior_cmd.transfer_size.value == "256B":
                self.cancelForTicket("DAOS-3449")
            self.start_dfuse()
            self.ior_cmd.test_file.update(self.dfuse.mount_dir.value
                                          + "/testfile")

        out = self.run_ior(self.get_job_manager_command(), self.processes,
                           intercept)

        return out

    def get_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        The test is failed if the IOR api is not one of MPIIO, DAOS or
        POSIX, or if mpich is reported as not installed on the clients.

        Returns:
            Mpirun: job manager object wrapping self.ior_cmd

        """
        # Initialize MpioUtils if IOR is running in MPIIO, DAOS or POSIX mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.ior_cmd, mpirun_path)

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager: job manager object providing setup_command() and
                run(), e.g. the value from get_job_manager_command()
            processes (int): number of host processes
            intercept (str, optional): path to interception library; when
                truthy it is exported as LD_PRELOAD. Defaults to None.

        Returns:
            the value returned by manager.run()

        """
        env = self.ior_cmd.get_default_env(
            str(manager), self.tmp, self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        The test is failed if the free space consumed since
        original_pool_info was taken is smaller than the aggregate total
        reported by the IOR command.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If Transfer size is < 4K, Pool size will verified against NVMe, else
        # it will be checked against SCM
        # NOTE(review): run_ior_with_pool() compares transfer_size.value with
        # the string "256B"; confirm the value is numeric when this is called.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is > 4K,Size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K,Size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
class ParallelIo(FioBase):
    """Base Parallel IO test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ParallelIo object."""
        super(ParallelIo, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.cont_count = None
        # Every container created by create_cont() is recorded here
        self.container = []

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(ParallelIo, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents even if stopping dfuse failed
            super(ParallelIo, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a container in self.pool and record it in self.container."""
        # Get container params
        container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)

        # create container
        container.create()
        self.container.append(container)

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.

        The Dfuse object is stored in self.dfuse and configured with
        self.pool only; no container is set.
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse), self.dfuse.hosts, exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to mount dfuse and verify multiple
            containers using fio.
        Use cases:
            Mount dfuse using pool uuid.
            Create multiple containers under that dfuse mount point.
            Check those containers are accessible from that mount point.
            Perform io to those containers using FIO
            Delete one of the containers
            Check if dfuse is still running. If not, fail the test and exit.
            Otherwise, try accessing the deleted container.
            This should fail.
            Check dfuse again.
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,parallelio
        """
        # get test params for cont and pool count
        self.cont_count = self.params.get("cont_count", '/run/container/*')

        threads = []

        # Create a pool and start dfuse.
        self.create_pool()
        self.start_dfuse()

        # create multiple containers in parallel; the bound method itself
        # is the thread target so the create runs inside the thread
        cont_threads = [
            threading.Thread(target=self.create_cont)
            for _ in range(self.cont_count)]

        # start container create job
        for cont_job in cont_threads:
            cont_job.start()

        # wait for container create to finish
        for cont_job in cont_threads:
            cont_job.join()

        # An exception inside a thread does not reach this thread, so make
        # sure every create actually recorded its container
        if len(self.container) != self.cont_count:
            self.fail("Only {} of {} containers were created".format(
                len(self.container), self.cont_count))

        # check if all the created containers can be accessed and perform
        # io on each container using fio in parallel
        for cont in self.container:
            dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
            cmd = u"ls -a {}".format(dfuse_cont_dir)
            try:
                # execute bash cmds
                ret_code = general_utils.pcmd(
                    self.hostlist_clients, cmd, timeout=30)
                # A zero entry alone is not enough: other hosts may have
                # failed
                failed_hosts = [
                    str(node_set) for code, node_set in ret_code.items()
                    if code != 0]
                if failed_hosts or 0 not in ret_code:
                    error_hosts = NodeSet(",".join(failed_hosts))
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))
            # report error if any command fails
            except CommandFailure as error:
                self.log.error("ParallelIo Test Failed: %s",
                               str(error))
                self.fail("Test was expected to pass but "
                          "it failed.\n")
            # run fio on all containers
            thread = threading.Thread(
                target=self.execute_fio,
                args=(self.dfuse.mount_dir.value + "/" + cont.uuid, False))
            threads.append(thread)
            thread.start()

        # wait for all fio jobs to be finished
        for job in threads:
            job.join()

        # destroy first container; keep its uuid for the checks below
        container_to_destroy = self.container[0].uuid
        self.container[0].destroy(1)

        # check dfuse if it is running fine
        self.dfuse.check_running()

        # try accessing destroyed container, it should fail
        try:
            self.execute_fio(
                self.dfuse.mount_dir.value + "/" + container_to_destroy,
                False)
        except CommandFailure:
            self.log.info("This run is expected to fail")
        else:
            self.fail(
                "Fio was able to access destroyed container: {}".format(
                    container_to_destroy))

        # check dfuse is still running after attempting to access deleted
        # container.
        self.dfuse.check_running()
class SoakTestBase(TestWithServers):
    # pylint: disable=too-many-public-methods
    """Execute DAOS Soak test cases.

    The class builds slurm batch scripts for the jobs listed in the test
    yaml (ior and fio), submits them, optionally launches harasser threads
    (rebuild, snapshot) and collects the remote log files after each pass.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a SoakBase object."""
        super(SoakTestBase, self).__init__(*args, **kwargs)
        self.failed_job_id_list = None
        self.test_log_dir = None
        self.exclude_slurm_nodes = None
        self.loop = None
        self.log_dir = None
        self.outputsoakdir = None
        self.test_name = None
        self.local_pass_dir = None
        self.dfuse = None
        self.test_timeout = None
        self.end_time = None
        self.job_timeout = None
        self.nodesperjob = None
        self.task_list = None
        self.soak_results = None
        self.srun_params = None
        self.pool = None
        self.container = None
        self.test_iteration = None
        self.h_list = None
        self.harasser_joblist = None
        self.harasser_results = None
        self.harasser_timeout = None
        self.all_failed_jobs = None
        self.username = None

    def setUp(self):
        """Define test setup to be done."""
        self.log.info("<<setUp Started>> at %s", time.ctime())
        # Start the daos_agents in the job scripts
        self.setup_start_servers = True
        self.setup_start_agents = False
        super(SoakTestBase, self).setUp()
        self.username = getuser()
        # Initialize loop param for all tests
        self.loop = 1
        self.exclude_slurm_nodes = []
        # Setup logging directories for soak logfiles
        # self.output dir is an avocado directory .../data/
        self.log_dir = self.params.get("logdir", "/run/*")
        self.outputsoakdir = self.outputdir + "/soak"
        # Create the remote log directories on all client nodes
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        # Fail if slurm partition daos_client is not defined
        if not self.client_partition:
            raise SoakTestError(
                "<<FAILED: Partition is not correctly setup for daos "
                "slurm partition>>")
        # Check if the server nodes are in the client list;
        # this will happen when only one partition is specified
        for host_server in self.hostlist_servers:
            if host_server in self.hostlist_clients:
                self.hostlist_clients.remove(host_server)
                self.exclude_slurm_nodes.append(host_server)
        self.log.info(
            "<<Updated hostlist_clients %s >>", self.hostlist_clients)
        if not self.hostlist_clients:
            self.fail("There are no nodes that are client only;"
                      "check if the partition also contains server nodes")
        # Include test node for log cleanup; remove from client list
        local_host_list = include_local_host(None)
        self.exclude_slurm_nodes.extend(local_host_list)
        # Start an agent on the test control host to enable API calls for
        # reserved pool and containers. The test control host should be the
        # last host in the hostlist_clients list.
        agent_groups = {self.server_group: local_host_list}
        self.start_agents(agent_groups)

    def pre_tear_down(self):
        """Tear down any test-specific steps prior to running tearDown().

        Returns:
            list: a list of error strings to report after all tear down
                steps have been attempted

        """
        errors = []
        # clear out any jobs in squeue;
        if self.failed_job_id_list:
            self.log.info("<<Cancel jobs in queue with ids %s >>",
                          self.failed_job_id_list)
            status = process.system("scancel --partition {} -u {}".format(
                self.client_partition, self.username))
            if status > 0:
                errors.append("Failed to cancel jobs {}".format(
                    self.failed_job_id_list))
        if self.all_failed_jobs:
            errors.append("SOAK FAILED: The following jobs failed {} ".format(
                " ,".join(str(j_id) for j_id in self.all_failed_jobs)))
        # One last attempt to copy any logfiles from client nodes
        try:
            self.get_remote_logs()
        except SoakTestError as error:
            self.log.info("Remote copy failed with %s", error)
        # daos_agent is always started on this node when start agent is false
        if not self.setup_start_agents:
            self.hostlist_clients = [socket.gethostname().split('.', 1)[0]]
        return errors

    def tearDown(self):
        """Define tearDown and clear any left over jobs in squeue."""
        # Perform any test-specific tear down steps and collect any
        # reported errors
        self.log.info("<<tearDown Started>> at %s", time.ctime())
        super(SoakTestBase, self).tearDown()

    def job_done(self, args):
        """Call this function when a job is done.

        Args:
            args (dict): "handle" -- which job, i.e. the job ID,
                         "state"  -- string indicating job completion status
        """
        self.soak_results[args["handle"]] = args["state"]

    def add_pools(self, pool_names):
        """Create a list of pools that the various tests use for storage.

        Args:
            pool_names: list of pool namespaces from yaml file
                        /run/<test_params>/poollist/*
        """
        for pool_name in pool_names:
            path = "".join(["/run/", pool_name, "/*"])
            # Create a pool and add it to the overall list of pools
            self.pool.append(
                TestPool(self.context, self.log,
                         dmg_command=self.get_dmg_command()))
            self.pool[-1].namespace = path
            self.pool[-1].get_params(self)
            self.pool[-1].create()
            self.log.info("Valid Pool UUID is %s", self.pool[-1].uuid)

    def get_remote_logs(self):
        """Copy files from remote dir to local dir.

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        # copy the files from the remote
        # TO-DO: change scp
        this_host = socket.gethostname()
        rsync_str = "rsync -avtr --min-size=1B"
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "bash -c \"{0} {1} {2}:{1}/.. && rm -rf {1}/*\"".format(
                rsync_str, self.test_log_dir, this_host),
            self.srun_params)
        if result.exit_status == 0:
            cmd = "cp -R -p {0}/ \'{1}\'; rm -rf {0}/*".format(
                self.test_log_dir, self.outputsoakdir)
            try:
                result = process.run(cmd, shell=True, timeout=30)
            except process.CmdError as error:
                raise SoakTestError("<<FAILED: Soak remote logfiles not copied"
                                    "to avocado data dir {} - check /tmp/soak "
                                    "on nodes {}>>".format(
                                        error, self.hostlist_clients))
        else:
            raise SoakTestError("<<FAILED: Soak remote logfiles not copied "
                                "from clients>>: {}".format(
                                    self.hostlist_clients))

    def is_harasser(self, harasser):
        """Check if harasser is defined in yaml.

        Args:
            harasser (str): name of the harasser to look up

        Returns:
            bool: True if a harasser list is defined and contains harasser

        """
        # bool() so that an empty/undefined list yields False, not None/[]
        return bool(self.h_list) and harasser in self.h_list

    def launch_harassers(self, harassers, pools):
        """Launch any harasser tests if defined in yaml.

        Args:
            harassers (list): list of harassers to launch
            pools (list): TestPool objects handed to the rebuild harasser

        Raises:
            SoakTestError: if a harasser name is not supported

        """
        # Launch harasser after one complete pass
        for harasser in harassers:
            # A single if/elif/else chain: with two independent "if"
            # statements the "rebuild" entry would fall through to the
            # "not supported" branch.
            if harasser == "rebuild":
                method = self.launch_rebuild
                ranks = self.params.get(
                    "ranks_to_kill", "/run/" + harasser + "/*")
                param_list = (ranks, pools)
                name = "REBUILD"
            elif harasser == "snapshot":
                method = self.launch_snapshot
                param_list = ()
                name = "SNAPSHOT"
            else:
                raise SoakTestError(
                    "<<FAILED: Harasser {} is not supported. ".format(
                        harasser))
            job = threading.Thread(
                target=method, args=param_list, name=name)
            self.harasser_joblist.append(job)
        # start all harassers
        for job in self.harasser_joblist:
            job.start()

    def harasser_completion(self, timeout):
        """Complete harasser jobs.

        Args:
            timeout (int): timeout in secs

        Returns:
            bool: True if every harasser thread joined and reported success

        """
        status = True
        for job in self.harasser_joblist:
            job.join(timeout)
        for job in self.harasser_joblist:
            if job.is_alive():
                self.log.error("<< HARASSER is alive %s FAILED to join>> ",
                               job.name)
                status = False
        # Check if the completed job passed; the per-harasser result uses
        # its own name so it cannot overwrite the accumulated status
        for harasser, passed in self.harasser_results.items():
            if not passed:
                self.log.error("<< HARASSER %s FAILED>> ", harasser)
                status = False
        self.harasser_joblist = []
        return status

    def launch_rebuild(self, ranks, pools):
        """Launch the rebuild process.

        The outcome is stored in self.harasser_results["REBUILD"].

        Args:
            ranks (list): Server ranks to kill
            pools (list): list of TestPool obj
        """
        self.log.info("<<Launch Rebuild>> at %s", time.ctime())
        status = True
        for pool in pools:
            # Kill the server
            try:
                pool.start_rebuild(ranks, self.d_log)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Rebuild failed to start", exc_info=error)
                status &= False
                break
            # Wait for rebuild to start
            try:
                pool.wait_for_rebuild(True)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Rebuild failed waiting to start", exc_info=error)
                status &= False
                break
            # Wait for rebuild to complete
            try:
                pool.wait_for_rebuild(False)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Rebuild failed waiting to finish", exc_info=error)
                status &= False
                break
        with H_LOCK:
            self.harasser_results["REBUILD"] = status

    def launch_snapshot(self):
        """Create a basic snapshot of the reserved pool.

        The outcome is stored in self.harasser_results["SNAPSHOT"].
        """
        self.log.info("<<Launch Snapshot>> at %s", time.ctime())
        status = True
        # Create container
        container = TestContainer(self.pool[0])
        container.namespace = "/run/container_reserved/*"
        container.get_params(self)
        container.create()
        container.open()
        obj_cls = self.params.get(
            "object_class", '/run/container_reserved/*')

        # write data to object
        data_pattern = get_random_string(500)
        datasize = len(data_pattern) + 1
        dkey = "dkey"
        akey = "akey"
        tx_handle = container.container.get_new_tx()
        obj = container.container.write_an_obj(
            data_pattern, datasize, dkey, akey, obj_cls=obj_cls,
            txn=tx_handle)
        container.container.commit_tx(tx_handle)
        obj.close()
        # Take a snapshot of the container
        snapshot = DaosSnapshot(self.context)
        try:
            snapshot.create(container.container.coh, tx_handle)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Snapshot failed", exc_info=error)
            status &= False
        if status:
            self.log.info("Snapshot Created")
            # write more data to object
            data_pattern2 = get_random_string(500)
            datasize2 = len(data_pattern2) + 1
            dkey = "dkey"
            akey = "akey"
            obj2 = container.container.write_an_obj(
                data_pattern2, datasize2, dkey, akey, obj_cls=obj_cls)
            obj2.close()
            self.log.info("Wrote additional data to container")
            # open the snapshot and read the data
            obj.open()
            snap_handle = snapshot.open(container.container.coh)
            try:
                data_pattern3 = container.container.read_an_obj(
                    datasize, dkey, akey, obj, txn=snap_handle.value)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Error when retrieving the snapshot data %s", error)
                status &= False
            # only compare when the read above succeeded
            if status:
                # Compare the snapshot to the original written data.
                if data_pattern3.value != data_pattern:
                    self.log.error("Snapshot data miscompere")
                    status &= False
        # Destroy the snapshot
        try:
            snapshot.destroy(container.container.coh)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Failed to destroy snapshot %s", error)
            status &= False
        # cleanup
        container.close()
        container.destroy()
        with H_LOCK:
            self.harasser_results["SNAPSHOT"] = status

    def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
        """Create an IOR cmdline to run in slurm batch.

        Args:
            job_spec (str): ior job in yaml to run
            pool (obj): TestPool obj
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            list: [cmdline string, log name] pairs, one per combination of
                api, block size, transfer size and object class

        """
        commands = []
        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/*"
        # IOR job specs with a list of parameters; update each value
        api_list = self.params.get("api", ior_params + "*")
        tsize_list = self.params.get("transfer_size", ior_params + "*")
        bsize_list = self.params.get("block_size", ior_params + "*")
        oclass_list = self.params.get("daos_oclass", ior_params + "*")
        # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
        if self.is_harasser("rebuild"):
            oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
        # update IOR cmdline for each additional IOR obj
        for api in api_list:
            for b_size in bsize_list:
                for t_size in tsize_list:
                    for o_type in oclass_list:
                        ior_cmd = IorCommand()
                        ior_cmd.namespace = ior_params
                        ior_cmd.get_params(self)
                        if iteration is not None and iteration < 0:
                            ior_cmd.repetitions.update(1000000)
                        if self.job_timeout is not None:
                            ior_cmd.max_duration.update(self.job_timeout)
                        else:
                            ior_cmd.max_duration.update(10)
                        ior_cmd.api.update(api)
                        ior_cmd.block_size.update(b_size)
                        ior_cmd.transfer_size.update(t_size)
                        ior_cmd.daos_oclass.update(o_type)
                        ior_cmd.set_daos_params(self.server_group, pool)
                        # srun cmdline
                        nprocs = nodesperjob * ppn
                        env = ior_cmd.get_default_env("srun")
                        if ior_cmd.api.value == "MPIIO":
                            env["DAOS_CONT"] = ior_cmd.daos_cont.value
                        cmd = Srun(ior_cmd)
                        cmd.setup_command(env, None, nprocs)
                        cmd.ntasks_per_node.update(ppn)
                        log_name = "{}_{}_{}_{}".format(
                            api, b_size, t_size, o_type)
                        commands.append([cmd.__str__(), log_name])
                        self.log.info(
                            "<<IOR cmdline>>: %s \n",
                            commands[-1].__str__())
        return commands

    def create_dfuse_cont(self, pool):
        """Create a POSIX container with the daos tool for use with dfuse.

        Args:
            pool (obj): TestPool obj

        Returns:
            str: container uuid (fourth whitespace-separated token of the
                command output)

        Raises:
            SoakTestError: if the daos command fails

        """
        # TO-DO: use daos tool when available
        # This method assumes that daos agent is running on test node
        cmd = "daos cont create --pool={} --svc={} --type=POSIX".format(
            pool.uuid, ":".join([str(item) for item in pool.svc_ranks]))
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Dfuse container failed {}>>".format(error))
        self.log.info("Dfuse Container UUID = %s", result.stdout.split()[3])
        return result.stdout.split()[3]

    def start_dfuse(self, pool):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (obj): TestPool obj

        Raises:
            SoakTestError: if the mount point cannot be created or dfuse
                fails to start

        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)
        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)
        # create dfuse mount point
        cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
        # NOTE(review): params aliases self.srun_params, so the two keys
        # added below persist in self.srun_params for later srun/sbatch
        # calls - confirm this is intended before switching to a copy.
        params = self.srun_params
        params["export"] = "all"
        params["ntasks-per-node"] = 1
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), cmd, params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Dfuse mountpoint {} not created>>".format(
                    self.dfuse.mount_dir.value))
        cmd = self.dfuse.__str__()
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), cmd, params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Dfuse failed to start>>")

    def create_fio_cmdline(self, job_spec, pool):
        """Create the FIO commandline.

        Args:
            job_spec (str): fio job in yaml to run
            pool (obj): TestPool obj

        Returns:
            list: [cmdline string, log name] pairs, one per combination of
                blocksize, size and rw

        """
        commands = []
        fio_namespace = "/run/{}".format(job_spec)
        # test params
        bs_list = self.params.get("blocksize", fio_namespace + "/soak/*")
        size_list = self.params.get("size", fio_namespace + "/soak/*")
        rw_list = self.params.get("rw", fio_namespace + "/soak/*")
        # Get the parameters for Fio
        fio_cmd = FioCommand()
        fio_cmd.namespace = "{}/*".format(fio_namespace)
        fio_cmd.get_params(self)
        for blocksize in bs_list:
            for size in size_list:
                for rw in rw_list:
                    # update fio params
                    fio_cmd.update(
                        "global", "blocksize", blocksize,
                        "fio --name=global --blocksize")
                    fio_cmd.update(
                        "global", "size", size,
                        "fio --name=global --size")
                    fio_cmd.update(
                        "global", "rw", rw,
                        "fio --name=global --rw")
                    # start dfuse if api is POSIX
                    if fio_cmd.api.value == "POSIX":
                        # Connect to the pool, create container
                        # and then start dfuse
                        self.start_dfuse(pool)
                        fio_cmd.update(
                            "global", "directory",
                            self.dfuse.mount_dir.value,
                            "fio --name=global --directory")
                    # fio command
                    log_name = "{}_{}_{}".format(blocksize, size, rw)
                    commands.append([fio_cmd.__str__(), log_name])
                    self.log.info("<<FIO cmdline>>: %s \n", commands[-1])
        return commands

    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Create a slurm batch script that will execute a list of cmdlines.

        Args:
            commands (list): commandlines and cmd specific log_name
            job (str): the job name that will be defined in the slurm script
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            script_list: list of slurm batch scripts

        """
        self.log.info("<<Build Script>> at %s", time.ctime())
        script_list = []

        # Start the daos_agent in the batch script for now
        # TO-DO: daos_agents start with systemd
        agent_launch_cmds = [
            "mkdir -p {}".format(os.environ.get("DAOS_TEST_LOG_DIR"))
        ]
        agent_launch_cmds.append(" ".join(
            [str(self.agent_managers[0].manager.job), "&"]))

        # Create the sbatch script for each cmdline
        used = []
        for cmd, log_name in commands:
            output = os.path.join(
                self.test_log_dir, "%N_" + self.test_name + "_" + job +
                "_%j_%t_" + str(ppn * nodesperjob) + "_" + log_name + "_")
            error = os.path.join(
                self.test_log_dir, "%N_" + self.test_name + "_" + job +
                "_%j_%t_" + str(ppn * nodesperjob) + "_" + log_name +
                "_ERROR_")
            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
                "error": str(error)
            }
            # include the cluster specific params
            sbatch.update(self.srun_params)
            # "used" keeps the suffixes generated so far so each is unique
            unique = get_random_string(5, used)
            script = slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                agent_launch_cmds + [cmd], unique, sbatch)
            script_list.append(script)
            used.append(unique)
        return script_list

    def job_setup(self, job, pool):
        """Create the cmdline needed to launch job.

        Args:
            job (str): single job from test params list of jobs to run
            pool (obj): TestPool obj

        Returns:
            job_cmdlist: list cmdline that can be launched
                         by specifed job manager

        Raises:
            SoakTestError: if there are too few client nodes or the job
                type is not supported

        """
        job_cmdlist = []
        commands = []
        scripts = []
        nodesperjob = []
        self.log.info("<<Job_Setup %s >> at %s", self.test_name, time.ctime())
        for npj in self.nodesperjob:
            # nodesperjob = -1 indicates to use all nodes in client hostlist
            if npj < 0:
                npj = len(self.hostlist_clients)
            if len(self.hostlist_clients) / npj < 1:
                raise SoakTestError(
                    "<<FAILED: There are only {} client nodes for this job. "
                    "Job requires {}".format(
                        len(self.hostlist_clients), npj))
            nodesperjob.append(npj)
        if "ior" in job:
            for npj in nodesperjob:
                for ppn in self.task_list:
                    commands = self.create_ior_cmdline(job, pool, ppn, npj)
                    # scripts are single cmdline
                    scripts = self.build_job_script(commands, job, ppn, npj)
                    job_cmdlist.extend(scripts)
        elif "fio" in job:
            commands = self.create_fio_cmdline(job, pool)
            # scripts are single cmdline
            scripts = self.build_job_script(commands, job, 1, 1)
            job_cmdlist.extend(scripts)
        else:
            # report the job argument; there is no self.job attribute
            raise SoakTestError("<<FAILED: Job {} is not supported. ".format(
                job))
        return job_cmdlist

    def job_startup(self, job_cmdlist):
        """Submit job batch script.

        Args:
            job_cmdlist (list): list of jobs to execute

        Returns:
            job_id_list: IDs of each job submitted to slurm.

        Raises:
            SoakTestError: if a job could not be submitted to slurm

        """
        self.log.info(
            "<<Job Startup - %s >> at %s", self.test_name, time.ctime())
        job_id_list = []
        # before submitting the jobs to the queue, check the job timeout;
        if time.time() > self.end_time:
            self.log.info("<< SOAK test timeout in Job Startup>>")
            return job_id_list
        # job_cmdlist is a list of batch script files
        for script in job_cmdlist:
            try:
                job_id = slurm_utils.run_slurm_script(str(script))
            except slurm_utils.SlurmFailed as error:
                self.log.error(error)
                # Force the test to exit with failure
                job_id = None
            if job_id:
                self.log.info(
                    "<<Job %s started with %s >> at %s",
                    job_id, script, time.ctime())
                slurm_utils.register_for_job_results(
                    job_id, self, maxwait=self.test_timeout)
                # keep a list of the job_id's
                job_id_list.append(int(job_id))
            else:
                # one of the jobs failed to queue; exit on first fail for now.
                err_msg = "Slurm failed to submit job for {}".format(script)
                job_id_list = []
                raise SoakTestError("<<FAILED:  Soak {}: {}>>".format(
                    self.test_name, err_msg))
        return job_id_list

    def job_completion(self, job_id_list):
        """Wait for job completion and cleanup.

        Args:
            job_id_list: IDs of each job submitted to slurm

        Returns:
            failed_job_id_list: IDs of each job that failed in slurm

        """
        self.log.info(
            "<<Job Completion - %s >> at %s", self.test_name, time.ctime())
        # If there is nothing to do; exit
        if job_id_list:
            # wait for all the jobs to finish
            while len(self.soak_results) < len(job_id_list):
                # wait for the jobs to complete.
                # enter tearDown before hitting the avocado timeout
                if time.time() > self.end_time:
                    self.log.info(
                        "<< SOAK test timeout in Job Completion>>")
                    break
                time.sleep(5)
            # check for job COMPLETED and remove it from the job queue
            for job, result in self.soak_results.items():
                # The queue include status of "COMPLETING"
                if result == "COMPLETED":
                    job_id_list.remove(int(job))
                else:
                    self.log.info(
                        "<< Job %s failed with status %s>>", job, result)
            # whatever is left in the list did not complete; cancel it
            if job_id_list:
                self.log.info(
                    "<<Cancel jobs in queue with id's %s >>", job_id_list)
                for job in job_id_list:
                    status = slurm_utils.cancel_jobs(int(job))
                    if status == 0:
                        self.log.info("<<Job %s successfully cancelled>>", job)
                    else:
                        self.log.info("<<Job %s could not be killed>>", job)
            # gather all the logfiles for this pass and cleanup test nodes
            try:
                self.get_remote_logs()
            except SoakTestError as error:
                self.log.info("Remote copy failed with %s", error)
            self.soak_results = {}
        return job_id_list

    def execute_jobs(self, jobs, pools):
        """Execute the overall soak test.

        Args:
            jobs (list): job names from the test yaml joblist
            pools (list): list of TestPool obj - self.pool[1:]

        Raise:
            SoakTestError

        """
        cmdlist = []
        # Create the remote log directories from new loop/pass
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "mkdir -p {}".format(self.test_log_dir), self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: logfile directory not"
                                "created on clients>>: {}".format(
                                    self.hostlist_clients))
        # Create local log directory
        os.makedirs(self.local_pass_dir)
        # Setup cmdlines for job with specified pool
        if len(pools) < len(jobs):
            raise SoakTestError(
                "<<FAILED: There are not enough pools to run this test>>")
        for index, job in enumerate(jobs):
            cmdlist.extend(self.job_setup(job, pools[index]))
        # Gather the job_ids
        job_id_list = self.job_startup(cmdlist)
        # Initialize the failed_job_list to job_list so that any
        # unexpected failures will clear the squeue in tearDown
        self.failed_job_id_list = job_id_list
        # launch harassers if defined and enabled
        if self.h_list and self.loop > 1:
            self.log.info("<<Harassers are enabled>>")
            self.launch_harassers(self.h_list, pools)
            if not self.harasser_completion(self.harasser_timeout):
                raise SoakTestError("<<FAILED: Harassers failed ")
            # rebuild can only run once for now
            if self.is_harasser("rebuild"):
                self.h_list.remove("rebuild")
        # Wait for jobs to finish and cancel/kill jobs if necessary
        self.failed_job_id_list = self.job_completion(job_id_list)
        # Log the failing job ID
        if self.failed_job_id_list:
            self.log.info(
                "<<FAILED: The following jobs failed %s >>",
                (" ,".join(str(j_id) for j_id in self.failed_job_id_list)))
            # accumulate failing job IDs
            self.all_failed_jobs.extend(self.failed_job_id_list)

    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.harasser_joblist = []
        self.harasser_results = {}
        test_to = self.params.get("test_timeout", test_param)
        self.job_timeout = self.params.get("job_timeout", test_param)
        self.harasser_timeout = self.params.get(
            "harasser_timeout", test_param)
        self.test_name = self.params.get("name", test_param)
        self.nodesperjob = self.params.get("nodesperjob", test_param)
        self.test_iteration = self.params.get("iteration", test_param)
        self.task_list = self.params.get("taskspernode", test_param + "*")
        self.h_list = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        pool_list = self.params.get("poollist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        if self.is_harasser("rebuild"):
            obj_class = "_".join([
                "OC",
                str(self.params.get("daos_oclass", "/run/rebuild/*")[0])
            ])
        else:
            obj_class = self.params.get(
                "object_class", "/run/container_reserved/*")
        slurm_reservation = self.params.get(
            "reservation", "/run/srun_params/*")
        # Srun params
        if self.client_partition is not None:
            self.srun_params = {"partition": self.client_partition}
        if slurm_reservation is not None:
            self.srun_params["reservation"] = slurm_reservation
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        self.add_pools(["pool_reserved"])
        self.pool[0].connect()
        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        self.container = TestContainer(self.pool[0])
        self.container.namespace = "/run/container_reserved/*"
        self.container.get_params(self)
        self.container.create()
        self.container.write_objects(rank, obj_class)
        self.all_failed_jobs = []
        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "rm -rf {}".format(self.log_dir), self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Soak directories not removed"
                                "from clients>>: {}".format(
                                    self.hostlist_clients))
        # cleanup test_node /tmp/soak
        cmd = "rm -rf {}".format(self.log_dir)
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory on testnode not removed {}>>".format(
                    error))
        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info(
                "<<Soak1 PASS %s: time until done %s>>", self.loop,
                DDHHMMSS_format(self.end_time - time.time()))
            # Create all specified pools
            self.add_pools(pool_list)
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            try:
                self.execute_jobs(job_list, self.pool[1:])
            except SoakTestError as error:
                self.fail(error)
            errors = self.destroy_pools(self.pool[1:])
            # remove the test pools from self.pool; preserving reserved pool
            self.pool = [self.pool[0]]
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            self.assertEqual(len(errors), 0, "\n".join(errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info(
                "<<PASS %s completed in %s >>", self.loop,
                DDHHMMSS_format(loop_time))
            # # if the time left if less than a loop exit now
            # if end_time - time.time() < loop_time:
            #     break
            self.loop += 1
        # TO-DO: use IOR
        self.assertTrue(
            self.container.read_objects(),
            "Data verification error on reserved pool"
            "after SOAK completed")
        # gather the daos logs from the client nodes
        self.log.info(
            "<<<<SOAK TOTAL TEST TIME = %s>>>", DDHHMMSS_format(
                time.time() - start_time))
class DfuseContainerCheck(TestWithServers):
    """Base Dfuse Container check test class.

    Mounts containers of different types through dfuse and checks whether
    the mount succeeds or fails as expected.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a DfuseContainerCheck object."""
        super(DfuseContainerCheck, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.pool = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(DfuseContainerCheck, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(DfuseContainerCheck, self).tearDown()

    def create_pool(self):
        """Create a TestPool object and the pool it describes."""
        # Get the pool params
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.

        Uses self.pool and self.container for the dfuse parameters. A
        CommandFailure raised by the run call is logged and turned into a
        test failure.
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run(False)
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse), self.dfuse.hosts,
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_dfusecontainercheck(self):
        """Jira ID: DAOS-3635.

        Test Description:
            Purpose of this test is to try and mount different container types
            to dfuse and check the behavior.
        Use cases:
            Create pool
            Create container of type default
            Try to mount to dfuse and check the behaviour.
            Create container of type POSIX.
            Try to mount to dfuse and check the behaviour.
        :avocado: tags=all,small,full_regression,dfusecontainercheck
        """
        # get test params for cont and pool count
        cont_types = self.params.get("cont_types", '/run/container/*')

        # Create a pool and start dfuse.
        self.create_pool()

        for cont_type in cont_types:
            # Get container params
            self.container = TestContainer(
                self.pool, daos_command=DaosCommand(self.bin))
            self.container.get_params(self)
            # create container
            if cont_type == "POSIX":
                self.container.type.update(cont_type)
            self.container.create()
            try:
                # mount fuse
                self.start_dfuse()
                # check if fuse got mounted
                self.dfuse.check_running()
                # fail the test if fuse mounts with non-posix type container
                if cont_type == "":
                    self.fail(
                        "Non-Posix type container got mounted over dfuse")
            except CommandFailure as error:
                # expected to throw CommandFailure exception for non-posix type
                # container
                if cont_type == "":
                    # adjacent literals instead of a backslash continuation
                    # so the message carries no source indentation
                    self.log.info(
                        "Expected behaviour: Default container type "
                        "is expected to fail on dfuse mount: %s",
                        str(error))
                # fail the test if exception is caught for POSIX type container
                elif cont_type == "POSIX":
                    self.log.error(
                        "Posix Container dfuse mount "
                        "failed: %s", str(error))
                    self.fail("Posix container type was expected to mount "
                              "over dfuse")
            # stop fuse and container for next iteration
            if cont_type != "":
                self.dfuse.stop()
            self.container.destroy(1)
class FioBase(TestWithServers):
    """Common base class for tests that run fio.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Create a FioBase instance with all helper attributes unset."""
        super(FioBase, self).__init__(*args, **kwargs)
        self.daos_cmd = None
        self.dfuse = None
        self.manager = None
        self.processes = None
        self.fio_cmd = None

    def setUp(self):
        """Prepare the servers, agents and fio command for a test case."""
        # keep the logs of each test separate
        self.update_log_file_names()

        # bring up the servers and agents
        super(FioBase, self).setUp()

        # daos tool wrapper used to create containers
        self.daos_cmd = DaosCommand(self.bin)

        # fio command populated from the test yaml
        fio = FioCommand()
        self.fio_cmd = fio
        fio.get_params(self)
        self.processes = self.params.get("np", '/run/fio/client_processes/*')
        self.manager = self.params.get("manager", '/run/fio/*', "MPICH")

    def tearDown(self):
        """Stop dfuse if it is still set, then stop servers and agents."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # always run the parent tear down, even if the stop failed
            super(FioBase, self).tearDown()

    def _create_pool(self):
        """Build a TestPool from the test params and create the pool."""
        # pylint: disable=attribute-defined-outside-init
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)
        self.pool.create()

    def _create_cont(self):
        """Create a container with the daos tool.

        Returns:
            str: UUID of the created container

        """
        kind = self.params.get("type", "/run/container/*")
        created = self.daos_cmd.container_create(
            pool=self.pool.uuid, svc=self.pool.svc_ranks, cont_type=kind)

        # Pull the UUID out of the "created container <uuid>" output text
        found = re.search(
            r"created\s+container\s+([0-9a-f-]+)", created.stdout)
        if found is None:
            self.fail("Error obtaining the container uuid from: {}".format(
                created.stdout))
        return found.group(1)

    def _start_dfuse(self):
        """Configure a Dfuse object for a new container and run it."""
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # point dfuse at the pool and a freshly created container
        self.dfuse.set_dfuse_params(self.pool)
        cont_uuid = self._create_cont()
        self.dfuse.set_dfuse_cont_param(cont_uuid)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            self.dfuse.run()
        except CommandFailure as error:
            host_names = str(NodeSet.fromlist(self.dfuse.hosts))
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse), host_names, exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_fio(self):
        """Run fio, creating the pool and the dfuse mount when required."""
        # a pool is only created when the test has not provided one
        if self.pool is None:
            self._create_pool()

        use_dfuse = self.fio_cmd.api.value == "POSIX"
        if use_dfuse:
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            mount_point = self.dfuse.mount_dir.value
            self.fio_cmd.update(
                "global", "directory", mount_point,
                "fio --name=global --directory")

        # Run Fio on the client hosts
        self.fio_cmd.hosts = self.hostlist_clients
        self.fio_cmd.run()

        # unmount once fio is done so tearDown has nothing left to stop
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
class MdtestBase(TestWithServers):
    """Base mdtest class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a MdtestBase object."""
        super(MdtestBase, self).__init__(*args, **kwargs)
        self.mdtest_cmd = None
        self.processes = None
        self.manager = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(MdtestBase, self).setUp()

        # Get the parameters for Mdtest
        self.mdtest_cmd = MdtestCommand()
        self.mdtest_cmd.get_params(self)
        self.processes = self.params.get(
            "np", '/run/mdtest/client_processes/*')
        self.manager = self.params.get("manager", '/run/mdtest/*', "MPICH")

        # Until DAOS-3320 is resolved run Mdtest for POSIX
        # with single client node
        if self.mdtest_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

    def tearDown(self):
        """Tear down each test case."""
        try:
            # NOTE(review): dfuse is only dereferenced here, stop() is not
            # called explicitly - presumably the Dfuse object cleans up
            # when it is released; confirm against the Dfuse class.
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(MdtestBase, self).tearDown()

    def _create_pool(self):
        """Create the pool used by Mdtest and store it in self.pool."""
        # Get the pool params
        self.pool = TestPool(self.context, self.log)
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a POSIX container with the daos command line tool.

        The test fails if the command cannot be started, exits with a
        non-zero status or prints less output than expected.

        Returns:
            str: fourth whitespace separated field of the command output,
                which is logged as the UUID of the created container

        """
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        # self.container = TestContainer(self.pool)
        # self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()
        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.mdtest_cmd.dfs_pool_uuid.value,
            self.mdtest_cmd.dfs_svcl.value)

        # The command is prefixed with the environment string, so it has
        # to be run through a shell.
        try:
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True)
            output, _ = process.communicate()
        except OSError as error:
            self.fail("Container create failed:{}".format(error))

        # Popen does not raise on a non-zero exit status, so check it (and
        # the shape of the output) explicitly before indexing into it.
        fields = output.split()
        if process.returncode != 0 or len(fields) < 4:
            self.fail("Container create failed:{}".format(output))

        self.log.info("Container created with UUID %s", fields[3])
        return fields[3]

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, self.basepath)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            # hosts is a list, so the NodeSet has to be built with fromlist
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_mdtest(self):
        """Runner method for Mdtest."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self._create_pool()
        # set Mdtest params
        self.mdtest_cmd.set_daos_params(self.server_group, self.pool)

        # start dfuse if api is POSIX
        if self.mdtest_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value)

        # Run Mdtest
        self.run_mdtest(
            self.get_job_manager_command(self.manager), self.processes)

    def get_job_manager_command(self, manager):
        """Get the MPI job manager command for Mdtest.

        Args:
            manager (str): "MPICH" selects Mpirun; any other value selects
                Orterun

        Returns:
            JobManager: the object for the mpi job manager command

        """
        # Initialize MpioUtils if mdtest needs to be run using mpich
        if manager == "MPICH":
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
            path = os.path.join(mpio_util.mpichinstall, "bin")
            return Mpirun(self.mdtest_cmd, path)

        path = os.path.join(self.ompi_prefix, "bin")
        return Orterun(self.mdtest_cmd, path)

    def run_mdtest(self, manager, processes):
        """Run the Mdtest command.

        Args:
            manager (JobManager): mpi job manager object, as returned by
                get_job_manager_command()
            processes (int): number of host processes

        """
        env = self.mdtest_cmd.get_default_env(
            str(manager), self.tmp, self.client_log)
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            manager.run()
        except CommandFailure as error:
            self.log.error("Mdtest Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
class ParallelIo(FioBase, IorTestBase):
    """Base Parallel IO test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ParallelIo object."""
        super(ParallelIo, self).__init__(*args, **kwargs)
        self.dfuse = None
        self.cont_count = None
        self.pool_count = None
        self.statvfs_info_initial = None
        self.statvfs_before_cont_destroy = None
        self.statvfs_after_cont_destroy = None
        # Unlike the base classes, pools and containers are kept as lists
        self.pool = []
        self.container = []

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(ParallelIo, self).setUp()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(ParallelIo, self).tearDown()

    def create_pool(self):
        """Create a TestPool object and append it to self.pool."""
        # Get the pool params
        pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        pool.get_params(self)

        # Create a pool
        pool.create()
        self.pool.append(pool)

    # pylint: disable=arguments-differ
    def create_cont(self, pool):
        """Create a TestContainer object and append it to self.container.

        Args:
            pool (TestPool): TestPool object type for which container
                needs to be created

        """
        # Get container params
        container = TestContainer(pool, daos_command=DaosCommand(self.bin))
        container.get_params(self)

        # create container
        container.create()
        self.container.append(container)

    def start_dfuse(self, pool=None):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (TestPool): Test pool object if dfuse is intended to be
                started using pool uuid option. Defaults to None, in which
                case no pool parameters are set on the dfuse command.

        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        if pool:
            self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                self.dfuse.hosts, exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def stat_bfree(self, path):
        """Get the free block count reported by stat for a path.

        The stat command is run over ssh on the first client host.

        Args:
            path (str): path to get free block size of.

        Returns:
            int: value printed by "stat -c%a -f" for the path

        """
        cmd = [
            "ssh",
            "{}@{}".format(getuser(), self.hostlist_clients[0]),
            "stat -c%a -f {}".format(path)]
        try:
            result = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as err:
            self.fail("Get free block size method failed with: {}".format(err))

        return int(result)

    def statvfs_pool(self, path):
        """Obtain the free space of every pool in self.pool.

        Args:
            path (str): dfuse mount point under which each pool is
                reachable as <path>/<pool uuid>

        Returns:
            list: stat_bfree() result for each pool, in self.pool order

        """
        statvfs_list = []
        for pool in self.pool:
            dfuse_pool_dir = str(path + "/" + pool.uuid)
            statvfs_list.append(self.stat_bfree(dfuse_pool_dir))

        self.log.info("Statvfs List Output: %s", statvfs_list)
        return statvfs_list

    def verify_aggregation(self, reduced_space, count):
        """Verify the expected space is returned after container destroy.

        If the space has not been returned yet, wait for 60 secs and check
        again. Wait 4 times, otherwise exit the test with a failure.

        Args:
            reduced_space: expected space to be returned
            count (int): index of the pool to check in the statvfs lists

        """
        counter = 1
        while (self.statvfs_after_cont_destroy[count] <
               self.statvfs_before_cont_destroy[count] + reduced_space):
            # try to wait for 4 x 60 secs for aggregation to be completed
            # or else exit the test with a failure.
            if counter > 4:
                self.log.info(
                    "Free space before io: %s", self.statvfs_info_initial)
                self.log.info(
                    "Free space after io: %s",
                    self.statvfs_before_cont_destroy)
                self.log.info(
                    "Free space at test termination: %s",
                    self.statvfs_after_cont_destroy)
                self.fail("Aggregation did not complete as expected")
            time.sleep(60)
            # Refresh the free space of all pools before checking again
            self.statvfs_after_cont_destroy = self.statvfs_pool(
                self.dfuse.mount_dir.value)
            counter += 1

    def test_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to mount dfuse and verify multiple
            containers using fio.
        Use cases:
            Mount dfuse using pool uuid.
            Create multiple containers under that dfuse mount point.
            Check those containers are accessible from that mount point.
            Perform io to those containers using FIO
            Delete one of the containers
            Check if dfuse is still running. If not, fail the test and exit.
            Otherwise, try accessing the deleted container.
            This should fail.
            Check dfuse again.
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,parallelio
        """
        # get test params for cont and pool count
        self.cont_count = self.params.get("cont_count", '/run/container/*')

        threads = []

        # Create a pool and start dfuse.
        self.create_pool()
        self.start_dfuse(self.pool[0])

        # create multiple containers
        for _ in range(self.cont_count):
            self.create_cont(self.pool[0])

        # check if all the created containers can be accessed and perform
        # io on each container using fio in parallel
        for cont in self.container:
            dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
            cmd = u"ls -a {}".format(dfuse_cont_dir)
            try:
                # execute bash cmds
                ret_code = general_utils.pcmd(
                    self.hostlist_clients, cmd, timeout=30)
                if 0 not in ret_code:
                    error_hosts = NodeSet(
                        ",".join(
                            [str(node_set)
                             for code, node_set in ret_code.items()
                             if code != 0]))
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))
            # report error if any command fails
            except CommandFailure as error:
                self.log.error("ParallelIo Test Failed: %s", str(error))
                self.fail("Test was expected to pass but "
                          "it failed.\n")

            # run fio on all containers
            thread = threading.Thread(
                target=self.execute_fio,
                args=(self.dfuse.mount_dir.value + "/" + cont.uuid, False))
            threads.append(thread)
            thread.start()

        # wait for all fio jobs to be finished
        for job in threads:
            job.join()

        # destroy first container; remember its uuid for the checks below
        container_to_destroy = self.container[0].uuid
        self.container[0].destroy(1)

        # check dfuse if it is running fine
        self.dfuse.check_running()

        # try accessing destroyed container, it should fail
        try:
            self.execute_fio(
                self.dfuse.mount_dir.value + "/" + container_to_destroy,
                False)
        except CommandFailure:
            self.log.info("This run is expected to fail")
        else:
            # Report the uuid saved before the destroy call rather than
            # reading it back from the destroyed container object.
            self.fail(
                "Fio was able to access destroyed container: {}".format(
                    container_to_destroy))

        # check dfuse is still running after attempting to access deleted
        # container.
        self.dfuse.check_running()

    def test_multipool_parallelio(self):
        """Jira ID: DAOS-3775.

        Test Description:
            Purpose of this test is to verify aggregation across multiple
            pools and containers.
        Use cases:
            Create 10 pools
            Create 10 containers under each pool.
            Record statvfs free space for each pool.
            Perform parallel io to each pool without deleting the file
            after write.
            Record free space using statvfs after write.
            Delete half of the containers from each pool.
            Calculate the expected amount of data to be deleted when
            containers are destroyed.
            Record free space after container destroy.
            Loop until either the all space is returned back after
            aggregation completion or exit the loop after trying for 240
            secs of wait and fail the test.

        :avocado: tags=all,hw,daosio,medium,ib2,full_regression
        :avocado: tags=multipoolparallelio
        """
        # test params
        threads = []
        cont_threads = []
        self.pool_count = self.params.get("pool_count", '/run/pool/*')
        self.cont_count = self.params.get("cont_count", '/run/container/*')
        processes = self.params.get("np", '/run/ior/client_processes/*')

        # Create the pools one after another. create_pool() appends to the
        # shared self.pool list and the position of a pool in that list is
        # used below to find its containers and its statvfs entry.
        for _ in range(self.pool_count):
            self.create_pool()

        # start dfuse using --svc option only.
        self.start_dfuse()

        # record free space using statvfs before any data is written.
        self.statvfs_info_initial = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Create 10 containers for each pool. Container create process cannot
        # be parallelised as different container create could complete at
        # different times and get appended in the self.container variable in
        # unorderly manner, causing problems during the write process.
        for pool in self.pool:
            for _ in range(self.cont_count):
                self.create_cont(pool)

        # Try to access each dfuse mounted container using ls. Once it is
        # accessed successfully, go ahead and perform io on that location
        # using ior. This process of performing io is done in parallel for
        # all containers using threads.
        for pool_count, pool in enumerate(self.pool):
            dfuse_pool_dir = str(self.dfuse.mount_dir.value + "/" + pool.uuid)
            for counter in range(self.cont_count):
                cont_num = (pool_count * self.cont_count) + counter
                dfuse_cont_dir = str(
                    dfuse_pool_dir + "/" + self.container[cont_num].uuid)
                # NOTE(review): the leading "###" looks like it turns this
                # command into a shell comment - confirm whether the ls
                # check is meant to be disabled.
                cmd = u"###ls -a {}".format(dfuse_cont_dir)
                self.execute_cmd(cmd)

                # run ior on all containers
                test_file = dfuse_cont_dir + "/testfile"
                self.ior_cmd.test_file.update(test_file)
                self.ior_cmd.set_daos_params(
                    self.server_group, pool, self.container[cont_num].uuid)
                thread = threading.Thread(
                    target=self.run_ior,
                    args=(self.get_ior_job_manager_command(), processes,
                          None, False))
                threads.append(thread)
                thread.start()

        # wait for all ior jobs to be finished
        for job in threads:
            job.join()

        # Record free space after io
        self.statvfs_before_cont_destroy = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Destroy half of the containers from each pool
        # NOTE(review): entries are first removed from self.container and
        # the containers that remain in the list are the ones destroyed;
        # the loop also runs cont_count (not pool_count) times - confirm
        # this selects the intended containers.
        pfinal = 0
        for _ in range(self.cont_count):
            pinitial = pfinal
            pfinal = pinitial + (self.cont_count // 2)
            del self.container[pinitial:pfinal]

        for cont in self.container:
            cont_thread = threading.Thread(target=cont.destroy)
            cont_threads.append(cont_thread)
            cont_thread.start()

        for destroy_job in cont_threads:
            destroy_job.join()

        # Record free space after container destroy.
        self.statvfs_after_cont_destroy = self.statvfs_pool(
            self.dfuse.mount_dir.value)

        # Calculate the expected space to be returned after containers
        # are destroyed.
        reduced_space = (
            self.cont_count * int(self.ior_cmd.block_size.value)) / 2

        # Verify if expected space is returned for each pool after containers
        # were destroyed. If not, wait for 60 secs and check again. Wait 4
        # times, otherwise exit the test with a failure.
        verify_threads = []
        for count in range(self.pool_count):
            thread = threading.Thread(
                target=self.verify_aggregation, args=(reduced_space, count))
            verify_threads.append(thread)
            thread.start()

        for job in verify_threads:
            job.join()
class BashCmd(TestWithServers):
    """Base BashCmd test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a BashCmd object."""
        super(BashCmd, self).__init__(*args, **kwargs)
        self.dfuse = None
        # file_name is not used in this class; kept for compatibility
        self.file_name = None
        self.file_name1 = None
        self.file_name2 = None
        self.dir_name = None
        self.dd_count = None
        self.dd_blocksize = None
        self.pool_count = None
        self.cont_count = None

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super(BashCmd, self).setUp()

        # Get the parameters for BashCmd
        self.dir_name = self.params.get("dirname", '/run/bashcmd/*')
        self.file_name1 = self.params.get("filename1", '/run/bashcmd/*')
        self.file_name2 = self.params.get("filename2", '/run/bashcmd/*')
        self.dd_count = self.params.get("dd_count", '/run/bashcmd/*')
        self.dd_blocksize = self.params.get("dd_blocksize", '/run/bashcmd/*')

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(BashCmd, self).tearDown()

    def create_pool(self):
        """Create a TestPool object and store it in self.pool."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object and store it in self.container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def start_dfuse(self, count):
        """Create a DfuseCommand object to start dfuse.

        Args:
            count (int): container index, used as the suffix of the mount
                directory name

        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params; each container gets its own mount directory
        self.dfuse.mount_dir.update(
            "/tmp/" + self.pool.uuid + "_daos_dfuse" + str(count))
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                self.dfuse.hosts, exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def test_bashcmd(self):
        """Jira ID: DAOS-3508.

        Test Description:
            Purpose of this test is to mount different mount points of dfuse
            for different container and pool sizes and perform basic bash
            commands.
        Use cases:
            Following list of bash commands have been incorporated
            as part of this test: mkdir, touch, ls, chmod, rm, dd, stat,
            cp, cmp, mv, rmdir.
              Create a directory.
              Create a file under that directory.
              List the created file.
              Remove the file.
              Write a file to the dfuse mounted location using dd.
              List the written file to verify if it's create.
              Verify the file created is of right size as desired.
              Copy the file
              Compare the copied file with original to verify the
              content is same.
              Remove copied file.
              Rename file
              Verify renamed file exist using list.
              Remove a directory
        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,bashcmd
        """
        self.cont_count = self.params.get("cont_count", '/run/container/*')
        self.pool_count = self.params.get("pool_count", '/run/pool/*')

        # Create a pool if one does not already exist.
        for _ in range(self.pool_count):
            self.create_pool()
            # perform test for multiple containers.
            for count in range(self.cont_count):
                self.create_cont()
                self.start_dfuse(count)
                abs_dir_path = os.path.join(
                    self.dfuse.mount_dir.value, self.dir_name)
                abs_file_path1 = os.path.join(abs_dir_path, self.file_name1)
                abs_file_path2 = os.path.join(abs_dir_path, self.file_name2)
                # list of commands to be executed.
                commands = [
                    u"mkdir -p {}".format(abs_dir_path),
                    u"touch {}".format(abs_file_path1),
                    u"ls -a {}".format(abs_file_path1),
                    u"rm {}".format(abs_file_path1),
                    u"dd if=/dev/zero of={} count={} bs={}".format(
                        abs_file_path1, self.dd_count, self.dd_blocksize),
                    u"ls -al {}".format(abs_file_path1),
                    # Adjacent literals keep source indentation out of the
                    # shell command.
                    (u"filesize=$(stat -c%s '{}'); "
                     u"if (( filesize != {}*{} )); then exit 1; "
                     u"fi").format(
                         abs_file_path1, self.dd_count, self.dd_blocksize),
                    u"cp -r {} {}".format(abs_file_path1, abs_file_path2),
                    u"cmp --silent {} {}".format(
                        abs_file_path1, abs_file_path2),
                    u"rm {}".format(abs_file_path2),
                    u"mv {} {}".format(abs_file_path1, abs_file_path2),
                    u"ls -al {}".format(abs_file_path2),
                    u"rm {}".format(abs_file_path2),
                    u"rmdir {}".format(abs_dir_path),
                ]
                for cmd in commands:
                    try:
                        # execute bash cmds
                        ret_code = general_utils.pcmd(
                            self.hostlist_clients, cmd, timeout=30)
                        # Only treated as an error when no host returned 0
                        if 0 not in ret_code:
                            error_hosts = NodeSet(
                                ",".join(
                                    [str(node_set)
                                     for code, node_set in ret_code.items()
                                     if code != 0]))
                            raise CommandFailure(
                                "Error running '{}' on the following "
                                "hosts: {}".format(cmd, error_hosts))
                    # report error if any command fails
                    except CommandFailure as error:
                        self.log.error(
                            "BashCmd Test Failed: %s", str(error))
                        self.fail("Test was expected to pass but "
                                  "it failed.\n")

                # stop dfuse
                self.dfuse.stop()
                # destroy container
                self.container.destroy()
            # destroy pool
            self.pool.destroy()
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')

        # Until DAOS-3320 is resolved run IOR for POSIX
        # with single client node
        if self.ior_cmd.api.value == "POSIX":
            self.hostlist_clients = [self.hostlist_clients[0]]
            self.hostfile_clients = write_host_file.write_host_file(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)

        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile"):
        """Execute ior against a pool and a newly created container.

        A pool is created if needed and a container is always created.
        For the POSIX api dfuse is started first and the test file is
        placed in its mount directory.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to
                None.
            test_file_suffix (str, optional): suffix to add to the end of
                the test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.

        Returns:
            CmdResult: result of the ior command execution. For the POSIX
                api with a transfer size of "256B" ior is not run and a
                message string is returned instead.

        """
        self.update_ior_cmd_with_pool()
        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # The 256B transfer size case is skipped for POSIX
            if self.ior_cmd.transfer_size.value == "256B":
                return "Skipping the case for transfer_size=256B"
            self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(
            self.get_ior_job_manager_command(), self.processes, intercept)

        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Always create a container
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        self.pool.connect()
        self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        The test fails if the ior api is not supported or if mpich is
        reported as not installed on the client hosts.

        Returns:
            Mpirun: job manager object wrapping self.ior_cmd

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DAOS mode
        if self.ior_cmd.api.value in ["MPIIO", "DAOS", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        return Mpirun(self.ior_cmd, mpitype="mpich")

    def run_ior(self, manager, processes, intercept=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): job manager object, as returned by
                get_ior_job_manager_command()
            processes (int): number of host processes
            intercept (str, optional): path to interception library.
                Defaults to None.

        Returns:
            the value returned by manager.run()

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, self.hostfile_clients, processes)
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            # The pool space is displayed after the run, pass or fail
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Run two ior jobs in parallel over one dfuse mount.

        Args:
            results (dict): dictionary in which each job stores its ior
                metrics, keyed by job number (1 and 2)
            intercept (str, optional): path to the interception library,
                used by the first job only. Shall be used only for POSIX
                through DFUSE. Defaults to None.

        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run in parallel.
        # Job1 uses all clients but the last, set up to use dfuse +
        # interception library
        # Job2 uses the last client, set up to use only dfuse.
        job1 = self.get_new_job(
            self.hostlist_clients[:-1], 1, results, intercept)
        job2 = self.get_new_job(
            [self.hostlist_clients[-1]], 2, results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts the ior job would run against.
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (str, optional): Path to interception library.
                Defaults to None.

        Returns:
            threading.Thread: thread (not yet started) that will call
                run_multiple_ior()

        """
        hostfile = write_host_file.write_host_file(
            clients, self.workdir, self.hostfile_clients_slots)
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[hostfile, len(clients), results, job_num, intercept])
        return job

    def run_multiple_ior(self, hostfile, num_clients, results, job_num,
                         intercept=None):
        # pylint: disable=too-many-arguments
        """Run one of the parallel IOR jobs and record its metrics.

        Args:
            hostfile (str): host file to use for this job
            num_clients (int): number of client hosts in the host file
            results (dict): dictionary in which the ior metrics are stored
                under job_num
            job_num (int): assigned job number
            intercept (str, optional): path to interception library.
                Defaults to None.

        """
        # self.ior_cmd is shared between the jobs, so it is updated and the
        # job manager is set up while holding the lock. The context manager
        # releases the lock even if this section raises, so the other job
        # cannot be left blocked on it.
        with self.lock:
            tsize = self.ior_cmd.transfer_size.value
            testfile = os.path.join(
                self.dfuse.mount_dir.value,
                "testfile{}{}".format(tsize, job_num))
            if intercept:
                testfile += "intercept"
            self.ior_cmd.test_file.update(testfile)
            manager = self.get_ior_job_manager_command()
            procs = (
                self.processes // len(self.hostlist_clients)) * num_clients
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
            if intercept:
                env["LD_PRELOAD"] = intercept
            manager.setup_command(env, hostfile, procs)

        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            # results is shared between the jobs as well
            with self.lock:
                results[job_num] = IorCommand.get_ior_metrics(out)
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        The test fails if the free space consumed since original_pool_info
        was taken is less than the aggregate total reported by ior_cmd.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes

        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If Transfer size is < 4K, Pool size will verified against NVMe, else
        # it will be checked against SCM
        # NOTE(review): transfer_size.value is compared with the string
        # "256B" in run_ior_with_pool - confirm it is numeric here.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is > 4K,Size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K,Size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
class MdtestBase(TestWithServers):
    """Common base class for tests that run mdtest.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a MdtestBase object."""
        super(MdtestBase, self).__init__(*args, **kwargs)
        # All of these are populated by setUp() or by the helper methods
        self.daos_cmd = None
        self.dfuse = None
        self.hostfile_clients_slots = None
        self.processes = None
        self.mdtest_cmd = None

    def setUp(self):
        """Prepare the log files, servers/agents and mdtest command."""
        # Use separate log files for this test
        self.update_log_file_names()

        # Bring up the servers and agents
        super(MdtestBase, self).setUp()

        # daos command wrapper used to create the container
        self.daos_cmd = DaosCommand(self.bin)

        # Load the mdtest settings from the test parameters
        self.mdtest_cmd = MdtestCommand()
        self.mdtest_cmd.get_params(self)
        self.processes = self.params.get(
            "np", '/run/mdtest/client_processes/*')
        self.manager = self.params.get(
            "manager", '/run/mdtest/*', "MPICH")

        self.log.info('Clients %s', self.hostlist_clients)
        self.log.info('Servers %s', self.hostlist_servers)

    def tearDown(self):
        """Stop dfuse if it is still around, then the servers and agents."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # The servers and agents are stopped even if dfuse stop fails
            super(MdtestBase, self).tearDown()

    def create_pool(self):
        """Create the pool used by the test and store it in self.pool."""
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())

        # Apply the pool parameters from the test yaml, then create it
        self.pool.get_params(self)
        self.pool.create()

    def _create_cont(self):
        """Create a container in self.pool with the daos command.

        Returns:
            str: UUID of the created container

        """
        result = self.daos_cmd.container_create(
            pool=self.pool.uuid,
            svc=self.pool.svc_ranks,
            cont_type=self.params.get("type", "/run/container/*"))

        # The UUID is parsed out of the "created container <uuid>" output
        match = re.search(
            r"created\s+container\s+([0-9a-f-]+)", result.stdout)
        if match is None:
            self.fail(
                "Error obtaining the container uuid from: {}".format(
                    result.stdout))
        return match.group(1)

    def _start_dfuse(self):
        """Create a Dfuse object for the pool/container and start it."""
        # Build the dfuse command from the test parameters
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # Point dfuse at the pool and at a freshly created container
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())
        self.dfuse.set_dfuse_exports(
            self.server_managers[0], self.client_log)

        try:
            self.dfuse.run()
        except CommandFailure as failure:
            self.log.error(
                "Dfuse command %s failed on hosts %s",
                str(self.dfuse), self.dfuse.hosts, exc_info=failure)
            self.fail("Unable to launch Dfuse.\n")

    def execute_mdtest(self):
        """Run mdtest, mounting dfuse first when the api is POSIX."""
        # A pool is only created when the test has not provided one
        if self.pool is None:
            self.create_pool()

        # Hand the server group and pool to the mdtest command
        self.mdtest_cmd.set_daos_params(self.server_group, self.pool)

        if self.mdtest_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            mount_point = self.dfuse.mount_dir.value
            self.mdtest_cmd.test_dir.update(mount_point)

        # Run Mdtest with the selected job manager
        job_manager = self.get_mdtest_job_manager_command(self.manager)
        self.run_mdtest(job_manager, self.processes)

        # Unmount dfuse once mdtest has completed
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None

    def get_mdtest_job_manager_command(self, manager):
        """Get the MPI job manager command for Mdtest.

        Args:
            manager (str): "MPICH" selects Mpirun; any other value selects
                Orterun

        Returns:
            JobManager: the object for the mpi job manager command

        """
        if manager != "MPICH":
            return Orterun(self.mdtest_cmd)

        # mpich has to be available on the clients to use Mpirun
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        return Mpirun(self.mdtest_cmd, mpitype="mpich")

    def run_mdtest(self, manager, processes):
        """Run the Mdtest command through the given job manager.

        Args:
            manager (JobManager): mpi job manager object, as returned by
                get_mdtest_job_manager_command()
            processes (int): number of host processes

        """
        mdtest_env = self.mdtest_cmd.get_default_env(
            str(manager), self.client_log)
        manager.assign_hosts(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(mdtest_env)

        try:
            self.pool.display_pool_daos_space()
            manager.run()
        except CommandFailure as failure:
            self.log.error("Mdtest Failed: %s", str(failure))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            # The pool space is displayed after the run, pass or fail
            self.pool.display_pool_daos_space()
class FioBase(TestWithServers):
    """Base fio class.

    Holds the fio command, an optional dfuse mount and the helpers needed to
    create the pool/container that back a POSIX fio run.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a FioBase object."""
        super(FioBase, self).__init__(*args, **kwargs)
        self.fio_cmd = None
        self.processes = None
        self.manager = None
        self.dfuse = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()

        # Start the servers and agents
        super(FioBase, self).setUp()

        # removing runner node from hostlist_client, only need one client node.
        self.hostlist_clients = self.hostlist_clients[:-1]

        # Get the parameters for Fio
        self.fio_cmd = FioCommand()
        self.fio_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/fio/client_processes/*')
        self.manager = self.params.get("manager", '/run/fio/*', "MPICH")

    def tearDown(self):
        """Tear down each test case."""
        try:
            # Drop the reference to the Dfuse object before the servers and
            # agents are stopped
            self.dfuse = None
        finally:
            # Stop the servers and agents
            super(FioBase, self).tearDown()

    def _create_pool(self):
        """Create a TestPool object from the test params and create the pool."""
        # Get the pool params
        # pylint: disable=attribute-defined-outside-init
        self.pool = TestPool(self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def _create_cont(self):
        """Create a POSIX container with the daos command line tool.

        The test is failed if the command cannot be launched, exits with a
        non-zero status, or prints fewer tokens than expected.

        Returns:
            str: the fourth whitespace-separated token of the command output,
                which this test uses as the container UUID

        """
        # TO-DO: Enable container using TestContainer object,
        # once DAOS-3355 is resolved.
        # Get Container params
        # self.container = TestContainer(self.pool)
        # self.container.get_params(self)

        # create container
        # self.container.create()
        env = Dfuse(self.hostlist_clients, self.tmp).get_default_env()

        # command to create container of posix type
        cmd = env + "daos cont create --pool={} --svc={} --type=POSIX".format(
            self.pool.uuid,
            ":".join([str(item) for item in self.pool.svc_ranks]))

        try:
            # universal_newlines=True yields text (not bytes) output so the
            # parsed UUID can be used directly in later command strings
            container = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True,
                universal_newlines=True)
            output, _ = container.communicate()
        except OSError as error:
            self.fail("Container create failed:{}".format(error))

        # Popen does not raise on a non-zero exit status, so it has to be
        # checked explicitly
        if container.returncode != 0:
            self.fail(
                "Container create failed:'{}' exited with status {}".format(
                    cmd, container.returncode))

        tokens = output.split()
        if len(tokens) < 4:
            self.fail(
                "Container create failed:unexpected output '{}'".format(
                    output))

        self.log.info("Container created with UUID %s", tokens[3])
        return tokens[3]

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.

        A new container is created through _create_cont() and dfuse is
        started on the client hosts; the test is failed if dfuse cannot be
        launched.
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp, self.basepath)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self._create_cont())

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Unable to launch Dfuse.\n")

    def execute_fio(self):
        """Runner method for Fio."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self._create_pool()

        # start dfuse if api is POSIX
        if self.fio_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            # Uncomment below two lines once DAOS-3355 is resolved
            # self.pool.connect()
            # self.create_cont()
            self._start_dfuse()
            # Point fio at the dfuse mount point
            self.fio_cmd.update(
                "global", "directory", self.dfuse.mount_dir.value,
                "fio --name=global --directory")

        # Run Fio
        self.fio_cmd.hosts = self.hostlist_clients
        self.fio_cmd.run()
class IorTestBase(TestWithServers):
    """Base IOR test class.

    Provides the helpers to create a pool/container, optionally start dfuse
    and run one or several IOR commands through an MPI job manager.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None
        # Overwritten in setUp() with the value read from the test yaml
        self.subprocess = False

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse.

        The test is failed if dfuse cannot be launched.
        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If it is true, create pool and
                container else just run the ior. Defaults to True.
            create_cont (bool, optional): Create new container. Default is True
            stop_dfuse (bool, optional): Stop dfuse after ior command is
                finished. Default is True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(
            self.get_ior_job_manager_command(), self.processes, intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool.

        Args:
            create_cont (bool, optional): connect to the pool and create a new
                container before updating the ior params. Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        The created object is also stored in self.mpirun.

        Returns:
            Mpirun: the mpi job manager object wrapping the ior command

        """
        # Initialize MpioUtils if IOR is running in MPIIO or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check that the ior subprocess is running the given operation.

        Args:
            operation (str, optional): either "write" or "read"; selects the
                ior output pattern to look for. Defaults to "write".
        """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            # Adjacent literals are used instead of a backslash continuation
            # inside the string, which embedded the source indentation in the
            # message
            self.fail(
                "Exiting Test: Inappropriate operation type "
                "for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (JobManager): mpi job manager object, e.g. the one
                returned by get_ior_job_manager_command()
            processes (int): number of host processes
            intercept (str, optional): path to interception library. Defaults
                to None.
            display_space (bool, optional): display the pool space before and
                after the run. Defaults to True.

        Returns:
            object: whatever manager.run() returns

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)

        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            # Output is only inspected when ior was not started as a
            # subprocess
            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop the IOR process started through self.mpirun.

        Returns:
            object: whatever self.mpirun.stop() returns

        """
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", str(self.mpirun))

        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Run two ior jobs in parallel threads on top of one dfuse mount.

        Args:
            results (dict): A dictionary object to store the ior metrics,
                keyed by job number
            intercept (str, optional): path to the interception library. Shall
                be used only for POSIX through DFUSE. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run in parallel.
        # Job1 will have 3 client set up to use dfuse + interception
        # library
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(
            self.hostlist_clients[:-1], 1, results, intercept)
        job2 = self.get_new_job(
            [self.hostlist_clients[-1]], 2, results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (path): Path to interception library

        Returns:
            threading.Thread: an unstarted thread targeting run_multiple_ior

        """
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults
                to None.
        """
        # The shared ior_cmd is configured under the lock. The context manager
        # guarantees the lock is released even if one of these calls raises,
        # so the other job thread cannot be left blocked forever.
        with self.lock:
            tsize = self.ior_cmd.transfer_size.value
            testfile = os.path.join(
                self.dfuse.mount_dir.value,
                "testfile{}{}".format(tsize, job_num))
            if intercept:
                testfile += "intercept"
            self.ior_cmd.test_file.update(testfile)
            manager = self.get_ior_job_manager_command()
            procs = (
                self.processes // len(self.hostlist_clients)) * len(clients)
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
            if intercept:
                env["LD_PRELOAD"] = intercept
            manager.assign_hosts(
                clients, self.workdir, self.hostfile_clients_slots)
            manager.assign_processes(procs)
            manager.assign_environment(env)

        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            with self.lock:
                results[job_num] = IorCommand.get_ior_metrics(out)
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        The test is failed if the free space consumed since
        original_pool_info was taken is smaller than the aggregate amount of
        data ior is expected to have written.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is 4K or more the pool size is verified against
        # NVMe (index 1), else it is verified against SCM (index 0)
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is > 4K,Size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K,Size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd

        Args:
            cmd (str): String command to be executed
            fail_on_err (bool): Boolean for whether to fail the test if command
                execution returns non zero return code.
            display_output (bool): Boolean for whether to display output.

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output,
                timeout=300)
            # Any non-zero return code is an error, even if the command
            # succeeded on some of the other hosts
            failed_hosts = [
                str(node_set) for code, node_set in ret.items() if code != 0]
            if failed_hosts and fail_on_err:
                raise CommandFailure(
                    "Error running '{}' on the following "
                    "hosts: {}".format(cmd, NodeSet(",".join(failed_hosts))))

        # report error if any command fails
        except CommandFailure as error:
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but "
                      "it failed.\n")
        return ret
class DfuseTestBase(TestWithServers):
    """Base class for tests that need to start and stop Dfuse.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a DfuseTestBase object."""
        super(DfuseTestBase, self).__init__(*args, **kwargs)
        self.dfuse = None

    def tearDown(self):
        """Tear down each test case."""
        try:
            self.stop_dfuse()
        finally:
            # Stop the servers and agents
            super(DfuseTestBase, self).tearDown()

    def start_dfuse(self, hosts, pool=None, container=None, mount_dir=None):
        """Create a DfuseCommand object and use it to start Dfuse.

        The test is failed if the dfuse command cannot be started.

        Args:
            hosts (list): list of hosts on which to start Dfuse
            pool (TestPool, optional): pool to use with Dfuse. Defaults to
                None, in which case the pool params are left as read from the
                test yaml.
            container (TestContainer, optional): container to use with Dfuse.
                Defaults to None, in which case the container param is left as
                read from the test yaml.
            mount_dir (str, optional): updated mount dir name. Defaults to
                None, in which case the mount dir is left as read from the
                test yaml.
        """
        self.dfuse = Dfuse(hosts, self.tmp)
        self.dfuse.get_params(self)

        # Update dfuse params, overriding only what the caller provided
        if mount_dir:
            self.dfuse.mount_dir.update(mount_dir)
        if pool:
            self.dfuse.set_dfuse_params(pool)
        if container:
            self.dfuse.set_dfuse_cont_param(container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # Start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.")

    def stop_dfuse(self):
        """Stop Dfuse and unset the DfuseCommand object.

        Does nothing when Dfuse was never started or was already stopped.
        """
        if self.dfuse:
            self.dfuse.stop()
            self.dfuse = None