def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run a single IOR command against ``pool`` through an mpich Mpirun job.

    Meant to be used as a thread target: the call blocks until the IOR run
    finishes and reports a failed run by putting "FAIL" on ``results``.
    A new container UUID is generated for every call.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence; test[2] is used as the transfer
            size and test[3] as the block size
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        # Report the hosts that were actually checked; indexing the
        # hostfile value would only yield its first element/character.
        self.fail("Exiting Test : Mpich not installed on :"
                  " {}".format(self.hostlist_clients))
    self.pool = pool

    # Define the arguments for the IOR command
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.daos_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)

    # Define the job manager for the IOR command; each run gets its own
    # freshly generated container uuid.
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.daos_cont.update(str(uuid.uuid4()))
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure:
        results.put("FAIL")
def ior_bg_thread(self, results):
    """Run a background IOR job: write once, then re-read until a failure.

    A small data set is written to a newly created container and is then
    read back in an endless loop. The first failing run (write or read)
    puts "FAIL" on ``results`` and ends the thread.

    Args:
        results (queue): queue for returning thread results

    """
    if MpioUtils().mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Build the background IOR command from the yaml parameters, borrowing
    # most settings from the foreground IOR command.
    bg_cmd = IorCommand()
    bg_cmd.get_params(self)
    bg_cmd.set_daos_params(self.server_group, self.pool)
    bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    bg_cmd.api.update(self.ior_cmd.api.value)
    bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    bg_cmd.flags.update(self.ior_cmd.flags.value)
    bg_cmd.test_file.update('/testfile_background')

    # Single-process mpich job manager working on its own container
    runner = Mpirun(bg_cmd, mpitype="mpich")
    self.create_cont()
    runner.job.dfs_cont.update(self.container.uuid)
    job_env = bg_cmd.get_default_env(str(runner))
    runner.assign_hosts(self.hostlist_clients, self.workdir, None)
    runner.assign_processes(1)
    runner.assign_environment(job_env, True)
    print('----Run IOR in Background-------')

    def run_once():
        """Run the job once; report and return False when it fails."""
        try:
            runner.run()
        except (CommandFailure, TestFail):
            results.put("FAIL")
            return False
        return True

    # Initial write; nothing to read back if this fails
    if not run_once():
        return

    # Switch to the read flags and keep reading until a run fails
    bg_cmd.flags.update(self.ior_read_flags)
    while run_once():
        pass
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Invoke the IOR command through the job manager.

    Containers are tracked in ``self.container_info`` under a key built
    from the object class, api and transfer size. A run whose flags
    contain "-w" registers a new container uuid under that key; any other
    run reuses the uuid stored by an earlier write with the same key.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence; test[0] is used as the transfer
            size and test[1] as the block size
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Raises:
        KeyError: if the flags do not contain "-w" and no container was
            registered earlier for this oclass/api/transfer size.

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")
    self.pool = pool

    # Define the arguments for the IOR command
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[0])
    ior_cmd.block_size.update(test[1])
    ior_cmd.flags.update(flags)

    # Build the lookup key once so the store and the fetch cannot diverge
    key = "{}{}{}".format(oclass, api, test[0])
    if "-w" in flags:
        self.container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.dfs_cont.update(self.container_info[key])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure:
        results.put("FAIL")
class OSAUtils(IorTestBase):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: Helper methods shared by test cases: pool
    queries (leader, version, rebuild status), single object write/verify
    and an IOR thread runner.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAUtils, self).setUp()
        self.container = None
        self.obj = None
        self.ioreq = None
        self.dmg_command = self.get_dmg_command()
        # Each yaml value is read as a list; only its first entry is used
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*',
                                           default=[0])[0]
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*',
                                           default=[0])[0]
        self.record_length = self.params.get("length", '/run/record/*',
                                             default=[0])[0]

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_rebuild_status(self):
        """Get the rebuild status.

        Returns:
            str: rebuild status

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return data["rebuild"]["status"]

    @fail_on(CommandFailure)
    def is_rebuild_done(self, time_interval):
        """Poll the rebuild status until it is "done" or "completed".

        The status is queried at most 21 times, waiting ``time_interval``
        seconds after every query that does not report completion.

        Args:
            time_interval: Wait interval between checks

        Returns:
            bool: True if the rebuild status became "done" or "completed",
                False if it did not within the allowed number of checks.

        """
        completion_flag = ["done", "completed"]
        for _ in range(21):
            if self.get_rebuild_status() in completion_flag:
                # Return right away; there is nothing left to wait for
                return True
            time.sleep(time_interval)
        return False

    @fail_on(CommandFailure)
    def assert_on_rebuild_failure(self):
        """Raise an assertion if the rebuild status indicates a failure."""
        rebuild_status = self.get_rebuild_status()
        self.log.info("Rebuild Status: %s", rebuild_status)
        rebuild_failed_string = ["failed", "scanning", "aborted", "busy"]
        self.assertTrue(rebuild_status not in rebuild_failed_string,
                        "Rebuild failed")

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    @fail_on(DaosApiError)
    def write_single_object(self):
        """Write some data to the existing pool.

        Creates a container and an object and inserts one record for every
        dkey/akey combination. Each record is the first character of the
        akey index repeated ``self.record_length`` times.
        """
        self.pool.connect(2)
        csum = self.params.get("enable_checksum", '/run/container/*')
        self.container = DaosContainer(self.context)
        input_param = self.container.cont_input_values
        input_param.enable_chksum = csum
        self.container.create(poh=self.pool.pool.handle,
                              con_prop=input_param)
        self.container.open()
        self.obj = DaosObj(self.context, self.container)
        self.obj.create(objcls=1)
        self.obj.open()
        self.ioreq = IORequest(self.context, self.container, self.obj,
                               objtype=4)
        self.log.info("Writing the Single Dataset")
        # NOTE(review): create_string_buffer() is given str values here,
        # which Python 3 rejects (bytes required) - confirm the interpreter
        # this test targets.
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0])
                          * self.record_length)
                d_key_value = "dkey {0}".format(dkey)
                c_dkey = ctypes.create_string_buffer(d_key_value)
                a_key_value = "akey {0}".format(akey)
                c_akey = ctypes.create_string_buffer(a_key_value)
                c_value = ctypes.create_string_buffer(indata)
                c_size = ctypes.c_size_t(ctypes.sizeof(c_value))
                self.ioreq.single_insert(c_dkey, c_akey, c_value, c_size)
        self.obj.close()
        self.container.close()

    @fail_on(DaosApiError)
    def verify_single_object(self):
        """Verify the container data on the existing pool.

        Re-reads every record written by write_single_object() and fails
        the test on the first mismatch.
        """
        self.pool.connect(2)
        self.container.open()
        self.obj.open()
        self.log.info("Single Dataset Verification -- Started")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) *
                          self.record_length)
                c_dkey = ctypes.create_string_buffer("dkey {0}".format(dkey))
                c_akey = ctypes.create_string_buffer("akey {0}".format(akey))
                # Fetch one byte more than the payload length
                val = self.ioreq.single_fetch(c_dkey,
                                              c_akey,
                                              len(indata) + 1)
                if indata != (repr(val.value)[1:-1]):
                    self.d_log.error("ERROR:Data mismatch for "
                                     "dkey = {0}, "
                                     "akey = {1}".format(
                                         "dkey {0}".format(dkey),
                                         "akey {0}".format(akey)))
                    self.fail(
                        "ERROR: Data mismatch for dkey = {0}, akey={1}".format(
                            "dkey {0}".format(dkey),
                            "akey {0}".format(akey)))
        self.obj.close()
        self.container.close()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR command against ``pool`` via an mpich job.

        Meant to be used as a thread target; a failed IOR run is reported
        by putting "FAIL" on ``results``. A new container uuid is
        generated for every call.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence; test[2] is used as the
                transfer size and test[3] as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results

        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            # Report the hosts that were actually checked; indexing the
            # hostfile value would only yield its first element/character.
            self.fail("Exiting Test : Mpich not installed on :"
                      " {}".format(self.hostlist_clients))
        self.pool = pool

        # Define the arguments for the IOR command
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        self.job_manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(self.processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure:
            results.put("FAIL")
class NvmePoolCapacity(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Test class Description: Verify NOSPC
    condition is reported when accessing data beyond
    pool size.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(NvmePoolCapacity, self).setUp()

        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR command against ``pool`` via an mpich job.

        Used as a thread target by test_run(); a failed IOR run is
        reported by putting "FAIL" on ``results``. A new container uuid is
        generated for every call.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence; test[2] is used as the
                transfer size and test[3] as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool

        # Define the arguments for the IOR command
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        self.job_manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure:
            results.put("FAIL")

    def test_create_delete(self, num_pool=2, num_cont=5, total_count=100,
                           scm_size=100000000000, nvme_size=300000000000):
        """
        Test Description:
            This method is used to create/delete pools
            for a long run. It verifies the NVME free space
            during this process.
        Args:
            num_pool (int): Total pools for running test
            num_cont (int): Total containers created on each pool
            total_count (int): Total times the test is run in a loop
            scm_size (int): SCM size used in the testing
            nvme_size (int): NVME size used in the testing
        Returns:
            None
        """
        pool = {}
        cont = {}

        for loop_count in range(0, total_count):
            self.log.info("Running test %s", loop_count)
            # Free NVMe space of each pool right after its creation, so
            # every pool is later compared against its own baseline.
            nvme_size_begin = {}
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
                # Floor division keeps the sizes integral on Python 3.
                temp = int(scm_size) // num_pool
                pool[val].scm_size.update(str(temp))
                temp = int(nvme_size) // num_pool
                pool[val].nvme_size.update(str(temp))
                pool[val].create()
                self.pool = pool[val]
                display_string = "pool{} space at the Beginning".format(val)
                self.pool.display_pool_daos_space(display_string)
                nvme_size_begin[val] = self.pool.get_pool_free_space("NVME")
                for cont_val in range(0, num_cont):
                    cont[cont_val] = TestContainer(pool[val])

            m_leak = 0

            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                nvme_size_end = self.pool.get_pool_free_space("NVME")
                pool[val].destroy()
                # Remember only the first pool (1-based) whose size differs
                if (nvme_size_begin[val] != nvme_size_end) and (m_leak == 0):
                    m_leak = val + 1

            # After destroying pools, check memory leak for each test loop.
            if m_leak != 0:
                self.fail("Memory leak : iteration {0} \n".format(m_leak))

    def test_run(self, num_pool=1):
        """
        Method Description:
            This method is called with different test_cases.
        Args:
            num_pool (int): Total pools for running a test.
        Returns:
            None
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}

        # Iterate through IOR different ior test sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_test_sequence,
                                                self.ior_flags):
            # Create the IOR threads
            threads = []
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
                # Floor division keeps the sizes integral on Python 3.
                pool[val].scm_size.value = int(test[0]) // num_pool
                pool[val].nvme_size.value = int(test[1]) // num_pool
                pool[val].create()
                display_string = "pool{} space at the Beginning".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)

                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={"pool": pool[val],
                                                 "oclass": oclass,
                                                 "api": api,
                                                 "test": test,
                                                 "flags": flags,
                                                 "results": self.out_queue}))

            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(5)

            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            # Test should fail with ENOSPC.
            # Each result is fetched exactly once: a second get() would
            # drop a result and block forever on an empty queue.
            # NOTE(review): only "FAIL" is ever queued by ior_thread, so an
            # expected failure that never happens is not detected here.
            while not self.out_queue.empty():
                result = self.out_queue.get()
                if (result == "FAIL" and test[4] == "PASS") \
                        or (result != "FAIL" and test[4] == "FAIL"):
                    self.fail("FAIL")

            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                self.pool.destroy()

    def test_nvme_pool_capacity(self):
        """Jira ID: DAOS-2085.

        Test Description:
            Purpose of this test is to verify whether DAOS stack
            report NOSPC when accessing data beyond pool size.
            Use Cases
            Test Case 1 or 2:
             1. Perform IO less than entire SSD disk space.
             2. Perform IO beyond entire SSD disk space.
            Test Case 3:
             3. Create Pool/Container and destroy them several times.

        Use case:
        :avocado: tags=all,hw,medium,ib2,nvme,full_regression
        :avocado: tags=nvme_pool_capacity
        """
        # Run test with one pool.
        self.log.info("Running Test Case 1 with one Pool")
        self.test_run(1)
        time.sleep(5)
        # Run test with two pools.
        self.log.info("Running Test Case 1 with two Pools")
        self.test_run(2)
        time.sleep(5)
        # Run Create/delete pool/container
        self.log.info("Running Test Case 3: Pool/Cont Create/Destroy")
        self.test_create_delete(10, 50, 100)
def test_rebuild_container_create(self):
    """Jira ID: DAOS-1168.

    Test Description:
        Configure 4 servers and 1 client with 1 or 2 pools and a pool
        service leader quantity of 2.  Add 1 container to the first pool
        configured with 3 replicas.  Populate the container with 1GB of
        objects.  Exclude a server that has shards of this object and
        verify that rebuild is initiated.  While rebuild is active, create
        1000 additional containers in the same pool or the second pool
        (when available).  Finally verify that rebuild completes and the
        pool info indicates the correct number of rebuilt objects and
        records.  Also confirm that all 1000 additional containers created
        during rebuild are accessible.

    Use Cases:
        Basic rebuild of container objects of array values with sufficient
        numbers of rebuild targets and no available rebuild targets.

    :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
    """
    # Read the test parameters
    targets = self.params.get("targets", "/run/server_config/*")
    pool_qty = self.params.get("pools", "/run/test/*")
    loop_qty = self.params.get("loops", "/run/test/*")
    cont_qty = self.params.get("containers", "/run/test/*")
    cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
    rank = self.params.get("rank", "/run/test/*")
    use_ior = self.params.get("use_ior", "/run/test/*", False)
    node_qty = len(self.hostlist_servers)

    # Define (but do not create yet) the requested number of pools
    self.pool = []
    for _ in range(pool_qty):
        new_pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.append(new_pool)
        new_pool.get_params(self)

    if use_ior:
        # Prepare the ior job manager
        mpirun = Mpirun(IorCommand())
        mpirun.job.get_params(self)
        mpirun.assign_hosts(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
        mpirun.assign_processes(len(self.hostlist_clients))
        mpirun.assign_environment(mpirun.job.get_default_env("mpirun"))

    # Cancel any tests with tickets already assigned
    if rank in (1, 2):
        self.cancelForTicket("DAOS-2434")

    # One error counter per loop
    errors = [0] * loop_qty
    for loop in range(loop_qty):
        # Log the start of the loop
        loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
        self.log.info("%s", "-" * 80)
        self.log.info("%s: Starting loop", loop_id)

        # Start this loop with a fresh list of containers
        self.container = []

        # Create the pools and record the values expected for each of them
        info_checks = []
        rebuild_checks = []
        for test_pool in self.pool:
            test_pool.create()
            expected_info = {
                "pi_uuid": test_pool.uuid,
                "pi_ntargets": node_qty * targets,
                "pi_nnodes": node_qty,
                "pi_ndisabled": 0,
            }
            expected_rebuild = {
                "rs_errno": 0,
                "rs_done": 1,
                "rs_obj_nr": 0,
                "rs_rec_nr": 0,
            }
            info_checks.append(expected_info)
            rebuild_checks.append(expected_rebuild)

        # Check the pool info
        status = True
        for test_pool, expected_info, expected_rebuild in zip(
                self.pool, info_checks, rebuild_checks):
            status &= test_pool.check_pool_info(**expected_info)
            status &= test_pool.check_rebuild_status(**expected_rebuild)
            test_pool.display_pool_daos_space("after creation")
        self.assertTrue(
            status,
            "Error verifying pool info prior to excluding rank {}".format(
                rank))

        # Create a container with 1GB of data in the first pool
        first_pool = self.pool[0]
        if use_ior:
            mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
            mpirun.job.dfs_destroy.update(False, "ior.dfs_destroy")
            mpirun.job.set_daos_params(self.server_group, first_pool)
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with data",
                loop_id, first_pool.uuid, mpirun.job.dfs_cont.value)
            self.run_ior(loop_id, mpirun)
        else:
            data_cont = TestContainer(first_pool)
            self.container.append(data_cont)
            data_cont.get_params(self)
            data_cont.create()
            self.log.info(
                "%s: Writing to pool %s to fill container %s with data",
                loop_id, first_pool.uuid, data_cont.uuid)
            data_cont.object_qty.value = 8
            data_cont.record_qty.value = 64
            data_cont.data_size.value = 1024 * 1024
            data_cont.write_objects(rank, cont_obj_cls)
            rank_list = data_cont.get_target_rank_lists(
                " after writing data")
            data_cont.get_target_rank_count(rank, rank_list)

        # Display the updated pool space usage
        for test_pool in self.pool:
            test_pool.display_pool_daos_space("after container creation")

        # Exclude the first rank from the first pool to initiate rebuild
        self.pool[0].start_rebuild([rank], self.d_log)

        # Wait for rebuild to start
        self.pool[0].wait_for_rebuild(True, 1)

        # Create additional containers in the last pool
        start_index = len(self.container)
        self.add_containers_during_rebuild(
            loop_id, cont_qty, self.pool[0], self.pool[-1])

        # Confirm rebuild completes
        self.pool[0].wait_for_rebuild(False, 1)

        # Check the pool info; only the first pool's expectations change
        info_checks[0]["pi_ndisabled"] += targets
        rebuild_checks[0]["rs_done"] = 1
        rebuild_checks[0]["rs_obj_nr"] = ">=0"
        rebuild_checks[0]["rs_rec_nr"] = ">=0"
        for test_pool, expected_info, expected_rebuild in zip(
                self.pool, info_checks, rebuild_checks):
            status &= test_pool.check_pool_info(**expected_info)
            status &= test_pool.check_rebuild_status(**expected_rebuild)
        self.assertTrue(status, "Error verifying pool info after rebuild")

        # Verify that each of created containers exist by opening them
        for cont_index in range(start_index, len(self.container)):
            count = "{}/{}".format(
                cont_index - start_index + 1,
                len(self.container) - start_index)
            if not self.access_container(loop_id, cont_index, count):
                errors[loop] += 1

        # Destroy the containers created during rebuild
        for cont_index in range(start_index, len(self.container)):
            self.container[cont_index].destroy()

        # Read the data from the container created before rebuild
        if use_ior:
            self.log.info(
                "%s: Running IOR on pool %s to verify container %s",
                loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
            mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
            mpirun.job.dfs_destroy.update(True, "ior.dfs_destroy")
            self.run_ior(loop_id, mpirun)
        else:
            self.log.info(
                "%s: Reading pool %s to verify container %s",
                loop_id, self.pool[0].uuid, self.container[0].uuid)
            self.assertTrue(
                self.container[0].read_objects(),
                "Error verifying data written before rebuild")
            self.container[0].destroy()

        # Destroy the pools
        for test_pool in self.pool:
            test_pool.destroy(1)

        loop_result = "passed" if errors[loop] == 0 else "failed"
        self.log.info("%s: Loop %s", loop_id, loop_result)

    test_result = "passed" if sum(errors) == 0 else "failed"
    self.log.info("Test %s", test_result)
class NvmeEnospace(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate DER_NOSPACE for SCM and NVMe
    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a NvmeEnospace object."""
        super(NvmeEnospace, self).__init__(*args, **kwargs)
        self.daos_cmd = None

    def setUp(self):
        """Set up for test case: daos command, pool and error counters."""
        super(NvmeEnospace, self).setUp()

        # initialize daos command
        self.daos_cmd = DaosCommand(self.bin)
        self.create_pool_max_size()
        self.der_nospace_count = 0
        self.other_errors_count = 0

    def verify_enspace_log(self, der_nospace_err_count):
        """
        Function to verify there are no other error except DER_NOSPACE
        in client log and also DER_NOSPACE count is higher.

        Updates self.der_nospace_count and self.other_errors_count from
        the client log before checking them.

        Args:
            der_nospace_err_count (int): minimum DER_NOSPACE count expected
                in the client log; a lower count fails the test.
        """
        #Get the DER_NOSPACE and other error count from log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        #Check there are no other errors in log file
        if self.other_errors_count > 0:
            self.fail('Found other errors, count {} in client log {}'
                      .format(self.other_errors_count, self.client_log))
        #Check the DER_NOSPACE error count is higher if not test will FAIL
        if self.der_nospace_count < der_nospace_err_count:
            self.fail('Expected DER_NOSPACE should be > {} and Found {}'
                      .format(der_nospace_err_count, self.der_nospace_count))

    def delete_all_containers(self):
        """
        Delete all the containers.
        """
        #List all the container
        kwargs = {"pool": self.pool.uuid}
        data = self.daos_cmd.pool_list_cont(**kwargs)
        containers = data["uuids"]

        #Destroy all the containers
        for _cont in containers:
            kwargs["cont"] = _cont
            self.daos_cmd.container_destroy(**kwargs)

    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail):
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail):
                results.put("FAIL")
                break

    def run_enospace_foreground(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage
        size
        """
        #Fill 75% more of SCM pool,Aggregation is Enabled so NVMe space will be
        #start filling
        print('Starting main IOR load')
        self.start_ior_load(storage='SCM', percent=75)
        print(self.pool.pool_percentage_used())

        #Fill 50% more of SCM pool,Aggregation is Enabled so NVMe space will be
        #filled
        self.start_ior_load(storage='SCM', percent=50)
        print(self.pool.pool_percentage_used())

        #Fill 60% more of SCM pool, now NVMe will be Full so data will not be
        #moved to NVMe but it will start filling SCM. SCM size will be going to
        #full and this command expected to fail with DER_NOSPACE
        try:
            self.start_ior_load(storage='SCM', percent=60)
        except TestFail:
            self.log.info('Test expected to fail because of DER_NOSPACE')
        else:
            # Kept outside the try block: self.fail() reports through
            # TestFail, which the handler above would otherwise swallow.
            self.fail('This test suppose to FAIL because of DER_NOSPACE '
                      'but it got Passed')

        #Display the pool%
        print(self.pool.pool_percentage_used())

        #verify the DER_NO_SAPCE error count is expected and no other Error in
        #client log
        self.verify_enspace_log(self.der_nospace_count)

        #Check both NVMe and SCM are full.
        # NOTE(review): the thresholds below read as if
        # pool_percentage_used() reports the remaining (free) percentage,
        # despite the wording of the failure messages - confirm.
        pool_usage = self.pool.pool_percentage_used()
        #NVMe should be almost full if not test will fail.
        if pool_usage['nvme'] > 8:
            self.fail('Pool NVMe used percentage should be < 8%, instead {}'.
                      format(pool_usage['nvme']))
        #For SCM some % space used for system so it won't be 100% full.
        if pool_usage['scm'] > 50:
            self.fail('Pool SCM used percentage should be < 50%, instead {}'.
                      format(pool_usage['scm']))

    def run_enospace_with_bg_job(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage
        size. Single IOR job will run in background while space is filling.
        """
        #Get the initial DER_ENOSPACE count
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Start the IOR Background thread which will write small data set and
        # read in loop, until storage space is full.
        out_queue = queue.Queue()
        job = threading.Thread(target=self.ior_bg_thread,
                               kwargs={"results": out_queue})
        job.daemon = True
        job.start()

        #Run IOR in Foreground
        self.run_enospace_foreground()
        # Verify the background job queue and make sure no FAIL for any IOR run
        # The background thread reports on the local queue it was handed,
        # so that is the queue that has to be drained here.
        while not out_queue.empty():
            if out_queue.get() == "FAIL":
                self.fail("One of the Background IOR job failed")

    def test_enospace_lazy_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM and NVMe is full with
                          default (lazy) Aggregation mode.

        Use Case: This tests will create the pool and fill 75% of SCM size which
                  will trigger the aggregation because of space pressure,
                  next fill 75% more which should fill NVMe. Try to fill 60%
                  more and now SCM size will be full too.
                  verify that last IO fails with DER_NOSPACE and SCM/NVMe pool
                  capacity is full.One background IO job will be running
                  continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_bg
        """
        print(self.pool.pool_percentage_used())

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_lazy_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (default aggregation mode) and
                          delete all containers in loop, which should release
                          the space.

        Use Case: This tests will create the pool and fill 75% of SCM size which
                  will trigger the aggregation because of space pressure,
                  next fill 75% more which should fill NVMe. Try to fill 60%
                  more and now SCM size will be full too.
                  verify that last IO fails with DER_NOSPACE and SCM/NVMe pool
                  capacity is full. Delete all the containers.
                  Do this in loop for 10 times and verify space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_fg
        """
        print(self.pool.pool_percentage_used())

        #Repeat the test in loop.
        for _loop in range(10):
            print("-------enospc_lazy_fg Loop--------- {}".format(_loop))
            #Run IOR to fill the pool.
            self.run_enospace_foreground()
            #Delete all the containers
            self.delete_all_containers()
            #Delete container will take some time to release the space
            time.sleep(60)

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    def test_enospace_time_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full and it release
                          the size when container destroy with Aggregation
                          set on time mode.

        Use Case: This tests will create the pool. Set Aggregation mode to Time.
                  Start filling 75% of SCM size. Aggregation will be triggered
                  time to time, next fill 75% more which will fill up NVMe.
                  Try to fill 60% more and now SCM size will be full too.
                  Verify last IO fails with DER_NOSPACE and SCM/NVMe pool
                  capacity is full.One background IO job will be running
                  continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_bg
        """
        print(self.pool.pool_percentage_used())

        # Enabled TIme mode for Aggregation.
        self.pool.set_property("reclaim", "time")

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_time_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (time aggregation mode) and
                          delete all containers in loop, which should release
                          the space.

        Use Case: This tests will create the pool. Set Aggregation mode to Time.
                  Start filling 75% of SCM size. Aggregation will be triggered
                  time to time, next fill 75% more which will fill up NVMe.
                  Try to fill 60% more and now SCM size will be full too.
                  Verify last IO fails with DER_NOSPACE and SCM/NVMe pool
                  capacity is full. Delete all the containers.
                  Do this in loop for 10 times and verify space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_fg
        """
        print(self.pool.pool_percentage_used())

        # Enabled TIme mode for Aggregation.
        self.pool.set_property("reclaim", "time")

        #Repeat the test in loop.
        for _loop in range(10):
            print("-------enospc_time_fg Loop--------- {}".format(_loop))
            #Run IOR to fill the pool.
            # Foreground only, as in test_enospace_lazy_with_fg: the
            # background variant would start another never-ending daemon
            # IOR thread on every iteration.
            self.run_enospace_foreground()
            #Delete all the containers
            self.delete_all_containers()
            #Delete container will take some time to release the space
            time.sleep(60)

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    @skipForTicket("DAOS-5403")
    def test_performance_storage_full(self):
        """Jira ID: DAOS-4756.

        Test Description: Verify IO Read performance when pool size is full.

        Use Case: This tests will create the pool. Run small set of IOR as
                  baseline.Start IOR with < 4K which will start filling SCM
                  and trigger aggregation and start filling up NVMe.
                  Check the IOR baseline read number and make sure it's +- 5%
                  to the number ran prior system storage was full.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_performance
        """
        #Write the IOR Baseline and get the Read BW for later comparison.
        print(self.pool.pool_percentage_used())
        #Write First
        self.start_ior_load(storage='SCM', percent=1)
        #Read the baseline data set
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_baseline = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        baseline_cont_uuid = self.ior_cmd.dfs_cont.value
        print("IOR Baseline Read MiB {}".format(max_mib_baseline))

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

        #Read the same container which was written at the beginning.
        self.container.uuid = baseline_cont_uuid
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_latest = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        print("IOR Latest Read MiB {}".format(max_mib_latest))

        #Check if latest IOR read performance is in Tolerance of 5%, when
        #Storage space is full.
        if abs(max_mib_baseline-max_mib_latest) > (max_mib_baseline/100 * 5):
            self.fail('Latest IOR read performance is not under 5% Tolerance'
                      ' Baseline Read MiB = {} and latest IOR Read MiB = {}'
                      .format(max_mib_baseline, max_mib_latest))

    def test_enospace_no_aggregation(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full and it release
                          the size when container destroy with Aggregation
                          disabled.

        Use Case: This tests will create the pool and disable aggregation. Fill
                  75% of SCM size which should work, next try fill 10% more
                  which should fail with DER_NOSPACE. Destroy the container
                  and validate the Pool SCM free size is close to full (> 95%).
                  Do this in loop ~10 times and verify the DER_NOSPACE and SCM
                  free size after container destroy.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_no_aggregation
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        print(self.pool.pool_percentage_used())

        # Disable the aggregation
        self.pool.set_property("reclaim", "disabled")

        #Get the DER_NOSPACE and other error count from log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        #Repeat the test in loop.
        for _loop in range(10):
            print("-------enospc_no_aggregation Loop--------- {}".format(_loop))
            #Fill 40% of SCM pool
            self.start_ior_load(storage='SCM', percent=40)

            print(self.pool.pool_percentage_used())

            try:
                #Fill 40% more to SCM ,which should Fail because no SCM space
                self.start_ior_load(storage='SCM', percent=40)
            except TestFail:
                self.log.info('Expected to fail because of DER_NOSPACE')
            else:
                # Kept outside the try block: self.fail() reports through
                # TestFail, which the handler above would otherwise swallow.
                self.fail('This test suppose to fail because of DER_NOSPACE '
                          'but it got Passed')

            #Verify DER_NO_SAPCE error count is expected and no other Error
            #in client log.
            self.verify_enspace_log(self.der_nospace_count)

            #Delete all the containers
            self.delete_all_containers()

            #Delay to release the SCM size.
            time.sleep(60)
            #Get the pool usage only after the delay, so the check below
            #sees the released space instead of a stale reading.
            pool_usage = self.pool.pool_percentage_used()
            print(pool_usage)
            #SCM pool size should be released (some still be used for system)
            #Pool SCM free % should not be less than 50%
            if pool_usage['scm'] > 55:
                self.fail('SCM pool used percentage should be < 55, instead {}'.
                          format(pool_usage['scm']))

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)
def ior_runner_thread(self, results):
    """Run each configured IOR combination, then destroy its containers.

    One IOR job is run, in sequence, for every combination of object class,
    API, transfer/block size pair and flags taken from the test attributes.
    Each job is given a freshly generated container UUID. After all jobs
    have run, every UUID that was handed to IOR is destroyed through the
    DaosCommand "container destroy" sub-command.

    A CommandFailure raised by IOR or by the destroy command does not stop
    the thread; it is reported by putting "FAIL" on ``results``.

    Args:
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")

    # Every UUID handed to IOR is remembered in a list. A dict keyed by
    # oclass/api/transfer size would let combinations that differ only in
    # flags or block size overwrite each other, so those containers would
    # never be destroyed.
    cont_uuids = []

    cmd = DaosCommand(os.path.join(self.prefix, "bin"))
    cmd.set_sub_command("container")
    cmd.sub_command_class.set_sub_command("destroy")

    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Iterate through the different IOR values and run them in sequence
    for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                            self.ior_apis,
                                            self.ior_transfer_size,
                                            self.ior_flags):
        # Define the IOR command for this combination;
        # test[0] is the transfer size and test[1] the block size.
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)

        # Record the UUID before running so the container is cleaned up
        # even when the IOR run itself fails.
        cont_uuid = str(uuid.uuid4())
        cont_uuids.append(cont_uuid)

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.dfs_cont.update(cont_uuid)
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # Run the IOR command
        try:
            manager.run()
        except CommandFailure:
            results.put("FAIL")

    # Destroy the containers created by this thread
    destroy_cmd = cmd.sub_command_class.sub_command_class
    for cont_uuid in cont_uuids:
        destroy_cmd.pool.value = self.pool.uuid
        destroy_cmd.svc.value = self.pool.svc_ranks
        destroy_cmd.cont.value = cont_uuid
        try:
            # pylint: disable=protected-access
            cmd._get_result()
        except CommandFailure:
            results.put("FAIL")
class NvmeFragmentation(TestWithServers):
    # pylint: disable=too-many-ancestors
    # pylint: disable=too-many-instance-attributes
    """NVMe drive fragmentation test cases.

    Test class Description:
        Verify the drive fragmentation does free the space and do not lead to
        ENOM_SPACE.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case.

        Reads the IOR flags, APIs, transfer/block size pairs and object
        classes from the test yaml, rewrites the client hostfile without
        slots, and creates the queue used by the IOR threads to report
        failures.
        """
        super().setUp()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_transfer_size = self.params.get("transfer_block_size",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_runner_thread(self, results):
        """Run each configured IOR combination, then destroy its containers.

        One IOR job is run, in sequence, for every combination of object
        class, API, transfer/block size pair and flags. Each job gets a
        freshly generated container UUID; the UUID is remembered only when
        the IOR run succeeds. After all jobs have run, the remembered
        containers are destroyed through the DaosCommand "container destroy"
        sub-command.

        A CommandFailure raised by IOR or by the destroy command does not
        stop the thread; it is logged and reported by putting "FAIL" on
        ``results``.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")

        # UUIDs are kept in a list rather than a dict keyed by
        # oclass/api/transfer size: such a key ignores the flags and block
        # size, so combinations differing only in those would overwrite each
        # other and the overwritten containers would never be destroyed.
        cont_uuids = []

        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")

        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR values and run them in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the IOR command for this combination;
            # test[0] is the transfer size and test[1] the block size.
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            # Define the job manager for the IOR command
            self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
            cont_uuid = str(uuid.uuid4())
            self.job_manager.job.dfs_cont.update(cont_uuid)
            env = ior_cmd.get_default_env(str(self.job_manager))
            self.job_manager.assign_hosts(
                self.hostlist_clients, self.workdir, None)
            self.job_manager.assign_processes(processes)
            self.job_manager.assign_environment(env, True)

            # Run the IOR command
            try:
                self.job_manager.run()
            except CommandFailure as error:
                self.log.error("IOR run failed: %s", error)
                results.put("FAIL")
            else:
                # Only containers from successful runs are destroyed later
                cont_uuids.append(cont_uuid)

        # Destroy the containers created by this thread
        destroy_cmd = cmd.sub_command_class.sub_command_class
        for cont_uuid in cont_uuids:
            destroy_cmd.pool.value = self.pool.uuid
            destroy_cmd.cont.value = cont_uuid
            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as error:
                self.log.error(
                    "Destroying container %s failed: %s", cont_uuid, error)
                results.put("FAIL")

    def test_nvme_fragmentation(self):
        """Jira ID: DAOS-2332.

        Test Description:
            Purpose of this test is to verify there is no Fragmentation
            after doing some IO write/delete operation for ~hour.

        Use case:
            Create object with different transfer size in parallel
            (10 IOR threads)
            Delete the container created by IOR which will dealloc NVMe block
            Run above code in loop for some time (~1 hours) and expect
            not to fail with NO ENOM SPAC.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme,ib2,nvme_fragmentation
        """
        no_of_jobs = self.params.get("no_parallel_job", '/run/ior/*')

        # Create a pool
        self.add_pool(connect=False)
        self.pool.display_pool_daos_space("Pool space at the Beginning")

        # Repeat the test for 30 times which will take ~1 hour
        for test_loop in range(30):
            self.log.info("--Test Repeat for loop %s---", test_loop)

            # Create one IOR runner thread per parallel job
            threads = [
                threading.Thread(target=self.ior_runner_thread,
                                 kwargs={"results": self.out_queue})
                for _ in range(no_of_jobs)
            ]

            # Launch the IOR threads, staggering each start by 5 seconds
            for thrd in threads:
                thrd.start()
                time.sleep(5)

            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            while not self.out_queue.empty():
                if self.out_queue.get() == "FAIL":
                    self.fail("FAIL")

        self.pool.display_pool_daos_space("Pool space at the End")
class OSAOnlineDrain(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Drain test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case.

        Fetches the dmg command object and the IOR flags, APIs, test
        sequences and object classes from the test yaml, rewrites the client
        hostfile without slots, and creates the queue the IOR threads use to
        report failures.
        """
        super(OSAOnlineDrain, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Queries ``self.pool`` through dmg and reads the "leader" entry of
        the returned data.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Queries ``self.pool`` through dmg and reads the "version" entry of
        the returned data.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR job against the given pool.

        Intended to be used as a thread target. The job writes to a freshly
        generated container UUID. A CommandFailure from the run is reported
        by putting "FAIL" on ``results``.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        # NOTE(review): self.pool and self.job_manager are instance
        # attributes written from every IOR thread - confirm that sharing
        # them between concurrently running threads is intended.
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # New container UUID stored under the oclass + api + transfer size key
        container_info["{}{}{}".format(oclass,
                                       api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        # Same key as above, rebuilt with join() to look the UUID back up
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def run_online_drain_test(self, num_pool):
        """Run the online drain test while IOR threads are started.

        Creates ``num_pool`` pools that split the configured SCM/NVMe size,
        starts IOR threads against each pool, drains one randomly chosen rank
        and two adjacent targets from that pool, and checks that the pool
        version increased. All pools are destroyed at the end.

        Args:
            num_pool (int) : total pools to create for testing purposes.

        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []
        drain_servers = len(self.hostlist_servers) - 1

        # Exclude target : random two targets  (target idx : 0-7)
        # n is at most 6 so that n + 1 stays within the 0-7 index range
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Drain one of the ranks (or server); rank 0 is never chosen
        rank = random.randint(1, drain_servers)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Drain the pool_uuid, rank and targets
        for val in range(0, num_pool):
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                # Launch the IOR threads, staggering each start by 5 seconds
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_drain(self.pool.uuid, rank,
                                                 t_string)
            self.log.info(output)

            # Poll the pool version every 10 seconds, at most 21 times,
            # stopping early once it has advanced by more than one.
            fail_count = 0
            while fail_count <= 20:
                pver_drain = self.get_pool_version()
                time.sleep(10)
                fail_count += 1
                if pver_drain > pver_begin + 1:
                    break

            self.log.info("Pool Version after drain %s", pver_drain)
            # Check pool version incremented after pool exclude
            self.assertTrue(pver_drain > pver_begin,
                            "Pool Version Error: After drain")
            # Wait to finish the threads
            # NOTE(review): `threads` is rebuilt for every IOR combination,
            # so only the threads of the last combination are joined here,
            # and self.out_queue is never inspected for "FAIL" - confirm
            # both are intended.
            for thrd in threads:
                thrd.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-6061")
    def test_osa_online_drain(self):
        """Test ID: DAOS-4750
        Test Description: Validate Online drain

        :avocado: tags=all,pr,hw,large,osa,osa_drain,online_drain,DAOS_5610
        """
        # Perform drain testing with 1 to 2 pools
        for pool_num in range(1, 3):
            self.run_online_drain_test(pool_num)
class RbldContainerCreate(TestWithServers):
    """Rebuild with container creation test cases.

    Test Class Description:
        These rebuild tests verify the ability to create additional containers
        while rebuild is ongoing.

    :avocado: recursive
    """

    def add_containers_during_rebuild(self, loop_id, qty, pool1, pool2):
        """Add containers to a pool while rebuild is still in progress.

        Containers are created, and objects written to them, until either
        ``qty`` containers exist or ``pool1`` reports rebuild complete. The
        test fails if rebuild completes before all containers were created.

        Args:
            loop_id (str): loop identification string
            qty (int): the number of containers to create
            pool1 (TestPool): pool used to determine if rebuild is complete
            pool2 (TestPool): pool used to add containers

        """
        count = 0
        while not pool1.rebuild_complete() and count < qty:
            # Create a new container
            count += 1
            self.log.info(
                "%s: Creating container %s/%s in pool %s during rebuild",
                loop_id, count, qty, pool2.uuid)
            self.container.append(TestContainer(pool2))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.container[-1].write_objects()

        if count < qty:
            self.fail(
                "{}: Rebuild completed with only {}/{} containers "
                "created".format(loop_id, count, qty))

    def run_ior(self, loop_id, mpirun):
        """Run the ior command defined by the specified ior command object.

        A CommandFailure from the run fails the test.

        Args:
            loop_id (str): loop identification string
            mpirun (Mpirun): mpirun command object to run ior

        """
        total_bytes = mpirun.job.get_aggregate_total(mpirun.processes.value)
        try:
            mpirun.run()
        except CommandFailure as error:
            self.fail(
                "{}: Error populating the container with {} bytes of data "
                "prior to target exclusion: {}".format(
                    loop_id, total_bytes, error))
        self.log.info(
            "%s: %s %s bytes to the container",
            loop_id, "Wrote" if "-w" in mpirun.job.flags.value else "Read",
            total_bytes)

    def access_container(self, loop_id, index, message):
        """Read the objects of the specified container and close it.

        Args:
            loop_id (str): loop identification string
            index (int): index of the daos container object to read/close
            message (str): additional text describing the container

        Returns:
            bool: False if reading or closing raised TestFail, else True

        """
        status = True
        self.log.info(
            "%s: Verifying the container %s created during rebuild",
            loop_id, message)
        try:
            self.container[index].read_objects()
            self.container[index].close()
        except TestFail as error:
            self.log.error(
                "%s:  - Container read failed:", loop_id, exc_info=error)
            status = False
        return status

    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,full_regression
        :avocado: tags=medium
        :avocado: tags=rebuild,rebuild_cont_create
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        use_ior = self.params.get("use_ior", "/run/test/*", False)
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for _ in range(pool_qty):
            self.pool.append(self.get_pool(create=False))

        if use_ior:
            # Get ior params
            self.job_manager = Mpirun(IorCommand())
            self.job_manager.job.get_params(self)
            self.job_manager.assign_hosts(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)
            self.job_manager.assign_processes(len(self.hostlist_clients))
            self.job_manager.assign_environment(
                self.job_manager.job.get_default_env("mpirun"))

        # Number of containers that could not be accessed, per loop
        errors = [0] * loop_qty
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Start this loop with a fresh list of containers
            self.container = []

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append(
                    {
                        "pi_uuid": pool.uuid,
                        "pi_ntargets": node_qty * targets,
                        "pi_nnodes": node_qty,
                        "pi_ndisabled": 0,
                    }
                )
                rebuild_checks.append(
                    {
                        "rs_errno": 0,
                        "rs_done": 1,
                        "rs_obj_nr": 0,
                        "rs_rec_nr": 0,
                    }
                )

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
                pool.display_pool_daos_space("after creation")
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            if use_ior:
                self.job_manager.job.flags.update(
                    "-v -w -W -G 1 -k", "ior.flags")
                # Keep the container so it can be read back after rebuild
                self.job_manager.job.dfs_destroy.update(
                    False, "ior.dfs_destroy")
                self.job_manager.job.set_daos_params(
                    self.server_group, self.pool[0])
                self.log.info(
                    "%s: Running IOR on pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.run_ior(loop_id, self.job_manager)
            else:
                self.container.append(TestContainer(self.pool[0]))
                self.container[-1].get_params(self)
                self.container[-1].create()
                self.log.info(
                    "%s: Writing to pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, self.container[-1].uuid)
                self.container[-1].object_qty.value = 8
                self.container[-1].record_qty.value = 64
                self.container[-1].data_size.value = 1024 * 1024
                self.container[-1].write_objects(rank, cont_obj_cls)
                rank_list = self.container[-1].get_target_rank_lists(
                    " after writing data")
                self.container[-1].get_target_rank_count(rank, rank_list)

            # Display the updated pool space usage
            for pool in self.pool:
                pool.display_pool_daos_space("after container creation")

            # Exclude the first rank from the first pool to initiate rebuild
            self.server_managers[0].stop_ranks([rank], self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            start_index = len(self.container)
            self.add_containers_during_rebuild(
                loop_id, cont_qty, self.pool[0], self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of created containers exist by opening them
            for index in range(start_index, len(self.container)):
                count = "{}/{}".format(
                    index - start_index + 1,
                    len(self.container) - start_index)
                if not self.access_container(loop_id, index, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index in range(start_index, len(self.container)):
                self.container[index].destroy()

            # Read the data from the container created before rebuild
            if use_ior:
                self.log.info(
                    "%s: Running IOR on pool %s to verify container %s",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.job_manager.job.flags.update(
                    "-v -r -R -G 1 -E", "ior.flags")
                self.job_manager.job.dfs_destroy.update(
                    True, "ior.dfs_destroy")
                self.run_ior(loop_id, self.job_manager)
            else:
                self.log.info(
                    "%s: Reading pool %s to verify container %s",
                    loop_id, self.pool[0].uuid, self.container[0].uuid)
                self.assertTrue(
                    self.container[0].read_objects(),
                    "Error verifying data written before rebuild")
                self.container[0].destroy()

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info(
                "%s: Loop %s", loop_id,
                "passed" if errors[loop] == 0 else "failed")

        total_errors = sum(errors)
        self.log.info("Test %s", "passed" if total_errors == 0 else "failed")

        # Logging "failed" is not enough: without an explicit failure the
        # test would pass even when containers created during rebuild could
        # not be accessed.
        if total_errors:
            self.fail(
                "{} container(s) created during rebuild could not be "
                "accessed; errors per loop: {}".format(total_errors, errors))
class MacsioTestBase(TestWithServers):
    """Base MACSio test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Create the test object with no job manager or macsio command yet."""
        super().__init__(*args, **kwargs)
        self.manager = None
        self.macsio = None

    def setUp(self):
        """Prepare the mpich job manager and macsio command for a test."""
        super().setUp()
        self.manager = Mpirun(None, subprocess=False, mpitype="mpich")
        self.macsio = self.get_macsio_command()

    def get_macsio_command(self):
        """Build the MacsioCommand object from the test yaml parameters.

        Returns:
            MacsioCommand: object defining the macsio command

        """
        # The macsio location comes from the test yaml (empty if not set)
        repo_path = self.params.get("macsio", "/run/test_repo/*", "")
        command = MacsioCommand(repo_path)
        command.get_params(self)

        # Keep the macsio output files alongside the other test log files
        command.set_output_file_path()
        return command

    def run_macsio(self, pool_uuid, pool_svcl, cont_uuid=None):
        """Run macsio against the given pool and container.

        The macsio parameters, including the path to the executable, come
        from the test yaml file. The command is launched through
        self.manager (mpirun by default); assign a different job manager to
        self.manager before calling this method to override that.

        Args:
            pool_uuid (str): pool uuid
            pool_svcl (str): pool service replica
            cont_uuid (str, optional): container uuid. Defaults to None.

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        """
        # Point the macsio command at the requested pool and container
        macsio = self.macsio
        macsio.daos_pool = pool_uuid
        macsio.daos_svcl = pool_svcl
        macsio.daos_cont = cont_uuid

        # Setup the job manager to run the macsio command on every client
        manager = self.manager
        manager.job = macsio
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(len(self.hostlist_clients))
        environment = macsio.get_environment(
            self.server_managers[0], self.client_log)
        manager.assign_environment(environment)

        result = None
        try:
            result = manager.run()
        except CommandFailure as error:
            self.log.error("MACSio Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return result