def test_ior_crash(self):
    """Jira ID: DAOS-4332.
    Jira ID: DAOS-9946.

    Test Description:
        Verify DAOS server does not need to be restarted when an
        application crashes.

    Use Cases:
        Run IOR Write.
        Kill IOR process in the middle of Write.
        Verify DAOS engines did not crash.
        Run IOR Write, Read.
        Kill IOR process in the middle of Read.
        Verify DAOS engines did not crash.
        Run IOR Write, Read, CheckRead.
        Verify IOR completes successfully.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium,ib2
    :avocado: tags=daosio,ior,dfs
    :avocado: tags=ior_crash
    """
    def verify_engines():
        """Fail the test when the verbose system query is rejected by
        check_system_query_status."""
        query_result = self.dmg.system_query(verbose=True)
        if check_system_query_status(query_result):
            return
        self.fail("One or more engines crashed")

    # Phase 1: start IOR, confirm the subprocess status, let half of the
    # stonewalling deadline elapse, then stop IOR mid-write.
    self.run_ior_with_pool()
    self.check_subprocess_status()
    write_window = self.ior_cmd.sw_deadline.value / 2
    time.sleep(write_window)
    self.stop_ior()
    verify_engines()

    # Phase 2: start IOR again and wait one and a half deadlines so the
    # write phase is over before IOR is stopped during the read.
    self.run_ior_with_pool()
    read_offset = self.ior_cmd.sw_deadline.value * 1.5
    time.sleep(read_offset)
    self.check_subprocess_status("read")
    self.stop_ior()
    verify_engines()

    # Phase 3: run IOR without interruption and wait for the job manager.
    self.run_ior_with_pool()
    self.job_manager.wait()
    verify_engines()
def test_crashior(self):
    """Jira ID: DAOS-4332.

    Test Description:
        DAOS server does not need to be restarted when the application
        crashes.

    Use Cases:
        Run IOR over dfuse.
        Cancel IOR in the middle of io.
        Check daos server does not need to be restarted when the
        application crashes.

    :avocado: tags=all,daosio,hw,medium,ib2,full_regression,crashior
    """
    def assert_servers_healthy():
        """Run 'dmg system query -v' and fail the test if the result does
        not pass check_system_query_status."""
        rank_info = self.dmg.system_query(verbose=True)
        if check_system_query_status(rank_info):
            return
        self.fail("One or more server crashed")

    # --- Stop IOR while it is writing ---
    self.run_ior_with_pool()
    # Confirm the IOR subprocess status before waiting.
    self.check_subprocess_status()
    # Give the write 50 seconds of progress, then stop IOR mid-IO.
    time.sleep(50)
    self.stop_ior()
    assert_servers_healthy()

    # --- Stop IOR while it is reading ---
    self.run_ior_with_pool()
    # The write stonewalls at 100 seconds; the extra 5 seconds lets the
    # read phase begin.
    time.sleep(105)
    self.check_subprocess_status("read")
    self.stop_ior()
    assert_servers_healthy()

    # --- Uninterrupted run ---
    # After two interrupted runs, let IOR run to completion to confirm IO
    # still goes through.
    self.run_ior_with_pool()
    self.job_manager.wait()
def run_offline_extend_test(self, num_pool, data=False, oclass=None):
    """Run the offline pool extend test, with or without data in the pools.

    Creates ``num_pool`` pools (optionally writing IOR/mdtest data into
    each), starts the extra servers, then extends a pool with each rank
    in ``self.rank`` and checks that the pool version increased.

    Args:
        num_pool (int): total pools to create for testing purposes.
        data (bool): whether to write data into each pool before the
            extend and read it back afterwards. Defaults to False.
        oclass (list): list of daos object classes (eg: ["RP_2G8"]).
            Defaults to None, in which case a single-entry list holding
            ``self.ior_cmd.dfs_oclass.value`` is used.
    """
    # Function-scope import: only the retry loop below needs ``time``.
    import time  # pylint: disable=import-outside-toplevel

    max_attempts = 10
    retry_delay = 5  # seconds to pause between system query attempts

    if oclass is None:
        oclass = [self.ior_cmd.dfs_oclass.value]
    self.log.info(oclass[0])

    def pick_oclass(pool_index):
        """Return the object class at pool_index, else the first entry."""
        if pool_index < len(oclass):
            return oclass[pool_index]
        return oclass[0]

    test_seq = self.ior_test_sequence[0]

    # Create the pools and, when requested, populate them.
    pool = {}
    for val in range(num_pool):
        pool[val] = TestPool(self.context, dmg_command=self.dmg_command)
        pool[val].get_params(self)
        pool[val].create()
        self.pool = pool[val]
        self.pool.set_property("reclaim", "disabled")
        if data:
            self.run_ior_thread("Write", pick_oclass(val), test_seq)
            self.run_mdtest_thread()
            if self.test_during_aggregation is True:
                # Second IOR write, removed again before the extend via
                # delete_extra_container().
                self.run_ior_thread("Write", pick_oclass(val), test_seq)

    # Start the additional servers and extend the pool
    self.log.info("Extra Servers = %s", self.extra_servers)
    self.start_additional_servers(self.extra_servers)

    # Give the additional servers time to come up: poll the system query
    # and pause between attempts instead of querying back-to-back.
    for attempt in range(1, max_attempts + 1):
        scan_info = self.get_dmg_command().system_query()
        if check_system_query_status(scan_info):
            break
        if attempt == max_attempts:
            self.fail("One or more servers not in expected status")
        time.sleep(retry_delay)

    for rank_index, rank_val in enumerate(self.rank):
        # With at least as many pools as ranks, each rank extends its own
        # pool; otherwise every rank extends the first pool.
        val = rank_index if num_pool >= len(self.rank) else 0
        self.pool = pool[val]
        scm_size = self.pool.scm_size
        nvme_size = self.pool.nvme_size
        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)

        # Enable aggregation for multiple pool testing only.
        if self.test_during_aggregation is True and (num_pool > 1):
            self.delete_extra_container(self.pool)

        output = self.dmg_command.pool_extend(
            self.pool.uuid, rank_val, scm_size, nvme_size)
        self.print_and_assert_on_rebuild_failure(output)

        pver_extend = self.get_pool_version()
        self.log.info("Pool Version after extend %d", pver_extend)
        # Check pool version incremented after pool extend
        self.assertTrue(pver_extend > pver_begin,
                        "Pool Version Error: After extend")

        display_string = "Pool{} space at the End".format(val)
        pool[val].display_pool_daos_space(display_string)

        if data:
            # Read back with the same object class used for the write.
            self.run_ior_thread("Read", pick_oclass(val), test_seq)
            self.run_mdtest_thread()
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
def run_nvme_pool_extend(self, num_pool, oclass=None):
    """Run pool extend while IOR is writing to each pool.

    For every pool an IOR write thread is started, the pool is extended
    with two ranks, the pool version is checked, and the data is read
    back and the container checked.

    Args:
        num_pool (int): total pools to create for testing purposes.
        oclass (str): object class (eg: RP_2G8, etc). Defaults to None,
            in which case ``self.ior_cmd.dfs_oclass.value`` is used.
    """
    max_attempts = 10
    retry_delay = 5  # seconds to pause between system query attempts

    self.pool = []
    total_servers = len(self.hostlist_servers) * 2
    self.log.info("Total Daos Servers (Initial): %d", total_servers)
    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value

    # Create the pools
    for _ in range(num_pool):
        self.pool.append(self.get_pool())
        self.pool[-1].set_property("reclaim", "disabled")

    # On each pool (max 3), extend the ranks
    # eg: ranks : 4,5 ; 6,7; 8,9.
    for val in range(num_pool):
        current_pool = self.pool[val]
        test = self.ior_test_sequence[val]
        threads = [
            threading.Thread(target=self.run_ior_thread,
                             kwargs={"action": "Write",
                                     "oclass": oclass,
                                     "test": test,
                                     "pool": current_pool})
        ]

        # Launch the IOR threads
        for thrd in threads:
            self.log.info("Thread : %s", thrd)
            thrd.start()
            time.sleep(1)

        current_pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()

        # The additional servers are started only once, with the first pool.
        if val == 0:
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            # Poll the system query until the status check passes, pausing
            # between attempts so the new servers have time to come up.
            for attempt in range(1, max_attempts + 1):
                scan_info = self.get_dmg_command().system_query()
                if check_system_query_status(scan_info):
                    break
                if attempt == max_attempts:
                    self.fail("One/More servers status not correct")
                time.sleep(retry_delay)

        self.log.info("Pool Version at the beginning %s", pver_begin)

        # Extend ranks (4,5), (6,7), (8,9)
        ranks_extended = "{},{}".format((val * 2) + 4, (val * 2) + 5)
        output = self.dmg_command.pool_extend(current_pool.uuid,
                                              ranks_extended)
        self.print_and_assert_on_rebuild_failure(output)

        pver_extend = self.get_pool_version()
        self.log.info("Pool Version after extend %s", pver_extend)
        # Check pool version incremented after pool extend
        self.assertTrue(pver_extend > pver_begin,
                        "Pool Version Error: After extend")

        # Wait to finish the threads
        for thrd in threads:
            thrd.join()
            if not self.out_queue.empty():
                self.assert_on_exception()

        # Verify the data after pool extend. The pool is passed explicitly
        # so the read targets the pool the write thread used; self.pool is
        # a list in this method.
        self.run_ior_thread("Read", oclass, test, pool=current_pool)

        # Get the pool space at the end of the test
        display_string = "Pool{} space at the End".format(val)
        current_pool.display_pool_daos_space(display_string)

        self.container = self.pool_cont_dict[current_pool][0]
        kwargs = {"pool": current_pool.uuid,
                  "cont": self.container.uuid}
        output = self.daos_command.container_check(**kwargs)
        self.log.info(output)
def run_offline_parallel_test(self, num_pool, data=False, oclass=None):
    """Run multiple OSA commands in parallel with or without data.

    For each pool, drain, exclude, reintegrate and extend are issued
    concurrently through ``self.dmg_thread``; afterwards the result queue
    is scanned for failures and each pool is verified.

    Args:
        num_pool (int): total pools to create for testing purposes.
        data (bool): whether to write data into each pool before the OSA
            operations and read it back afterwards. Defaults to False.
        oclass (str): Daos object class (RP_2G1, etc). Defaults to None,
            in which case ``self.ior_cmd.dfs_oclass.value`` is used.
    """
    max_attempts = 10
    retry_delay = 5  # seconds to pause between system query attempts

    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value

    # Exclude target : two adjacent targets chosen at random
    # (target idx : 0-7)
    first_target = random.randint(0, 6)
    t_string = "{},{}".format(first_target, first_target + 1)

    # Exclude rank 2.
    rank = 2

    test_seq = self.ior_test_sequence[0]

    # Create the pools and, when requested, populate them.
    pool = {}
    for val in range(num_pool):
        pool[val] = TestPool(self.context,
                             dmg_command=self.get_dmg_command())
        pool[val].get_params(self)
        pool[val].create()
        self.pool = pool[val]
        self.pool.set_property("reclaim", "disabled")
        if data:
            self.run_ior_thread("Write", oclass, test_seq)
            if oclass != "S1":
                self.run_mdtest_thread()
            # if self.test_during_aggregation is set,
            # Create another container and run the IOR
            # command using the second container.
            if self.test_during_aggregation is True:
                self.run_ior_thread("Write", oclass, test_seq)

    # Start the additional servers and extend the pool
    self.log.info("Extra Servers = %s", self.extra_servers)
    self.start_additional_servers(self.extra_servers)

    # Give the additional servers time to come up: poll the system query
    # and pause between attempts instead of querying back-to-back.
    for attempt in range(1, max_attempts + 1):
        scan_info = self.get_dmg_command().system_query()
        if check_system_query_status(scan_info):
            break
        if attempt == max_attempts:
            self.fail("One or more servers not in expected status")
        time.sleep(retry_delay)

    # Threads are collected across all pools so that every started thread
    # is joined below, not only those of the last pool.
    threads = []

    # Exclude and reintegrate the pool_uuid, rank and targets
    for val in range(num_pool):
        self.pool = pool[val]
        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)

        # If we need to trigger aggregation on pool 1, delete
        # the second container which has IOR data.
        if self.test_during_aggregation is True and val == 0:
            self.delete_extra_container(self.pool)

        # Action dictionary with OSA dmg command parameters
        action_args = {
            "drain": {
                "pool": self.pool.uuid,
                "rank": rank,
                "tgt_idx": None
            },
            "exclude": {
                "pool": self.pool.uuid,
                "rank": (rank + 1),
                "tgt_idx": t_string
            },
            "reintegrate": {
                "pool": self.pool.uuid,
                "rank": (rank + 1),
                "tgt_idx": t_string
            },
            "extend": {
                "pool": self.pool.uuid,
                "ranks": (rank + 2),
                "scm_size": self.pool.scm_size,
                "nvme_size": self.pool.nvme_size
            }
        }

        # One dmg thread per action, started in sorted action-name order.
        for action in sorted(action_args):
            process = threading.Thread(target=self.dmg_thread,
                                       kwargs={
                                           "action": action,
                                           "action_args": action_args,
                                           "results": self.out_queue
                                       })
            process.start()
            threads.append(process)

    # Wait to finish the threads
    for thrd in threads:
        thrd.join()
        time.sleep(5)

    # Check the queue for any failure.
    for failure in list(self.out_queue.queue):
        if "FAIL" in failure:
            self.fail("Test failed : {0}".format(failure))

    for val in range(num_pool):
        # Point self.pool at the pool being verified so the version, IOR
        # read and container check below all refer to it rather than to
        # the last pool created.
        self.pool = pool[val]
        display_string = "Pool{} space at the End".format(val)
        self.pool.display_pool_daos_space(display_string)
        self.is_rebuild_done(3)
        self.assert_on_rebuild_failure()

        pver_end = self.get_pool_version()
        self.log.info("Pool Version at the End %s", pver_end)
        self.assertTrue(pver_end >= 26, "Pool Version Error: at the end")

        if data:
            self.run_ior_thread("Read", oclass, test_seq)
            if oclass != "S1":
                self.run_mdtest_thread()
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
def test_nvme_server_restart(self):
    """Jira ID: DAOS-2650.

    Test Description:
        Test will run IOR with non standard transfer sizes for different
        set of pool sizes. Purpose is to verify io transaction to scm and
        nvme for different pool sizes when servers are restarted after
        write.

    Use Cases:
        (1) Running IOR with different set of transfer size where first
            transfer size is < 4096 and then > 4096. Verify the data
            after servers are restarted.
        (2) Repeat the case(1) with maximum nvme pool size that can be
            created.
        (3) Running IOR with different set of transfer size where the
            transfer size is > 4096 throughout. Verify the data after
            servers are restarted.
        (4) Repeat the case(3) with maximum nvme pool size that can be
            created.

    :avocado: tags=all,full_regression,hw,large,daosio,nvme_server_restart
    """
    # Read the test parameters
    pool_size_sets = self.params.get("ior_sequence", '/run/ior/*')
    num_procs = self.params.get("np", '/run/ior/*')
    tsize_list = self.params.get("tsize", '/run/ior/transfersize/*/')
    write_flags = self.params.get("write", '/run/ior/*/')
    read_flags = self.params.get("read", '/run/ior/*/')
    default_block_size = self.ior_cmd.block_size.value

    # One pool per entry of the ior_sequence parameter
    for sizes in pool_size_sets:
        # Build the pool with the scm/nvme sizes from this entry
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)
        self.pool.scm_size.update(sizes[0])
        self.pool.nvme_size.update(sizes[1])
        self.pool.create()
        self.pool.get_info()

        for tsize in tsize_list:
            # Write pass with this transfer size
            self.ior_cmd.transfer_size.update(tsize)
            self.ior_cmd.flags.update(write_flags)
            # For transfer sizes of at most 1000, use a 32000 block size
            # to keep it small; otherwise restore the configured one.
            self.ior_cmd.block_size.update(
                32000 if tsize <= 1000 else default_block_size)
            self.ior_cmd.set_daos_params(self.server_group, self.pool)
            self.run_ior(self.get_ior_job_manager_command(), num_procs)

            # Stop, then start, all servers
            self.get_dmg_command().system_stop(True)
            self.get_dmg_command().system_start()

            # Fail if the system query output does not pass the check
            query_output = self.get_dmg_command().get_output("system_query")
            if not check_system_query_status(query_output):
                self.fail("One or more servers crashed")

            # Read back everything written before the restart
            self.ior_cmd.flags.update(read_flags)
            self.run_ior(self.get_ior_job_manager_command(), num_procs)

        # Destroy the pool before moving to the next size set
        self.destroy_pools(self.pool)
def test_ioaggregation(self):
    """Jira ID: DAOS-4332.

    Test Description:
        Verify Aggregation across system shutdown.

    Use Cases:
        Create Pool.
        Create Container.
        Run IOR and keep the written data.
        Capture Free space available after first ior write.
        Create snapshot and obtain the epoch id.
        Write to the same ior file and same amount of data, without
        overwriting the previous data.
        Capture free space again, after second ior write.
        Capture Highest epoch ID before snapshot destroy.
        Destroy the snapshot which was created.
        Shut down the servers and restart them again.
        After servers have successfully restarted, look for aggregation
        to finish by checking the free space available and value of
        highest epoch which should be higher than the value of highest
        epoch before snapshot destroy. If current free space is equal to
        free space after first ior write, then pass otherwise fail the
        test after waiting for 4 attempts.

    :avocado: tags=all,daosio,hw,small,full_regression,ioaggregation
    """
    # First IOR write, tagged with its own signature
    self.ior_cmd.signature.update("123")
    self.run_ior_with_pool()

    # Query the free space before the snapshot (return value not kept)
    self.get_nvme_free_space()

    # Snapshot the container, then write the same file again with a
    # different signature and without creating a new container
    self.container.create_snap()
    self.ior_cmd.signature.update("456")
    self.run_ior_with_pool(create_cont=False)

    # Free space after the second write
    space_before_destroy = self.get_nvme_free_space()

    # Highest epoch before the snapshot is destroyed
    kwargs = {
        "pool": self.pool.uuid,
        "cont": self.container.uuid
    }
    epoch_before_destroy = self.highest_epoch(kwargs)

    # Remove the snapshot
    self.container.destroy_snap(epc=self.container.epoch)

    # Stop the servers, pause briefly, and start them again
    self.get_dmg_command().system_stop(True)
    time.sleep(5)
    self.get_dmg_command().system_start()

    # Fail if the system query does not pass the status check
    query_result = self.get_dmg_command().system_query()
    if not check_system_query_status(query_result):
        self.fail("One or more servers crashed")

    # Poll until at least one block size worth of space has been returned
    # and the highest epoch exceeds the value recorded before the
    # snapshot destroy.
    attempt = 1
    reclaimed = self.get_nvme_free_space() - space_before_destroy
    while True:
        still_waiting = (
            reclaimed < int(self.ior_cmd.block_size.value)
            or epoch_before_destroy >= self.highest_epoch(kwargs))
        if not still_waiting:
            break

        # Allow 4 waits of 60 seconds each before giving up.
        if attempt > 4:
            self.log.info("Free space before snapshot destroy: %s",
                          space_before_destroy)
            self.log.info("Free space when test terminated: %s",
                          self.get_nvme_free_space())
            self.log.info("Highest Epoch before IO Aggregation: %s",
                          epoch_before_destroy)
            self.log.info("Highest Epoch when test terminated: %s",
                          self.highest_epoch(kwargs))
            self.fail("Aggregation did not complete as expected")

        time.sleep(60)
        reclaimed = self.get_nvme_free_space() - space_before_destroy
        attempt += 1
def run_online_extend_test(self, num_pool, racer=False, oclass=None,
                           app_name="ior"):
    """Run the online extend: extend each pool while IO is in flight.

    For every pool an IOR (or mdtest) thread is started, the pool is
    extended with ``self.ranks`` while that thread runs, and the pool
    version is checked. Afterwards each pool is read back with IOR and
    its container is checked.

    Args:
        num_pool (int): total pools to create for testing purposes.
        racer (bool): Run the testing along with daos_racer.
            Defaults to False.
        oclass (str): Object Class (eg: RP_2G1, etc). Defaults to None,
            in which case ``self.ior_cmd.dfs_oclass.value`` is used.
        app_name (str): App (ior or mdtest) to run during the testing.
            Defaults to ior.
    """
    max_attempts = 10
    retry_delay = 5  # seconds to pause between system query attempts

    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value
    test_seq = self.ior_test_sequence[0]

    # Start the daos_racer thread
    daos_racer_thread = None
    if racer is True:
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        time.sleep(30)

    # Create the pools
    pool = {}
    for val in range(num_pool):
        pool[val] = TestPool(context=self.context,
                             dmg_command=self.get_dmg_command(),
                             label_generator=self.label_generator)
        pool[val].get_params(self)
        pool[val].create()
        pool[val].set_property("reclaim", "disabled")

    # Extend the pool_uuid, rank and targets
    for val in range(num_pool):
        threads = []
        self.pool = pool[val]

        # Start the additional servers and extend the pool
        self.log.info("Extra Servers = %s", self.extra_servers)
        self.start_additional_servers(self.extra_servers)

        if self.test_during_aggregation is True:
            for _ in range(0, 2):
                self.run_ior_thread("Write", oclass, test_seq)
            self.delete_extra_container(self.pool)

        # The following thread runs while performing osa operations.
        if app_name == "ior":
            threads.append(
                threading.Thread(target=self.run_ior_thread,
                                 kwargs={
                                     "action": "Write",
                                     "oclass": oclass,
                                     "test": test_seq
                                 }))
        else:
            threads.append(threading.Thread(target=self.run_mdtest_thread))

        # Poll the system query until the status check passes, pausing
        # between attempts so the new servers have time to come up.
        for attempt in range(1, max_attempts + 1):
            scan_info = self.get_dmg_command().system_query()
            if check_system_query_status(scan_info):
                break
            if attempt == max_attempts:
                self.fail("One or more servers not in expected status")
            time.sleep(retry_delay)

        # Launch the IOR or mdtest thread
        for thrd in threads:
            self.log.info("Thread : %s", thrd)
            thrd.start()
            time.sleep(1)

        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)

        output = self.dmg_command.pool_extend(self.pool.uuid, self.ranks)
        self.print_and_assert_on_rebuild_failure(output)

        pver_extend = self.get_pool_version()
        self.log.info("Pool Version after extend %s", pver_extend)
        # Check pool version incremented after pool extend
        self.assertTrue(pver_extend > pver_begin,
                        "Pool Version Error: After extend")

        # Wait to finish the threads
        for thrd in threads:
            thrd.join()
            if not self.out_queue.empty():
                self.assert_on_exception()

    # Check data consistency for IOR in future
    # Presently, we are running daos_racer in parallel
    # to IOR and checking the data consistency only
    # for the daos_racer objects.
    if daos_racer_thread is not None:
        daos_racer_thread.join()

    # Read back and check the container of every pool
    for val in range(num_pool):
        display_string = "Pool{} space at the End".format(val)
        self.pool = pool[val]
        self.pool.display_pool_daos_space(display_string)
        self.run_ior_thread("Read", oclass, test_seq)
        self.container = self.pool_cont_dict[self.pool][0]
        kwargs = {"pool": self.pool.uuid,
                  "cont": self.container.uuid}
        output = self.daos_command.container_check(**kwargs)
        self.log.info(output)