def run_osa_dmg_test(self, num_pool, extend=False):
    """Exercise the dmg OSA command sequence (extend/exclude/reintegrate/drain).

    For each pool, iterates over ``self.test_seq`` entries of the form
    (rank, target, expected_result) and runs the corresponding dmg pool
    operations, validating each command's output against the expected result.
    NOTE(review): the original docstring said "offline extend without data",
    but the body runs the full exclude/reintegrate/drain sequence as well.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        extend (bool) : Run testing after performing pool extend.
            Defaults to False.
    """
    # Create a pool; disable exception-on-failure so invalid test_seq
    # entries can be validated via stdout instead of raising.
    self.dmg_command.exit_status_exception = False
    pool = {}
    pool_uuid = []
    for val in range(0, num_pool):
        pool[val] = add_pool(self, create=False, connect=False)
        # Split total SCM and NVME size for creating multiple pools.
        pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
        pool[val].nvme_size.value = int(pool[val].nvme_size.value / num_pool)
        pool[val].create()
        pool_uuid.append(pool[val].uuid)
        self.pool = pool[val]

    # Start the additional servers and extend the pool
    if extend is True:
        self.log.info("Extra Servers = %s", self.extra_servers)
        self.start_additional_servers(self.extra_servers)

    # Get rank, target from the test_dmg_sequence
    # Some test_dmg_sequence data will be invalid, valid.
    for val in range(0, num_pool):
        for i in range(len(self.test_seq)):
            self.pool = pool[val]
            rank = self.test_seq[i][0]
            target = "{}".format(self.test_seq[i][1])
            expected_result = "{}".format(self.test_seq[i][2])
            # Extend the pool
            # There is no need to extend rank 0
            # Avoid DER_ALREADY
            if extend is True and rank != "0":
                output = self.dmg_command.pool_extend(self.pool.uuid, rank)
                self.log.info(output)
                self.validate_results(expected_result, output.stdout_text)
            # Without extend, ranks 4/5 are not part of the pool; skip them.
            if (extend is False and rank in ["4", "5"]):
                continue
            # Exclude a rank, target
            output = self.dmg_command.pool_exclude(self.pool.uuid, rank, target)
            self.log.info(output)
            self.validate_results(expected_result, output.stdout_text)
            # Now reintegrate the excluded rank.
            output = self.dmg_command.pool_reintegrate(
                self.pool.uuid, rank, target)
            self.log.info(output)
            self.validate_results(expected_result, output.stdout_text)
            # Drain the data from a rank
            output = self.dmg_command.pool_drain(self.pool.uuid, rank, target)
            self.log.info(output)
            self.validate_results(expected_result, output.stdout_text)
            # Now reintegrate the drained rank
            output = self.dmg_command.pool_reintegrate(
                self.pool.uuid, rank, target)
            self.log.info(output)
            self.validate_results(expected_result, output.stdout_text)
def run_online_reintegration_test(self, num_pool, racer=False,
                                  server_boot=False, oclass=None):
    """Run the Online reintegration without data.

    Excludes one random non-zero rank (or stops/starts it when
    ``server_boot`` is set) while IOR write threads are running, then
    reintegrates it, asserting the pool version increases at each step.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        racer (bool) : whether to run daos_racer in parallel with the
            OSA operations. Defaults to False.
        server_boot (bool) : Perform system stop/start on a rank.
            Defaults to False.
        oclass (str) : daos object class string (eg: "RP_2G8").
            Defaults to None.
    """
    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value
    test_seq = self.ior_test_sequence[0]
    # Create a pool
    pool = {}
    exclude_servers = (len(self.hostlist_servers) * 2) - 1
    # Exclude one rank : other than rank 0.
    rank = random.randint(1, exclude_servers)  # nosec

    # Start the daos_racer thread
    if racer is True:
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        # Give the racer time to ramp up before OSA operations start.
        time.sleep(30)

    for val in range(0, num_pool):
        pool[val] = add_pool(self, connect=False)
        # Disable aggregation so space accounting stays predictable.
        pool[val].set_property("reclaim", "disabled")

    # Exclude and reintegrate the pool_uuid, rank and targets
    for val in range(0, num_pool):
        threads = []
        self.pool = pool[val]
        # Instantiate aggregation
        if self.test_during_aggregation is True:
            for _ in range(0, 2):
                self.run_ior_thread("Write", oclass, test_seq)
            self.delete_extra_container(self.pool)
        # The following thread runs while performing osa operations.
        threads.append(threading.Thread(target=self.run_ior_thread,
                                        kwargs={"action": "Write",
                                                "oclass": oclass,
                                                "test": test_seq}))

        # Launch the IOR threads
        for thrd in threads:
            self.log.info("Thread : %s", thrd)
            thrd.start()
            time.sleep(1)
        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)
        if server_boot is False:
            output = self.dmg_command.pool_exclude(
                self.pool.uuid, rank)
        else:
            # Simulate rank failure via a full server stop/start cycle.
            output = self.dmg_command.system_stop(ranks=rank, force=True)
            self.pool.wait_for_rebuild(False)
            self.log.info(output)
            output = self.dmg_command.system_start(ranks=rank)
        self.print_and_assert_on_rebuild_failure(output)
        pver_exclude = self.get_pool_version()
        self.log.info("Pool Version after exclude %s", pver_exclude)
        # Check pool version incremented after pool exclude
        # pver_exclude should be greater than
        # pver_begin + 8 targets.
        self.assertTrue(pver_exclude > (pver_begin + 8),
                        "Pool Version Error: After exclude")
        output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank)
        self.print_and_assert_on_rebuild_failure(output)
        pver_reint = self.get_pool_version()
        self.log.info("Pool Version after reintegrate %d", pver_reint)
        # Check pool version incremented after pool reintegrate
        self.assertTrue(pver_reint > (pver_exclude + 1),
                        "Pool Version Error: After reintegrate")
        # Wait to finish the threads
        for thrd in threads:
            thrd.join()
            if not self.out_queue.empty():
                self.assert_on_exception()

    # Check data consistency for IOR in future
    # Presently, we are running daos_racer in parallel
    # to IOR and checking the data consistency only
    # for the daos_racer objects after exclude
    # and reintegration.
    if racer is True:
        daos_racer_thread.join()

    for val in range(0, num_pool):
        display_string = "Pool{} space at the End".format(val)
        self.pool = pool[val]
        self.pool.display_pool_daos_space(display_string)
        self.run_ior_thread("Read", oclass, test_seq)
        self.container = self.pool_cont_dict[self.pool][0]
        kwargs = {"pool": self.pool.uuid,
                  "cont": self.container.uuid}
        output = self.daos_command.container_check(**kwargs)
        self.log.info(output)
def run_nvme_pool_exclude(self, num_pool, oclass=None): """This is the main method which performs the actual testing. It does the following jobs: - Create number of TestPools - Start the IOR threads for running on each pools. - On each pool do the following: - Perform an IOR write (using a container) - Exclude a daos_server - Perform an IOR read/verify (same container used for write) Args: num_pool (int) : total pools to create for testing purposes. oclass (str) : object class (eg: RP_2G8, S1,etc). Defaults to None """ # Create a pool pool = {} if oclass is None: oclass = self.ior_cmd.dfs_oclass.value # Exclude rank : ranks other than rank 0. exclude_servers = len(self.hostlist_servers) * 2 rank_list = list(range(1, exclude_servers)) for val in range(0, num_pool): pool[val] = add_pool(self, connect=False) pool[val].set_property("reclaim", "disabled") for val in range(0, num_pool): self.pool = pool[val] self.add_container(self.pool) self.cont_list.append(self.container) rf = ''.join(self.container.properties.value.split(":")) rf_num = int(re.search(r"rf([0-9]+)", rf).group(1)) for test in range(0, rf_num): threads = [] threads.append(threading.Thread(target=self.run_ior_thread, kwargs={"action": "Write", "oclass": oclass, "test": test})) # Launch the IOR threads for thrd in threads: self.log.info("Thread : %s", thrd) thrd.start() time.sleep(1) self.pool.display_pool_daos_space("Pool space: Before Exclude") pver_begin = self.get_pool_version() index = random.randint(1, len(rank_list)) #nosec rank = rank_list.pop(index-1) tgt_exclude = random.randint(1, 6) #nosec self.log.info("Removing rank %d, target %d", rank, tgt_exclude) self.log.info("Pool Version at the beginning %s", pver_begin) output = self.dmg_command.pool_exclude(self.pool.uuid, rank, tgt_exclude) self.print_and_assert_on_rebuild_failure(output) pver_exclude = self.get_pool_version() self.log.info("Pool Version after exclude %s", pver_exclude) # Check pool version incremented after pool exclude 
self.assertTrue(pver_exclude > pver_begin, "Pool Version Error: After exclude") # Wait to finish the threads for thrd in threads: thrd.join() if not self.out_queue.empty(): self.assert_on_exception() # Verify the data after pool exclude self.run_ior_thread("Read", oclass, test) display_string = "Pool{} space at the End".format(val) self.pool.display_pool_daos_space(display_string) kwargs = {"pool": self.pool.uuid, "cont": self.container.uuid} output = self.daos_command.container_check(**kwargs) self.log.info(output)
def run_offline_reintegration_test(self, num_pool, data=False,
                                   server_boot=False, oclass=None,
                                   pool_fillup=0):
    """Run the offline reintegration without data.

    Excludes ranks [0, 3, 4] (or stop/starts them when ``server_boot``
    is set) on one randomly chosen pool, then reintegrates them,
    asserting the pool version increases after each step. Repeats for
    ``self.loop_test_cnt`` iterations.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        data (bool) : whether pool has no data or to create some data
            in pool. Defaults to False.
        server_boot (bool) : Perform system stop/start on a rank.
            Defaults to False.
        oclass (str) : daos object class string (eg: "RP_2G8")
        pool_fillup (int) : Percentage of pool filled up with data
            before performing OSA operations.
    """
    # Create a pool
    pool = {}
    random_pool = 0
    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value

    # Exclude ranks [0, 3, 4]
    rank = [0, 3, 4]
    for val in range(0, num_pool):
        pool[val] = add_pool(self, connect=False)
        self.pool = pool[val]
        self.pool.set_property("reclaim", "disabled")
        test_seq = self.ior_test_sequence[0]
        if data:
            # if pool_fillup is greater than 0, then
            # use start_ior_load method from nvme_utils.py.
            # Otherwise, use the osa_utils.py run_ior_thread
            # method.
            if pool_fillup > 0:
                self.ior_cmd.dfs_oclass.update(oclass)
                self.ior_cmd.dfs_dir_oclass.update(oclass)
                self.ior_default_flags = self.ior_w_flags
                self.ior_cmd.repetitions.update(self.ior_test_repetitions)
                self.log.info(self.pool.pool_percentage_used())
                self.start_ior_load(storage='NVMe', operation="Auto_Write",
                                    percent=pool_fillup)
                self.log.info(self.pool.pool_percentage_used())
            else:
                self.run_ior_thread("Write", oclass, test_seq)
                self.run_mdtest_thread(oclass)
            if self.test_with_snapshot is True:
                # Create a snapshot of the container
                # after IOR job completes.
                self.container.create_snap()
                self.log.info("Created container snapshot: %s",
                              self.container.epoch)
            if self.test_during_aggregation is True:
                self.run_ior_thread("Write", oclass, test_seq)

    # Exclude all the ranks
    random_pool = random.randint(0, (num_pool - 1))  # nosec
    for _ in range(0, self.loop_test_cnt):
        for val, _ in enumerate(rank):
            self.pool = pool[random_pool]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if server_boot is False:
                if (self.test_during_rebuild is True and val == 0):
                    # Exclude rank 5
                    output = self.dmg_command.pool_exclude(
                        self.pool.uuid, "5")
                    self.print_and_assert_on_rebuild_failure(output)
                if self.test_during_aggregation is True:
                    self.delete_extra_container(self.pool)
                    self.simple_osa_reintegrate_loop(rank[val])
                # For redundancy factor testing, just exclude only
                # one target on a rank. Don't exclude a rank(s).
                if (self.test_with_rf is True and val == 0):
                    output = self.dmg_command.pool_exclude(
                        self.pool.uuid, rank[val])
                elif (self.test_with_rf is True and val > 0):
                    continue
                else:
                    # With a filled pool only the first rank is excluded.
                    if pool_fillup > 0 and val > 0:
                        continue
                    output = self.dmg_command.pool_exclude(
                        self.pool.uuid, rank[val])
            else:
                output = self.dmg_command.system_stop(ranks=rank[val],
                                                      force=True)
                self.print_and_assert_on_rebuild_failure(output)
                output = self.dmg_command.system_start(ranks=rank[val])
            # Just try to reintegrate rank 5
            if (self.test_during_rebuild is True and val == 2):
                # Reintegrate rank 5
                output = self.dmg_command.pool_reintegrate(
                    self.pool.uuid, "5")
            self.print_and_assert_on_rebuild_failure(output)

            pver_exclude = self.get_pool_version()
            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            # pver_exclude should be greater than
            # pver_begin + 1 (1 target + exclude)
            self.assertTrue(pver_exclude > (pver_begin + 1),
                            "Pool Version Error: After exclude")

        # Reintegrate the ranks which was excluded
        for val, _ in enumerate(rank):
            if self.test_with_blank_node is True:
                # Wipe the pool directory on the node first so the
                # reintegration starts from a blank target.
                ip_addr, p_num = self.get_ipaddr_for_rank(rank[val])
                self.remove_pool_dir(ip_addr, p_num)
            if (val == 2 and "RP_2G" in oclass):
                output = self.dmg_command.pool_reintegrate(
                    self.pool.uuid, rank[val], "0,2")
            elif (self.test_with_rf is True and val == 0):
                output = self.dmg_command.pool_reintegrate(
                    self.pool.uuid, rank[val])
            elif (self.test_with_rf is True and val > 0):
                continue
            else:
                if pool_fillup > 0 and val > 0:
                    continue
                output = self.dmg_command.pool_reintegrate(
                    self.pool.uuid, rank[val])
            self.print_and_assert_on_rebuild_failure(output, timeout=15)
            pver_reint = self.get_pool_version()
            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertTrue(pver_reint > pver_exclude,
                            "Pool Version Error: After reintegrate")

        display_string = "Pool{} space at the End".format(random_pool)
        self.pool = pool[random_pool]
        self.pool.display_pool_daos_space(display_string)

    # Finally check whether the written data can be accessed.
    # Also, run the daos cont check (for object integrity)
    for val in range(0, num_pool):
        self.pool = pool[val]
        if data:
            if pool_fillup > 0:
                self.start_ior_load(storage='NVMe', operation='Auto_Read',
                                    percent=pool_fillup)
            else:
                self.run_ior_thread("Read", oclass, test_seq)
                self.run_mdtest_thread(oclass)
                self.container = self.pool_cont_dict[self.pool][0]
                kwargs = {"pool": self.pool.uuid,
                          "cont": self.container.uuid}
                output = self.daos_command.container_check(**kwargs)
                self.log.info(output)
def run_offline_parallel_test(self, num_pool, data=False, oclass=None):
    """Run multiple OSA commands in parallel with or without data.

    Launches drain, exclude, reintegrate and extend dmg operations
    concurrently (one thread each via ``self.dmg_thread``) against every
    pool, then verifies the pool version and optionally the written data.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        data (bool) : whether pool has no data or to create some data
            in pool. Defaults to False.
        oclass (str) : Daos object class (RP_2G1,etc)
    """
    # Create a pool
    pool = {}
    pool_uuid = []
    target_list = []
    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value

    # Exclude target : random two targets (target idx : 0-7)
    n = random.randint(0, 6)  # nosec
    target_list.append(n)
    target_list.append(n + 1)
    t_string = "{},{}".format(target_list[0], target_list[1])

    # Exclude rank 2.
    rank = 2

    test_seq = self.ior_test_sequence[0]
    for val in range(0, num_pool):
        pool[val] = add_pool(self, connect=False)
        self.pool = pool[val]
        pool_uuid.append(self.pool.uuid)
        # Use only pool UUID while running the test.
        self.pool.use_label = False
        self.pool.set_property("reclaim", "disabled")
        if data:
            self.run_ior_thread("Write", oclass, test_seq)
            if oclass != "S1":
                self.run_mdtest_thread()
            # if self.test_during_aggregation is set,
            # Create another container and run the IOR
            # command using the second container.
            if self.test_during_aggregation is True:
                self.run_ior_thread("Write", oclass, test_seq)

    # Start the additional servers and extend the pool
    self.log.info("Extra Servers = %s", self.extra_servers)
    self.start_additional_servers(self.extra_servers)
    # Give sometime for the additional server to come up.
    for retry in range(0, 10):
        scan_info = self.get_dmg_command().system_query()
        if not check_system_query_status(scan_info):
            if retry == 9:
                self.fail("One or more servers not in expected status")
        else:
            break

    # Exclude and reintegrate the pool_uuid, rank and targets
    for val in range(0, num_pool):
        self.pool = pool[val]
        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)
        # If we need to trigger aggregation on pool 1, delete
        # the second container which has IOR data.
        if self.test_during_aggregation is True and val == 0:
            self.delete_extra_container(self.pool)
        # Create the threads here
        threads = []
        # Action dictionary with OSA dmg command parameters
        action_args = {
            "drain": {"pool": self.pool.uuid, "rank": rank,
                      "tgt_idx": None},
            "exclude": {"pool": self.pool.uuid, "rank": (rank + 1),
                        "tgt_idx": t_string},
            "reintegrate": {"pool": self.pool.uuid, "rank": (rank + 1),
                            "tgt_idx": t_string},
            "extend": {"pool": self.pool.uuid, "ranks": (rank + 2),
                       "scm_size": self.pool.scm_size,
                       "nvme_size": self.pool.nvme_size}
        }
        for action in sorted(action_args):
            # Add a dmg thread
            process = threading.Thread(target=self.dmg_thread,
                                       kwargs={"action": action,
                                               "action_args": action_args,
                                               "results": self.out_queue})
            process.start()
            threads.append(process)

        # Wait to finish the threads
        for thrd in threads:
            thrd.join()
            time.sleep(5)

    # Check the queue for any failure.
    tmp_list = list(self.out_queue.queue)
    for failure in tmp_list:
        if "FAIL" in failure:
            self.fail("Test failed : {0}".format(failure))

    for val in range(0, num_pool):
        self.pool = pool[val]
        display_string = "Pool{} space at the End".format(val)
        self.pool.display_pool_daos_space(display_string)
        self.is_rebuild_done(3)
        self.assert_on_rebuild_failure()

        pver_end = self.get_pool_version()
        self.log.info("Pool Version at the End %s", pver_end)
        # NOTE(review): the 17/25 thresholds look empirically derived
        # from the expected number of map changes — confirm if the
        # OSA sequence changes.
        if self.server_boot is True:
            self.assertTrue(pver_end >= 17,
                            "Pool Version Error: at the end")
        else:
            self.assertTrue(pver_end >= 25,
                            "Pool Version Error: at the end")

    # Finally run IOR to read the data and perform daos_container_check
    for val in range(0, num_pool):
        self.pool = pool[val]
        if data:
            self.run_ior_thread("Read", oclass, test_seq)
            if oclass != "S1":
                self.run_mdtest_thread()
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
def run_offline_drain_test(self, num_pool, data=False, oclass=None,
                           pool_fillup=0):
    """Run the offline drain without data.

    Drains the ranks listed in ``self.ranks`` (from the YAML file) on
    each pool, two random targets at a time, and asserts the pool
    version increases after each drain.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        data (bool) : whether pool has no data or to create some data
            in pool. Defaults to False.
        oclass (str): DAOS object class (eg: RP_2G1,etc)
        pool_fillup (int) : Percentage of pool filled up with data
            before performing OSA operations. Defaults to 0.
    """
    # Create a pool
    pool = {}
    target_list = []

    if oclass is None:
        oclass = self.ior_cmd.dfs_oclass.value

    # Exclude target : random two targets (target idx : 0-7)
    n = random.randint(0, 6)  # nosec
    target_list.append(n)
    target_list.append(n + 1)
    t_string = "{},{}".format(target_list[0], target_list[1])

    for val in range(0, num_pool):
        pool[val] = add_pool(self, connect=False)
        self.pool = pool[val]
        self.pool.set_property("reclaim", "disabled")
        test_seq = self.ior_test_sequence[0]
        if data:
            # if pool_fillup is greater than 0, then
            # use start_ior_load method from nvme_utils.py.
            # Otherwise, use the osa_utils.py run_ior_thread
            # method.
            if pool_fillup > 0:
                self.ior_cmd.dfs_oclass.update(oclass)
                self.ior_cmd.dfs_dir_oclass.update(oclass)
                self.ior_default_flags = self.ior_w_flags
                self.log.info(self.pool.pool_percentage_used())
                self.start_ior_load(storage='NVMe', operation="Auto_Write",
                                    percent=pool_fillup)
                self.log.info(self.pool.pool_percentage_used())
            else:
                self.run_ior_thread("Write", oclass, test_seq)
                self.run_mdtest_thread(oclass)
            if self.test_with_snapshot is True:
                # Create a snapshot of the container
                # after IOR job completes.
                self.container.create_snap()
                self.log.info("Created container snapshot: %s",
                              self.container.epoch)
            if self.test_during_aggregation is True:
                self.run_ior_thread("Write", oclass, test_seq)

    # Drain ranks and targets
    for val in range(0, num_pool):
        # Drain ranks provided in YAML file
        for index, rank in enumerate(self.ranks):
            self.pool = pool[val]
            # If we are testing using multiple pools, reintegrate
            # the rank back and then drain.
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if self.test_during_aggregation is True and index == 0:
                # Re-enable time-based aggregation so it runs while
                # the drain is in progress.
                self.pool.set_property("reclaim", "time")
                self.delete_extra_container(self.pool)
                self.simple_osa_reintegrate_loop(rank=rank, action="drain")
            if (self.test_during_rebuild is True and val == 0):
                # Exclude rank 3
                output = self.dmg_command.pool_exclude(self.pool.uuid, "3")
                self.pool.wait_for_rebuild(True)
            # If the pool is filled up just drain only a single rank.
            if pool_fillup > 0 and index > 0:
                continue
            output = self.dmg_command.pool_drain(self.pool.uuid,
                                                 rank, t_string)
            self.print_and_assert_on_rebuild_failure(output)

            pver_drain = self.get_pool_version()
            self.log.info("Pool Version after drain %d", pver_drain)
            # Check pool version incremented after pool drain
            self.assertTrue(pver_drain > (pver_begin + 1),
                            "Pool Version Error: After drain")
            if num_pool > 1:
                # Multi-pool run: put the rank back so the next pool
                # can drain it too.
                output = self.dmg_command.pool_reintegrate(self.pool.uuid,
                                                           rank, t_string)
                self.print_and_assert_on_rebuild_failure(output)
            if (self.test_during_rebuild is True and val == 0):
                # Reintegrate rank 3
                output = self.dmg_command.pool_reintegrate(self.pool.uuid,
                                                           "3")
                self.print_and_assert_on_rebuild_failure(output)

    for val in range(0, num_pool):
        display_string = "Pool{} space at the End".format(val)
        pool[val].display_pool_daos_space(display_string)

    if data:
        if pool_fillup > 0:
            self.start_ior_load(storage='NVMe', operation='Auto_Read',
                                percent=pool_fillup)
        else:
            self.run_ior_thread("Read", oclass, test_seq)
            self.run_mdtest_thread(oclass)
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
def run_offline_extend_test(self, num_pool, data=False, oclass=None):
    """Run the offline extend without data.

    Starts the extra servers, then extends pools with the ranks listed
    in ``self.rank``, asserting the pool version increases after each
    extend; optionally writes data first and verifies it afterwards.

    Args:
        num_pool (int) : total pools to create for testing purposes.
        data (bool) : whether pool has no data or to create some data
            in pool. Defaults to False.
        oclass (list) : list of daos object class (eg: "RP_2G8")
    """
    # Create a pool
    pool = {}
    if oclass is None:
        oclass = []
        oclass.append(self.ior_cmd.dfs_oclass.value)

    self.log.info(oclass[0])
    for val in range(0, num_pool):
        # Perform IOR write using the oclass list; pools beyond the
        # list length reuse the first object class.
        if val < len(oclass):
            index = val
        else:
            index = 0
        pool[val] = add_pool(self, connect=False)
        self.pool = pool[val]
        test_seq = self.ior_test_sequence[0]
        self.pool.set_property("reclaim", "disabled")
        if data:
            self.run_ior_thread("Write", oclass[index], test_seq)
            self.run_mdtest_thread(oclass[index])
            if self.test_during_aggregation is True:
                self.run_ior_thread("Write", oclass[index], test_seq)
            if self.test_with_snapshot is True:
                # Create a snapshot of the container
                # after IOR job completes.
                self.container.create_snap()
                self.log.info("Created container snapshot: %s",
                              self.container.epoch)

    # Start the additional servers and extend the pool
    self.log.info("Extra Servers = %s", self.extra_servers)
    self.start_additional_servers(self.extra_servers)

    # Give sometime for the additional server to come up.
    for retry in range(0, 10):
        scan_info = self.get_dmg_command().system_query()
        if not check_system_query_status(scan_info):
            if retry == 9:
                self.fail("One or more servers not in expected status")
        else:
            break

    for rank_index, rank_val in enumerate(self.rank):
        # If total pools less than 3, extend only a single pool.
        # If total pools >= 3 : Extend only 3 pools.
        if num_pool >= len(self.rank):
            val = rank_index
        else:
            val = 0
        self.pool = pool[val]
        self.pool.display_pool_daos_space("Pool space: Beginning")
        pver_begin = self.get_pool_version()
        self.log.info("Pool Version at the beginning %s", pver_begin)
        # Enable aggregation for multiple pool testing only.
        if self.test_during_aggregation is True and (num_pool > 1):
            self.delete_extra_container(self.pool)
        output = self.dmg_command.pool_extend(self.pool.uuid, rank_val)
        self.print_and_assert_on_rebuild_failure(output)

        pver_extend = self.get_pool_version()
        self.log.info("Pool Version after extend %d", pver_extend)
        # Check pool version incremented after pool extend
        self.assertTrue(pver_extend > pver_begin,
                        "Pool Version Error: After extend")

        display_string = "Pool{} space at the End".format(val)
        pool[val].display_pool_daos_space(display_string)

        if data:
            # Perform the IOR read using the same
            # daos object class used for write.
            if val < len(oclass):
                index = val
            else:
                index = 0
            self.run_ior_thread("Read", oclass[index], test_seq)
            self.run_mdtest_thread(oclass[index])
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)