def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Creates the reserved pool and a reserved container populated with known
    data, removes the soak log directories, then repeatedly creates a job
    pool and executes the job list until the test timeout expires (a single
    pass only when the test name contains "smoke").  After the loop the
    reserved container is read back; a failed read is recorded in
    self.soak_errors.

    Args:
        test_param (str): test_params from yaml file

    Raises:
        SoakTestError: if the soak log directories could not be removed
            from the client nodes or from the test node

    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []

    # All test specific values are looked up below the test_param namespace
    param_path = test_param + "*"
    test_to = self.params.get("test_timeout", param_path)
    self.job_timeout = self.params.get("job_timeout", param_path)
    self.test_name = self.params.get("name", param_path)
    self.nodesperjob = self.params.get("nodesperjob", param_path)
    self.taskspernode = self.params.get("taskspernode", param_path)
    harassers = self.params.get("harasserlist", param_path)
    job_list = self.params.get("joblist", param_path)
    rank = self.params.get("rank", "/run/container_reserved/*")
    obj_class = self.params.get("oclass", "/run/container_reserved/*")

    if harassers:
        harasserlist = get_harassers(harassers)
        # Work on a copy so the full list can be restored on a later pass
        self.harassers = harasserlist[:]
        run_harasser = True
        self.log.info(
            "<< Initial harrasser list = %s>>", " ".join(self.harassers))

    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()

    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    resv_cont.write_objects(rank, obj_class)

    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir),
        self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))

    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            run_command(cmd, timeout=30)
        except DaosTestError as error:
            # Chain the original error so its traceback is not lost
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error)) from error

    # Initialize time; test_to is scaled by 3600 to get the run time in
    # seconds
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())

    # NOTE(review): self.loop is read but never assigned in this method;
    # presumably it is initialized by the test setup - confirm.
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))

        # Create pool for jobs
        add_pools(self, ["pool_jobs"])
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))

        # Initialize if harassers: refill the list once it has been emptied
        if run_harasser and not self.harassers:
            self.harasser_results = {}
            self.harasser_args = {}
            self.harassers = harasserlist[:]

        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)

        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.soak_errors.extend(self.destroy_pools(self.pool[1]))

        # remove the test pools from self.pool; preserving reserved pool
        self.container = []
        self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))

        # fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))

        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break

        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())

        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and self.harassers:
            self.harasser_loop_time = loop_time
        self.loop += 1

    # TO-DO: use IOR
    # A failed read is only recorded here; this method does not assert on
    # self.soak_errors after the loop.
    if not resv_cont.read_objects():
        self.soak_errors.append(
            "Data verification error on reserved pool "
            "after SOAK completed")
    self.container.append(resv_cont)

    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Creates the reserved pool and container, writes a 500MB file into the
    reserved container, removes the soak log directories, then repeatedly
    executes the job list until the test timeout expires (a single pass
    only when the test name contains "smoke").  The job pool is either
    created once before the loop (single_test_pool) or created and
    destroyed on every pass.  After the loop the reserved file is read
    back and compared with the file that was written.

    Args:
        test_param (str): test_params from yaml file

    Raises:
        SoakTestError: if the reserved container write or read fails, or if
            the soak log directories could not be removed from the client
            nodes or from the test node

    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []

    # All test specific values are looked up below the test_param namespace
    param_path = test_param + "*"
    test_to = self.params.get("test_timeout", param_path)
    self.test_name = self.params.get("name", param_path)
    single_test_pool = self.params.get(
        "single_test_pool", param_path, True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", param_path)
    job_list = self.params.get("joblist", param_path)

    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        # Work on a copy; entries are popped off one per pass
        harasserlist = harassers[:]

    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])

    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)

    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(
            self, initial_resv_file, self.pool[0], resv_cont,
            num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container write failed>>") from error

    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))

    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir),
        self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))

    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error

    # Initialize time; test_to is scaled by 3600 to get the run time in
    # seconds
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())

    # NOTE(review): self.loop is read but never assigned in this method;
    # presumably it is initialized by the test setup - confirm.
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))

        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info(
                "Current pools: %s",
                " ".join(pool.uuid for pool in self.pool))

        # Initialize harassers: take the next entry from the list for this
        # pass, refilling the list once every entry has been used
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(
                harasser)

        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)

        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []

        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))

        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))

        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break

        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())

        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1

    # verify reserved container data
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        reserved_file_copy(
            self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container read failed>>") from error

    # NOTE(review): cmp is presumably filecmp.cmp; if so its default
    # shallow=True compares os.stat() signatures before file contents -
    # confirm this is the intended strength of verification.
    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append(
            "Data verification error on reserved pool"
            " after SOAK completed")

    for resv_file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(resv_file):
            # The parent directory name ("initial"/"final") becomes the
            # prefix of the archived copy
            file_name = os.path.split(os.path.dirname(resv_file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                resv_file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError:
                # Archiving is best effort: record the failure and continue
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(
                        resv_file))
            os.remove(resv_file)
    self.container.append(resv_cont)

    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Creates the reserved pool and a reserved container populated with known
    data, removes the soak log directories, then repeatedly creates the
    pools named in the pool list and executes the job list against them
    until the test timeout expires (a single pass only when the test name
    contains "smoke").  After the loop the reserved container is read back
    and the test fails if that read reports an error.

    Args:
        test_param (str): test_params from yaml file

    Raises:
        SoakTestError: if the soak log directories could not be removed
            from the client nodes or from the test node

    """
    self.soak_results = {}
    self.pool = []
    self.harasser_joblist = []
    self.harasser_results = {}

    test_to = self.params.get("test_timeout", test_param)
    self.job_timeout = self.params.get("job_timeout", test_param)
    self.harasser_timeout = self.params.get("harasser_timeout", test_param)
    self.test_name = self.params.get("name", test_param)
    self.nodesperjob = self.params.get("nodesperjob", test_param)
    self.test_iteration = self.params.get("iteration", test_param)
    self.taskspernode = self.params.get("taskspernode", test_param + "*")
    self.h_list = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    pool_list = self.params.get("poollist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")

    # With the rebuild harasser the reserved data uses the first configured
    # dfs_oclass entry; otherwise it defaults to OC_SX
    if is_harasser(self, "rebuild"):
        obj_class = "OC_{}".format(
            self.params.get("dfs_oclass", "/run/rebuild/*")[0])
    else:
        obj_class = "OC_SX"

    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()

    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    self.container = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    self.container.write_objects(rank, obj_class)
    self.all_failed_jobs = []

    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir),
        self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))

    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            run_command(cmd, timeout=30)
        except DaosTestError as error:
            # Chain the original error so its traceback is not lost
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error)) from error

    # Initialize time; test_to is scaled by 3600 to get the run time in
    # seconds
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())

    # NOTE(review): self.loop is read but never assigned in this method;
    # presumably it is initialized by the test setup - confirm.
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<Soak1 PASS %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))

        # Create all specified pools
        add_pools(self, pool_list)
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))

        try:
            self.execute_jobs(job_list, self.pool[1:])
        except SoakTestError as error:
            self.fail(error)
        errors = self.destroy_pools(self.pool[1:])

        # remove the test pools from self.pool; preserving reserved pool
        self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s", " ".join(pool.uuid for pool in self.pool))
        self.assertEqual(len(errors), 0, "\n".join(errors))

        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break

        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<PASS %s completed in %s >>", self.loop,
            DDHHMMSS_format(loop_time))
        self.loop += 1

    # TO-DO: use IOR
    self.assertTrue(
        self.container.read_objects(),
        "Data verification error on reserved pool "
        "after SOAK completed")

    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>",
        DDHHMMSS_format(time.time() - start_time))