def start_dfuse(self, pool): """Create dfuse start command line for slurm. Args: pool (obj): TestPool obj Returns dfuse(obj): Dfuse obj cmd(list): list of dfuse commands to add to jobscript """ commands = [] # Get Dfuse params dfuse = Dfuse(self.hostlist_clients, self.tmp) dfuse.get_params(self) # update dfuse params; mountpoint for each container unique = get_random_string(5, self.used) self.used.append(unique) mount_dir = dfuse.mount_dir.value + unique dfuse.mount_dir.update(mount_dir) dfuse.set_dfuse_params(pool) dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool)) # create dfuse mount point commands.append(slurm_utils.srun_str( hosts=None, cmd="mkdir -p {}".format(dfuse.mount_dir.value), srun_params=None)) commands.append(slurm_utils.srun_str( hosts=None, cmd="{}".format(dfuse.__str__()), srun_params=None)) commands.append("sleep 10") commands.append(slurm_utils.srun_str( hosts=None, cmd="df -h {}".format(dfuse.mount_dir.value), srun_params=None)) return dfuse, commands
def start_dfuse(self, pool): """Create dfuse start command line for slurm. Args: pool (obj): TestPool obj Returns dfuse(obj): Dfuse obj cmd(list): list of dfuse commands to add to jobscript """ # Get Dfuse params dfuse = Dfuse(self.hostlist_clients, self.tmp) dfuse.get_params(self) # update dfuse params; mountpoint for each container unique = get_random_string(5, self.used) self.used.append(unique) mount_dir = dfuse.mount_dir.value + unique dfuse.mount_dir.update(mount_dir) dfuse.set_dfuse_params(pool) dfuse.set_dfuse_cont_param(self.get_container(pool)) dfuse_start_cmds = [ "mkdir -p {}".format(dfuse.mount_dir.value), "{}".format(dfuse.__str__()), "df -h {}".format(dfuse.mount_dir.value) ] return dfuse, dfuse_start_cmds
def start_dfuse(self, pool, container, nodesperjob, resource_mgr=None,
                name=None, job_spec=None):
    """Create dfuse start command line for slurm.

    Args:
        self (obj): soak obj
        pool (obj): TestPool obj
        container (obj): TestContainer obj
        nodesperjob (int): number of nodes per job
        resource_mgr (str, optional): resource manager, e.g. "SLURM"
        name (str, optional): prefix used for the dfuse log file name
        job_spec (str, optional): job spec namespace in the test yaml

    Returns:
        dfuse (obj): Dfuse obj
        cmd (list): list of dfuse commands to add to jobscript

    """
    # Get Dfuse params
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.namespace = os.path.join(os.sep, "run", job_spec, "dfuse", "*")
    dfuse.get_params(self)
    # update dfuse params; mountpoint for each container
    unique = get_random_string(5, self.used)
    self.used.append(unique)
    mount_dir = dfuse.mount_dir.value + unique
    dfuse.mount_dir.update(mount_dir)
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(container)
    dfuse_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + name + "_${SLURM_JOB_NODELIST}_" +
        "${SLURM_JOB_ID}_" + "daos_dfuse_" + unique)
    dfuse_env = "export D_LOG_MASK=ERR;export D_LOG_FILE={}".format(dfuse_log)
    dfuse_start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        "clush -S -w $SLURM_JOB_NODELIST \"cd {};{};{}\"".format(
            dfuse.mount_dir.value, dfuse_env, dfuse.__str__()),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]
    if resource_mgr == "SLURM":
        cmds = []
        for cmd in dfuse_start_cmds:
            if cmd.startswith("clush") or cmd.startswith("sleep"):
                cmds.append(cmd)
            else:
                cmds.append(get_srun_cmd(cmd, nodesperjob))
        dfuse_start_cmds = cmds
    return dfuse, dfuse_start_cmds


def start_dfuse(self, pool, nodesperjob, resource_mgr=None):
    """Create dfuse start command line for slurm.

    Args:
        self (obj): soak obj
        pool (obj): TestPool obj
        nodesperjob (int): number of nodes per job
        resource_mgr (str, optional): resource manager, e.g. "SLURM"

    Returns:
        dfuse (obj): Dfuse obj
        cmd (list): list of dfuse commands to add to jobscript

    """
    # Get Dfuse params
    dfuse = Dfuse(self.hostlist_clients, self.tmp)
    dfuse.get_params(self)
    # update dfuse params; mountpoint for each container
    unique = get_random_string(5, self.used)
    self.used.append(unique)
    add_containers(self, pool)
    mount_dir = dfuse.mount_dir.value + unique
    dfuse.mount_dir.update(mount_dir)
    dfuse.set_dfuse_params(pool)
    dfuse.set_dfuse_cont_param(self.container[-1])
    dfuse_start_cmds = [
        "mkdir -p {}".format(dfuse.mount_dir.value),
        "clush -w $SLURM_JOB_NODELIST \"cd {};{}\"".format(
            dfuse.mount_dir.value, dfuse.__str__()),
        "sleep 10",
        "df -h {}".format(dfuse.mount_dir.value),
    ]
    if resource_mgr == "SLURM":
        cmds = []
        for cmd in dfuse_start_cmds:
            if cmd.startswith("clush") or cmd.startswith("sleep"):
                cmds.append(cmd)
            else:
                cmds.append(get_srun_cmd(cmd, nodesperjob))
        dfuse_start_cmds = cmds
    return dfuse, dfuse_start_cmds


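# Usage sketch (illustrative only, not part of the original helpers): how a
# job-setup routine might call the start_dfuse() variant above that takes
# (self, pool, nodesperjob, resource_mgr) and splice the returned commands
# into a slurm batch script. The soak object, pool, application_cmdline and
# script_cmds names below are hypothetical stand-ins.
#
#     dfuse, dfuse_start_cmds = start_dfuse(
#         soak, pool, nodesperjob=2, resource_mgr="SLURM")
#     script_cmds = list(dfuse_start_cmds)
#     script_cmds.append(application_cmdline)  # e.g. an ior or fio cmdline
#     # ... then write script_cmds into the sbatch script, e.g. with
#     # slurm_utils.write_slurm_script(), as build_job_script() does below.

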
class SoakTestBase(TestWithServers):
    # pylint: disable=too-many-public-methods
    """Execute DAOS Soak test cases.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a SoakBase object."""
        super(SoakTestBase, self).__init__(*args, **kwargs)
        self.failed_job_id_list = None
        self.test_log_dir = None
        self.exclude_slurm_nodes = None
        self.loop = None
        self.log_dir = None
        self.outputsoakdir = None
        self.test_name = None
        self.local_pass_dir = None
        self.dfuse = None
        self.test_timeout = None
        self.end_time = None
        self.job_timeout = None
        self.nodesperjob = None
        self.task_list = None
        self.soak_results = None
        self.srun_params = None
        self.pool = None
        self.container = None
        self.test_iteration = None
        self.h_list = None
        self.harasser_joblist = None
        self.harasser_results = None
        self.harasser_timeout = None
        self.all_failed_jobs = None
        self.username = None

    def setUp(self):
        """Define test setup to be done."""
        self.log.info("<<setUp Started>> at %s", time.ctime())
        # Start the daos_agents in the job scripts
        self.setup_start_servers = True
        self.setup_start_agents = False
        super(SoakTestBase, self).setUp()
        self.username = getuser()
        # Initialize loop param for all tests
        self.loop = 1
        self.exclude_slurm_nodes = []
        # Setup logging directories for soak logfiles
        # self.output dir is an avocado directory .../data/
        self.log_dir = self.params.get("logdir", "/run/*")
        self.outputsoakdir = self.outputdir + "/soak"
        # Create the remote log directories on all client nodes
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        # Fail if slurm partition daos_client is not defined
        if not self.client_partition:
            raise SoakTestError(
                "<<FAILED: Partition is not correctly setup for daos "
                "slurm partition>>")
        # Check if the server nodes are in the client list;
        # this will happen when only one partition is specified
        for host_server in self.hostlist_servers:
            if host_server in self.hostlist_clients:
                self.hostlist_clients.remove(host_server)
                self.exclude_slurm_nodes.append(host_server)
        self.log.info(
            "<<Updated hostlist_clients %s >>", self.hostlist_clients)
        if not self.hostlist_clients:
            self.fail(
                "There are no nodes that are client only; "
                "check if the partition also contains server nodes")
        # Include test node for log cleanup; remove from client list
        local_host_list = include_local_host(None)
        self.exclude_slurm_nodes.extend(local_host_list)
        # Start an agent on the test control host to enable API calls for
        # reserved pool and containers. The test control host should be the
        # last host in the hostlist_clients list.
        agent_groups = {self.server_group: local_host_list}
        self.start_agents(agent_groups)

    def pre_tear_down(self):
        """Tear down any test-specific steps prior to running tearDown().

        Returns:
            list: a list of error strings to report after all tear down
                steps have been attempted

        """
        errors = []
        # clear out any jobs in squeue
        if self.failed_job_id_list:
            self.log.info(
                "<<Cancel jobs in queue with ids %s >>",
                self.failed_job_id_list)
            status = process.system("scancel --partition {} -u {}".format(
                self.client_partition, self.username))
            if status > 0:
                errors.append("Failed to cancel jobs {}".format(
                    self.failed_job_id_list))
        if self.all_failed_jobs:
            errors.append("SOAK FAILED: The following jobs failed {} ".format(
                " ,".join(str(j_id) for j_id in self.all_failed_jobs)))
        # One last attempt to copy any logfiles from client nodes
        try:
            self.get_remote_logs()
        except SoakTestError as error:
            self.log.info("Remote copy failed with %s", error)
        # daos_agent is always started on this node when start agent is false
        if not self.setup_start_agents:
            self.hostlist_clients = [socket.gethostname().split('.', 1)[0]]
        return errors

    def tearDown(self):
        """Define tearDown and clear any left over jobs in squeue."""
        # Perform any test-specific tear down steps and collect any
        # reported errors
        self.log.info("<<tearDown Started>> at %s", time.ctime())
        super(SoakTestBase, self).tearDown()

    def job_done(self, args):
        """Call this function when a job is done.

        Args:
            args (dict): handle -- which job, i.e. the job ID,
                         state -- string indicating job completion status
        """
        self.soak_results[args["handle"]] = args["state"]

    def add_pools(self, pool_names):
        """Create a list of pools that the various tests use for storage.

        Args:
            pool_names: list of pool namespaces from yaml file
                        /run/<test_params>/poollist/*
        """
        for pool_name in pool_names:
            path = "".join(["/run/", pool_name, "/*"])
            # Create a pool and add it to the overall list of pools
            self.pool.append(
                TestPool(self.context, self.log,
                         dmg_command=self.get_dmg_command()))
            self.pool[-1].namespace = path
            self.pool[-1].get_params(self)
            self.pool[-1].create()
            self.log.info("Valid Pool UUID is %s", self.pool[-1].uuid)

    def get_remote_logs(self):
        """Copy files from remote dir to local dir.

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        # copy the files from the remote
        # TO-DO: change scp
        this_host = socket.gethostname()
        rsync_str = "rsync -avtr --min-size=1B"
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "bash -c \"{0} {1} {2}:{1}/.. && rm -rf {1}/*\"".format(
                rsync_str, self.test_log_dir, this_host),
            self.srun_params)
        if result.exit_status == 0:
            cmd = "cp -R -p {0}/ \'{1}\'; rm -rf {0}/*".format(
                self.test_log_dir, self.outputsoakdir)
            try:
                result = process.run(cmd, shell=True, timeout=30)
            except process.CmdError as error:
                raise SoakTestError(
                    "<<FAILED: Soak remote logfiles not copied "
                    "to avocado data dir {} - check /tmp/soak "
                    "on nodes {}>>".format(error, self.hostlist_clients))
        else:
            raise SoakTestError(
                "<<FAILED: Soak remote logfiles not copied "
                "from clients>>: {}".format(self.hostlist_clients))

    def is_harasser(self, harasser):
        """Check if harasser is defined in yaml.

        Args:
            harasser (str): harasser to check for in the yaml harasserlist

        Returns:
            bool

        """
        return self.h_list and harasser in self.h_list

    def launch_harassers(self, harassers, pools):
        """Launch any harasser tests if defined in yaml.

        Args:
            harassers (list): list of harassers to launch
            pools (TestPool): pool obj

        """
        job = None
        # Launch harasser after one complete pass
        for harasser in harassers:
            if harasser == "rebuild":
                method = self.launch_rebuild
                ranks = self.params.get(
                    "ranks_to_kill", "/run/" + harasser + "/*")
                param_list = (ranks, pools)
                name = "REBUILD"
            elif harasser in "snapshot":
                method = self.launch_snapshot
                param_list = ()
                name = "SNAPSHOT"
            else:
                raise SoakTestError(
                    "<<FAILED: Harasser {} is not supported. ".format(
                        harasser))
            job = threading.Thread(
                target=method, args=param_list, name=name)
            self.harasser_joblist.append(job)
        # start all harassers
        for job in self.harasser_joblist:
            job.start()

    def harasser_completion(self, timeout):
        """Complete harasser jobs.

        Args:
            timeout (int): timeout in secs

        Returns:
            bool: status

        """
        status = True
        for job in self.harasser_joblist:
            job.join(timeout)
        for job in self.harasser_joblist:
            if job.is_alive():
                self.log.error(
                    "<< HARASSER is alive %s FAILED to join>> ", job.name)
                status &= False
        # Check if the completed job passed
        for harasser, result in self.harasser_results.items():
            if not result:
                self.log.error("<< HARASSER %s FAILED>> ", harasser)
                status &= False
        self.harasser_joblist = []
        return status

    def launch_rebuild(self, ranks, pools):
        """Launch the rebuild process.

        Args:
            ranks (list): Server ranks to kill
            pools (list): list of TestPool obj

        """
        self.log.info("<<Launch Rebuild>> at %s", time.ctime())
        status = True
        for pool in pools:
            # Kill the server
            try:
                pool.start_rebuild(ranks, self.d_log)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error("Rebuild failed to start", exc_info=error)
                status &= False
                break
            # Wait for rebuild to start
            try:
                pool.wait_for_rebuild(True)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Rebuild failed waiting to start", exc_info=error)
                status &= False
                break
            # Wait for rebuild to complete
            try:
                pool.wait_for_rebuild(False)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Rebuild failed waiting to finish", exc_info=error)
                status &= False
                break
        with H_LOCK:
            self.harasser_results["REBUILD"] = status

    def launch_snapshot(self):
        """Create a basic snapshot of the reserved pool."""
        self.log.info("<<Launch Snapshot>> at %s", time.ctime())
        status = True
        # Create container
        container = TestContainer(self.pool[0])
        container.namespace = "/run/container_reserved/*"
        container.get_params(self)
        container.create()
        container.open()
        obj_cls = self.params.get(
            "object_class", '/run/container_reserved/*')
        # write data to object
        data_pattern = get_random_string(500)
        datasize = len(data_pattern) + 1
        dkey = "dkey"
        akey = "akey"
        tx_handle = container.container.get_new_tx()
        obj = container.container.write_an_obj(
            data_pattern, datasize, dkey, akey, obj_cls=obj_cls,
            txn=tx_handle)
        container.container.commit_tx(tx_handle)
        obj.close()
        # Take a snapshot of the container
        snapshot = DaosSnapshot(self.context)
        try:
            snapshot.create(container.container.coh, tx_handle)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Snapshot failed", exc_info=error)
            status &= False
        if status:
            self.log.info("Snapshot Created")
            # write more data to object
            data_pattern2 = get_random_string(500)
            datasize2 = len(data_pattern2) + 1
            dkey = "dkey"
            akey = "akey"
            obj2 = container.container.write_an_obj(
                data_pattern2, datasize2, dkey, akey, obj_cls=obj_cls)
            obj2.close()
            self.log.info("Wrote additional data to container")
            # open the snapshot and read the data
            obj.open()
            snap_handle = snapshot.open(container.container.coh)
            try:
                data_pattern3 = container.container.read_an_obj(
                    datasize, dkey, akey, obj, txn=snap_handle.value)
            except (RuntimeError, TestFail, DaosApiError) as error:
                self.log.error(
                    "Error when retrieving the snapshot data %s", error)
                status &= False
            if status:
                # Compare the snapshot to the original written data.
                if data_pattern3.value != data_pattern:
                    self.log.error("Snapshot data miscompare")
                    status &= False
        # Destroy the snapshot
        try:
            snapshot.destroy(container.container.coh)
        except (RuntimeError, TestFail, DaosApiError) as error:
            self.log.error("Failed to destroy snapshot %s", error)
            status &= False
        # cleanup
        container.close()
        container.destroy()
        with H_LOCK:
            self.harasser_results["SNAPSHOT"] = status

    def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
        """Create an IOR cmdline to run in slurm batch.

        Args:
            job_spec (str): ior job in yaml to run
            pool (obj): TestPool obj
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            commands (list): list of [cmdline, log_name] pairs

        """
        commands = []
        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/*"
        # IOR job specs with a list of parameters; update each value
        api_list = self.params.get("api", ior_params + "*")
        tsize_list = self.params.get("transfer_size", ior_params + "*")
        bsize_list = self.params.get("block_size", ior_params + "*")
        oclass_list = self.params.get("daos_oclass", ior_params + "*")
        # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
        if self.is_harasser("rebuild"):
            oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
        # update IOR cmdline for each additional IOR obj
        for api in api_list:
            for b_size in bsize_list:
                for t_size in tsize_list:
                    for o_type in oclass_list:
                        ior_cmd = IorCommand()
                        ior_cmd.namespace = ior_params
                        ior_cmd.get_params(self)
                        if iteration is not None and iteration < 0:
                            ior_cmd.repetitions.update(1000000)
                        if self.job_timeout is not None:
                            ior_cmd.max_duration.update(self.job_timeout)
                        else:
                            ior_cmd.max_duration.update(10)
                        ior_cmd.api.update(api)
                        ior_cmd.block_size.update(b_size)
                        ior_cmd.transfer_size.update(t_size)
                        ior_cmd.daos_oclass.update(o_type)
                        ior_cmd.set_daos_params(self.server_group, pool)
                        # srun cmdline
                        nprocs = nodesperjob * ppn
                        env = ior_cmd.get_default_env("srun")
                        if ior_cmd.api.value == "MPIIO":
                            env["DAOS_CONT"] = ior_cmd.daos_cont.value
                        cmd = Srun(ior_cmd)
                        cmd.setup_command(env, None, nprocs)
                        cmd.ntasks_per_node.update(ppn)
                        log_name = "{}_{}_{}_{}".format(
                            api, b_size, t_size, o_type)
                        commands.append([cmd.__str__(), log_name])
                        self.log.info(
                            "<<IOR cmdline>>: %s \n", commands[-1].__str__())
        return commands

    def create_dfuse_cont(self, pool):
        """Create a TestContainer object to be used to create container.

        Args:
            pool (obj): TestPool obj

        Returns:
            cuuid: container uuid

        """
        # TO-DO: use daos tool when available
        # This method assumes that the daos agent is running on the test node
        cmd = "daos cont create --pool={} --svc={} --type=POSIX".format(
            pool.uuid, ":".join([str(item) for item in pool.svc_ranks]))
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Dfuse container failed {}>>".format(error))
        self.log.info("Dfuse Container UUID = %s", result.stdout.split()[3])
        return result.stdout.split()[3]

    def start_dfuse(self, pool):
        """Create a DfuseCommand object to start dfuse.

        Args:
            pool (obj): TestPool obj

        """
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)
        # update dfuse params
        self.dfuse.set_dfuse_params(pool)
        self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)
        # create dfuse mount point
        cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
        params = self.srun_params
        params["export"] = "all"
        params["ntasks-per-node"] = 1
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), cmd, params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Dfuse mountpoint {} not created>>".format(
                    self.dfuse.mount_dir.value))
        cmd = self.dfuse.__str__()
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), cmd, params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Dfuse failed to start>>")

    def create_fio_cmdline(self, job_spec, pool):
        """Create the FIO command line.

        Args:
            job_spec (str): fio job in yaml to run
            pool (obj): TestPool obj

        Returns:
            cmd (list): list of cmdlines

        """
        commands = []
        fio_namespace = "/run/{}".format(job_spec)
        # test params
        bs_list = self.params.get("blocksize", fio_namespace + "/soak/*")
        size_list = self.params.get("size", fio_namespace + "/soak/*")
        rw_list = self.params.get("rw", fio_namespace + "/soak/*")
        # Get the parameters for Fio
        fio_cmd = FioCommand()
        fio_cmd.namespace = "{}/*".format(fio_namespace)
        fio_cmd.get_params(self)
        for blocksize in bs_list:
            for size in size_list:
                for rw in rw_list:
                    # update fio params
                    fio_cmd.update(
                        "global", "blocksize", blocksize,
                        "fio --name=global --blocksize")
                    fio_cmd.update(
                        "global", "size", size,
                        "fio --name=global --size")
                    fio_cmd.update(
                        "global", "rw", rw,
                        "fio --name=global --rw")
                    # start dfuse if api is POSIX
                    if fio_cmd.api.value == "POSIX":
                        # Connect to the pool, create container
                        # and then start dfuse
                        self.start_dfuse(pool)
                        fio_cmd.update(
                            "global", "directory",
                            self.dfuse.mount_dir.value,
                            "fio --name=global --directory")
                    # fio command
                    log_name = "{}_{}_{}".format(blocksize, size, rw)
                    commands.append([fio_cmd.__str__(), log_name])
                    self.log.info("<<FIO cmdline>>: %s \n", commands[-1])
        return commands

    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Create a slurm batch script that will execute a list of cmdlines.

        Args:
            commands (list): commandlines and cmd specific log_name
            job (str): the job name that will be defined in the slurm script
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            script_list: list of slurm batch scripts

        """
        self.log.info("<<Build Script>> at %s", time.ctime())
        script_list = []
        # Start the daos_agent in the batch script for now
        # TO-DO: daos_agents start with systemd
        agent_launch_cmds = [
            "mkdir -p {}".format(os.environ.get("DAOS_TEST_LOG_DIR"))
        ]
        agent_launch_cmds.append(
            " ".join([str(self.agent_managers[0].manager.job), "&"]))
        # Create the sbatch script for each cmdline
        used = []
        for cmd, log_name in commands:
            output = os.path.join(
                self.test_log_dir,
                "%N_" + self.test_name + "_" + job + "_%j_%t_" +
                str(ppn * nodesperjob) + "_" + log_name + "_")
            error = os.path.join(
                self.test_log_dir,
                "%N_" + self.test_name + "_" + job + "_%j_%t_" +
                str(ppn * nodesperjob) + "_" + log_name + "_ERROR_")
            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
                "error": str(error)
            }
            # include the cluster specific params
            sbatch.update(self.srun_params)
            unique = get_random_string(5, used)
            script = slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                agent_launch_cmds + [cmd], unique, sbatch)
            script_list.append(script)
            used.append(unique)
        return script_list

    def job_setup(self, job, pool):
        """Create the cmdline needed to launch job.

        Args:
            job (str): single job from test params list of jobs to run
            pool (obj): TestPool obj

        Returns:
            job_cmdlist: list of cmdlines that can be launched by the
                specified job manager

        """
        job_cmdlist = []
        commands = []
        scripts = []
        nodesperjob = []
        self.log.info(
            "<<Job_Setup %s >> at %s", self.test_name, time.ctime())
        for npj in self.nodesperjob:
            # nodesperjob = -1 indicates to use all nodes in client hostlist
            if npj < 0:
                npj = len(self.hostlist_clients)
            if len(self.hostlist_clients) / npj < 1:
                raise SoakTestError(
                    "<<FAILED: There are only {} client nodes for this job. "
                    "Job requires {}".format(
                        len(self.hostlist_clients), npj))
            nodesperjob.append(npj)
        if "ior" in job:
            for npj in nodesperjob:
                for ppn in self.task_list:
                    commands = self.create_ior_cmdline(job, pool, ppn, npj)
                    # scripts are single cmdline
                    scripts = self.build_job_script(commands, job, ppn, npj)
                    job_cmdlist.extend(scripts)
        elif "fio" in job:
            commands = self.create_fio_cmdline(job, pool)
            # scripts are single cmdline
            scripts = self.build_job_script(commands, job, 1, 1)
            job_cmdlist.extend(scripts)
        else:
            raise SoakTestError(
                "<<FAILED: Job {} is not supported. ".format(job))
        return job_cmdlist

    def job_startup(self, job_cmdlist):
        """Submit job batch script.

        Args:
            job_cmdlist (list): list of jobs to execute

        Returns:
            job_id_list: IDs of each job submitted to slurm.

        """
        self.log.info(
            "<<Job Startup - %s >> at %s", self.test_name, time.ctime())
        job_id_list = []
        # before submitting the jobs to the queue, check the job timeout
        if time.time() > self.end_time:
            self.log.info("<< SOAK test timeout in Job Startup>>")
            return job_id_list
        # job_cmdlist is a list of batch script files
        for script in job_cmdlist:
            try:
                job_id = slurm_utils.run_slurm_script(str(script))
            except slurm_utils.SlurmFailed as error:
                self.log.error(error)
                # Force the test to exit with failure
                job_id = None
            if job_id:
                self.log.info(
                    "<<Job %s started with %s >> at %s",
                    job_id, script, time.ctime())
                slurm_utils.register_for_job_results(
                    job_id, self, maxwait=self.test_timeout)
                # keep a list of the job_id's
                job_id_list.append(int(job_id))
            else:
                # one of the jobs failed to queue; exit on first fail for now.
                err_msg = "Slurm failed to submit job for {}".format(script)
                job_id_list = []
                raise SoakTestError(
                    "<<FAILED: Soak {}: {}>>".format(self.test_name, err_msg))
        return job_id_list

    def job_completion(self, job_id_list):
        """Wait for job completion and cleanup.

        Args:
            job_id_list: IDs of each job submitted to slurm

        Returns:
            failed_job_id_list: IDs of each job that failed in slurm

        """
        self.log.info(
            "<<Job Completion - %s >> at %s", self.test_name, time.ctime())
        # If there is nothing to do; exit
        if job_id_list:
            # wait for all the jobs to finish
            while len(self.soak_results) < len(job_id_list):
                # wait for the jobs to complete.
                # enter tearDown before hitting the avocado timeout
                if time.time() > self.end_time:
                    self.log.info("<< SOAK test timeout in Job Completion>>")
                    break
                time.sleep(5)
            # check for job COMPLETED and remove it from the job queue
            for job, result in self.soak_results.items():
                # The queue includes a status of "COMPLETING"
                if result == "COMPLETED":
                    job_id_list.remove(int(job))
                else:
                    self.log.info(
                        "<< Job %s failed with status %s>>", job, result)
            if job_id_list:
                self.log.info(
                    "<<Cancel jobs in queue with IDs %s >>", job_id_list)
                for job in job_id_list:
                    status = slurm_utils.cancel_jobs(int(job))
                    if status == 0:
                        self.log.info(
                            "<<Job %s successfully cancelled>>", job)
                    else:
                        self.log.info("<<Job %s could not be killed>>", job)
            # gather all the logfiles for this pass and cleanup test nodes
            try:
                self.get_remote_logs()
            except SoakTestError as error:
                self.log.info("Remote copy failed with %s", error)
            self.soak_results = {}
        return job_id_list

    def execute_jobs(self, jobs, pools):
        """Execute the overall soak test.

        Args:
            jobs (list): list of jobs to run
            pools (list): list of TestPool obj - self.pool[1:]

        Raises:
            SoakTestError

        """
        cmdlist = []
        # Create the remote log directories for the new loop/pass
        self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "mkdir -p {}".format(self.test_log_dir),
            self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: logfile directory not created on clients>>: "
                "{}".format(self.hostlist_clients))
        # Create local log directory
        os.makedirs(self.local_pass_dir)
        # Setup cmdlines for job with specified pool
        if len(pools) < len(jobs):
            raise SoakTestError(
                "<<FAILED: There are not enough pools to run this test>>")
        for index, job in enumerate(jobs):
            cmdlist.extend(self.job_setup(job, pools[index]))
        # Gather the job_ids
        job_id_list = self.job_startup(cmdlist)
        # Initialize the failed_job_list to job_list so that any
        # unexpected failures will clear the squeue in tearDown
        self.failed_job_id_list = job_id_list
        # launch harassers if defined and enabled
        if self.h_list and self.loop > 1:
            self.log.info("<<Harassers are enabled>>")
            self.launch_harassers(self.h_list, pools)
            if not self.harasser_completion(self.harasser_timeout):
                raise SoakTestError("<<FAILED: Harassers failed ")
            # rebuild can only run once for now
            if self.is_harasser("rebuild"):
                self.h_list.remove("rebuild")
        # Wait for jobs to finish and cancel/kill jobs if necessary
        self.failed_job_id_list = self.job_completion(job_id_list)
        # Log the failing job ID
        if self.failed_job_id_list:
            self.log.info(
                "<<FAILED: The following jobs failed %s >>",
                " ,".join(str(j_id) for j_id in self.failed_job_id_list))
            # accumulate failing job IDs
            self.all_failed_jobs.extend(self.failed_job_id_list)

    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.harasser_joblist = []
        self.harasser_results = {}
        test_to = self.params.get("test_timeout", test_param)
        self.job_timeout = self.params.get("job_timeout", test_param)
        self.harasser_timeout = self.params.get(
            "harasser_timeout", test_param)
        self.test_name = self.params.get("name", test_param)
        self.nodesperjob = self.params.get("nodesperjob", test_param)
        self.test_iteration = self.params.get("iteration", test_param)
        self.task_list = self.params.get("taskspernode", test_param + "*")
        self.h_list = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        pool_list = self.params.get("poollist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        if self.is_harasser("rebuild"):
            obj_class = "_".join([
                "OC", str(
                    self.params.get("daos_oclass", "/run/rebuild/*")[0])])
        else:
            obj_class = self.params.get(
                "object_class", "/run/container_reserved/*")
        slurm_reservation = self.params.get(
            "reservation", "/run/srun_params/*")
        # Srun params
        if self.client_partition is not None:
            self.srun_params = {"partition": self.client_partition}
        if slurm_reservation is not None:
            self.srun_params["reservation"] = slurm_reservation
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        self.add_pools(["pool_reserved"])
        self.pool[0].connect()
        # Create the container and populate with known data
        # TO-DO: use IOR to write and later read verify the data
        self.container = TestContainer(self.pool[0])
        self.container.namespace = "/run/container_reserved/*"
        self.container.get_params(self)
        self.container.create()
        self.container.write_objects(rank, obj_class)
        self.all_failed_jobs = []
        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            "rm -rf {}".format(self.log_dir),
            self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Soak directories not removed from clients>>: "
                "{}".format(self.hostlist_clients))
        # cleanup test_node /tmp/soak
        cmd = "rm -rf {}".format(self.log_dir)
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory on testnode not removed "
                "{}>>".format(error))
        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info(
                "<<Soak1 PASS %s: time until done %s>>", self.loop,
                DDHHMMSS_format(self.end_time - time.time()))
            # Create all specified pools
            self.add_pools(pool_list)
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            try:
                self.execute_jobs(job_list, self.pool[1:])
            except SoakTestError as error:
                self.fail(error)
            errors = self.destroy_pools(self.pool[1:])
            # remove the test pools from self.pool; preserving reserved pool
            self.pool = [self.pool[0]]
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            self.assertEqual(len(errors), 0, "\n".join(errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info(
                "<<PASS %s completed in %s >>", self.loop,
                DDHHMMSS_format(loop_time))
            # # if the time left is less than a loop, exit now
            # if end_time - time.time() < loop_time:
            #     break
            self.loop += 1
        # TO-DO: use IOR
        self.assertTrue(
            self.container.read_objects(),
            "Data verification error on reserved pool "
            "after SOAK completed")
        # gather the daos logs from the client nodes
        self.log.info(
            "<<<<SOAK TOTAL TEST TIME = %s>>>>",
            DDHHMMSS_format(time.time() - start_time))
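

# Minimal subclass sketch (an assumption for illustration, not part of the
# original file): a soak test case would typically derive from SoakTestBase
# and hand run_soak() the yaml namespace holding its job/pool/harasser
# parameters. The class name, avocado tags and the "/run/smoke/" namespace
# below are hypothetical.
class SoakSmoke(SoakTestBase):
    """Run a short soak smoke pass.

    :avocado: recursive
    """

    def test_soak_smoke(self):
        """Run a single pass of ior and fio soak jobs.

        :avocado: tags=soak,soak_smoke
        """
        test_param = "/run/smoke/"
        self.run_soak(test_param)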