def build_ior_script(self, job):
    """Assemble an IOR command line and wrap it in a slurm batch script.

    The job section of the yaml names a jobspec section; the IOR options
    are read from that jobspec section.

    Args:
        job (str): name of the job section under /run/ in the yaml file

    Returns:
        whatever slurm_utils.write_slurm_script returns for the new script
        (callers in this file pass it to run_slurm_script and os.remove)

    """
    # Job-level settings
    job_path = "/run/" + job + "/"
    job_name = self.params.get("name", job_path)
    job_nodes = self.params.get("nodes", job_path)
    job_processes = self.params.get("process_per_node", job_path)
    job_spec = self.params.get("jobspec", job_path)

    # IOR command settings, looked up under the jobspec section
    spec_path = "/run/" + job_spec + "/"
    iteration = self.params.get("iter", spec_path + 'iteration/')
    ior_flags = self.params.get("F", spec_path + 'iorflags/')
    transfer_size = self.params.get("t", spec_path + 'transfersize/')
    record_size = self.params.get("r", spec_path + 'recordsize/*')
    stripe_size = self.params.get("s", spec_path + 'stripesize/*')
    stripe_count = self.params.get("c", spec_path + 'stripecount/')
    async_io = self.params.get("a", spec_path + 'asyncio/')
    object_class = self.params.get("o", spec_path + 'objectclass/')

    # Side effect: the partition is stored on the test object
    self.partition = self.params.get(
        "partition", '/run/hosts/test_machines/')

    pool_uuid = self.pool.get_uuid_str()

    # Colon-separated list of the first self.createsvc service ranks
    svc_ranks = [
        int(self.pool.svc.rl_ranks[index])
        for index in range(self.createsvc)]
    svc_list = ":".join([str(rank) for rank in svc_ranks])

    block_size = '1536m'
    # An 8m stripe size overrides the transfer size read from the yaml
    transfer_size = stripe_size if stripe_size == '8m' else transfer_size

    hostfile = os.path.join(self.tmpdir, "ior_hosts_" + job_name)
    cmd = ior_utils.get_ior_cmd(
        ior_flags, iteration, block_size, transfer_size, pool_uuid,
        svc_list, record_size, stripe_size, stripe_count, async_io,
        object_class, self.basepath, hostfile, job_processes)

    output = os.path.join(self.tmpdir, job_name + "_results.out")
    return slurm_utils.write_slurm_script(
        self.tmpdir, job_name, output, int(job_nodes), [cmd])
def test_soak_3(self):
    """Submit an IOR job and a dmg job and require both to complete.

    Test ID: DAOS-2192
    Test Description: this time try a dmg command combined with IOR run
    Use Cases:

    :avocado: tags=soak3
    """
    script1 = None
    script2 = None
    try:
        # retrieve IOR job parameters and submit the IOR job
        script1 = self.build_ior_script('job1')
        job_id1 = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

        # now do the dmg job
        dmgcmds = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        s3_job2_name = self.params.get("name", '/run/job3/')
        s3_job2_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s3_job2_name + "_results.out")
        script2 = slurm_utils.write_slurm_script(
            self.tmpdir, s3_job2_name, output, s3_job2_nodes, dmgcmds)
        job_id2 = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

        # wait for all the jobs to finish
        # NOTE(review): this loop has no timeout of its own; presumably
        # the maxwait given to register_for_job_results guarantees that a
        # result is eventually recorded for each job - confirm.
        while len(self.soak_results) < 2:
            time.sleep(10)

        # items() works on both Python 2 and Python 3
        for job, result in self.soak_results.items():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 3 Failed\n {}".format(error))
    finally:
        # Best-effort cleanup: only remove scripts that were actually
        # created, and only ignore filesystem errors so that a cleanup
        # problem never hides the real test outcome.
        for script in (script1, script2):
            if script is None:
                continue
            try:
                os.remove(script)
            except OSError:
                pass
def test_soak_3(self):
    """Submit an IOR job and a dmg job and require both to complete.

    Test ID: DAOS-2192
    Test Description: this time try a dmg command combined with IOR run
    Use Cases:

    :avocado: tags=soak3
    """
    script1 = None
    script2 = None
    try:
        # retrieve IOR job parameters and submit the IOR job
        script1 = self.build_ior_script('job1')
        job_id1 = slurm_utils.run_slurm_script(script1)
        slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

        # now do the dmg job
        dmgcmds = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        s3_job2_name = self.params.get("name", '/run/job3/')
        s3_job2_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s3_job2_name + "_results.out")
        script2 = slurm_utils.write_slurm_script(
            self.tmpdir, s3_job2_name, output, s3_job2_nodes, dmgcmds)
        job_id2 = slurm_utils.run_slurm_script(script2)
        slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

        # wait for all the jobs to finish
        # NOTE(review): this loop has no timeout of its own; presumably
        # the maxwait given to register_for_job_results guarantees that a
        # result is eventually recorded for each job - confirm.
        while len(self.soak_results) < 2:
            time.sleep(10)

        # items() works on both Python 2 and Python 3
        for job, result in self.soak_results.items():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 3 Failed\n {}".format(error))
    finally:
        # Best-effort cleanup: only remove scripts that were actually
        # created, and only ignore filesystem errors so that a cleanup
        # problem never hides the real test outcome.
        for script in (script1, script2):
            if script is None:
                continue
            try:
                os.remove(script)
            except OSError:
                pass
def build_job_script(self, commands, job, ppn, nodesperjob):
    """Write one slurm batch script per command line.

    Every script starts with two setup commands: a "mkdir -p" of the
    directory named by the DAOS_TEST_LOG_DIR environment variable, and the
    string form of the first agent manager's job followed by "&".

    Args:
        commands (list): (cmdline, log_name) pairs
        job (str): the job name that will be defined in the slurm script
        ppn (int): number of tasks to run on each node
        nodesperjob (int): node count handed to write_slurm_script; also
            multiplied by ppn to form part of the log file names

    Returns:
        list: the write_slurm_script result for each command, in order

    """
    self.log.info("<<Build Script>> at %s", time.ctime())

    # Start the daos_agent in the batch script for now
    # TO-DO: daos_agents start with systemd
    make_log_dir = "mkdir -p {}".format(
        os.environ.get("DAOS_TEST_LOG_DIR"))
    start_agent = " ".join([str(self.agent_managers[0].manager.job), "&"])
    setup_cmds = [make_log_dir, start_agent]

    scripts = []
    # Suffixes already handed out in this call; passed to
    # get_random_string (presumably so it avoids repeats - confirm)
    taken = []
    for cmdline, log_name in commands:
        # stdout and stderr files share a stem; stderr adds "ERROR_"
        stem = ("%N_" + self.test_name + "_" + job + "_%j_%t_"
                + str(ppn * nodesperjob) + "_" + log_name + "_")
        output = os.path.join(self.test_log_dir, stem)
        error = os.path.join(self.test_log_dir, stem + "ERROR_")

        sbatch = dict(
            time=str(self.job_timeout) + ":00",
            exclude=NodeSet.fromlist(self.exclude_slurm_nodes),
            error=str(error))
        # include the cluster specific params
        sbatch.update(self.srun_params)

        suffix = get_random_string(5, taken)
        scripts.append(
            slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                setup_cmds + [cmdline], suffix, sbatch))
        taken.append(suffix)
    return scripts
def build_job_script(self, commands, job, nodesperjob):
    """Write one slurm batch script per list of command lines.

    Each script runs "set -e", queries both pools, runs the supplied
    command lines, queries both pools again and ends with "exit $status".

    Args:
        commands (list): (cmdlines, log_name) pairs; cmdlines may be a
            single string or a list of strings
        job (str): the job name that will be defined in the slurm script;
            also selects the /run/<job>/* yaml section for job_timeout
        nodesperjob (int): node count handed to write_slurm_script

    Returns:
        list: the write_slurm_script result for each entry, in order

    """
    # Defaults to 10 when the yaml does not define job_timeout
    job_timeout = self.params.get("job_timeout", "/run/" + job + "/*", 10)
    self.log.info("<<Build Script>> at %s", time.ctime())

    # Pool queries wrapped around the job's own commands (pool 1 first)
    pool_queries = [
        "/usr/bin/daos pool query --pool {} ".format(self.pool[index].uuid)
        for index in (1, 0)]
    header = ["set -e"] + pool_queries
    footer = pool_queries + ["exit $status"]

    scripts = []
    for cmd, log_name in commands:
        cmd_lines = [cmd] if isinstance(cmd, str) else cmd

        output = os.path.join(
            self.test_log_dir,
            self.test_name + "_" + log_name + "_%N_" + "%j_")
        # stderr goes to the stdout path with "ERROR_" appended
        error = str(output) + "ERROR_"

        sbatch = dict(
            time=str(job_timeout) + ":00",
            exclude=NodeSet.fromlist(self.exclude_slurm_nodes),
            error=str(error),
            export="ALL")
        # include the cluster specific params
        sbatch.update(self.srun_params)

        suffix = get_random_string(5, self.used)
        scripts.append(
            slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                header + cmd_lines + footer, suffix, sbatch))
        self.used.append(suffix)
    return scripts
def build_job_script(self, commands, job, ppn, nodesperjob):
    """Write one slurm batch script per list of command lines.

    Args:
        commands (list): (cmdlines, log_name) pairs; cmdlines may be a
            single string or a list of strings
        job (str): the job name that will be defined in the slurm script
        ppn (int): number of tasks to run on each node
        nodesperjob (int): node count handed to write_slurm_script; also
            multiplied by ppn to form part of the log file names

    Returns:
        list: the write_slurm_script result for each entry, in order

    """
    self.log.info("<<Build Script>> at %s", time.ctime())

    # Placeholder for commands that should precede every job's cmdlines
    leading_cmds = []

    scripts = []
    for cmd, log_name in commands:
        cmd_lines = [cmd] if isinstance(cmd, str) else cmd

        # stdout and stderr files share a stem; stderr adds "ERROR_"
        stem = ("%N_" + self.test_name + "_" + job + "_%j_%t_"
                + str(ppn * nodesperjob) + "_" + log_name + "_")
        output = os.path.join(self.test_log_dir, stem)
        error = os.path.join(self.test_log_dir, stem + "ERROR_")

        sbatch = dict(
            time=str(self.job_timeout) + ":00",
            exclude=NodeSet.fromlist(self.exclude_slurm_nodes),
            error=str(error),
            export="ALL")
        # include the cluster specific params
        sbatch.update(self.srun_params)

        suffix = get_random_string(5, self.used)
        scripts.append(
            slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                leading_cmds + cmd_lines, suffix, sbatch))
        self.used.append(suffix)
    return scripts
def test_soak_2(self):
    """Submit a dmg job through slurm and require it to complete.

    Test ID: DAOS-2192
    Test Description: This test verifies that a dmg script can be
    submitted.

    :avocado: tags=soak2
    """
    script = None
    try:
        dmgcmds = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        s2_job1_name = self.params.get("name", '/run/job3/')
        s2_job1_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")
        script = slurm_utils.write_slurm_script(
            self.tmpdir, s2_job1_name, output, s2_job1_nodes, dmgcmds)
        job_id = slurm_utils.run_slurm_script(script)
        slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

        # wait for all the jobs to finish
        # NOTE(review): this loop has no timeout of its own; presumably
        # the maxwait given to register_for_job_results guarantees that a
        # result is eventually recorded - confirm.
        while len(self.soak_results) < 1:
            time.sleep(10)

        # items() works on both Python 2 and Python 3
        for job, result in self.soak_results.items():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 2 Failed\n {}".format(error))
    finally:
        # Best-effort cleanup. Skip it when the script was never written,
        # and ignore filesystem errors so that a cleanup problem cannot
        # replace the exception that describes the real test outcome.
        if script is not None:
            try:
                os.remove(script)
            except OSError:
                pass
def build_job_script(self, commands, job, ppn, nodesperjob):
    """Write one slurm batch script per command line.

    Every script first launches daos_agent through srun in the background
    and then runs the command line.

    Args:
        commands (list): commandlines
        job (str): the job name that will be defined in the slurm script
        ppn (int): number of tasks to run on each node; only used in the
            output file name here
        nodesperjob (int): node count handed to write_slurm_script

    Returns:
        list: the write_slurm_script result for each command, in order

    """
    self.log.info("<<Build Script>> at %s", time.ctime())

    # Start the daos_agent in the batch script for now
    # TO-DO: daos_agents start with systemd
    agent_binary = os.path.join(self.bin, "daos_agent")
    agent_config = os.path.join(self.tmp, "daos_agent.yml")
    agent_cmds = [
        "srun -l --mpi=pmi2 --ntasks-per-node=1 "
        "--export=ALL {} -o {} &".format(agent_binary, agent_config)
    ]

    def write_script(cmdline):
        """Create the sbatch script for a single command line."""
        output = os.path.join(
            self.test_log_dir,
            "%N_" + self.test_name + "_" + job + "_%j_%t_" + str(ppn) + "_")
        sbatch = dict(
            time=str(self.job_timeout) + ":00",
            exclude=NodeSet.fromlist(self.exclude_slurm_nodes))
        # include the cluster specific params
        sbatch.update(self.srun_params)
        return slurm_utils.write_slurm_script(
            self.test_log_dir, job, output, nodesperjob,
            agent_cmds + [cmdline], sbatch)

    return [write_script(cmdline) for cmdline in commands]
def test_soak_2(self):
    """Submit a dmg job through slurm and require it to complete.

    Test ID: DAOS-2192
    Test Description: This test verifies that a dmg script can be
    submitted.

    :avocado: tags=soak2
    """
    script = None
    try:
        dmgcmds = dmg_utils.get_dmg_script(
            "dmg1", self.params, self.basepath)
        s2_job1_name = self.params.get("name", '/run/job3/')
        s2_job1_nodes = self.params.get("nodes", '/run/job3/')
        output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")
        script = slurm_utils.write_slurm_script(
            self.tmpdir, s2_job1_name, output, s2_job1_nodes, dmgcmds)
        job_id = slurm_utils.run_slurm_script(script)
        slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

        # wait for all the jobs to finish
        # NOTE(review): this loop has no timeout of its own; presumably
        # the maxwait given to register_for_job_results guarantees that a
        # result is eventually recorded - confirm.
        while len(self.soak_results) < 1:
            time.sleep(10)

        # items() works on both Python 2 and Python 3
        for job, result in self.soak_results.items():
            if result != "COMPLETED":
                self.fail(
                    "Soak job: {} didn't complete as expected: {}".format(
                        job, result))
    except (DaosApiError, ior_utils.IorFailed) as error:
        self.fail("Soak Test 2 Failed\n {}".format(error))
    finally:
        # Best-effort cleanup. Skip it when the script was never written,
        # and ignore filesystem errors so that a cleanup problem cannot
        # replace the exception that describes the real test outcome.
        if script is not None:
            try:
                os.remove(script)
            except OSError:
                pass
def build_job_script(self, nodesperjob, job, pool):
    """Write slurm batch scripts for every cmdline of a yaml-defined job.

    One script is written for each combination of jobspec, generated
    command line and entry in the job's "tasks" list.

    Args:
        nodesperjob (int): number of nodes executing each job
        job (str): the job section to read from /run/<job>/ in the yaml.
            Example job:
                job1:
                    name: job1    - unique name
                    time: 10      - ":00" is appended and the result is
                                    passed as the sbatch "time" value
                    tasks: 1      - number of processes per node
                                    (sbatch ntasks-per-node)
                    jobspec:
                        - ior_daos
                        - ior_mpiio
        pool (obj): TestPool obj

    Raises:
        SoakTestError: if a jobspec name contains neither "ior" nor "dmg"

    Returns:
        list: the write_slurm_script result for each script, in order

    """
    self.log.info("<<Build Script for job %s >> at %s", job, time.ctime())

    # get job params
    section = "/run/" + job + "/"
    job_name = self.params.get("name", section + "*")
    job_specs = self.params.get("jobspec", section + "*")
    task_list = self.params.get("tasks", section + "*")
    # job_time in minutes:seconds format
    job_time = str(self.params.get("time", section + "*")) + ":00"

    scripts = []
    for job_spec in job_specs:
        # Pick the cmdline generator from the jobspec name
        if "ior" in job_spec:
            make_cmdlines = self.create_ior_cmdline
        elif "dmg" in job_spec:
            make_cmdlines = self.create_dmg_cmdline
        else:
            raise SoakTestError(
                "<<FAILED: Soak job: {} Job spec {} is invalid>>".format(
                    job, job_spec))
        cmdlines = make_cmdlines(section, job_spec, pool)

        # a single cmdline per batch job; so that a failure is per cmdline
        # change to multiple cmdlines per batch job later.
        for cmdline in cmdlines:
            for tasks in task_list:
                output = os.path.join(
                    self.rem_pass_dir,
                    "%N_" + self.test_name + "_" + job_name + "_"
                    + job_spec + "_results.out_%j_%t_" + str(tasks) + "_")
                # additional sbatch params
                sbatch = {
                    "ntasks-per-node": tasks,
                    "ntasks": nodesperjob * tasks,
                    "time": job_time,
                    "partition": self.partition_clients,
                    "exclude": self.test_node[0],
                }
                scripts.append(
                    slurm_utils.write_slurm_script(
                        self.rem_pass_dir, job_name, output, nodesperjob,
                        [cmdline], sbatch))
    return scripts