def get_srun_cmd(cmd, nodesperjob=1, ppn=1, srun_params=None, env=None):
    """Wrap cmdline in a srun cmdline.

    Args:
        cmd (str): cmdline to wrap in srun cmdline
        nodesperjob (int): number of nodes
        ppn (int): processes per node
        srun_params (dict): additional srun params
        env (dict): env variables to pass on cmdline

    Returns:
        str: srun cmdline string

    """
    srun_cmd = Srun(cmd)
    srun_cmd.nodes.update(nodesperjob)
    srun_cmd.ntasks_per_node.update(ppn)
    if srun_params:
        for key, value in srun_params.items():
            # use a default of None so an unknown key raises SoakTestError
            # instead of AttributeError
            key_obj = getattr(srun_cmd, key, None)
            if isinstance(key_obj, BasicParameter):
                key_obj.update(value, key)
            else:
                raise SoakTestError(
                    "<<FAILED: The srun param {} does not exist".format(key))
    if env:
        srun_cmd.assign_environment(env)
    return str(srun_cmd)

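# Usage sketch (illustrative only, not part of the framework): the wrapped
# command, the srun parameter name "partition", and the env variable below
# are assumptions for the sake of example. Any key passed in srun_params
# must exist as a BasicParameter attribute on the Srun command object, or
# a SoakTestError is raised.
#
#     srun_line = get_srun_cmd(
#         "ior -a DFS -b 1G -t 1M",
#         nodesperjob=2,
#         ppn=16,
#         srun_params={"partition": "normal"},
#         env={"D_LOG_FILE": "/tmp/soak_ior.log"})
#     # srun_line is a single srun command string that can be embedded in a
#     # Slurm batch script.
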
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [cmdline, log_name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("daos_oclass", ior_params + "*")
    # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
    if self.is_harasser("rebuild"):
        oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.daos_oclass.update(o_type)
                    ior_cmd.set_daos_params(self.server_group, pool)
                    # srun cmdline
                    nprocs = nodesperjob * ppn
                    env = ior_cmd.get_default_env("srun")
                    if ior_cmd.api.value == "MPIIO":
                        env["DAOS_CONT"] = ior_cmd.daos_cont.value
                    cmd = Srun(ior_cmd)
                    cmd.setup_command(env, None, nprocs)
                    cmd.ntasks_per_node.update(ppn)
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([str(cmd), log_name])
                    self.log.info("<<IOR cmdline>>: %s \n", commands[-1])
    return commands

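# The four nested loops above enumerate the full cross-product of
# api x block_size x transfer_size x object class, one IOR command per
# combination. A minimal, self-contained sketch of the same enumeration
# with itertools.product is shown below; the value lists are hypothetical
# and the helper is illustrative only (the real lists come from the test
# yaml via self.params.get()).
def example_ior_param_matrix():
    """Return the (api, block_size, transfer_size, oclass) combinations."""
    from itertools import product
    api_list = ["DFS", "MPIIO"]      # hypothetical values
    bsize_list = ["1G"]
    tsize_list = ["4k", "1M"]
    oclass_list = ["SX", "RP_2GX"]
    return list(product(api_list, bsize_list, tsize_list, oclass_list))
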
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch_cmds, log_name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    mpi_module = self.params.get(
        "mpi_module", "/run/", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("dfs_oclass", ior_params + "*")
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # check if capable of doing rebuild; if yes then dfs_oclass = RP_*GX
    if is_harasser(self, "rebuild"):
        oclass_list = self.params.get("dfs_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.dfs_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    ior_cmd.set_daos_params(self.server_group, pool)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, nodesperjob, "SLURM")
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    sbatch_cmds.append("exit $status")
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands

def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an MDTEST cmdline to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): mdtest job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch_cmds, log_name] pairs

    """
    commands = []
    mdtest_params = os.path.join(os.sep, "run", job_spec, "*")
    mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")
    # mdtest job specs with a list of parameters; update each value
    api_list = self.params.get("api", mdtest_params)
    write_bytes_list = self.params.get("write_bytes", mdtest_params)
    read_bytes_list = self.params.get("read_bytes", mdtest_params)
    depth_list = self.params.get("depth", mdtest_params)
    flag = self.params.get("flags", mdtest_params)
    oclass_list = self.params.get("dfs_oclass", mdtest_params)
    num_of_files_dirs = self.params.get(
        "num_of_files_dirs", mdtest_params)
    # update mdtest cmdline for each additional mdtest obj
    for api in api_list:
        if api in ["POSIX"] and ppn > 16:
            continue
        for write_bytes in write_bytes_list:
            for read_bytes in read_bytes_list:
                for depth in depth_list:
                    for oclass in oclass_list:
                        # Get the parameters for Mdtest
                        mdtest_cmd = MdtestCommand()
                        mdtest_cmd.namespace = mdtest_params
                        mdtest_cmd.get_params(self)
                        mdtest_cmd.api.update(api)
                        mdtest_cmd.write_bytes.update(write_bytes)
                        mdtest_cmd.read_bytes.update(read_bytes)
                        mdtest_cmd.depth.update(depth)
                        mdtest_cmd.flags.update(flag)
                        mdtest_cmd.num_of_files_dirs.update(
                            num_of_files_dirs)
                        if "POSIX" in api:
                            mdtest_cmd.dfs_oclass.update(None)
                            mdtest_cmd.dfs_dir_oclass.update(None)
                        else:
                            mdtest_cmd.dfs_oclass.update(oclass)
                            mdtest_cmd.dfs_dir_oclass.update(oclass)
                            if "EC" in oclass:
                                # dir oclass cannot be EC; use RP based on rf
                                rf = get_rf(oclass)
                                if rf >= 2:
                                    mdtest_cmd.dfs_dir_oclass.update(
                                        "RP_3G1")
                                elif rf == 1:
                                    mdtest_cmd.dfs_dir_oclass.update(
                                        "RP_2G1")
                                else:
                                    mdtest_cmd.dfs_dir_oclass.update("SX")
                        add_containers(self, pool, oclass)
                        mdtest_cmd.set_daos_params(
                            self.server_group, pool,
                            self.container[-1].uuid)
                        env = mdtest_cmd.get_default_env("srun")
                        sbatch_cmds = [
                            "module load -q {}".format(mpi_module)]
                        # include dfuse cmdlines
                        log_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
                            job_spec, api, write_bytes, read_bytes, depth,
                            oclass, nodesperjob * ppn, nodesperjob, ppn)
                        if api in ["POSIX"]:
                            dfuse, dfuse_start_cmdlist = start_dfuse(
                                self, pool, self.container[-1], nodesperjob,
                                "SLURM", name=log_name, job_spec=job_spec)
                            sbatch_cmds.extend(dfuse_start_cmdlist)
                            mdtest_cmd.test_dir.update(
                                dfuse.mount_dir.value)
                        srun_cmd = Srun(mdtest_cmd)
                        srun_cmd.assign_processes(nodesperjob * ppn)
                        srun_cmd.assign_environment(env, True)
                        srun_cmd.ntasks_per_node.update(ppn)
                        srun_cmd.nodes.update(nodesperjob)
                        sbatch_cmds.append(str(srun_cmd))
                        sbatch_cmds.append("status=$?")
                        if api in ["POSIX"]:
                            sbatch_cmds.extend(
                                stop_dfuse(dfuse, nodesperjob, "SLURM"))
                        commands.append([sbatch_cmds, log_name])
                        self.log.info("<<MDTEST %s cmdlines>>:", api)
                        for cmd in sbatch_cmds:
                            self.log.info("%s", cmd)
    return commands

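# get_rf() is referenced above but not defined in this section. The sketch
# below is one plausible implementation, assuming the DAOS EC object class
# naming convention "EC_<data>P<parity>G<groups>" (e.g. EC_2P1G1, EC_4P2GX)
# and that the redundancy factor equals the parity cell count; non-EC
# classes are treated as rf 0. This is an assumption, not the framework's
# actual helper.
import re


def example_get_rf(oclass):
    """Return the redundancy factor encoded in an EC object class name."""
    match = re.search(r"EC_\d+P(\d+)G", oclass)
    return int(match.group(1)) if match else 0
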
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch_cmds, log_name] pairs

    """
    commands = []
    ior_params = os.path.join(os.sep, "run", job_spec, "*")
    ior_timeout = self.params.get("job_timeout", ior_params, 10)
    mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params)
    tsize_list = self.params.get("transfer_size", ior_params)
    bsize_list = self.params.get("block_size", ior_params)
    oclass_list = self.params.get("dfs_oclass", ior_params)
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    # Cancel for ticket DAOS-6095
                    if (api in ["HDF5-VOL", "HDF5", "POSIX"]
                            and t_size == "4k"
                            and o_type in ["RP_2G1", "RP_2GX"]):
                        self.add_cancel_ticket(
                            "DAOS-6095",
                            "IOR -a {} with -t {} and -o {}".format(
                                api, t_size, o_type))
                        continue
                    # Cancel for ticket DAOS-6308
                    if api == "MPIIO" and o_type == "RP_2GX":
                        self.add_cancel_ticket(
                            "DAOS-6308",
                            "IOR -a {} with -o {}".format(api, o_type))
                        continue
                    if api in ["HDF5-VOL", "HDF5", "POSIX"] and ppn > 16:
                        continue
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    ior_cmd.max_duration.update(ior_timeout)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    if api in ["HDF5-VOL", "POSIX"]:
                        ior_cmd.dfs_oclass.update(None)
                        ior_cmd.dfs_dir_oclass.update(None)
                    else:
                        ior_cmd.dfs_oclass.update(o_type)
                        ior_cmd.dfs_dir_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    add_containers(self, pool, o_type)
                    ior_cmd.set_daos_params(
                        self.server_group, pool, self.container[-1].uuid)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    log_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(
                        job_spec, api, b_size, t_size, o_type,
                        nodesperjob * ppn, nodesperjob, ppn)
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, self.container[-1], nodesperjob,
                            "SLURM", name=log_name, job_spec=job_spec)
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands

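# Each entry returned by the create_*_cmdline functions above is
# [sbatch_cmds, log_name]: a list of shell lines plus a log-name stem. The
# sketch below shows one way such an entry could be turned into a runnable
# Slurm batch script; the sbatch directives, output path, and helper name
# are hypothetical, and the framework's actual script generation may
# differ. It relies on the module-level "os" import already used above.
def example_write_sbatch_script(sbatch_cmds, log_name, nodesperjob,
                                output_dir="/tmp"):
    """Write the generated shell lines into a runnable sbatch script."""
    script = os.path.join(output_dir, "{}.sh".format(log_name))
    with open(script, "w") as script_file:
        script_file.write("#!/bin/bash\n")
        script_file.write("#SBATCH --nodes={}\n".format(nodesperjob))
        script_file.write("#SBATCH --output={}.out\n".format(
            os.path.join(output_dir, log_name)))
        for cmd in sbatch_cmds:
            script_file.write(cmd + "\n")
    return script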