Example #1
def get_srun_cmd(cmd, nodesperjob=1, ppn=1, srun_params=None, env=None):
    """Wrap cmdline in a srun cmdline.

    Args:
        cmd (str): cmdline to wrap in srun cmdline
        nodesperjob (int): number of nodes
        ppn (int): processes per node
        srun_params (dict): additional srun parameters
        env (dict): environment variables to pass on cmdline

    Returns:
        str: srun cmdline string

    """
    srun_cmd = Srun(cmd)
    srun_cmd.nodes.update(nodesperjob)
    srun_cmd.ntasks_per_node.update(ppn)
    if srun_params:
        for key, value in srun_params.items():
            key_obj = getattr(srun_cmd, key)
            if isinstance(key_obj, BasicParameter):
                key_obj.update(value, key)
            else:
                raise SoakTestError(
                    "<<FAILED: The srun param {} does not exist".format(key))
    if env:
        srun_cmd.assign_environment(env)
    return str(srun_cmd)
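
A standalone sketch (not using the DAOS test framework) of the kind of command string this wrapper is expected to produce; the exact flag spelling is determined by the Srun class, and all values below are hypothetical.

def sketch_srun_cmdline(cmd, nodesperjob=1, ppn=1, env=None):
    """Illustrative only: build an srun-style command string by hand."""
    parts = ["srun", "--nodes={}".format(nodesperjob),
             "--ntasks-per-node={}".format(ppn)]
    if env:
        # --export is a standard srun option; the Srun class may render
        # the environment differently.
        exports = ",".join("{}={}".format(k, v) for k, v in env.items())
        parts.append("--export=ALL,{}".format(exports))
    parts.append(cmd)
    return " ".join(parts)


print(sketch_srun_cmdline("ior -a DFS", nodesperjob=2, ppn=16,
                          env={"D_LOG_FILE": "/tmp/daos.log"}))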
Example #2
    def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
        """Create an IOR cmdline to run in slurm batch.

        Args:
            job_spec (str): ior job in yaml to run
            pool (obj): TestPool obj
            ppn (int): number of tasks to run on each node
            nodesperjob (int): number of nodes per job

        Returns:
            list: list of [srun cmdline, log name] pairs

        """
        commands = []

        iteration = self.test_iteration
        ior_params = "/run/" + job_spec + "/*"
        # IOR job specs with a list of parameters; update each value
        api_list = self.params.get("api", ior_params + "*")
        tsize_list = self.params.get("transfer_size", ior_params + "*")
        bsize_list = self.params.get("block_size", ior_params + "*")
        oclass_list = self.params.get("daos_oclass", ior_params + "*")
        # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
        if self.is_harasser("rebuild"):
            oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")
        # update IOR cmdline for each additional IOR obj
        for api in api_list:
            for b_size in bsize_list:
                for t_size in tsize_list:
                    for o_type in oclass_list:
                        ior_cmd = IorCommand()
                        ior_cmd.namespace = ior_params
                        ior_cmd.get_params(self)
                        if iteration is not None and iteration < 0:
                            ior_cmd.repetitions.update(1000000)
                        if self.job_timeout is not None:
                            ior_cmd.max_duration.update(self.job_timeout)
                        else:
                            ior_cmd.max_duration.update(10)
                        ior_cmd.api.update(api)
                        ior_cmd.block_size.update(b_size)
                        ior_cmd.transfer_size.update(t_size)
                        ior_cmd.daos_oclass.update(o_type)
                        ior_cmd.set_daos_params(self.server_group, pool)
                        # srun cmdline
                        nprocs = nodesperjob * ppn
                        env = ior_cmd.get_default_env("srun")
                        if ior_cmd.api.value == "MPIIO":
                            env["DAOS_CONT"] = ior_cmd.daos_cont.value
                        cmd = Srun(ior_cmd)
                        cmd.setup_command(env, None, nprocs)
                        cmd.ntasks_per_node.update(ppn)
                        log_name = "{}_{}_{}_{}".format(
                            api, b_size, t_size, o_type)
                        commands.append([str(cmd), log_name])
                        self.log.info(
                            "<<IOR cmdline>>: %s", commands[-1][0])
        return commands
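
The four nested loops above enumerate the cartesian product of the parameter lists pulled from the yaml job spec; a standalone sketch of the same expansion using itertools.product (parameter values below are hypothetical):

from itertools import product

api_list = ["DFS", "MPIIO"]
bsize_list = ["1G"]
tsize_list = ["4k", "1M"]
oclass_list = ["SX", "RP_2G1"]

for api, b_size, t_size, o_type in product(
        api_list, bsize_list, tsize_list, oclass_list):
    # One IOR/srun command line would be built per combination.
    print("{}_{}_{}_{}".format(api, b_size, t_size, o_type))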
Example #3
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch cmdline list, log name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    mpi_module = self.params.get(
        "mpi_module", "/run/", default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("dfs_oclass", ior_params + "*")
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # check if capable of doing rebuild; if yes then dfs_oclass = RP_*GX
    if is_harasser(self, "rebuild"):
        oclass_list = self.params.get("dfs_oclass", "/run/rebuild/*")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.dfs_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    ior_cmd.set_daos_params(self.server_group, pool)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, nodesperjob, "SLURM")
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    sbatch_cmds.append("exit $status")
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
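
Unlike Example #2, this variant returns a list of shell lines per job rather than a single srun string. A hypothetical helper (not part of the framework shown here) that writes one returned [sbatch_cmds, log_name] entry into a Slurm batch script might look like this; the directives and paths are illustrative only:

import os
import stat


def write_sbatch_script(sbatch_cmds, log_name, out_dir="/tmp"):
    """Illustrative only: dump the shell lines into an executable script."""
    script = os.path.join(out_dir, "{}.sh".format(log_name))
    with open(script, "w") as handle:
        handle.write("#!/bin/bash\n")
        handle.write("#SBATCH --job-name={}\n".format(log_name))
        handle.write("\n".join(sbatch_cmds) + "\n")
    os.chmod(script, os.stat(script).st_mode | stat.S_IEXEC)
    return script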
Example #4
def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an MDTEST cmdline to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): mdtest job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch cmdline list, log name] pairs

    """
    commands = []
    mdtest_params = os.path.join(os.sep, "run", job_spec, "*")
    mpi_module = self.params.get("mpi_module",
                                 "/run/*",
                                 default="mpi/mpich-x86_64")
    # mdtest job specs with a list of parameters; update each value
    api_list = self.params.get("api", mdtest_params)
    write_bytes_list = self.params.get("write_bytes", mdtest_params)
    read_bytes_list = self.params.get("read_bytes", mdtest_params)
    depth_list = self.params.get("depth", mdtest_params)
    flag = self.params.get("flags", mdtest_params)
    oclass_list = self.params.get("dfs_oclass", mdtest_params)
    num_of_files_dirs = self.params.get("num_of_files_dirs", mdtest_params)
    # update mdtest cmdline for each additional mdtest obj
    for api in api_list:
        if api in ["POSIX"] and ppn > 16:
            continue
        for write_bytes in write_bytes_list:
            for read_bytes in read_bytes_list:
                for depth in depth_list:
                    for oclass in oclass_list:
                        # Get the parameters for Mdtest
                        mdtest_cmd = MdtestCommand()
                        mdtest_cmd.namespace = mdtest_params
                        mdtest_cmd.get_params(self)
                        mdtest_cmd.api.update(api)
                        mdtest_cmd.write_bytes.update(write_bytes)
                        mdtest_cmd.read_bytes.update(read_bytes)
                        mdtest_cmd.depth.update(depth)
                        mdtest_cmd.flags.update(flag)
                        mdtest_cmd.num_of_files_dirs.update(num_of_files_dirs)
                        if "POSIX" in api:
                            mdtest_cmd.dfs_oclass.update(None)
                            mdtest_cmd.dfs_dir_oclass.update(None)
                        else:
                            mdtest_cmd.dfs_oclass.update(oclass)
                            mdtest_cmd.dfs_dir_oclass.update(oclass)
                        if "EC" in oclass:
                            # oclass_dir can not be EC must be RP based on rf
                            rf = get_rf(oclass)
                            if rf >= 2:
                                mdtest_cmd.dfs_dir_oclass.update("RP_3G1")
                            elif rf == 1:
                                mdtest_cmd.dfs_dir_oclass.update("RP_2G1")
                            else:
                                mdtest_cmd.dfs_dir_oclass.update("SX")
                        add_containers(self, pool, oclass)
                        mdtest_cmd.set_daos_params(self.server_group, pool,
                                                   self.container[-1].uuid)
                        env = mdtest_cmd.get_default_env("srun")
                        sbatch_cmds = ["module load -q {}".format(mpi_module)]
                        # include dfuse cmdlines
                        log_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
                            job_spec, api, write_bytes, read_bytes, depth,
                            oclass, nodesperjob * ppn, nodesperjob, ppn)
                        if api in ["POSIX"]:
                            dfuse, dfuse_start_cmdlist = start_dfuse(
                                self,
                                pool,
                                self.container[-1],
                                nodesperjob,
                                "SLURM",
                                name=log_name,
                                job_spec=job_spec)
                            sbatch_cmds.extend(dfuse_start_cmdlist)
                            mdtest_cmd.test_dir.update(dfuse.mount_dir.value)
                        srun_cmd = Srun(mdtest_cmd)
                        srun_cmd.assign_processes(nodesperjob * ppn)
                        srun_cmd.assign_environment(env, True)
                        srun_cmd.ntasks_per_node.update(ppn)
                        srun_cmd.nodes.update(nodesperjob)
                        sbatch_cmds.append(str(srun_cmd))
                        sbatch_cmds.append("status=$?")
                        if api in ["POSIX"]:
                            sbatch_cmds.extend(
                                stop_dfuse(dfuse, nodesperjob, "SLURM"))
                        commands.append([sbatch_cmds, log_name])
                        self.log.info("<<MDTEST {} cmdlines>>:".format(api))
                        for cmd in sbatch_cmds:
                            self.log.info("%s", cmd)
    return commands
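
A standalone sketch of the dfs_dir_oclass selection above: directory object classes cannot be erasure coded, so the redundancy factor reported by get_rf() for the EC data class is mapped onto a replicated class. The mapping mirrors the code; the helper name below is hypothetical.

def dir_oclass_for_rf(redundancy_factor):
    """Illustrative only: pick a replicated dir oclass for an EC data oclass."""
    if redundancy_factor >= 2:
        return "RP_3G1"
    if redundancy_factor == 1:
        return "RP_2G1"
    return "SX"


assert dir_oclass_for_rf(2) == "RP_3G1"
assert dir_oclass_for_rf(1) == "RP_2G1"
assert dir_oclass_for_rf(0) == "SX"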
Example #5
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create an IOR cmdline to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: list of [sbatch cmdline list, log name] pairs

    """
    commands = []
    ior_params = os.path.join(os.sep, "run", job_spec, "*")
    ior_timeout = self.params.get("job_timeout", ior_params, 10)
    mpi_module = self.params.get("mpi_module",
                                 "/run/*",
                                 default="mpi/mpich-x86_64")
    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params)
    tsize_list = self.params.get("transfer_size", ior_params)
    bsize_list = self.params.get("block_size", ior_params)
    oclass_list = self.params.get("dfs_oclass", ior_params)
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")
    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    # Cancel for ticket DAOS-6095
                    if (api in ["HDF5-VOL", "HDF5", "POSIX"] and t_size == "4k"
                            and o_type in ["RP_2G1", 'RP_2GX']):
                        self.add_cancel_ticket(
                            "DAOS-6095",
                            "IOR -a {} with -t {} and -o {}".format(
                                api, t_size, o_type))
                        continue
                    # Cancel for ticket DAOS-6308
                    if api == "MPIIO" and o_type == "RP_2GX":
                        self.add_cancel_ticket(
                            "DAOS-6308",
                            "IOR -a {} with -o {}".format(api, o_type))
                        continue
                    if api in ["HDF5-VOL", "HDF5", "POSIX"] and ppn > 16:
                        continue
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    ior_cmd.max_duration.update(ior_timeout)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    if (api in ["HDF5-VOL", "POSIX"]):
                        ior_cmd.dfs_oclass.update(None)
                        ior_cmd.dfs_dir_oclass.update(None)
                    else:
                        ior_cmd.dfs_oclass.update(o_type)
                        ior_cmd.dfs_dir_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(os.path.join("/", "testfile"))
                    add_containers(self, pool, o_type)
                    ior_cmd.set_daos_params(self.server_group, pool,
                                            self.container[-1].uuid)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = ["module load -q {}".format(mpi_module)]
                    # include dfuse cmdlines
                    log_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(
                        job_spec, api, b_size, t_size, o_type,
                        nodesperjob * ppn, nodesperjob, ppn)
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self,
                            pool,
                            self.container[-1],
                            nodesperjob,
                            "SLURM",
                            name=log_name,
                            job_spec=job_spec)
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(dfuse.mount_dir.value, "testfile"))
                    # add envs if api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR {} cmdlines>>:".format(api))
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
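
A standalone sketch of the known-issue filtering at the top of the loop: parameter combinations tracked by open tickets are skipped instead of run. The ticket numbers and combinations are copied from the checks above; the helper itself is hypothetical.

def skip_ticket(api, t_size, o_type):
    """Illustrative only: return the blocking ticket for a combination, if any."""
    if (api in ("HDF5-VOL", "HDF5", "POSIX") and t_size == "4k"
            and o_type in ("RP_2G1", "RP_2GX")):
        return "DAOS-6095"
    if api == "MPIIO" and o_type == "RP_2GX":
        return "DAOS-6308"
    return None


assert skip_ticket("HDF5", "4k", "RP_2G1") == "DAOS-6095"
assert skip_ticket("MPIIO", "1M", "RP_2GX") == "DAOS-6308"
assert skip_ticket("DFS", "4k", "SX") is None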