Beispiel #1
0
    def build_ior_script(self, job):
        """Build an IOR command line and wrap it in a slurm batch script.

        Args:
            job (str): name of the job section to read from the test yaml;
                its values are looked up under /run/<job>/

        Returns:
            the value returned by slurm_utils.write_slurm_script for the
            generated script

        """
        # job level settings
        job_path = "/run/" + job + "/"
        job_name = self.params.get("name", job_path)
        job_nodes = self.params.get("nodes", job_path)
        job_processes = self.params.get("process_per_node", job_path)
        job_spec = self.params.get("jobspec", job_path)

        # IOR command settings live under the section named by "jobspec"
        spec_path = "/run/" + job_spec + "/"
        iteration = self.params.get("iter", spec_path + 'iteration/')
        ior_flags = self.params.get("F", spec_path + 'iorflags/')
        transfer_size = self.params.get("t", spec_path + 'transfersize/')
        record_size = self.params.get("r", spec_path + 'recordsize/*')
        stripe_size = self.params.get("s", spec_path + 'stripesize/*')
        stripe_count = self.params.get("c", spec_path + 'stripecount/')
        async_io = self.params.get("a", spec_path + 'asyncio/')
        object_class = self.params.get("o", spec_path + 'objectclass/')

        self.partition = self.params.get(
            "partition", '/run/hosts/test_machines/')

        pool_uuid = self.pool.get_uuid_str()

        # colon separated list of the first self.createsvc service ranks
        ranks = [int(self.pool.svc.rl_ranks[index])
                 for index in range(self.createsvc)]
        svc_list = ":".join([str(rank) for rank in ranks])

        # a stripe size of 8m replaces the transfer size read from the yaml
        if stripe_size == '8m':
            transfer_size = stripe_size

        hostfile = os.path.join(self.tmpdir, "ior_hosts_" + job_name)
        output = os.path.join(self.tmpdir, job_name + "_results.out")

        # the block size is fixed at 1536m
        cmd = ior_utils.get_ior_cmd(
            ior_flags, iteration, '1536m', transfer_size, pool_uuid,
            svc_list, record_size, stripe_size, stripe_count, async_io,
            object_class, self.basepath, hostfile, job_processes)

        return slurm_utils.write_slurm_script(
            self.tmpdir, job_name, output, int(job_nodes), [cmd])
Beispiel #2
0
    def build_ior_script(self, job):
        """Create a slurm batch script that runs a single IOR command.

        The job name, node count, processes per node and jobspec are read
        from /run/<job>/ in the test yaml; the IOR options are read from the
        /run/<jobspec>/ section.

        Args:
            job (str): which job to read in the yaml file

        Returns:
            the value returned by slurm_utils.write_slurm_script

        """
        # read job info
        job_root = "/run/" + job + "/"
        name = self.params.get("name", job_root)
        nodes = self.params.get("nodes", job_root)
        processes = self.params.get("process_per_node", job_root)
        jobspec = self.params.get("jobspec", job_root)

        # read ior cmd info
        spec_root = "/run/" + jobspec + "/"
        iteration = self.params.get("iter", spec_root + 'iteration/')
        flags = self.params.get("F", spec_root + 'iorflags/')
        transfer_size = self.params.get("t", spec_root + 'transfersize/')
        record_size = self.params.get("r", spec_root + 'recordsize/*')
        stripe_size = self.params.get("s", spec_root + 'stripesize/*')
        stripe_count = self.params.get("c", spec_root + 'stripecount/')
        async_io = self.params.get("a", spec_root + 'asyncio/')
        object_class = self.params.get("o", spec_root + 'objectclass/')

        self.partition = self.params.get(
            "partition", '/run/hosts/test_machines/')

        pool_uuid = self.pool.get_uuid_str()

        # service ranks joined as "r0:r1:..."
        svc_ranks = []
        for position in range(self.createsvc):
            svc_ranks.append(str(int(self.pool.svc.rl_ranks[position])))
        svc_list = ":".join(svc_ranks)

        block_size = '1536m'

        # when the stripe size is 8m the transfer size follows it
        transfer_size = stripe_size if stripe_size == '8m' else transfer_size

        hostfile = os.path.join(self.tmpdir, "ior_hosts_" + name)

        ior_cmd = ior_utils.get_ior_cmd(
            flags, iteration, block_size, transfer_size, pool_uuid, svc_list,
            record_size, stripe_size, stripe_count, async_io, object_class,
            self.basepath, hostfile, processes)

        results_file = os.path.join(self.tmpdir, name + "_results.out")
        return slurm_utils.write_slurm_script(
            self.tmpdir, name, results_file, int(nodes), [ior_cmd])
Beispiel #3
0
    def test_soak_3(self):
        """Submit an IOR job and a dmg job to slurm and check both complete.

        Test ID: DAOS-2192
        Test Description: this time try a dmg command combined with IOR run
        Use Cases:
        :avocado: tags=soak3
        """

        script1 = None
        script2 = None
        try:
            # retrieve IOR job parameters
            script1 = self.build_ior_script('job1')
            job_id1 = slurm_utils.run_slurm_script(script1)
            slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

            # now do the dmg job
            dmgcmds = dmg_utils.get_dmg_script("dmg1", self.params,
                                               self.basepath)

            s3_job2_name = self.params.get("name", '/run/job3/')
            s3_job2_nodes = self.params.get("nodes", '/run/job3/')
            output = os.path.join(self.tmpdir, s3_job2_name + "_results.out")
            script2 = slurm_utils.write_slurm_script(self.tmpdir, s3_job2_name,
                                                     output, s3_job2_nodes,
                                                     dmgcmds)
            job_id2 = slurm_utils.run_slurm_script(script2)
            slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

            # wait for all the jobs to finish
            # NOTE(review): there is no timeout in this loop; it relies on
            # self.soak_results being filled in elsewhere (presumably through
            # register_for_job_results and its maxwait) - confirm
            while len(self.soak_results) < 2:
                time.sleep(10)

            # items() works on both Python 2 and 3; iteritems() is 2 only
            for job, result in self.soak_results.items():
                if result != "COMPLETED":
                    self.fail(
                        "Soak job: {} didn't complete as expected: {}".format(
                            job, result))

        except (DaosApiError, ior_utils.IorFailed) as error:
            self.fail("Soak Test 3 Failed\n {}".format(error))
        finally:
            # Best-effort removal of the generated scripts.  A script that was
            # never created is skipped, and only file system errors are
            # ignored (StandardError does not exist in Python 3).
            for script in (script1, script2):
                if script is None:
                    continue
                try:
                    os.remove(script)
                except OSError:
                    pass
Beispiel #4
0
    def test_soak_3(self):
        """Run an IOR slurm job together with a dmg slurm job.

        Test ID: DAOS-2192
        Test Description: this time try a dmg command combined with IOR run
        Use Cases:
        :avocado: tags=soak3
        """

        script1 = None
        script2 = None
        try:
            # retrieve IOR job parameters
            script1 = self.build_ior_script('job1')
            job_id1 = slurm_utils.run_slurm_script(script1)
            slurm_utils.register_for_job_results(job_id1, self, maxwait=3600)

            # now do the dmg job
            dmgcmds = dmg_utils.get_dmg_script("dmg1", self.params,
                                               self.basepath)

            s3_job2_name = self.params.get("name", '/run/job3/')
            s3_job2_nodes = self.params.get("nodes", '/run/job3/')
            output = os.path.join(self.tmpdir, s3_job2_name + "_results.out")
            script2 = slurm_utils.write_slurm_script(self.tmpdir, s3_job2_name,
                                                     output, s3_job2_nodes,
                                                     dmgcmds)
            job_id2 = slurm_utils.run_slurm_script(script2)
            slurm_utils.register_for_job_results(job_id2, self, maxwait=3600)

            # wait for all the jobs to finish
            # NOTE(review): this loop has no timeout of its own; presumably
            # the registered job callbacks populate self.soak_results within
            # maxwait - confirm
            while len(self.soak_results) < 2:
                time.sleep(10)

            # dict.items() is available on Python 2 and 3 alike
            for job, result in self.soak_results.items():
                if result != "COMPLETED":
                    self.fail("Soak job: {} didn't complete as expected: {}".
                              format(job, result))

        except (DaosApiError, ior_utils.IorFailed) as error:
            self.fail("Soak Test 3 Failed\n {}".format(error))
        finally:
            # Remove whichever scripts were actually written.  Failing to
            # delete one must not hide the test outcome, so OSError is
            # ignored; OSError replaces the Python 2 only StandardError.
            for script in (script1, script2):
                if script is not None:
                    try:
                        os.remove(script)
                    except OSError:
                        pass
Beispiel #5
0
    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Write one slurm batch script for every command line supplied.

        Each script first creates the directory named by the
        DAOS_TEST_LOG_DIR environment variable and launches the first agent
        manager's job in the background, then runs the command line.

        Args:
            commands (list): (cmdline, log_name) pairs; log_name becomes part
                of the job output and error file names
            job (str): the job name that will be defined in the slurm script
            ppn (int): number of tasks to run on each node
            nodesperjob (int): multiplied with ppn for the file names and
                passed on to slurm_utils.write_slurm_script

        Returns:
            list: one entry per command, each the value returned by
                slurm_utils.write_slurm_script

        """
        self.log.info("<<Build Script>> at %s", time.ctime())

        # Start the daos_agent in the batch script for now
        # TO-DO:  daos_agents start with systemd
        make_log_dir = "mkdir -p {}".format(
            os.environ.get("DAOS_TEST_LOG_DIR"))
        start_agent = " ".join([str(self.agent_managers[0].manager.job), "&"])
        preamble = [make_log_dir, start_agent]

        scripts = []
        taken = []
        for cmdline, log_name in commands:
            # output and error files share everything but the ERROR_ suffix
            stem = "_".join([
                "%N", self.test_name, job, "%j", "%t",
                str(ppn * nodesperjob), log_name, ""])
            output = os.path.join(self.test_log_dir, stem)
            error = os.path.join(self.test_log_dir, stem + "ERROR_")

            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
                "error": str(error)
            }
            # cluster specific params override the defaults above
            sbatch.update(self.srun_params)

            # random tag not already handed out during this call
            unique = get_random_string(5, taken)
            scripts.append(
                slurm_utils.write_slurm_script(
                    self.test_log_dir, job, output, nodesperjob,
                    preamble + [cmdline], unique, sbatch))
            taken.append(unique)
        return scripts
Beispiel #6
0
def build_job_script(self, commands, job, nodesperjob):
    """Write a slurm batch script for each entry in a list of commands.

    Every script runs "set -e" and a "daos pool query" for self.pool[1] and
    self.pool[0], then the entry's command lines, then the same two pool
    queries again, and finally "exit $status".

    Args:
        self (obj): soak obj
        commands (list): (cmd, log_name) pairs; cmd is one command line or a
            list of command lines, log_name becomes part of the job output
            and error file names
        job (str): the job name that will be defined in the slurm script; it
            also selects the /run/<job>/* yaml section read for job_timeout
        nodesperjob: passed on to slurm_utils.write_slurm_script
            (presumably the node count - confirm)

    Returns:
        list: one entry per element of commands, each the value returned by
            slurm_utils.write_slurm_script

    """
    # falls back to 10 when the yaml does not define job_timeout
    job_timeout = self.params.get("job_timeout", "/run/" + job + "/*", 10)
    self.log.info("<<Build Script>> at %s", time.ctime())

    # commands wrapped around every batch job
    query = "/usr/bin/daos pool query --pool {} "
    header = ["set -e"]
    header.extend([query.format(self.pool[idx].uuid) for idx in (1, 0)])
    footer = [query.format(self.pool[idx].uuid) for idx in (1, 0)]
    footer.append("exit $status")

    scripts = []
    for cmd, log_name in commands:
        # a bare command line is treated as a one element list
        cmdlines = [cmd] if isinstance(cmd, str) else cmd
        output = os.path.join(
            self.test_log_dir,
            "_".join([self.test_name, log_name, "%N", "%j", ""]))
        sbatch = {
            "time": str(job_timeout) + ":00",
            "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
            "error": str(output) + "ERROR_",
            "export": "ALL"
        }
        # cluster specific params override the defaults above
        sbatch.update(self.srun_params)

        # random tag not yet recorded in self.used
        unique = get_random_string(5, self.used)
        scripts.append(
            slurm_utils.write_slurm_script(
                self.test_log_dir, job, output, nodesperjob,
                header + cmdlines + footer, unique, sbatch))
        self.used.append(unique)
    return scripts
Beispiel #7
0
    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Write a slurm batch script for each entry in a list of commands.

        Args:
            commands (list): (cmd, log_name) pairs; cmd is one command line
                or a list of command lines, log_name becomes part of the job
                output and error file names
            job (str): the job name that will be defined in the slurm script
            ppn (int): number of tasks to run on each node
            nodesperjob (int): multiplied with ppn for the file names and
                passed on to slurm_utils.write_slurm_script

        Returns:
            list: one entry per element of commands, each the value returned
                by slurm_utils.write_slurm_script

        """
        self.log.info("<<Build Script>> at %s", time.ctime())

        # placeholder for cmds that have to run ahead of every job
        leading_cmds = []

        scripts = []
        for cmd, log_name in commands:
            # a bare command line is treated as a one element list
            cmdlines = [cmd] if isinstance(cmd, str) else cmd

            # output and error files share everything but the ERROR_ suffix
            stem = "_".join([
                "%N", self.test_name, job, "%j", "%t",
                str(ppn * nodesperjob), log_name, ""])
            output = os.path.join(self.test_log_dir, stem)
            error = os.path.join(self.test_log_dir, stem + "ERROR_")

            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes),
                "error": str(error),
                "export": "ALL"
            }
            # cluster specific params override the defaults above
            sbatch.update(self.srun_params)

            # random tag not yet recorded in self.used
            unique = get_random_string(5, self.used)
            scripts.append(
                slurm_utils.write_slurm_script(
                    self.test_log_dir, job, output, nodesperjob,
                    leading_cmds + cmdlines, unique, sbatch))
            self.used.append(unique)
        return scripts
Beispiel #8
0
    def test_soak_2(self):
        """Submit a dmg script to slurm and check that the job completes.

        Test ID: DAOS-2192
        Test Description: This test verifies that a dmg script can be submitted.
        :avocado: tags=soak2
        """

        script = None
        try:
            dmgcmds = dmg_utils.get_dmg_script("dmg1", self.params,
                                               self.basepath)

            s2_job1_name = self.params.get("name", '/run/job3/')
            s2_job1_nodes = self.params.get("nodes", '/run/job3/')

            output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")

            script = slurm_utils.write_slurm_script(self.tmpdir, s2_job1_name,
                                                    output, s2_job1_nodes,
                                                    dmgcmds)
            job_id = slurm_utils.run_slurm_script(script)
            slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

            # wait for all the jobs to finish
            # NOTE(review): there is no timeout in this loop; it relies on
            # self.soak_results being filled in elsewhere (presumably through
            # register_for_job_results and its maxwait) - confirm
            while len(self.soak_results) < 1:
                time.sleep(10)

            # items() works on both Python 2 and 3; iteritems() is 2 only
            for job, result in self.soak_results.items():
                if result != "COMPLETED":
                    self.fail(
                        "Soak job: {} didn't complete as expected: {}".format(
                            job, result))

        except (DaosApiError, ior_utils.IorFailed) as error:
            # "\n" (not "/n") so the error starts on its own line
            self.fail("Soak Test 2 Failed\n {}".format(error))
        finally:
            # Best-effort removal of the generated script.  The former
            # "try/finally: pass" suppressed nothing, so os.remove(None)
            # raised here and replaced the real failure whenever the script
            # had not been written yet.
            if script is not None:
                try:
                    os.remove(script)
                except OSError:
                    pass
Beispiel #9
0
    def build_job_script(self, commands, job, ppn, nodesperjob):
        """Write one slurm batch script for every command line supplied.

        Each script launches daos_agent in the background through srun
        before running the command line.

        Args:
            commands (list): commandlines
            job (str): the job name that will be defined in the slurm script
            ppn (int): number of tasks to run on each node; used in the job
                output file name
            nodesperjob: passed on to slurm_utils.write_slurm_script
                (presumably the node count - confirm)

        Returns:
            list: one entry per command, each the value returned by
                slurm_utils.write_slurm_script

        """
        self.log.info("<<Build Script>> at %s", time.ctime())

        # Start the daos_agent in the batch script for now
        # TO-DO:  daos_agents start with systemd
        agent_binary = os.path.join(self.bin, "daos_agent")
        agent_config = os.path.join(self.tmp, "daos_agent.yml")
        start_agent = (
            "srun -l --mpi=pmi2 --ntasks-per-node=1 "
            "--export=ALL {} -o {} &").format(agent_binary, agent_config)

        scripts = []
        for cmdline in commands:
            output = os.path.join(
                self.test_log_dir,
                "_".join(["%N", self.test_name, job, "%j", "%t", str(ppn), ""]))
            sbatch = {
                "time": str(self.job_timeout) + ":00",
                "exclude": NodeSet.fromlist(self.exclude_slurm_nodes)
            }
            # cluster specific params override the defaults above
            sbatch.update(self.srun_params)
            scripts.append(
                slurm_utils.write_slurm_script(
                    self.test_log_dir, job, output, nodesperjob,
                    [start_agent, cmdline], sbatch))
        return scripts
Beispiel #10
0
    def test_soak_2(self):
        """Run a dmg script as a slurm job and verify that it completes.

        Test ID: DAOS-2192
        Test Description: This test verifies that a dmg script can be submitted.
        :avocado: tags=soak2
        """

        script = None
        try:
            dmgcmds = dmg_utils.get_dmg_script("dmg1", self.params,
                                               self.basepath)

            s2_job1_name = self.params.get("name", '/run/job3/')
            s2_job1_nodes = self.params.get("nodes", '/run/job3/')

            output = os.path.join(self.tmpdir, s2_job1_name + "_results.out")

            script = slurm_utils.write_slurm_script(self.tmpdir, s2_job1_name,
                                                    output,
                                                    s2_job1_nodes, dmgcmds)
            job_id = slurm_utils.run_slurm_script(script)
            slurm_utils.register_for_job_results(job_id, self, maxwait=3600)

            # wait for all the jobs to finish
            # NOTE(review): this loop has no timeout of its own; presumably
            # the registered job callback populates self.soak_results within
            # maxwait - confirm
            while len(self.soak_results) < 1:
                time.sleep(10)

            # dict.items() is available on Python 2 and 3 alike
            for job, result in self.soak_results.items():
                if result != "COMPLETED":
                    self.fail("Soak job: {} didn't complete as expected: {}".
                              format(job, result))

        except (DaosApiError, ior_utils.IorFailed) as error:
            # newline escape fixed: the message previously contained "/n"
            self.fail("Soak Test 2 Failed\n {}".format(error))
        finally:
            # Remove the script only if it was written.  A failed delete is
            # ignored so it cannot hide the test outcome; the previous
            # "try/finally: pass" let os.remove(None) raise from here.
            if script is not None:
                try:
                    os.remove(script)
                except OSError:
                    pass
Beispiel #11
0
    def build_job_script(self, nodesperjob, job, pool):
        """Write the slurm batch scripts for every jobspec of a yaml job.

        One script is written for each combination of a generated command
        line and an entry of the job's "tasks" list.

        Args:
            nodesperjob(int): number of nodes executing each job
            job(str): the job that will be defined in the slurm script with
            /run/"job"/.  It is currently defined in the yaml as:
            Example job:
            job1:
                name: job1    - unique name
                time: 10      - cmdline time in seconds; used in IOR -T param
                tasks: 1      - number of processes per node --ntaskspernode
                jobspec:
                    - ior_daos
                    - ior_mpiio
            pool (obj):   TestPool obj

        Returns:
            script_list: list of slurm batch scripts

        Raises:
            SoakTestError: if a jobspec name contains neither "ior" nor "dmg"

        """
        self.log.info("<<Build Script for job %s >> at %s", job, time.ctime())

        # get job params
        job_params = "/run/" + job + "/"
        lookup_path = job_params + "*"
        job_name = self.params.get("name", lookup_path)
        job_specs = self.params.get("jobspec", lookup_path)
        task_list = self.params.get("tasks", lookup_path)
        # job_time in minutes:seconds format
        job_time = str(self.params.get("time", lookup_path)) + ":00"

        script_list = []
        for job_spec in job_specs:
            # pick the cmdline builder that matches the jobspec name
            if "ior" in job_spec:
                make_cmdlines = self.create_ior_cmdline
            elif "dmg" in job_spec:
                make_cmdlines = self.create_dmg_cmdline
            else:
                raise SoakTestError(
                    "<<FAILED: Soak job: {} Job spec {} is invalid>>".format(
                        job, job_spec))

            # a single cmdline per batch job; so that a failure is per cmdline
            # change to multiple cmdlines per batch job  later.
            for cmd in make_cmdlines(job_params, job_spec, pool):
                for tasks in task_list:
                    output = os.path.join(
                        self.rem_pass_dir,
                        "_".join([
                            "%N", self.test_name, job_name, job_spec,
                            "results.out", "%j", "%t", str(tasks), ""]))
                    # additional sbatch params
                    sbatch = {
                        "ntasks-per-node": tasks,
                        "ntasks": nodesperjob * tasks,
                        "time": job_time,
                        "partition": self.partition_clients,
                        "exclude": self.test_node[0]
                    }
                    script_list.append(
                        slurm_utils.write_slurm_script(
                            self.rem_pass_dir, job_name, output, nodesperjob,
                            [cmd], sbatch))
        return script_list