Example #1
    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

        # No nodes should be shown in the apstat 'resv' or 'use' counts
        cu = CrayUtils()
        self.assertEqual(cu.count_node_summ('resv'), 0,
                         "No compute node should have an ALPS reservation")
        self.assertEqual(cu.count_node_summ('use'), 0,
                         "No compute node should be in use")

        # The number of compute nodes in State up and batch mode
        # (State = 'UP  B') should equal the number of cray_compute nodes.
        nodes_up_b = cu.count_node_state('UP  B')
        self.logger.info("Nodes with State 'UP  B' : %s" % nodes_up_b)
        nodes_up_i = cu.count_node_state('UP  I')
        self.logger.info("Nodes with State 'UP  I' : %s" % nodes_up_i)
        nodes = self.server.filter(NODE,
                                   {ATTR_rescavail + '.vntype':
                                    'cray_compute'})
        num_cray_compute = len(nodes[ATTR_rescavail + '.vntype=cray_compute'])
        self.assertEqual(nodes_up_b, num_cray_compute)
        self.logger.info("nodes in State 'UP  B': %s == cray_compute: %s" %
                         (nodes_up_b, num_cray_compute))

        # nodes are free and resources are available.
        nodes = self.server.status(NODE)
        for node in nodes:
            self.assertEqual(node['state'], 'free')
            self.assertEqual(node['resources_assigned.ncpus'], '0')
            self.assertEqual(node['resources_assigned.mem'], '0kb')
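
Note: the filter() call above returns a dict keyed by the filter expression. As an illustration only (not part of the original test), the same cray_compute count can be sketched with status(), using only calls that already appear in these examples and the same server and CrayUtils objects from setUp():

# Illustration: count cray_compute vnodes via status() and compare with
# the apstat 'UP  B' count, mirroring the check in setUp() above.
nodes = self.server.status(NODE)
num_cray_compute = len([n for n in nodes
                        if n.get('resources_available.vntype') ==
                        'cray_compute'])
self.assertEqual(cu.count_node_state('UP  B'), num_cray_compute)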
Example #2
    def test_hyperthread(self):
        """
        Check for a compute node that has hyperthreads, if there is one
        submit a job to that node requesting the hyperthreads.  Check
        there are no errors in the job error output.
        If there is no node with hyperthreads, skip the test.
        """
        # Get the compute nodes from PBS and see if they are threaded
        cu = CrayUtils()
        all_nodes = self.server.status(NODE)
        threaded = 0
        for n in all_nodes:
            if n['resources_available.vntype'] == 'cray_compute':
                numthreads = cu.get_numthreads(
                    n['resources_available.PBScraynid'])
                if numthreads > 1:
                    self.logger.info(
                        "Node %s has %s hyperthreads" %
                        (n['resources_available.vnode'], numthreads))
                    ncpus = n['resources_available.ncpus']
                    vnode = n['resources_available.vnode']
                    threaded = 1
                    break
        if not threaded:
            self.skipTest("Test suite needs nodes with hyperthreads")

        # A node with hyperthreads was found; build the aprun arguments
        aprun_args = '-j %d -n %d' % (int(numthreads), int(ncpus))
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(
            TEST_USER, {
                ATTR_l + '.select': '1:ncpus=%d:vnode=%s' %
                (int(ncpus), vnode),
                ATTR_N: 'hyperthread'
            })

        scr = []
        scr += ['hostname\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b %s /bin/hostname\n' % aprun_args]

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        # Verify the contents of the output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(sub_dir,
                                  'hyperthread.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size,
                         0,
                         msg="Job error file should be empty")
Example #3
    def test_hyperthread(self):
        """
        Check for a compute node that has hyperthreads, if there is one
        submit a job to that node requesting the hyperthreads.  Check
        there are no errors in the job error output.
        If there is no node with hyperthreads, skip the test.
        """
        # Get the compute nodes from PBS and see if they are threaded
        cu = CrayUtils()
        all_nodes = self.server.status(NODE)
        threaded = 0
        for n in all_nodes:
            if n['resources_available.vntype'] == 'cray_compute':
                numthreads = cu.get_numthreads(
                    n['resources_available.PBScraynid'])
                if numthreads > 1:
                    self.logger.info("Node %s has %s hyperthreads" %
                                     (n['resources_available.vnode'],
                                      numthreads))
                    ncpus = n['resources_available.ncpus']
                    vnode = n['resources_available.vnode']
                    threaded = 1
                    break
        if not threaded:
            self.skipTest("Test suite needs nodes with hyperthreads")

        # A node with hyperthreads was found; build the aprun arguments
        aprun_args = '-j %d -n %d' % (int(numthreads), int(ncpus))
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=%d:vnode=%s' %
                             (int(ncpus), vnode),
                             ATTR_N: 'hyperthread'})

        scr = []
        scr += ['hostname\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b %s /bin/hostname\n' % aprun_args]

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        # Verify the contents of the output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'hyperthread.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")
Example #4
    def test_cray_login_job(self):
        """
        Submit a simple sleep job that requests to run on a login node
        and expect that job to go into the running state on a login node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {
            ATTR_l + '.vntype': 'cray_login',
            ATTR_N: 'cray_login'
        })

        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a login node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_login'},
                           id=vname,
                           max_attempts=1)

        cu = CrayUtils()
        # Check that the number of compute nodes in use is 0
        self.assertEqual(cu.count_node_summ('use'), 0)

        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(sub_dir, 'cray_login.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size,
                         0,
                         msg="Job error file should be empty")

        output_file = os.path.join(sub_dir,
                                   'cray_login.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")
Example #5
    def test_cray_compute_job(self):
        """
        Submit a simple sleep job that runs on a compute node and
        expect the job to go into the running state on a compute node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {
            ATTR_l + '.vntype': 'cray_compute',
            ATTR_N: 'cray_compute'
        })

        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -B /bin/sleep 10\n']

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a compute node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_compute'},
                           id=vname)
        # Sleep for some time before aprun actually starts
        # using the reservation
        self.logger.info(
            "Sleeping 6 seconds before aprun starts using the reservation")
        time.sleep(6)

        cu = CrayUtils()
        # Check that one compute node holds an ALPS reservation
        self.assertEqual(cu.count_node_summ('resv'), 1)
        if self.du.get_platform() == 'cray':
            # The Cray simulator will not show anything in 'use' because
            # the aprun command is just a pass-through on the simulator
            self.assertEqual(cu.count_node_summ('use'), 1)
        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(sub_dir,
                                  'cray_compute.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size,
                         0,
                         msg="Job error file should be empty")

        output_file = os.path.join(sub_dir,
                                   'cray_compute.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")

        (cu.node_status, cu.node_summary) = cu.parse_apstat_rn()
        self.assertEqual(cu.count_node_summ('resv'), 0)
        if self.du.get_platform() == 'cray':
            self.assertEqual(cu.count_node_summ('use'), 0)
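
The final check re-reads apstat via parse_apstat_rn() and expects the 'resv' count to be back at 0. If ALPS is slow to release the reservation after the job finishes, that assertion could race; a hedged alternative, sketched with the same CrayUtils calls and test context (not part of the original test), is to poll briefly before asserting:

# Sketch: poll apstat until the ALPS reservation count drops to 0
# instead of asserting immediately after the job history shows 'F'.
for _ in range(10):
    (cu.node_status, cu.node_summary) = cu.parse_apstat_rn()
    if cu.count_node_summ('resv') == 0:
        break
    time.sleep(2)
self.assertEqual(cu.count_node_summ('resv'), 0)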
Example #6
    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

        # No nodes should be shown in the apstat 'resv' or 'use' counts
        cu = CrayUtils()
        self.assertEqual(cu.count_node_summ('resv'), 0,
                         "No compute node should have an ALPS reservation")
        self.assertEqual(cu.count_node_summ('use'), 0,
                         "No compute node should be in use")

        # The number of compute nodes in State up and batch mode
        # (State = 'UP  B') should equal the number of cray_compute nodes.
        nodes_up_b = cu.count_node_state('UP  B')
        self.logger.info("Nodes with State 'UP  B' : %s" % nodes_up_b)
        nodes_up_i = cu.count_node_state('UP  I')
        self.logger.info("Nodes with State 'UP  I' : %s" % nodes_up_i)
        nodes = self.server.filter(
            NODE, {ATTR_rescavail + '.vntype': 'cray_compute'})
        num_cray_compute = len(nodes[ATTR_rescavail + '.vntype=cray_compute'])
        self.assertEqual(nodes_up_b, num_cray_compute)
        self.logger.info("nodes in State 'UP  B': %s == cray_compute: %s" %
                         (nodes_up_b, num_cray_compute))

        # nodes are free and resources are available.
        nodes = self.server.status(NODE)
        for node in nodes:
            self.assertEqual(node['state'], 'free')
            self.assertEqual(node['resources_assigned.ncpus'], '0')
            self.assertEqual(node['resources_assigned.mem'], '0kb')
Example #7
    def test_cray_login_job(self):
        """
        Submit a simple sleep job that requests to run on a login node
        and expect that job to go into the running state on a login node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_login',
                             ATTR_N: 'cray_login'})

        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a login node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_login'},
                           id=vname, max_attempts=1)

        cu = CrayUtils()
        # Check that the number of compute nodes in use is 0
        self.assertEqual(cu.count_node_summ('use'), 0)

        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(sub_dir, 'cray_login.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")

        output_file = os.path.join(
            sub_dir, 'cray_login.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")
Example #8
    def test_cray_compute_job(self):
        """
        Submit a simple sleep job that runs on a compute node and
        expect the job to go into the running state on a compute node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_compute',
                             ATTR_N: 'cray_compute'})

        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b -B /bin/sleep 10\n']

        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a compute node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_compute'},
                           id=vname)
        # Sleep for some time before aprun actually starts
        # using the reservation
        self.logger.info(
            "Sleeping 6 seconds before aprun starts using the reservation")
        time.sleep(6)

        cu = CrayUtils()
        # Check that one compute node holds an ALPS reservation
        self.assertEqual(cu.count_node_summ('resv'), 1)
        if self.du.get_platform() == 'cray':
            # The Cray simulator will not show anything in 'use' because
            # the aprun command is just a pass-through on the simulator
            self.assertEqual(cu.count_node_summ('use'), 1)
        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'cray_compute.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")

        output_file = os.path.join(
            sub_dir, 'cray_compute.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")

        (cu.node_status, cu.node_summary) = cu.parse_apstat_rn()
        self.assertEqual(cu.count_node_summ('resv'), 0)
        if self.du.get_platform() == 'cray':
            self.assertEqual(cu.count_node_summ('use'), 0)
Example #9
class TestCheckpoint(TestFunctional):
    """
    This test suite targets Checkpoint functionality.
    """
    abort_file = ''
    cu = CrayUtils()

    def setUp(self):
        TestFunctional.setUp(self)
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        abort_script = """#!/bin/bash
kill $1
exit 0
"""
        self.abort_file = self.du.create_temp_file(body=abort_script)
        self.du.chmod(path=self.abort_file, mode=0o755)
        self.du.chown(path=self.abort_file, uid=0, gid=0, runas=ROOT_USER)
        c = {'$action': 'checkpoint_abort 30 !' + self.abort_file + ' %sid'}
        self.mom.add_config(c)
        self.platform = self.du.get_platform()
        if self.platform not in ('cray', 'craysim'):
            self.attrs = {
                ATTR_l + '.select': '1:ncpus=1',
                ATTR_l + '.place': 'excl'
            }
        else:
            nv = self.cu.num_compute_vnodes(self.server)
            self.assertNotEqual(nv, 0, "No cray_compute vnodes are present.")
            self.attrs = {
                ATTR_l + '.select': '%d:ncpus=1' % nv,
                ATTR_l + '.place': 'scatter'
            }

    def verify_checkpoint_abort(self, jid, stime):
        """
        Verify that checkpoint and abort happened.
        """
        self.ck_dir = os.path.join(self.server.pbs_conf['PBS_HOME'],
                                   'checkpoint', jid + '.CK')
        self.assertTrue(self.du.isdir(path=self.ck_dir, runas=ROOT_USER),
                        msg="Checkpoint directory %s not found" % self.ck_dir)
        _msg1 = "%s;req_holdjob: Checkpoint initiated." % jid
        self.mom.log_match(_msg1, starttime=stime)
        _msg2 = "%s;checkpoint_abort script %s: exit code 0" % (
            jid, self.abort_file)
        self.mom.log_match(_msg2, starttime=stime)
        _msg3 = "%s;checkpointed to %s" % (jid, self.ck_dir)
        self.mom.log_match(_msg3, starttime=stime)
        _msg4 = "%s;task 00000001 terminated" % jid
        self.mom.log_match(_msg4, starttime=stime)

    def start_server_hot(self):
        """
        Start the server with the hot option.
        """
        pbs_exec = self.server.pbs_conf['PBS_EXEC']
        svrname = self.server.pbs_server_name
        pbs_server_hot = [
            os.path.join(pbs_exec, 'sbin', 'pbs_server'), '-t', 'hot'
        ]
        self.du.run_cmd(svrname, cmd=pbs_server_hot, sudo=True)
        self.assertTrue(self.server.isUp())

    def checkpoint_abort_with_qterm_restart_hot(self, qterm_type):
        """
        Checkpointing with qterm -t <type>, hot server restart.
        """

        j1 = Job(TEST_USER, self.attrs)
        j1.set_sleep_time(20)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)

        start_time = int(time.time())
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.qterm(manner=qterm_type)

        self.verify_checkpoint_abort(jid1, start_time)

        self.start_server_hot()
        self.assertTrue(self.server.isUp())

        msg = "%s;Requeueing job, substate: 10 Requeued in queue: workq" % jid1
        self.server.log_match(msg, starttime=start_time)

        # wait for the server to hot start the job
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1, interval=2)
        self.server.expect(JOB, 'exec_vnode', id=jid1, op=SET)
        self.assertFalse(os.path.exists(self.ck_dir),
                         msg=self.ck_dir + " still exists")
        self.server.expect(JOB, {'job_state': 'F'},
                           jid1,
                           extend='x',
                           interval=5)

    def test_checkpoint_abort_with_preempt(self):
        """
        This test verifies that checkpoint_abort works as expected when
        a job is preempted via checkpoint. It does so by submitting a job
        in an express queue that preempts a running job in the default queue.
        """
        self.server.manager(MGR_CMD_SET,
                            SCHED, {'preempt_order': 'C'},
                            runas=ROOT_USER)
        a = {
            'queue_type': 'execution',
            'started': 'True',
            'enabled': 'True',
            'Priority': 200
        }
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq")

        j1 = Job(TEST_USER, self.attrs)
        j1.set_sleep_time(20)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)

        self.attrs['queue'] = 'expressq'
        j2 = Job(TEST_USER, self.attrs)
        j2.set_sleep_time(20)
        start_time = int(time.time())
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid1)

        self.verify_checkpoint_abort(jid1, start_time)

        self.server.expect(JOB, {'job_state': 'F'},
                           jid2,
                           extend='x',
                           interval=5)
        self.server.expect(JOB, {'job_state': 'F'},
                           jid1,
                           extend='x',
                           interval=5)

    def test_checkpoint_abort_with_qhold(self):
        """
        This test uses qhold for checkpointing.
        """
        j1 = Job(TEST_USER, self.attrs)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        start_time = int(time.time())
        self.server.holdjob(jid1)
        self.server.expect(JOB, {'job_state': 'H'}, id=jid1)

        self.verify_checkpoint_abort(jid1, start_time)

    def test_checkpoint_abort_with_qterm_immediate_restart_hot(self):
        """
        This tests checkpointing with qterm -t immediate, hot server restart.
        """
        self.checkpoint_abort_with_qterm_restart_hot("immediate")

    def test_checkpoint_abort_with_qterm_delay_restart_hot(self):
        """
        This tests checkpointing with qterm -t delay, hot server restart.
        """
        self.checkpoint_abort_with_qterm_restart_hot("delay")

    def tearDown(self):
        TestFunctional.tearDown(self)
        try:
            os.remove(self.abort_file)
        except OSError:
            pass
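
The dict passed to mom.add_config() in setUp() becomes a single $action line in the MoM configuration. Roughly, and as an assumption about the rendered form rather than a verified output, with a hypothetical script path of /tmp/abort.sh it would read as in the sketch below, where 30 appears to be the action timeout in seconds and %sid is expanded to the job's session id; consult the PBS Professional documentation for the exact semantics.

# Sketch: the same $action entry with an assumed script path.
abort_file = '/tmp/abort.sh'   # hypothetical path; setUp() uses a temp file
c = {'$action': 'checkpoint_abort 30 !' + abort_file + ' %sid'}
# mom.add_config(c) would write, roughly:
#   $action checkpoint_abort 30 !/tmp/abort.sh %sid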
Example #10
class TestSuspendResumeOnCray(TestFunctional):

    """
    Test special cases where suspend/resume functionality differs on Cray
    compared to other platforms.
    This test suite expects the platform to be 'cray' and assumes that
    the suspend/resume feature is enabled on it.
    """
    cu = CrayUtils()

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    @tags('cray', 'smoke')
    def test_default_restrict_res_to_release_on_suspend_setting(self):
        """
        Check that on Cray restrict_res_to_release_on_suspend is always set
        to 'ncpus' by default
        """

        # Check that the restrict_res_to_release_on_suspend server
        # attribute defaults to 'ncpus'
        a = {ATTR_restrict_res_to_release_on_suspend: 'ncpus'}
        self.server.expect(SERVER, a)

    def test_exclusive_job_not_suspended(self):
        """
        If a running job has exclusive placement then it cannot be
        suspended.
        This test checks for a log message, which is an unstable interface
        and may need to change in the future when the interface changes.
        """

        msg_expected = "BASIL;ERROR: ALPS error: apsched: \
at least resid .* is exclusive"
        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                            ATTR_l + '.place': 'excl'})
        check_after = int(time.time())
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)

        # suspend job
        try:
            self.server.sigjob(jobid=jid, signal="suspend")
        except PbsSignalError as e:
            self.assertTrue("Switching ALPS reservation failed" in e.msg[0])

        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        ehost = job_stat[0]['exec_host'].partition('/')[0]
        run_mom = self.moms[ehost]
        s = run_mom.log_match(msg_expected, starttime=check_after, regexp=True,
                              max_attempts=10)
        self.assertTrue(s)

    @tags('cray')
    def test_basic_admin_suspend_restart(self):
        """
        Test basic admin-suspend functionality for jobs and array jobs with
        a restart on Cray. The restart tests whether a node in maintenance
        recovers properly. After turning off scheduling and restarting the
        server, a subjob is always requeued and the node shows up as free.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend regular job
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        self.server.restart()
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # Adding sleep to avoid failure at resume since PBS licenses
        # might not be available and as a result resume fails
        time.sleep(2)

        # admin-resume regular job. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname)
        self.server.cleanup_jobs()

        # admin-suspend job array
        jA = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1', ATTR_J: '1-2'})
        jidA = self.server.submit(jA)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jidA)

        subjobs = self.server.status(JOB, id=jidA, extend='t')
        # subjobs[0] is the array itself.  Need the subjobs
        jid1 = subjobs[1]['id']
        jid2 = subjobs[2]['id']

        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)

        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname1 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')
        qstat = self.server.status(JOB, 'exec_vnode', id=jid2)
        vname2 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend subjob 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname1)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # admin-resume subjob 1. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname1)

        # admin-suspend subjob 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname2)
        self.server.expect(NODE, {'maintenance_jobs': jid2})

        # Turn off scheduling and restart server
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.restart()

        # Check that nodes are now free
        self.server.expect(NODE, {'state': 'free'}, id=vname1)
        self.server.expect(NODE, {'state': 'free'}, id=vname2)

    def test_admin_suspend_wrong_state(self):
        """
        Check that wrong 'resume' signal is correctly rejected.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.sigjob(jid1, "suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        try:
            self.server.sigjob(jid1, "admin-resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        j2 = Job(TEST_USER)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        self.server.sigjob(jid2, "admin-suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43}, id=jid2)

        try:
            self.server.sigjob(jid2, "resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])

        # The job should be in the same state as it was prior to the signal
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43}, id=jid2)

    def submit_resv(self, resv_start, chunks, resv_dur):
        """
        Function to request a PBS reservation with start time, chunks and
        duration as arguments.
        """
        a = {'Resource_List.select': '%d:ncpus=1:vntype=cray_compute' % chunks,
             'Resource_List.place': 'scatter',
             'reserve_start': int(resv_start),
             'reserve_duration': int(resv_dur)
             }
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        try:
            a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
            d = self.server.expect(RESV, a, id=rid)
        except PtlExpectError as e:
            d = e.rv
        return d

    @timeout(300)
    def test_preempt_STF(self):
        """
        Test shrink to fit by creating a reservation for all compute nodes
        starting in 100 sec. with a duration of two hours.  A preempted STF job
        with min_walltime of 1 min. and max_walltime of 2 hours will stay
        suspended after higher priority job goes away if its
        min_walltime can't be satisfied.
        """
        qname = 'highp'
        a = {'queue_type': 'execution'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        a = {'enabled': 'True', 'started': 'True', 'priority': '150'}
        self.server.manager(MGR_CMD_SET, QUEUE, a, qname)

        # Reserve all the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.")
        now = time.time()
        resv_start = now + 100
        resv_dur = 7200
        d = self.submit_resv(resv_start, nv, resv_dur)
        self.assertTrue(d)

        j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                            ATTR_l + '.place': 'scatter',
                            ATTR_l + '.min_walltime': '00:01:00',
                            ATTR_l + '.max_walltime': '02:00:00'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        self.server.expect(
            JOB, {ATTR_l + '.walltime': (LE, '00:01:40')}, id=jid)
        self.server.expect(
            JOB, {ATTR_l + '.walltime': (GE, '00:01:00')}, id=jid)

        # The sleep below will leave less than 1 minute window for jid
        # after j2id is deleted. The min_walltime of jid can't be
        # satisfied and jid will stay in S state.
        time.sleep(35)

        j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.walltime': '00:01:00',
                             ATTR_l + '.place': 'scatter',
                             ATTR_q: 'highp'})
        j2id = self.server.submit(j2)

        self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)

        # The sleep below will leave less than 1 minute window for jid
        time.sleep(50)

        self.server.delete(j2id)
        a = {'scheduling': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(SERVER, {'server_state': 'Active'})
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)

    def test_multi_express(self):
        """
        Test of multiple express queues of different priorities.
        See that jobs from the higher express queues preempt jobs
        from lower express queues.  Also see when express jobs finish
        (or are deleted), suspended jobs restart.
        Make sure loadLimit is set to 4 on the server node:
        # apmgr config loadLimit 4
        """

        _t = ('\"express_queue, normal_jobs, server_softlimits,' +
              ' queue_softlimits\"')
        a = {'preempt_prio': _t}
        self.scheduler.set_sched_config(a)

        a = {'queue_type': 'e',
             'started': 'True',
             'enabled': 'True',
             'Priority': 150}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq")

        a['Priority'] = 160
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq2")

        a['Priority'] = 170
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq3")

        # Count the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.")

        j1 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600})
        j1id = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j1id)

        j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq'})
        j2id = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j1id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id)

        j3 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq2'})
        j3id = self.server.submit(j3)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j2id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id)

        j4 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq3'})
        j4id = self.server.submit(j4)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j3id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j4id)

        self.server.delete(j4id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id)

    def test_preempted_topjob_calendared(self):
        """
        Test that even if topjob_ineligible is set for a preempted job and
        sched_preempt_enforce_resumption is set to true, the preempted job
        will be calendared.
        """
        self.server.manager(MGR_CMD_SET, SCHED,
                            {'sched_preempt_enforce_resumption': 'true'})
        self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': '2'})

        # Count the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.")

        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                            ATTR_l + '.place': 'scatter',
                            ATTR_l + '.walltime': '120'})
        jid1 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        # Alter topjob_ineligible for the running job
        self.server.alterjob(jid1, {ATTR_W: "topjob_ineligible = true"},
                             runas=ROOT_USER, logerr=True)

        # Create a high priority queue
        a = {'queue_type': 'e', 'started': 't',
             'enabled': 'True', 'priority': '150'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id="highp")

        # Submit a job to high priority queue
        j = Job(TEST_USER, {ATTR_queue: 'highp', ATTR_l + '.walltime': '60'})
        jid2 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)

        # Verify that job1 is calendared
        self.server.expect(JOB, 'estimated.start_time',
                           op=SET, id=jid1)
        qstat = self.server.status(JOB, 'estimated.start_time',
                                   id=jid1)
        est_time = qstat[0]['estimated.start_time']
        self.assertNotEqual(est_time, None)
        self.scheduler.log_match(jid1 + ";Job is a top job",
                                 starttime=self.server.ctime,
                                 max_attempts=10)
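
num_compute_vnodes() is used throughout this suite to size select statements. As an illustration only (not the CrayUtils implementation), an equivalent count can be obtained with the same filter pattern used in the setUp() examples above:

# Sketch: count cray_compute vnodes the way setUp() does elsewhere,
# as a stand-in for cu.num_compute_vnodes(self.server).
nodes = self.server.filter(
    NODE, {ATTR_rescavail + '.vntype': 'cray_compute'})
nv = len(nodes[ATTR_rescavail + '.vntype=cray_compute'])
self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.")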
Example #11
class TestSuspendResumeOnCray(TestFunctional):

    """
    Test special cases where suspend/resume functionality differs on Cray
    compared to other platforms.
    This test suite expects the platform to be 'cray' and assumes that
    the suspend/resume feature is enabled on it.
    """
    cu = CrayUtils()

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    @tags('cray', 'smoke')
    def test_default_restrict_res_to_release_on_suspend_setting(self):
        """
        Check that on Cray restrict_res_to_release_on_suspend is always set
        to 'ncpus' by default
        """

        # Check that the restrict_res_to_release_on_suspend server
        # attribute defaults to 'ncpus'
        a = {ATTR_restrict_res_to_release_on_suspend: 'ncpus'}
        self.server.expect(SERVER, a)

    def test_exclusive_job_not_suspended(self):
        """
        If a running job has exclusive placement then it cannot be
        suspended.
        This test checks for a log message, which is an unstable interface
        and may need to change in the future when the interface changes.
        """

        msg_expected = "BASIL;ERROR: ALPS error: apsched: \
at least resid .* is exclusive"
        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                            ATTR_l + '.place': 'excl'})
        check_after = int(time.time())
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)

        # suspend job
        try:
            self.server.sigjob(jobid=jid, signal="suspend")
        except PbsSignalError as e:
            self.assertTrue("Switching ALPS reservation failed" in e.msg[0])

        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        ehost = job_stat[0]['exec_host'].partition('/')[0]
        run_mom = self.moms[ehost]
        s = run_mom.log_match(msg_expected, starttime=check_after, regexp=True,
                              max_attempts=10)
        self.assertTrue(s)

    @tags('cray')
    def test_basic_admin_suspend_restart(self):
        """
        Test basic admin-suspend functionality for jobs and array jobs with
        a restart on Cray. The restart tests whether a node in maintenance
        recovers properly. After turning off scheduling and restarting the
        server, a subjob is always requeued and the node shows up as free.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend regular job
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        self.server.restart()
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # Adding sleep to avoid failure at resume since PBS licenses
        # might not be available and as a result resume fails
        time.sleep(2)

        # admin-resume regular job. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname)
        self.server.cleanup_jobs()

        # admin-suspend job array
        jA = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1', ATTR_J: '1-2'})
        jidA = self.server.submit(jA)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jidA)

        subjobs = self.server.status(JOB, id=jidA, extend='t')
        # subjobs[0] is the array itself.  Need the subjobs
        jid1 = subjobs[1]['id']
        jid2 = subjobs[2]['id']

        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)

        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname1 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')
        qstat = self.server.status(JOB, 'exec_vnode', id=jid2)
        vname2 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend subjob 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname1)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # admin-resume subjob 1. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname1)

        # admin-suspend subjob 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname2)
        self.server.expect(NODE, {'maintenance_jobs': jid2})

        # Turn off scheduling and restart server
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.restart()

        # Check that nodes are now free
        self.server.expect(NODE, {'state': 'free'}, id=vname1)
        self.server.expect(NODE, {'state': 'free'}, id=vname2)

    def test_admin_suspend_wrong_state(self):
        """
        Check that wrong 'resume' signal is correctly rejected.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.sigjob(jid1, "suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        try:
            self.server.sigjob(jid1, "admin-resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        j2 = Job(TEST_USER)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        self.server.sigjob(jid2, "admin-suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43}, id=jid2)

        try:
            self.server.sigjob(jid2, "resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])

        # The job should be in the same state as it was prior to the signal
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43}, id=jid2)

    def submit_resv(self, resv_start, chunks, resv_dur):
        """
        Function to request a PBS reservation with start time, chunks and
        duration as arguments.
        """
        a = {'Resource_List.select': '%d:ncpus=1:vntype=cray_compute' % chunks,
             'Resource_List.place': 'scatter',
             'reserve_start': int(resv_start),
             'reserve_duration': int(resv_dur)
             }
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        try:
            a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
            d = self.server.expect(RESV, a, id=rid)
        except PtlExpectError as e:
            d = e.rv
        return d