    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

        # No node should be in 'resv' or 'use' in apstat
        cu = CrayUtils()
        self.assertEqual(cu.count_node_summ('resv'), 0,
                         "No compute node should have an ALPS reservation")
        self.assertEqual(cu.count_node_summ('use'), 0,
                         "No compute node should be in use")
        # The number of compute nodes in state up and batch mode
        # (State = 'UP B') should equal the number of cray_compute nodes.
        nodes_up_b = cu.count_node_state('UP B')
        self.logger.info("Nodes with State 'UP B' : %s" % nodes_up_b)
        nodes_up_i = cu.count_node_state('UP I')
        self.logger.info("Nodes with State 'UP I' : %s" % nodes_up_i)
        nodes = self.server.filter(
            NODE, {ATTR_rescavail + '.vntype': 'cray_compute'})
        num_cray_compute = len(nodes[ATTR_rescavail + '.vntype=cray_compute'])
        self.assertEqual(nodes_up_b, num_cray_compute)
        self.logger.info("nodes in State 'UP B': %s == cray_compute: %s" %
                         (nodes_up_b, num_cray_compute))
        # All nodes are free and no resources are assigned.
        nodes = self.server.status(NODE)
        for node in nodes:
            self.assertEqual(node['state'], 'free')
            self.assertEqual(node['resources_assigned.ncpus'], '0')
            self.assertEqual(node['resources_assigned.mem'], '0kb')

    def test_hyperthread(self):
        """
        Check for a compute node that has hyperthreads; if there is one,
        submit a job to that node requesting the hyperthreads.
        Check that there are no errors in the job error output.
        If there is no node with hyperthreads, skip the test.
        """
        # Get the compute nodes from PBS and see if they are threaded
        cu = CrayUtils()
        all_nodes = self.server.status(NODE)
        threaded = 0
        for n in all_nodes:
            if n['resources_available.vntype'] == 'cray_compute':
                numthreads = cu.get_numthreads(
                    n['resources_available.PBScraynid'])
                if numthreads > 1:
                    self.logger.info(
                        "Node %s has %s hyperthreads" %
                        (n['resources_available.vnode'], numthreads))
                    ncpus = n['resources_available.ncpus']
                    vnode = n['resources_available.vnode']
                    threaded = 1
                    break
        if not threaded:
            self.skipTest("Test suite needs nodes with hyperthreads")
        # There is a node with hyperthreads, get the number of cpus
        aprun_args = '-j %d -n %d' % (int(numthreads), int(ncpus))
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER,
                 {ATTR_l + '.select': '1:ncpus=%d:vnode=%s' %
                  (int(ncpus), vnode),
                  ATTR_N: 'hyperthread'})
        scr = []
        scr += ['hostname\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b %s /bin/hostname\n' % aprun_args]
        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # Verify the contents of the output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'hyperthread.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")

    def test_cray_login_job(self):
        """
        Submit a simple sleep job that requests to run on a login node and
        expect that job to go into the running state on a login node.
        Verify that the job runs to completion and check the job
        output/error files.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_login',
                             ATTR_N: 'cray_login'})
        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # Fetch the node name where the job is running and check that the
        # node is a login node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_login'},
                           id=vname, max_attempts=1)
        cu = CrayUtils()
        # Check that the number of compute nodes in use is 0
        self.assertEqual(cu.count_node_summ('use'), 0)
        # Verify the contents of the output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'cray_login.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")
        output_file = os.path.join(
            sub_dir, 'cray_login.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")

    def test_cray_compute_job(self):
        """
        Submit a simple sleep job that runs on a compute node and expect
        the job to go into the running state on a compute node.
        Verify that the job runs to completion and check the job
        output/error files.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_compute',
                             ATTR_N: 'cray_compute'})
        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -B /bin/sleep 10\n']
        sub_dir = self.du.mkdtemp(uid=TEST_USER.uid)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # Fetch the node name where the job is running and check that the
        # node is a compute node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_compute'},
                           id=vname)
        # Sleep for some time before aprun actually starts
        # using the reservation
        self.logger.info(
            "Sleeping 6 seconds before aprun starts using the reservation")
        time.sleep(6)
        cu = CrayUtils()
        # Check that the number of compute nodes reserved is 1
        self.assertEqual(cu.count_node_summ('resv'), 1)
        if self.du.get_platform() == 'cray':
            # The Cray simulator will not show anything in 'use' because
            # the aprun command is just a pass-through on the simulator
            self.assertEqual(cu.count_node_summ('use'), 1)
        # Verify the contents of the output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'cray_compute.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")
        output_file = os.path.join(
            sub_dir, 'cray_compute.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")
        (cu.node_status, cu.node_summary) = cu.parse_apstat_rn()
        self.assertEqual(cu.count_node_summ('resv'), 0)
        if self.du.get_platform() == 'cray':
            self.assertEqual(cu.count_node_summ('use'), 0)


class TestCheckpoint(TestFunctional):
    """
    This test suite targets checkpoint functionality.
    """
    abort_file = ''
    cu = CrayUtils()

    def setUp(self):
        TestFunctional.setUp(self)
        a = {'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        abort_script = """#!/bin/bash
kill $1
exit 0
"""
        self.abort_file = self.du.create_temp_file(body=abort_script)
        self.du.chmod(path=self.abort_file, mode=0o755)
        self.du.chown(path=self.abort_file, uid=0, gid=0, runas=ROOT_USER)
        c = {'$action': 'checkpoint_abort 30 !' + self.abort_file + ' %sid'}
        self.mom.add_config(c)
        self.platform = self.du.get_platform()
        if self.platform != 'cray' and self.platform != 'craysim':
            self.attrs = {ATTR_l + '.select': '1:ncpus=1',
                          ATTR_l + '.place': 'excl'}
        else:
            nv = self.cu.num_compute_vnodes(self.server)
            self.assertNotEqual(nv, 0, "No cray_compute vnodes are present.")
            self.attrs = {ATTR_l + '.select': '%d:ncpus=1' % nv,
                          ATTR_l + '.place': 'scatter'}

    def verify_checkpoint_abort(self, jid, stime):
        """
        Verify that checkpoint and abort happened.
        """
        self.ck_dir = os.path.join(self.server.pbs_conf['PBS_HOME'],
                                   'checkpoint', jid + '.CK')
        self.assertTrue(self.du.isdir(path=self.ck_dir, runas=ROOT_USER),
                        msg="Checkpoint directory %s not found" % self.ck_dir)
        _msg1 = "%s;req_holdjob: Checkpoint initiated." % jid
        self.mom.log_match(_msg1, starttime=stime)
        _msg2 = "%s;checkpoint_abort script %s: exit code 0" % (
            jid, self.abort_file)
        self.mom.log_match(_msg2, starttime=stime)
        _msg3 = "%s;checkpointed to %s" % (jid, self.ck_dir)
        self.mom.log_match(_msg3, starttime=stime)
        _msg4 = "%s;task 00000001 terminated" % jid
        self.mom.log_match(_msg4, starttime=stime)

    def start_server_hot(self):
        """
        Start the server with the hot start option.
        """
        pbs_exec = self.server.pbs_conf['PBS_EXEC']
        svrname = self.server.pbs_server_name
        pbs_server_hot = [os.path.join(pbs_exec, 'sbin', 'pbs_server'),
                          '-t', 'hot']
        self.du.run_cmd(svrname, cmd=pbs_server_hot, sudo=True)
        self.assertTrue(self.server.isUp())

    def checkpoint_abort_with_qterm_restart_hot(self, qterm_type):
        """
        Checkpointing with qterm -t <type>, followed by a hot server
        restart.
        """
        j1 = Job(TEST_USER, self.attrs)
        j1.set_sleep_time(20)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        start_time = int(time.time())
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.qterm(manner=qterm_type)
        self.verify_checkpoint_abort(jid1, start_time)
        self.start_server_hot()
        self.assertTrue(self.server.isUp())
        msg = "%s;Requeueing job, substate: 10 Requeued in queue: workq" \
            % jid1
        self.server.log_match(msg, starttime=start_time)
        # Wait for the server to hot start the job
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1, interval=2)
        self.server.expect(JOB, 'exec_vnode', id=jid1, op=SET)
        self.assertFalse(os.path.exists(self.ck_dir),
                         msg=self.ck_dir + " still exists")
        self.server.expect(JOB, {'job_state': 'F'}, jid1, extend='x',
                           interval=5)

    def test_checkpoint_abort_with_preempt(self):
        """
        This test verifies that checkpoint_abort works as expected when a
        job is preempted via checkpoint. It does so by submitting a job in
        an express queue which preempts a running job in the default queue.
        """
        self.server.manager(MGR_CMD_SET, SCHED, {'preempt_order': 'C'},
                            runas=ROOT_USER)
        a = {'queue_type': 'execution',
             'started': 'True',
             'enabled': 'True',
             'Priority': 200}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq")
        j1 = Job(TEST_USER, self.attrs)
        j1.set_sleep_time(20)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.attrs['queue'] = 'expressq'
        j2 = Job(TEST_USER, self.attrs)
        j2.set_sleep_time(20)
        start_time = int(time.time())
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid1)
        self.verify_checkpoint_abort(jid1, start_time)
        self.server.expect(JOB, {'job_state': 'F'}, jid2, extend='x',
                           interval=5)
        self.server.expect(JOB, {'job_state': 'F'}, jid1, extend='x',
                           interval=5)

    def test_checkpoint_abort_with_qhold(self):
        """
        This test uses qhold for checkpointing.
        """
        j1 = Job(TEST_USER, self.attrs)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        start_time = int(time.time())
        self.server.holdjob(jid1)
        self.server.expect(JOB, {'job_state': 'H'}, id=jid1)
        self.verify_checkpoint_abort(jid1, start_time)

    def test_checkpoint_abort_with_qterm_immediate_restart_hot(self):
        """
        This tests checkpointing with qterm -t immediate, followed by a
        hot server restart.
        """
        self.checkpoint_abort_with_qterm_restart_hot("immediate")

    def test_checkpoint_abort_with_qterm_delay_restart_hot(self):
        """
        This tests checkpointing with qterm -t delay, followed by a hot
        server restart.
        """
        self.checkpoint_abort_with_qterm_restart_hot("delay")

    def tearDown(self):
        TestFunctional.tearDown(self)
        try:
            os.remove(self.abort_file)
        except OSError:
            pass


class TestSuspendResumeOnCray(TestFunctional):
    """
    Test special cases where suspend/resume functionality differs on Cray
    as compared to other platforms.
    This test suite expects the platform to be 'cray' and assumes that the
    suspend/resume feature is enabled on it.
    """
    cu = CrayUtils()

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    @tags('cray', 'smoke')
    def test_default_restrict_res_to_release_on_suspend_setting(self):
        """
        Check that on Cray restrict_res_to_release_on_suspend is always
        set to 'ncpus' by default.
        """
        # Check the restrict_res_to_release_on_suspend server attribute
        a = {ATTR_restrict_res_to_release_on_suspend: 'ncpus'}
        self.server.expect(SERVER, a)

    def test_exclusive_job_not_suspended(self):
        """
        If a running job has exclusive placement then the job cannot be
        suspended.
        This test checks for a log message, which is an unstable interface
        and may need to change if the interface changes in the future.
        """
        msg_expected = ("BASIL;ERROR: ALPS error: apsched: "
                        "at least resid .* is exclusive")
        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                            ATTR_l + '.place': 'excl'})
        check_after = int(time.time())
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        # Suspend the job
        try:
            self.server.sigjob(jobid=jid, signal="suspend")
        except PbsSignalError as e:
            self.assertTrue("Switching ALPS reservation failed" in e.msg[0])
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        ehost = job_stat[0]['exec_host'].partition('/')[0]
        run_mom = self.moms[ehost]
        s = run_mom.log_match(msg_expected, starttime=check_after,
                              regexp=True, max_attempts=10)
        self.assertTrue(s)

    @tags('cray')
    def test_basic_admin_suspend_restart(self):
        """
        Test basic admin-suspend functionality for jobs and array jobs
        with a server restart on Cray. The restart tests whether the node
        recovers properly while in maintenance.
        After turning off scheduling and restarting the server, a subjob
        is always requeued and the node shows up as free.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend regular job
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        self.server.restart()
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # Adding sleep to avoid failure at resume since PBS licenses
        # might not be available and as a result resume fails
        time.sleep(2)

        # admin-resume regular job. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname)

        self.server.cleanup_jobs()

        # admin-suspend job array
        jA = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                             ATTR_J: '1-2'})
        jidA = self.server.submit(jA)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jidA)
        subjobs = self.server.status(JOB, id=jidA, extend='t')
        # subjobs[0] is the array itself; we need the subjobs.
        jid1 = subjobs[1]['id']
        jid2 = subjobs[2]['id']
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname1 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')
        qstat = self.server.status(JOB, 'exec_vnode', id=jid2)
        vname2 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend subjob 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname1)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # admin-resume subjob 1. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname1)

        # admin-suspend subjob 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname2)
        self.server.expect(NODE, {'maintenance_jobs': jid2})

        # Turn off scheduling and restart the server
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.restart()

        # Check that the nodes are now free
        self.server.expect(NODE, {'state': 'free'}, id=vname1)
        self.server.expect(NODE, {'state': 'free'}, id=vname2)

    def test_admin_suspend_wrong_state(self):
        """
        Check that a wrong 'resume' signal is correctly rejected.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.sigjob(jid1, "suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        try:
            self.server.sigjob(jid1, "admin-resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        j2 = Job(TEST_USER)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        self.server.sigjob(jid2, "admin-suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43},
                           id=jid2)
        try:
            self.server.sigjob(jid2, "resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        # The job should be in the same state as it was prior to the signal
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43},
                           id=jid2)

    def submit_resv(self, resv_start, chunks, resv_dur):
        """
        Helper to request a PBS reservation with start time, chunks and
        duration as arguments.
        """
        a = {'Resource_List.select':
             '%d:ncpus=1:vntype=cray_compute' % chunks,
             'Resource_List.place': 'scatter',
             'reserve_start': int(resv_start),
             'reserve_duration': int(resv_dur)}
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        try:
            a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
            d = self.server.expect(RESV, a, id=rid)
        except PtlExpectError as e:
            d = e.rv
        return d

    @timeout(300)
    def test_preempt_STF(self):
        """
        Test shrink-to-fit by creating a reservation for all compute nodes
        starting in 100 seconds with a duration of two hours.
        A preempted STF job with a min_walltime of 1 minute and a
        max_walltime of 2 hours will stay suspended after the higher
        priority job goes away if its min_walltime can't be satisfied.
""" qname = 'highp' a = {'queue_type': 'execution'} self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname) a = {'enabled': 'True', 'started': 'True', 'priority': '150'} self.server.manager(MGR_CMD_SET, QUEUE, a, qname) # Reserve all the compute nodes nv = self.cu.num_compute_vnodes(self.server) self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.") now = time.time() resv_start = now + 100 resv_dur = 7200 d = self.submit_resv(resv_start, nv, resv_dur) self.assertTrue(d) j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.min_walltime': '00:01:00', ATTR_l + '.max_walltime': '02:00:00'}) jid = self.server.submit(j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) self.server.expect( JOB, {ATTR_l + '.walltime': (LE, '00:01:40')}, id=jid) self.server.expect( JOB, {ATTR_l + '.walltime': (GE, '00:01:00')}, id=jid) # The sleep below will leave less than 1 minute window for jid # after j2id is deleted. The min_walltime of jid can't be # satisfied and jid will stay in S state. time.sleep(35) j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.walltime': '00:01:00', ATTR_l + '.place': 'scatter', ATTR_q: 'highp'}) j2id = self.server.submit(j2) self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id) self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) # The sleep below will leave less than 1 minute window for jid time.sleep(50) self.server.delete(j2id) a = {'scheduling': 'True'} self.server.manager(MGR_CMD_SET, SERVER, a) self.server.expect(SERVER, {'server_state': 'Active'}) self.server.expect(JOB, {ATTR_state: 'S'}, id=jid) def test_multi_express(self): """ Test of multiple express queues of different priorities. See that jobs from the higher express queues preempt jobs from lower express queues. Also see when express jobs finish (or are deleted), suspended jobs restart. 
Make sure loadLimit is set to 4 on the server node: # apmgr config loadLimit 4 """ _t = ('\"express_queue, normal_jobs, server_softlimits,' + ' queue_softlimits\"') a = {'preempt_prio': _t} self.scheduler.set_sched_config(a) a = {'queue_type': 'e', 'started': 'True', 'enabled': 'True', 'Priority': 150} self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq") a['Priority'] = 160 self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq2") a['Priority'] = 170 self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq3") # Count the compute nodes nv = self.cu.num_compute_vnodes(self.server) self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.") j1 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.walltime': 3600}) j1id = self.server.submit(j1) self.server.expect(JOB, {ATTR_state: 'R'}, id=j1id) j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.walltime': 3600, ATTR_q: 'expressq'}) j2id = self.server.submit(j2) self.server.expect(JOB, {ATTR_state: 'S'}, id=j1id) self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id) j3 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.walltime': 3600, ATTR_q: 'expressq2'}) j3id = self.server.submit(j3) self.server.expect(JOB, {ATTR_state: 'S'}, id=j2id) self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id) j4 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.walltime': 3600, ATTR_q: 'expressq3'}) j4id = self.server.submit(j4) self.server.expect(JOB, {ATTR_state: 'S'}, id=j3id) self.server.expect(JOB, {ATTR_state: 'R'}, id=j4id) self.server.delete(j4id) self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id) def test_preempted_topjob_calendared(self): """ That even if topjob_ineligible is set for a preempted job and sched_preempt_enforce_resumption is set true, the preempted job will be calendared """ self.server.manager(MGR_CMD_SET, SCHED, {'sched_preempt_enforce_resumption': 'true'}) self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': '2'}) # Count the compute nodes nv = self.cu.num_compute_vnodes(self.server) self.assertNotEqual(nv, 0, "There are no cray_compute vnodes present.") # Submit a job j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter', ATTR_l + '.walltime': '120'}) jid1 = self.server.submit(j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1) # Alter topjob_ineligible for runnng job self.server.alterjob(jid1, {ATTR_W: "topjob_ineligible = true"}, runas=ROOT_USER, logerr=True) # Create a high priority queue a = {'queue_type': 'e', 'started': 't', 'enabled': 'True', 'priority': '150'} self.server.manager(MGR_CMD_CREATE, QUEUE, a, id="highp") # Submit a job to high priority queue j = Job(TEST_USER, {ATTR_queue: 'highp', ATTR_l + '.walltime': '60'}) jid2 = self.server.submit(j) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2) # Verify that job1 is calendared self.server.expect(JOB, 'estimated.start_time', op=SET, id=jid1) qstat = self.server.status(JOB, 'estimated.start_time', id=jid1) est_time = qstat[0]['estimated.start_time'] self.assertNotEqual(est_time, None) self.scheduler.log_match(jid1 + ";Job is a top job", starttime=self.server.ctime, max_attempts=10)