def test_slurmenv_api_cancel_kill_nl():
    """Cancel an iterated job whose script ignores SIGTERM and verify it is killed.

    The submitted script traps (ignores) SIGTERM, so the first check after
    ``cancel`` still sees the job as ``QUEUED``; only after waiting
    ``ExecutionJob.SIG_KILL_TIMEOUT`` is the job (and every iteration)
    expected to be reported as ``CANCELED``.

    Skipped unless run inside a Slurm allocation with at least 2 nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    # The returned (resources, allocation) pair is not used by this test;
    # the call itself is kept as in the original flow.
    get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    # NOTE(review): this directory is never removed in this function --
    # confirm whether cleanup of SHARED_PATH happens elsewhere.
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    # Bind the name before ``try`` so that a failing LocalManager constructor
    # does not turn into an UnboundLocalError inside ``finally`` (which would
    # mask the real error).
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(
            Jobs().add(script='trap "" SIGTERM; sleep 30s', iteration=iters, stdout='sleep.out.${it}',
                       stderr='sleep.err.${it}', numCores=1)
        )
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)

        # The script ignores SIGTERM, so the job is not expected to be canceled yet.
        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # the canceled iterations are included in 'failed' entry in job statistics
        # the cancel status is presented in 'childs/state' entry
        assert all((jinfos[jid].iterations,
                    jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters,
                    jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters,
                    jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)
    finally:
        # Only tear down a manager that was actually created.
        if m is not None:
            m.finish()
            m.cleanup()
def test_resume_simple(tmpdir):
    """Kill the manager in the middle of an iterated job and resume it.

    Submits a 10-iteration ``/bin/sleep 4s`` job on 4 cores, waits for
    iteration ``sleep:3`` and checks the intermediate statistics, kills the
    manager process, then starts a new manager with ``--resume`` on the same
    working directory and verifies that all iterations end as ``SUCCEED``.

    Args:
        tmpdir: working directory passed to the manager (``--wd``/``--resume``);
            presumably the pytest ``tmpdir`` fixture -- confirm against conftest.
    """
    # Bind the name before ``try`` so ``finally`` can safely test it even when
    # the first LocalManager constructor raises.
    m = None
    try:
        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores)],
                         {'wdir': str(tmpdir)})

        its = 10
        job_req = {
            'name': 'sleep',
            'execution': {
                'exec': '/bin/sleep',
                'args': ['4s'],
                'stdout': 'out',
            },
            'iteration': {'stop': its},
            'resources': {'numCores': {'exact': 1}}
        }
        jobs = Jobs().add_std(job_req)
        job_ids = m.submit(jobs)

        # because job iterations executes in order, after finish of 4th iteration, the three previous should also finish
        m.wait4('sleep:3')
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations,
                    jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its,
                    jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == ncores,
                    jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            # Iterations 0..3 must be done; the rest may be in any not-yet-finished state.
            exp_status = ['SUCCEED']
            if iteration > 3:
                exp_status = ['EXECUTING', 'SCHEDULED', 'QUEUED']
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status in exp_status)),\
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != {exp_status}"

        # kill process
        m.kill_manager_process()
        m.cleanup()
        # The killed manager has already been cleaned up; drop the reference so
        # that ``finally`` does not finish/cleanup it again if the resumed
        # manager below fails to start.
        m = None

        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores),
                          '--resume', tmpdir],
                         {'wdir': str(tmpdir)})

        m.wait4all()
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations,
                    jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its,
                    jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its,
                    jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status == 'SUCCEED')), \
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != SUCCEED"
    finally:
        # Only tear down a manager that is currently alive/owned by the test.
        if m is not None:
            m.finish()
            m.cleanup()