def test_slurmenv_api_cancel_kill_nl():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters,
                           stdout='sleep.out.${it}', stderr='sleep.err.${it}', numCores=1))
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # the canceled iterations are included in the 'failed' entry of the job statistics,
        # the cancel status is presented in the 'childs/state' entry
        assert all((jinfos[jid].iterations,
                    jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters,
                    jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters,
                    jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters

        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)
    finally:
        m.finish()
        m.cleanup()
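
# Illustrative sketch (not part of the original test suite): the cancel flow exercised
# above, reduced to its core API calls. The manager arguments, polling delays, and the
# job name 'my_sleep' are assumptions made for this example only.
def _example_cancel_workflow(workdir):
    m = LocalManager(['--wd', workdir], {'wdir': workdir})
    try:
        ids = m.submit(Jobs().add(name='my_sleep', script='sleep 60s', numCores=1))
        sleep(2)            # give the job a moment to start
        m.cancel(ids)       # request cancellation of the submitted job
        sleep(2)            # give the service a moment to process the request
        jinfos = m.info_parsed(ids)
        print({name: info.status for name, info in jinfos.items()})
    finally:
        m.finish()
        m.cleanup()
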
def test_slurmenv_api_submit_exceed_total_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # requesting more cores than available in the allocation must be rejected
        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {'exec': '/bin/date'},
                     'resources': {'numCores': {'exact': resources.total_cores + 1}}})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        # requesting more nodes than available in the allocation must be rejected
        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {'exec': '/bin/date'},
                     'resources': {'numNodes': {'exact': resources.total_nodes + 1}}})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            ids = m.submit(jobs)
        assert len(m.list()) == 0

        # requesting exactly the available number of cores must succeed
        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {'exec': '/bin/date', 'stdout': 'std.out'},
                     'resources': {'numCores': {'exact': resources.total_cores}}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
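
# Illustrative sketch (not part of the original test suite): a client can query the
# manager for its resources before submitting, so the request never exceeds the
# allocation size. The job name, executable, and core count are made up for the example.
def _example_submit_within_resources(m):
    res = m.resources()
    jobs = Jobs().add_std({
        'name': 'hostname',
        'execution': {'exec': '/bin/hostname', 'stdout': 'hostname.out'},
        # request at most what the allocation offers
        'resources': {'numCores': {'exact': min(2, res['total_cores'])}},
    })
    return m.submit(jobs)
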
def test_local_manager_submit_simple(tmpdir):
    cores = 4

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes', str(cores)], {'wdir': str(tmpdir)})

    try:
        res = m.resources()

        assert all(('total_nodes' in res, 'total_cores' in res,
                    res['total_nodes'] == 1, res['total_cores'] == cores))

        ids = m.submit(Jobs().
                       add(name='host', exec='/bin/hostname', args=['--fqdn'], stdout='host.stdout').
                       add(name='date', exec='/bin/date', stdout='date.stdout', numCores={'exact': 2}))
        assert len(m.list()) == 2

        m.wait4(ids)

        jinfos = m.info(ids)
        assert all(('jobs' in jinfos,
                    len(jinfos['jobs'].keys()) == 2,
                    'host' in jinfos['jobs'],
                    'date' in jinfos['jobs'],
                    jinfos['jobs']['host'].get('data', {}).get('status', '') == 'SUCCEED',
                    jinfos['jobs']['date'].get('data', {}).get('status', '') == 'SUCCEED'))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(tmpdir.join('.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log')),
                    exists(tmpdir.join('host.stdout')),
                    exists(tmpdir.join('date.stdout'))))
    finally:
        m.finish()
        # m.stopManager()
        m.cleanup()
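
# Illustrative sketch (not part of the original test suite): the same submit/wait/info
# pattern outside of pytest, e.g. in an application script. The working directory,
# node count, job names, and output files are assumptions made for this example.
def _example_simple_workflow(workdir):
    m = LocalManager(['--wd', workdir, '--nodes', '4'], {'wdir': workdir})
    try:
        ids = m.submit(Jobs()
                       .add(name='host', exec='/bin/hostname', stdout='host.out')
                       .add(name='date', exec='/bin/date', stdout='date.out'))
        m.wait4(ids)
        for name, info in m.info_parsed(ids).items():
            print(f'{name}: {info.status}')
    finally:
        m.finish()
        m.cleanup()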