Example #1
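These three snippets come from the QCG-PilotJob test suite and are shown without their imports. Below is a minimal preamble that would make them runnable, assuming the public qcg.pilotjob.api layout; the test helpers (in_slurm_allocation, get_num_slurm_nodes, get_slurm_resources_binded, set_pythonpath_to_qcg_module, SHARED_PATH, find_single_aux_dir, submit_2_manager_and_wait_4_info) live in the suite's own utility module, whose path below is an assumption:

import tempfile
from os.path import exists, join
from shutil import rmtree
from time import sleep

import pytest

from qcg.pilotjob.api.manager import LocalManager
from qcg.pilotjob.api.job import Jobs
# Assumed locations of the remaining names -- adjust to your checkout:
# from qcg.pilotjob.executionjob import ExecutionJob
# from tests.utils import (in_slurm_allocation, get_num_slurm_nodes,
#     get_slurm_resources_binded, set_pythonpath_to_qcg_module, SHARED_PATH,
#     find_single_aux_dir, submit_2_manager_and_wait_4_info)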
def test_slurmenv_api_cancel_kill_nl():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        # iterative job that ignores SIGTERM, forcing the cancel to escalate to SIGKILL
        ids = m.submit(Jobs().add(
            script='trap "" SIGTERM; sleep 30s', iteration=iters,
            stdout='sleep.out.${it}', stderr='sleep.err.${it}', numCores=1))
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # give the SIGTERM phase of the cancel time to run; the script traps
        # SIGTERM, so the job must not be canceled yet
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # canceled iterations are counted in the 'failed' entry of the job statistics;
        # the per-iteration cancel status is reported in the 'childs' entries
        assert all((jinfos[jid].iterations, jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters, jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters, jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)

    finally:
        if m:
            m.finish()
            m.cleanup()
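The fixed sleep(2) between submit and cancel assumes the iterations are already running, which can race on a slow node. A sketch of a polling helper built only on the info_parsed call used above (the helper itself is not part of the QCG-PilotJob API):

def wait_for_status(manager, jid, expected, timeout=60, interval=1):
    # Poll the parsed job info until the job reaches the expected status.
    waited = 0
    while waited < timeout:
        jinfos = manager.info_parsed([jid])
        if jinfos[jid].status == expected:
            return jinfos
        sleep(interval)
        waited += interval
    raise TimeoutError('job {} did not reach {} within {}s'.format(jid, expected, timeout))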
Example #2
def test_slurmenv_api_submit_exceed_total_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {'exec': '/bin/date'},
            'resources': {'numCores': {'exact': resources.total_cores + 1}}
        })
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {'exec': '/bin/date'},
            'resources': {'numNodes': {'exact': resources.total_nodes + 1}}
        })
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {
                'exec': '/bin/date',
                'stdout': 'std.out',
            },
            'resources': {'numCores': {'exact': resources.total_cores}}
        })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m:
            m.finish()
            m.cleanup()

    rmtree(tmpdir)
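Both failing submissions above request exactly one core or node more than the allocation offers. To size a request against the allocation instead of guessing, the resources() call from Example #3 can be consulted first; a sketch under the same imports (the 8-core cap and the 'probe' job name are arbitrary):

tmpdir = tempfile.mkdtemp()
m = LocalManager(['--wd', tmpdir], {'wdir': tmpdir})
res = m.resources()  # dict with 'total_nodes' and 'total_cores'
n_cores = min(8, res['total_cores'])  # clamp the request to the allocation size
ids = m.submit(Jobs().add(name='probe', exec='/bin/date',
                          stdout='probe.stdout', numCores={'exact': n_cores}))
m.wait4(ids)
m.finish()
m.cleanup()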
Example #3
def test_local_manager_submit_simple(tmpdir):
    cores = 4

    # start the service with a single virtual node exposing `cores` cores;
    # the client API log goes to <wdir>/.qcgpjm-client/api.log by default
    m = LocalManager(['--wd', str(tmpdir), '--nodes', str(cores)],
                     {'wdir': str(tmpdir)})

    try:
        res = m.resources()

        assert all(('total_nodes' in res, 'total_cores' in res,
                    res['total_nodes'] == 1, res['total_cores'] == cores))

        ids = m.submit(Jobs()
                       .add(name='host', exec='/bin/hostname', args=['--fqdn'],
                            stdout='host.stdout')
                       .add(name='date', exec='/bin/date', stdout='date.stdout',
                            numCores={'exact': 2}))

        assert len(m.list()) == 2

        m.wait4(ids)

        jinfos = m.info(ids)

        assert all(('jobs' in jinfos, len(jinfos['jobs'].keys()) == 2,
                    'host' in jinfos['jobs'], 'date' in jinfos['jobs'],
                    jinfos['jobs']['host'].get('data', {}).get('status', '') == 'SUCCEED',
                    jinfos['jobs']['date'].get('data', {}).get('status', '') == 'SUCCEED'))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(tmpdir.join('.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log')),
                    exists(tmpdir.join('host.stdout')),
                    exists(tmpdir.join('date.stdout'))))
    finally:
        m.finish()
        m.cleanup()
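All three examples repeat the same finish()/cleanup() teardown in a finally block. A small context-manager helper (hypothetical, not part of the QCG-PilotJob API) can factor that pattern out:

from contextlib import contextmanager

@contextmanager
def local_manager(server_args, cfg):
    # Create a LocalManager and guarantee the service is shut down afterwards.
    m = LocalManager(server_args, cfg)
    try:
        yield m
    finally:
        m.finish()
        m.cleanup()

With it, a test body reduces to: with local_manager(['--wd', str(tmpdir)], {'wdir': str(tmpdir)}) as m: ... and the teardown can no longer be forgotten on an early exit.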