Exemple #1
0
def test_slurmenv_api_resources():
    """Check that resources reported through the manager API match the Slurm allocation.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes. Besides comparing node and core totals, it verifies
    that the API client log and the service log were created.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})
        api_res = m.resources()

        assert all(('total_nodes' in api_res, 'total_cores' in api_res))
        assert all((api_res['total_nodes'] == resources.total_nodes, api_res['total_cores'] == resources.total_cores))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(join(tmpdir, '.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log'))))

    finally:
        if m is not None:
            m.finish()
            # stopManager is using 'terminate' method on service process, which is not a best option when using
            # pytest and gathering code coverage, so it is deliberately not called here
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #2
0
def test_slurmenv_api_submit_many_cores():
    """Submit a single job that requests every core of the allocation.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes. The finished job must report the service working
    directory as its own, and must span all nodes and all cores.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().add_std({
            'name': 'host',
            'execution': {
                'exec': '/bin/hostname',
                'args': ['--fqdn'],
                'stdout': 'out',
            },
            'resources': {'numCores': {'exact': resources.total_cores}},
        })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        host_info = jinfos['host']

        # the job's working directory must be the working directory of the service
        assert tmpdir == host_info.wdir, str(host_info.wdir)
        assert all((len(host_info.nodes) == resources.total_nodes,
                    host_info.total_cores == resources.total_cores)), str(host_info)

    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #3
0
def test_slurmenv_simple_resources():
    """Smoke test: reading Slurm resources must complete without raising.

    Skipped outside a Slurm allocation or when the allocation has fewer than
    two nodes.
    """
    usable_allocation = in_slurm_allocation() and get_num_slurm_nodes() >= 2
    if not usable_allocation:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    get_slurm_resources()
Exemple #4
0
def _detect_resources(config):
    """Pick the source of resource information and return what its parser yields.

    User-supplied execution nodes in the configuration take precedence and are
    handled by ``parse_local_resources``. Without them, a Slurm allocation (if
    the process runs inside one) is handled by ``parse_slurm_resources``. In
    every other case ``parse_local_resources`` is used.

    Args:
        config (dict): QCG-PilotJob configuration

    Returns:
        the result of calling the selected parse function with ``config``
    """
    _logger.info(
        'determining source of information about available resources ...')

    # the Slurm check is only performed when the user configured no execution nodes
    slurm_selected = (not Config.EXECUTION_NODES.get(config)
                      and in_slurm_allocation())

    if slurm_selected:
        _logger.info('selected SLURM resources information')
        resource_parser = parse_slurm_resources
    else:
        _logger.info('selected local resources information')
        resource_parser = parse_local_resources

    return resource_parser(config)
Exemple #5
0
def test_slurmenv_api_submit_simple():
    """Submit a trivial ``/bin/hostname --fqdn`` job and wait until it succeeds.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().add_std({
            'name': 'host',
            'execution': {
                'exec': '/bin/hostname',
                'args': ['--fqdn'],
                'stdout': 'std.out',
                'stderr': 'std.err',
            },
        })
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #6
0
def test_slurmenv_api_cancel_kill_nl():
    """Cancel an iterative job whose iterations ignore SIGTERM.

    Each iteration runs ``trap "" SIGTERM; sleep 30s``, so the first cancel
    attempt has no effect and the job is expected to reach ``CANCELED`` only
    after ``ExecutionJob.SIG_KILL_TIMEOUT`` seconds have passed.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters, stdout='sleep.out.${it}',
                           stderr='sleep.err.${it}', numCores=1)
                       )
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # the canceled iterations are included in 'failed' entry in job statistics
        # the cancel status is presented in 'childs/state' entry
        assert all((jinfos[jid].iterations, jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters, jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters, jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)

    finally:
        if m is not None:
            m.finish()
            m.cleanup()

    # remove the temporary directory as the sibling tests do; reached only when
    # the test body passed, on failure the directory is left behind
    rmtree(tmpdir)
Exemple #7
0
def test_slurmenv_api_iteration_simple():
    """Run two-iteration jobs and validate the reported iteration statistics.

    The first job ('host') is checked for its aggregate iteration counters
    only. The second job ('host2') is queried with ``withChilds=True`` and
    every child iteration is checked for its index, name, working directory
    and core count.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        its = 2
        jobs = Jobs().add_std({
            'name': 'host',
            'iteration': {'stop': its},
            'execution': {'exec': 'hostname', 'args': ['--fqdn'], 'stdout': 'out'},
            'resources': {'numCores': {'exact': 1}},
        })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos
        jinfo = jinfos['host']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))

        its = 2
        jobs = Jobs().add_std({
            'name': 'host2',
            'iteration': {'stop': its},
            'execution': {'exec': 'hostname', 'args': ['--fqdn'], 'stdout': 'out'},
            'resources': {'numCores': {'exact': 1}},
        })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos['host2']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format('host2', iteration),
                        job_it.wdir == tmpdir, job_it.total_cores == 1))
    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
def _select_auto_environment():
    """Choose the job execution environment matching where the manager runs.

    Inside a Slurm allocation the Slurm-specific environment is chosen;
    otherwise the common (base) environment is used.

    Returns:
        Environment: ``SlurmEnvironment`` when running in a Slurm allocation,
        ``CommonEnvironment`` otherwise
    """
    return SlurmEnvironment if in_slurm_allocation() else CommonEnvironment
Exemple #9
0
def test_slurmenv_api_submit_exceed_total_cores():
    """Check that requests exceeding the allocation are rejected at submit time.

    Two oversized requests (one core too many, one node too many) must raise
    ``ConnectionError`` mentioning "Not enough resources" and leave the job
    list empty. A request for exactly all cores must then succeed.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # one core more than the whole allocation offers
        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {'exec': '/bin/date'},
            'resources': {
                'numCores': {'exact': resources.total_cores + 1},
            },
        })
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        # one node more than the whole allocation offers
        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {'exec': '/bin/date'},
            'resources': {
                'numNodes': {'exact': resources.total_nodes + 1},
            },
        })
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        # exactly all cores - must be accepted and executed
        jobs = Jobs().add_std({
            'name': 'date',
            'execution': {
                'exec': '/bin/date',
                'stdout': 'std.out',
            },
            'resources': {'numCores': {'exact': resources.total_cores}},
        })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #10
0
def test_slurmenv_simple_resources_binding():
    """Smoke test: reading Slurm resources with binding must complete without raising.

    Before the call, the Slurm/CUDA environment variables listed below are
    printed (``None`` when unset). Skipped outside a Slurm allocation or when
    the allocation has fewer than two nodes.
    """
    usable_allocation = in_slurm_allocation() and get_num_slurm_nodes() >= 2
    if not usable_allocation:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    reported_variables = (
        'SLURM_NODELIST',
        'SLURM_JOB_CPUS_PER_NODE',
        'SLURM_CPU_BIND_LIST',
        'SLURM_CPU_BIND_TYPE',
        'CUDA_VISIBLE_DEVICES',
    )
    for var_name in reported_variables:
        print('{}: {}'.format(var_name, environ.get(var_name, None)))

    get_slurm_resources_binded()
Exemple #11
0
def test_slurmenv_api_std_streams_many_cores():
    """Check stdin/stdout/stderr redirection for a job running on two cores.

    The job runs ``cat`` with ``/etc/system-release`` as stdin; the content of
    that file must appear in the job's ``out`` file, and both ``out`` and
    ``err`` must exist in the working directory.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().add_std({
            'name': 'host',
            'execution': {
                'exec': 'cat',
                'stdin': '/etc/system-release',
                'stdout': 'out',
                'stderr': 'err',
            },
            'resources': {
                'numCores': {'exact': 2},
            },
        })
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        assert all((exists(join(tmpdir, 'out')), exists(join(tmpdir, 'err'))))

        with open(join(tmpdir, 'out'), 'rt') as out_f:
            out = out_f.read()

        with open('/etc/system-release', 'rt') as sr_f:
            system_release = sr_f.read()

        assert system_release in out
    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #12
0
def test_slurm_partition_resources():
    """Start a governor manager with one-node partitions and check its resource totals.

    The governor is forked with ``--slurm-partition-nodes 1`` and
    ``--system-core``; after a fixed 10 second wait its ``resourcesInfo`` reply
    is compared with the Slurm allocation. Finally the governor is asked to
    finish and must exit with code 0 within 30 seconds.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = [
        '--log', 'debug',
        '--wd', governor_dir,
        '--report-format', 'json',
        '--id', governor_id,
        '--system-core',
        '--slurm-partition-nodes', '1',
    ]
    governor_process, governor_address = fork_manager(governor_args)

    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))

        # as 1 core is reserved for each partition manager, total number of allocated cores
        # will be the same as number of partition managers - in our case 1 partition manager
        # per node
        reply_data = resources_reply['data']
        expected_free_cores = resources.total_cores - resources.total_nodes
        totals_match = all((
            reply_data.get('total_nodes', 0) == resources.total_nodes,
            reply_data.get('total_cores', 0) == resources.total_cores,
            reply_data.get('used_cores', -1) == resources.total_nodes,
            reply_data.get('free_cores', 0) == expected_free_cores,
        ))
        assert totals_match, str(resources_reply)

        send_request_valid(governor_address, {'request': 'finish'})
        governor_process.join(30)
        assert governor_process.exitcode == 0
    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(governor_dir)
Exemple #13
0
def test_slurmenv_launcher_agents():
    """Start launcher agents on all allocation nodes and validate their descriptions.

    NOTE: the test is currently skipped unconditionally (see the first
    statement), so everything below the skip is not executed.

    When enabled, it expects exactly one agent per node, each described by
    'process', 'data' and 'options' entries with binding turned on, and it
    expects a second ``start_agents`` call to raise.
    """
    # the original reason text used '\t' (a TAB escape) where an apostrophe was meant
    pytest.skip("somehow this test doesn't properly close event loop")

    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('tmpdir: {}'.format(tmpdir))

    try:
        auxdir = join(tmpdir, 'qcg')

        print("aux directory set to: {}".format(auxdir))
        mkdir(auxdir)

        # a previous test may have closed the default event loop - install a fresh one
        if asyncio.get_event_loop() and asyncio.get_event_loop().is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())

        LauncherExecutionJob.start_agents(tmpdir, auxdir, resources.nodes,
                                          resources.binding)

        try:
            assert len(
                LauncherExecutionJob.launcher.agents) == resources.total_nodes

            # there should be only one launcher per node, so check based on 'node' in agent['data']['slurm']
            node_names = set(node.name for node in resources.nodes)
            for agent_name, agent in LauncherExecutionJob.launcher.agents.items(
            ):
                print("found agent {}: {}".format(agent_name, str(agent)))
                assert all(('process' in agent, 'data' in agent, 'options'
                            in agent.get('data', {}))), str(agent)
                assert all(('slurm' in agent['data'], 'node'
                            in agent.get('data', {}).get('slurm',
                                                         {}))), str(agent)
                assert agent['data']['slurm']['node'] in node_names
                assert all(
                    ('binding' in agent['data']['options'],
                     agent['data']['options']['binding'] == True)), str(agent)

                # each node name may be claimed by a single agent only
                node_names.remove(agent['data']['slurm']['node'])

            assert len(node_names) == 0

            # launching once more should raise exception
            with pytest.raises(Exception):
                LauncherExecutionJob.start_agents(tmpdir, auxdir,
                                                  resources.nodes,
                                                  resources.binding)

        finally:
            asyncio.get_event_loop().run_until_complete(
                asyncio.ensure_future(LauncherExecutionJob.stop_agents()))
            time.sleep(1)

            asyncio.get_event_loop().close()
    finally:
        rmtree(tmpdir)
Exemple #14
0
def test_slurmenv_simple_job():
    """Run a single ``date`` job through the file interface of the service.

    The requests (submit + finishAfterAllTasksDone) are written to a JSON file,
    the service is started in-process with ``sys.argv`` pointing at that file,
    and afterwards the job's sandbox, output files and the allocation reported
    for the job are validated against the Slurm allocation.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, _ = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'mdate'
    sandbox_dir = abspath(join(tmpdir, 'date.sandbox'))
    jobs = [
        job.to_dict() for job in [
            Job(
                jobName,
                JobExecution('date',
                             wd=sandbox_dir,
                             stdout='date.out',
                             stderr='date.err'),
                JobResources(numCores=ResourceSize(1)))
        ]
    ]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    # the service is configured through sys.argv; restore the original value
    # afterwards so the override does not leak into tests that run later
    saved_argv = sys.argv
    sys.argv = [
        'QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
        str(file_path), '--wd', tmpdir, '--report-format', 'json'
    ]
    try:
        QCGPMService().start()
    finally:
        sys.argv = saved_argv

    jobEntries = check_job_status_in_json([jobName],
                                          workdir=tmpdir,
                                          dest_state='SUCCEED')
    # there can be some debugging messages in the stderr, so only the size of
    # stdout is checked
    assert all(
        (isdir(sandbox_dir),
         exists(join(sandbox_dir, 'date.out')),
         exists(join(sandbox_dir, 'date.err')),
         stat(join(sandbox_dir, 'date.out')).st_size > 0))

    for jentry in jobEntries.values():
        assert all(('runtime' in jentry, 'allocation'
                    in jentry.get('runtime', {})))

        # allocation entries are comma separated and look like '<node>[...]'
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

    # asking about an unknown job name must be reported as an error
    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'],
                                 workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)
Exemple #15
0
def test_slurmenv_many_nodes_many_cores():
    """Run an ``mpirun hostname`` job spanning every node of the allocation.

    The job requests ``resources.nodes[0].free`` cores on each of
    ``resources.total_nodes`` nodes. After the in-process service finishes, the
    test validates the sandbox files, the allocation reported for the job
    (node names and per-node core counts) and that stdout has one line per
    allocated core, each naming an allocated node.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, _ = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'hostname'
    jobwdir_base = 'hostname.sandbox'
    sandbox_dir = abspath(join(tmpdir, jobwdir_base))
    cores_num = resources.nodes[0].free
    nodes_num = resources.total_nodes
    jobs = [
        job.to_dict() for job in [
            Job(
                jobName,
                JobExecution(exec='mpirun',
                             args=['--allow-run-as-root', 'hostname'],
                             wd=sandbox_dir,
                             stdout='hostname.out',
                             stderr='hostname.err',
                             modules=['mpi/openmpi-x86_64']),
                JobResources(numCores=ResourceSize(cores_num),
                             numNodes=ResourceSize(nodes_num)))
        ]
    ]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    # the service is configured through sys.argv; restore the original value
    # afterwards so the override does not leak into tests that run later
    saved_argv = sys.argv
    sys.argv = [
        'QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
        str(file_path), '--wd', tmpdir, '--report-format', 'json'
    ]
    try:
        QCGPMService().start()
    finally:
        sys.argv = saved_argv

    jobEntries = check_job_status_in_json([jobName],
                                          workdir=tmpdir,
                                          dest_state='SUCCEED')
    assert all(
        (isdir(sandbox_dir),
         exists(join(sandbox_dir, 'hostname.out')),
         exists(join(sandbox_dir, 'hostname.err')),
         stat(join(sandbox_dir, 'hostname.out')).st_size > 0))

    job_nodes = []
    allocated_cores = 0
    for jentry in jobEntries.values():
        assert all(('runtime' in jentry, 'allocation'
                    in jentry.get('runtime', {})))

        # allocation entries are comma separated and look like
        # '<node>[<core>:<core>:...]'
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            bracket_pos = jalloc_node.index('[')
            node_name = jalloc_node[:bracket_pos]
            job_nodes.append(node_name)
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

            # the text between '[' and the trailing ']' holds ':' separated cores
            ncores = len(jalloc_node[bracket_pos + 1:-1].split(':'))
            print('#{} cores on node {}'.format(ncores, node_name))
            allocated_cores += ncores
    assert len(job_nodes) == nodes_num, str(job_nodes)
    assert allocated_cores == nodes_num * cores_num, allocated_cores

    # every allocated core is expected to print one line with the name of its node
    with open(join(sandbox_dir, 'hostname.out'), 'rt') as stdout_file:
        stdout_content = [line.rstrip() for line in stdout_file]
    assert len(stdout_content) == nodes_num * cores_num, str(stdout_content)
    assert all(hostname in job_nodes
               for hostname in stdout_content), str(stdout_content)

    # asking about an unknown job name must be reported as an error
    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'],
                                 workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)
Exemple #16
0
def test_slurmenv_api_submit_resource_ranges():
    """Check how jobs with core ranges (min/max) are validated and scheduled.

    Three jobs are submitted in turn:
      * 'host'  - only 'min' given: must fail with a message about both range
        boundaries being required,
      * 'host2' - one node, up to (cores of first node + 1) cores: must run on
        a single node with all cores of the first node,
      * 'host3' - up to (cores of first node + 1) cores without a node limit:
        must run on two nodes with exactly that many cores.

    The test is skipped outside a Slurm allocation or when the allocation has
    fewer than two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # Bind the name before entering 'try': if LocalManager() raises, the
    # 'finally' clause must not fail on an unbound local and hide the real error.
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().add_std({
            'name': 'host',
            'execution': {
                'exec': '/bin/hostname',
                'args': ['--fqdn'],
                'stdout': 'out',
            },
            'resources': {'numCores': {'min': 1}},
        })
        # job should fail because of missing 'max' parameter
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'FAILED')
        jinfo = jinfos['host']
        assert "Both core's range boundaries (min, max) must be defined" in jinfo.messages, str(jinfo)

        jobs = Jobs().add_std({
            'name': 'host2',
            'execution': {
                'exec': '/bin/hostname',
                'args': ['--fqdn'],
                'stdout': 'out',
            },
            'resources': {
                'numNodes': {'exact': 1},
                'numCores': {'min': 1, 'max': resources.nodes[0].total + 1},
            },
        })
        # job should run on single node (the first free) with all available cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host2']
        assert all((len(jinfo.nodes) == 1, jinfo.total_cores == resources.nodes[0].total)), str(jinfo)

        jobs = Jobs().add_std({
            'name': 'host3',
            'execution': {
                'exec': '/bin/hostname',
                'args': ['--fqdn'],
                'stdout': 'out',
            },
            'resources': {
                'numCores': {'min': 1, 'max': resources.nodes[0].total + 1},
            },
        })
        # job should run on at least two nodes with total maximum given cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host3']
        assert all((len(jinfo.nodes) == 2, jinfo.total_cores == resources.nodes[0].total + 1)), str(jinfo)

    finally:
        if m is not None:
            m.finish()
            # m.stopManager()
            m.cleanup()

    # reached only when the test body passed; on failure the directory is left behind
    rmtree(tmpdir)
Exemple #17
0
    async def _launch_partition_managers(self, nodes_in_partition):
        """Split the Slurm allocation into partitions and set up their managers.

        The nodes of the Slurm allocation are grouped into consecutive chunks of
        ``nodes_in_partition`` nodes (the last chunk may be smaller). For each
        chunk a ``PartitionManager`` description is appended to
        ``self._partition_managers``. Afterwards the start of the partition
        managers and the scheduling of buffered submit requests are handed to
        background tasks.

        Args:
            nodes_in_partition (int): how many nodes each partition should have

        Raises:
            InvalidRequest: when
                * ``nodes_in_partition`` is less than 1
                * governor manager has not been launched in Slurm allocation
                * missing nodes in Slurm allocation
            InternalError: when
                * missing ZMQ interface in governor manager
        """
        _logger.info('setup allocation split into partitions by %s nodes',
                     nodes_in_partition)

        if nodes_in_partition < 1:
            reason = 'Failed to partition resources - partition size must be greater or equal 1'
            _logger.error(reason)
            raise InvalidRequest(reason)

        # get slurm resources - currently only slurm is supported
        if not in_slurm_allocation():
            reason = ('Failed to partition resources - partitioning resources is currently available only '
                      'within slurm allocation')
            _logger.error(reason)
            raise InvalidRequest(reason)

        if not self.zmq_address:
            reason = 'Failed to partition resources - missing zmq interface address'
            _logger.error(reason)
            raise InternalError(reason)

        allocation = parse_slurm_resources(self._config)
        total_nodes = allocation.total_nodes

        if total_nodes < 1:
            raise InvalidRequest(
                'Failed to partition resources - allocation contains no nodes')

        # round up, so a trailing, smaller group of nodes gets its own partition
        npartitions = math.ceil(total_nodes / nodes_in_partition)
        _logger.info(
            '%s partitions will be created (in allocation containing %s total nodes)',
            npartitions, total_nodes)

        # launch partition manager in the same directory as governor
        managers_wdir = Config.EXECUTOR_WD.get(self._config)
        managers_auxdir = Config.AUX_DIR.get(self._config)

        _logger.debug('partition managers working directory %s', managers_wdir)

        for part_idx in range(npartitions):
            start_idx = part_idx * nodes_in_partition
            # never reach beyond the last node of the allocation
            end_idx = min(start_idx + nodes_in_partition, total_nodes)

            _logger.debug('creating partition manager %s configuration',
                          part_idx)

            # the first node of the chunk is the one handed to the manager by name
            first_node = allocation.nodes[start_idx]

            _logger.debug('partition manager node %s', first_node.name)

            manager = PartitionManager('partition-{}'.format(part_idx),
                                       first_node.name, start_idx, end_idx,
                                       managers_wdir, self.zmq_address,
                                       managers_auxdir, self._config)
            self._partition_managers.append(manager)

            _logger.debug('partition manager %s configuration created',
                          part_idx)

        self._min_scheduling_managers = len(self._partition_managers)

        _logger.info(
            'created partition managers configuration and set minimum scheduling managers to %s',
            self._min_scheduling_managers)

        asyncio.ensure_future(self.manage_start_partition_managers())

        # launch task in the background that schedule buffered submit requests
        buffered_jobs_task = asyncio.ensure_future(self._schedule_buffered_jobs())
        self._schedule_buffered_jobs_task = buffered_jobs_task
Exemple #18
0
def test_slurmenv_api_iteration_core_scheduling():
    """Check how the iteration schedulers distribute cores among job iterations.

    A series of iterative jobs is submitted through ``LocalManager`` with the
    'split-into' and 'maximum-iters' schedulers set on ``numCores``; for each of
    them the number of cores reported for every iteration is verified.

    The test is skipped when not run inside a Slurm allocation with at least
    two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # bound before 'try' so the 'finally' clause does not raise NameError (hiding the
    # original error) when the manager fails to start
    m = None

    def hostname_exec():
        # a fresh description on every call, so submitted jobs never share a dict
        return {'exec': 'hostname', 'args': ['--fqdn'], 'stdout': 'out'}

    def sleep_exec():
        return {'exec': 'sleep', 'args': ['2s'], 'stdout': 'out'}

    def partial_allocation(job_it):
        # the iteration got at least one core, but not the whole allocation
        return 1 <= job_it.total_cores < resources.total_cores

    def at_least_one_core(job_it):
        return job_it.total_cores >= 1

    def whole_allocation(job_it):
        # the iteration occupies every core on every node
        return job_it.total_cores == resources.total_cores and len(job_it.nodes) == resources.total_nodes

    def run_iterative_job(jname, its, execution, num_cores, iteration_ok):
        """Submit an iterative job and run the checks common to all scenarios.

        The job is submitted with ``submit_2_manager_and_wait_4_info`` expecting the
        'SUCCEED' state; the iteration counters of the parent job and the index and
        name of every child are then verified.  ``iteration_ok`` is a predicate with
        the scenario specific expectation for a single child.  Returns the child job
        infos.
        """
        jobs = Jobs().add_std({'name': jname,
                               'iteration': {'stop': its},
                               'execution': execution,
                               'resources': {'numCores': num_cores}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        iteration_ok(job_it))), str(job_it)
        return jinfo.childs

    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in that case the 'split-into' is by default the number of iterations, so total
        # available resources should be split into two partitions and each of the
        # iterations should run on its own partition
        its = 2
        childs = run_iterative_job('host', its, hostname_exec(),
                                   {'min': 1, 'scheduler': {'name': 'split-into'}},
                                   partial_allocation)
        # all iterations have been scheduled across all resources
        assert sum(child.total_cores for child in childs) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / its for child in childs)

        # we explicitly set the 'split-into' parameter to 2, behavior should be the same
        # as in the previous example
        childs = run_iterative_job('host2', 2, hostname_exec(),
                                   {'min': 1, 'scheduler': {'name': 'split-into', 'params': {'parts': 2}}},
                                   partial_allocation)
        # all iterations have been scheduled across all resources
        assert sum(child.total_cores for child in childs) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / 2 for child in childs)

        # we explicitly set the 'split-into' parameter to 4, the two iterations should be
        # scheduled on half of the available resources
        childs = run_iterative_job('host3', 2, hostname_exec(),
                                   {'min': 1, 'scheduler': {'name': 'split-into', 'params': {'parts': 4}}},
                                   partial_allocation)
        # the two iterations together occupy only half of the resources
        assert sum(child.total_cores for child in childs) == resources.total_cores / 2
        assert all(child.total_cores == resources.total_cores / 4 for child in childs)

        # we explicitly set the 'split-into' parameter to 2, but the number of iterations
        # is larger than the number of partitions available at the same time, so they
        # should be executed serially (by parts)
        childs = run_iterative_job('host4', 10, hostname_exec(),
                                   {'min': 1, 'scheduler': {'name': 'split-into', 'params': {'parts': 2}}},
                                   partial_allocation)
        assert all(child.total_cores == resources.total_cores / 2 for child in childs)

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible at
        # the same time on all available resources
        childs = run_iterative_job('host5', 2, sleep_exec(),
                                   {'min': 1, 'scheduler': {'name': 'maximum-iters'}},
                                   partial_allocation)
        assert sum(child.total_cores for child in childs) == resources.total_cores

        # the same scheduler with as many iterations as there are cores
        childs = run_iterative_job('host6', resources.total_cores, sleep_exec(),
                                   {'min': 1, 'scheduler': {'name': 'maximum-iters'}},
                                   partial_allocation)
        assert sum(child.total_cores for child in childs) == resources.total_cores

        # again one iteration per core: every iteration should get exactly one core
        childs = run_iterative_job('host7', resources.total_cores, sleep_exec(),
                                   {'min': 1, 'scheduler': {'name': 'maximum-iters'}},
                                   partial_allocation)
        # NOTE: this used to assert a bare generator expression, which is always true
        assert all(child.total_cores == 1 for child in childs)
        assert sum(child.total_cores for child in childs) == resources.total_cores

        # in case where number of iterations exceeds the number of available resources,
        # the 'maximum-iters' scheduler splits iterations into 'steps' minimizing this
        # number, and allocates as many resources as possible for each iteration inside
        # a 'step'
        childs = run_iterative_job('host8', resources.total_cores * 2, sleep_exec(),
                                   {'min': 1, 'scheduler': {'name': 'maximum-iters'}},
                                   partial_allocation)
        # NOTE: this used to assert a bare generator expression, which is always true
        assert all(child.total_cores == 1 for child in childs)
        assert sum(child.total_cores for child in childs) == resources.total_cores * 2

        # one iteration more than cores: the iterations are split into two 'steps' and in
        # each step the assigned iterations should use the maximum available resources.
        # The former 'every iteration has one core' check was a no-op (it asserted
        # a generator object) and contradicted the sum below, so it is not repeated here.
        childs = run_iterative_job('host9', resources.total_cores + 1, sleep_exec(),
                                   {'min': 1, 'scheduler': {'name': 'maximum-iters'}},
                                   at_least_one_core)
        assert sum(child.total_cores for child in childs) == resources.total_cores * 2

        # in this case two iterations can't fit at once on resources, so all the
        # iterations should be scheduled serially on all available resources
        run_iterative_job('host10', resources.total_nodes, sleep_exec(),
                          {'min': resources.total_cores - 1, 'scheduler': {'name': 'maximum-iters'}},
                          whole_allocation)
    finally:
        if m:
            m.finish()
            # m.stopManager() is deliberately not called here
            m.cleanup()

    rmtree(tmpdir)
Exemple #19
0
def test_slurm_partition_submit():
    """Submit an iterative job to a governor manager running partition managers.

    A governor manager is forked with ``--slurm-partition-nodes 1``; the test then
    checks the resources it reports, submits a job with 100 iterations, asks the
    governor to finish after all tasks are done and finally verifies the working
    directory and output files of every iteration.

    The test is skipped when not run inside a Slurm allocation with at least
    two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = ['--log', 'debug', '--wd', str(governor_dir), '--report-format', 'json', '--id', governor_id,
                     '--system-core', '--slurm-partition-nodes', '1']
    governor_process, governor_address = fork_manager(governor_args)

    # presumably gives the governor time to start its partition managers - TODO confirm
    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))

        # as 1 core is reserved for each partition manager, total number of allocated cores
        # will be the same as number of partition managers - in our case 1 partition manager
        # per node
        assert all((resources_reply['data'].get('total_nodes', 0) == resources.total_nodes,
                    resources_reply['data'].get('total_cores', 0) == resources.total_cores,
                    resources_reply['data'].get('used_cores', -1) == resources.total_nodes,
                    resources_reply['data'].get('free_cores', 0) == resources.total_cores - resources.total_nodes)), str(resources_reply)

        # submit a bunch of jobs
        njobs = 100
        jname = 'date_iter'
        wdir_base = 'env.sandbox'
        submit_reply = send_request_valid(governor_address, { 'request': 'submit', 'jobs': [
                {
                    'name': jname,
                    'iteration': { "start": 0, "stop": njobs },
                    'execution': {
                        'exec': '/usr/bin/env',
                        'args': [ 'date' ],
                        'wd': abspath(join(governor_dir, "{}_$${{it}}".format(wdir_base))),
                        'stdout': 'env.stdout',
                        'stderr': 'env.stderr'
                    },
                    'resources': { 'numCores': { 'exact': 1 } }
                }
            ] })
        # validate the submit reply itself (the resources reply was checked here by mistake)
        assert all((submit_reply['code'] == 0, 'data' in submit_reply))
        assert all((submit_reply['data'].get('submitted', 0) == 1,
                    len(submit_reply['data'].get('jobs', [])) == 1,
                    jname in submit_reply['data'].get('jobs', [])))

        status_reply = send_request_valid(governor_address, { 'request': 'jobStatus', 'jobNames': [ jname ]})
        assert all((status_reply['code'] == 0, 'data' in status_reply))
        assert 'jobs' in status_reply['data']
        assert len(status_reply['data']['jobs']) == 1
        assert jname in status_reply['data']['jobs']

        jstatus = status_reply['data']['jobs'][jname]
        assert all((jstatus['status'] == 0, jstatus['data']['jobName'] == jname,
                    jstatus['data']['status'] in [ JobState.QUEUED.name, JobState.SUCCEED.name ]))

        status_reply = send_request_valid(governor_address,
                                          {'request': 'control', 'command': 'finishAfterAllTasksDone'})
        assert status_reply['code'] == 0

        # wait up to 20 seconds for the governor process to exit
        governor_process.join(20)
        assert governor_process.exitcode == 0

        for i in range(njobs):
            job_wd = join(governor_dir, "{}_{}".format(wdir_base, i))

            assert isdir(job_wd), job_wd

            # check stdout & stderr files: 'date' must have printed something and reported no errors
            assert all((isfile(join(job_wd, 'env.stdout')),
                        isfile(join(job_wd, 'env.stderr'))))
            assert all((getsize(join(job_wd, 'env.stdout')) > 0,
                        getsize(join(job_wd, 'env.stderr')) == 0))

    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(str(governor_dir))
Exemple #20
0
def test_slurmenv_api_iteration_node_scheduling():
    """Check how the iteration schedulers distribute nodes among job iterations.

    Iterative jobs requesting all cores of a node (``resources.nodes[0].total``) are
    submitted through ``LocalManager`` with the 'split-into' and 'maximum-iters'
    schedulers set on ``numNodes``; every iteration is expected to run on exactly
    one node.

    The test is skipped when not run inside a Slurm allocation with at least
    two nodes.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    # TODO: it's hard to write comprehensive iteration scheduling node tests on only two nodes
    #  (in slurm's development docker)

    resources, _ = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # bound before 'try' so the 'finally' clause does not raise NameError (hiding the
    # original error) when the manager fails to start
    m = None

    cores_per_node = resources.nodes[0].total

    def run_iterative_job(jname, its, execution, scheduler):
        """Submit an iterative whole-node job and run the checks common to all scenarios.

        The job is submitted with ``submit_2_manager_and_wait_4_info`` expecting the
        'SUCCEED' state; the iteration counters of the parent job are verified, as
        well as the index, name, number of cores and number of nodes (exactly one) of
        every child.  Returns the child job infos.
        """
        jobs = Jobs().add_std({'name': jname,
                               'iteration': {'stop': its},
                               'execution': execution,
                               'resources': {'numCores': {'exact': cores_per_node},
                                             'numNodes': {'min': 1, 'scheduler': scheduler}}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == cores_per_node, len(job_it.nodes) == 1)), str(job_it)
        return jinfo.childs

    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in that case the 'split-into' is by default the number of iterations, so total
        # available resources should be split into two partitions and each of the
        # iterations should run on its own partition
        childs = run_iterative_job(
            'host', 2,
            {'exec': 'sleep', 'args': ['2s'], 'stdout': 'out_${it}', 'stderr': 'err_${it}'},
            {'name': 'split-into'})
        # all iterations have been scheduled across all nodes
        assert sum(len(child.nodes) for child in childs) == resources.total_nodes
        # the iterations should execute on different nodes
        assert list(childs[0].nodes)[0] != list(childs[1].nodes)[0]

        # we explicitly set the 'split-into' parameter to 2, behavior should be the same
        # as in the previous example
        childs = run_iterative_job(
            'host2', 2,
            {'exec': 'sleep', 'args': ['2s'], 'stdout': 'out'},
            {'name': 'split-into', 'params': {'parts': 2}})
        # all iterations have been scheduled across all nodes
        assert sum(len(child.nodes) for child in childs) == resources.total_nodes
        # the iterations should execute on different nodes
        assert list(childs[0].nodes)[0] != list(childs[1].nodes)[0]

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible at
        # the same time on all available resources
        its = 4
        childs = run_iterative_job(
            'host3', its,
            {'exec': 'sleep', 'args': ['2s'], 'stdout': 'out'},
            {'name': 'maximum-iters'})
        assert sum(len(child.nodes) for child in childs) == its

    finally:
        if m:
            m.finish()
            # m.stopManager() is deliberately not called here
            m.cleanup()

    rmtree(tmpdir)