Ejemplo n.º 1
0
def test_register_manager_resources_single(tmpdir):
    governor_dir = tmpdir.join('governor')
    direct_dir = tmpdir.join('direct1')

    os.makedirs(str(governor_dir))
    os.makedirs(str(direct_dir))

    governor_process = direct_process = None

    try:
        governor_id = 'm-governor'
        governor_process, governor_address = fork_manager([ '--log', 'debug', '--wd', str(governor_dir), '--report-format',
            'json', '--id', governor_id, '--governor' ])

        direct_id = 'm-direct-1'
        direct_tags = ['custom', 'first', 'manager']
        direct_nodes = 2
        direct_cores = 4
        nodes_conf = ','.join(['n{}:{}'.format(idx, direct_cores) for idx in range(direct_nodes)])
        direct_process, direct_address = fork_manager(['--log', 'debug',
            '--nodes', nodes_conf, '--wd', str(direct_dir), '--report-format', 'json',
            '--id', direct_id, '--tags', ','.join(direct_tags), '--parent', governor_address ])

        # get managers time to communicate and exchange info about resources
        time.sleep(2)

        resources_reply = send_request_valid(governor_address, { 'request': 'resourcesInfo' })
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))
        assert all((resources_reply['data'].get('total_nodes', 0) == direct_nodes,
                    resources_reply['data'].get('total_cores', 0) == direct_nodes * direct_cores,
                    resources_reply['data'].get('used_cores', -1) == 0,
                    resources_reply['data'].get('free_cores', 0) == direct_nodes * direct_cores)), str(resources_reply)

        send_request_valid(governor_address, { 'request': 'finish' })
        governor_process.join()

        # we don't have to close the direct manager, as it will be automatically closed
        # by the governor when it finish
#        send_request_valid(direct_address, { 'request': 'finish' })
#        direct_process.join()
    finally:
        if governor_process:
            governor_process.terminate()

        if direct_process:
            direct_process.terminate()
Ejemplo n.º 2
0
def test_slurm_partition_resources():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = ['--log', 'debug', '--wd', governor_dir, '--report-format', 'json', '--id', governor_id,
                     '--system-core', '--slurm-partition-nodes', '1']
    governor_process, governor_address = fork_manager(governor_args)

    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))
        # as 1 core is reserved for each partition manager, total number of allocated cores
        # will be the same as number of partition managers - in our case 1 partition manager
        # per node
        assert all((resources_reply['data'].get('total_nodes', 0) == resources.total_nodes,
                    resources_reply['data'].get('total_cores', 0) == resources.total_cores,
                    resources_reply['data'].get('used_cores', -1) == resources.total_nodes,
                    resources_reply['data'].get('free_cores', 0) == resources.total_cores - resources.total_nodes)),\
            str(resources_reply)

        send_request_valid(governor_address, {'request': 'finish'})
        governor_process.join(30)
        assert governor_process.exitcode == 0
    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(governor_dir)
Ejemplo n.º 3
0
def test_register_manager(tmpdir):
    governor_dir = tmpdir.join('governor')
    direct_dir = tmpdir.join('direct1')

    os.makedirs(str(governor_dir))
    os.makedirs(str(direct_dir))

    print('governor work dir: {}'.format(str(governor_dir)))
    print('direct work dir: {}'.format(str(direct_dir)))

    governor_id = 'm-governor'

    governor_args = [ '--log', 'debug', '--wd', str(governor_dir), '--report-format', 'json', '--id', governor_id,
                      '--governor' ]
    governor_process, governor_address = fork_manager(governor_args)

    reqs = [ {'request': 'resourcesInfo'},
             {'request': 'control', 'command': 'finishAfterAllTasksDone'} ]
    reqs_path = direct_dir.join('jobs.json')
    save_reqs_to_file(reqs, reqs_path)
    print('jobs saved to file_path: {}'.format(str(reqs_path)))

    direct_id = 'm-direct-1'
    manager_tags = ['custom', 'first', 'manager']

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path', str(reqs_path), '--nodes', '2',
                '--wd', str(direct_dir), '--report-format', 'json', '--id', direct_id, '--tags', ','.join(manager_tags),
                '--parent', governor_address ]

    QCGPMService().start()

    send_request_valid(governor_address, { 'request': 'finish' })
    governor_process.join()
    governor_process.terminate()

    rmtree(str(tmpdir))
Ejemplo n.º 4
0
def test_slurm_partition_submit():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = ['--log', 'debug', '--wd', str(governor_dir), '--report-format', 'json', '--id', governor_id,
                     '--system-core', '--slurm-partition-nodes', '1']
    governor_process, governor_address = fork_manager(governor_args)

    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))

        # as 1 core is reserved for each partition manager, total number of allocated cores
        # will be the same as number of partition managers - in our case 1 partition manager
        # per node
        assert all((resources_reply['data'].get('total_nodes', 0) == resources.total_nodes,
                    resources_reply['data'].get('total_cores', 0) == resources.total_cores,
                    resources_reply['data'].get('used_cores', -1) == resources.total_nodes,
                    resources_reply['data'].get('free_cores', 0) == resources.total_cores - resources.total_nodes)), str(resources_reply)

        # submit a bunch of jobs
        njobs = 100
        jname = 'date_iter'
        wdir_base = 'env.sandbox'
        submit_reply = send_request_valid(governor_address, { 'request': 'submit', 'jobs': [
                {
                    'name': jname,
                    'iteration': { "start": 0, "stop": njobs },
                    'execution': {
                        'exec': '/usr/bin/env',
                        'args': [ 'date' ],
                        'wd': abspath(join(governor_dir, "{}_$${{it}}".format(wdir_base))),
                        'stdout': 'env.stdout',
                        'stderr': 'env.stderr'
                    },
                    'resources': { 'numCores': { 'exact': 1 } }
                }
            ] })
        assert all((submit_reply['code'] == 0, 'data' in resources_reply))
        assert all((submit_reply['data'].get('submitted', 0) == 1,
                    len(submit_reply['data'].get('jobs', [])) == 1,
                    jname in submit_reply['data'].get('jobs', [])))

        status_reply = send_request_valid(governor_address, { 'request': 'jobStatus', 'jobNames': [ jname ]})
        assert all((status_reply['code'] == 0, 'data' in status_reply))
        assert 'jobs' in status_reply['data']
        assert len(status_reply['data']['jobs']) == 1
        assert jname in status_reply['data']['jobs']

        jstatus = status_reply['data']['jobs'][jname]
        assert all((jstatus['status'] == 0, jstatus['data']['jobName'] == jname,
                    jstatus['data']['status'] in [ JobState.QUEUED.name, JobState.SUCCEED.name ]))

        status_reply = send_request_valid(governor_address,
                                          {'request': 'control', 'command': 'finishAfterAllTasksDone'})
        assert status_reply['code'] == 0

        # wait up to 5 seconds
        governor_process.join(20)
        assert governor_process.exitcode == 0

        for i in range(njobs):
            job_wd = join(governor_dir, "{}_{}".format(wdir_base, i))

            assert isdir(job_wd), job_wd

            # check stdout & stderr files
            assert all((isfile(join(job_wd, 'env.stdout')),
                        isfile(join(job_wd, 'env.stderr'))))
            assert all((getsize(join(job_wd, 'env.stdout')) > 0,
                        getsize(join(job_wd, 'env.stderr')) == 0))

    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(str(governor_dir))
Ejemplo n.º 5
0
def test_governor_submit_many_instances(tmpdir):
    """
    Run a single governor with many instances of direct managers.
    Submit a bunch of jobs, check if all were submitted properly.
    Get info about submitted jobs.
    Finish governor after all job finished.
    Confirm existence of job's working directories in any of of direct manager working dir.
    Finish all direct managers.
    """
    instances = 4

    governor_dir = tmpdir.join('governor')
    direct_dir = [ '{}_{}'.format(str(tmpdir.join('direct')), i) for i in range(instances) ]

    os.makedirs(str(governor_dir))
    for i in range(instances):
        os.makedirs(direct_dir[i])

    governor_process = None
    direct_process = [None] * instances
    direct_address = [None] * instances

    try:
        governor_id = 'm-governor'
        governor_process, governor_address = fork_manager([ '--log', 'debug', '--wd', str(governor_dir), '--report-format',
            'json', '--id', governor_id, '--governor' ])

        direct_id = [ '{}_{}'.format('m-direct', i) for i in range(instances) ]
        direct_tags = ['custom', 'first', 'manager']

        direct_nodes = [0] * instances
        direct_cores = [0] * instances

        for i in range(instances):
            direct_nodes[i] = i + 1
            direct_cores[i] = (i + 1) * 2
            nodes_conf = ','.join(['n{}:{}'.format(idx, direct_cores[i]) for idx in range(direct_nodes[i])])
            direct_process[i], direct_address[i] = fork_manager(['--log', 'debug',
                '--nodes', nodes_conf, '--wd', direct_dir[i], '--report-format', 'json',
                '--id', direct_id[i], '--tags', ','.join(direct_tags), '--parent', governor_address ])

        # get managers time to communicate and exchange info about resources
        time.sleep(2)

        resources_reply = send_request_valid(governor_address, { 'request': 'resourcesInfo' })
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))
        assert all((resources_reply['data'].get('total_nodes', 0) == sum(direct_nodes),
                    resources_reply['data'].get('total_cores', 0) == sum([direct_nodes[i] * direct_cores[i] for i in range(instances)]),
                    resources_reply['data'].get('used_cores', -1) == 0,
                    resources_reply['data'].get('free_cores', 0) == sum([direct_nodes[i] * direct_cores[i] for i in range(instances)]))),\
            str(resources_reply)

        # submit a bunch of jobs
        njobs = 100
        jname = 'date'
        jnames = [ 'date:{}'.format(i) for i in range(0, njobs)]
        wdir_base = 'date.sandbox'
        submit_reply = send_request_valid(governor_address, { 'request': 'submit', 'jobs': [
                {
                    'name': jname,
                    'iteration': { 'stop': njobs },
                    'execution': {
                        'exec': '/usr/bin/env',
                        'args': [ 'date' ],
                        'wd': '{}_{}'.format(wdir_base, '${it}'),
                        'stdout': 'date.stdout',
                        'stderr': 'date.stderr'
                    },
                    'resources': { 'numCores': { 'exact': 1 } }
                }
            ] })
        assert all((submit_reply['code'] == 0, 'data' in resources_reply))
        assert all((submit_reply['data'].get('submitted', 0) == 1,
                    len(submit_reply['data'].get('jobs', [])) == 1,
                    jname in submit_reply['data'].get('jobs', [])))

        status_reply = send_request_valid(governor_address, { 'request': 'jobStatus', 'jobNames': [ jname ] })
        assert all((status_reply['code'] == 0, 'data' in status_reply))
        assert 'jobs' in status_reply['data']
        assert len(status_reply['data']['jobs']) == 1
        assert jname in status_reply['data']['jobs']

        jstatus = status_reply['data']['jobs'][jname]
        assert all((jstatus['status'] == 0, jstatus['data']['jobName'] == jname,
                    jstatus['data']['status'] in [ JobState.QUEUED.name, JobState.SUCCEED.name ]))

        status_reply = send_request_valid(governor_address, { 'request': 'control', 'command': 'finishAfterAllTasksDone' })
        assert status_reply['code'] == 0

        # wait up to 5 seconds
        governor_process.join(10)
        assert governor_process.exitcode == 0

        for i, process in enumerate(direct_process):
            # we don't have to close the direct manager, as it will be automatically closed
            # by the governor when it finish
            #send_request_valid(direct_address[i], { 'request': 'finish' })
            process.join()

        # check working directories of jobs
        for i in range(njobs):
            job_wd = '{}_{}'.format(wdir_base, i)

            full_job_wd_path = [ join(direct_dir[i], job_wd) for i in range(instances) if isdir(join(direct_dir[i], job_wd))]
            assert len(full_job_wd_path) == 1

            # check stdout & stderr files
            assert all((isfile(join(full_job_wd_path[0], 'date.stdout')),
                        isfile(join(full_job_wd_path[0], 'date.stderr'))))
            assert all((getsize(join(full_job_wd_path[0], 'date.stdout')) > 0,
                        getsize(join(full_job_wd_path[0], 'date.stderr')) == 0))

    finally:
        if governor_process:
            governor_process.terminate()

        for process in direct_process:
            if process:
                process.terminate()

    rmtree(str(tmpdir))
Ejemplo n.º 6
0
def test_governor_wait_for_all_jobs_many(tmpdir):
    governor_dir = tmpdir.join('governor')
    direct_dir = tmpdir.join('direct1')

    os.makedirs(str(governor_dir))
    os.makedirs(str(direct_dir))

    governor_process = direct_process = None

    try:
        governor_id = 'm-governor'
        governor_process, governor_address = fork_manager([ '--log', 'debug', '--wd', str(governor_dir), '--report-format',
            'json', '--id', governor_id, '--governor' ])

        direct_id = 'm-direct-1'
        direct_tags = ['custom', 'first', 'manager']
        direct_nodes = 1
        direct_cores = 4
        nodes_conf = ','.join(['n{}:{}'.format(idx, direct_cores) for idx in range(direct_nodes)])
        direct_process, direct_address = fork_manager(['--log', 'debug',
            '--nodes', nodes_conf, '--wd', str(direct_dir), '--report-format', 'json',
            '--id', direct_id, '--tags', ','.join(direct_tags), '--parent', governor_address ])

        # get managers time to communicate and exchange info about resources
        time.sleep(2)

        resources_reply = send_request_valid(governor_address, { 'request': 'resourcesInfo' })
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))
        assert all((resources_reply['data'].get('total_nodes', 0) == direct_nodes,
                    resources_reply['data'].get('total_cores', 0) == direct_nodes * direct_cores,
                    resources_reply['data'].get('used_cores', -1) == 0,
                    resources_reply['data'].get('free_cores', 0) == direct_nodes * direct_cores)), str(resources_reply)

        # submit a bunch of jobs
        njobs = 10
        jname = 'date'
        jnames = [ 'date:{}'.format(i) for i in range(0, njobs)]
        submit_reply = send_request_valid(governor_address, { 'request': 'submit', 'jobs': [
                {
                    'name': jname,
                    'iteration': { 'stop': njobs },
                    'execution': {
                        'exec': '/usr/bin/env',
                        'args': [ 'date' ],
                        'wd': 'date.sandbox',
                        'stdout': 'sleep2.stdout',
                        'stderr': 'sleep2.stderr'
                    },
                    'resources': { 'numCores': { 'exact': 1 } }
                }
            ] })
        assert all((submit_reply['code'] == 0, 'data' in resources_reply))
        assert all((submit_reply['data'].get('submitted', 0) == 1,
                    len(submit_reply['data'].get('jobs', [])) == 1))
        assert ((jname in submit_reply['data'].get('jobs', [])))

        status_reply = send_request_valid(governor_address, { 'request': 'jobStatus', 'jobNames': [ jname ]})
        assert all((status_reply['code'] == 0, 'data' in status_reply))
        assert 'jobs' in status_reply['data']
        assert len(status_reply['data']['jobs']) == 1
        assert ((jname in status_reply['data']['jobs']))

        jstatus = status_reply['data']['jobs'][jname]
        assert all((jstatus['status'] == 0, jstatus['data']['jobName'] == jname,
                   jstatus['data']['status'] in [ JobState.QUEUED.name, JobState.SUCCEED.name ]))

        status_reply = send_request_valid(governor_address, { 'request': 'control', 'command': 'finishAfterAllTasksDone' })
        assert status_reply['code'] == 0

        # wait up to 5 seconds
        governor_process.join(10)
        assert governor_process.exitcode == 0

        # we don't have to close the direct manager, as it will be automatically closed
        # by the governor when it finish
        #send_request_valid(direct_address, { 'request': 'finish' })
        direct_process.join()
    finally:
        if governor_process:
            governor_process.terminate()

        if direct_process:
            direct_process.terminate()

    rmtree(str(tmpdir))
Ejemplo n.º 7
0
def test_register_manager_resources_many(tmpdir):
    governor_dir = tmpdir.join('governor')
    direct_dir = tmpdir.join('direct')

    instances = 4

    os.makedirs(str(governor_dir))
    for i in range(instances):
        os.makedirs('{}_{}'.format(str(direct_dir), i))

    governor_process = None
    direct_process = [None] * instances
    direct_address = [None] * instances

    try:
        governor_id = 'm-governor'
        governor_process, governor_address = fork_manager([ '--log', 'debug', '--wd', str(governor_dir), '--report-format',
            'json', '--id', governor_id, '--governor' ])

        direct_id = 'm-direct'
        direct_tags = ['custom', 'first', 'manager']

        direct_nodes = [0] * instances
        direct_cores = [0] * instances

        for i in range(instances):
            direct_nodes[i] = i + 1
            direct_cores[i] = (i + 1) * 2
            nodes_conf = ','.join(['n{}:{}'.format(idx, direct_cores[i]) for idx in range(direct_nodes[i])])
            direct_process[i], direct_address[i] = fork_manager(['--log', 'debug',
                '--nodes', nodes_conf, '--wd', str(direct_dir), '--report-format', 'json',
                '--id', '{}.{}'.format(direct_id, i), '--tags', ','.join(direct_tags), '--parent', governor_address ])

        # get managers time to communicate and exchange info about resources
        time.sleep(2)

        resources_reply = send_request_valid(governor_address, { 'request': 'resourcesInfo' })
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))
        assert all((resources_reply['data'].get('total_nodes', 0) == sum(direct_nodes),
                    resources_reply['data'].get('total_cores', 0) == sum([direct_nodes[i] * direct_cores[i] for i in range(instances)]),
                    resources_reply['data'].get('used_cores', -1) == 0,
                    resources_reply['data'].get('free_cores', 0) == sum([direct_nodes[i] * direct_cores[i] for i in range(instances)]))),\
            str(resources_reply)

        send_request_valid(governor_address, { 'request': 'finish' })
        governor_process.join()

        # we don't have to close the direct manager, as it will be automatically closed
        # by the governor when it finish
#        for address in direct_address:
#            send_request_valid(address, { 'request': 'finish' })

        for proc in direct_process:
            proc.join()
    finally:
        if governor_process:
            governor_process.terminate()

        for proc in direct_process:
            if proc:
                proc.terminate()

    rmtree(str(tmpdir))