Example #1
def test_request_remove_job(tmpdir):
    # raw removeJob request test
    req = RemoveJobReq({'request': 'removeJob', 'jobNames': ['job1', 'job2']})
    req_clone = RemoveJobReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'jobNames' for removeJob request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob'})

        # wrong format of 'jobNames' element
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob', 'jobNames': 'not a list'})

        # wrong format of 'jobNames' element - empty list
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob', 'jobNames': []})
    finally:
        m.finish()
Example #2
def test_resume_failed(tmpdir):
    non_existing_path = 'some-non-existing-directory'
    with pytest.raises(ServiceError, match=r".*Resume directory.*not exists or is not valid QCG-PilotJob auxiliary directory.*"):
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4', '--resume',
                          non_existing_path], {'wdir': str(tmpdir)})

    non_existing_path = join(tmpdir, 'non-pilotjob-dir')
    mkdir(non_existing_path)
    with pytest.raises(ServiceError, match=r".*Resume directory.*not exists or is not valid QCG-PilotJob auxiliary directory.*"):
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4', '--resume',
                          non_existing_path], {'wdir': str(tmpdir)})
Example #3
    def __init__(self,
                 wd=".",
                 resources=None,
                 reserve_core=False,
                 enable_rt_stats=False,
                 wrapper_rt_stats=None,
                 log_level='info',
                 *other_args):

        self.finished = False

        # ---- QCG PILOT JOB INITIALISATION ---

        # Establish logging levels
        service_log_level, client_log_level = self._setup_qcgpj_logging(
            log_level)

        # Prepare input arguments for QCG-PJM

        args = ['--log', service_log_level, '--wd', wd]

        if resources:
            args.append('--nodes')
            args.append(str(resources))

        if reserve_core:
            args.append('--system-core')

        if enable_rt_stats:
            args.append('--enable-rt-stats')

        if wrapper_rt_stats:
            args.append('--wrapper-rt-stats')
            args.append(wrapper_rt_stats)

        if other_args:
            # extend, not append: appending would insert the whole tuple as one element
            args.extend(other_args)

        client_conf = {
            'log_file': wd + '/api.log',
            'log_level': client_log_level
        }

        _logger.info(f'Starting QCG-PJ Manager with arguments: {args}')

        # create QCGPJ Manager (service part)
        self._qcgpjm = LocalManager(args, client_conf)
Example #4
def test_slurmenv_api_resources():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})
        api_res = m.resources()

        assert all(('total_nodes' in api_res, 'total_cores' in api_res))
        assert all((api_res['total_nodes'] == resources.total_nodes, api_res['total_cores'] == resources.total_cores))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(join(tmpdir, '.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log'))))

    finally:
        if m:
            m.finish()
            # stopManager is using 'terminate' method on service process, which is not a best option when using
            # pytest and gathering code coverage
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #5
def test_slurmenv_api_submit_many_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({ 'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': [ '--fqdn' ],
                         'stdout': 'out',
                     },
                     'resources': { 'numCores': { 'exact': resources.total_cores } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        # check that the job's working directory is inside the service's working directory
        assert tmpdir == jinfos['host'].wdir, str(jinfos['host'].wdir)
        assert all((len(jinfos['host'].nodes) == resources.total_nodes,
                    jinfos['host'].total_cores == resources.total_cores)), str(jinfos['host'])

    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #6
def test_request_general(tmpdir):
    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'request' element
        with pytest.raises(ConnectionError, match=r".*Invalid request.*"):
            m.send_request({'notARequestElement': 'some value'})

        # unknown 'request'
        with pytest.raises(ConnectionError, match=r".*Unknown request name.*"):
            m.send_request({'request': 'some unknown request'})
    finally:
        m.finish()
Example #7
def test_resume_tracker_files(tmpdir):
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4'], {'wdir': str(tmpdir)})

        job_req = {
                   'name': 'host',
                   'execution': {
                       'exec': '/bin/date',
                       'stdout': 'out',
                   },
                   'resources': { 'numCores': { 'exact': 1 } }
                   }
        jobs = Jobs().add_std(job_req)
        submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        time.sleep(1)
        aux_dir = find_single_aux_dir(str(tmpdir))

        print(f'aux_dir content: {str(listdir(aux_dir))}')

        assert all(exists(join(aux_dir, fname)) for fname in ['track.reqs', 'track.states']), \
            f"missing tracker files in {aux_dir}: {str(listdir(aux_dir))}"

    finally:
        if m:
            m.finish()
            m.cleanup()

    rmtree(tmpdir)
Example #8
def test_slurmenv_api_submit_simple():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().\
            add_std({ 'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': [ '--fqdn' ],
                         'stdout': 'std.out',
                         'stderr': 'std.err'
                     }})
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #9
def test_request_control(tmpdir):
    # raw control request test
    req = ControlReq({
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    })
    req_clone = ControlReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'command' for control request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong control request - missing command.*"):
            m.send_request({'request': 'control'})

        # unknown 'command' for control request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong control request - unknown command.*"):
            m.send_request({
                'request': 'control',
                'command': 'unknown command'
            })

        # finishAfterAllTasksDone 'command' for control request
        res = m.send_request({
            'request': 'control',
            'command': 'finishAfterAllTasksDone'
        })
        assert all(
            (res.get('code', -1) == 0,
             res.get('message', None) == 'finishAfterAllTasksDone command accepted'))
    finally:
        try:
            # if finishAfterAllTasksDone has been sent we might get error 'Finish request already requested'
            m.finish()
        except Exception:
            pass
Example #10
def test_local_manager_resources(tmpdir):
    cores = 4

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes',
                      str(cores)], {'wdir': str(tmpdir)})

    res = m.resources()

    assert all(('total_nodes' in res, 'total_cores' in res,
                res['total_nodes'] == 1, res['total_cores'] == cores))

    m.finish()
    #    m.stopManager()
    m.cleanup()
Example #11
def test_slurmenv_api_iteration_simple():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        its = 2
        jobs = Jobs(). \
            add_std({ 'name': 'host',
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'exact': 1 } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos
        jinfo = jinfos['host']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))

        its = 2
        jobs = Jobs(). \
            add_std({ 'name': 'host2',
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'exact': 1 } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos['host2']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format('host2', iteration),
                        job_it.wdir == tmpdir, job_it.total_cores == 1))
    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #12
def test_local_manager_resources_nodes(tmpdir):
    nodes = 2
    cores_per_node = 3
    res_desc = ','.join([str(cores_per_node) for i in range(nodes)])

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes', res_desc],
                     {'wdir': str(tmpdir)})

    res = m.resources()

    assert all(
        ('total_nodes' in res, 'total_cores' in res, res['total_nodes'] == 2,
         res['total_cores'] == cores_per_node * nodes))

    m.finish()
    #    m.stopManager()
    m.cleanup()
Example #13
def test_slurmenv_api_std_streams_many_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({ 'name': 'host',
                     'execution': {
                         'exec': 'cat',
                         'stdin': '/etc/system-release',
                         'stdout': 'out',
                         'stderr': 'err'
                     },
                     'resources': {
                         'numCores': { 'exact': 2 }
                     }
                     })
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        assert all((exists(join(tmpdir, 'out')), exists(join(tmpdir, 'err'))))

        with open(join(tmpdir, 'out'), 'rt') as out_f:
            out = out_f.read()

        with open(join('/etc/system-release'), 'rt') as sr_f:
            system_release = sr_f.read()

        assert system_release in out
    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #14
class QCGPJExecutor(Executor):
    """QCG-PilotJob Executor. It provides simplified interface for common uses of QCG-PilotJob

    Parameters
    ----------
    wd : str, optional
        Working directory where QCG-PilotJob manager should be started; by default it is
        the current directory
    resources : str, optional
        The resources to use. If specified, forces usage of the Local mode of QCG-PilotJob Manager.
        The format is compliant with the NODES format of QCG-PilotJob, i.e.:
        [node_name:]cores_on_node[,node_name2:cores_on_node][,...].
        E.g. to define 4 cores on an unnamed node use `resources="4"`,
        to define 2 nodes: node_1 with 2 cores and node_2 with 3 cores, use `resources="node_1:2,node_2:3"`
    reserve_core : bool, optional
        If True, reserves a core for the QCG-PilotJob Manager instance;
        by default QCG-PilotJob Manager shares a core with computing tasks
    enable_rt_stats : bool, optional
        If True, QCG-PilotJob Manager will collect its runtime statistics
    wrapper_rt_stats : str, optional
        The path to the QCG-PilotJob Manager tasks wrapper program used for collection of statistics
    log_level : str, optional
        Logging level for QCG-PilotJob Manager (for both service and client part).
    other_args : optional
        Optional list of additional arguments for initialisation of QCG-PilotJob Manager

    Returns
    -------
    None

    """
    def __init__(self,
                 wd=".",
                 resources=None,
                 reserve_core=False,
                 enable_rt_stats=False,
                 wrapper_rt_stats=None,
                 log_level='info',
                 *other_args):

        self.finished = False

        # ---- QCG PILOT JOB INITIALISATION ---

        # Establish logging levels
        service_log_level, client_log_level = self._setup_qcgpj_logging(
            log_level)

        # Prepare input arguments for QCG-PJM

        args = ['--log', service_log_level, '--wd', wd]

        if resources:
            args.append('--nodes')
            args.append(str(resources))

        if reserve_core:
            args.append('--system-core')

        if enable_rt_stats:
            args.append('--enable-rt-stats')

        if wrapper_rt_stats:
            args.append('--wrapper-rt-stats')
            args.append(wrapper_rt_stats)

        if other_args:
            # extend, not append: appending would insert the whole tuple as one element
            args.extend(other_args)

        client_conf = {
            'log_file': wd + '/api.log',
            'log_level': client_log_level
        }

        _logger.info(f'Starting QCG-PJ Manager with arguments: {args}')

        # create QCGPJ Manager (service part)
        self._qcgpjm = LocalManager(args, client_conf)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def shutdown(self, wait=True):
        """Shutdowns the QCG-PJ manager service. If it is already closed, the method has no effect.
        """
        if not self.finished:
            self._qcgpjm.finish()
            self.finished = True
        else:
            pass

    def submit(self, fn: Callable[..., Union[str, Tuple[str, Dict[str, Any]]]],
               *args, **kwargs):
        """Submits a specific task to the QCG-PJ manager using template-based, executor-like interface.

        Parameters
        ----------
        fn : Callable
            A callable that returns a tuple representing a task's template.
            The first element of the tuple should be a string containing
            a QCG-PilotJob task's description with placeholders
            (identifiers preceded by $ symbol) and the second a dictionary
            that assigns default values for selected placeholders.
        *args: variable length list with dicts, optional
            A set of dicts which contain parameters that will be used to substitute placeholders
            defined in the template.
            Note: *args overwrite defaults, but they are overwritten by **kwargs
        **kwargs: arbitrary keyword arguments
            A set of keyword arguments that will be used to substitute placeholders defined in
            the template.
            Note: **kwargs overwrite *args and defaults.

        Returns
        -------
        QCGPJFuture
            The QCGPJFuture object assigned with the submitted task

        """
        template = fn()
        if isinstance(template, tuple):
            template_str = template[0]
            defaults = template[1]
        else:
            template_str = template
            defaults = {}

        t = Template(textwrap.dedent(template_str))

        substitutions = {}

        for a in args:
            if a is not None:
                substitutions.update(a)

        substitutions.update(kwargs)

        td_str = t.substitute(defaults, **substitutions)
        td = ast.literal_eval(td_str)
        if 'env' not in td['execution']:
            td['execution']['env'] = {}
        td['execution']['env']['QCG_PM_EXEC_API_JOB_ID'] = '${jname}'
        jobs = Jobs()
        jobs.add_std(td)
        jobs_ids = self._qcgpjm.submit(jobs)
        return QCGPJFuture(jobs_ids, self._qcgpjm)

    @property
    def qcgpj_manager(self):
        """Returns current QCG-PilotJob manager instance
        """
        return self._qcgpjm

    @staticmethod
    def _setup_qcgpj_logging(log_level):
        log_level = log_level.upper()

        try:
            service_log_level = ServiceLogLevel[log_level].value
        except KeyError:
            service_log_level = ServiceLogLevel.DEBUG.value

        try:
            client_log_level = ClientLogLevel[log_level].value
        except KeyError:
            client_log_level = ClientLogLevel.DEBUG.value

        return service_log_level, client_log_level
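Below is a short usage sketch (not part of the original listing) of the executor-style interface defined above. The template function `hostname_task` and its placeholder names are hypothetical; the sketch only relies on the `submit()` contract documented above, i.e. a callable returning a $-placeholder template plus a defaults dict, which `submit()` substitutes and parses with `ast.literal_eval`. The import path is an assumption about where the Executor API lives.

from qcg.pilotjob.executor_api.qcgpj_executor import QCGPJExecutor  # assumed import path

def hostname_task():
    # hypothetical template: after substitution it must be a valid Python
    # dict literal, because submit() parses it with ast.literal_eval
    template = """
        {
            'name': '${name}',
            'execution': {
                'exec': '${exec}',
                'stdout': '${name}.out',
            },
            'resources': { 'numCores': { 'exact': ${cores} } }
        }
    """
    defaults = {'cores': 1}
    return template, defaults

# the context manager calls shutdown() on exit, which finishes the service
with QCGPJExecutor(resources='4') as executor:
    # kwargs overwrite the template defaults, as described in submit()
    future = executor.submit(hostname_task, name='host', exec='/bin/hostname')
    # the returned QCGPJFuture can be used to track the submitted task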
Example #15
def test_resume_simple(tmpdir):
    m = None
    try:
        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores)],
                         {'wdir': str(tmpdir)})

        its = 10
        job_req = {
            'name': 'sleep',
            'execution': {
                'exec': '/bin/sleep',
                'args': [ '4s' ],
                'stdout': 'out',
            },
            'iteration': { 'stop': its },
            'resources': { 'numCores': { 'exact': 1 } }
        }
        jobs = Jobs().add_std(job_req)
        job_ids = m.submit(jobs)

        # because job iterations execute in order, after the 4th iteration finishes
        # the three previous ones should also be finished
        m.wait4('sleep:3')
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == ncores, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            exp_status = ['SUCCEED']
            if iteration > 3:
                exp_status = ['EXECUTING', 'SCHEDULED', 'QUEUED']
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status in exp_status)),\
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != {exp_status}"

        # kill process
        m.kill_manager_process()
        m.cleanup()

        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores),
                          '--resume', tmpdir],
                         {'wdir': str(tmpdir)})

        m.wait4all()
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status == 'SUCCEED')), \
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != SUCCEED"
    finally:
        if m:
            m.finish()
            m.cleanup()
Example #16
WORKER_CORES = int(args.WORKER_CORES)
DATA_DIR = 'input_csv'
if WORKER_CORES > 1:
    PYTHON_CMD = "mpirun -n %d python3" % (WORKER_CORES)
else:
    PYTHON_CMD = "python3"
'''
######################################################################
    config PilotJob
######################################################################  
'''

from qcg.pilotjob.api.manager import Manager
from qcg.pilotjob.api.manager import LocalManager
from qcg.pilotjob.api.job import Jobs
m = LocalManager(cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug'])

# get available resources
print("\n\navailable resources:\n%s\n" % str(m.resources()))

# submit jobs and save their names in 'ids' list
jobs = Jobs()

print("Start Adding jobs . . .\n\n")

WORKER_INDEX = 0
for i in range(NUM_WORKERS):
    for SUBMODEL in ['macro', 'micro']:
        cmd = '%s run_couple.py --submodel %s --data_dir=%s --worker_index %d --coupling_type %s --num_workers %d --weather_coupling %s' % (
            PYTHON_CMD, SUBMODEL, DATA_DIR, WORKER_INDEX, COUPLING_TYPE,
            NUM_WORKERS, WEATHER_COUPLING)
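
The snippet above is truncated where `cmd` is built. Below is a minimal sketch of how such commands are typically registered and submitted, following the Jobs/LocalManager calls used elsewhere in this listing; the job names and stdout files are hypothetical, and NUM_WORKERS, COUPLING_TYPE and WEATHER_COUPLING are assumed to be defined by the surrounding script.

        # hypothetical continuation: register the command as a task
        jobs.add(
            name='%s_%d' % (SUBMODEL, WORKER_INDEX),
            script=cmd,
            numCores=WORKER_CORES,
            stdout='%s_%d.out' % (SUBMODEL, WORKER_INDEX))
    WORKER_INDEX += 1

# submit all collected jobs, wait for their completion and shut down the service
ids = m.submit(jobs)
m.wait4all()
m.finish()
m.cleanup()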
Example #17
DATA_DIR = "input_csv"
if INSTANCE_CORES > 1:
    PYTHON_CMD = "mpirun -n %d python3" % (INSTANCE_CORES)
else:
    PYTHON_CMD = "python3"
"""
######################################################################
    config PilotJob
######################################################################
"""

from qcg.pilotjob.api.manager import Manager
from qcg.pilotjob.api.manager import LocalManager
from qcg.pilotjob.api.job import Jobs
# m = LocalManager(cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug'])
m = LocalManager()

# get available resources
print("\n\navailable resources:\n%s\n" % str(m.resources()))

# submit jobs and save their names in 'ids' list
jobs = Jobs()

print("Start Adding jobs . . .\n\n")

INSTANCE_INDEX = 0
for i in range(NUM_INSTANCES):
    for SUBMODEL in ['macro', 'micro']:
        cmd = '%s run_mscale.py --submodel %s --data_dir=%s --instance_index %d --coupling_type %s --num_instances %d --weather_coupling %s' % (
            PYTHON_CMD, SUBMODEL, DATA_DIR, INSTANCE_INDEX, COUPLING_TYPE,
            NUM_INSTANCES, WEATHER_COUPLING)
Example #18
def test_slurmenv_api_cancel_kill_nl():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters, stdout='sleep.out.${it}',
                           stderr='sleep.err.${it}', numCores=1)
                       )
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # the canceled iterations are included in 'failed' entry in job statistics
        # the cancel status is presented in 'childs/state' entry
        assert all((jinfos[jid].iterations, jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters, jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters, jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)

    finally:
        if m:
            m.finish()
            m.cleanup()
Example #19
def test_request_notify(tmpdir):
    # raw notify request test
    req = NotifyReq({
        'request': 'notify',
        'entity': 'job',
        'params': {
            'name': 'j1',
            'state': 'FINISHED',
            'attributes': {
                'a1': True
            }
        }
    })
    req_clone = NotifyReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'entity' for notify request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing/unknown entity.*"):
            m.send_request({'request': 'notify'})

        # unknown 'entity' for notify request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing/unknown entity.*"):
            m.send_request({'request': 'notify', 'entity': 'task'})

        # missing params
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing register parameters.*"
        ):
            m.send_request({'request': 'notify', 'entity': 'job'})

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'name': 'j1'
                }
            })

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'name': 'j1',
                    'state': 'FINISHED'
                }
            })

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'state': 'FINISHED',
                    'attributes': 'a1'
                }
            })

    finally:
        m.finish()
Example #20
def test_slurmenv_api_submit_resource_ranges():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({ 'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': [ '--fqdn' ],
                         'stdout': 'out',
                     },
                     'resources': { 'numCores': { 'min': 1 } }
                     })
        # job should fail because of the missing 'max' parameter
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'FAILED')
        jinfo = jinfos['host']
        assert "Both core's range boundaries (min, max) must be defined" in jinfo.messages, str(jinfo)

        jobs = Jobs(). \
            add_std({ 'name': 'host2',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': [ '--fqdn' ],
                         'stdout': 'out',
                     },
                     'resources': {
                         'numNodes': { 'exact': 1 },
                         'numCores': { 'min': 1, 'max': resources.nodes[0].total + 1 } }
                     })
        # job should run on single node (the first free) with all available cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host2']
        assert all((len(jinfo.nodes) == 1, jinfo.total_cores == resources.nodes[0].total)), str(jinfo)

        jobs = Jobs(). \
            add_std({ 'name': 'host3',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': [ '--fqdn' ],
                         'stdout': 'out',
                     },
                     'resources': {
                         'numCores': { 'min': 1, 'max': resources.nodes[0].total + 1 } }
                     })
        # job should run on at least two nodes with total maximum given cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host3']
        assert all((len(jinfo.nodes) == 2, jinfo.total_cores == resources.nodes[0].total + 1)), str(jinfo)

    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #21
def test_request_register(tmpdir):
    # raw register request test
    req = RegisterReq({
        'request': 'register',
        'entity': 'manager',
        'params': {
            'id': 'm1',
            'address': '0.0.0.0',
            'resources': {
                'nodes': 2
            }
        }
    })
    req_clone = RegisterReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'entity' for register request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong register request - missing/unknown entity.*"):
            m.send_request({'request': 'register'})

        # unknown 'entity' for register request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong register request - missing/unknown entity.*"):
            m.send_request({'request': 'register', 'entity': 'job'})

        # missing params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing register parameters.*"):
            m.send_request({'request': 'register', 'entity': 'manager'})

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'id': 'm1'
                }
            })

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'id': 'm1',
                    'address': '0.0.0.0'
                }
            })

        # missing key params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'resources': {
                        'nodes': 1
                    },
                    'address': '0.0.0.0'
                }
            })

    finally:
        m.finish()
Example #22
def test_slurmenv_api_iteration_core_scheduling():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in this case 'split-into' defaults to the number of iterations,
        # so the total available resources should be split into two partitions and each
        # iteration should run on its own partition
        jname = 'host'
        its = 2
        jobs = Jobs(). \
           add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        # all iterations have been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / its for child in jinfo.childs)

        # we explicitly set the 'split-into' parameter to 2; the behavior should be the same as in the
        # previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into', 'params': { 'parts': 2 } } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        # all iterations have been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # we explicitly set the 'split-into' parameter to 4, so the two iterations should be scheduled
        # on half of the available resources
        jname = 'host3'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into', 'params': { 'parts': 4 } } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        # all iterations have been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores / 2
        assert all(child.total_cores == resources.total_cores / 4 for child in jinfo.childs)

        # we explicitly set the 'split-into' parameter to 2, but the number of iterations is larger than the
        # number of partitions available at the same time, so they should be executed serially (in parts)
        jname = 'host4'
        its = 10
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'hostname', 'args': [ '--fqdn' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into', 'params': { 'parts': 2 } } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible at the same time
        # on all available resources
        jname = 'host5'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible at the same time
        # on all available resources
        jname = 'host6'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # when the number of iterations exceeds the number of available resources, the 'maximum-iters' scheduler
        # splits iterations into 'steps', minimizing their number, and allocates as many resources as possible
        # to each iteration inside a 'step'
        jname = 'host7'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # when the number of iterations exceeds the number of available resources, the 'maximum-iters' scheduler
        # splits iterations into 'steps', minimizing their number, and allocates as many resources as possible
        # to each iteration inside a 'step'
        jname = 'host8'
        its = resources.total_cores * 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)
        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores * 2

        # when the number of iterations exceeds the number of available resources, the 'maximum-iters' scheduler
        # splits iterations into 'steps', minimizing their number, and allocates as many resources as possible
        # to each iteration inside a 'step'
        jname = 'host9'
        its = resources.total_cores + 1
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1)), str(job_it)
        assert all(child.total_cores == 1 for child in jinfo.childs)
        # because all iterations will be split into two 'steps', and in each step the iterations assigned
        # to that step should use the maximum available resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores * 2


        # in this case two iterations can't fit on the resources at once, so all iterations should be
        # scheduled serially, each on all available resources
        jname = 'host10'
        its = resources.total_nodes
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'min': resources.total_cores - 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.total_cores, len(job_it.nodes) == resources.total_nodes)),\
                str(job_it)
    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #23
def test_slurmenv_api_submit_exceed_total_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                     'execution': { 'exec': '/bin/date' },
                     'resources': {
                         'numCores': { 'exact': resources.total_cores + 1 }
                     }})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                     'execution': { 'exec': '/bin/date' },
                     'resources': {
                         'numNodes': { 'exact': resources.total_nodes + 1 }
                     }})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            ids = m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                     'execution': {
                         'exec': '/bin/date',
                         'stdout': 'std.out',
                     },
                     'resources': { 'numCores': { 'exact': resources.total_cores  } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m:
            m.finish()
#            m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #24
def test_local_manager_wait4all(tmpdir):
    cores = 4

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes',
                      str(cores)], {'wdir': str(tmpdir)})

    res = m.resources()

    assert all(('total_nodes' in res, 'total_cores' in res,
                res['total_nodes'] == 1, res['total_cores'] == cores))

    ids = m.submit(Jobs().add(name='host',
                              exec='/bin/hostname',
                              args=['--fqdn'],
                              stdout='host.stdout').add(name='date',
                                                        exec='/bin/date',
                                                        stdout='date.stdout',
                                                        numCores={'exact': 2}))

    assert len(m.list()) == 2

    m.wait4all()

    jinfos = m.info(ids)

    assert all(('jobs' in jinfos, len(jinfos['jobs'].keys()) == 2,
                'host' in jinfos['jobs'], 'date' in jinfos['jobs'],
                jinfos['jobs']['host'].get('data', {}).get('status', '') == 'SUCCEED',
                jinfos['jobs']['date'].get('data', {}).get('status', '') == 'SUCCEED'))

    aux_dir = find_single_aux_dir(str(tmpdir))

    assert all(
        (exists(tmpdir.join('.qcgpjm-client', 'api.log')),
         exists(join(aux_dir, 'service.log')),
         exists(tmpdir.join('host.stdout')),
         exists(tmpdir.join('date.stdout'))))

    m.finish()
    #    m.stopManager()
    m.cleanup()
Example #25
def test_slurmenv_api_iteration_node_scheduling():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    # TODO: it's hard to write comprehensive iteration node scheduling tests on only
    #  two nodes (in Slurm's development docker)

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in this case 'split-into' defaults to the number of iterations, so the total
        # available resources should be split into two partitions and each iteration
        # should run on its own partition
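        # For reference, a hedged sketch of the explicit equivalent (the default
        # described above should behave like 'parts' set to the iteration count,
        # which the second submission below spells out):
        #   'scheduler': { 'name': 'split-into', 'params': { 'parts': its } }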
        jname = 'host'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out_${it}', 'stderr': 'err_${it}' },
                     'resources': { 'numCores': { 'exact': resources.nodes[0].total },
                                    'numNodes': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total, len(job_it.nodes) == 1)), str(job_it)
        # all iterations have been scheduled across all nodes
        assert sum([ len(child.nodes) for child in jinfo.childs ]) == resources.total_nodes
        # each iteration should execute on a different node
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # we explicitly specify the 'split-into' parameter as 2, so the behavior should
        # be the same as in the previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'exact': resources.nodes[0].total },
                                    'numNodes': { 'min': 1,
                                                  'scheduler': { 'name': 'split-into', 'params': { 'parts': 2 } } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total, len(job_it.nodes) == 1)), str(job_it)
        # all iterations have been scheduled across all nodes
        assert sum([ len(child.nodes) for child in jinfo.childs ]) == resources.total_nodes
        # each iteration should execute on a different node
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible
        # at the same time on all available resources
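        # A rough illustration (an assumption based on the comment above, not asserted
        # by this test): with the two-node allocation required by the skip guard and
        # four iterations each taking a full node, 'maximum-iters' should run the
        # iterations in two waves of two concurrent iterations each.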
        jname = 'host3'
        its = 4
        jobs = Jobs(). \
            add_std({ 'name': jname,
                     'iteration': { 'stop': its },
                     'execution': { 'exec': 'sleep', 'args': [ '2s' ], 'stdout': 'out' },
                     'resources': { 'numCores': { 'exact': resources.nodes[0].total },
                                    'numNodes': { 'min': 1,
                                                  'scheduler': { 'name': 'maximum-iters' } } }
                     })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total, len(job_it.nodes) == 1)), str(job_it)
        assert sum([len(child.nodes) for child in jinfo.childs]) == its

    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)
Example #26
0
                        default="log_MOO.txt")

    parser.add_argument("--cores", action="store", type=int, default="1")
    parser.add_argument("--USE_PJ", action="store", default="False")

    args = parser.parse_args()

    execution_mode = args.execution_mode
    simulation_period = args.simulation_period
    cores = args.cores

    if args.USE_PJ.lower() == "true":
        USE_PJ = True
        from qcg.pilotjob.api.manager import LocalManager
        QCG_MANAGER = LocalManager(
            cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug']
        )
    else:
        USE_PJ = False
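
    # Hedged note, mirroring the cleanup pattern of the other examples in this
    # document: a manager created this way is typically released at the end of
    # the run with QCG_MANAGER.finish() and QCG_MANAGER.cleanup().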

    EXEC_LOG_FILE = os.path.join(work_dir, args.exec_log_file)

    MOO_log(msg="run_MOO input args : {}".format(args))

    # read MOO setting from config yaml file
    MOO_CONFIG = read_MOO_setting_yaml()
    MOO_log(msg="MOO_CONFIG =\n{}".format(pformat(MOO_CONFIG)))

    problem = FLEE_MOO_Problem(
        execution_mode=execution_mode,
        simulation_period=simulation_period,
Example #27
0
def test_request_submit(tmpdir):
    # raw submit request test
    req = SubmitReq({
        'request': 'submit',
        'jobs': [{
            'name': 'job1',
            'execution': {
                'exec': '/bin/date',
                'args': ['1', '2']
            }
        }, {
            'name': 'job2',
            'execution': {
                'script': 'date'
            },
            'resources': {
                'numCores': {
                    'exact': 1
                }
            }
        }]
    })
    req_clone = SubmitReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2],
                     {'wdir': str(tmpdir)})

    try:
        # missing 'jobs' for submit request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong submit request - missing jobs data.*"):
            m.send_request({'request': 'submit'})

        # wrong 'jobs' data format for submit request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong submit request - missing jobs data.*"):
            m.send_request({'request': 'submit', 'jobs': None})

        # wrong 'jobs' data format for submit request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong submit request - missing jobs data.*"):
            m.send_request({'request': 'submit', 'jobs': 'not a list'})

        # wrong 'jobs' data format for submit request
        with pytest.raises(ConnectionError,
                           match=r".*Wrong submit request - wrong job data.*"):
            m.send_request({'request': 'submit', 'jobs': ['not a dictionary']})

        # missing job's name
        with pytest.raises(ConnectionError,
                           match=r".*Missing name in job description.*"):
            m.send_request({
                'request': 'submit',
                'jobs': [{
                    'execution': '/bin/date'
                }]
            })

        # missing execution element
        with pytest.raises(
                ConnectionError,
                match=r".*Missing execution element in job description.*"):
            m.send_request({'request': 'submit', 'jobs': [{'name': 'date'}]})

        # wrong iterations format
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong format of iteration directive: not a dictionary.*"):
            m.send_request({
                'request': 'submit',
                'jobs': [{
                    'name': 'date',
                    'execution': {
                        'exec': '/bin/date'
                    },
                    'iteration': 'not a list'
                }]
            })

        # wrong iterations format
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong format of iteration directive: start index larger then stop one.*"):
            m.send_request({
                'request': 'submit',
                'jobs': [{
                    'name': 'date',
                    'execution': {
                        'exec': '/bin/date'
                    },
                    'iteration': {
                        'start': 2,
                        'stop': 1
                    }
                }]
            })

    finally:
        m.finish()
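
For contrast with the malformed requests above, a well-formed raw submit mirrors the SubmitReq payload at the top of this example. A hedged sketch (the job name 'ok' is illustrative; in a real session it would be sent inside the try block, before finish()):

    m.send_request({'request': 'submit',
                    'jobs': [{'name': 'ok',
                              'execution': {'exec': '/bin/date'}}]})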