Example #1
0
def test_runtime_mismatch(pilot_description):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.task_manager')
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.db.database')
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.session')

        session = rp.Session()

        with session:
            original_pmgr = rp.PilotManager(session=session)
            pilot = original_pmgr.submit_pilots(rp.PilotDescription(pilot_description))
            original_tmgr = rp.TaskManager(session=session)
            original_tmgr.add_pilots(pilot)

        assert session.closed
        # This assertion may not be true:
        # assert pilot.state in rp.FINAL
        # Note that Pilot and other components may still be shutting down, but the
        # intention is that, from this point, pmgr, pilot, and tmgr are now "stale".

        session = rp.Session()

        with session:
            state = Runtime(session=session)

            with pytest.raises(APIError):
                state.task_manager(original_tmgr)
            original_tmgr.close()

            tmgr = rp.TaskManager(session=session)
            state.task_manager(tmgr)

            with pytest.raises(APIError):
                state.pilot_manager(original_pmgr)
            original_pmgr.close()

            pmgr = rp.PilotManager(session=session)
            state.pilot_manager(pmgr)

            # The UID will not resolve in the stored PilotManager.
            with pytest.raises(ValueError):
                state.pilot(pilot.uid)

            # The Pilot is detectably invalid.
            with pytest.raises(APIError):
                state.pilot(pilot)

            # Even here, the old Pilot may still be in 'PMGR_ACTIVE_PENDING'
            if pilot.state not in rp.FINAL:
                pilot.cancel()
            tmgr.close()
            pmgr.close()
        assert session.closed
Example #2
0
    def __init__(self, descr: dict, executor: jpsi.JobExecutor,
                 url: str) -> None:

        jpsi.ExecutorAdaptorBase.__init__(self, descr, executor, url)

        self._url = ru.Url(url)
        if self._url.schema != 'rp':
            raise ValueError('handle only rp:// URLs, not %s', self._url)

        try:
            self._jobs = dict()  # {job.uid : [JPSI_JOB, RP_TASK]
            self._lock = mt.Lock()

            self._session = rp.Session()

            self._pmgr = rp.PilotManager(session=self._session)
            self._tmgr = rp.TaskManager(session=self._session)

            self._pmgr.register_callback(self._pilot_state_cb)
            self._tmgr.register_callback(self._task_state_cb)

            # this is layer 0, so we just create a dummy pilot
            pd = rp.PilotDescription({
                'resource': 'local.localhost',
                'cores': 16,
                'runtime': 60
            })
            self._pilot = self._pmgr.submit_pilots(pd)
            self._tmgr.add_pilots(self._pilot)

        except Exception:
            self._log.exception('init failed')
            raise
Example #3
0
def test_runtime_bad_uid(pilot_description):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.task_manager')
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.db.database')
        warnings.filterwarnings('ignore', category=DeprecationWarning,
                                module='radical.pilot.session')

        session = rp.Session()

        with session:
            state = Runtime(session=session)

            with pytest.raises(ValueError):
                state.task_manager('spam')

            tmgr = rp.TaskManager(session=session)
            state.task_manager(tmgr)

            with pytest.raises(ValueError):
                state.pilot_manager('spam')

            pmgr = rp.PilotManager(session=session)
            state.pilot_manager(pmgr)

            with pytest.raises(ValueError):
                state.pilot_manager('spam')

            tmgr.close()
            pmgr.close()

        assert session.closed
Example #4
0
def _new_taskmanager(session: rp.Session, pilot: rp.Pilot):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore',
                                category=DeprecationWarning,
                                module='radical.pilot.task_manager')
        warnings.filterwarnings('ignore',
                                category=DeprecationWarning,
                                module='radical.pilot.db.database')
        warnings.filterwarnings('ignore',
                                category=DeprecationWarning,
                                module='radical.pilot.session')

        tmgr = rp.TaskManager(session=session)
        tmgr.add_pilots(pilot)
    return tmgr
Example #5
0
def test_rp_raptor_staging(pilot_description, rp_venv):
    """Test file staging for raptor Master and Worker tasks.

    - upon pilot startup, transfer a file to the pilot sandbox
    - upon master startup, create a link to that file for each master
    - for each task, copy the file into the task sandbox
    - upon task completion, transfer the files to the client (and rename them)
    """
    import time
    import radical.pilot as rp

    # Note: we need to install the current scalems package to test remotely.
    # If this is problematic, we can add a check like the following.
    #     if pilot_description.resource != 'local.localhost' \
    #             and pilot_description.access_schema \
    #             and pilot_description.access_schema != 'local':
    #         pytest.skip('This test is only for local execution.')

    # Note: radical.pilot.Session creation causes several deprecation warnings.
    # Ref https://github.com/radical-cybertools/radical.pilot/issues/2185
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=DeprecationWarning)
        session = rp.Session()
    fname = '%d.dat' % os.getpid()
    fpath = os.path.join('/tmp', fname)
    data: str = time.asctime()

    # Hopefully, this requirement is temporary.
    if rp_venv is None:
        pytest.skip('This test requires a user-provided static RP venv.')

    if rp_venv:
        pre_exec = ['. {}/bin/activate'.format(rp_venv)]
    else:
        pre_exec = None

    try:
        pmgr = rp.PilotManager(session=session)
        tmgr = rp.TaskManager(session=session)

        # Illustrate data staging as part of the Pilot launch.
        # By default, file is copied to the root of the Pilot sandbox,
        # where it can be referenced as 'pilot:///filename'
        # Alternatively: pilot.stage_in() and pilot.stage_output() (blocking calls)
        pilot_description.exit_on_error = True
        pilot_description.input_staging = [fpath]
        with open(fpath, 'w') as fh:
            fh.writelines([data])
        try:
            pilot = pmgr.submit_pilots(pilot_description)
            # Confirmation that the input file has been staged by waiting for pilot state.
            pilot.wait(state=[rp.states.PMGR_ACTIVE] + rp.FINAL)
        finally:
            os.unlink(fpath)

        tmgr.add_pilots(pilot)

        uid = 'scalems.master.001'
        # Illustrate another mode of data staging with the Master task submission.
        td = rp.TaskDescription({
            'uid':
            uid,
            'executable':
            'scalems_rp_master',
            'input_staging': [{
                'source': 'pilot:///%s' % fname,
                'target': 'pilot:///%s.%s.lnk' % (fname, uid),
                'action': rp.LINK
            }],
            'pre_exec':
            pre_exec
            # 'named_env': 'scalems_env'
        })

        master = tmgr.submit_tasks(td)

        # Illustrate availability of scheduler and of data staged with Master task.
        # When the task enters AGENT_SCHEDULING_PENDING it has passed all input staging,
        # and the files will be available.
        # (see https://docs.google.com/drawings/d/1q5ehxIVdln5tXEn34mJyWAmxBk_DqZ5wwkl3En-t5jo/)

        # Confirm that Master script is running (and ready to receive raptor tasks)
        # WARNING: rp.Task.wait() *state* parameter does not handle tuples, but does not check type.
        master.wait(state=[rp.states.AGENT_EXECUTING] + rp.FINAL)
        assert master.state not in {rp.CANCELED, rp.FAILED}

        # define raptor tasks and submit them to the master
        tds = list()
        # Illustrate data staging as part of raptor task submission.
        # Note that tasks submitted by the client
        # a sandboxed task directory, whereas those submitted by the Master (through Master.request(),
        # through the wrapper script or the Master.create_initial_tasks() hook) do not,
        # and do not have a data staging phase.
        for i in range(3):
            uid = 'scalems.%06d' % i
            work = {
                'mode': 'call',
                'cores': 1,
                'timeout': 10,  # seconds
                'data': {
                    'method': 'hello',
                    'kwargs': {
                        'world': uid
                    }
                }
            }
            tds.append(
                rp.TaskDescription({
                    'uid':
                    uid,
                    'executable':
                    '-',
                    'input_staging': [{
                        'source':
                        'pilot:///%s.%s.lnk' % (fname, master.uid),
                        'target':
                        'task:///%s' % fname,
                        'action':
                        rp.COPY
                    }],
                    'output_staging': [{
                        'source':
                        'task:///%s' % fname,
                        'target':
                        'client:///%s.%s.out' % (fname, uid),
                        'action':
                        rp.TRANSFER
                    }],
                    'scheduler':
                    master.uid,
                    'arguments': [json.dumps(work)],
                    'pre_exec':
                    pre_exec
                }))
        # TODO: Organize client-side data with managed hierarchical paths.
        # Question: RP maintains a filesystem hierarchy on the client side, correct?
        # Answer: only for profiling and such: do not use for data or user-facing stuff.
        tasks = tmgr.submit_tasks(tds)
        # TODO: Clarify the points at which the data exists or is accessed.
        # * When the (client-submitted) task enters AGENT_STAGING_OUTPUT_PENDING,
        #   it has finished executing and output data should be accessible as 'task:///outfile'.
        # * When the (client-submitted) task reaches one of the rp.FINAL stages, it has finished
        #   output staging and files are accessible at the location specified in 'output_staging'.
        # * Tasks submitted directly by the Master (example?) do not perform output staging;
        #   data is written before entering Master.result_cb().
        # RP Issue: client-submitted Tasks need to be accessible through a path that is common
        # with the Master-submitted (`request()`) tasks. (SCALE-MS #108)

        assert len(tasks) == len(tds)
        # 'arguments' (element 0) gets wrapped in a Request at the Master by _receive_tasks,
        # then the list of requests is passed to Master.request(), which is presumably
        # an extension point for derived Master implementations. The base class method
        # converts requests to dictionaries and adds them to a request queue, from which they are
        # picked up by the Worker in _request_cb. Then picked up in forked interpreter
        # by Worker._dispatch, which checks the *mode* of the Request and dispatches
        # according to native or registered mode implementations. (e.g. 'call' (native) or 'scalems')

        # task process is launched with Python multiprocessing (native) module and added to self._pool.
        # When the task runs, it's result triggers _result_cb

        # wait for *those* tasks to complete and report results
        tmgr.wait_tasks(uids=[t.uid for t in tasks])

        # Cancel the master.
        tmgr.cancel_tasks(uids=master.uid)
        # Cancel blocks until the task is done so the following wait it currently redundant,
        # but there is a ticket open to change this behavior.
        # See https://github.com/radical-cybertools/radical.pilot/issues/2336
        tmgr.wait_tasks()

        # Note that these map as follows:
        #     * 'client:///' == $PWD
        #     * 'task:///' == urllib.parse.urlparse(task.sandbox).path
        #     * 'pilot:///' == urllib.parse.urlparse(pilot.pilot_sandbox).path

        for t in tasks:
            print(t)
            outfile = './%s.%s.out' % (fname, t.uid)
            assert os.path.exists(outfile)
            with open(outfile, 'r') as outfh:
                assert outfh.readline().rstrip() == data
            os.unlink(outfile)

        pilot.cancel()
        tmgr.close()
        pmgr.close()

    finally:
        session.close(download=False)
Example #6
0
def _connect_rp(config: Configuration) -> Runtime:
    """Establish the RP Session.

    Acquire as many re-usable resources as possible. The scope established by
    this function is as broad as it can be within the life of this instance.

    Once instance._connect_rp() succeeds, instance._disconnect_rp() must be called to
    clean up resources. Use the async context manager behavior of the instance to
    automatically follow this protocol. I.e. instead of calling
    ``instance._connect_rp(); ...; instance._disconnect_rp()``,
    use::
        async with instance:
            ...

    Raises:
        DispatchError if task dispatching could not be set up.

        CanceledError if parent asyncio.Task is cancelled while executing.

    """
    # TODO: Consider inlining this into __aenter__().
    # A non-async method is potentially useful for debugging, but causes the event loop
    # to block while waiting for the RP tasks included here. If this continues to be a
    # slow function, we can wrap the remaining RP calls and let this function be
    # inlined, or stick the whole function in a separate thread with
    # loop.run_in_executor().

    # TODO: RP triggers SIGINT in various failure modes.
    #  We should use loop.add_signal_handler() to convert to an exception
    #  that we can raise in an appropriate task.
    # Note that PilotDescription can use `'exit_on_error': False` to suppress the SIGINT,
    # but we have not explored the consequences of doing so.

    try:
        #
        # Start the Session.
        #

        # Note that we cannot resolve the full _resource config until we have a Session
        # object.
        # We cannot get the default session config until after creating the Session,
        # so we don't have a template for allowed, required, or default values.
        # Question: does the provided *cfg* need to be complete? Or will it be merged
        # with default values from some internal definition, such as by dict.update()?
        # I don't remember what the use cases are for overriding the default session
        # config.
        session_config = None
        # At some point soon, we need to track Session ID for the workflow metadata.
        # We may also want Session ID to be deterministic (or to be re-used?).
        session_id = None

        # Note: the current implementation implies that only one Task for the dispatcher
        # will exist at a time. We are further assuming that there will probably only
        # be one Task per the lifetime of the dispatcher object.
        # We could choose another approach and change our assumptions, if appropriate.
        logger.debug(
            'Entering RP dispatching context. Waiting for rp.Session.')

        # Note: radical.pilot.Session creation causes several deprecation warnings.
        # Ref https://github.com/radical-cybertools/radical.pilot/issues/2185
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=DeprecationWarning)
            # This would be a good time to `await`, if an event-loop friendly
            # Session creation function becomes available.
            runtime = Runtime(
                session=rp.Session(uid=session_id, cfg=session_config))
        session_id = runtime.session.uid
        # Do we want to log this somewhere?
        # session_config = copy.deepcopy(self.session.cfg.as_dict())
        logger.debug('RP dispatcher acquired session {}'.format(session_id))

        # We can launch an initial Pilot, but we may have to run further Pilots
        # during self._queue_runner_task (or while servicing scalems.wait() within the
        # with block) to handle dynamic work load requirements.
        # Optionally, we could refrain from launching the pilot here, at all,
        # but it seems like a good chance to start bootstrapping the agent environment.
        logger.debug('Launching PilotManager.')
        pilot_manager = rp.PilotManager(session=runtime.session)
        logger.debug('Got PilotManager {}.'.format(pilot_manager.uid))
        runtime.pilot_manager(pilot_manager)

        logger.debug('Launching TaskManager.')
        task_manager = rp.TaskManager(session=runtime.session)
        logger.debug(('Got TaskManager {}'.format(task_manager.uid)))
        runtime.task_manager(task_manager)

        #
        # Get a Pilot
        #

        # # TODO: #94 Describe (link to) configuration points.
        # resource_config['local.localhost'].update({
        #     'project': None,
        #     'queue': None,
        #     'schema': None,
        #     'cores': 1,
        #     'gpus': 0
        # })

        # _pilot_description = dict(_resource=_resource,
        #                          runtime=30,
        #                          exit_on_error=True,
        #                          project=resource_config[_resource]['project'],
        #                          queue=resource_config[_resource]['queue'],
        #                          cores=resource_config[_resource]['cores'],
        #                          gpus=resource_config[_resource]['gpus'])

        # TODO: How to specify PilotDescription? (see also #121)
        # Where should this actually be coming from?
        # We need to inspect both the HPC allocation and the work load, I think,
        # and combine with user-provided preferences.
        pilot_description = {}
        pilot_description.update(
            config.rp_resource_params.get('PilotDescription', {}))
        pilot_description.update({'resource': config.execution_target})

        # TODO: Pilot venv (#90, #94).
        # Currently, Pilot venv must be specified in the JSON file for resource
        # definitions.
        pilot_description = rp.PilotDescription(pilot_description)

        # How and when should we update pilot description?
        logger.debug('Submitting PilotDescription {}'.format(
            repr(pilot_description)))
        pilot = pilot_manager.submit_pilots(pilot_description)
        logger.debug('Got Pilot {}'.format(pilot.uid))
        runtime.pilot(pilot)

        # Note that the task description for the master (and worker) can specify a
        # *named_env* attribute to use a venv prepared via Pilot.prepare_env
        # E.g.         pilot.prepare_env({'numpy_env' : {'type'   : 'virtualenv',
        #                                           'version': '3.6',
        #                                           'setup'  : ['numpy']}})
        #   td.named_env = 'numpy_env'
        # Note that td.named_env MUST be a key that is given to pilot.prepare_env(arg:
        # dict) or the task will wait indefinitely to be scheduled.
        # Alternatively, we could use a pre-installed venv by putting
        # `. path/to/ve/bin/activate`
        # in the TaskDescription.pre_exec list.

        # TODO: Use archives generated from (acquired through) the local installations.
        # # Could we stage in archive distributions directly?
        # # self.pilot.stage_in()
        # pilot.prepare_env(
        #     {
        #         'scalems_env': {
        #             'type': 'virtualenv',
        #             'version': '3.8',
        #             'setup': [
        #                 # TODO: Generalize scalems dependency resolution.
        #                 # Ideally, we would check the current API version
        #                 # requirement, map that to a package version,
        #                 # and specify >=min_version, allowing cached archives to
        #                 # satisfy the dependency.
        #                 rp_spec,
        #                 scalems_spec
        #             ]}})

        # Question: when should we remove the pilot from the task manager?
        task_manager.add_pilots(pilot)
        logger.debug('Added Pilot {} to task manager {}.'.format(
            pilot.uid, task_manager.uid))

        pre_exec = get_pre_exec(config)
        assert isinstance(pre_exec, tuple)
        assert len(pre_exec) > 0
        # Verify usable SCALEMS RP connector.
        # TODO: Fetch a profile of the venv for client-side analysis (e.g. `pip freeze`).
        # TODO: Check for compatible installed scalems API version.
        rp_check = task_manager.submit_tasks(
            rp.TaskDescription({
                # 'executable': py_venv,
                'executable':
                'python3',
                'arguments':
                ['-c', 'import radical.pilot as rp; print(rp.version)'],
                'pre_exec':
                list(pre_exec)
                # 'named_env': 'scalems_env'
            }))
        logger.debug('Checking RP execution environment.')
        states = task_manager.wait_tasks(uids=[rp_check.uid])
        if states[0] != rp.states.DONE or rp_check.exit_code != 0:
            raise DispatchError(
                'Could not verify RP in execution environment.')

        try:
            remote_rp_version = packaging.version.parse(
                rp_check.stdout.rstrip())
        except Exception as e:
            raise DispatchError(
                'Could not determine remote RP version.') from e
        # TODO: #100 Improve compatibility checking.
        if remote_rp_version < packaging.version.parse('1.6.0'):
            raise DispatchError(
                f'Incompatible radical.pilot version in execution '
                f'environment: {str(remote_rp_version)}')

        #
        # Get a scheduler task.
        #

        assert runtime.scheduler is None
        # TODO: #119 Re-enable raptor.
        # runtime.scheduler = _get_scheduler(
        #     'raptor.scalems',
        #     pre_exec=execution_manager._pre_exec,
        #     task_manager=task_manager)
        # Note that we can derive scheduler_name from self.scheduler.uid in later methods.
        # Note: The worker script name only appears in the config file.
        # logger.info('RP scheduler ready.')
        # logger.debug(repr(execution_manager.scheduler))

        return runtime

    except asyncio.CancelledError as e:
        raise e
    except Exception as e:
        logger.exception('Exception while connecting RADICAL Pilot.',
                         exc_info=e)
        raise DispatchError('Failed to launch SCALE-MS master task.') from e