def test_runtime_mismatch(pilot_description):
    """Expect Runtime to reject managers and pilots that belong to a closed Session.

    A PilotManager, Pilot and TaskManager are created in a first Session, which
    is then closed. A Runtime bound to a second Session is expected to raise
    APIError for the stale manager/pilot objects and ValueError for the stale
    pilot UID.
    """
    noisy_modules = (
        'radical.pilot.task_manager',
        'radical.pilot.db.database',
        'radical.pilot.session',
    )
    with warnings.catch_warnings():
        for module_name in noisy_modules:
            warnings.filterwarnings('ignore',
                                    category=DeprecationWarning,
                                    module=module_name)

        stale_session = rp.Session()
        with stale_session:
            original_pmgr = rp.PilotManager(session=stale_session)
            description = rp.PilotDescription(pilot_description)
            pilot = original_pmgr.submit_pilots(description)
            original_tmgr = rp.TaskManager(session=stale_session)
            original_tmgr.add_pilots(pilot)

        assert stale_session.closed
        # This assertion may not be true:
        #     assert pilot.state in rp.FINAL
        # Note that Pilot and other components may still be shutting down, but the
        # intention is that, from this point, pmgr, pilot, and tmgr are now "stale".

        fresh_session = rp.Session()
        with fresh_session:
            state = Runtime(session=fresh_session)

            # A TaskManager from the closed session must be refused.
            with pytest.raises(APIError):
                state.task_manager(original_tmgr)
            original_tmgr.close()

            tmgr = rp.TaskManager(session=fresh_session)
            state.task_manager(tmgr)

            # Likewise for the PilotManager from the closed session.
            with pytest.raises(APIError):
                state.pilot_manager(original_pmgr)
            original_pmgr.close()

            pmgr = rp.PilotManager(session=fresh_session)
            state.pilot_manager(pmgr)

            # The UID will not resolve in the stored PilotManager.
            with pytest.raises(ValueError):
                state.pilot(pilot.uid)

            # The Pilot is detectably invalid.
            with pytest.raises(APIError):
                state.pilot(pilot)

            # Even here, the old Pilot may still be in 'PMGR_ACTIVE_PENDING'
            if pilot.state not in rp.FINAL:
                pilot.cancel()
            tmgr.close()
            pmgr.close()

        assert fresh_session.closed
def __init__(self, descr: dict, executor: jpsi.JobExecutor, url: str) -> None:
    """Initialize the adaptor and bring up an RP session with a dummy local pilot.

    Args:
        descr: adaptor description, forwarded unchanged to
            ``jpsi.ExecutorAdaptorBase.__init__``.
        executor: the executor this adaptor serves, forwarded unchanged to
            the base class.
        url: executor URL; only the ``rp`` schema is accepted.

    Raises:
        ValueError: if *url* does not use the ``rp`` schema.
        Exception: any error raised while creating the session, the managers
            or the pilot is logged and re-raised after the session (if one
            was created) has been closed.
    """
    jpsi.ExecutorAdaptorBase.__init__(self, descr, executor, url)

    self._url = ru.Url(url)
    if self._url.schema != 'rp':
        # Interpolate explicitly: exception constructors do not apply
        # logging-style lazy formatting to their arguments.
        raise ValueError('handle only rp:// URLs, not %s' % (self._url,))

    # Pre-set so that the failure path can tell whether a session exists.
    self._session = None

    try:
        self._jobs = dict()  # {job.uid : [JPSI_JOB, RP_TASK]}
        self._lock = mt.Lock()

        self._session = rp.Session()
        self._pmgr = rp.PilotManager(session=self._session)
        self._tmgr = rp.TaskManager(session=self._session)

        self._pmgr.register_callback(self._pilot_state_cb)
        self._tmgr.register_callback(self._task_state_cb)

        # this is layer 0, so we just create a dummy pilot
        pd = rp.PilotDescription({
            'resource': 'local.localhost',
            'cores': 16,
            'runtime': 60
        })
        self._pilot = self._pmgr.submit_pilots(pd)
        self._tmgr.add_pilots(self._pilot)

    except Exception:
        self._log.exception('init failed')
        # The caller never receives this half-initialized object, so nobody
        # else can release the session: close it here, without masking the
        # original error.
        if self._session is not None:
            try:
                self._session.close()
            except Exception:
                self._log.exception('session cleanup after failed init failed')
        raise
def test_runtime_bad_uid(pilot_description):
    """Expect Runtime to raise ValueError for UID strings that do not resolve.

    The string 'spam' is offered as a TaskManager UID and as a PilotManager
    UID, both before and after real managers have been registered.
    """
    noisy_modules = (
        'radical.pilot.task_manager',
        'radical.pilot.db.database',
        'radical.pilot.session',
    )
    with warnings.catch_warnings():
        for module_name in noisy_modules:
            warnings.filterwarnings('ignore',
                                    category=DeprecationWarning,
                                    module=module_name)

        session = rp.Session()
        with session:
            state = Runtime(session=session)

            # Unknown TaskManager UID, then register a real TaskManager.
            with pytest.raises(ValueError):
                state.task_manager('spam')
            task_manager = rp.TaskManager(session=session)
            state.task_manager(task_manager)

            # Unknown PilotManager UID, then register a real PilotManager.
            with pytest.raises(ValueError):
                state.pilot_manager('spam')
            pilot_manager = rp.PilotManager(session=session)
            state.pilot_manager(pilot_manager)

            # The bad PilotManager UID is still rejected once a manager is stored.
            # NOTE(review): this repeats the previous check; possibly
            # `state.pilot('spam')` was intended -- confirm with the author.
            with pytest.raises(ValueError):
                state.pilot_manager('spam')

            task_manager.close()
            pilot_manager.close()

        assert session.closed
def _new_taskmanager(session: rp.Session, pilot: rp.Pilot):
    """Create a TaskManager in *session*, attach *pilot* to it, and return it.

    DeprecationWarnings attributed to the listed radical.pilot modules are
    suppressed while the manager is created and the pilot is added.
    """
    noisy_modules = (
        'radical.pilot.task_manager',
        'radical.pilot.db.database',
        'radical.pilot.session',
    )
    with warnings.catch_warnings():
        for module_name in noisy_modules:
            warnings.filterwarnings('ignore',
                                    category=DeprecationWarning,
                                    module=module_name)
        task_manager = rp.TaskManager(session=session)
        task_manager.add_pilots(pilot)
    return task_manager
def test_rp_raptor_staging(pilot_description, rp_venv):
    """Test file staging for raptor Master and Worker tasks.

    - upon pilot startup, transfer a file to the pilot sandbox
    - upon master startup, create a link to that file for each master
    - for each task, copy the file into the task sandbox
    - upon task completion, transfer the files to the client (and rename them)

    The test is skipped when *rp_venv* is None. The skip is decided before any
    RP Session is created, so a skipped test does not leave a Session open.
    """
    import time
    import radical.pilot as rp

    # Hopefully, this requirement is temporary.
    # Check before creating the Session: pytest.skip() raises, and a Session
    # created earlier would never reach the `finally` that closes it.
    if rp_venv is None:
        pytest.skip('This test requires a user-provided static RP venv.')

    if rp_venv:
        pre_exec = ['. {}/bin/activate'.format(rp_venv)]
    else:
        pre_exec = None

    # Note: we need to install the current scalems package to test remotely.
    # If this is problematic, we can add a check like the following.
    #     if pilot_description.resource != 'local.localhost' \
    #             and pilot_description.access_schema \
    #             and pilot_description.access_schema != 'local':
    #         pytest.skip('This test is only for local execution.')

    # Note: radical.pilot.Session creation causes several deprecation warnings.
    # Ref https://github.com/radical-cybertools/radical.pilot/issues/2185
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=DeprecationWarning)
        session = rp.Session()

    try:
        fname = '%d.dat' % os.getpid()
        fpath = os.path.join('/tmp', fname)
        data: str = time.asctime()

        pmgr = rp.PilotManager(session=session)
        tmgr = rp.TaskManager(session=session)

        # Illustrate data staging as part of the Pilot launch.
        # By default, file is copied to the root of the Pilot sandbox,
        # where it can be referenced as 'pilot:///filename'
        # Alternatively: pilot.stage_in() and pilot.stage_output() (blocking calls)
        pilot_description.exit_on_error = True
        pilot_description.input_staging = [fpath]

        with open(fpath, 'w') as fh:
            fh.writelines([data])
        try:
            pilot = pmgr.submit_pilots(pilot_description)
            # Confirmation that the input file has been staged by waiting for pilot state.
            pilot.wait(state=[rp.states.PMGR_ACTIVE] + rp.FINAL)
        finally:
            # The local copy is removed whether or not the pilot came up.
            os.unlink(fpath)

        tmgr.add_pilots(pilot)

        uid = 'scalems.master.001'
        # Illustrate another mode of data staging with the Master task submission.
        td = rp.TaskDescription({
            'uid': uid,
            'executable': 'scalems_rp_master',
            'input_staging': [{
                'source': 'pilot:///%s' % fname,
                'target': 'pilot:///%s.%s.lnk' % (fname, uid),
                'action': rp.LINK
            }],
            'pre_exec': pre_exec
            # 'named_env': 'scalems_env'
        })

        master = tmgr.submit_tasks(td)

        # Illustrate availability of scheduler and of data staged with Master task.
        # When the task enters AGENT_SCHEDULING_PENDING it has passed all input staging,
        # and the files will be available.
        # (see https://docs.google.com/drawings/d/1q5ehxIVdln5tXEn34mJyWAmxBk_DqZ5wwkl3En-t5jo/)

        # Confirm that Master script is running (and ready to receive raptor tasks)
        # WARNING: rp.Task.wait() *state* parameter does not handle tuples, but does not check type.
        master.wait(state=[rp.states.AGENT_EXECUTING] + rp.FINAL)
        assert master.state not in {rp.CANCELED, rp.FAILED}

        # define raptor tasks and submit them to the master
        tds = list()
        # Illustrate data staging as part of raptor task submission.
        # Note that tasks submitted by the client get a sandboxed task directory,
        # whereas those submitted by the Master (through Master.request(),
        # through the wrapper script or the Master.create_initial_tasks() hook) do not,
        # and do not have a data staging phase.
        for i in range(3):
            uid = 'scalems.%06d' % i
            work = {
                'mode': 'call',
                'cores': 1,
                'timeout': 10,  # seconds
                'data': {
                    'method': 'hello',
                    'kwargs': {
                        'world': uid
                    }
                }
            }
            tds.append(
                rp.TaskDescription({
                    'uid': uid,
                    'executable': '-',
                    'input_staging': [{
                        'source': 'pilot:///%s.%s.lnk' % (fname, master.uid),
                        'target': 'task:///%s' % fname,
                        'action': rp.COPY
                    }],
                    'output_staging': [{
                        'source': 'task:///%s' % fname,
                        'target': 'client:///%s.%s.out' % (fname, uid),
                        'action': rp.TRANSFER
                    }],
                    'scheduler': master.uid,
                    'arguments': [json.dumps(work)],
                    'pre_exec': pre_exec
                }))
        # TODO: Organize client-side data with managed hierarchical paths.
        # Question: RP maintains a filesystem hierarchy on the client side, correct?
        # Answer: only for profiling and such: do not use for data or user-facing stuff.
        tasks = tmgr.submit_tasks(tds)
        # TODO: Clarify the points at which the data exists or is accessed.
        # * When the (client-submitted) task enters AGENT_STAGING_OUTPUT_PENDING,
        #   it has finished executing and output data should be accessible as 'task:///outfile'.
        # * When the (client-submitted) task reaches one of the rp.FINAL stages, it has finished
        #   output staging and files are accessible at the location specified in 'output_staging'.
        # * Tasks submitted directly by the Master (example?) do not perform output staging;
        #   data is written before entering Master.result_cb().
        # RP Issue: client-submitted Tasks need to be accessible through a path that is common
        # with the Master-submitted (`request()`) tasks. (SCALE-MS #108)

        assert len(tasks) == len(tds)
        # 'arguments' (element 0) gets wrapped in a Request at the Master by _receive_tasks,
        # then the list of requests is passed to Master.request(), which is presumably
        # an extension point for derived Master implementations. The base class method
        # converts requests to dictionaries and adds them to a request queue, from which they are
        # picked up by the Worker in _request_cb. Then picked up in forked interpreter
        # by Worker._dispatch, which checks the *mode* of the Request and dispatches
        # according to native or registered mode implementations. (e.g. 'call' (native) or 'scalems')

        # task process is launched with Python multiprocessing (native) module and added to self._pool.
        # When the task runs, its result triggers _result_cb

        # wait for *those* tasks to complete and report results
        tmgr.wait_tasks(uids=[t.uid for t in tasks])

        # Cancel the master.
        tmgr.cancel_tasks(uids=master.uid)
        # Cancel blocks until the task is done so the following wait is currently redundant,
        # but there is a ticket open to change this behavior.
        # See https://github.com/radical-cybertools/radical.pilot/issues/2336
        tmgr.wait_tasks()

        # Note that these map as follows:
        #     * 'client:///' == $PWD
        #     * 'task:///' == urllib.parse.urlparse(task.sandbox).path
        #     * 'pilot:///' == urllib.parse.urlparse(pilot.pilot_sandbox).path

        for t in tasks:
            print(t)
            outfile = './%s.%s.out' % (fname, t.uid)
            assert os.path.exists(outfile)
            with open(outfile, 'r') as outfh:
                assert outfh.readline().rstrip() == data
            os.unlink(outfile)

        pilot.cancel()
        tmgr.close()
        pmgr.close()

    finally:
        session.close(download=False)
def _connect_rp(config: Configuration) -> Runtime:
    """Establish the RP Session and acquire as many re-usable resources as possible.

    Creates an ``rp.Session``, a PilotManager and a TaskManager, submits one
    Pilot described by *config*, attaches the Pilot to the TaskManager, and
    runs a small task that reports the ``radical.pilot`` version found in the
    execution environment.

    Once this function succeeds, the caller is responsible for releasing the
    resources held by the returned Runtime. The original design notes pair
    this call with a ``_disconnect_rp()`` step and recommend driving both
    through the owning object's async context manager, i.e.::

        async with instance:
            ...

    If this function fails after the Session has been created, the Session is
    closed here before the exception propagates, because the caller never
    receives a handle with which to close it.

    Args:
        config: run-time configuration. ``config.rp_resource_params`` may
            provide a ``'PilotDescription'`` mapping, ``config.execution_target``
            is used as the Pilot ``'resource'``, and *config* is passed to
            ``get_pre_exec()``.

    Returns:
        The Runtime holding the session, pilot manager, task manager and pilot.

    Raises:
        DispatchError: if task dispatching could not be set up.
        asyncio.CancelledError: propagated unchanged if raised while executing.
    """
    # TODO: Consider inlining this into __aenter__().
    # A non-async method is potentially useful for debugging, but causes the event loop
    # to block while waiting for the RP tasks included here. If this continues to be a
    # slow function, we can wrap the remaining RP calls and let this function be
    # inlined, or stick the whole function in a separate thread with
    # loop.run_in_executor().

    # TODO: RP triggers SIGINT in various failure modes.
    #  We should use loop.add_signal_handler() to convert to an exception
    #  that we can raise in an appropriate task.
    # Note that PilotDescription can use `'exit_on_error': False` to suppress the SIGINT,
    # but we have not explored the consequences of doing so.

    # Bookkeeping for the failure path in the `finally` clause below.
    session = None
    connected = False

    try:
        #
        # Start the Session.
        #

        # Note that we cannot resolve the full _resource config until we have a Session
        # object.
        # We cannot get the default session config until after creating the Session,
        # so we don't have a template for allowed, required, or default values.
        # Question: does the provided *cfg* need to be complete? Or will it be merged
        # with default values from some internal definition, such as by dict.update()?
        # I don't remember what the use cases are for overriding the default session
        # config.
        session_config = None
        # At some point soon, we need to track Session ID for the workflow metadata.
        # We may also want Session ID to be deterministic (or to be re-used?).
        session_id = None

        # Note: the current implementation implies that only one Task for the dispatcher
        # will exist at a time. We are further assuming that there will probably only
        # be one Task per the lifetime of the dispatcher object.
        # We could choose another approach and change our assumptions, if appropriate.
        logger.debug(
            'Entering RP dispatching context. Waiting for rp.Session.')

        # Note: radical.pilot.Session creation causes several deprecation warnings.
        # Ref https://github.com/radical-cybertools/radical.pilot/issues/2185
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=DeprecationWarning)
            # This would be a good time to `await`, if an event-loop friendly
            # Session creation function becomes available.
            session = rp.Session(uid=session_id, cfg=session_config)
            runtime = Runtime(session=session)
        session_id = runtime.session.uid
        # Do we want to log this somewhere?
        # session_config = copy.deepcopy(self.session.cfg.as_dict())
        logger.debug('RP dispatcher acquired session %s', session_id)

        # We can launch an initial Pilot, but we may have to run further Pilots
        # during self._queue_runner_task (or while servicing scalems.wait() within the
        # with block) to handle dynamic work load requirements.
        # Optionally, we could refrain from launching the pilot here, at all,
        # but it seems like a good chance to start bootstrapping the agent environment.
        logger.debug('Launching PilotManager.')
        pilot_manager = rp.PilotManager(session=runtime.session)
        logger.debug('Got PilotManager %s.', pilot_manager.uid)
        runtime.pilot_manager(pilot_manager)

        logger.debug('Launching TaskManager.')
        task_manager = rp.TaskManager(session=runtime.session)
        logger.debug('Got TaskManager %s', task_manager.uid)
        runtime.task_manager(task_manager)

        #
        # Get a Pilot
        #

        # # TODO: #94 Describe (link to) configuration points.
        # resource_config['local.localhost'].update({
        #     'project': None,
        #     'queue': None,
        #     'schema': None,
        #     'cores': 1,
        #     'gpus': 0
        # })

        # _pilot_description = dict(_resource=_resource,
        #                          runtime=30,
        #                          exit_on_error=True,
        #                          project=resource_config[_resource]['project'],
        #                          queue=resource_config[_resource]['queue'],
        #                          cores=resource_config[_resource]['cores'],
        #                          gpus=resource_config[_resource]['gpus'])

        # TODO: How to specify PilotDescription? (see also #121)
        # Where should this actually be coming from?
        # We need to inspect both the HPC allocation and the work load, I think,
        # and combine with user-provided preferences.
        pilot_description = {}
        pilot_description.update(
            config.rp_resource_params.get('PilotDescription', {}))
        # The execution target always wins over a user-supplied 'resource'.
        pilot_description.update({'resource': config.execution_target})

        # TODO: Pilot venv (#90, #94).
        # Currently, Pilot venv must be specified in the JSON file for resource
        # definitions.
        pilot_description = rp.PilotDescription(pilot_description)

        # How and when should we update pilot description?
        logger.debug('Submitting PilotDescription %r', pilot_description)
        pilot = pilot_manager.submit_pilots(pilot_description)
        logger.debug('Got Pilot %s', pilot.uid)
        runtime.pilot(pilot)

        # Note that the task description for the master (and worker) can specify a
        # *named_env* attribute to use a venv prepared via Pilot.prepare_env
        # E.g.         pilot.prepare_env({'numpy_env' : {'type'   : 'virtualenv',
        #                                           'version': '3.6',
        #                                           'setup'  : ['numpy']}})
        #   td.named_env = 'numpy_env'
        # Note that td.named_env MUST be a key that is given to pilot.prepare_env(arg:
        # dict) or the task will wait indefinitely to be scheduled.
        # Alternatively, we could use a pre-installed venv by putting
        # `. path/to/ve/bin/activate`
        # in the TaskDescription.pre_exec list.

        # TODO: Use archives generated from (acquired through) the local installations.
        # # Could we stage in archive distributions directly?
        # # self.pilot.stage_in()
        # pilot.prepare_env(
        #     {
        #         'scalems_env': {
        #             'type': 'virtualenv',
        #             'version': '3.8',
        #             'setup': [
        #                 # TODO: Generalize scalems dependency resolution.
        #                 # Ideally, we would check the current API version
        #                 # requirement, map that to a package version,
        #                 # and specify >=min_version, allowing cached archives to
        #                 # satisfy the dependency.
        #                 rp_spec,
        #                 scalems_spec
        #             ]}})

        # Question: when should we remove the pilot from the task manager?
        task_manager.add_pilots(pilot)
        logger.debug('Added Pilot %s to task manager %s.',
                     pilot.uid, task_manager.uid)

        pre_exec = get_pre_exec(config)
        assert isinstance(pre_exec, tuple)
        assert len(pre_exec) > 0
        # Verify usable SCALEMS RP connector.
        # TODO: Fetch a profile of the venv for client-side analysis (e.g. `pip freeze`).
        # TODO: Check for compatible installed scalems API version.
        rp_check = task_manager.submit_tasks(
            rp.TaskDescription({
                # 'executable': py_venv,
                'executable': 'python3',
                'arguments': ['-c', 'import radical.pilot as rp; print(rp.version)'],
                'pre_exec': list(pre_exec)
                # 'named_env': 'scalems_env'
            }))
        logger.debug('Checking RP execution environment.')
        states = task_manager.wait_tasks(uids=[rp_check.uid])
        if states[0] != rp.states.DONE or rp_check.exit_code != 0:
            raise DispatchError(
                'Could not verify RP in execution environment.')

        try:
            remote_rp_version = packaging.version.parse(
                rp_check.stdout.rstrip())
        except Exception as e:
            raise DispatchError(
                'Could not determine remote RP version.') from e
        # TODO: #100 Improve compatibility checking.
        if remote_rp_version < packaging.version.parse('1.6.0'):
            raise DispatchError(
                f'Incompatible radical.pilot version in execution '
                f'environment: {str(remote_rp_version)}')

        #
        # Get a scheduler task.
        #

        assert runtime.scheduler is None
        # TODO: #119 Re-enable raptor.
        # runtime.scheduler = _get_scheduler(
        #     'raptor.scalems',
        #     pre_exec=execution_manager._pre_exec,
        #     task_manager=task_manager)
        # Note that we can derive scheduler_name from self.scheduler.uid in later methods.
        # Note: The worker script name only appears in the config file.
        # logger.info('RP scheduler ready.')
        # logger.debug(repr(execution_manager.scheduler))

        connected = True
        return runtime

    except asyncio.CancelledError:
        # Bare `raise` keeps the original traceback intact.
        raise
    except DispatchError:
        # Already carries a specific diagnosis; log it but do not bury it
        # under the generic wrapper below.
        logger.exception('Exception while connecting RADICAL Pilot.')
        raise
    except Exception as e:
        logger.exception('Exception while connecting RADICAL Pilot.', exc_info=e)
        raise DispatchError('Failed to launch SCALE-MS master task.') from e
    finally:
        # On any failure the caller never receives the Runtime, so this is the
        # only place where the Session can still be released.
        if not connected and session is not None:
            try:
                session.close()
            except Exception:
                logger.exception(
                    'Could not close rp.Session after failed connection attempt.')