def initialize_child(self):
    """
    Set up this executing component in the child process.

    Wires the component's input/output queues and pubsub channels, starts
    the watcher thread, instantiates the task and MPI launch methods, and
    prepares the environment dict which gets exported into each CU.
    """

    self._pwd = os.getcwd()

    self.register_input(rps.AGENT_EXECUTING_PENDING,
                        rpc.AGENT_EXECUTING_QUEUE, self.work)

    self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                         rpc.AGENT_STAGING_OUTPUT_QUEUE)

    self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
    self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

    # cancellation requests are collected under a lock, as they arrive on
    # a different (subscriber) thread than the watcher which acts on them
    self._cancel_lock   = threading.RLock()
    self._cus_to_cancel = list()
    self._cus_to_watch  = list()
    self._watch_queue   = Queue.Queue()

    self._pilot_id = self._cfg['pilot_id']

    # run watcher thread
    self._watcher = ru.Thread(target=self._watch, name="Watcher")
    self._watcher.start()

    # The AgentExecutingComponent needs the LaunchMethods to construct
    # commands.
    self._task_launcher = rp.agent.LM.create(
        name=self._cfg.get('task_launch_method'),
        cfg=self._cfg,
        session=self._session)

    self._mpi_launcher = rp.agent.LM.create(
        name=self._cfg.get('mpi_launch_method'),
        cfg=self._cfg,
        session=self._session)

    self._cu_environment = self._populate_cu_environment()

    self.gtod   = "%s/gtod" % self._pwd
    self.tmpdir = tempfile.gettempdir()

    # if we need to transplant any original env into the CU, we dig the
    # respective keys from the dump made by bootstrap_1.sh
    self._env_cu_export = dict()
    if self._cfg.get('export_to_cu'):
        with open('env.orig', 'r') as f:
            # iterate the file object directly instead of materializing
            # all lines via readlines()
            for line in f:
                if '=' in line:
                    k, v = line.split('=', 1)
                    key  = k.strip()
                    val  = v.strip()
                    if key in self._cfg['export_to_cu']:
                        self._env_cu_export[key] = val
def lrms_config_hook(cls, name, cfg, lrms, logger, profiler):
    """
    Locate `orte-dvm`, start a DVM across the LRMS node list, wait for it
    to report readiness, start a watcher thread for it, and return an
    `lm_info` dict carrying the DVM URI and ORTE version info.

    FIXME: this config hook will manipulate the LRMS nodelist.  Not a nice
           thing to do, but hey... :P
           What really should be happening is that the LRMS digs information
           on node reservation out of the config and configures the node
           list accordingly.  This config hook should be limited to
           starting the DVM.
    """

    import signal  # needed for SIGKILL in the watcher below

    dvm_command = ru.which('orte-dvm')
    if not dvm_command:
        raise Exception("Couldn't find orte-dvm")

    # Now that we found the orte-dvm, get ORTE version
    out, err, ret = ru.sh_callout('orte-info | grep "Open RTE"', shell=True)
    orte_info = dict()
    for line in out.split('\n'):
        line = line.strip()
        if not line:
            continue
        key, val = line.split(':', 1)
        if 'Open RTE' == key.strip():
            orte_info['version'] = val.strip()
        elif 'Open RTE repo revision' == key.strip():
            orte_info['version_detail'] = val.strip()

    assert (orte_info.get('version'))
    logger.info("Found Open RTE: %s / %s",
                orte_info['version'], orte_info.get('version_detail'))

    # Use (g)stdbuf to disable buffering.
    # We need this to get the "DVM ready",
    # without waiting for orte-dvm to complete.
    # The command seems to be generally available on our Cray's,
    # if not, we can code some home-coooked pty stuff.
    stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
    if not stdbuf_cmd:
        raise Exception("Couldn't find (g)stdbuf")
    stdbuf_arg = "-oL"

    # Base command = (g)stdbuf <args> + orte-dvm + debug_args
    dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

    # Additional (debug) arguments to orte-dvm
    if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'):
        debug_strings = ['--debug-devel',
                         '--mca odls_base_verbose 100',
                         '--mca rml_base_verbose 100']
    else:
        debug_strings = []

    # Split up the debug strings into args and add them to the dvm_args
    # (plain loop: a list comprehension used only for its side effect is
    # an anti-pattern)
    for ds in debug_strings:
        dvm_args.extend(ds.split())

    vm_size = len(lrms.node_list)
    logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args))
    profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id'])

    dvm_uri     = None
    dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT)

    # scan the DVM output for the URI and the readiness marker
    while True:

        line = dvm_process.stdout.readline().strip()

        if line.startswith('VMURI:'):

            if len(line.split(' ')) != 2:
                raise Exception("Unknown VMURI format: %s" % line)

            label, dvm_uri = line.split(' ', 1)

            if label != 'VMURI:':
                raise Exception("Unknown VMURI format: %s" % line)

            logger.info("ORTE DVM URI: %s" % dvm_uri)

        elif line == 'DVM ready':

            if not dvm_uri:
                raise Exception("VMURI not found!")

            logger.info("ORTE DVM startup successful!")
            profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id'])
            break

        else:

            # Check if the process is still around,
            # and log output in debug mode.
            if dvm_process.poll() is None:
                logger.debug("ORTE: %s", line)
            else:
                # Process is gone: fatal!
                # NOTE: the profile event used to sit *after* the raise
                #       and was thus unreachable dead code -- emit it
                #       before raising.
                profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
                raise Exception("ORTE DVM process disappeared")

    # ----------------------------------------------------------------------
    def _watch_dvm():

        logger.info('starting DVM watcher')

        # NOTE: the original loop polled the process exactly once before
        #       entering the loop and never again, so it could never
        #       terminate -- re-poll on every iteration.
        retval = dvm_process.poll()
        while retval is None:
            line = dvm_process.stdout.readline().strip()
            if line:
                logger.debug('dvm output: %s', line)
            else:
                time.sleep(1.0)
            retval = dvm_process.poll()

        if retval != 0:
            # send a kill signal to the main thread.
            # We know that Python and threading are likely not to play well
            # with signals - but this is an exceptional case, and not part
            # of the standard termination sequence.  If the signal is
            # swallowed, the next `orte-submit` call will trigger
            # termination anyway.
            # NOTE: os.kill() requires a signal number -- the original
            #       one-argument call raised a TypeError.
            os.kill(os.getpid(), signal.SIGKILL)

        logger.info('DVM stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
    dvm_watcher.start()

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': {name: orte_info}}

    # we need to inform the actual LM instance about the DVM URI.  So we
    # pass it back to the LRMS which will keep it in an 'lm_info', which
    # will then be passed as part of the slots via the scheduler
    return lm_info
def __init__ (self, _adaptor, _method_type, _method_context, _ttype) :
    """
    This saga.Task constructor is private.

    ``_adaptor`` references the adaptor class instance from which this
    task was created via an asynchronous function.  Note that the API
    level object instance can be inferred via ``_adaptor.get_api ()``.
    Further, the adaptor will reference an _adaptor._container class,
    which will be considered the target for bulk operations for this
    task.

    ``_method_type`` specifies the SAGA API method which task is
    representing.  For example, for the following code::

      d = saga.filesystem.Directory ("file:///")
      t = d.copy ('/etc/passwd', '/tmp/passwd.bak', saga.task.ASYNC)

    The resulting task ``t`` would represent the *'copy'* method.  This
    is required to forward :class:`saga.task.Container` calls to the
    correct bulk method, in this case ``container_copy()``.

    ``_method_context`` describes the context in which the task method
    is running.  It is up to the creator of the task to provide that
    context -- in general, it will at least include method parameters.

    ``ttype`` determines in what state the constructor will leave the
    task: ``DONE`` for ``ttype=SYNC``, ``RUNNING`` for ``ttype=ASYNC``
    and ``NEW`` for ``ttype=TASK``.

    If the ``_method_context`` has *exactly* two elements, named
    ``_call`` and ``_args``, then the created task will wrap
    a :class:`ru.Thread` with that ``_call (_args)``.
    """

    self._base = super (Task, self)
    self._base.__init__ ()

    self._thread         = None
    self._ttype          = _ttype
    self._adaptor        = _adaptor
    self._method_type    = _method_type
    self._method_context = _method_context

    # set attribute interface properties
    self._attributes_extensible    (False)
    self._attributes_allow_private (True)
    self._attributes_camelcasing   (True)

    # register properties with the attribute interface
    self._attributes_register   (RESULT, None, satt.ANY, satt.SCALAR, satt.READONLY)
    self._attributes_set_getter (RESULT, self.get_result)
    self._attributes_set_setter (RESULT, self._set_result)

    self._attributes_register   (EXCEPTION, None, satt.ANY, satt.SCALAR, satt.READONLY)
    self._attributes_set_getter (EXCEPTION, self.get_exception)
    self._attributes_set_setter (EXCEPTION, self._set_exception)

    self._attributes_register   (STATE, UNKNOWN, satt.ENUM, satt.SCALAR, satt.READONLY)
    self._attributes_set_enums  (STATE, [UNKNOWN, NEW, RUNNING,
                                         DONE, FAILED, CANCELED])
    self._attributes_set_getter (STATE, self.get_state)
    self._attributes_set_setter (STATE, self._set_state)

    self._set_state (NEW)

    # check if this task is supposed to wrap a callable in a thread
    if '_call' in self._method_context :

        # fill in default args/kwargs before validating the context size
        if '_args' not in self._method_context :
            self._method_context['_args'] = ()

        if '_kwargs' not in self._method_context :
            self._method_context['_kwargs'] = {}

        if len (self._method_context) != 3 :
            raise se.BadParameter ("invalid call context for callable task")

        call   = self._method_context['_call']
        args   = self._method_context['_args']
        kwargs = self._method_context['_kwargs']

        # let the callable know which task it is running for
        if '_from_task' not in kwargs :
            kwargs['_from_task'] = self

        self._thread = ru.Thread (call, *args, **kwargs)

    # ensure task goes into the correct state
    if self._ttype == SYNC :
        self.run  ()
        self.wait ()
    elif self._ttype == ASYNC :
        self.run  ()
    elif self._ttype == TASK :
        pass
# printed pid to send a SIGUSR1 like this:
#
#   kill -USR1 <pid>
#
# and the DebugHelper will print stack traces for all threads to stdout.
#
# ------------------------------------------------------------------------------
#
def worker_outer():
    # thin wrapper, so the stack trace shows more than one frame
    worker_inner()


def worker_inner():
    print('worker starts')
    time.sleep(3)
    # count from 0 to 99, printing once per second
    for cnt in range(100):
        print(cnt)
        time.sleep(1)


# ------------------------------------------------------------------------------
#
print(os.getpid())

dh = ru.DebugHelper()

t = ru.Thread(name='worker', target=worker_outer)
t.start()
time.sleep(50)
t.join()