def __init__(self, cfg, session):
    '''
    Bootstrap agent.0: cache config/session state, connect to the DB,
    configure the resource manager, bring up communication bridges,
    components and sub-agents, and start heartbeat monitoring of the
    pilot manager.

    Arguments:
        cfg     : pilot configuration; this method reads `cfg.pid`,
                  `cfg.pmgr` and `cfg.pilot_sandbox`
        session : owning session; provides the logger (`session._log`)

    NOTE(review): the ordering of the calls below is significant
    (RM config before components, Worker init after sub-agents) --
    do not reorder.
    '''
    self._cfg = cfg
    self._pid = cfg.pid
    self._pmgr = cfg.pmgr
    self._pwd = cfg.pilot_sandbox
    self._session = session
    self._log = session._log                # reuse the session's logger
    self._starttime = time.time()           # agent start, for runtime accounting
    self._final_cause = None                # set on termination (cause of shutdown)

    # this is the earliest point to sync bootstrap and agent profiles
    prof = ru.Profiler(ns='radical.pilot', name='agent.0')
    prof.prof('sync_rel', uid=cfg.pid, msg='agent.0')
    prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())

    # connect to MongoDB for state push/pull
    self._connect_db()

    # configure ResourceManager before component startup, as components need
    # ResourceManager information for function (scheduler, executor)
    self._configure_rm()

    # ensure that app communication channels are visible to workload
    self._configure_app_comm()

    # expose heartbeat channel to sub-agents, bridges and components,
    # and start those
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cfg.heartbeat = self._cmgr.cfg.heartbeat
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # create the sub-agent configs and start the sub agents
    self._write_sa_configs()
    self._start_sub_agents()   # TODO: move to cmgr?

    # at this point the session is up and connected, and it should have
    # brought up all communication bridges and components.  We are
    # ready to rumble!
    rpu.Worker.__init__(self, self._cfg, session)

    # run our own slow-paced heartbeat monitor to watch pgr heartbeats
    self._hb = ru.Heartbeat(uid=self._pid,
                            timeout=10.0,             # FIXME: configurable
                            interval=1.0,             # FIXME: configurable
                            beat_cb=self._hb_check,   # no own heartbeat (pmgr pulls)
                            term_cb=self._hb_term_cb,
                            log=self._log)
    self._hb.start()

    # register pmgr heartbeat
    self._log.info('hb init for %s', self._pmgr)
    self._hb.beat(uid=self._pmgr)
def initialize_parent(self):
    '''
    Finish parent-agent setup: spawn the sub-agents, hook up the DB
    polling callbacks and the staging output channel, and advance the
    pilot to PMGR_ACTIVE.
    '''

    # bring up the sub-agents: write their configs, then launch them
    self._write_sa_configs()
    self._start_sub_agents()

    # poll the DB periodically for incoming commands
    poll_timer = self._cfg['db_poll_sleeptime']
    self.register_timed_cb(self._agent_command_cb, timer=poll_timer)

    # units we accept are pushed toward input staging
    self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                         rpc.AGENT_STAGING_INPUT_QUEUE)

    # sub-agents, components and bridges are all up -- announce the
    # pilot as active, including launch-method details for the client
    lm_info = self._lrms.lm_info
    details = {'lm_info':   lm_info.get('version_info'),
               'lm_detail': lm_info.get('lm_detail')}
    pilot_doc = {'type':             'pilot',
                 'uid':              self._pid,
                 'state':            rps.PMGR_ACTIVE,
                 'resource_details': details,
                 '$set':             ['resource_details']}
    self.advance(pilot_doc, publish=True, push=False)

    # the only recurring duty left: pull the DB for new units
    self.register_timed_cb(self._check_units_cb, timer=poll_timer)

    # record hostname in profile to enable mapping of profile entries
    self._prof.prof(event='hostname', uid=self._pid, msg=ru.get_hostname())
def initialize_parent(self):
    '''
    Finish parent-agent setup: start sub-agents, register the DB
    command and unit-pull callbacks, wire the staging input queue,
    and advance the pilot to PMGR_ACTIVE with resource details.

    NOTE(review): SOURCE contains two definitions of
    `initialize_parent`; this one additionally publishes
    `rm_info` (`self._lrms.lrms_info`) -- confirm which version
    is the current one.
    '''
    # create the sub-agent configs
    self._write_sa_configs()

    # and start the sub agents
    self._start_sub_agents()

    # register the command callback which pulls the DB for commands
    self.register_timed_cb(self._agent_command_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # registers the staging_input_queue as this is what we want to push
    # units to
    self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                         rpc.AGENT_STAGING_INPUT_QUEUE)

    # sub-agents are started, components are started, bridges are up: we are
    # ready to roll!  Advertise the pilot as active, including launch
    # method and resource manager details for the client side.
    pilot = {'type'             : 'pilot',
             'uid'              : self._pid,
             'state'            : rps.PMGR_ACTIVE,
             'resource_details' : {
                 'lm_info'   : self._lrms.lm_info.get('version_info'),
                 'lm_detail' : self._lrms.lm_info.get('lm_detail'),
                 'rm_info'   : self._lrms.lrms_info},
             '$set'             : ['resource_details']}
    self.advance(pilot, publish=True, push=False)

    # register idle callback to pull for units -- which is the only action
    # we have to perform, really
    self.register_timed_cb(self._check_units_cb,
                           timer=self._cfg['db_poll_sleeptime'])

    # record hostname in profile to enable mapping of profile entries
    self._prof.prof(event='hostname', uid=self._pid, msg=ru.get_hostname())
def rm_config_hook(cls, name, cfg, rm, logger, profiler):
    '''
    Start a `flux` instance for this pilot and return its connection
    info.

    The flux broker is started via `/bin/bash -c`, its environment
    (notably FLUX_URI) is scraped from the child's stdout, and the URI
    is rewritten to be ssh-reachable from other nodes.  A daemon
    watcher thread pings the instance and logs when it disappears.

    Arguments:
        cls, name, cfg, rm : unused here, part of the hook signature
        logger             : logger for progress / error messages
        profiler           : profiler to mark flux startup events

    Returns:
        dict with `flux_env` (environment to talk to the instance)
        and `flux_pid` (pid of the started shell).

    Raises:
        Exception if the `flux` executable or python module is
        missing, or if the instance fails to come up.
    '''
    profiler.prof('flux_start')

    flux = ru.which('flux')
    if not flux:
        raise Exception("Couldn't find flux")

    # the flux python bindings must be importable as well
    try:
        import flux                                     # noqa: F401
    except ImportError as e:
        # chain the original error instead of swallowing it
        raise Exception("Couldn't import flux") from e

    # start the broker and have it print its environment; the trailing
    # `while true` keeps the shell (and thus the broker) alive
    check = 'flux env; echo "OK"; while true; do echo "ok"; sleep 1; done'
    start = 'flux start -o,-v,-S,log-filename=out'
    cmd = '/bin/bash -c "echo \\\"%s\\\" | %s"' % (check, start)
    proc = sp.Popen(cmd, shell=True,
                    stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.STDOUT)

    # scrape `export KEY=VAL` lines from the child until it reports OK
    flux_env = dict()
    while True:
        raw = proc.stdout.readline()
        if not raw:
            # EOF before 'OK': the broker died during startup -- fail
            # instead of spinning forever on empty reads
            raise Exception('flux startup failed (EOF before OK)')
        line = ru.as_string(raw.strip())
        logger.debug('=== %s', line)
        if line.startswith('export '):
            k, v = line.split(' ', 1)[1].strip().split('=', 1)
            flux_env[k] = v.strip('"')
            logger.debug('%s = %s', k, v.strip('"'))
        elif line == 'OK':
            break

    # explicit check (not `assert`, which is stripped under `-O`)
    if 'FLUX_URI' not in flux_env:
        raise Exception('no FLUX_URI in flux environment')

    # rewrite the URI so that remote nodes can reach the instance
    # TODO: check perf implications of tunneling via ssh
    flux_url = ru.Url(flux_env['FLUX_URI'])
    flux_url.host = ru.get_hostname()
    flux_url.scheme = 'ssh'
    flux_env['FLUX_URI'] = str(flux_url)

    profiler.prof('flux_started')

    # ----------------------------------------------------------------------
    def _watch_flux(flux_env):
        # ping the instance until it stops responding
        logger.info('=== starting flux watcher')

        for k, v in flux_env.items():
            os.environ[k] = v

        ret = None
        while not ret:
            out, err, ret = ru.sh_callout('flux ping -c 1 all')
            logger.debug('=== flux watcher out: %s', out)
            if ret:
                logger.error('=== flux watcher err: %s', err)
                break
            time.sleep(0.1)

        logger.info('flux stopped?')
        # FIXME: trigger termination
    # ----------------------------------------------------------------------

    flux_watcher = mt.Thread(target=_watch_flux, args=[flux_env])
    flux_watcher.daemon = True
    flux_watcher.start()

    logger.info("flux startup successful: [%s]", flux_env['FLUX_URI'])

    lm_info = {'flux_env': flux_env,
               'flux_pid': proc.pid}
    return lm_info