Exemple #1
0
    def __init__(self, cfg, session):
        """
        Bring up the agent: store configuration, connect to the database,
        configure the resource manager, start bridges / components /
        sub-agents, initialize the `rpu.Worker` base class, and start the
        heartbeat monitor.

        cfg:     agent configuration; `pid`, `pmgr` and `pilot_sandbox` are
                 read from it, and its `heartbeat` attribute is overwritten
                 with the component manager's heartbeat settings
        session: session this agent belongs to; its `_log` is used as logger

        NOTE: the steps below depend on each other and must stay in order.
        """

        self._cfg = cfg
        self._pid = cfg.pid
        self._pmgr = cfg.pmgr
        self._pwd = cfg.pilot_sandbox
        self._session = session
        self._log = session._log

        # timestamp of agent construction
        self._starttime = time.time()
        self._final_cause = None

        # this is the earliest point to sync bootstrap and agent profiles
        prof = ru.Profiler(ns='radical.pilot', name='agent.0')
        prof.prof('sync_rel', uid=cfg.pid, msg='agent.0')
        prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())

        # connect to MongoDB for state push/pull
        self._connect_db()

        # configure ResourceManager before component startup, as components need
        # ResourceManager information for function (scheduler, executor)
        self._configure_rm()

        # ensure that app communication channels are visible to workload
        self._configure_app_comm()

        # expose heartbeat channel to sub-agents, bridges and components,
        # and start those
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cfg.heartbeat = self._cmgr.cfg.heartbeat

        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # create the sub-agent configs and start the sub agents
        self._write_sa_configs()
        self._start_sub_agents()  # TODO: move to cmgr?

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and components.  We are
        # ready to rumble!
        #
        # NOTE: the base class is initialized explicitly and late, only after
        #       bridges, components and sub-agents have been started.
        rpu.Worker.__init__(self, self._cfg, session)

        # run our own slow-paced heartbeat monitor to watch pmgr heartbeats
        self._hb = ru.Heartbeat(
            uid=self._pid,
            timeout=10.0,  # FIXME:  configurable
            interval=1.0,  # FIXME:  configurable
            beat_cb=self._hb_check,  # no own heartbeat(pmgr pulls)
            term_cb=self._hb_term_cb,
            log=self._log)
        self._hb.start()

        # register pmgr heartbeat: record an initial beat for the pmgr uid
        self._log.info('hb init for %s', self._pmgr)
        self._hb.beat(uid=self._pmgr)
Exemple #2
0
    def initialize_parent(self):
        """
        Parent-side initialization of the agent.

        Writes the sub-agent configs and starts the sub-agents, registers the
        DB polling callbacks and the staging-input output queue, advances the
        pilot to `PMGR_ACTIVE` (published, not pushed), and records the
        hostname in the profile.
        """

        # sub-agents first: write their configs, then launch them
        self._write_sa_configs()
        self._start_sub_agents()

        # command callback: pulls the DB for commands
        cmd_interval = self._cfg['db_poll_sleeptime']
        self.register_timed_cb(self._agent_command_cb, timer=cmd_interval)

        # units are pushed to the staging_input_queue
        self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                             rpc.AGENT_STAGING_INPUT_QUEUE)

        # sub-agents are started, components are started, bridges are up:
        # announce the pilot as active, along with launch method details
        lm_info = self._lrms.lm_info

        details = dict()
        details['lm_info'] = lm_info.get('version_info')
        details['lm_detail'] = lm_info.get('lm_detail')

        pilot = dict()
        pilot['type'] = 'pilot'
        pilot['uid'] = self._pid
        pilot['state'] = rps.PMGR_ACTIVE
        pilot['resource_details'] = details
        pilot['$set'] = ['resource_details']

        self.advance(pilot, publish=True, push=False)

        # idle callback: pull for units -- the only real action we perform
        unit_interval = self._cfg['db_poll_sleeptime']
        self.register_timed_cb(self._check_units_cb, timer=unit_interval)

        # hostname in the profile enables mapping of profile entries
        hostname = ru.get_hostname()
        self._prof.prof(event='hostname', uid=self._pid, msg=hostname)
    def initialize_parent(self):
        """
        Parent-side initialization of the agent.

        Starts the sub-agents, hooks up the DB polling callbacks and the
        staging-input output, and advances the pilot to `PMGR_ACTIVE`
        (published, not pushed) together with launch method and resource
        manager details.  Finally records the hostname in the profile.
        """

        def _poll_db(callback):
            # both DB polling callbacks use the configured poll sleep time
            self.register_timed_cb(callback,
                                   timer=self._cfg['db_poll_sleeptime'])

        # create the sub-agent configs, then start the sub agents
        self._write_sa_configs()
        self._start_sub_agents()

        # periodically pull the DB for commands
        _poll_db(self._agent_command_cb)

        # the staging_input_queue is where we want to push units to
        self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
                             rpc.AGENT_STAGING_INPUT_QUEUE)

        # everything is up: collect resource details and report the pilot
        # as active
        lrms = self._lrms
        resource_details = {
            'lm_info': lrms.lm_info.get('version_info'),
            'lm_detail': lrms.lm_info.get('lm_detail'),
            'rm_info': lrms.lrms_info,
        }
        update = {
            'type': 'pilot',
            'uid': self._pid,
            'state': rps.PMGR_ACTIVE,
            'resource_details': resource_details,
            '$set': ['resource_details'],
        }
        self.advance(update, publish=True, push=False)

        # periodically pull for units -- the only action we really perform
        _poll_db(self._check_units_cb)

        # record hostname in profile to enable mapping of profile entries
        self._prof.prof(event='hostname', uid=self._pid, msg=ru.get_hostname())
Exemple #4
0
    def rm_config_hook(cls, name, cfg, rm, logger, profiler):
        """
        Start a flux instance and collect the information needed to use it.

        Runs `flux start` through a shell pipeline, parses the
        `export KEY=VALUE` lines printed by `flux env` into an environment
        dict, rewrites `FLUX_URI` into an `ssh` URL pointing to this host,
        and starts a daemon thread which keeps pinging the flux instance.

        name, cfg, rm: not used by this hook
        logger:        logger for debug / info / error messages
        profiler:      profiler on which the `flux_start` and `flux_started`
                       events are recorded

        Returns a dict with the keys `flux_env` (the flux environment
        settings) and `flux_pid` (pid of the spawned shell process).

        Raises `Exception` if the flux executable or the flux python module
        cannot be found, and `RuntimeError` if flux closes its output before
        reporting its environment, or reports no `FLUX_URI`.
        """

        profiler.prof('flux_start')

        # only the existence of the executable is checked, the path itself
        # is not used further
        flux_exe = ru.which('flux')
        if not flux_exe:
            raise Exception("Couldn't find flux")

        try:
            import sys
            logger.debug('sys.path: %s', sys.path)
            import flux  # noqa: F401 - availability check only
        except Exception as e:
            raise Exception("Couldn't import flux") from e

        # feed a small script to `flux start`: it prints the flux environment,
        # an 'OK' marker, and then keeps the instance alive indefinitely
        check = 'flux env; echo "OK"; while true; do echo "ok"; sleep 1; done'
        start = 'flux start -o,-v,-S,log-filename=out'
        cmd = '/bin/bash -c "echo \\\"%s\\\" | %s"' % (check, start)
        proc = sp.Popen(cmd,
                        shell=True,
                        stdin=sp.PIPE,
                        stdout=sp.PIPE,
                        stderr=sp.STDOUT)

        flux_env = dict()
        while True:

            raw = proc.stdout.readline()
            if not raw:
                # EOF: the process closed its output without printing the
                # 'OK' marker - bail out instead of spinning on empty reads
                raise RuntimeError('flux startup failed: output closed '
                                   'before the environment was reported')

            line = ru.as_string(raw.strip())
            logger.debug('=== %s', line)

            if line.startswith('export '):
                k, v = line.split(' ', 1)[1].strip().split('=', 1)
                v = v.strip('"')
                flux_env[k] = v
                logger.debug('%s = %s', k, v)

            elif line == 'OK':
                break

        # explicit check instead of `assert`, which is stripped under `-O`
        if 'FLUX_URI' not in flux_env:
            raise RuntimeError('flux startup failed: FLUX_URI not reported')

        # TODO check perf implications
        flux_url = ru.Url(flux_env['FLUX_URI'])
        flux_url.host = ru.get_hostname()
        flux_url.scheme = 'ssh'
        flux_env['FLUX_URI'] = str(flux_url)

        profiler.prof('flux_started')

        # ----------------------------------------------------------------------
        def _watch_flux(flux_env):
            # ping the flux instance until a ping fails

            logger.info('=== starting flux watcher')

            # the `flux ping` calls below need the flux environment
            os.environ.update(flux_env)

            while True:

                out, err, ret = ru.sh_callout('flux ping -c 1 all')
                logger.debug('=== flux watcher out: %s', out)

                if ret:
                    logger.error('=== flux watcher err: %s', err)
                    break

                time.sleep(0.1)

            logger.info('flux stopped?')
            # FIXME: trigger termination

        # ----------------------------------------------------------------------

        flux_watcher = mt.Thread(target=_watch_flux, args=[flux_env])
        flux_watcher.daemon = True
        flux_watcher.start()

        logger.info("flux startup successful: [%s]", flux_env['FLUX_URI'])

        lm_info = {'flux_env': flux_env, 'flux_pid': proc.pid}

        return lm_info