def setUpClass(self):
    """Load the sample and XSEDE resource configurations for the tests.

    Reads ``sample_resources.json`` from the directory containing this file
    and ``resource_xsede.json`` from the pilot config directory, then stores
    the full documents and selected entries as attributes on ``self``.
    """
    # sample resource configs are looked up next to this file
    sample_dir = os.path.dirname(os.path.realpath(__file__))
    self.sample_resource_dir = sample_dir

    sample_cfg = ru.read_json(os.path.join(sample_dir,
                                           "sample_resources.json"))
    self.cfg_sample = sample_cfg
    for idx in (1, 2, 3):
        setattr(self, 'cfg_sample_%d' % idx,
                sample_cfg['sample_resource_%d' % idx])

    # pilot resource configs: this path is relative to the working directory
    pilot_dir = '../../src/radical/pilot/configs'
    self.pilot_resource_dir = pilot_dir

    xsede_cfg = ru.read_json(os.path.join(pilot_dir, 'resource_xsede.json'))
    self.cfg_xsede = xsede_cfg
    xsede_labels = ('bridges',
                    'comet_ssh', 'comet_orte', 'comet_ortelib', 'comet_spark',
                    'supermic_ssh', 'supermic_orte', 'supermic_ortelib',
                    'supermic_spark')
    for label in xsede_labels:
        setattr(self, 'cfg_xsede_%s' % label, xsede_cfg[label])
def test():
    """Start one `CompA` and two `CompB` components in a session.

    The session is created from the `session.json` file located next to this
    file.  The components are started, the test sleeps for three seconds, and
    the session is closed again -- also when any of the steps raised.
    """

    s = None
    try:
        cfg = ru.read_json("%s/session.json" % os.path.dirname(__file__))
        # NOTE(review): `dh` is never read; presumably the helper is created
        # for its side effects -- confirm before removing.
        dh  = ru.DebugHelper()
        s   = rp.Session(cfg=cfg)

        ca1 = CompA(s)
        cb1 = CompB(s)
        cb2 = CompB(s)

        ca1.start()
        cb1.start()
        cb2.start()

      # s._controller.add_things([ca1, cb1, cb2])

        time.sleep(3)

    finally:
        # only close the session if it was actually created
        if s:
            # function-call form of print works on Python 2 and Python 3
            print('close')
            s.close()
Ejemplo n.º 3
0
def get_session_docs(db, sid, cache=None, cachedir=None):
    """Return all documents of session `sid`, from cache or from the database.

    Session docs may have been cached in `<cachedir>/<sid>.json` -- in that
    case they are read from there instead of the database.  Documents fetched
    from the database are written to that location for later use.

    Args:
        db:       database handle; `db[sid].find({...})` is used for lookups.
        sid:      session ID, also used as collection name.
        cache:    path of the cache file to read (default:
                  `<cachedir>/<sid>.json`).
        cachedir: directory for cache lookup and storage (default:
                  `_CACHE_BASEDIR`).

    Returns:
        dict with the keys `session` (a single document), `pmgr`, `pilot`,
        `umgr` and `unit` (lists of documents).  Each pilot document gets an
        additional `unit_ids` list.  A cache hit returns the cached content
        unchanged.

    Raises:
        ValueError: the database holds no session document for `sid`.
    """
    if not cachedir:
        cachedir = _CACHE_BASEDIR

    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("cannot read session cache at %s (%s)\n" % (cache, e))

    # cache not used or not found -- go to db, and convert bson to json,
    # i.e. serialize the ObjectIDs into strings.
    json_data = dict()
    for doc_type in ('session', 'pmgr', 'pilot', 'umgr', 'unit'):
        json_data[doc_type] = bson2json(
            list(db[sid].find({'type': doc_type})))

    if len(json_data['session']) == 0:
        raise ValueError('no session %s in db' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:
        pilot['unit_ids'] = [unit['uid'] for unit in json_data['unit']
                             if unit['pilot'] == pilot['uid']]

    # if we got here, we did not find a cached version -- thus add this dataset
    # to the cache
    try:
        # create the directory without going through a shell: the path is
        # built from caller input and must not be interpreted as a command
        try:
            os.makedirs(cachedir)
        except OSError:
            if not os.path.isdir(cachedir):
                raise
        ru.write_json(json_data, "%s/%s.json" % (cachedir, sid))
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
Ejemplo n.º 4
0
    def _read_config(self, config_path, hostname, port, username, password,
                     reattempts, resubmit_failed, autoterminate,
                     write_workflow, rts, rmq_cleanup, rts_config):
        """Resolve settings from arguments, falling back to `config.json`.

        `config.json` is read from `config_path` (default: the directory of
        this file).  Every argument which is not `None` takes precedence over
        the respective config file entry.  The RabbitMQ connection parameters
        are derived from the resolved values.

        Raises:
            ValueError: the resolved RTS is neither 'radical.pilot' nor
                        'mock'.
        """
        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        # (config key, explicit argument), in the order they get resolved
        settings = (('hostname',        hostname),
                    ('port',            port),
                    ('username',        username),
                    ('password',        password),
                    ('reattempts',      reattempts),
                    ('resubmit_failed', resubmit_failed),
                    ('autoterminate',   autoterminate),
                    ('write_workflow',  write_workflow),
                    ('rmq_cleanup',     rmq_cleanup),
                    ('rts_config',      rts_config),
                    ('rts',             rts))

        for key, given in settings:
            # the config entry is always looked up, so that a missing key
            # raises even if an explicit value was passed
            fallback = config[key]
            setattr(self, '_' + key, fallback if given is None else given)

        self._rmq_conn_params = pika.connection.ConnectionParameters(
            host=self._hostname,
            port=self._port,
            credentials=pika.PlainCredentials(self._username,
                                              self._password))

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']

        if self._rts not in ['radical.pilot', 'mock']:
            raise ValueError('invalid RTS %s' % self._rts)
Ejemplo n.º 5
0
    def __init__(self, cfg):
        """Set up the component and announce it on the control pubsub.

        Args:
            cfg: either the path of a json config file (`str`), or a config
                 object which is accepted by `ru.Config(cfg=...)`.
        """
        # a string is interpreted as path to a json file
        raw_cfg = ru.read_json(cfg) if isinstance(cfg, str) else cfg
        cfg = ru.Config(cfg=raw_cfg)

        self._uid = cfg.uid
        self._term = mt.Event()
        self._info = ru.Config(cfg=cfg.get('info', {}))
        self._session = Session(cfg=cfg, _primary=False)

        rpu.Component.__init__(self, cfg, self._session)

        # connect to master
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
        self.register_publisher(rpc.CONTROL_PUBSUB)

        # the result of `initialize()` is sent along with the registration
        registration = {'cmd': 'worker_register',
                        'arg': {'uid': self._uid,
                                'info': self.initialize()}}
        self.publish(rpc.CONTROL_PUBSUB, registration)
Ejemplo n.º 6
0
def main():
    """Run a `ScaleMSMaster` and submit its workers.

    If a config file is given as first command line argument, the worker
    description, worker count, and cores / gpus settings are read from it and
    the config is passed to the master.  Otherwise a single one-core worker
    with a default description is used.
    """
    # TODO: Test both with and without a provided config file.
    kwargs = {}
    if len(sys.argv) > 1:
        cfg = ru.Config(cfg=ru.read_json(sys.argv[1]))
        kwargs['cfg'] = cfg
        # no trailing commas here: they would wrap each value into a 1-tuple
        descr = cfg.worker_descr
        count = cfg.n_workers
        cores = cfg.cpn
        gpus = cfg.gpn
    else:
        descr = rp.TaskDescription({
            'uid': 'raptor.worker',
            'executable': 'scalems_rp_worker',
            'arguments': []
        })
        count = 1
        cores = 1
        gpus = 0
    master = ScaleMSMaster(**kwargs)

    master.submit(descr=descr, count=count, cores=cores, gpus=gpus)

    master.start()
    master.join()
    master.stop()
Ejemplo n.º 7
0
def get_session_docs (db, sid, cache=None, cachedir=None) :
    """Return all documents of session `sid`, from cache or from the database.

    Session docs may have been cached in `<cachedir>/<sid>.json` -- in that
    case they are read from there instead of the database.  Documents fetched
    from the database are written to that location for later use.

    Args:
        db:       database handle; the collections `<sid>`, `<sid>.pm`,
                  `<sid>.p`, `<sid>.um` and `<sid>.cu` are read.
        sid:      session ID.
        cache:    path of the cache file to read (default:
                  `<cachedir>/<sid>.json`).
        cachedir: directory for cache lookup and storage (default:
                  `_CACHE_BASEDIR`).

    Returns:
        dict with the keys `session` (a single document), `pmgr`, `pilot`,
        `umgr` and `unit` (lists of documents).  Each pilot document gets an
        additional `unit_ids` list.  A cache hit returns the cached content
        unchanged.

    Raises:
        ValueError: the database holds no session document for `sid`.
    """
    if  not cachedir :
        cachedir = _CACHE_BASEDIR

    if  not cache :
        cache = "%s/%s.json" % (cachedir, sid)

    try :
        if  os.path.isfile (cache) :
            return ru.read_json (cache)
    except Exception as e :
        # continue w/o cache
        sys.stderr.write ("warning: cannot read session cache at %s (%s)\n" % (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json (list(db["%s"    % sid].find ()))
    json_data['pmgr'   ] = bson2json (list(db["%s.pm" % sid].find ()))
    json_data['pilot'  ] = bson2json (list(db["%s.p"  % sid].find ()))
    json_data['umgr'   ] = bson2json (list(db["%s.um" % sid].find ()))
    json_data['unit'   ] = bson2json (list(db["%s.cu" % sid].find ()))

    if  len(json_data['session']) == 0 :
        raise ValueError ('no such session %s' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot'] :

        pilot_id = str(pilot['_id'])
        pilot['unit_ids'] = [str(unit['_id']) for unit in json_data['unit']
                             if unit['pilot'] == pilot_id]

    # if we got here, we did not find a cached version -- thus add this dataset
    # to the cache.  The cache is stored in `cachedir`, i.e. in the same
    # location which is used for the default cache lookup above.
    try :
        # create the directory without going through a shell
        try :
            os.makedirs (cachedir)
        except OSError :
            if  not os.path.isdir (cachedir) :
                raise
        ru.write_json (json_data, "%s/%s.json" % (cachedir, sid))
    except Exception :
        # we can live without cache, no problem...
        pass

    return json_data
Ejemplo n.º 8
0
def write_workflow(workflow, uid):
    """Append a description of `workflow` to `<uid>/entk_workflow.json`.

    The directory `uid` is created if needed.  If the json file exists
    already, its content is read and the new pipeline descriptions are
    appended to it.

    Args:
        workflow: iterable of pipelines; each pipeline provides `uid`, `name`,
                  `state_history` and `stages`, each stage provides `uid`,
                  `name`, `state_history` and `tasks`, and each task provides
                  `to_dict()`.
        uid:      name of the target directory.
    """
    try:
        os.mkdir(uid)
    except OSError:
        # the directory most likely exists already; any other problem will
        # surface when the file is written below
        pass

    fname = '%s/entk_workflow.json' % uid

    data = list()
    if os.path.isfile(fname):
        data = ru.read_json(fname)

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]

            p['stages'].append(s)

        data.append(p)

    ru.write_json(data, fname)
Ejemplo n.º 9
0
def test_get_session_description():
    """Check the session description against the stored expectation."""
    session_id = 're.session.host.user.012345.1234'
    data_dir = '%s/sample_data/profiler' % pwd

    description = get_session_description(sid=session_id, src=data_dir)
    expected_path = '%s/expected_desc_get_session.json' % data_dir

    assert description == ru.read_json(expected_path)
Ejemplo n.º 10
0
    def _read_config(self, config_path, hostname, port, reattempts,
                     resubmit_failed, autoterminate, write_workflow, rts,
                     rmq_cleanup, rts_config):
        """Resolve settings from arguments, falling back to `config.json`.

        `config.json` is read from `config_path` (default: the directory of
        this file).  `hostname`, `port` and `reattempts` fall back to the
        config file when they are falsy, `rts` when it is not one of the
        supported values, and all other settings when they are `None`.
        """
        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        def _or_config(value, key):
            # the config entry is only looked up if no value was given
            if value is not None:
                return value
            return config[key]

        if hostname:
            self._mq_hostname = hostname
        else:
            self._mq_hostname = str(config['hostname'])

        self._port = port or config['port']
        self._reattempts = reattempts or config['reattempts']
        self._resubmit_failed = _or_config(resubmit_failed, 'resubmit_failed')
        self._autoterminate = _or_config(autoterminate, 'autoterminate')
        self._write_workflow = _or_config(write_workflow, 'write_workflow')

        if rts in ['radical.pilot', 'mock']:
            self._rts = rts
        else:
            self._rts = str(config['rts'])

        self._rmq_cleanup = _or_config(rmq_cleanup, 'rmq_cleanup')
        self._rts_config = _or_config(rts_config, 'rts_config')

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']
Ejemplo n.º 11
0
def test_get_session_description():
    """Check the session description against the stored expectation."""
    session_id = 're.session.host.user.012345.1234'

    # sample data is located relative to this file
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = '%s/sample_data/profiler' % here

    description = get_session_description(sid=session_id, src=data_dir)
    expected_path = '%s/expected_desc_get_session.json' % data_dir

    assert description == ru.read_json(expected_path)
Ejemplo n.º 12
0
def test_get_session_description():
    """Check the session description against the stored expectation."""
    session_id = 're.session.vivek-HP-Pavilion-m6-Notebook-PC.vivek.017732.0002'

    # sample data is located relative to this file
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = '%s/sample_data/profiler' % here

    description = get_session_description(sid=session_id, src=data_dir)
    expected_path = '%s/expected_desc.json' % data_dir

    assert description == ru.read_json(expected_path)
Ejemplo n.º 13
0
def get_session_description(sid, src=None):
    """Read the description of session `sid` from `<src>/<sid>.json`.

    Args:
        sid: session ID.
        src: directory holding the json file (default: `./<sid>/`).

    Raises:
        EnTKError: `src` is not a directory.
    """
    src = src or './%s/' % sid

    if os.path.isdir(src):
        return ru.read_json('%s/%s.json' % (src, sid))

    raise EnTKError('No such directory %s' % src)
Ejemplo n.º 14
0
def test_write_session_description():
    """Write a session description and compare it with the expectation.

    An `AppManager` is set up with a resource description and two pipelines,
    its workflow processor and task manager are attached by hand, and the
    description written by `write_session_description()` is compared with
    `expected_desc_write_session.json`.
    """
    amgr = AppManager(hostname=hostname, port=port,
                      username=username, password=password)

    resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 59,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }
    amgr.resource_desc = resource_desc
    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    wfp = WFprocessor(sid=amgr.sid,
                      workflow=amgr._workflow,
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      resubmit_failed=amgr._resubmit_failed,
                      rmq_conn_params=amgr._rmq_conn_params)
    amgr._wfp = wfp
    amgr._workflow = amgr._wfp.workflow

    tmgr = TaskManager(sid=amgr._sid,
                       pending_queue=amgr._pending_queue,
                       completed_queue=amgr._completed_queue,
                       rmgr=amgr._rmgr,
                       rmq_conn_params=amgr._rmq_conn_params)
    amgr._task_manager = tmgr

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    # tasks are originally kept as a set but saved as a list in json: sort
    # the children for convenient comparison, this doesn't change validity
    tree = desc['tree']
    for key, node in (tree.items()):
        if not key.startswith("stage"):
            continue
        tree[key]['children'] = sorted(node['children'])

    sample_dir = '%s/sample_data' % pwd
    expected_path = '%s/expected_desc_write_session.json' % sample_dir

    assert desc == ru.read_json(expected_path)
Ejemplo n.º 15
0
def test_write_session_description():
    """Write a session description and compare it with the expectation.

    The RabbitMQ endpoint is taken from the environment variables
    `RMQ_HOSTNAME` and `RMQ_PORT` (default: `localhost:5672`).
    """
    rmq_host = os.environ.get('RMQ_HOSTNAME', 'localhost')
    rmq_port = int(os.environ.get('RMQ_PORT', 5672))

    amgr = AppManager(hostname=rmq_host, port=rmq_port)

    resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 60,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }
    amgr.resource_desc = resource_desc
    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    wfp = WFprocessor(sid=amgr._sid,
                      workflow=amgr._workflow,
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      mq_hostname=amgr._mq_hostname,
                      port=amgr._port,
                      resubmit_failed=amgr._resubmit_failed)
    amgr._wfp = wfp
    amgr._wfp._initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    tmgr = TaskManager(sid=amgr._sid,
                       pending_queue=amgr._pending_queue,
                       completed_queue=amgr._completed_queue,
                       mq_hostname=amgr._mq_hostname,
                       rmgr=amgr._resource_manager,
                       port=amgr._port)
    amgr._task_manager = tmgr

    # os.mkdir(amgr._sid)

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    # the expected description is stored relative to this file
    here = os.path.dirname(os.path.abspath(__file__))
    sample_dir = '%s/sample_data' % here
    expected_path = '%s/expected_desc_write_session.json' % sample_dir

    assert desc == ru.read_json(expected_path)
Ejemplo n.º 16
0
    def setUp(self):
        """Collect `[unit, results]` pairs from the stage-in test case files.

        Test cases with empty (falsy) results are skipped.

        Returns:
            list of `[unit, results]` lists, one per usable test case file.
        """
        pattern = 'tests/test_agent_stagein/test_cases/unit.*.json'

        cases = []
        for path in glob.glob(pattern):
            test_case = ru.read_json(path)
            pair = [test_case['unit'], test_case['results']]
            if pair[1]:
                cases.append(pair)

        return cases
Ejemplo n.º 17
0
def test_executor_run():
    """Run an `Executor` on a published schedule and check its profile.

    A schedule of ten task / core pairs is published to the executor's
    RabbitMQ queues (as configured in `../config_test.yml`).  The executor is
    then run in a daemon thread for at most five seconds, and the profile it
    writes is checked entry by entry.  All `profile.*` files are removed at
    the end.
    """

    fpath = os.path.dirname(os.path.abspath(__file__))
    cfg_path = '%s/../config_test.yml' % fpath

    schedule = list()

    for _ in range(10):
        task = Task(ops=100)
        core = Core(10)
        schedule.append({'task': task.to_dict(), 'core': core.to_dict()})

    with open(cfg_path) as fp:
        # `yaml.load()` w/o explicit Loader is unsafe and requires a Loader
        # argument in PyYAML >= 6; `safe_load()` handles plain config data.
        cfg = yaml.safe_load(fp)

    executor_cfg = cfg['rmq']['executor']

    conn = pika.BlockingConnection(
        pika.ConnectionParameters(host=cfg['rmq']['host'],
                                  port=cfg['rmq']['port']))
    chan = conn.channel()

    chan.basic_publish(exchange=executor_cfg['exchange'],
                       routing_key=executor_cfg['queues']['config'],
                       body=json.dumps({'engine_uid': 'test.0000'}))

    chan.basic_publish(exchange=executor_cfg['exchange'],
                       routing_key=executor_cfg['queues']['schedule'],
                       body=json.dumps(schedule))

    conn.close()

    executor = Executor(cfg_path=cfg_path)

    # the executor runs in a daemon thread so that it cannot block the test
    runner = threading.Thread(target=func_for_test_executor_run,
                              args=(executor, ))
    runner.daemon = True
    runner.start()
    runner.join(timeout=5)

    executor._write_profile()

    assert os.path.isfile('./profile.%s.json' % (executor._uid))
    prof = ru.read_json('./profile.%s.json' % (executor._uid))

    assert 'test.0000' in prof.keys()
    assert len(prof['test.0000']) == 10

    for ind, x in enumerate(prof['test.0000']):
        assert x['task'] == schedule[ind]['task']['uid']
        assert x['core'] == schedule[ind]['core']['uid']
        assert x['end_time'] == 10
        assert x['exec_time'] == 10
        assert x['start_time'] == 0

    for f in glob('profile.*'):
        os.remove(f)
    def setUpClass(cls):
        """Initialize the class attributes used by the acceptance tests.

        The test configuration is read from the `config.json` file located
        next to this file.
        """
        super(AcceptanceTests, cls).setUpClass()

        # handles are created later on
        for attr in ('resource', 'session', 'pmgr', 'umgr'):
            setattr(cls, attr, None)

        # number of units to run
        cls.n = 128

        config_path = '%s/config.json' % os.path.dirname(__file__)
        cls.config = ru.read_json(config_path)

        cls.setUp()
Ejemplo n.º 19
0
    def setUpClass(cls):
        """Prepare base dir, pid string, user name and test cases.

        The test cases are collected from all json files which match
        `TEST_CASES_PATH`; the content of each file is appended to
        `cls._test_cases`.
        """
        cls._base_dir = ru.get_radical_base('utils')
        cls._pid_str = '%06d' % os.getpid()
        cls._user = None
        try:
            import getpass
            cls._user = getpass.getuser()
        except Exception:
            # the user name could not be determined -- use a placeholder.
            # `Exception` (instead of a bare `except`) lets `SystemExit` and
            # `KeyboardInterrupt` pass through.
            cls._user = '******'

        cls._test_cases = []
        for f in glob.glob(TEST_CASES_PATH):
            cls._test_cases.extend(ru.read_json(f))
Ejemplo n.º 20
0
    def register_input(self, states, input, worker=None):
        '''
        Connect the component to the queue `input` on which things are
        received, and associate the given `states` (a single state or a list
        of states) with `worker`.

        The queue's address is read from the config file
        `<cfg.path>/<input>.cfg`, and the queue is stored together with the
        list of states under the name `<uid>.<worker name>.<states>`.  Each of
        the states is then mapped to `worker`; an existing mapping for a state
        is replaced, with a warning.

        Raises `ValueError` if an input with the same name is registered
        already.

        NOTE(review): `worker` defaults to `None`, but `worker.__name__` is
        used unconditionally -- callers apparently always have to pass a
        worker.
        '''

        states = states if isinstance(states, list) else [states]

        name = '%s.%s.%s' % (self.uid, worker.__name__, '_'.join(states))

        if name in self._inputs:
            raise ValueError('input %s already registered' % name)

        # dig the addresses from the bridge's config file
        bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, input))
        getter = ru.zmq.Getter(input, url=bridge_cfg['get'], log=self._log)

        self._inputs[name] = {'queue': getter,
                              'states': states}

        self._log.debug('registered input %s', name)

        # we want exactly one worker associated with a state -- but a worker can
        # be responsible for multiple states
        for state in states:

            self._log.debug('%s register input %s: %s', self.uid, state, name)

            replaced = state in self._workers
            if replaced:
                self._log.warn("%s replaces worker for %s (%s)" %
                               (self.uid, state, self._workers[state]))

            self._workers[state] = worker

            self._log.debug('registered worker %s [%s]', worker.__name__,
                            state)
Ejemplo n.º 21
0
def get_session_description(sid, src=None):
    """Read the EnTK description of session `sid`.

    The description is read from `<src>/<sid>/radical.entk.<sid>.json`.

    Args:
        sid: session ID.
        src: base directory (default: current working directory).

    Raises:
        EnTKError: `src` does not exist.
    """
    src = src or os.getcwd()

    # only `src` itself is checked here, not `<src>/<sid>`
    if not os.path.exists(src):
        raise EnTKError('%s/%s does not exist' % (src, sid))

    # EnTK profiles are always on localhost
    return ru.read_json("%s/%s/radical.entk.%s.json" % (src, sid, sid))
def get_session_description(sid, src=None):
    """Read the EnTK description of session `sid`.

    The description is read from `<src>/<sid>/radical.entk.<sid>.json`.

    Args:
        sid: session ID.
        src: base directory (default: current working directory).

    Raises:
        EnTKError: `src` does not exist.
    """
    src = src or os.getcwd()

    # only `src` itself is checked here, not `<src>/<sid>`
    if not os.path.exists(src):
        raise EnTKError('%s/%s does not exist' % (src, sid))

    # EnTK profiles are always on localhost
    return ru.read_json("%s/%s/radical.entk.%s.json" % (src, sid, sid))
Ejemplo n.º 23
0
def test_write_session_description():
    """Write a session description and compare it with the expectation.

    An `AppManager` is set up with a resource description and two pipelines,
    its workflow processor (with initialized workflow) and task manager are
    attached by hand, and the description written by
    `write_session_description()` is compared with
    `expected_desc_write_session.json`.
    """
    amgr = AppManager(hostname=hostname, port=port)

    resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 59,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }
    amgr.resource_desc = resource_desc
    amgr.workflow = [generate_pipeline(1), generate_pipeline(2)]

    wfp = WFprocessor(sid=amgr.sid,
                      workflow=amgr._workflow,
                      pending_queue=amgr._pending_queue,
                      completed_queue=amgr._completed_queue,
                      resubmit_failed=amgr._resubmit_failed,
                      rmq_conn_params=amgr._rmq_conn_params)
    amgr._wfp = wfp
    amgr._wfp.initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    tmgr = TaskManager(sid=amgr._sid,
                       pending_queue=amgr._pending_queue,
                       completed_queue=amgr._completed_queue,
                       rmgr=amgr._rmgr,
                       rmq_conn_params=amgr._rmq_conn_params)
    amgr._task_manager = tmgr

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    sample_dir = '%s/sample_data' % pwd
    expected_path = '%s/expected_desc_write_session.json' % sample_dir

    assert desc == ru.read_json(expected_path)
Ejemplo n.º 24
0
def test_write_workflow():
    """Write a two-pipeline workflow and verify the resulting json file.

    The first entry of the written data is expected to be the software stack
    description, followed by one entry per pipeline.  If any check fails, the
    output directory `test` is removed before the error is re-raised.
    """

    try:
        wf = [generate_pipeline(1), generate_pipeline(2)]

        amgr = AppManager(hostname=hostname, port=port)
        amgr.workflow = wf
        amgr._wfp = WFprocessor(sid=amgr._sid,
                                workflow=amgr._workflow,
                                pending_queue=amgr._pending_queue,
                                completed_queue=amgr._completed_queue,
                                mq_hostname=amgr._mq_hostname,
                                port=amgr._port,
                                resubmit_failed=amgr._resubmit_failed)
        amgr._wfp._initialize_workflow()
        wf = amgr._wfp.workflow

        write_workflow(wf, 'test')

        data = ru.read_json('test/entk_workflow.json')
        assert len(data) == len(wf) + 1

        # Compare key *sets*: on Python 3, `dict.keys()` never equals a list,
        # and the key order is not part of the contract anyway.
        stack = data.pop(0)
        assert set(stack) == {'stack'}
        assert set(stack['stack']) == {'sys', 'radical'}
        assert set(stack['stack']['sys']) == {"python", "pythonpath",
                                              "virtualenv"}
        assert set(stack['stack']['radical']) == {'saga', 'radical.pilot',
                                                  'radical.utils',
                                                  'radical.entk'}

        for p_cnt, p in enumerate(data):
            pipe = wf[p_cnt]
            assert p['uid'] == pipe.uid
            assert p['name'] == pipe.name
            assert p['state_history'] == pipe.state_history

            for s_cnt, s in enumerate(p['stages']):
                stage = pipe.stages[s_cnt]
                assert s['uid'] == stage.uid
                assert s['name'] == stage.name
                assert s['state_history'] == stage.state_history
                for t in stage.tasks:
                    assert t.to_dict() in s['tasks']

    except Exception:
        # the directory may not exist if the failure happened before the
        # workflow was written -- do not let the cleanup mask the real error
        shutil.rmtree('test', ignore_errors=True)
        raise
Ejemplo n.º 25
0
    def register_publisher(self, pubsub):
        '''
        Register the component as publisher of notifications on the given
        pubsub channel.

        The channel's `pub` address is read from the config file
        `<cfg.path>/<pubsub>.cfg`.  A channel can only be registered once,
        which is enforced by an assertion.
        '''

        assert (pubsub not in self._publishers)

        # dig the addresses from the bridge's config file
        bridge_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, pubsub))
        pub_url = bridge_cfg['pub']

        publisher = ru.zmq.Publisher(pubsub, url=pub_url, log=self._log)
        self._publishers[pubsub] = publisher

        self._log.debug('registered publisher for %s', pubsub)
Ejemplo n.º 26
0
    def _pilots_backfill(self, requests):
        '''
        Request new backfill pilots, chunked by the given max_cores and
        max_walltime.  The given request_stub is used as template for the pilot
        descriptions.
        '''

        self._rep.info('\nrequesting backfilled pilots\n')
        pds = list()

        for request in requests:

            del (request['backfill'])

            policy = request['policy']
            partition = request['partition']

            PWD = os.path.dirname(__file__)
            policy = ru.read_json('%s/policies/%s.json' %
                                  (PWD, request['policy']))

            max_cores = policy.get('max_cores', MAX_CORES)
            max_walltime = policy.get('max_walltime', MAX_WALLTIME)

            self._rep.info('\nrequesting backfill pilots\n')
            bf = get_backfill(request['partition'], max_cores, max_walltime)

            for [partition, cores, walltime] in bf:
                pd = {
                    'resource': request.get('resource', 'local.localhost'),
                    'project': request.get('project'),
                    'queue': request.get('queue'),
                    'cores': cores,
                    'runtime': walltime
                }
                self._rep.ok(
                    'backfill @ %s [%5dcores * %4dmin] @ %10s(%10s)]\n' %
                    (pd['resource'], pd['cores'], pd['runtime'], pd['queue'],
                     pd['project']))
                # pprint.pprint(pd)
                pds.append(rp.ComputePilotDescription(pd))

        return pds
Ejemplo n.º 27
0
def write_workflow(workflow, uid, workflow_fout='entk_workflow', fwrite=True):
    """Describe `workflow` and write the description to a json file.

    The directory `uid` is created if needed.  If the file
    `<uid>/<workflow_fout>.json` exists already, its content is read and
    extended.  A `{'stack': ...}` entry is added first, followed by one entry
    per pipeline.

    Args:
        workflow:      iterable of pipelines; each pipeline provides `uid`,
                       `name`, `state_history` and `stages`, each stage
                       provides `uid`, `name`, `state_history` and `tasks`,
                       and each task provides `to_dict()`.
        uid:           name of the target directory.
        workflow_fout: base name of the json file.
        fwrite:        write the data to file if set, return it otherwise.

    Returns:
        0 if the data was written to file, the data (a list) otherwise.
    """
    try:
        os.mkdir(uid)
    except OSError:
        # the directory most likely exists already; any other problem will
        # surface when the file is written below
        pass

    fname = '%s/%s.json' % (uid, workflow_fout)

    data = list()
    if os.path.isfile(fname):
        data = ru.read_json(fname)

    stack = ru.stack()
    data.append({'stack': stack})

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]

            p['stages'].append(s)

        data.append(p)

    if fwrite:
        ru.write_json(data, fname)
        return 0

    return data
def write_workflow(workflow, uid, workflow_fout='entk_workflow', fwrite=True):
    """Describe `workflow` and write the description to a json file.

    The directory `uid` is created if needed.  If the file
    `<uid>/<workflow_fout>.json` exists already, its content is read and
    extended.  A `{'stack': ...}` entry is added first, followed by one entry
    per pipeline.

    Args:
        workflow:      iterable of pipelines; each pipeline provides `uid`,
                       `name`, `state_history` and `stages`, each stage
                       provides `uid`, `name`, `state_history` and `tasks`,
                       and each task provides `to_dict()`.
        uid:           name of the target directory.
        workflow_fout: base name of the json file.
        fwrite:        write the data to file if set, return it otherwise.

    Returns:
        0 if the data was written to file, the data (a list) otherwise.
    """
    try:
        os.mkdir(uid)
    except OSError:
        # the directory most likely exists already; any other problem will
        # surface when the file is written below
        pass

    fname = '%s/%s.json' % (uid, workflow_fout)

    data = list()
    if os.path.isfile(fname):
        data = ru.read_json(fname)

    stack = ru.stack()
    data.append({'stack': stack})

    for pipe in workflow:

        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:

            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]

            p['stages'].append(s)

        data.append(p)

    if fwrite:
        ru.write_json(data, fname)
        return 0

    return data
Ejemplo n.º 29
0
    def _configure(self):
        """Connect to Flux, to the executing queue, and create the Flux LM.

        The Flux URI is taken from the `rm_info.lm_info.flux_env` section of
        the component config; the queue's `put` address is read from the
        config file `<cfg.path>/<queue name>.cfg`.
        """

        import flux

        lm_info = self._cfg['rm_info']['lm_info']
        self._flux = flux.Flux(url=lm_info['flux_env']['FLUX_URI'])

        # don't advance tasks via the component's `advance()`, but push them
        # toward the executor *without state change* - state changes are
        # performed in retrospect by the executor, based on the scheduling and
        # execution events collected from Flux.
        queue_name = rpc.AGENT_EXECUTING_QUEUE
        queue_cfg = ru.read_json('%s/%s.cfg' % (self._cfg.path, queue_name))
        self._q = ru.zmq.Putter(queue_name, queue_cfg['put'])

        # create job spec via the flux LM
        self._lm = LaunchMethod.create(name='FLUX', cfg=self._cfg,
                                       session=self._session)
Ejemplo n.º 30
0
def push_tasks(bulk_id, unit):
    '''
    Once a bulk of tasks has been executed, push the resulting jsons back to the
    QCArchive service endpoint.  The results are read from the file named in
    the unit's `metadata['fout']`, which the executor needs to fetch back to
    localhost.

    Units which failed are marked and returned in a separate bulk, using the
    `shutdown` operation.

    `unit` is either a single unit or a list / tuple / set of units; `None`
    is handled like an empty list.  Nothing is sent if there are no units.
    '''

    # The loop below used to iterate over an undefined name (`units`);
    # normalize the `unit` argument into a list instead.
    if unit is None:
        units = list()
    elif isinstance(unit, (list, tuple, set)):
        units = list(unit)
    else:
        units = [unit]

    data_ok = dict()
    data_nok = list()

    for u in units:

        if u.state == rp.DONE:
            fout = u.metadata['fout']  # FIXME: implies data staging
            result = ru.read_json(fout)
            data_ok[u.name] = (result, 'single', [])

        else:
            data_nok.append(u.name)

    # NOTE(review): `verify=False` disables TLS certificate verification for
    #               both requests below -- confirm that this is intended.
    if data_ok:
        payload = {"meta": {"name": bulk_id}, "data": data_ok}
        r = requests.post(address + "queue_manager",
                          json=payload,
                          verify=False)
        print('%s ok: %s' % (bulk_id, r.json()))

    if data_nok:
        payload = {
            "meta": {
                "name": bulk_id,
                "operation": 'shutdown'
            },
            "data": data_nok
        }
        r = requests.put(address + "queue_manager", json=payload, verify=False)
        print('%s nok: %s' % (bulk_id, r))
    def _read_config(self, config_path, hostname, port, reattempts,
                     resubmit_failed, autoterminate, write_workflow,
                     rts, rmq_cleanup, rts_config):
        """Resolve settings from arguments, falling back to `config.json`.

        `config.json` is read from `config_path` (default: the directory of
        this file).  `hostname`, `port` and `reattempts` fall back to the
        config file when they are falsy, `rts` when it is not one of the
        supported values, and all other settings when they are `None`.  The
        port is converted to `int`.
        """
        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        def _or_config(value, key):
            # the config entry is only looked up if no value was given
            if value is not None:
                return value
            return config[key]

        if hostname:
            self._mq_hostname = hostname
        else:
            self._mq_hostname = str(config['hostname'])

        self._port = int(port or config['port'])
        self._reattempts = reattempts or config['reattempts']
        self._resubmit_failed = _or_config(resubmit_failed, 'resubmit_failed')
        self._autoterminate = _or_config(autoterminate, 'autoterminate')
        self._write_workflow = _or_config(write_workflow, 'write_workflow')

        if rts in ['radical.pilot', 'mock']:
            self._rts = rts
        else:
            self._rts = str(config['rts'])

        self._rmq_cleanup = _or_config(rmq_cleanup, 'rmq_cleanup')
        self._rts_config = _or_config(rts_config, 'rts_config')

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']
Ejemplo n.º 32
0
def createWorkload(inputFile, nthreads):
    '''
    Build a workload from the JSON matrix stored in inputFile.

    Each row of the matrix describes one pipeline, and each entry of a row
    describes one stage (a bag of tasks): the entry's value is the number of
    tasks created for that stage.  Every task is produced by
    CUDef.createTAUGromacsCU(nthreads), so all tasks are alike.

    Returns the tuple (workload, totNumCUs), where workload is a list of
    pipelines, each a list of stages, each a list of task descriptions, and
    totNumCUs is the sum of all matrix entries.
    '''
    matrix = ru.read_json(inputFile)
    workload = []
    totNumCUs = 0
    for row in matrix:
        stages = []
        for count in row:
            totNumCUs += count
            # To try /bin/date instead, build the tasks with
            # CUDef.createDateCU() rather than createTAUGromacsCU().
            stages.append(
                [CUDef.createTAUGromacsCU(nthreads) for _ in range(count)])
        workload.append(stages)
    return (workload, totNumCUs)
Ejemplo n.º 33
0
def setUp(test_type, test_name):
    '''
    Collect the test cases found in 'tests/test_cases/unit.*.json'.

    For every case file, the 'unit' section is merged with the
    'setup' entry for test_type (via ru.dict_merge with ru.PRESERVE), and
    the expected result is looked up under results[test_type][test_name].
    Cases without a truthy expected result are left out.

    Returns a list with one entry per remaining case: [test, result], or
    [test, result, resource_file, resource_filename] when both resource
    entries are present for test_name.
    '''

    cases = []

    for case_file in glob.glob('tests/test_cases/unit.*.json'):

        case = ru.read_json(case_file)
        unit = case['unit']
        setup = case['setup'].get(test_type, {})

        results = case['results']
        result = results.get(test_type, {}).get(test_name)
        resource_file = results.get('resource_file', {}).get(test_name)
        resource_filename = results.get('resource_filename', {}).get(test_name)

        test = ru.dict_merge(unit, setup, ru.PRESERVE)

        if not result:
            continue

        entry = [test, result]
        if resource_file and resource_filename:
            entry += [resource_file, resource_filename]
        cases.append(entry)

    return cases
Ejemplo n.º 34
0
    def setUpClass(cls):
        """Set up the class-level attributes shared by the tests."""
        super(AcceptanceTests, cls).setUpClass()

        # The resource, session, pilot manager and unit manager handles all
        # start out unset.
        cls.resource = cls.session = cls.pmgr = cls.umgr = None

        # The configuration file is read from the directory of this module.
        here = os.path.dirname(os.path.abspath(__file__))
        cls.config = ru.read_json('%s/config.json' % here)

        # Number of Compute Units (CUs) to run.
        cls.n = 128
def test_duration_method_with_data_from_run_with_execution_barriers():
    '''
    Compare durations for a session that was run with execution barriers.

    The duration reported by the analytics function must equal the one
    reported by the utils function.  Because the units in this data set were
    not all executing concurrently (there is a gap between their
    executions), that duration must also be smaller than the 'max-min'
    value taken over the FINAL and INITIAL states.
    '''

    here = os.path.dirname(os.path.realpath(__file__))
    data_loc = here + '/barrier_data'

    # The first JSON file found in the data directory names the session.
    json_file = glob.glob(data_loc + '/*.json')[0]

    # The file is read, but its parsed content is not used any further.
    ru.read_json(json_file)

    # session id == file name without the '.json' suffix
    sid = os.path.basename(json_file)[:-len('.json')]

    session = ra.Session(sid, 'radical.pilot', src=data_loc + '/')

    from_analytics = get_duration_using_analytics(session)
    from_utils = get_duration_using_utils(session)
    assert from_analytics == from_utils

    from_analytics = get_duration_using_analytics(session)
    from_minmax = get_duration_using_minmax(session)
    assert from_analytics < from_minmax
Ejemplo n.º 36
0
def test_write_workflow():
    """
    Check that write_workflow() dumps an initialized workflow faithfully.

    Two generated pipelines are handed to an AppManager, initialized through
    a WFprocessor and written to the 'test' directory.  The resulting
    'test/entk_workflow.json' is then compared, pipeline by pipeline and
    stage by stage, against the in-memory workflow.  The 'test' directory is
    removed at the end.
    """

    pipelines = [generate_pipeline(1), generate_pipeline(2)]

    amgr = AppManager(hostname=hostname, port=port)
    amgr.workflow = pipelines
    amgr._wfp = WFprocessor(sid=amgr._sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            mq_hostname=amgr._mq_hostname,
                            port=amgr._port,
                            resubmit_failed=amgr._resubmit_failed)
    amgr._wfp._initialize_workflow()
    wf = amgr._wfp.workflow

    write_workflow(wf, 'test')

    data = ru.read_json('test/entk_workflow.json')
    assert len(data) == len(wf)

    for p_idx, p_dump in enumerate(data):
        pipeline = wf[p_idx]
        assert p_dump['uid'] == pipeline.uid
        assert p_dump['name'] == pipeline.name
        assert p_dump['state_history'] == pipeline.state_history

        for s_idx, s_dump in enumerate(p_dump['stages']):
            stage = pipeline.stages[s_idx]
            assert s_dump['uid'] == stage.uid
            assert s_dump['name'] == stage.name
            assert s_dump['state_history'] == stage.state_history

            # every in-memory task must show up in the dumped stage
            for task in stage.tasks:
                assert task.to_dict() in s_dump['tasks']

    shutil.rmtree('test')
Ejemplo n.º 37
0
    def register_output(self, states, output=None):
        '''
        Connect this component to the queue that things are sent to once
        they have been worked upon.

        `states` is either a single state or a list of states; each one gets
        its own entry in the output table.  Registering a state a second
        time replaces the earlier entry and logs a warning.

        A state registered without an `output` is treated as final: its
        entry is None, and a thing advanced to such a state is considered
        'dropped'.  advance() still triggers the state notification and then
        marks the drop in the log.  No other component should work on such
        a thing again, and it is up to the component to make sure the thing
        really is in a final state.

        For a state with an `output`, the put address is read from the
        bridge's config file '<cfg path>/<output>.cfg', and a putter for
        that address becomes the state's entry.
        '''

        state_list = states if isinstance(states, list) else [states]

        for st in state_list:

            self._log.debug('%s register output %s:%s', self.uid, st,
                            output)

            # each state has exactly one output, so a repeat overrides it
            if st in self._outputs:
                self._log.warn("%s replaces output for %s : %s -> %s" %
                               (self.uid, st, self._outputs[st], output))

            if output:
                # non-final state: look up the bridge address to push to
                addr_file = '%s/%s.cfg' % (self._cfg.path, output)
                bridge_cfg = ru.read_json(addr_file)
                target = ru.zmq.Putter(output, url=bridge_cfg['put'])
            else:
                # final state: nothing to push to
                target = None

            self._outputs[st] = target
Ejemplo n.º 38
0
def test_executor_write_profile():
    """
    Check that Executor._write_profile() dumps the recorded profile to disk.

    Ten tasks with random start and end times are recorded as the profile of
    one engine.  After writing, the file 'test.<executor uid>.prof' must
    exist in the directory of this module and must contain exactly the
    recorded entries under the engine uid.

    The written files are removed afterwards, also when a check fails.
    """

    fpath = os.path.dirname(os.path.abspath(__file__))
    executor = Executor(cfg_path='%s/../config_test.yml' % fpath)

    engine_uid = 'engine.0000'
    output = list()
    for x in range(10):
        task = Task()
        task.exec_core = 'core.%s' % x
        task.start_time = random()
        task.end_time = random()

        output.append({
            'task': task.uid,
            'ops': task.ops,
            'core': task.exec_core,
            'start_time': task.start_time,
            'end_time': task.end_time,
            'exec_time': task.end_time - task.start_time
        })

    executor._profile[engine_uid] = output
    executor._profile_loc = '%s/test.prof' % fpath

    try:
        executor._write_profile()

        prof_file = '%s/test.%s.prof' % (fpath, executor._uid)
        assert os.path.isfile(prof_file)
        prof = ru.read_json(prof_file)

        assert engine_uid in prof
        assert output == prof[engine_uid]

    finally:
        # The profile lives in the directory of this module, which is not
        # necessarily the current working directory, so the cleanup pattern
        # has to be anchored there.
        for f in glob('%s/test.*' % fpath):
            os.remove(f)
Ejemplo n.º 39
0
    def run(self):
        """Starts the process when Process.start() is called.
        """

        global JOB_CHECK_INTERVAL

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                db = self._session.get_db()
                pilot_col = db["%s.p" % self._session.uid]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._terminate.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                if self._disabled.is_set():
                    # don't process any new pilot start requests.
                    # NOTE: this is not clean, in principle there could be other
                    #       launchers alive which want to still start those
                    #       pending pilots.  In practice we only ever use one
                    #       pmgr though, and its during its shutdown that we get
                    #       here...
                    ts = time.time()
                    compute_pilot = pilot_col.find_and_modify(
                        query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                        update={
                            "$set": {"state": CANCELED},
                            "$push": {"statehistory": {"state": CANCELED, "timestamp": ts}},
                        },
                    )

                    # run state checks more frequently.
                    JOB_CHECK_INTERVAL = 3
                    time.sleep(1)
                    continue

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = time.time()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_id = self._session.uid
                        database_url = self._session.dburl

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        candidate_hosts = compute_pilot["description"]["candidate_hosts"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_launch_method = resource_cfg.get("agent_launch_method")
                        agent_dburl = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        rc_agent_config = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap_1 = resource_cfg.get("pre_bootstrap_1")
                        pre_bootstrap_2 = resource_cfg.get("pre_bootstrap_2")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")
                        cores_per_node = resource_cfg.get("cores_per_node")
                        shared_filesystem = resource_cfg.get("shared_filesystem", True)
                        health_check = resource_cfg.get("health_check", True)
                        python_dist = resource_cfg.get("python_dist")
                        cu_pre_exec = resource_cfg.get("cu_pre_exec")
                        cu_post_exec = resource_cfg.get("cu_post_exec")
                        export_to_cu = resource_cfg.get("export_to_cu")

                        # Agent configuration that is not part of the public API.
                        # The agent config can either be a config dict, or
                        # a string pointing to a configuration name.  If neither
                        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                        # set.  The last fallback is 'agent_default'
                        agent_config = compute_pilot["description"].get("_config")
                        if not agent_config:
                            agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG")
                        if not agent_config:
                            agent_config = rc_agent_config

                        if isinstance(agent_config, dict):
                            # nothing to do
                            agent_cfg_dict = agent_config
                            pass

                        elif isinstance(agent_config, basestring):
                            try:
                                if os.path.exists(agent_config):
                                    # try to open as file name
                                    logger.info("Read agent config file: %s" % agent_config)
                                    agent_cfg_dict = ru.read_json(agent_config)
                                else:
                                    # otherwise interpret as a config name
                                    module_path = os.path.dirname(os.path.abspath(__file__))
                                    config_path = "%s/../configs/" % module_path
                                    agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config)
                                    logger.info("Read agent config file: %s" % agent_cfg_file)
                                    agent_cfg_dict = ru.read_json(agent_cfg_file)
                                # no matter how we read the config file, we
                                # allow for user level overload
                                cfg_base = os.path.basename(agent_cfg_file)
                                user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base)
                                if os.path.exists(user_cfg):
                                    logger.info("merging user config: %s" % user_cfg)
                                    user_cfg_dict = ru.read_json(user_cfg)
                                    ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite")
                            except Exception as e:
                                logger.exception("Error reading agent config file: %s" % e)
                                raise

                        else:
                            # we can't handle this type
                            raise TypeError("agent config must be string (filename) or dict")

                        # TODO: use booleans all the way?
                        if stage_cacerts.lower() == "true":
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # Create a host:port string for use by the bootstrap_1.
                        db_url = saga.Url(agent_dburl)
                        if db_url.port:
                            db_hostport = "%s:%d" % (db_url.host, db_url.port)
                        else:
                            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                        # Open the remote sandbox
                        # TODO: make conditional on shared_fs?
                        sandbox_tgt = saga.filesystem.Directory(
                            pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS
                        )

                        LOCAL_SCHEME = "file"

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.
                        # This also creates the sandbox.
                        BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path))

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, sandbox_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        if shared_filesystem:
                            sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]:

                                sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path))
                                msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))
                                if shared_filesystem:
                                    sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url)))

                        # ------------------------------------------------------
                        # Some machines cannot run pip due to outdated CA certs.
                        # For those, we also stage an updated certificate bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_url = saga.Url("%s://localhost/%s" % (LOCAL_SCHEME, cc_path))
                            msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (cc_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))
                            if shared_filesystem:
                                sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url)))

                        # ------------------------------------------------------
                        # sanity checks
                        if not python_dist:
                            raise RuntimeError("missing python distribution")
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not agent_launch_method:
                            raise RuntimeError("missing agentlaunch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = "luve"  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            if virtenv_mode is not "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name])

                        # if cores_per_node is set (!= None), then we need to
                        # allocation full nodes, and thus round up
                        if cores_per_node:
                            cores_per_node = int(cores_per_node)
                            number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node))

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -d '%s'" % sdists
                        bootstrap_args += " -m '%s'" % virtenv_mode
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -r '%s'" % rp_version
                        bootstrap_args += " -s '%s'" % session_id
                        bootstrap_args += " -v '%s'" % virtenv
                        bootstrap_args += " -b '%s'" % python_dist

                        # set optional args
                        if agent_type:
                            bootstrap_args += " -a '%s'" % agent_type
                        if lrms == "CCM":
                            bootstrap_args += " -c"
                        if pre_bootstrap_1:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1)
                        if pre_bootstrap_2:
                            bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -h '%s'" % db_hostport
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if tunnel_bind_device:
                            bootstrap_args += " -t '%s'" % tunnel_bind_device
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # set some agent configuration
                        agent_cfg_dict["cores"] = number_cores
                        agent_cfg_dict["resource_cfg"] = resource_cfg
                        agent_cfg_dict["debug"] = os.environ.get(
                            "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel()
                        )
                        agent_cfg_dict["mongodb_url"] = str(agent_dburl)
                        agent_cfg_dict["lrms"] = lrms
                        agent_cfg_dict["spawner"] = agent_spawner
                        agent_cfg_dict["scheduler"] = agent_scheduler
                        agent_cfg_dict["runtime"] = runtime
                        agent_cfg_dict["pilot_id"] = pilot_id
                        agent_cfg_dict["session_id"] = session_id
                        agent_cfg_dict["agent_launch_method"] = agent_launch_method
                        agent_cfg_dict["task_launch_method"] = task_launch_method
                        agent_cfg_dict["export_to_cu"] = export_to_cu
                        agent_cfg_dict["cu_pre_exec"] = cu_pre_exec
                        agent_cfg_dict["cu_post_exec"] = cu_post_exec
                        if mpi_launch_method:
                            agent_cfg_dict["mpi_launch_method"] = mpi_launch_method
                        if cores_per_node:
                            agent_cfg_dict["cores_per_node"] = cores_per_node

                        # ------------------------------------------------------
                        # Write agent config dict to a json file in pilot sandbox.

                        cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir")
                        agent_cfg_name = "agent_0.cfg"
                        cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name)
                        cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT)

                        # Convert dict to json file
                        msg = "Writing agent configuration to file '%s'." % cfg_tmp_file
                        logentries.append(Logentry(msg, logger=logger.debug))
                        ru.write_json(agent_cfg_dict, cfg_tmp_file)

                        cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file))
                        msg = "Copying agent configuration file '%s' to sandbox (%s)." % (cf_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))
                        if shared_filesystem:
                            sandbox_tgt.copy(cf_url, agent_cfg_name)

                        # Close agent config file
                        os.close(cfg_tmp_handle)

                        # ------------------------------------------------------
                        # Done with all transfers to pilot sandbox, close handle
                        sandbox_tgt.close()

                        # ------------------------------------------------------
                        # now that the scripts are in place and configured,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "bootstrap_1.out"
                        jd.error = "bootstrap_1.err"
                        jd.total_cpu_count = number_cores
                        jd.processes_per_host = cores_per_node
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue
                        jd.candidate_hosts = candidate_hosts
                        jd.environment = dict()

                        # TODO: not all files might be required, this also needs to be made conditional
                        if not shared_filesystem:
                            jd.file_transfer = [
                                #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)),
                                "%s > %s"
                                % (
                                    bootstrapper_path,
                                    os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)),
                                ),
                                "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)),
                                #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'),
                                #'%s < %s' % ('agent.log', 'agent.log'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'),
                                #'%s < %s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR')
                                #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR')
                                # TODO: This needs to go into a per pilot directory on the submit node
                                "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"),
                            ]

                            if stage_sdist:
                                jd.file_transfer.extend(
                                    [
                                        #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)),
                                        "%s > %s"
                                        % (
                                            rp_sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(rp_sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)),
                                        "%s > %s"
                                        % (
                                            saga.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(saga.sdist_path)
                                            ),
                                        ),
                                        #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)),
                                        "%s > %s"
                                        % (
                                            ru.sdist_path,
                                            os.path.join(
                                                jd.working_directory, "input", os.path.basename(ru.sdist_path)
                                            ),
                                        ),
                                    ]
                                )

                            if stage_cacerts:
                                jd.file_transfer.append(
                                    "%s > %s"
                                    % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path)))
                                )

                            if "RADICAL_PILOT_PROFILE" in os.environ:
                                # TODO: This needs to go into a per pilot directory on the submit node
                                jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz"))

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE"

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        try:
                            pilotjob = js.create_job(jd)
                        except saga.BadParameter as e:
                            raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e))
                        pilotjob.run()

                        # Clean up agent config file and dir after submission
                        os.unlink(cfg_tmp_file)
                        os.rmdir(cfg_tmp_dir)

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = time.time()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": LAUNCHING},
                            {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id,
                                    "health_check_enabled": health_check,
                                    "agent_config": agent_cfg_dict,
                                },
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = time.time()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(str(le.message))

                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()
if __name__ == '__main__':

    # The session directory is the one and only command line argument.
    if len(sys.argv) != 2:
        print("\n\tusage: %s <dir>\n" % sys.argv[0])
        sys.exit(1)

    loc = sys.argv[1]

    # Locate the json file in that directory: exactly one is expected, and
    # its base name without the '.json' suffix is used as the session id.
    json_files = glob.glob('%s/*.json' % loc)

    if not json_files:
        raise ValueError('%s contains no json file!' % loc)
    if len(json_files) > 1:
        raise ValueError('%s contains more than one json file!' % loc)

    json_file = json_files[0]
    json      = ru.read_json(json_file)
    sid       = os.path.basename(json_file)[:-len('.json')]

    print('sid: %s' % sid)

    # Fetch the session description and profile for that session id from the
    # same directory, and create the session object from both.
    descr   = rp.utils.get_session_description(sid=sid, src=loc)
    prof    = rp.utils.get_session_profile(sid=sid, src=loc)
    session = ra.Session(prof, descr)

    # A formatting helper before starting...
    def ppheader(message):
        """Print `message` framed by two 78-character dashed rules."""
        rule = '\n' + '-' * 78 + '\n'
        print(rule + message + rule)
    # and here we go. Once we filter our session object so to keep only the
Ejemplo n.º 41
0
    random.shuffle(sequence)

    return sequence


# =============================================================================
# EXPERIMENT
# =============================================================================
if __name__ == '__main__':

    # The path to the json configuration file is a mandatory argument.
    if len(sys.argv) < 2:
        print("\n\n\tusage: %s <config.json>\n\n" % sys.argv[0])
        sys.exit(-1)

    # Load the configuration from the file named on the command line.
    cfg = ru.read_json(sys.argv[1])

    # Mirror selected nested settings under flat top-level keys.
    # TODO: Rename aimes.emgr config keys.
    cfg["skeleton_template"] = cfg["skeleton"]["template"]
    cfg["pct_concurrency"]   = cfg["strategy"]["pct_concurrency"]
    cfg["pct_resources"]     = cfg["strategy"]["pct_resources"]
    cfg["recipients"]        = cfg["log"]["email"]["recipients"]

    # Copy the task duration bounds ("max" first, then "min") into a flat
    # entry of their own.
    # TODO: Override with json skeleton config entries.
    cfg['skeleton_task_duration'] = {
        bound: cfg["skeleton"]["tasks"]["duration"][bound]
        for bound in ("max", "min")}

    # cfg['bundle_resources']   = {'hopper.nersc.gov'          : 'pbs',
    #                              'stampede.tacc.xsede.org'   : 'slurm'}
    #                              'gordon.sdsc.xsede.org'     : 'pbs'}
Ejemplo n.º 42
0
    def _prepare_pilot(self, resource, rcfg, pilot):

        pid = pilot["uid"]
        ret = {'ft' : list(),
               'jd' : None  }

      # # ----------------------------------------------------------------------
      # # the rcfg can contain keys with string expansion placeholders where
      # # values from the pilot description need filling in.  A prominent
      # # example is `%(pd.project)s`, where the pilot description's `PROJECT`
      # # value needs to be filled in (here in lowercase).
      # expand = dict()
      # for k,v in pilot['description'].iteritems():
      #     if v is None:
      #         v = ''
      #     expand['pd.%s' % k] = v
      #     if isinstance(v, basestring):
      #         expand['pd.%s' % k.upper()] = v.upper()
      #         expand['pd.%s' % k.lower()] = v.lower()
      #     else:
      #         expand['pd.%s' % k.upper()] = v
      #         expand['pd.%s' % k.lower()] = v
      #
      # for k in rcfg:
      #     if isinstance(rcfg[k], basestring):
      #         orig     = rcfg[k]
      #         rcfg[k]  = rcfg[k] % expand
      #         expanded = rcfg[k]
      #         if orig != expanded:
      #             self._log.debug('RCFG:\n%s\n%s', orig, expanded)

        # ----------------------------------------------------------------------
        # Database connection parameters
        sid           = self._session.uid
        database_url  = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ----------------------------------------------------------------------
        # pilot description and resource configuration
        number_cores    = pilot['description']['cores']
        number_gpus     = pilot['description']['gpus']
        runtime         = pilot['description']['runtime']
        queue           = pilot['description']['queue']
        project         = pilot['description']['project']
        cleanup         = pilot['description']['cleanup']
        memory          = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # ----------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method     = rcfg.get('agent_launch_method')
        agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner           = rcfg.get('agent_spawner',       DEFAULT_AGENT_SPAWNER)
        rc_agent_config         = rcfg.get('agent_config',        DEFAULT_AGENT_CONFIG)
        agent_scheduler         = rcfg.get('agent_scheduler')
        tunnel_bind_device      = rcfg.get('tunnel_bind_device')
        default_queue           = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms                    = rcfg.get('lrms')
        mpi_launch_method       = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_0         = rcfg.get('pre_bootstrap_0', [])
        pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
        python_interpreter      = rcfg.get('python_interpreter')
        task_launch_method      = rcfg.get('task_launch_method')
        rp_version              = rcfg.get('rp_version',          DEFAULT_RP_VERSION)
        virtenv_mode            = rcfg.get('virtenv_mode',        DEFAULT_VIRTENV_MODE)
        virtenv                 = rcfg.get('virtenv',             default_virtenv)
        cores_per_node          = rcfg.get('cores_per_node', 0)
        gpus_per_node           = rcfg.get('gpus_per_node',  0)
        lfs_path_per_node       = rcfg.get('lfs_path_per_node', None)
        lfs_size_per_node       = rcfg.get('lfs_size_per_node',  0)
        python_dist             = rcfg.get('python_dist')
        virtenv_dist            = rcfg.get('virtenv_dist',        DEFAULT_VIRTENV_DIST)
        cu_tmp                  = rcfg.get('cu_tmp')
        spmd_variation          = rcfg.get('spmd_variation')
        shared_filesystem       = rcfg.get('shared_filesystem', True)
        stage_cacerts           = rcfg.get('stage_cacerts', False)
        cu_pre_exec             = rcfg.get('cu_pre_exec')
        cu_post_exec            = rcfg.get('cu_post_exec')
        export_to_cu            = rcfg.get('export_to_cu')
        mandatory_args          = rcfg.get('mandatory_args', [])
        saga_jd_supplement      = rcfg.get('saga_jd_supplement', {})

        import pprint
        self._log.debug(cores_per_node)
        self._log.debug(pprint.pformat(rcfg))

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise  ValueError('attribute "%s" is required for "%s"'
                                 % (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox (pilot).path
        session_sandbox  = self._session._get_session_sandbox(pilot).path
        pilot_sandbox    = self._session._get_pilot_sandbox  (pilot).path

        pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox']   = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s",  agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge (agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError('agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                             'session_sandbox' : session_sandbox,
                             'resource_sandbox': resource_sandbox}

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

        # Create a host:port string for use by the bootstrap_0.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ----------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #      
        # examples   :
        #   virtenv@v0.20
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # An rp_version which is neither '@<token>' nor one
        # of 'installed', 'local', 'debug' or 'release' is
        # rejected with a ValueError.

        if  not rp_version.startswith('@') and \
            not rp_version in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version  = rp_version[1:]  # strip '@'


        # ----------------------------------------------------------------------
        # sanity checks
        if not python_dist        : raise RuntimeError("missing python distribution")
        if not virtenv_dist       : raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner      : raise RuntimeError("missing agent spawner")
        if not agent_scheduler    : raise RuntimeError("missing agent scheduler")
        if not lrms               : raise RuntimeError("missing LRMS")
        if not agent_launch_method: raise RuntimeError("missing agentlaunch method")
        if not task_launch_method : raise RuntimeError("missing task launch method")

        # massage some values
        if not queue :
            queue = default_queue

        if  cleanup and isinstance (cleanup, bool) :
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and 
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            if virtenv_mode is not 'private' :
                cleanup = cleanup.replace ('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores   = int(cores_per_node
                           * math.ceil(float(number_cores) / cores_per_node))

        # if gpus_per_node is set (!= None), then we need to
        # allocate full nodes, and thus round up
        if gpus_per_node:
            gpus_per_node = int(gpus_per_node)
            number_gpus   = int(gpus_per_node
                           * math.ceil(float(number_gpus) / gpus_per_node))

        # set mandatory args
        bootstrap_args  = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM":           bootstrap_args += " -c"
        if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:      bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:      bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup:                 bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_0:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_1:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner']              = 'agent_0'
        agent_cfg['cores']              = number_cores
        agent_cfg['gpus']               = number_gpus
        agent_cfg['lrms']               = lrms
        agent_cfg['spawner']            = agent_spawner
        agent_cfg['scheduler']          = agent_scheduler
        agent_cfg['runtime']            = runtime
        agent_cfg['dburl']              = str(database_url)
        agent_cfg['session_id']         = sid
        agent_cfg['pilot_id']           = pid
        agent_cfg['logdir']             = '.'
        agent_cfg['pilot_sandbox']      = pilot_sandbox
        agent_cfg['session_sandbox']    = session_sandbox
        agent_cfg['resource_sandbox']   = resource_sandbox
        agent_cfg['agent_launch_method']= agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method']  = mpi_launch_method
        agent_cfg['cores_per_node']     = cores_per_node
        agent_cfg['gpus_per_node']      = gpus_per_node
        agent_cfg['lfs_path_per_node']  = lfs_path_per_node
        agent_cfg['lfs_size_per_node']  = lfs_size_per_node
        agent_cfg['cu_tmp']             = cu_tmp
        agent_cfg['export_to_cu']       = export_to_cu
        agent_cfg['cu_pre_exec']        = cu_pre_exec
        agent_cfg['cu_post_exec']       = cu_post_exec
        agent_cfg['resource_cfg']       = copy.deepcopy(rcfg)
        agent_cfg['debug']              = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ----------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({'src' : cfg_tmp_file, 
                          'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name),
                          'rem' : True})  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                          'src' : '/dev/null',
                          'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                          'rem' : False})  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({'src' : sdist, 
                                      'tgt' : '%s/%s' % (session_sandbox, base),
                                      'rem' : False})

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath("%s/agent/%s"
                                  % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({'src' : bootstrapper_path, 
                                  'tgt' : '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                                  'rem' : False})

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({'src' : cc_path, 
                                      'tgt' : '%s/%s' % (session_sandbox, cc_name),
                                      'rem' : False})

                self._sandboxes[resource] = True


        # ----------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name                  = pid
        jd.executable            = "/bin/bash"
        jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory     = pilot_sandbox
        jd.project               = project
        jd.output                = "bootstrap_0.out"
        jd.error                 = "bootstrap_0.err"
        jd.total_cpu_count       = number_cores
        jd.total_gpu_count       = number_gpus
        jd.processes_per_host    = cores_per_node
        jd.spmd_variation        = spmd_variation
        jd.wall_time_limit       = runtime
        jd.total_physical_memory = memory
        jd.queue                 = queue
        jd.candidate_hosts       = candidate_hosts
        jd.environment           = dict()

        # we set any saga_jd_supplement keys which are not already set above
        for key, val in saga_jd_supplement.iteritems():
            if not jd[key]:
                self._log.debug('supplement %s: %s', key, val)
                jd[key] = val

        if 'RADICAL_PILOT_PROFILE' in os.environ :
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' % (pilot_sandbox,   agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, sdist, sdist)
                ])

            if stage_cacerts:
                jd.file_transfer.extend([
                    'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)
                ])

        self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments)

        ret['jd'] = jd
        return ret
    def __init__(self, session):
        """
        Set up a new PilotManager and attach it to the given session.

        **Arguments:**
            * session [:class:`radical.pilot.Session`]:
              The session instance this manager registers with.

        **Returns:**
            * A new `PilotManager` object [:class:`radical.pilot.PilotManager`].

        **Raises:**
            * `AssertionError` if the loaded manager config carries no
              (truthy) `db_poll_sleeptime` entry.
        """

        # plain bookkeeping state
        self._closed      = False
        self._rec_id      = 0       # used for session recording
        self._bridges     = dict()
        self._components  = dict()
        self._pilots      = dict()
        self._callbacks   = dict()

        # synchronization primitives
        self._pilots_lock = mt.RLock()
        self._pcb_lock    = mt.RLock()
        self._terminate   = mt.Event()

        # every known metric gets its own (initially empty) callback registry
        for metric in rpt.PMGR_METRICS:
            self._callbacks[metric] = dict()

        # the config file lives next to this module; its flavor is selected
        # via $RADICAL_PILOT_PMGR_CFG and falls back to 'default'
        cfg_dir    = os.path.dirname(__file__)
        cfg_flavor = os.environ.get('RADICAL_PILOT_PMGR_CFG', 'default')
        cfg        = ru.read_json("%s/configs/pmgr_%s.json"
                                  % (cfg_dir, cfg_flavor))

        assert cfg['db_poll_sleeptime'], 'db_poll_sleeptime not configured'

        # initialize the base class (with no intent to fork).  The uid has to
        # exist before the base class init, as it is stored as config owner.
        self._uid    = ru.generate_id('pmgr')
        cfg['owner'] = self.uid
        rpu.Component.__init__(self, cfg, session)
        self.start(spawn=False)

        # the reporter is only used after the base class has been set up
        self._rep.info('<<create pilot manager')

        # submitted pilots are forwarded to the launching component through
        # this output queue
        self.register_output(rps.PMGR_LAUNCHING_PENDING,
                             rpc.PMGR_LAUNCHING_QUEUE)

        # completed staging directives are announced on the control pubsub
        self.register_subscriber(rpc.CONTROL_PUBSUB, self._staging_ack_cb)
        self._active_sds = dict()
        self._sds_lock   = mt.Lock()

        # periodically pull state notifications
        # FIXME: we may want to have the frequency configurable
        # FIXME: this should be a tailing cursor in the update worker
        poll_interval = self._cfg['db_poll_sleeptime']
        self.register_timed_cb(self._state_pull_cb, timer=poll_interval)

        # pilot state changes also arrive via the state pubsub
        self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

        # let session know we exist
        self._session._register_pmgr(self)

        self._prof.prof('setup_done', uid=self._uid)
        self._rep.ok('>>ok\n')
    elif scheduler == 'round_robin': scheduler = rp.SCHED_ROUND_ROBIN
    else                           : scheduler = rp.SCHED_ROUND_ROBIN
    
    if not n_cores  : raise ValueError ("need number of cores")
    if not n_units  : raise ValueError ("need number of units")
    if not runtime  : raise ValueError ("need pilot runtime")
    if not resources: raise ValueError ("need target resource")
    if not load     : raise ValueError ("need load config")
    if not agent_cfg: raise ValueError ("need agent config")

    if not queue    : queue = None
    
    resources = resources.split(',')

    for resource in resources:
        if not resource in RESOURCES:
            raise ValueError ("unknown resource %s" % resource)

    cu_load = ru.read_json (load)

    n_cores = int(n_cores)
    n_units = int(n_units)
    runtime = int(runtime)

    sid = run_experiment (n_cores, n_units, resources, runtime, cu_load,
            agent_cfg, scheduler, queue)

    with open('last.sid', 'w') as f:
        f.write("%s\n" % sid)

Ejemplo n.º 45
0
if __name__ == "__main__":

    # we use a reporter class for nicer output
    report = ru.Reporter("Getting Started")

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    # all other pilot code is now tried/excepted.  If an exception is caught, we
    # can rely on the session object to exist and be valid, and we can thus tear
    # the whole RP stack down via a 'session.close()' call in the 'finally'
    # clause...
    try:
        report.info('read configs')
        resources = ru.read_json('%s/config.json', os.path.dirname(__file__))
        report.ok('\\ok\n')

        report.header('submit pilots')

        # prepare some input files for the compute units
        os.system ('hostname > file1.dat')
        os.system ('date     > file2.dat')

        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define an [n]-core local pilot that runs for [x] minutes
        pdescs = list()
        for resource in sys.argv[1:]:
            pdesc = rp.ComputePilotDescription()
Ejemplo n.º 46
0
    elif len(sys.argv) == 2: resource = sys.argv[1]
    else                   : resource = 'local.localhost'

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    # all other pilot code is now tried/excepted.  If an exception is caught, we
    # can rely on the session object to exist and be valid, and we can thus tear
    # the whole RP stack down via a 'session.close()' call in the 'finally'
    # clause...
    try:

        # read the config used for resource details
        report.info('read config')
        config = ru.read_json('%s/config.json' % os.path.dirname(os.path.abspath(__file__)))
        report.ok('>>ok\n')

        report.header('submit pilots')

        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define an [n]-core local pilot that runs for [x] minutes
        # Here we use a dict to initialize the description object
        report.info('create pilot description')
        pd_init = {
                'resource'      : resource,
                'cores'         : 64,  # pilot size
                'runtime'       : 15,  # pilot runtime (min)
                'exit_on_error' : True,
Ejemplo n.º 47
0
def get_session_description(sid, src=None, dburl=None):
    """
    This will return a description which is usable for radical.analytics
    evaluation.  It informs about
      - set of stateful entities
      - state models of those entities
      - event models of those entities (maybe)
      - configuration of the application / module

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.

    if `dburl` is given, its value is used to fetch session information from
    a database.  The dburl value defaults to `RADICAL_PILOT_DBURL`.

    The session dump `<src>/<sid>.json` is used if it exists; otherwise the
    session is fetched via `fetch_json` and the resulting file is read.

    Returns a dict with the keys
      - 'entities': state / event models for 'pilot', 'unit' and 'session'
      - 'tree'    : uid -> entity record ('uid', 'etype', 'cfg', 'has',
                    'children', and 'description' where available)
      - 'config'  : currently always an empty dict

    Raises `AssertionError` if the loaded session uid does not match `sid`,
    and `KeyError` if the dump lacks an expected section or refers to an
    unknown manager / pilot uid.
    """

    from radical.pilot import states as rps
    from .session      import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    fname = '%s/%s.json' % (src, sid)
    if os.path.isfile(fname):
        data = ru.read_json(fname)
    else:
        ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
        data = ru.read_json(ftmp)

    # make sure we have uids
    # FIXME v0.47: deprecate
    def fix_uids(node):
        # recursively alias the legacy keys ('unitmanager', 'pilotmanager',
        # '_id') to their current names, without overwriting existing values
        if isinstance(node, list):
            for elem in node:
                fix_uids(elem)
        elif isinstance(node, dict):
            if 'unitmanager' in node and 'umgr' not in node:
                node['umgr'] = node['unitmanager']
            if 'pilotmanager' in node and 'pmgr' not in node:
                node['pmgr'] = node['pilotmanager']
            if '_id' in node and 'uid' not in node:
                node['uid'] = node['_id']
                if 'cfg' not in node:
                    node['cfg'] = dict()
            # .values() works on python 2 and 3 alike (iteritems does not)
            for val in node.values():
                fix_uids(val)
    fix_uids(data)

    assert sid == data['session']['uid'], 'sid inconsistent'

    ret             = dict()
    ret['entities'] = dict()

    tree      = dict()
    tree[sid] = {'uid'      : sid,
                 'etype'    : 'session',
                 'cfg'      : data['session']['cfg'],
                 'has'      : ['umgr', 'pmgr'],
                 'children' : list()
                }

    for pmgr in sorted(data['pmgr'], key=lambda k: k['uid']):
        uid = pmgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'pmgr',
                     'cfg'      : pmgr['cfg'],
                     'has'      : ['pilot'],
                     'children' : list()
                    }

    for umgr in sorted(data['umgr'], key=lambda k: k['uid']):
        uid = umgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'umgr',
                     'cfg'      : umgr['cfg'],
                     'has'      : ['unit'],
                     'children' : list()
                    }
        # unit managers get an empty description entry
        tree[uid]['description'] = dict()

    for pilot in sorted(data['pilot'], key=lambda k: k['uid']):
        uid  = pilot['uid']
        pmgr = pilot['pmgr']
        # expose the resource details as part of the pilot config
        pilot['cfg']['resource_details'] = pilot['resource_details']
        tree[pmgr]['children'].append(uid)
        tree[uid] = {'uid'        : uid,
                     'etype'      : 'pilot',
                     'cfg'        : pilot['cfg'],
                     'description': pilot['description'],
                     'has'        : ['unit'],
                     'children'   : list()
                    }

    for unit in sorted(data['unit'], key=lambda k: k['uid']):
        uid  = unit['uid']
        umgr = unit['umgr']
        pid  = unit['pilot']
        # a unit is a child of both its unit manager and its pilot
        tree[umgr]['children'].append(uid)
        tree[pid ]['children'].append(uid)
        tree[uid] = {'uid'         : uid,
                     'etype'       : 'unit',
                     'cfg'         : unit,
                     'description' : unit['description'],
                     'has'         : list(),
                     'children'    : list()
                    }
        # remove duplicate: 'cfg' is the unit document itself, and the
        # description is already stored separately above
        del tree[uid]['cfg']['description']

    ret['tree'] = tree

    ret['entities']['pilot']   = {'state_model'  : rps._pilot_state_values,
                                  'state_values' : rps._pilot_state_inv_full,
                                  'event_model'  : dict()}
    ret['entities']['unit']    = {'state_model'  : rps._unit_state_values,
                                  'state_values' : rps._unit_state_inv_full,
                                  'event_model'  : dict()}
    ret['entities']['session'] = {'state_model'  : None,  # has no states
                                  'state_values' : None,
                                  'event_model'  : dict()}

    ret['config'] = dict()  # session config goes here

    return ret
Ejemplo n.º 48
0
    def _read_json (filename) :
        """
        Load the JSON document stored in `filename`, remove that file, and
        return the loaded data.

        The file is only removed after it has been read: if reading raises,
        the unlink is not reached and the file stays in place.
        """

        content = ru.read_json (filename)
        os.unlink (filename)

        return content
Ejemplo n.º 49
0
    elif len(sys.argv) == 2: resource = sys.argv[1]
    else                   : resource = 'local.localhost'

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    # all other pilot code is now tried/excepted.  If an exception is caught, we
    # can rely on the session object to exist and be valid, and we can thus tear
    # the whole RP stack down via a 'session.close()' call in the 'finally'
    # clause...
    if True:

        # read the config used for resource details
        report.info('read config')
        config = ru.read_json('config.json')
        report.ok('>>ok\n')

        report.header('submit pilots')

        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define an [n]-core local pilot that runs for [x] minutes
        # Here we use a dict to initialize the description object
        pd_init = {
                'resource'      : resource,
                'cores'         : 2,  # pilot size
                'runtime'       : 300, # pilot runtime (min)
                'project'       : config[resource]['project'],
                'queue'         : config[resource]['queue'],
Ejemplo n.º 50
0
def emulate(command=None, samples=None, src=None):
    """
    Emulate a workload which is specified in exactly one of three ways:

      - `command`: profiles are looked up via `rsu.get_profiles()`, and the
                   first profile found is emulated;
      - `src`    : path of a json file which contains a profile;
      - `samples`: a ready-made list of samples, where the second element
                   of each sample is the value the samples are ordered by.

    If the environment variable `RADICAL_SYNAPSE_WATCHMODE` is unset, empty,
    or set to 'none' / 'noop' (case insensitive), the emulation is only
    timed.  Otherwise it is run under `profile()`.

    Returns a tuple `(info, ret, None)`.  If no input is given at all,
    a warning is printed and the list `[0, None, None]` is returned.

    Raises `ValueError` if more than one of the inputs is given.
    """

    # the three input modes are mutually exclusive
    if sum(1 for arg in (command, samples, src) if arg) > 1:
        raise ValueError ("emulate needs *either* command, sample *or* src")

    if not command and not samples and not src:
        # call form of print is valid for python 2 and python 3
        print("warning: emulate needs either command, sample or src")
        return[0, None, None]

    if command or src:

        if command:
            profs = rsu.get_profiles (command, mode='pro')

            # FIXME: average vals over all retrieved profiles
            prof  = profs[0]['profile']

        else:
            prof = ru.read_json(src)

        # get time series to emulate (all types of operations are mixed)
        # FIXME: we should also sample walltime for _TIM.  As it is, mixing
        #        time and other samples will yield incorrect results due to
        #        mismatch in granularity.
        # FIXME: add network sample interpretation
        samples  = list()

        samples += [[_CPU, x[0], {'time'       : x[1].get('time',       0),
                                  'flops'      : x[1].get('ops',        0),
                                  'efficiency' : x[1].get('efficiency', 0)}]
                                  for x in prof['cpu']['sequence']]

        samples += [[_MEM, x[0], {'size'       : x[1].get('size',       0)}]
                                  for x in prof['mem']['sequence']]

        samples += [[_STO, x[0], {'src'        : x[1].get('src',        None),
                                  'rsize'      : x[1].get('read',       0),
                                  'tgt'        : x[1].get('tgt',        None),
                                  'wsize'      : x[1].get('write',      0)}]
                                  for x in prof['sto']['sequence']]

    # *globally* sort samples by time
    samples = sorted (samples, key=lambda x: x[1])

    watchmode = os.environ.get ('RADICAL_SYNAPSE_WATCHMODE')

    # NOTE: `lower` must be *called* -- comparing the bound method itself
    #       against the strings never matches
    if not watchmode or watchmode.lower() in ['none', 'noop']:
        start = time.time()
        _emulator(samples)
        stop  = time.time()

        ret   = None
        info  = dict()
        info['cmd']  = command
        info['time'] = dict()
        info['time']['start'] = start
        info['time']['real']  = stop-start

    else:
        # let the profiler know that we run an emulation, so that the profile is not
        # stored as 'application run'.
        os.environ['_RADICAL_SYNAPSE_EMULATED'] = 'TRUE'

        # environment values must be strings: `command` is None when we
        # emulate from `src` or `samples`, so only export it when it is set.
        # NOTE(review): confirm what the profiler expects in that case.
        if command:
            os.environ['_RADICAL_SYNAPSE_EMULATEE'] = command
        else:
            os.environ.pop('_RADICAL_SYNAPSE_EMULATEE', None)

        info, ret, _ = profile(_emulator, samples)

        cpu = info['cpu']
        if 'ops' in cpu:
            # force float arithmetic: with integer counters, python 2 would
            # otherwise truncate the ratio
            ops = float(cpu['ops'])
            cpu['efficiency'] = ops / ( ops
                                      + cpu['cycles_stalled_front']
                                      + cpu['cycles_stalled_back']
                                      )

    return (info, ret, None)
    if   scheduler == 'direct'     : scheduler = rp.SCHED_DIRECT
    elif scheduler == 'backfilling': scheduler = rp.SCHED_BACKFILLING
    elif scheduler == 'round_robin': scheduler = rp.SCHED_ROUND_ROBIN
    else                           : scheduler = rp.SCHED_ROUND_ROBIN
    
    if not n_cores  : raise ValueError ("need number of cores")
    if not n_units  : raise ValueError ("need number of units")
    if not runtime  : raise ValueError ("need pilot runtime")
    if not resources: raise ValueError ("need target resource")
    if not load     : raise ValueError ("need load config")
    if not agent    : raise ValueError ("need agent config")
    
    resources = resources.split(',')

    for resource in resources:
        if not resource in RESOURCES:
            raise ValueError ("unknown resource %s" % resource)
    
    cu_load      = ru.read_json (load)
    agent_config = ru.read_json (agent)

    n_cores = int(n_cores)
    n_units = int(n_units)
    runtime = int(runtime)

    sid = run_experiment (n_cores, n_units, resources, runtime, cu_load,
            agent_config, scheduler, queue)

    print "session id: %s" % sid

Ejemplo n.º 52
0
 def __init__(self, uid, cfg, dbs):
     """
     Store `uid` and `dbs` on the instance, and load the instance config
     from the json file named by `credential['config']`.

     NOTE(review): the `cfg` argument is not used here, and `credential` is
     not defined within this method -- presumably a module level name;
     confirm that reading the config from `credential` is intended.
     """
     self.uid  = uid

     cfg_path  = credential['config']
     self._cfg = ru.read_json (cfg_path)

     self._dbs = dbs
Ejemplo n.º 53
0
    if   len(sys.argv)  > 2: report.exit('Usage:\t%s [resource]\n\n' % sys.argv[0])
    elif len(sys.argv) == 2: resource = sys.argv[1]
    else                   : resource = 'local.localhost'

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    # all other pilot code is now tried/excepted.  If an exception is caught, we
    # can rely on the session object to exist and be valid, and we can thus tear
    # the whole RP stack down via a 'session.close()' call in the 'finally'
    # clause...
    try:
        # read the config used for resource details
        report.info('read config')
        config = ru.read_json('%s/../config.json' % PWD)
        report.ok('>>ok\n')

        report.header('submit pilots')

        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Define an [n]-core local pilot that runs for [x] minutes
        # Here we use a dict to initialize the description object
        pd_init = {
                      'resource'      : resource,
                      'runtime'       : 15,  # pilot runtime (min)
                      'exit_on_error' : True,
                      'project'       : config[resource]['project'],
                      'queue'         : config[resource]['queue'],
Ejemplo n.º 54
0
def get_session_description(sid, src=None, dburl=None):
    """
    This will return a description which is usable for radical.analytics
    evaluation.  It informs about
      - set of stateful entities
      - state models of those entities
      - event models of those entities (maybe)
      - configuration of the application / module

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.

    if `dburl` is given, its value is used to fetch session information from
    a database.  The dburl value defaults to `RADICAL_PILOT_DBURL`.

    The session is always obtained via `fetch_json` (which is asked to skip
    existing data), and the resulting file is read.

    Returns a dict with the keys
      - 'entities': state / event models for 'pilot', 'unit' and 'session'
      - 'tree'    : uid -> entity record ('uid', 'etype', 'has', 'children')
      - 'config'  : currently always an empty dict

    Raises `AssertionError` if the loaded session uid does not match `sid`,
    and `KeyError` if the dump lacks an expected section or refers to an
    unknown manager / pilot uid.
    """

    from radical.pilot import states as rps
    from .session      import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
    data = ru.read_json(ftmp)

    # make sure we have uids
    def fix_uids(node):
        # recursively alias the legacy keys ('unitmanager', 'pilotmanager',
        # '_id') to their current names, without overwriting existing values
        if isinstance(node, list):
            for elem in node:
                fix_uids(elem)
        elif isinstance(node, dict):
            if 'unitmanager' in node and 'umgr' not in node:
                node['umgr'] = node['unitmanager']
            if 'pilotmanager' in node and 'pmgr' not in node:
                node['pmgr'] = node['pilotmanager']
            if '_id' in node and 'uid' not in node:
                node['uid'] = node['_id']
                if 'cfg' not in node:
                    node['cfg'] = dict()
            # .values() works on python 2 and 3 alike (iteritems does not)
            for val in node.values():
                fix_uids(val)
    fix_uids(data)

    # NOTE: no debug dump / debug print of the session data here: a library
    #       call should neither write to a fixed path in /tmp nor to stdout

    assert sid == data['session']['uid'], 'sid inconsistent'

    ret             = dict()
    ret['entities'] = dict()

    tree      = dict()
    tree[sid] = {'uid'      : sid,
                 'etype'    : 'session',
                 'has'      : ['umgr', 'pmgr'],
                 'children' : list()
                }

    for pmgr in sorted(data['pmgr'], key=lambda k: k['uid']):
        uid = pmgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'pmgr',
                     'has'      : ['pilot'],
                     'children' : list()
                    }

    for umgr in sorted(data['umgr'], key=lambda k: k['uid']):
        uid = umgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'umgr',
                     'has'      : ['unit'],
                     'children' : list()
                    }

    for pilot in sorted(data['pilot'], key=lambda k: k['uid']):
        uid  = pilot['uid']
        pmgr = pilot['pmgr']
        tree[pmgr]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'pilot',
                     'has'      : ['unit'],
                     'children' : list()
                    }

    for unit in sorted(data['unit'], key=lambda k: k['uid']):
        uid  = unit['uid']
        umgr = unit['umgr']
        pid  = unit['pilot']
        # a unit is a child of both its unit manager and its pilot
        tree[umgr]['children'].append(uid)
        tree[pid ]['children'].append(uid)
        tree[uid] = {'uid'      : uid,
                     'etype'    : 'unit',
                     'has'      : list(),
                     'children' : list()
                    }

    ret['tree'] = tree

    ret['entities']['pilot'] = {
            'state_model'  : rps.pilot_state_by_value,
            'state_values' : rps.pilot_state_value,
            'event_model'  : dict(),
            }

    ret['entities']['unit'] = {
            'state_model'  : rps.unit_state_by_value,
            'state_values' : rps.unit_state_value,
            'event_model'  : dict(),
            }

    ret['entities']['session'] = {
            'state_model'  : None, # session has no states, only events
            'state_values' : None,
            'event_model'  : dict(),
            }

    ret['config'] = dict() # magic to get session config goes here

    return ret
    def __init__(self, session, scheduler=None):
        """
        Set up a new UnitManager and attach it to the given session.

        **Arguments:**
            * session [:class:`radical.pilot.Session`]:
              The session instance this manager registers with.
            * scheduler (`string`): 
              The name of the scheduler plug-in to use.  If given, it
              overrides the scheduler named in the manager config; if
              neither names one, `rpus.SCHEDULER_DEFAULT` is used.

        **Returns:**
            * A new `UnitManager` object [:class:`radical.pilot.UnitManager`].

        **Raises:**
            * `AssertionError` if the loaded manager config carries no
              (truthy) `db_poll_sleeptime` entry.
        """

        # plain bookkeeping state
        self._closed      = False
        self._rec_id      = 0       # used for session recording
        self._bridges     = dict()
        self._components  = dict()
        self._pilots      = dict()
        self._units       = dict()
        self._callbacks   = dict()

        # synchronization primitives
        self._pilots_lock = threading.RLock()
        self._units_lock  = threading.RLock()
        self._cb_lock     = threading.RLock()
        self._terminate   = threading.Event()

        # every known metric gets its own (initially empty) callback registry
        for metric in rpt.UMGR_METRICS:
            self._callbacks[metric] = dict()

        # the config file lives next to this module; its flavor is selected
        # via $RADICAL_PILOT_UMGR_CFG and falls back to 'default'
        cfg_dir    = os.path.dirname(__file__)
        cfg_flavor = os.environ.get('RADICAL_PILOT_UMGR_CFG', 'default')
        cfg        = ru.read_json("%s/configs/umgr_%s.json"
                                  % (cfg_dir, cfg_flavor))

        # an explicitly requested scheduler wins over the config file
        if scheduler:
            cfg['scheduler'] = scheduler

        # no scheduler from either source: use the default one
        if not cfg.get('scheduler'):
            cfg['scheduler'] = rpus.SCHEDULER_DEFAULT

        assert cfg['db_poll_sleeptime'], 'db_poll_sleeptime not configured'

        # initialize the base class (with no intent to fork).  The uid has to
        # exist before the base class init, as it is stored as config owner.
        self._uid    = ru.generate_id('umgr')
        cfg['owner'] = self.uid
        rpu.Component.__init__(self, cfg, session)
        self.start(spawn=False)
        self._log.info('started umgr %s', self._uid)

        # the reporter is only used after the base class has been set up
        self._rep.info('<<create unit manager')

        # submitted units are forwarded to the scheduling component through
        # this output queue
        self.register_output(rps.UMGR_SCHEDULING_PENDING, 
                             rpc.UMGR_SCHEDULING_QUEUE)

        # units which come back from the agent are passed on for output
        # staging and finalization
        self.register_output(rps.UMGR_STAGING_OUTPUT_PENDING, 
                             rpc.UMGR_STAGING_OUTPUT_QUEUE)

        # two timed callbacks, registered in this order: the first pulls state
        # notifications, the second pulls units back from the agent
        # FIXME: this should be a tailing cursor in the update worker
        for pull_cb in (self._state_pull_cb, self._unit_pull_cb):
            self.register_timed_cb(pull_cb,
                                   timer=self._cfg['db_poll_sleeptime'])

        # unit state changes also arrive via the state pubsub
        self.register_subscriber(rpc.STATE_PUBSUB, self._state_sub_cb)

        # let session know we exist
        self._session._register_umgr(self)

        self._prof.prof('setup_done', uid=self._uid)
        self._rep.ok('>>ok\n')
Ejemplo n.º 56
0
def get_config(params):
    """
    Collect configuration settings from one or more sources and merge them
    into a single dict.

    `params` is either a single source or a list of sources.  Each source
    is handled according to its type:

      - any false value (`None`, empty string, empty dict, ...) is silently
        ignored, so that callers do not need to filter their arguments;
      - a dict is merged into the result as is;
      - a string is first looked up in `os.environ`: if it names an
        environment variable, the variable's value replaces the string.
        The string is then interpreted as a directory (every entry matched
        by `<dir>/*` is read as a config file, in sorted order), or as
        a single config file.  A string which is neither is reported with
        a warning and contributes nothing.

    Config files are read via `ru.read_json`.  Sources are merged in the
    order given with policy `overwrite`, and the merged dict is passed
    through `ru.dict_stringexpand` before it is returned.

    Raises `TypeError` for a source which is neither dict nor string, and
    re-raises any exception which occurs while reading a config file.
    """

    ret = dict()

    # always make params a list for simpler code below
    if not isinstance(params, list):
        params = [params]

    for param in params:

        if not param:
            # we silently accept None's (and other empty values), to save
            # some repetitive checks on the calling side
            continue

        if isinstance(param, dict):
            # simply merge it into the result
            ru.dict_merge(ret, param, policy='overwrite')
            continue

        if not isinstance(param, basestring):
            raise TypeError("get_config parameter must be (list of) dict or "
                            "string, not %s" % type(param))

        # check if the string points to an env variable -- if so, assume
        # that the value of the env var is what we really want
        if param in os.environ:
            param = os.environ[param]

        # is string, is not env, must be a dir or a file
        if os.path.isdir(param):
            # config dir: glob order is arbitrary, but the merge order
            # decides which file wins on conflicting keys -- so sort to keep
            # the result reproducible
            cfg_files = sorted(glob.glob("%s/*" % param))

        elif os.path.isfile(param):
            # single config file
            cfg_files = [param]

        else:
            troy._logger.warning("cannot handle config location %s" % param)
            cfg_files = list()

        troy._logger.debug("config files: %s" % cfg_files)

        # read and merge all config files
        for cfg_file in cfg_files:

            troy._logger.info("reading  config in %s" % cfg_file)

            try:
                cfg_dict = ru.read_json(cfg_file)
            except Exception as e:
                # a broken config file is fatal: log the culprit, then let
                # the original exception propagate to the caller
                troy._logger.critical("cannot read config in %s (%s)"
                                      % (cfg_file, e))
                raise

            ru.dict_merge(ret, cfg_dict, policy='overwrite')

    # expand config(s) before returning
    ru.dict_stringexpand(ret)

    return ret