def write_sub_configs(cfg, bridges, nodeip, log):
    """
    Create a sub-agent config for each sub-agent we intend to spawn.

    For every bridge, the sink (bridge input) and source (bridge output)
    addresses are rewritten to use `nodeip`, so sub-agents connect on an IP
    which is reachable for them.  One deep-copied config per sub-agent
    (apart from agent_0) is then written to './<agent_name>.cfg'.
    """

    # get bridge addresses from our bridges, and append them to the config
    if "bridge_addresses" not in cfg:
        cfg["bridge_addresses"] = dict()

    for b in bridges:

        # to avoid confusion with component input and output, we call bridge
        # input a 'sink', and a bridge output a 'source' (from the component
        # perspective)
        sink   = ru.Url(bridges[b]["in"])
        source = ru.Url(bridges[b]["out"])

        # we replace the ip address with what we got from LRMS (nodeip).  The
        # bridge should be listening on all interfaces, but we want to make
        # sure the sub-agents connect on an IP which is accessible to them
        sink.host   = nodeip
        source.host = nodeip

        # keep the resulting URLs as strings, to be used as addresses
        cfg["bridge_addresses"][b]           = dict()
        cfg["bridge_addresses"][b]["sink"]   = str(sink)
        cfg["bridge_addresses"][b]["source"] = str(source)

    # write deep-copies of the config (with the corrected agent_name) for each
    # sub-agent (apart from agent_0, obviously)
    for sa in cfg.get("agent_layout"):
        if sa != "agent_0":
            sa_cfg               = copy.deepcopy(cfg)
            sa_cfg["agent_name"] = sa
            ru.write_json(sa_cfg, "./%s.cfg" % sa)
def get_session_docs(db, sid, cache=None, cachedir=None):
    """
    Return all documents for session `sid`.

    Documents may have been cached in <cachedir>/<sid>.json (default
    cachedir: _CACHE_BASEDIR) -- in that case we pull them from there
    instead of the database, which is much quicker.  Retrieved documents
    are cached at the same location for later use.

    Raises ValueError if the session does not exist in the database.
    """

    if not cachedir:
        cachedir = _CACHE_BASEDIR
    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("warning: cannot read session cache at %s (%s)\n"
                         % (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json(list(db["%s" % sid].find()))
    json_data['pmgr'] = bson2json(list(db["%s.pm" % sid].find()))
    json_data['pilot'] = bson2json(list(db["%s.p" % sid].find()))
    json_data['umgr'] = bson2json(list(db["%s.um" % sid].find()))
    json_data['unit'] = bson2json(list(db["%s.cu" % sid].find()))

    if len(json_data['session']) == 0:
        raise ValueError('no such session %s' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:
        pilot['unit_ids'] = list()
        for unit in json_data['unit']:
            if unit['pilot'] == str(pilot['_id']):
                pilot['unit_ids'].append(str(unit['_id']))

    # if we got here, we did not find a cached version -- thus add this
    # dataset to the cache.
    # BUGFIX: write to the caller-provided `cache` location (previously the
    # cache was always written under _CACHE_BASEDIR, so a custom `cachedir`
    # was honored on lookup but not on storage); also create the directory
    # with os.makedirs instead of shelling out to 'mkdir -p'.
    try:
        cache_dir = os.path.dirname(cache) or '.'
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        ru.write_json(json_data, cache)
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
def write_workflow(workflow, uid):
    """
    Append the state of `workflow` (pipelines -> stages -> tasks) to
    '<uid>/entk_workflow.json', creating the directory if needed and
    extending any previously written data.
    """

    try:
        os.mkdir(uid)
    except OSError:
        # directory exists already -- reuse it (was a bare `except: pass`,
        # which also hid unrelated errors)
        pass

    fpath = '%s/entk_workflow.json' % uid
    data = list()
    if os.path.isfile(fpath):
        data = ru.read_json(fpath)

    for pipe in workflow:
        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:
            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]
            p['stages'].append(s)

        data.append(p)

    ru.write_json(data, fpath)
def get_session_docs(db, sid, cache=None, cachedir=None):
    """
    Return all documents for session `sid`.

    Documents may have been cached in <cachedir>/<sid>.json (default
    cachedir: _CACHE_BASEDIR) -- in that case we pull them from there
    instead of the database, which is much quicker.  Retrieved documents
    are cached at the same location for later use.

    Raises ValueError if the session does not exist in the database.
    """

    if not cachedir:
        cachedir = _CACHE_BASEDIR
    if not cache:
        cache = "%s/%s.json" % (cachedir, sid)

    try:
        if os.path.isfile(cache):
            return ru.read_json(cache)
    except Exception as e:
        # continue w/o cache
        sys.stderr.write("warning: cannot read session cache at %s (%s)\n"
                         % (cache, e))

    # cache not used or not found -- go to db
    json_data = dict()

    # convert bson to json, i.e. serialize the ObjectIDs into strings.
    json_data['session'] = bson2json(list(db["%s" % sid].find()))
    json_data['pmgr'] = bson2json(list(db["%s.pm" % sid].find()))
    json_data['pilot'] = bson2json(list(db["%s.p" % sid].find()))
    json_data['umgr'] = bson2json(list(db["%s.um" % sid].find()))
    json_data['unit'] = bson2json(list(db["%s.cu" % sid].find()))

    if len(json_data['session']) == 0:
        raise ValueError('no such session %s' % sid)

    # there can only be one session, not a list of one
    json_data['session'] = json_data['session'][0]

    # we want to add a list of handled units to each pilot doc
    for pilot in json_data['pilot']:
        pilot['unit_ids'] = list()
        for unit in json_data['unit']:
            if unit['pilot'] == str(pilot['_id']):
                pilot['unit_ids'].append(str(unit['_id']))

    # if we got here, we did not find a cached version -- thus add this
    # dataset to the cache.
    # BUGFIX: write to the caller-provided `cache` location (previously the
    # cache was always written under _CACHE_BASEDIR, so a custom `cachedir`
    # was honored on lookup but not on storage); also create the directory
    # with os.makedirs instead of shelling out to 'mkdir -p'.
    try:
        cache_dir = os.path.dirname(cache) or '.'
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        ru.write_json(json_data, cache)
    except Exception:
        # we can live without cache, no problem...
        pass

    return json_data
def test_amgr_read_config():
    """
    An Amgr constructed with defaults must expose the documented default
    attributes; after `_read_config` it must reflect the values from the
    config file instead.
    """

    amgr = Amgr(hostname=host, port=port)

    # constructor defaults
    assert amgr._reattempts == 3
    assert amgr._rmq_cleanup
    assert amgr._autoterminate
    assert not amgr._write_workflow
    assert not amgr._resubmit_failed
    assert amgr._rts == 'radical.pilot'
    assert amgr._num_pending_qs == 1
    assert amgr._num_completed_qs == 1
    assert amgr._rts_config == {"sandbox_cleanup": False,
                                "db_cleanup": False}

    # write a config file with non-default values and re-read it
    cfg = {"hostname": "radical.two",
           "port": 25672,
           "username": user,
           "password": passwd,
           "reattempts": 5,
           "resubmit_failed": True,
           "autoterminate": False,
           "write_workflow": True,
           "rts": "mock",
           "rts_config": {"sandbox_cleanup": True,
                          "db_cleanup": True},
           "pending_qs": 2,
           "completed_qs": 3,
           "rmq_cleanup": False}

    ru.write_json(cfg, './config.json')
    amgr._read_config(config_path='./', hostname=None, port=None,
                      username=None, password=None, reattempts=None,
                      resubmit_failed=None, autoterminate=None,
                      write_workflow=None, rts=None, rmq_cleanup=None,
                      rts_config=None)

    # every attribute now mirrors the file contents
    assert amgr._hostname == cfg['hostname']
    assert amgr._port == cfg['port']
    assert amgr._reattempts == cfg['reattempts']
    assert amgr._resubmit_failed == cfg['resubmit_failed']
    assert amgr._autoterminate == cfg['autoterminate']
    assert amgr._write_workflow == cfg['write_workflow']
    assert amgr._rts == cfg['rts']
    assert amgr._rts_config == cfg['rts_config']
    assert amgr._num_pending_qs == cfg['pending_qs']
    assert amgr._num_completed_qs == cfg['completed_qs']
    assert amgr._rmq_cleanup == cfg['rmq_cleanup']

    os.remove('./config.json')
def _spawn(self, launcher, funcs):
    """
    Launch one 'funcs' execution request: write its config and a shell
    launch script into a dedicated sandbox, then start the script via
    subprocess.Popen (the Popen handle is stored in funcs['proc']).
    """

    # NOTE: see documentation of funcs['sandbox'] semantics in the ComputeUnit
    #       class definition.
    sandbox = '%s/%s' % (self._pwd, funcs['uid'])
    fname = '%s/%s.sh' % (sandbox, funcs['uid'])       # launch script
    cfgname = '%s/%s.cfg' % (sandbox, funcs['uid'])    # json config for the task
    descr = funcs['description']

    rpu.rec_makedir(sandbox)
    ru.write_json(funcs.get('cfg'), cfgname)

    # the launcher may return a 'hop' command to be executed instead of the
    # script itself (e.g. for remote execution)
    launch_cmd, hop_cmd = launcher.construct_command(funcs, fname)
    if hop_cmd : cmdline = hop_cmd
    else       : cmdline = fname

    with open(fname, "w") as fout:
        fout.write('#!/bin/sh\n\n')

        # Create string for environment variable setting
        fout.write('export RP_SESSION_ID="%s"\n' % self._cfg['sid'])
        fout.write('export RP_PILOT_ID="%s"\n' % self._cfg['pid'])
        fout.write('export RP_AGENT_ID="%s"\n' % self._cfg['aid'])
        fout.write('export RP_SPAWNER_ID="%s"\n' % self.uid)
        fout.write('export RP_FUNCS_ID="%s"\n' % funcs['uid'])
        fout.write('export RP_GTOD="%s"\n' % self.gtod)
        fout.write('export RP_TMP="%s"\n' % self._cu_tmp)

        # also add any env vars requested in the unit description
        if descr.get('environment', []):
            for key,val in descr['environment'].items():
                fout.write('export "%s=%s"\n' % (key, val))

        fout.write('\n%s\n\n' % launch_cmd)
        fout.write('RETVAL=$?\n')
        fout.write("exit $RETVAL\n")

    # done writing to launch script, get it ready for execution.
    st = os.stat(fname)
    os.chmod(fname, st.st_mode | stat.S_IEXEC)

    # stdout/stderr files stay open for the lifetime of the child process
    fout = open('%s/%s.out' % (sandbox, funcs['uid']), "w")
    ferr = open('%s/%s.err' % (sandbox, funcs['uid']), "w")

    self._prof.prof('exec_start', uid=funcs['uid'])
    funcs['proc'] = subprocess.Popen(args       = cmdline,
                                     executable = None,
                                     stdin      = None,
                                     stdout     = fout,
                                     stderr     = ferr,
                                     preexec_fn = os.setsid,
                                     close_fds  = True,
                                     shell      = True,
                                     cwd        = sandbox)
    self._prof.prof('exec_ok', uid=funcs['uid'])
def _write_profile(self):
    """
    Write the collected profile data to a json file next to the configured
    profile location, with this executor's uid injected into the file name
    (e.g. '/path/prof.json' -> '/path/prof.<uid>.json').
    """

    base = os.path.dirname(self._profile_loc)
    # BUGFIX: use os.path.splitext instead of str.split('.') -- the latter
    # raised ValueError for basenames containing more (or fewer) than
    # exactly one dot.  splitext keeps the dot in `ext`, hence the format
    # below yields the same name as before for the single-dot case.
    fname, ext = os.path.splitext(os.path.basename(self._profile_loc))
    op_name = '%s/%s.%s%s' % (base, fname, self._uid, ext)

    ru.write_json(data=self._profile, filename=op_name)
    self._logger.info('Profiles from executor %s written to %s' %
                      (self._uid, op_name))
def write_data(data, proc_path):
    """
    Dump `data` as json to `proc_path` and return the path written.

    If the path contains 'rp.session', it is first rebased: grandparent
    directory + basename (i.e. one directory level is dropped).  Missing
    target directories are created.
    """

    if 'rp.session' in proc_path:
        grandparent = os.path.dirname(os.path.dirname(proc_path))
        proc_path = grandparent + '/' + os.path.basename(proc_path)

    target_dir = os.path.dirname(proc_path)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    ru.write_json(data, proc_path)
    return proc_path
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False, session=None,
               log=None):
    '''
    Fetch the json documents for session `sid` from the database (or reuse
    an existing dump if `skip_existing` is set) and store them under
    `tgt`/<sid>.json.

    returns file name
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # BUGFIX: a logger was passed in explicitly -- `rep` still needs to
        # be defined, or the final `rep.ok()` raises NameError
        rep = session._rep if session else ru.Reporter('radical.pilot.utils')

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    try:
        # BUGFIX: create the directory which will hold `dst` -- the previous
        # `os.path.dirname(tgt)` created only the *parent* of the target
        # directory, so writing `dst` could still fail
        os.makedirs(os.path.dirname(dst))
    except OSError:
        pass  # dir exists

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:
        log.info("session already in %s", dst)

    else:
        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')
        if not dburl:
            raise ValueError('RADICAL_PILOT_DBURL is not set')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)
        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)
        log.info("session written to %s", dst)
        mongo.close()

    rep.ok("+ %s (json)\n" % sid)
    return dst
def submit(self, descr, count, cores, gpus): ''' submit n workers, and pass the queue info as configuration file. Do *not* wait for them to come up ''' # each worker gets the specified number of cores and gpus. All # resources need to be located on the same node. descr['cpu_processes'] = 1 descr['cpu_threads'] = cores descr['cpu_thread_type'] = 'POSIX' descr['gpu_processses'] = gpus tasks = list() for _ in range(count): # write config file for that worker cfg = copy.deepcopy(self._cfg) cfg['info'] = self._info uid = ru.generate_id('worker') sbox = '%s/%s' % (cfg['base'], uid) fname = '%s/%s.json' % (sbox, uid) cfg['kind'] = 'worker' cfg['uid'] = uid cfg['base'] = sbox cfg['cores'] = cores cfg['gpus'] = gpus ru.rec_makedir(sbox) ru.write_json(cfg, fname) # grab default settings via CUD construction descr_complete = ComputeUnitDescription(descr).as_dict() # create task dict task = dict() task['description'] = copy.deepcopy(descr_complete) task['state'] = rps.AGENT_STAGING_INPUT_PENDING task['type'] = 'unit' task['uid'] = uid task['unit_sandbox_path'] = sbox task['unit_sandbox'] = 'file://localhost/' + sbox task['pilot_sandbox'] = cfg.base task['session_sandbox'] = cfg.base + '/../' task['resource_sandbox'] = cfg.base + '/../../' task['description']['arguments'] += [fname] tasks.append(task) self._workers[uid] = task self._log.debug('submit %s', uid) # insert the task self.advance(tasks, publish=False, push=True)
def _initialize_primary(self, dburl):
    """
    Initialize this as the primary session: connect to the database, record
    the radical stack versions as metadata, start the component manager,
    and optionally enable session recording.

    Raises RuntimeError if no database URL can be determined or the
    database session cannot be created.
    """

    self._rep.info ('<<new session: ')
    self._rep.plain('[%s]' % self._uid)

    # create db connection - need a dburl to connect to
    if not dburl: dburl = self._cfg.dburl
    if not dburl: dburl = self._cfg.default_dburl
    if not dburl: raise RuntimeError("no db URL (set RADICAL_PILOT_DBURL)")

    self._cfg.dburl = dburl

    self._rep.info ('<<database   : ')
    self._rep.plain('[%s]' % dburl)
    self._log.info('dburl %s' % dburl)

    # create/connect database handle on primary sessions
    try:
        self._dbs = DBSession(sid=self.uid, dburl=dburl, cfg=self._cfg,
                              log=self._log)

        py_version_detail = sys.version.replace("\n", " ")
        from . import version_detail as rp_version_detail
        self.inject_metadata({'radical_stack':
                                 {'rp': rp_version_detail,
                                  'rs': rs.version_detail,
                                  'ru': ru.version_detail,
                                  'py': py_version_detail}})
    except Exception:
        self._rep.error(">>err\n")
        self._log.exception('session create failed [%s]', dburl)
        raise RuntimeError('session create failed [%s]' % dburl)

    # primary sessions have a component manager which also manages
    # heartbeat.  'self._cmgr.close()` should be called during termination
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # expose the cmgr's heartbeat channel to anyone who wants to use it
    self._cfg.heartbeat = self._cmgr.cfg.heartbeat

    self._rec = False
    if self._cfg.record:
        # BUGFIX: derive the recording path from the configured recording
        # base (self._cfg.record) -- previously it was built from self._rec,
        # which was just set to False above and produced paths like
        # 'False/<sid>'.
        self._rec = "%s/%s" % (self._cfg.record, self._uid)

        # create recording path and record session
        os.system('mkdir -p %s' % self._rec)
        ru.write_json({'dburl': str(self.dburl)},
                      "%s/session.json" % self._rec)
        self._log.info("recording session in %s" % self._rec)

    self._rep.ok('>>ok\n')
def write_data(data, proc_path, typ=None):
    """
    Dump `data` as json to `proc_path` and return the path written.

    For typ == 'rp' the path is first rebased: grandparent directory +
    basename (i.e. one directory level is dropped).  Missing target
    directories are created.
    """

    if typ == 'rp':
        grandparent = os.path.dirname(os.path.dirname(proc_path))
        proc_path = grandparent + '/' + os.path.basename(proc_path)

    target_dir = os.path.dirname(proc_path)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    ru.write_json(data, proc_path)
    return proc_path
def write_workflows(workflows, uid, fname=None, fwrite=True):
    """
    Serialize a list of EnTK workflows (pipelines -> stages -> tasks) plus
    the radical stack info.  If `fwrite` is set, write the result to
    '<uid>/<fname>' (default fname: 'entk_workflow.json') and return 0;
    otherwise return the data structure.

    Deprecated in favor of profiles (set RADICAL_ENTK_PROFILE=TRUE).
    """

    import warnings
    warnings.simplefilter("once")
    warnings.warn("The function write_workflows will be deprecated in favor "
                  + "of the profiles. Please set RADICAL_ENTK_PROFILE=TRUE",
                  DeprecationWarning)

    try:
        os.mkdir(uid)
    except OSError:
        # directory exists already -- reuse it (was a bare `except: pass`,
        # which also hid unrelated errors)
        pass

    if not fname:
        fname = 'entk_workflow.json'

    data = {'stack': ru.stack(),
            'workflows': list()}

    for workflow in workflows:
        w = dict()
        w['pipes'] = list()

        for pipe in workflow:
            p = dict()
            p['uid'] = pipe.uid
            p['name'] = pipe.name
            p['state_history'] = pipe.state_history
            p['stages'] = list()

            for stage in pipe.stages:
                s = dict()
                s['uid'] = stage.uid
                s['name'] = stage.name
                s['state_history'] = stage.state_history
                s['tasks'] = [task.to_dict() for task in stage.tasks]
                p['stages'].append(s)

            w['pipes'].append(p)

        data['workflows'].append(w)

    if fwrite:
        ru.write_json(data, '%s/%s' % (uid, fname))
        return 0

    return data
def test_amgr_read_config(self, mocked_init, mocked_PlainCredentials,
                          mocked_ConnectionParameters, d):
    """
    `_read_config` must populate the Amgr attributes from the config file,
    and must raise ValueError when the configured RTS is not supported.
    """

    amgr = Amgr(hostname='host', port='port', username='******',
                password='******')

    d["rts"] = "mock"
    d["rts_config"] = {"sandbox_cleanup": True, "db_cleanup": True}
    ru.write_json(d, './config.json')

    amgr._read_config(config_path='./', hostname=None, port=None,
                      username=None, password=None, reattempts=None,
                      resubmit_failed=None, autoterminate=None,
                      write_workflow=None, rts=None, rmq_cleanup=None,
                      rts_config=None)

    self.assertEqual(amgr._hostname, d['hostname'])
    self.assertEqual(amgr._port, d['port'])
    self.assertEqual(amgr._reattempts, d['reattempts'])
    self.assertEqual(amgr._resubmit_failed, d['resubmit_failed'])
    self.assertEqual(amgr._autoterminate, d['autoterminate'])
    self.assertEqual(amgr._write_workflow, d['write_workflow'])
    self.assertEqual(amgr._rts, d['rts'])
    self.assertEqual(amgr._rts_config, d['rts_config'])
    self.assertEqual(amgr._num_pending_qs, d['pending_qs'])
    self.assertEqual(amgr._num_completed_qs, d['completed_qs'])
    self.assertEqual(amgr._rmq_cleanup, d['rmq_cleanup'])

    # an unknown RTS must be rejected
    # (removed leftover debug `print(d)` here)
    d['rts'] = 'another'
    ru.write_json(d, './config.json')
    with self.assertRaises(ValueError):
        amgr._read_config(config_path='./', hostname=None, port=None,
                          username=None, password=None, reattempts=None,
                          resubmit_failed=None, autoterminate=None,
                          write_workflow=None, rts=None, rmq_cleanup=None,
                          rts_config=None)
def test_amgr_read_config():
    """
    A default-constructed Amgr must expose the documented default
    attributes; after `_read_config` it must reflect the values from the
    config file instead.
    """

    amgr = Amgr()

    # constructor defaults
    # (idiom fix: use truthiness instead of `== True` / `== False`)
    assert amgr._mq_hostname == 'localhost'
    assert amgr._port == 5672
    assert amgr._reattempts == 3
    assert not amgr._resubmit_failed
    assert amgr._autoterminate
    assert not amgr._write_workflow
    assert amgr._rts == 'radical.pilot'
    assert amgr._num_pending_qs == 1
    assert amgr._num_completed_qs == 1
    assert amgr._rmq_cleanup
    assert amgr._rts_config == {"sandbox_cleanup": False,
                                "db_cleanup": False}

    # write a config file with non-default values and re-read it
    d = {"hostname": "radical.two",
         "port": 25672,
         "reattempts": 5,
         "resubmit_failed": True,
         "autoterminate": False,
         "write_workflow": True,
         "rts": "mock",
         "rts_config": {"sandbox_cleanup": True,
                        "db_cleanup": True},
         "pending_qs": 2,
         "completed_qs": 3,
         "rmq_cleanup": False}

    ru.write_json(d, './config.json')
    amgr._read_config(config_path='./', hostname=None, port=None,
                      reattempts=None, resubmit_failed=None,
                      autoterminate=None, write_workflow=None, rts=None,
                      rmq_cleanup=None, rts_config=None)

    # every attribute now mirrors the file contents
    assert amgr._mq_hostname == d['hostname']
    assert amgr._port == d['port']
    assert amgr._reattempts == d['reattempts']
    assert amgr._resubmit_failed == d['resubmit_failed']
    assert amgr._autoterminate == d['autoterminate']
    assert amgr._write_workflow == d['write_workflow']
    assert amgr._rts == d['rts']
    assert amgr._rts_config == d['rts_config']
    assert amgr._num_pending_qs == d['pending_qs']
    assert amgr._num_completed_qs == d['completed_qs']
    assert amgr._rmq_cleanup == d['rmq_cleanup']

    os.remove('./config.json')
def write_workflows(workflows, uid, fname=None, fwrite=True):
    """
    Serialize a list of EnTK workflows (pipelines -> stages -> tasks) plus
    the radical stack info.  If `fwrite` is set, write the result to
    '<uid>/<fname>' (default fname: 'entk_workflow.json') and return 0;
    otherwise return the data structure.
    """

    try:
        os.mkdir(uid)
    except OSError:
        # directory exists already -- reuse it (was a bare `except: pass`,
        # which also hid unrelated errors)
        pass

    if not fname:
        fname = 'entk_workflow.json'

    data = {'stack': ru.stack(),
            'workflows': list()}

    for workflow in workflows:
        w = dict()
        w['pipes'] = list()

        for pipe in workflow:
            p = dict()
            p['uid'] = pipe.uid
            p['name'] = pipe.name
            p['state_history'] = pipe.state_history
            p['stages'] = list()

            for stage in pipe.stages:
                s = dict()
                s['uid'] = stage.uid
                s['name'] = stage.name
                s['state_history'] = stage.state_history
                s['tasks'] = [task.to_dict() for task in stage.tasks]
                p['stages'].append(s)

            w['pipes'].append(p)

        data['workflows'].append(w)

    if fwrite:
        ru.write_json(data, '%s/%s' % (uid, fname))
        return 0

    return data
def write_workflow(workflow, uid, workflow_fout='entk_workflow', fwrite=True):
    """
    Append the state of `workflow` (pipelines -> stages -> tasks) and the
    radical stack info to '<uid>/<workflow_fout>.json', extending any
    previously written data.  If `fwrite` is set, write the file and
    return 0; otherwise return the data structure.
    """

    try:
        os.mkdir(uid)
    except OSError:
        # directory exists already -- reuse it (was a bare `except: pass`,
        # which also hid unrelated errors)
        pass

    fpath = '%s/%s.json' % (uid, workflow_fout)
    data = list()
    if os.path.isfile(fpath):
        data = ru.read_json(fpath)

    stack = ru.stack()
    data.append({'stack': stack})

    for pipe in workflow:
        p = dict()
        p['uid'] = pipe.uid
        p['name'] = pipe.name
        p['state_history'] = pipe.state_history
        p['stages'] = list()

        for stage in pipe.stages:
            s = dict()
            s['uid'] = stage.uid
            s['name'] = stage.name
            s['state_history'] = stage.state_history
            s['tasks'] = [task.to_dict() for task in stage.tasks]
            p['stages'].append(s)

        data.append(p)

    if fwrite:
        ru.write_json(data, fpath)
        return 0

    return data
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):
    '''
    returns file name
    '''
    # NOTE(review): Python-2 code (print statements) -- left unchanged

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    # reuse an existing, non-empty dump when requested
    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:
        print "session already in %s" % dst

    else:
        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')
        if not dburl:
            # fall back to the packaged default database URL
            from radical.pilot.session import default_dburl
            logger.report.warn('using default dburl: %s' % default_dburl)
            dburl = default_dburl

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)
        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)
        print "session written to %s" % dst
        mongo.close()

    return dst
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):
    '''
    returns file name
    '''
    # NOTE(review): Python-2 code (print statements) -- left unchanged

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    # reuse an existing, non-empty dump when requested
    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:
        print "session already in %s" % dst

    else:
        if not dburl:
            # NOTE(review): os.environ[...] raises KeyError when the
            # variable is unset, which makes the 'if not dburl' check below
            # unreachable -- probably meant os.environ.get(); confirm
            dburl = os.environ['RADICAL_PILOT_DBURL']
        if not dburl:
            raise RuntimeError('Please set RADICAL_PILOT_DBURL')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)
        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)
        print "session written to %s" % dst
        mongo.close()

    return dst
def _write_sa_configs(self):
    """
    Write one config file per sub-agent to './<agent_name>.cfg'.

    Each file is a deep copy of our own config with the 'agents' and
    'components' sections cleared, the respective sub-agent layout merged
    in, and ownership assigned to 'agent_0'.
    """

    for agent_name in self._cfg.get('agents', {}):

        # agent_0 writes these configs -- it must not show up as sub-agent
        assert agent_name != 'agent_0', 'expect subagent, not agent_0'

        # start from our own config, sans agents/components
        sa_cfg = copy.deepcopy(self._cfg)
        sa_cfg['agents'] = dict()
        sa_cfg['components'] = dict()

        # overlay the sub-agent's layout and identity
        ru.dict_merge(sa_cfg, self._cfg['agents'][agent_name], ru.OVERWRITE)
        sa_cfg['agent_name'] = agent_name
        sa_cfg['owner'] = 'agent_0'

        ru.write_json(sa_cfg, './%s.cfg' % agent_name)
def store_profile (profile, tags=None, url=None, mode=None) :
    """
    Store a synapse profile document either in a mongodb collection or as a
    json file, depending on the schema of the target URL (mongodb:// or
    file://).  Returns None (and does nothing) when no URL is available.

    NOTE(review): Python-2 code (print statements) -- left unchanged.
    """

    if not url:
        url = os.environ.get ('RADICAL_SYNAPSE_DBURL')

    if not url:
        # print "warning: need dburl to store profiles"
        return None

    if not mode:
        # NOTE(review): 'eecuted' looks like a typo for 'executed' -- runtime
        # string left unchanged in this documentation pass
        raise ValueError ("document needs mode (emulated | eecuted | profiled)")

    url = ru.Url (url)

    if not tags:
        tags = dict()

    # merge tags from the environment (comma separated 'key:val' elements)
    elems = filter (None, os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(','))
    for elem in elems:
        if ':' in elem:
            key, val = elem.split(':', 1)
            tags[key] = val
        else:
            tags[elem] = None

    command_idx = index_command (profile['cmd'], tags)
    print "index %s (%s) to %s" % (profile['cmd'], tags, command_idx)

    # determine the host the profile was taken on
    host = profile['sys'].get ('hostname')
    if not host:
        host = os.environ.get ('RADICAL_SYNAPSE_HOSTNAME', socket.gethostname())
        profile['sys']['hostname'] = host

    doc = {'type'        : 'synapse_profile',
           'mode'        : mode,
           'command_idx' : command_idx,
           'command'     : profile['cmd'],
           'tags'        : tags,
           'profile'     : profile}

    if url.schema == 'mongodb':
        # store in a mongodb 'profiles' collection
        print 'store profile in db %s' % url
        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl (url)
        db_client = pymongo.MongoClient (host=dbhost, port=port)
        database = db_client[dbname]
        collection = database['profiles']
        collection.insert (doc)

    elif url.schema == 'file':
        # store as json file under the URL's path
        path = url.path
        if not os.path.isdir (path):
            os.system ('mkdir -p "%s"' % path)

        # build a file name from command index, tags, host and mode
        name = command_idx.split()[0]
      # for key, val in tags.iteritems():
      #     if val != None: name += "_%s:%s" % (key, val)
      #     else          : name += "_%s"    % (key)
        for tag in sorted(tags.keys()):
            if tags[tag] != None: name += "_%s" % tags[tag]
            else                : name += "_%s" % tag

        # find the first unused index for that name
        idx = 0
        while True:
            fname = "%s/synapse_profile_%s_%s_%s_%03d.json" \
                  % (path, name, host, mode[0:3], idx)
            if not os.path.exists (fname):
                break
            idx += 1

        print 'store profile in file %s' % fname
        os.system ('mkdir -p "%s/"' % path)
        ru.write_json (doc, fname)
def write_session_description(amgr):
    """
    Build and write a session description for the given appmanager `amgr`:
    the state/event models for all entity types, plus the entity tree
    (appmanager -> wfprocessor / resource_manager / task_manager, and
    pipelines -> stages -> tasks).  The result is written to
    '<sid>/radical.entk.<sid>.json'.
    """

    desc = dict()
    desc['entities'] = dict()

    # state/event models per entity type (models come from `res`)
    desc['entities']['pipeline'] = {
        'state_model': res._pipeline_state_values,
        'state_values': res._pipeline_state_inv,
        'event_model': dict(),
    }
    desc['entities']['stage'] = {
        'state_model': res._stage_state_values,
        'state_values': res._stage_state_inv,
        'event_model': dict(),
    }
    desc['entities']['task'] = {
        'state_model': res._task_state_values,
        'state_values': res._task_state_inv,
        'event_model': dict(),
    }
    desc['entities']['appmanager'] = {
        'state_model': None,
        'state_values': None,
        'event_model': dict(),
    }

    # Adding amgr to the tree
    tree = dict()
    tree[amgr._uid] = {'uid': amgr._uid,
                       'etype': 'appmanager',
                       'cfg': {},
                       'has': ['pipeline', 'wfprocessor', 'resource_manager',
                               'task_manager'],
                       'children': list()
                       }

    # Adding wfp to the tree
    wfp = amgr._wfp
    tree[amgr._uid]['children'].append(wfp._uid)
    tree[wfp._uid] = {'uid': wfp._uid,
                      'etype': 'wfprocessor',
                      'cfg': {},
                      'has': [],
                      'children': list()
                      }

    # Adding rmgr to the tree
    rmgr = amgr._resource_manager
    tree[amgr._uid]['children'].append(rmgr._uid)
    tree[rmgr._uid] = {'uid': rmgr._uid,
                       'etype': 'resource_manager',
                       'cfg': {},
                       'has': [],
                       'children': list()
                       }

    # Adding tmgr to the tree
    tmgr = amgr._task_manager
    tree[amgr._uid]['children'].append(tmgr._uid)
    tree[tmgr._uid] = {'uid': tmgr._uid,
                       'etype': 'task_manager',
                       'cfg': {},
                       'has': [],
                       'children': list()
                       }

    # Adding pipelines to the tree
    wf = amgr._workflow
    for pipe in wf:
        tree[amgr._uid]['children'].append(pipe.uid)
        tree[pipe.uid] = {'uid': pipe.uid,
                          'etype': 'pipeline',
                          'cfg': {},
                          'has': ['stage'],
                          'children': list()
                          }

        # Adding stages to the tree
        for stage in pipe.stages:
            tree[pipe.uid]['children'].append(stage.uid)
            tree[stage.uid] = {'uid': stage.uid,
                               'etype': 'stage',
                               'cfg': {},
                               'has': ['task'],
                               'children': list()
                               }

            # Adding tasks to the tree
            for task in stage.tasks:
                tree[stage.uid]['children'].append(task.uid)
                tree[task.uid] = {'uid': task.uid,
                                  'etype': 'task',
                                  'cfg': {},
                                  'has': [],
                                  'children': list()
                                  }

    desc['tree'] = tree
    desc['config'] = dict()

    ru.write_json(desc, '%s/radical.entk.%s.json' % (amgr.sid, amgr.sid))
def submit_pilots(self, descriptions):
    """
    Submits on or more :class:`radical.pilot.ComputePilot` instances to the
    pilot manager.

    **Arguments:**
        * **descriptions** [:class:`radical.pilot.ComputePilotDescription`
          or list of :class:`radical.pilot.ComputePilotDescription`]: The
          description of the compute pilot instance(s) to create.

    **Returns:**
        * A list of :class:`radical.pilot.ComputePilot` objects.
    """

    from .compute_pilot import ComputePilot

    self.is_valid()

    # normalize a single description to a list; remember to unwrap on return
    ret_list = True
    if not isinstance(descriptions, list):
        ret_list = False
        descriptions = [descriptions]

    if len(descriptions) == 0:
        raise ValueError('cannot submit no pilot descriptions')

    self._log.report.info('<<submit %d pilot(s)\n\t' % len(descriptions))

    # create the pilot instance
    pilots = list()
    pilot_docs = list()
    for pd in descriptions :

        # basic sanity checks on the description
        if not pd.runtime:
            raise ValueError('pilot runtime must be defined')
        if pd.runtime <= 0:
            raise ValueError('pilot runtime must be positive')
        if not pd.cores:
            raise ValueError('pilot core size must be defined')
        if not pd.resource:
            raise ValueError('pilot target resource must be defined')

        pilot = ComputePilot(pmgr=self, descr=pd)
        pilots.append(pilot)
        pilot_doc = pilot.as_dict()
        pilot_docs.append(pilot_doc)

        # keep pilots around
        with self._pilots_lock:
            self._pilots[pilot.uid] = pilot

        if self._session._rec:
            # session recording is enabled -- dump the description to disk
            ru.write_json(pd.as_dict(), "%s/%s.batch.%03d.json"
                                        % (self._session._rec, pilot.uid,
                                           self._rec_id))

        self._log.report.progress()

    # initial state advance to 'NEW'
    # FIXME: we should use update_pilot(), but that will not trigger an
    #        advance, since the state did not change.  We would then miss
    #        the profile entry for the advance to NEW.  So we here basically
    #        only trigger the profile entry for NEW.
    self.advance(pilot_docs, state=rps.NEW, publish=False, push=False)

    if self._session._rec:
        self._rec_id += 1

    # insert pilots into the database, as a bulk.
    self._session._dbs.insert_pilots(pilot_docs)

    # Only after the insert can we hand the pilots over to the next
    # components (ie. advance state).
    for pd in pilot_docs:
        pd['state'] = rps.PMGR_LAUNCHING_PENDING
        self._update_pilot(pd, advance=False)
    self.advance(pilot_docs, publish=True, push=True)

    self._log.report.ok('>>ok\n')

    if ret_list: return pilots
    else       : return pilots[0]
def submit_pilots(self, descriptions):
    """
    Submits on or more :class:`radical.pilot.ComputePilot` instances to the
    pilot manager.

    **Arguments:**
        * **descriptions** [:class:`radical.pilot.ComputePilotDescription`
          or list of :class:`radical.pilot.ComputePilotDescription`]: The
          description of the compute pilot instance(s) to create.

    **Returns:**
        * A list of :class:`radical.pilot.ComputePilot` objects.
    """

    from .compute_pilot import ComputePilot

    self.is_valid()

    # normalize a single description to a list; remember to unwrap on return
    ret_list = True
    if not isinstance(descriptions, list):
        ret_list = False
        descriptions = [descriptions]

    if len(descriptions) == 0:
        raise ValueError('cannot submit no pilot descriptions')

    self._rep.info('<<submit %d pilot(s)\n\t' % len(descriptions))

    # create the pilot instance
    pilots = list()
    pilot_docs = list()
    for pd in descriptions :

        # basic sanity checks on the description
        if not pd.runtime:
            raise ValueError('pilot runtime must be defined')
        if pd.runtime <= 0:
            raise ValueError('pilot runtime must be positive')
        if not pd.cores:
            raise ValueError('pilot size must be defined')
        if not pd.resource:
            raise ValueError('pilot target resource must be defined')

        pilot = ComputePilot(pmgr=self, descr=pd)
        pilots.append(pilot)
        pilot_doc = pilot.as_dict()
        pilot_docs.append(pilot_doc)

        # keep pilots around
        with self._pilots_lock:
            self._pilots[pilot.uid] = pilot

        if self._session._rec:
            # session recording is enabled -- dump the description to disk
            ru.write_json(pd.as_dict(), "%s/%s.batch.%03d.json"
                                        % (self._session._rec, pilot.uid,
                                           self._rec_id))

        # report the pilot's resource/size to the user
        if 'resource' in pd and 'cores' in pd:
            self._rep.plain('[%s:%s]\n\t' % (pd['resource'], pd['cores']))
        elif 'resource' in pd:
            self._rep.plain('[%s]\n\t' % pd['resource'])

    # initial state advance to 'NEW'
    # FIXME: we should use update_pilot(), but that will not trigger an
    #        advance, since the state did not change.  We would then miss
    #        the profile entry for the advance to NEW.  So we here basically
    #        only trigger the profile entry for NEW.
    self.advance(pilot_docs, state=rps.NEW, publish=False, push=False)

    if self._session._rec:
        self._rec_id += 1

    # insert pilots into the database, as a bulk.
    self._session._dbs.insert_pilots(pilot_docs)

    # Only after the insert can we hand the pilots over to the next
    # components (ie. advance state).
    for pd in pilot_docs:
        pd['state'] = rps.PMGR_LAUNCHING_PENDING
        self._update_pilot(pd, advance=False)
    self.advance(pilot_docs, publish=True, push=True)

    self._rep.ok('>>ok\n')

    if ret_list: return pilots
    else       : return pilots[0]
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the pilot launcher worker: it polls MongoDB for
    ComputePilots in state PENDING_LAUNCH, stages the bootstrapper /
    sdists / agent config into the pilot sandbox, submits the pilot
    job via SAGA, and records the resulting state transitions back
    into the database.  On any launch failure the pilot is moved to
    FAILED; the loop runs until ``self._terminate`` is set.
    """
    # NOTE: JOB_CHECK_INTERVAL is deliberately a module-level global -- it
    #       is lowered below once this launcher gets disabled, so that state
    #       checks run more frequently during shutdown.
    global JOB_CHECK_INTERVAL

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database -- without a working DB connection
        # this worker cannot do anything, so we bail out on failure.
        try:
            db = self._session.get_db()
            pilot_col = db["%s.p" % self._session.uid]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._terminate.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the corresponding
            # SAGA job is still pending in the queue. If that is not the case,
            # we assume that the job has failed for some reason and update
            # the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            if self._disabled.is_set():
                # don't process any new pilot start requests.
                # NOTE: this is not clean, in principle there could be other
                #       launchers alive which want to still start those
                #       pending pilots. In practice we only ever use one
                #       pmgr though, and its during its shutdown that we get
                #       here...
                # Atomically flip any still-pending pilot to CANCELED.
                ts = time.time()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id,
                           "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": CANCELED},
                        "$push": {"statehistory": {"state": CANCELED,
                                                   "timestamp": ts}},
                    },
                )

                # run state checks more frequently.
                JOB_CHECK_INTERVAL = 3
                time.sleep(1)
                continue

            # See if we can find a ComputePilot that is waiting to be launched.
            # If we find one, we use SAGA to create a job service, a job
            # description and a job that is then sent to the local or remote
            # queueing system. If this succeeds, we set the ComputePilot's
            # state to pending, otherwise to failed.
            compute_pilot = None

            # find_and_modify is atomic: claiming the pilot (PENDING_LAUNCH
            # -> LAUNCHING) and fetching its document happen in one step, so
            # concurrent launchers cannot pick up the same pilot.
            ts = time.time()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={
                    "$set": {"state": LAUNCHING},
                    "$push": {"statehistory": {"state": LAUNCHING,
                                               "timestamp": ts}},
                },
            )

            if not compute_pilot:
                # nothing to launch right now -- idle a bit
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_id   = self._session.uid
                    database_url = self._session.dburl

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores    = compute_pilot["description"]["cores"]
                    runtime         = compute_pilot["description"]["runtime"]
                    queue           = compute_pilot["description"]["queue"]
                    project         = compute_pilot["description"]["project"]
                    cleanup         = compute_pilot["description"]["cleanup"]
                    resource_key    = compute_pilot["description"]["resource"]
                    schema          = compute_pilot["description"]["access_schema"]
                    memory          = compute_pilot["description"]["memory"]
                    candidate_hosts = compute_pilot["description"]["candidate_hosts"]

                    # pilot sandbox and global (resource-level) sandbox URLs
                    pilot_sandbox  = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # import pprint
                    # pprint.pprint (resource_cfg)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_launch_method     = resource_cfg.get("agent_launch_method")
                    agent_dburl             = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner           = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type              = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    rc_agent_config         = resource_cfg.get("agent_config", DEFAULT_AGENT_CONFIG)
                    agent_scheduler         = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device      = resource_cfg.get("tunnel_bind_device")
                    default_queue           = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint             = resource_cfg.get("job_manager_endpoint")
                    lrms                    = resource_cfg.get("lrms")
                    mpi_launch_method       = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap_1         = resource_cfg.get("pre_bootstrap_1")
                    pre_bootstrap_2         = resource_cfg.get("pre_bootstrap_2")
                    python_interpreter      = resource_cfg.get("python_interpreter")
                    spmd_variation          = resource_cfg.get("spmd_variation")
                    task_launch_method      = resource_cfg.get("task_launch_method")
                    rp_version              = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode            = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv                 = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                    # NOTE(review): stage_cacerts defaults to the *string*
                    # "False" and is converted to a bool further below.
                    stage_cacerts           = resource_cfg.get("stage_cacerts", "False")
                    cores_per_node          = resource_cfg.get("cores_per_node")
                    shared_filesystem       = resource_cfg.get("shared_filesystem", True)
                    health_check            = resource_cfg.get("health_check", True)
                    python_dist             = resource_cfg.get("python_dist")
                    cu_pre_exec             = resource_cfg.get("cu_pre_exec")
                    cu_post_exec            = resource_cfg.get("cu_post_exec")
                    export_to_cu            = resource_cfg.get("export_to_cu")

                    # Agent configuration that is not part of the public API.
                    # The agent config can either be a config dict, or
                    # a string pointing to a configuration name. If neither
                    # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                    # set. The last fallback is 'agent_default'
                    agent_config = compute_pilot["description"].get("_config")
                    if not agent_config:
                        agent_config = os.environ.get("RADICAL_PILOT_AGENT_CONFIG")
                    if not agent_config:
                        agent_config = rc_agent_config

                    if isinstance(agent_config, dict):
                        # nothing to do
                        agent_cfg_dict = agent_config
                        pass

                    # NOTE(review): 'basestring' pins this module to Python 2.
                    elif isinstance(agent_config, basestring):
                        try:
                            if os.path.exists(agent_config):
                                # try to open as file name
                                logger.info("Read agent config file: %s" % agent_config)
                                agent_cfg_dict = ru.read_json(agent_config)
                            else:
                                # otherwise interpret as a config name
                                module_path = os.path.dirname(os.path.abspath(__file__))
                                config_path = "%s/../configs/" % module_path
                                agent_cfg_file = os.path.join(config_path, "agent_%s.json" % agent_config)
                                logger.info("Read agent config file: %s" % agent_cfg_file)
                                agent_cfg_dict = ru.read_json(agent_cfg_file)

                            # no matter how we read the config file, we
                            # allow for user level overload
                            # NOTE(review): 'agent_cfg_file' is only bound in
                            # the 'else' branch above -- if the config was
                            # read from an explicit file name, this raises
                            # NameError (caught below).  Confirm intent.
                            cfg_base = os.path.basename(agent_cfg_file)
                            user_cfg = "%s/.radical/pilot/config/%s" % (os.environ["HOME"], cfg_base)
                            if os.path.exists(user_cfg):
                                logger.info("merging user config: %s" % user_cfg)
                                user_cfg_dict = ru.read_json(user_cfg)
                                ru.dict_merge(agent_cfg_dict, user_cfg_dict, policy="overwrite")

                        except Exception as e:
                            logger.exception("Error reading agent config file: %s" % e)
                            raise

                    else:
                        # we can't handle this type
                        raise TypeError("agent config must be string (filename) or dict")

                    # TODO: use booleans all the way?
                    if stage_cacerts.lower() == "true":
                        stage_cacerts = True
                    else:
                        stage_cacerts = False

                    # expand variables in virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox": saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path,
                    }

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # Create a host:port string for use by the bootstrap_1.
                    db_url = saga.Url(agent_dburl)
                    if db_url.port:
                        db_hostport = "%s:%d" % (db_url.host, db_url.port)
                    else:
                        db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

                    # Open the remote sandbox
                    # TODO: make conditional on shared_fs?
                    sandbox_tgt = saga.filesystem.Directory(
                        pilot_sandbox, session=self._session, flags=saga.filesystem.CREATE_PARENTS
                    )

                    LOCAL_SCHEME = "file"

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.
                    # This also creates the sandbox.
                    BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, BOOTSTRAPPER_SCRIPT))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, bootstrapper_path))

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, sandbox_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    if shared_filesystem:
                        sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                    # ------------------------------------------------------
                    # the version of the agent is derived from
                    # rp_version, which has the following format
                    # and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit: # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release: # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed: # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples :
                    #   [email protected]
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode. If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # A rp_version which does not adhere to the
                    # above syntax is ignored, and the fallback stage@local
                    # is used.
                    # NOTE(review): 'release' is missing from this allowed
                    # list although it is explicitly handled two lines below
                    # -- rp_version == 'release' would raise here.  Compare
                    # with the validation in _prepare_pilot, which does
                    # include 'release'.  Likely a bug; confirm.
                    if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed. We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for sdist_path in [ru.sdist_path, saga.sdist_path, rp_sdist_path]:
                            sdist_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, sdist_path))

                            msg = "Copying sdist '%s' to sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            if shared_filesystem:
                                sandbox_tgt.copy(sdist_url, os.path.basename(str(sdist_url)))

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated certificate bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                        cc_url = saga.Url("%s://localhost/%s" % (LOCAL_SCHEME, cc_path))

                        msg = "Copying CA certificate bundle '%s' to sandbox (%s)." \
                              % (cc_url, pilot_sandbox)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        if shared_filesystem:
                            sandbox_tgt.copy(cc_url, os.path.basename(str(cc_url)))

                    # ------------------------------------------------------
                    # sanity checks
                    if not python_dist:
                        raise RuntimeError("missing python distribution")
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not agent_launch_method:
                        raise RuntimeError("missing agentlaunch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = "luve"  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)
                    #
                    # we never cleanup virtenvs which are not private
                    # NOTE(review): 'is not' on a string literal is an
                    # identity check, not equality -- should be '!='.  Also,
                    # if 'cleanup' is False/None this .replace() raises
                    # AttributeError (caught by the broad handler below).
                    # Confirm and fix both.
                    if virtenv_mode is not "private":
                        cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, rp_sdist_name])

                    # if cores_per_node is set (!= None), then we need to
                    # allocate full nodes, and thus round up
                    if cores_per_node:
                        cores_per_node = int(cores_per_node)
                        number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node))

                    # set mandatory args
                    bootstrap_args = ""
                    bootstrap_args += " -d '%s'" % sdists
                    bootstrap_args += " -m '%s'" % virtenv_mode
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -r '%s'" % rp_version
                    bootstrap_args += " -s '%s'" % session_id
                    bootstrap_args += " -v '%s'" % virtenv
                    bootstrap_args += " -b '%s'" % python_dist

                    # set optional args
                    if agent_type:
                        bootstrap_args += " -a '%s'" % agent_type
                    if lrms == "CCM":
                        bootstrap_args += " -c"
                    if pre_bootstrap_1:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap_1)
                    if pre_bootstrap_2:
                        bootstrap_args += " -w '%s'" % "' -w '".join(pre_bootstrap_2)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    # '-h' (db host:port) is also only sent when tunneling
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -h '%s'" % db_hostport
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if tunnel_bind_device:
                        bootstrap_args += " -t '%s'" % tunnel_bind_device
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # set some agent configuration
                    agent_cfg_dict["cores"] = number_cores
                    agent_cfg_dict["resource_cfg"] = resource_cfg
                    agent_cfg_dict["debug"] = os.environ.get(
                        "RADICAL_PILOT_AGENT_VERBOSE", logger.getEffectiveLevel()
                    )
                    agent_cfg_dict["mongodb_url"] = str(agent_dburl)
                    agent_cfg_dict["lrms"] = lrms
                    agent_cfg_dict["spawner"] = agent_spawner
                    agent_cfg_dict["scheduler"] = agent_scheduler
                    agent_cfg_dict["runtime"] = runtime
                    agent_cfg_dict["pilot_id"] = pilot_id
                    agent_cfg_dict["session_id"] = session_id
                    agent_cfg_dict["agent_launch_method"] = agent_launch_method
                    agent_cfg_dict["task_launch_method"] = task_launch_method
                    agent_cfg_dict["export_to_cu"] = export_to_cu
                    agent_cfg_dict["cu_pre_exec"] = cu_pre_exec
                    agent_cfg_dict["cu_post_exec"] = cu_post_exec
                    if mpi_launch_method:
                        agent_cfg_dict["mpi_launch_method"] = mpi_launch_method
                    if cores_per_node:
                        agent_cfg_dict["cores_per_node"] = cores_per_node

                    # ------------------------------------------------------
                    # Write agent config dict to a json file in pilot sandbox.

                    cfg_tmp_dir = tempfile.mkdtemp(prefix="rp_agent_cfg_dir")
                    agent_cfg_name = "agent_0.cfg"
                    cfg_tmp_file = os.path.join(cfg_tmp_dir, agent_cfg_name)
                    cfg_tmp_handle = os.open(cfg_tmp_file, os.O_WRONLY | os.O_CREAT)

                    # Convert dict to json file
                    msg = "Writing agent configuration to file '%s'." % cfg_tmp_file
                    logentries.append(Logentry(msg, logger=logger.debug))
                    ru.write_json(agent_cfg_dict, cfg_tmp_file)

                    cf_url = saga.Url("%s://localhost%s" % (LOCAL_SCHEME, cfg_tmp_file))

                    msg = "Copying agent configuration file '%s' to sandbox (%s)." \
                          % (cf_url, pilot_sandbox)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    if shared_filesystem:
                        sandbox_tgt.copy(cf_url, agent_cfg_name)

                    # Close agent config file
                    os.close(cfg_tmp_handle)

                    # ------------------------------------------------------
                    # Done with all transfers to pilot sandbox, close handle
                    sandbox_tgt.close()

                    # ------------------------------------------------------
                    # now that the scripts are in place and configured,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    # job services are cached per endpoint across pilots
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job

                    jd = saga.job.Description()

                    jd.executable = "/bin/bash"
                    jd.arguments = ["-l %s" % BOOTSTRAPPER_SCRIPT, bootstrap_args]
                    jd.working_directory = saga.Url(pilot_sandbox).path
                    jd.project = project
                    jd.output = "bootstrap_1.out"
                    jd.error = "bootstrap_1.err"
                    jd.total_cpu_count = number_cores
                    jd.processes_per_host = cores_per_node
                    jd.wall_time_limit = runtime
                    jd.total_physical_memory = memory
                    jd.queue = queue
                    jd.candidate_hosts = candidate_hosts
                    jd.environment = dict()

                    # TODO: not all files might be required, this also needs to be made conditional
                    if not shared_filesystem:
                        jd.file_transfer = [
                            #'%s > %s' % (bootstrapper_path, os.path.basename(bootstrapper_path)),
                            "%s > %s" % (
                                bootstrapper_path,
                                os.path.join(jd.working_directory, "input", os.path.basename(bootstrapper_path)),
                            ),
                            "%s > %s" % (cfg_tmp_file, os.path.join(jd.working_directory, "input", agent_cfg_name)),
                            #'%s < %s' % ('agent.log', os.path.join(jd.working_directory, 'agent.log')),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'agent.log'), 'agent.log'),
                            #'%s < %s' % ('agent.log', 'agent.log'),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'STDOUT'), 'unit.000000/STDOUT'),
                            #'%s < %s' % (os.path.join(jd.working_directory, 'unit.000000/STDERR'), 'STDERR')
                            #'%s < %s' % ('unit.000000/STDERR', 'unit.000000/STDERR')
                            # TODO: This needs to go into a per pilot directory on the submit node
                            "%s < %s" % ("pilot.0000.log.tgz", "pilot.0000.log.tgz"),
                        ]

                        if stage_sdist:
                            jd.file_transfer.extend(
                                [
                                    #'%s > %s' % (rp_sdist_path, os.path.basename(rp_sdist_path)),
                                    "%s > %s" % (
                                        rp_sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(rp_sdist_path)
                                        ),
                                    ),
                                    #'%s > %s' % (saga.sdist_path, os.path.basename(saga.sdist_path)),
                                    "%s > %s" % (
                                        saga.sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(saga.sdist_path)
                                        ),
                                    ),
                                    #'%s > %s' % (ru.sdist_path, os.path.basename(ru.sdist_path)),
                                    "%s > %s" % (
                                        ru.sdist_path,
                                        os.path.join(
                                            jd.working_directory, "input", os.path.basename(ru.sdist_path)
                                        ),
                                    ),
                                ]
                            )

                        if stage_cacerts:
                            jd.file_transfer.append(
                                "%s > %s" % (cc_path, os.path.join(jd.working_directory, "input", os.path.basename(cc_path)))
                            )

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            # TODO: This needs to go into a per pilot directory on the submit node
                            jd.file_transfer.append("%s < %s" % ("pilot.0000.prof.tgz", "pilot.0000.prof.tgz"))

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment["RADICAL_PILOT_PROFILE"] = "TRUE"

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    try:
                        pilotjob = js.create_job(jd)
                    except saga.BadParameter as e:
                        raise ValueError("Pilot submission to %s failed: %s" % (resource_key, e))

                    pilotjob.run()

                    # Clean up agent config file and dir after submission
                    os.unlink(cfg_tmp_file)
                    os.rmdir(cfg_tmp_dir)

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                    # The 'state: LAUNCHING' filter guards against racing
                    # with a pilot that has already moved on.
                    ts = time.time()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {
                            "$set": {
                                "state": PENDING_ACTIVE,
                                "saga_job_id": saga_job_id,
                                "health_check_enabled": health_check,
                                "agent_config": agent_cfg_dict,
                            },
                            "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already. Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {
                                "$set": {"saga_job_id": saga_job_id, "health_check_enabled": health_check},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = time.time()

                    # FIXME: we seem to be unable to bson/json handle saga
                    #        log messages containing an '#'. This shows up here.
                    #        Until we find a clean workaround, make log shorter and
                    #        rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(str(le.message))

                    # only flip to FAILED if not already FAILED ('$ne' guard)
                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {
                            "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                            "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                            "$pushAll": {"log": log_dicts},
                        },
                    )
                    logger.exception("\n".join(log_messages))

    except SystemExit as e:
        # a sys.exit() in this worker thread should take the whole
        # application down, not just this thread
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def _prepare_pilot(self, resource, rcfg, pilot): pid = pilot["uid"] ret = {'ft' : list(), 'jd' : None } # # ---------------------------------------------------------------------- # # the rcfg can contain keys with string expansion placeholders where # # values from the pilot description need filling in. A prominent # # example is `%(pd.project)s`, where the pilot description's `PROJECT` # # value needs to be filled in (here in lowercase). # expand = dict() # for k,v in pilot['description'].iteritems(): # if v is None: # v = '' # expand['pd.%s' % k] = v # if isinstance(v, basestring): # expand['pd.%s' % k.upper()] = v.upper() # expand['pd.%s' % k.lower()] = v.lower() # else: # expand['pd.%s' % k.upper()] = v # expand['pd.%s' % k.lower()] = v # # for k in rcfg: # if isinstance(rcfg[k], basestring): # orig = rcfg[k] # rcfg[k] = rcfg[k] % expand # expanded = rcfg[k] # if orig != expanded: # self._log.debug('RCFG:\n%s\n%s', orig, expanded) # ---------------------------------------------------------------------- # Database connection parameters sid = self._session.uid database_url = self._session.dburl # some default values are determined at runtime default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \ (resource, self._rp_version) # ---------------------------------------------------------------------- # pilot description and resource configuration number_cores = pilot['description']['cores'] number_gpus = pilot['description']['gpus'] runtime = pilot['description']['runtime'] queue = pilot['description']['queue'] project = pilot['description']['project'] cleanup = pilot['description']['cleanup'] memory = pilot['description']['memory'] candidate_hosts = pilot['description']['candidate_hosts'] # ---------------------------------------------------------------------- # get parameters from resource cfg, set defaults where needed agent_launch_method = rcfg.get('agent_launch_method') agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url) agent_spawner = 
rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER) rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG) agent_scheduler = rcfg.get('agent_scheduler') tunnel_bind_device = rcfg.get('tunnel_bind_device') default_queue = rcfg.get('default_queue') forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint') lrms = rcfg.get('lrms') mpi_launch_method = rcfg.get('mpi_launch_method', '') pre_bootstrap_0 = rcfg.get('pre_bootstrap_0', []) pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', []) python_interpreter = rcfg.get('python_interpreter') task_launch_method = rcfg.get('task_launch_method') rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION) virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE) virtenv = rcfg.get('virtenv', default_virtenv) cores_per_node = rcfg.get('cores_per_node', 0) gpus_per_node = rcfg.get('gpus_per_node', 0) lfs_path_per_node = rcfg.get('lfs_path_per_node', None) lfs_size_per_node = rcfg.get('lfs_size_per_node', 0) python_dist = rcfg.get('python_dist') virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST) cu_tmp = rcfg.get('cu_tmp') spmd_variation = rcfg.get('spmd_variation') shared_filesystem = rcfg.get('shared_filesystem', True) stage_cacerts = rcfg.get('stage_cacerts', False) cu_pre_exec = rcfg.get('cu_pre_exec') cu_post_exec = rcfg.get('cu_post_exec') export_to_cu = rcfg.get('export_to_cu') mandatory_args = rcfg.get('mandatory_args', []) saga_jd_supplement = rcfg.get('saga_jd_supplement', {}) import pprint self._log.debug(cores_per_node) self._log.debug(pprint.pformat(rcfg)) # make sure that mandatory args are known for ma in mandatory_args: if pilot['description'].get(ma) is None: raise ValueError('attribute "%s" is required for "%s"' % (ma, resource)) # get pilot and global sandbox resource_sandbox = self._session._get_resource_sandbox (pilot).path session_sandbox = self._session._get_session_sandbox(pilot).path pilot_sandbox = self._session._get_pilot_sandbox (pilot).path pilot['resource_sandbox'] = 
str(self._session._get_resource_sandbox(pilot)) pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot)) pilot['client_sandbox'] = str(self._session._get_client_sandbox()) # Agent configuration that is not part of the public API. # The agent config can either be a config dict, or # a string pointing to a configuration name. If neither # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is # set. The last fallback is 'agent_default' agent_config = pilot['description'].get('_config') if not agent_config: agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG') if not agent_config: agent_config = rc_agent_config if isinstance(agent_config, dict): # use dict as is agent_cfg = agent_config elif isinstance(agent_config, basestring): try: # interpret as a config name agent_cfg_file = os.path.join(self._conf_dir, "agent_%s.json" % agent_config) self._log.info("Read agent config file: %s", agent_cfg_file) agent_cfg = ru.read_json(agent_cfg_file) # allow for user level overload user_cfg_file = '%s/.radical/pilot/config/%s' \ % (os.environ['HOME'], os.path.basename(agent_cfg_file)) if os.path.exists(user_cfg_file): self._log.info("merging user config: %s" % user_cfg_file) user_cfg = ru.read_json(user_cfg_file) ru.dict_merge (agent_cfg, user_cfg, policy='overwrite') except Exception as e: self._log.exception("Error reading agent config file: %s" % e) raise else: # we can't handle this type raise TypeError('agent config must be string (config name) or dict') # expand variables in virtenv string virtenv = virtenv % {'pilot_sandbox' : pilot_sandbox, 'session_sandbox' : session_sandbox, 'resource_sandbox': resource_sandbox} # Check for deprecated global_virtenv if 'global_virtenv' in rcfg: raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource) # Create a host:port string for use by the bootstrap_0. 
db_url = rs.Url(agent_dburl) if db_url.port: db_hostport = "%s:%d" % (db_url.host, db_url.port) else: db_hostport = "%s:%d" % (db_url.host, 27017) # mongodb default # ---------------------------------------------------------------------- # the version of the agent is derived from # rp_version, which has the following format # and interpretation: # # case rp_version: # @<token>: # @tag/@branch/@commit: # no sdist staging # git clone $github_base radical.pilot.src # (cd radical.pilot.src && git checkout token) # pip install -t $VIRTENV/rp_install/ radical.pilot.src # rm -rf radical.pilot.src # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # release: # no sdist staging # pip install -t $VIRTENV/rp_install radical.pilot # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # local: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $VIRTENV/rp_install $sdist/ # export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH # # debug: # needs sdist staging # tar zxf $sdist.tgz # pip install -t $SANDBOX/rp_install $sdist/ # export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH # # installed: # no sdist staging # true # esac # # virtenv_mode # private : error if ve exists, otherwise create, then use # update : update if ve exists, otherwise create, then use # create : use if ve exists, otherwise create, then use # use : use if ve exists, otherwise error, then exit # recreate: delete if ve exists, otherwise create, then use # # examples : # [email protected] # virtenv@devel # virtenv@release # virtenv@installed # stage@local # stage@/tmp/my_agent.py # # Note that some combinations may be invalid, # specifically in the context of virtenv_mode. If, for # example, virtenv_mode is 'use', then the 'virtenv:tag' # will not make sense, as the virtenv is not updated. # In those cases, the virtenv_mode is honored, and # a warning is printed. # # Also, the 'stage' mode can only be combined with the # 'local' source, or with a path to the agent (relative # to root_dir, or absolute). 
# # A rp_version which does not adhere to the # above syntax is ignored, and the fallback stage@local # is used. if not rp_version.startswith('@') and \ not rp_version in ['installed', 'local', 'debug', 'release']: raise ValueError("invalid rp_version '%s'" % rp_version) if rp_version.startswith('@'): rp_version = rp_version[1:] # strip '@' # ---------------------------------------------------------------------- # sanity checks if not python_dist : raise RuntimeError("missing python distribution") if not virtenv_dist : raise RuntimeError("missing virtualenv distribution") if not agent_spawner : raise RuntimeError("missing agent spawner") if not agent_scheduler : raise RuntimeError("missing agent scheduler") if not lrms : raise RuntimeError("missing LRMS") if not agent_launch_method: raise RuntimeError("missing agentlaunch method") if not task_launch_method : raise RuntimeError("missing task launch method") # massage some values if not queue : queue = default_queue if cleanup and isinstance (cleanup, bool) : # l : log files # u : unit work dirs # v : virtualenv # e : everything (== pilot sandbox) if shared_filesystem: cleanup = 'luve' else: # we cannot clean the sandbox from within the agent, as the hop # staging would then fail, and we'd get nothing back. # FIXME: cleanup needs to be done by the pmgr.launcher, or # someone else, really, after fetching all logs and # profiles. 
cleanup = 'luv' # we never cleanup virtenvs which are not private if virtenv_mode is not 'private' : cleanup = cleanup.replace ('v', '') # add dists to staging files, if needed if rp_version in ['local', 'debug']: sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name] sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path] else: sdist_names = list() sdist_paths = list() # if cores_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if cores_per_node: cores_per_node = int(cores_per_node) number_cores = int(cores_per_node * math.ceil(float(number_cores) / cores_per_node)) # if gpus_per_node is set (!= None), then we need to # allocation full nodes, and thus round up if gpus_per_node: gpus_per_node = int(gpus_per_node) number_gpus = int(gpus_per_node * math.ceil(float(number_gpus) / gpus_per_node)) # set mandatory args bootstrap_args = "" bootstrap_args += " -d '%s'" % ':'.join(sdist_names) bootstrap_args += " -p '%s'" % pid bootstrap_args += " -s '%s'" % sid bootstrap_args += " -m '%s'" % virtenv_mode bootstrap_args += " -r '%s'" % rp_version bootstrap_args += " -b '%s'" % python_dist bootstrap_args += " -g '%s'" % virtenv_dist bootstrap_args += " -v '%s'" % virtenv bootstrap_args += " -y '%d'" % runtime # set optional args if lrms == "CCM": bootstrap_args += " -c" if forward_tunnel_endpoint: bootstrap_args += " -f '%s'" % forward_tunnel_endpoint if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport if python_interpreter: bootstrap_args += " -i '%s'" % python_interpreter if tunnel_bind_device: bootstrap_args += " -t '%s'" % tunnel_bind_device if cleanup: bootstrap_args += " -x '%s'" % cleanup for arg in pre_bootstrap_0: bootstrap_args += " -e '%s'" % arg for arg in pre_bootstrap_1: bootstrap_args += " -w '%s'" % arg agent_cfg['owner'] = 'agent_0' agent_cfg['cores'] = number_cores agent_cfg['gpus'] = number_gpus agent_cfg['lrms'] = lrms agent_cfg['spawner'] = agent_spawner agent_cfg['scheduler'] = 
agent_scheduler agent_cfg['runtime'] = runtime agent_cfg['dburl'] = str(database_url) agent_cfg['session_id'] = sid agent_cfg['pilot_id'] = pid agent_cfg['logdir'] = '.' agent_cfg['pilot_sandbox'] = pilot_sandbox agent_cfg['session_sandbox'] = session_sandbox agent_cfg['resource_sandbox'] = resource_sandbox agent_cfg['agent_launch_method']= agent_launch_method agent_cfg['task_launch_method'] = task_launch_method agent_cfg['mpi_launch_method'] = mpi_launch_method agent_cfg['cores_per_node'] = cores_per_node agent_cfg['gpus_per_node'] = gpus_per_node agent_cfg['lfs_path_per_node'] = lfs_path_per_node agent_cfg['lfs_size_per_node'] = lfs_size_per_node agent_cfg['cu_tmp'] = cu_tmp agent_cfg['export_to_cu'] = export_to_cu agent_cfg['cu_pre_exec'] = cu_pre_exec agent_cfg['cu_post_exec'] = cu_post_exec agent_cfg['resource_cfg'] = copy.deepcopy(rcfg) agent_cfg['debug'] = self._log.getEffectiveLevel() # we'll also push the agent config into MongoDB pilot['cfg'] = agent_cfg # ---------------------------------------------------------------------- # Write agent config dict to a json file in pilot sandbox. 
agent_cfg_name = 'agent_0.cfg' cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.') os.close(cfg_tmp_handle) # file exists now # Convert dict to json file self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file) self._log.debug(pprint.pformat(agent_cfg)) ru.write_json(agent_cfg, cfg_tmp_file) ret['ft'].append({'src' : cfg_tmp_file, 'tgt' : '%s/%s' % (pilot_sandbox, agent_cfg_name), 'rem' : True}) # purge the tmp file after packing # ---------------------------------------------------------------------- # we also touch the log and profile tarballs in the target pilot sandbox ret['ft'].append({'src' : '/dev/null', 'tgt' : '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid), 'rem' : False}) # don't remove /dev/null # only stage profiles if we profile if self._prof.enabled: ret['ft'].append({ 'src' : '/dev/null', 'tgt' : '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid), 'rem' : False}) # don't remove /dev/null # check if we have a sandbox cached for that resource. If so, we have # nothing to do. Otherwise we create the sandbox and stage the RP # stack etc. # NOTE: this will race when multiple pilot launcher instances are used! with self._cache_lock: if resource not in self._sandboxes: for sdist in sdist_paths: base = os.path.basename(sdist) ret['ft'].append({'src' : sdist, 'tgt' : '%s/%s' % (session_sandbox, base), 'rem' : False}) # Copy the bootstrap shell script. bootstrapper_path = os.path.abspath("%s/agent/%s" % (self._root_dir, BOOTSTRAPPER_0)) self._log.debug("use bootstrapper %s", bootstrapper_path) ret['ft'].append({'src' : bootstrapper_path, 'tgt' : '%s/%s' % (session_sandbox, BOOTSTRAPPER_0), 'rem' : False}) # Some machines cannot run pip due to outdated CA certs. # For those, we also stage an updated certificate bundle # TODO: use booleans all the way? 
if stage_cacerts: cc_name = 'cacert.pem.gz' cc_path = os.path.abspath("%s/agent/%s" % (self._root_dir, cc_name)) self._log.debug("use CAs %s", cc_path) ret['ft'].append({'src' : cc_path, 'tgt' : '%s/%s' % (session_sandbox, cc_name), 'rem' : False}) self._sandboxes[resource] = True # ---------------------------------------------------------------------- # Create SAGA Job description and submit the pilot job jd = rs.job.Description() if shared_filesystem: bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0) else: bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0) jd.name = pid jd.executable = "/bin/bash" jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args] jd.working_directory = pilot_sandbox jd.project = project jd.output = "bootstrap_0.out" jd.error = "bootstrap_0.err" jd.total_cpu_count = number_cores jd.total_gpu_count = number_gpus jd.processes_per_host = cores_per_node jd.spmd_variation = spmd_variation jd.wall_time_limit = runtime jd.total_physical_memory = memory jd.queue = queue jd.candidate_hosts = candidate_hosts jd.environment = dict() # we set any saga_jd_supplement keys which are not already set above for key, val in saga_jd_supplement.iteritems(): if not jd[key]: self._log.debug('supplement %s: %s', key, val) jd[key] = val if 'RADICAL_PILOT_PROFILE' in os.environ : jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE' # for condor backends and the like which do not have shared FSs, we add # additional staging directives so that the backend system binds the # files from the session and pilot sandboxes to the pilot job. 
jd.file_transfer = list() if not shared_filesystem: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0), 'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name, agent_cfg_name), 'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid) ]) if 'RADICAL_PILOT_PROFILE' in os.environ: jd.file_transfer.extend([ 'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid, pid), 'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid, pid) ]) for sdist in sdist_names: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, sdist, sdist) ]) if stage_cacerts: jd.file_transfer.extend([ 'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name) ]) self._log.debug("Bootstrap command line: %s %s", jd.executable, jd.arguments) ret['jd'] = jd return ret
def run(self):
    """Main loop of the pilot launcher thread (entry point for Process.start()).

    Polls MongoDB for ComputePilots in PENDING_LAUNCH state, prepares each
    pilot's remote sandbox (bootstrapper script, sdists, agent config),
    submits one SAGA job per pilot, and records all state transitions in
    the database.  Loops until ``self._terminate`` is set.

    Fixes relative to the previous revision:
      * ``virtenv_mode is not 'private'`` used string *identity* instead of
        equality -- replaced with ``!=``;
      * ``'release'`` was documented and handled as a valid ``rp_version``
        but rejected by the validation check -- added to the valid set;
      * corrected the garbled "missing agentlaunch method" error message.
    """
    global JOB_CHECK_INTERVAL

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            db        = self._session.get_db()
            pilot_col = db["%s.p" % self._session.uid]
            logger.debug(
                "Connected to MongoDB. Serving requests for PilotManager %s."
                % self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._terminate.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue. If that
            # is not the case, we assume that the job has failed for some
            # reasons and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            if self._disabled.is_set():
                # don't process any new pilot start requests.
                # NOTE: this is not clean, in principle there could be other
                #       launchers alive which want to still start those
                #       pending pilots.  In practice we only ever use one
                #       pmgr though, and its during its shutdown that we get
                #       here...
                ts = timestamp()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id,
                           "state"       : PENDING_LAUNCH},
                    update={"$set" : {"state": CANCELED},
                            "$push": {"statehistory":
                                      {"state"    : CANCELED,
                                       "timestamp": ts}}})

                # run state checks more frequently.
                JOB_CHECK_INTERVAL = 3
                time.sleep(1)
                continue

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job
            # service, a job description and a job that is then send to the
            # local or remote queueing system.  If this succedes, we set
            # the ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = timestamp()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state"       : PENDING_LAUNCH},
                update={"$set" : {"state": LAUNCHING},
                        "$push": {"statehistory":
                                  {"state"    : LAUNCHING,
                                   "timestamp": ts}}})

            if not compute_pilot:
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id   = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_id   = self._session.uid
                    database_url = self._session.dburl

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores   = compute_pilot['description']['cores']
                    runtime        = compute_pilot['description']['runtime']
                    queue          = compute_pilot['description']['queue']
                    project        = compute_pilot['description']['project']
                    cleanup        = compute_pilot['description']['cleanup']
                    resource_key   = compute_pilot['description']['resource']
                    schema         = compute_pilot['description']['access_schema']
                    memory         = compute_pilot['description']['memory']
                    pilot_sandbox  = compute_pilot['sandbox']
                    global_sandbox = compute_pilot['global_sandbox']

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    resource_cfg = self._session.get_resource_config(
                        resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_launch_method = resource_cfg.get(
                        'agent_launch_method')
                    agent_dburl = resource_cfg.get(
                        'agent_mongodb_endpoint', database_url)
                    agent_spawner = resource_cfg.get(
                        'agent_spawner', DEFAULT_AGENT_SPAWNER)
                    agent_type = resource_cfg.get('agent_type',
                                                  DEFAULT_AGENT_TYPE)
                    rc_agent_config = resource_cfg.get(
                        'agent_config', DEFAULT_AGENT_CONFIG)
                    agent_scheduler = resource_cfg.get('agent_scheduler')
                    tunnel_bind_device = resource_cfg.get(
                        'tunnel_bind_device')
                    default_queue = resource_cfg.get('default_queue')
                    forward_tunnel_endpoint = resource_cfg.get(
                        'forward_tunnel_endpoint')
                    js_endpoint = resource_cfg.get('job_manager_endpoint')
                    lrms = resource_cfg.get('lrms')
                    mpi_launch_method = resource_cfg.get(
                        'mpi_launch_method')
                    pre_bootstrap_1 = resource_cfg.get('pre_bootstrap_1')
                    pre_bootstrap_2 = resource_cfg.get('pre_bootstrap_2')
                    python_interpreter = resource_cfg.get(
                        'python_interpreter')
                    spmd_variation = resource_cfg.get('spmd_variation')
                    task_launch_method = resource_cfg.get(
                        'task_launch_method')
                    rp_version = resource_cfg.get('rp_version',
                                                  DEFAULT_RP_VERSION)
                    virtenv_mode = resource_cfg.get(
                        'virtenv_mode', DEFAULT_VIRTENV_MODE)
                    virtenv = resource_cfg.get('virtenv',
                                               DEFAULT_VIRTENV)
                    stage_cacerts = resource_cfg.get(
                        'stage_cacerts', 'False')
                    cores_per_node = resource_cfg.get('cores_per_node')

                    # Agent configuration that is not part of the public
                    # API.  The agent config can either be a config dict, or
                    # a string pointing to a configuration name.  If neither
                    # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
                    # set.  The last fallback is 'agent_default'
                    agent_config = compute_pilot['description'].get(
                        '_config')
                    if not agent_config:
                        agent_config = os.environ.get(
                            'RADICAL_PILOT_AGENT_CONFIG')
                    if not agent_config:
                        agent_config = rc_agent_config

                    if isinstance(agent_config, dict):
                        # nothing to do
                        agent_cfg_dict = agent_config

                    elif isinstance(agent_config, basestring):
                        try:
                            if os.path.exists(agent_config):
                                # try to open as file name
                                logger.info("Read agent config file: %s"
                                            % agent_config)
                                agent_cfg_dict = ru.read_json(agent_config)
                            else:
                                # otherwise interpret as a config name
                                # FIXME: load in session just like resource
                                #        configs, including user level
                                #        overloads
                                module_path = os.path.dirname(
                                    os.path.abspath(__file__))
                                config_path = "%s/../configs/" % module_path
                                agent_cfg_file = os.path.join(
                                    config_path,
                                    "agent_%s.json" % agent_config)
                                logger.info("Read agent config file: %s"
                                            % agent_cfg_file)
                                agent_cfg_dict = ru.read_json(
                                    agent_cfg_file)
                        except Exception as e:
                            logger.exception(
                                "Error reading agent config file: %s" % e)
                            raise

                    else:
                        # we can't handle this type
                        raise TypeError(
                            'agent config must be string (filename) or dict'
                        )

                    # TODO: use booleans all the way?
                    if stage_cacerts.lower() == 'true':
                        stage_cacerts = True
                    else:
                        stage_cacerts = False

                    # expand variables in virtenv string
                    virtenv = virtenv % {
                        'pilot_sandbox' : saga.Url(pilot_sandbox).path,
                        'global_sandbox': saga.Url(global_sandbox).path
                    }

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get('global_virtenv')
                    if global_virtenv:
                        logger.warn(
                            "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                        )
                        virtenv      = global_virtenv
                        virtenv_mode = 'use'

                    # Create a host:port string for use by the bootstrap_1.
                    db_url = saga.Url(agent_dburl)
                    if db_url.port:
                        db_hostport = "%s:%d" % (db_url.host, db_url.port)
                    else:
                        # mongodb default port
                        db_hostport = "%s:%d" % (db_url.host, 27017)

                    # Open the remote sandbox
                    sandbox_tgt = saga.filesystem.Directory(
                        pilot_sandbox,
                        session=self._session,
                        flags=saga.filesystem.CREATE_PARENTS)

                    BOOTSTRAPPER_SCRIPT = "bootstrap_1.sh"
                    LOCAL_SCHEME        = 'file'

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We use always "default_bootstrapper.sh"
                    # TODO: Is this still configurable and/or in the
                    #       resource configs?
                    bootstrapper = "default_bootstrapper.sh"
                    bootstrapper_path = os.path.abspath(
                        "%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url(
                        "%s://localhost%s" % (LOCAL_SCHEME,
                                              bootstrapper_path))

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                        % (bs_script_url, sandbox_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    sandbox_tgt.copy(bs_script_url, BOOTSTRAPPER_SCRIPT)

                    # ------------------------------------------------------
                    # The version of the agent is derived from rp_version,
                    # which has the following format and interpretation:
                    #
                    # case rp_version:
                    #   @<token> (@tag/@branch/@commit):
                    #     no sdist staging; git clone + checkout token,
                    #     pip install into $VIRTENV/rp_install/
                    #   release:
                    #     no sdist staging; pip install radical.pilot into
                    #     $VIRTENV/rp_install
                    #   local:
                    #     sdist staging; pip install the staged sdist into
                    #     $VIRTENV/rp_install
                    #   debug:
                    #     sdist staging; pip install the staged sdist into
                    #     $SANDBOX/rp_install
                    #   installed:
                    #     no sdist staging; use stack as installed
                    # esac
                    #
                    # virtenv_mode
                    #   private : error if ve exists, otherwise create, use
                    #   update  : update if ve exists, otherwise create, use
                    #   create  : use if ve exists, otherwise create, use
                    #   use     : use if ve exists, otherwise error, exit
                    #   recreate: delete if ve exists, create, use
                    #
                    # Some combinations may be invalid in the context of
                    # virtenv_mode; in those cases the virtenv_mode is
                    # honored and a warning is printed.  An rp_version which
                    # does not adhere to the above syntax is ignored and the
                    # fallback stage@local is used.

                    # NOTE: 'release' added to the valid set -- it is
                    #       documented above and handled by the stage_sdist
                    #       logic below, but was previously rejected here.
                    if not rp_version.startswith('@') and \
                       not rp_version in ['installed', 'release',
                                          'local', 'debug']:
                        raise ValueError("invalid rp_version '%s'"
                                         % rp_version)

                    stage_sdist = True
                    if rp_version in ['installed', 'release']:
                        stage_sdist = False

                    if rp_version.startswith('@'):
                        stage_sdist = False
                        rp_version  = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for sdist_path in [ru.sdist_path,
                                           saga.sdist_path,
                                           rp_sdist_path]:
                            sdist_url = saga.Url(
                                "%s://localhost%s" % (LOCAL_SCHEME,
                                                      sdist_path))
                            msg = "Copying sdist '%s' to sandbox (%s)." \
                                % (sdist_url, pilot_sandbox)
                            logentries.append(
                                Logentry(msg, logger=logger.debug))
                            sandbox_tgt.copy(
                                sdist_url,
                                os.path.basename(str(sdist_url)))

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated certificate bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath(
                            "%s/../bootstrapper/%s"
                            % (mod_dir, 'cacert.pem.gz'))
                        cc_url = saga.Url("%s://localhost/%s"
                                          % (LOCAL_SCHEME, cc_path))
                        msg = "Copying CA certificate bundle '%s' to sandbox (%s)." % (
                            cc_url, pilot_sandbox)
                        logentries.append(
                            Logentry(msg, logger=logger.debug))
                        sandbox_tgt.copy(cc_url,
                                         os.path.basename(str(cc_url)))

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not agent_launch_method:
                        raise RuntimeError("missing agent launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")

                    # massage some values
                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = 'luve'  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)
                        #
                        # we never cleanup virtenvs which are not private
                        # (FIX: was an identity check `is not 'private'`)
                        if virtenv_mode != 'private':
                            cleanup = cleanup.replace('v', '')

                    sdists = ':'.join([ru.sdist_name, saga.sdist_name,
                                       rp_sdist_name])

                    # if cores_per_node is set (!= None), then we need to
                    # allocation full nodes, and thus round up
                    if cores_per_node:
                        cores_per_node = int(cores_per_node)
                        number_cores = int(cores_per_node * math.ceil(
                            float(number_cores) / cores_per_node))

                    # set mandatory args
                    bootstrap_args  = ""
                    bootstrap_args += " -d '%s'" % sdists
                    bootstrap_args += " -m '%s'" % virtenv_mode
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -r '%s'" % rp_version
                    bootstrap_args += " -s '%s'" % session_id
                    bootstrap_args += " -v '%s'" % virtenv

                    # set optional args
                    if agent_type:
                        bootstrap_args += " -a '%s'" % agent_type
                    if lrms == "CCM":
                        bootstrap_args += " -c"
                    if pre_bootstrap_1:
                        bootstrap_args += " -e '%s'" % "' -e '".join(
                            pre_bootstrap_1)
                    if pre_bootstrap_2:
                        bootstrap_args += " -w '%s'" % "' -w '".join(
                            pre_bootstrap_2)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    # NOTE: db_hostport is only passed when tunneling is
                    #       active -- presumably deliberate, as the agent
                    #       otherwise uses the configured db endpoint
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -h '%s'" % db_hostport
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if tunnel_bind_device:
                        bootstrap_args += " -t '%s'" % tunnel_bind_device
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # set some agent configuration
                    agent_cfg_dict['cores'] = number_cores
                    agent_cfg_dict['debug'] = os.environ.get(
                        'RADICAL_PILOT_AGENT_VERBOSE',
                        logger.getEffectiveLevel())
                    agent_cfg_dict['mongodb_url'] = str(agent_dburl)
                    agent_cfg_dict['lrms']        = lrms
                    agent_cfg_dict['spawner']     = agent_spawner
                    agent_cfg_dict['scheduler']   = agent_scheduler
                    agent_cfg_dict['runtime']     = runtime
                    agent_cfg_dict['pilot_id']    = pilot_id
                    agent_cfg_dict['session_id']  = session_id
                    agent_cfg_dict[
                        'agent_launch_method'] = agent_launch_method
                    agent_cfg_dict[
                        'task_launch_method'] = task_launch_method
                    agent_cfg_dict['mpi_launch_method'] = mpi_launch_method
                    if cores_per_node:
                        agent_cfg_dict['cores_per_node'] = cores_per_node

                    # ------------------------------------------------------
                    # Write agent config dict to a json file in pilot
                    # sandbox.
                    cfg_tmp_handle, cf_tmp_file = tempfile.mkstemp(
                        suffix='.json', prefix='rp_agent_cfg_')

                    # Convert dict to json file
                    msg = "Writing agent configuration to file '%s'." \
                        % cf_tmp_file
                    logentries.append(Logentry(msg, logger=logger.debug))

                    ru.write_json(agent_cfg_dict, cf_tmp_file)

                    cf_url = saga.Url("%s://localhost%s"
                                      % (LOCAL_SCHEME, cf_tmp_file))

                    msg = "Copying agent configuration file '%s' to sandbox (%s)." % (
                        cf_url, pilot_sandbox)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    sandbox_tgt.copy(cf_url, 'agent_0.cfg')

                    # close and remove temp file
                    os.close(cfg_tmp_handle)
                    os.unlink(cf_tmp_file)

                    # ------------------------------------------------------
                    # Done with all transfers to pilot sandbox, close handle
                    sandbox_tgt.close()

                    # ------------------------------------------------------
                    # now that the scripts are in place and configured,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data['job_services']:
                        js = self._shared_worker_data['job_services'][
                            js_url]
                    else:
                        js = saga.job.Service(js_url,
                                              session=self._session)
                        self._shared_worker_data['job_services'][
                            js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable            = "/bin/bash"
                    jd.arguments             = ["-l %s" % BOOTSTRAPPER_SCRIPT,
                                                bootstrap_args]
                    jd.working_directory     = saga.Url(pilot_sandbox).path
                    jd.project               = project
                    jd.output                = "bootstrap_1.out"
                    jd.error                 = "bootstrap_1.err"
                    jd.total_cpu_count       = number_cores
                    jd.processes_per_host    = cores_per_node
                    jd.wall_time_limit       = runtime
                    jd.total_physical_memory = memory
                    jd.queue                 = queue
                    jd.environment           = dict()

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if 'RADICAL_PILOT_PROFILE' in os.environ:
                        jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

                    logger.debug("Bootstrap command line: %s %s"
                                 % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(
                        jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data['job_ids'][pilot_id] = [
                        saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(
                        saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful.
                    ts = timestamp()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {"$set": {"state"       : PENDING_ACTIVE,
                                  "saga_job_id" : saga_job_id,
                                  "agent_config": agent_cfg_dict},
                         "$push": {"statehistory":
                                   {"state"    : PENDING_ACTIVE,
                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    if ret['n'] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set": {"saga_job_id": saga_job_id},
                             "$push": {"statehistory":
                                       {"state"    : PENDING_ACTIVE,
                                        "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}})

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(
                        pilot_col, pilot_id)
                    ts = timestamp()

                    # FIXME: we seem to be unable to bson/json handle saga
                    #        log messages containing an '#'.  This shows up
                    #        here.  Until we find a clean workaround, make
                    #        log shorter and rely on saga logging to reveal
                    #        the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts    = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {"$set": {"state"  : FAILED,
                                  "stdout" : out,
                                  "stderr" : err,
                                  "logfile": log},
                         "$push": {"statehistory":
                                   {"state"    : FAILED,
                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    logger.exception('\n'.join(log_messages))

    except SystemExit as e:
        logger.exception(
            "pilot launcher thread caught system exit -- forcing application shutdown"
        )
        import thread
        thread.interrupt_main()
def submit_units(self, descriptions):
    """Submit one or more compute units to this unit manager.

    **Arguments:**
        * **descriptions** [:class:`radical.pilot.ComputeUnitDescription`
          or list of :class:`radical.pilot.ComputeUnitDescription`]:
          The description of the compute unit instance(s) to create.

    **Returns:**
        * A list of :class:`radical.pilot.ComputeUnit` objects (or a
          single object when a single description was given).
    """

    from .compute_unit import ComputeUnit

    self.is_valid()

    # remember whether the caller passed a single description, so that the
    # return value can mirror the input shape
    single = not isinstance(descriptions, list)
    if single:
        descriptions = [descriptions]

    if not descriptions:
        raise ValueError('cannot submit no unit descriptions')

    self._rep.info('<<submit %d unit(s)\n\t' % len(descriptions))

    # create one ComputeUnit per description
    submitted = list()
    for descr in descriptions:

        if not descr.executable:
            raise ValueError('compute unit executable must be defined')

        cu = ComputeUnit(umgr=self, descr=descr)
        submitted.append(cu)

        # register the new unit with this manager
        with self._units_lock:
            self._units[cu.uid] = cu

        # record the description when session recording is enabled
        if self._session._rec:
            ru.write_json(descr.as_dict(), "%s/%s.batch.%03d.json"
                          % (self._session._rec, cu.uid, self._rec_id))
        self._rep.progress()

    if self._session._rec:
        self._rec_id += 1

    # insert units into the database, as a bulk.
    docs = [cu.as_dict() for cu in submitted]
    self._session._dbs.insert_units(docs)

    # Only after the insert can we hand the units over to the next
    # components (ie. advance state).
    self.advance(docs, rps.UMGR_SCHEDULING_PENDING,
                 publish=True, push=True)

    self._rep.ok('>>ok\n')

    if single:
        return submitted[0]
    return submitted
# Assemble one bulk of task submissions: fetch up to bulk_size tasks,
# write each task's arguments to a JSON input file, and build one
# ComputeUnitDescription per task.
# NOTE(review): 'cuds' is created but not appended to within this
#               fragment -- presumably done further below; confirm.
bulk_size = 500
bulk_id   = '%s.%04d' % (session.uid, bulk)
report.info('handle bulk %s (%d)\n\t' % (bulk_id, bulk_size))

cuds  = list()
# fetch the next batch of tasks to execute (schema with 'data' /
# 'spec' / 'args' keys -- assumed to be a QCFractal-style task queue;
# TODO confirm against the service)
tasks = fetch_tasks(bulk_size=bulk_size)
for task in tasks['data']:

    args = task['spec']['args'][0]
    prog = args['program']
    tid  = task['id']
    # per-task input/output JSON files in the shared sandbox
    fin  = '%s/%s.in.json'  % (sandbox, tid)
    fout = '%s/%s.out.json' % (sandbox, tid)

    # persist the task arguments as the unit's input file
    ru.write_json(args, fin)

    # one compute unit per task: run qcengine on the input file and
    # stage its STDOUT back as the task's output JSON
    cud            = rp.ComputeUnitDescription()
    cud.executable = '/home/dgasmith/miniconda/envs/qcf/bin/qcengine'
    cud.arguments  = [prog, fin]
    cud.name       = tid
    cud.metadata   = {'fout': fout}

    cud.input_staging  = [fin]
    cud.output_staging = {'source': 'unit:///STDOUT',
                          'target': '%s' % fout,
                          'action': rp.TRANSFER}

    # single-core, CPU-only units
    cud.gpu_processes = 0
    cud.cpu_processes = 1
    cud.cpu_threads   = 1
def _prepare_pilot(self, resource, rcfg, pilot):
    """
    Assemble everything needed to launch one pilot on `resource`:

    - collect parameters from the resource config `rcfg` and from the
      pilot description (`pilot['description']`),
    - resolve and write the agent configuration,
    - compose the bootstrapper command line,
    - create the SAGA job description for the bootstrapper.

    Returns a dict:
        {'ft': <list of file transfer directives (src/tgt/rem dicts)>,
         'jd': <saga.job.Description for the bootstrapper>}

    Raises ValueError / RuntimeError / TypeError on invalid or incomplete
    configuration.
    """

    pid = pilot["uid"]
    ret = {'ft': list(), 'jd': None}

    # ------------------------------------------------------------------
    # Database connection parameters
    sid          = self._session.uid
    database_url = self._session.dburl

    # some default values are determined at runtime
    default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                      (resource, self._rp_version)

    # ------------------------------------------------------------------
    # get parameters from resource cfg, set defaults where needed
    agent_launch_method     = rcfg.get('agent_launch_method')
    agent_dburl             = rcfg.get('agent_mongodb_endpoint', database_url)
    agent_spawner           = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
    rc_agent_config         = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
    agent_scheduler         = rcfg.get('agent_scheduler')
    tunnel_bind_device      = rcfg.get('tunnel_bind_device')
    default_queue           = rcfg.get('default_queue')
    forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
    lrms                    = rcfg.get('lrms')
    mpi_launch_method       = rcfg.get('mpi_launch_method', '')
    pre_bootstrap_1         = rcfg.get('pre_bootstrap_1', [])
    pre_bootstrap_2         = rcfg.get('pre_bootstrap_2', [])
    python_interpreter      = rcfg.get('python_interpreter')
    task_launch_method      = rcfg.get('task_launch_method')
    rp_version              = rcfg.get('rp_version', DEFAULT_RP_VERSION)
    virtenv_mode            = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
    virtenv                 = rcfg.get('virtenv', default_virtenv)
    cores_per_node          = rcfg.get('cores_per_node', 0)
    health_check            = rcfg.get('health_check', True)  # NOTE: read but unused here
    python_dist             = rcfg.get('python_dist')
    virtenv_dist            = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
    cu_tmp                  = rcfg.get('cu_tmp')
    spmd_variation          = rcfg.get('spmd_variation')
    shared_filesystem       = rcfg.get('shared_filesystem', True)
    stage_cacerts           = rcfg.get('stage_cacerts', False)
    cu_pre_exec             = rcfg.get('cu_pre_exec')
    cu_post_exec            = rcfg.get('cu_post_exec')
    export_to_cu            = rcfg.get('export_to_cu')
    mandatory_args          = rcfg.get('mandatory_args', [])

    # ------------------------------------------------------------------
    # get parameters from the pilot description
    number_cores    = pilot['description']['cores']
    runtime         = pilot['description']['runtime']
    queue           = pilot['description']['queue']
    project         = pilot['description']['project']
    cleanup         = pilot['description']['cleanup']
    memory          = pilot['description']['memory']
    candidate_hosts = pilot['description']['candidate_hosts']

    # make sure that mandatory args are known
    for ma in mandatory_args:
        if pilot['description'].get(ma) is None:
            raise ValueError('attribute "%s" is required for "%s"'
                            % (ma, resource))

    # get pilot and global sandbox
    resource_sandbox = self._session._get_resource_sandbox(pilot).path
    session_sandbox  = self._session._get_session_sandbox(pilot).path
    pilot_sandbox    = self._session._get_pilot_sandbox(pilot).path

    pilot['resource_sandbox'] = str(self._session._get_resource_sandbox(pilot))
    pilot['pilot_sandbox']    = str(self._session._get_pilot_sandbox(pilot))
    pilot['client_sandbox']   = str(self._session._get_client_sandbox())

    # Agent configuration that is not part of the public API.
    # The agent config can either be a config dict, or a string pointing to
    # a configuration name.  If neither is given, check if
    # 'RADICAL_PILOT_AGENT_CONFIG' is set.  The last fallback is the
    # resource config's 'agent_config' entry.
    agent_config = pilot['description'].get('_config')
    if not agent_config:
        agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
    if not agent_config:
        agent_config = rc_agent_config

    if isinstance(agent_config, dict):
        # use dict as is
        agent_cfg = agent_config

    elif isinstance(agent_config, basestring):
        try:
            # interpret as a config name
            agent_cfg_file = os.path.join(self._conf_dir,
                                          "agent_%s.json" % agent_config)
            self._log.info("Read agent config file: %s", agent_cfg_file)
            agent_cfg = ru.read_json(agent_cfg_file)

            # allow for user level overload
            user_cfg_file = '%s/.radical/pilot/config/%s' \
                          % (os.environ['HOME'],
                             os.path.basename(agent_cfg_file))
            if os.path.exists(user_cfg_file):
                self._log.info("merging user config: %s" % user_cfg_file)
                user_cfg = ru.read_json(user_cfg_file)
                ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

        except Exception as e:
            self._log.exception("Error reading agent config file: %s" % e)
            raise

    else:
        # we can't handle this type
        raise TypeError('agent config must be string (config name) or dict')

    # expand variables in virtenv string
    virtenv = virtenv % {'pilot_sandbox'   : pilot_sandbox,
                         'session_sandbox' : session_sandbox,
                         'resource_sandbox': resource_sandbox}

    # Check for deprecated global_virtenv
    if 'global_virtenv' in rcfg:
        raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)

    # Create a host:port string for use by the bootstrap_1.
    db_url = rs.Url(agent_dburl)
    if db_url.port:
        db_hostport = "%s:%d" % (db_url.host, db_url.port)
    else:
        db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

    # ------------------------------------------------------------------
    # the version of the agent is derived from rp_version, which has the
    # following format and interpretation:
    #
    # case rp_version:
    #   @<token>:
    #   @tag/@branch/@commit: # no sdist staging
    #       git clone $github_base radical.pilot.src
    #       (cd radical.pilot.src && git checkout token)
    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
    #       rm -rf radical.pilot.src
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   release: # no sdist staging
    #       pip install -t $VIRTENV/rp_install radical.pilot
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   local: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $VIRTENV/rp_install $sdist/
    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
    #
    #   debug: # needs sdist staging
    #       tar zxf $sdist.tgz
    #       pip install -t $SANDBOX/rp_install $sdist/
    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
    #
    #   installed: # no sdist staging
    #       true
    # esac
    #
    # virtenv_mode
    #   private : error  if ve exists, otherwise create, then use
    #   update  : update if ve exists, otherwise create, then use
    #   create  : use    if ve exists, otherwise create, then use
    #   use     : use    if ve exists, otherwise error,  then exit
    #   recreate: delete if ve exists, otherwise create, then use
    #
    # examples:
    #   [email protected]
    #   virtenv@devel
    #   virtenv@release
    #   virtenv@installed
    #   stage@local
    #   stage@/tmp/my_agent.py
    #
    # Note that some combinations may be invalid, specifically in the
    # context of virtenv_mode.  If, for example, virtenv_mode is 'use',
    # then the 'virtenv:tag' will not make sense, as the virtenv is not
    # updated.  In those cases, the virtenv_mode is honored, and a warning
    # is printed.
    #
    # Also, the 'stage' mode can only be combined with the 'local' source,
    # or with a path to the agent (relative to root_dir, or absolute).
    #
    # A rp_version which does not adhere to the above syntax is ignored,
    # and the fallback stage@local is used.
    if not rp_version.startswith('@') and \
           rp_version not in ['installed', 'local', 'debug', 'release']:
        raise ValueError("invalid rp_version '%s'" % rp_version)

    if rp_version.startswith('@'):
        rp_version = rp_version[1:]  # strip '@'

    # ------------------------------------------------------------------
    # sanity checks
    if not python_dist:
        raise RuntimeError("missing python distribution")
    if not virtenv_dist:
        raise RuntimeError("missing virtualenv distribution")
    if not agent_spawner:
        raise RuntimeError("missing agent spawner")
    if not agent_scheduler:
        raise RuntimeError("missing agent scheduler")
    if not lrms:
        raise RuntimeError("missing LRMS")
    if not agent_launch_method:
        # FIX: message used to read "missing agentlaunch method"
        raise RuntimeError("missing agent launch method")
    if not task_launch_method:
        raise RuntimeError("missing task launch method")

    # massage some values
    if not queue:
        queue = default_queue

    if cleanup and isinstance(cleanup, bool):
        # l : log files
        # u : unit work dirs
        # v : virtualenv
        # e : everything (== pilot sandbox)
        if shared_filesystem:
            cleanup = 'luve'
        else:
            # we cannot clean the sandbox from within the agent, as the hop
            # staging would then fail, and we'd get nothing back.
            # FIXME: cleanup needs to be done by the pmgr.launcher, or
            #        someone else, really, after fetching all logs and
            #        profiles.
            cleanup = 'luv'

        # we never cleanup virtenvs which are not private
        # FIX: was `virtenv_mode is not 'private'` -- identity comparison
        #      against a string literal is implementation dependent
        if virtenv_mode != 'private':
            cleanup = cleanup.replace('v', '')

    # add dists to staging files, if needed
    if rp_version in ['local', 'debug']:
        sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
        sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
    else:
        sdist_names = list()
        sdist_paths = list()

    # if cores_per_node is set (!= None), then we need to allocate full
    # nodes, and thus round up
    if cores_per_node:
        cores_per_node = int(cores_per_node)
        number_cores   = int(cores_per_node
                       * math.ceil(float(number_cores) / cores_per_node))

    # set mandatory args
    bootstrap_args  = ""
    bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
    bootstrap_args += " -p '%s'" % pid
    bootstrap_args += " -s '%s'" % sid
    bootstrap_args += " -m '%s'" % virtenv_mode
    bootstrap_args += " -r '%s'" % rp_version
    bootstrap_args += " -b '%s'" % python_dist
    bootstrap_args += " -g '%s'" % virtenv_dist
    bootstrap_args += " -v '%s'" % virtenv
    bootstrap_args += " -y '%d'" % runtime

    # set optional args
    if lrms == "CCM":
        bootstrap_args += " -c"
    if forward_tunnel_endpoint:
        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
    if forward_tunnel_endpoint:
        # the db host:port is only forwarded when tunneling is requested
        bootstrap_args += " -h '%s'" % db_hostport
    if python_interpreter:
        bootstrap_args += " -i '%s'" % python_interpreter
    if tunnel_bind_device:
        bootstrap_args += " -t '%s'" % tunnel_bind_device
    if cleanup:
        bootstrap_args += " -x '%s'" % cleanup

    for arg in pre_bootstrap_1:
        bootstrap_args += " -e '%s'" % arg
    for arg in pre_bootstrap_2:
        bootstrap_args += " -w '%s'" % arg

    # complete the agent configuration
    agent_cfg['owner']               = 'agent_0'
    agent_cfg['cores']               = number_cores
    agent_cfg['lrms']                = lrms
    agent_cfg['spawner']             = agent_spawner
    agent_cfg['scheduler']           = agent_scheduler
    agent_cfg['runtime']             = runtime
    agent_cfg['dburl']               = str(database_url)
    agent_cfg['session_id']          = sid
    agent_cfg['pilot_id']            = pid
    agent_cfg['logdir']              = '.'
    agent_cfg['pilot_sandbox']       = pilot_sandbox
    agent_cfg['session_sandbox']     = session_sandbox
    agent_cfg['resource_sandbox']    = resource_sandbox
    agent_cfg['agent_launch_method'] = agent_launch_method
    agent_cfg['task_launch_method']  = task_launch_method
    agent_cfg['mpi_launch_method']   = mpi_launch_method
    agent_cfg['cores_per_node']      = cores_per_node
    agent_cfg['cu_tmp']              = cu_tmp
    agent_cfg['export_to_cu']        = export_to_cu
    agent_cfg['cu_pre_exec']         = cu_pre_exec
    agent_cfg['cu_post_exec']        = cu_post_exec
    agent_cfg['resource_cfg']        = copy.deepcopy(rcfg)
    agent_cfg['debug']               = self._log.getEffectiveLevel()

    # we'll also push the agent config into MongoDB
    pilot['cfg'] = agent_cfg

    # ------------------------------------------------------------------
    # Write agent config dict to a json file in pilot sandbox.
    agent_cfg_name = 'agent_0.cfg'
    cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
    os.close(cfg_tmp_handle)  # file exists now

    # Convert dict to json file
    self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
    self._log.debug(pprint.pformat(agent_cfg))
    ru.write_json(agent_cfg, cfg_tmp_file)

    ret['ft'].append({'src': cfg_tmp_file,
                      'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
                      'rem': True})  # purge the tmp file after packing

    # ----------------------------------------------------------------------
    # we also touch the log and profile tarballs in the target pilot sandbox
    ret['ft'].append({'src': '/dev/null',
                      'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
                      'rem': False})  # don't remove /dev/null

    # only stage profiles if we profile
    if self._prof.enabled:
        ret['ft'].append({'src': '/dev/null',
                          'tgt': '%s/%s' % (pilot_sandbox,
                                            '%s.prof.tgz' % pid),
                          'rem': False})  # don't remove /dev/null

    # FIX: define cc_name unconditionally -- it used to be defined only in
    # the sandbox-cache branch below, which raised a NameError in the
    # non-shared-FS staging section when the sandbox was already cached.
    cc_name = 'cacert.pem.gz'

    # check if we have a sandbox cached for that resource.  If so, we have
    # nothing to do.  Otherwise we create the sandbox and stage the RP
    # stack etc.
    # NOTE: this will race when multiple pilot launcher instances are used!
    with self._cache_lock:

        if resource not in self._sandboxes:

            for sdist in sdist_paths:
                base = os.path.basename(sdist)
                ret['ft'].append({'src': sdist,
                                  'tgt': '%s/%s' % (session_sandbox, base),
                                  'rem': False})

            # Copy the bootstrap shell script.
            bootstrapper_path = os.path.abspath("%s/agent/%s"
                              % (self._root_dir, BOOTSTRAPPER_0))
            self._log.debug("use bootstrapper %s", bootstrapper_path)
            ret['ft'].append({'src': bootstrapper_path,
                              'tgt': '%s/%s' % (session_sandbox,
                                                BOOTSTRAPPER_0),
                              'rem': False})

            # Some machines cannot run pip due to outdated CA certs.
            # For those, we also stage an updated certificate bundle
            # TODO: use booleans all the way?
            if stage_cacerts:
                cc_path = os.path.abspath("%s/agent/%s"
                        % (self._root_dir, cc_name))
                self._log.debug("use CAs %s", cc_path)
                ret['ft'].append({'src': cc_path,
                                  'tgt': '%s/%s' % (session_sandbox, cc_name),
                                  'rem': False})

            self._sandboxes[resource] = True

    # ------------------------------------------------------------------
    # Create SAGA Job description and submit the pilot job
    jd = rs.job.Description()

    if shared_filesystem:
        bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
    else:
        bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

    jd.name                  = pid
    jd.executable            = "/bin/bash"
    jd.arguments             = ['-l %s' % bootstrap_tgt, bootstrap_args]
    jd.working_directory     = pilot_sandbox
    jd.project               = project
    jd.output                = "bootstrap_1.out"
    jd.error                 = "bootstrap_1.err"
    jd.total_cpu_count       = number_cores
    jd.processes_per_host    = cores_per_node
    jd.spmd_variation        = spmd_variation
    jd.wall_time_limit       = runtime
    jd.total_physical_memory = memory
    jd.queue                 = queue
    jd.candidate_hosts       = candidate_hosts
    jd.environment           = dict()

    if 'RADICAL_PILOT_PROFILE' in os.environ:
        jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

    # for condor backends and the like which do not have shared FSs, we add
    # additional staging directives so that the backend system binds the
    # files from the session and pilot sandboxes to the pilot job.
    jd.file_transfer = list()
    if not shared_filesystem:
        jd.file_transfer.extend([
            'site:%s/%s > %s' % (session_sandbox, BOOTSTRAPPER_0,
                                 BOOTSTRAPPER_0),
            'site:%s/%s > %s' % (pilot_sandbox, agent_cfg_name,
                                 agent_cfg_name),
            'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
            'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
        ])

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.file_transfer.extend([
                'site:%s/%s.prof.tgz > %s.prof.tgz' % (pilot_sandbox, pid,
                                                       pid),
                'site:%s/%s.prof.tgz < %s.prof.tgz' % (pilot_sandbox, pid,
                                                       pid)
            ])

        for sdist in sdist_names:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, sdist, sdist)
            ])

        if stage_cacerts:
            jd.file_transfer.extend([
                'site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)
            ])

    self._log.debug("Bootstrap command line: %s %s",
                    jd.executable, jd.arguments)

    ret['jd'] = jd
    return ret
def test_read_json():
    '''
    Test json parser
    '''

    # --------------------------------------------------------------------------
    # default case: round-trip a small dict through a file
    data      = {'test_1': 1,
                 'test_2': 'one',
                 'test_3': [1, 'one']}
    filename  = _write_json(json.dumps(data))
    data_copy = ru.read_json(filename)

    assert(data_copy)

    # compare key sets and values in both directions
    for key, val in data.items():
        assert(key in data_copy)
        assert(val == data_copy[key])

    for key in data_copy:
        assert(key in data)
        assert(data[key] == data_copy[key])

    # ---------------------------------------------------------------------------
    # string read: values come back as `str`
    data_copy = ru.read_json_str(filename)
    assert(isinstance(data_copy['test_2'], str))

    # ---------------------------------------------------------------------------
    # arg switching: write_json accepts (data, fname) in either order
    ru.write_json(filename, data_copy)
    ru.write_json(data_copy, filename)

    data_copy = ru.read_json_str(filename)
    assert(len(data_copy) == 3)

    os.unlink(filename)

    # --------------------------------------------------------------------------
    # manual parse
    data = '''{
        "test_1": 1,
        "test_2": "one",
        "test_3": [1, "one"]
    }'''
    data_copy = ru.parse_json(data, filter_comments=False)

    assert(len(data_copy) == 3)
    assert(data_copy['test_2'] == 'one')

    # --------------------------------------------------------------------------
    # forced str conversion on manual parse
    data_copy = ru.parse_json_str(data)
    assert(len(data_copy) == 3)
    assert(isinstance(data_copy['test_2'], str))

    # ---------------------------------------------------------------------------
    # faulty json file raises
    filename = _write_raw(b'{"foo": [False]}')
    with pytest.raises(ValueError):
        ru.read_json(filename)
def submit_units(self, unit_descriptions):
    """Submits on or more :class:`radical.pilot.ComputeUnit` instances to
    the unit manager.

    **Arguments:**
        * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
          or list of :class:`radical.pilot.ComputeUnitDescription`]: The
          description of the compute unit instance(s) to create.

    **Returns:**
        * A list of :class:`radical.pilot.ComputeUnit` objects (or a single
          object if a single description was passed in).

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """

    if not self._valid:
        raise RuntimeError("instance is already closed")

    # remember input shape so the return value mirrors it
    single = not isinstance(unit_descriptions, list)
    if single:
        unit_descriptions = [unit_descriptions]

    if not unit_descriptions:
        raise ValueError('cannot submit no unit descriptions')

    logger.report.info('<<submit %d unit(s)\n\t' % len(unit_descriptions))

    # the scheduler will return a dictionary of the form:
    #   {
    #     ud_1: pilot_id_a,
    #     ud_2: pilot_id_b,
    #     ...
    #   }
    # The scheduler may not be able to schedule some units -- those will
    # have 'None' as pilot ID.
    units = list()
    for ud in unit_descriptions:

        cu = ComputeUnit.create(unit_description=ud,
                                unit_manager_obj=self,
                                local_state=SCHEDULING)
        units.append(cu)

        # record the description if session recording is enabled
        if self._session._rec:
            import radical.utils as ru
            ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json"
                         % (self._session._rec, cu.uid, self._rec_id))

        logger.report.progress()

    if self._session._rec:
        self._rec_id += 1

    self._worker.publish_compute_units(units=units)

    try:
        schedule = self._scheduler.schedule(units=units)
    except Exception:
        logger.exception("Internal error - unit scheduler failed")
        raise

    self.handle_schedule(schedule)

    logger.report.ok('>>ok\n')

    if single:
        return units[0]
    return units
def write_session_description(amgr):
    """
    Dump a radical.analytics session description for the given appmanager
    to '<sid>/radical.entk.<sid>.json': state/event models for all entity
    types, plus the entity tree (appmanager -> wfp/rmgr/tmgr, and
    appmanager -> pipeline -> stage -> task).
    """

    # ------------------------------------------------------------------
    # helper: one entity description entry
    def _entity(model, values):
        return {'state_model' : model,
                'state_values': values,
                'event_model' : dict()}

    # helper: one tree node
    def _node(uid, etype, has):
        return {'uid'     : uid,
                'etype'   : etype,
                'cfg'     : {},
                'has'     : has,
                'children': list()}
    # ------------------------------------------------------------------

    desc = dict()
    desc['entities'] = {
        'pipeline'  : _entity(res._pipeline_state_values,
                              res._pipeline_state_inv),
        'stage'     : _entity(res._stage_state_values,
                              res._stage_state_inv),
        'task'      : _entity(res._task_state_values,
                              res._task_state_inv),
        'appmanager': _entity(None, None),
    }

    # Adding amgr to the tree
    tree = dict()
    tree[amgr._uid] = _node(amgr._uid, 'appmanager',
                            ['pipeline', 'wfprocessor',
                             'resource_manager', 'task_manager'])

    # Adding wfp, rmgr and tmgr as direct children of the appmanager
    for comp, etype in [(amgr._wfp,          'wfprocessor'),
                        (amgr._rmgr,         'resource_manager'),
                        (amgr._task_manager, 'task_manager')]:
        tree[amgr._uid]['children'].append(comp._uid)
        tree[comp._uid] = _node(comp._uid, etype, [])

    # Adding pipelines (and their stages and tasks) to the tree
    for wf in amgr._workflows:
        for pipe in wf:
            tree[amgr._uid]['children'].append(pipe._uid)
            tree[pipe._uid] = _node(pipe._uid, 'pipeline', ['stage'])

            for stage in pipe.stages:
                tree[pipe._uid]['children'].append(stage._uid)
                tree[stage._uid] = _node(stage._uid, 'stage', ['task'])

                for task in stage.tasks:
                    tree[stage._uid]['children'].append(task._uid)
                    tree[task._uid] = _node(task._uid, 'task', [])

    desc['tree']   = tree
    desc['config'] = dict()

    ru.write_json(desc, '%s/radical.entk.%s.json' % (amgr.sid, amgr.sid))
def get_session_description(sid, src=None, dburl=None):
    """
    This will return a description which is usable for radical.analytics
    evaluation.  It informs about
      - set of stateful entities
      - state models of those entities
      - event models of those entities (maybe)
      - configuration of the application / module

    If `src` is given, it is interpreted as path to search for session
    information (json dump).  `src` defaults to `$PWD/$sid`.

    If `dburl` is given, its value is used to fetch session information
    from a database.  The dburl value defaults to `RADICAL_PILOT_DBURL`.
    """

    from radical.pilot import states as rps
    from .session import fetch_json

    if not src:
        src = "%s/%s" % (os.getcwd(), sid)

    ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
    # NOTE: renamed from `json` -- the old name shadowed the stdlib module
    json_data = ru.read_json(ftmp)

    # make sure we have uids (documents may use legacy key names)
    def fix_json(data):
        def fix_uids(data):
            if isinstance(data, list):
                for elem in data:
                    fix_uids(elem)
            elif isinstance(data, dict):
                if 'unitmanager' in data and 'umgr' not in data:
                    data['umgr'] = data['unitmanager']
                if 'pilotmanager' in data and 'pmgr' not in data:
                    data['pmgr'] = data['pilotmanager']
                if '_id' in data and 'uid' not in data:
                    data['uid'] = data['_id']
                if 'cfg' not in data:
                    data['cfg'] = dict()
                # NOTE: py2 idiom kept -- this file predates py3
                for v in data.itervalues():
                    fix_uids(v)
        fix_uids(data)

    fix_json(json_data)

    # (removed: debug dump to /tmp/t.json and pprint of the tree)

    assert(sid == json_data['session']['uid'])

    ret = dict()
    ret['entities'] = dict()

    # build the entity tree: session -> pmgr/umgr -> pilot -> unit
    tree = dict()
    tree[sid] = {'uid'     : sid,
                 'etype'   : 'session',
               # 'cfg'     : json_data['session']['cfg'],
                 'has'     : ['umgr', 'pmgr'],
                 'children': list()
                }

    for pmgr in sorted(json_data['pmgr'], key=lambda k: k['uid']):
        uid = pmgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'     : uid,
                     'etype'   : 'pmgr',
                   # 'cfg'     : pmgr['cfg'],
                     'has'     : ['pilot'],
                     'children': list()
                    }

    for umgr in sorted(json_data['umgr'], key=lambda k: k['uid']):
        uid = umgr['uid']
        tree[sid]['children'].append(uid)
        tree[uid] = {'uid'     : uid,
                     'etype'   : 'umgr',
                   # 'cfg'     : umgr['cfg'],
                     'has'     : ['unit'],
                     'children': list()
                    }

    for pilot in sorted(json_data['pilot'], key=lambda k: k['uid']):
        uid  = pilot['uid']
        pmgr = pilot['pmgr']
        tree[pmgr]['children'].append(uid)
        tree[uid] = {'uid'     : uid,
                     'etype'   : 'pilot',
                   # 'cfg'     : pilot['cfg'],
                     'has'     : ['unit'],
                     'children': list()
                    }

    for unit in sorted(json_data['unit'], key=lambda k: k['uid']):
        # NOTE: local names fixed -- the original bound the umgr id to
        #       `pid` and the pilot id to `umgr`.  Behavior is unchanged:
        #       each unit is listed under both its umgr and its pilot.
        uid  = unit['uid']
        umgr = unit['umgr']
        pid  = unit['pilot']
        tree[umgr]['children'].append(uid)
        tree[pid ]['children'].append(uid)
        tree[uid] = {'uid'     : uid,
                     'etype'   : 'unit',
                   # 'cfg'     : unit['description'],
                     'has'     : list(),
                     'children': list()
                    }

    ret['tree'] = tree

    ret['entities']['pilot'] = {
            'state_model' : rps.pilot_state_by_value,
            'state_values': rps.pilot_state_value,
            'event_model' : dict(),
    }
    ret['entities']['unit'] = {
            'state_model' : rps.unit_state_by_value,
            'state_values': rps.unit_state_value,
            'event_model' : dict(),
    }
    ret['entities']['session'] = {
            'state_model' : None,  # session has no states, only events
            'state_values': None,
            'event_model' : dict(),
    }

    ret['config'] = dict()  # magic to get session config goes here

    return ret
class Session(saga.Session):
    """A Session encapsulates a RADICAL-Pilot instance and is the *root*
    object for all other RADICAL-Pilot objects.

    A Session holds :class:`radical.pilot.PilotManager` and
    :class:`radical.pilot.UnitManager` instances which in turn hold
    :class:`radical.pilot.Pilot` and :class:`radical.pilot.ComputeUnit`
    instances.

    Each Session has a unique identifier :data:`radical.pilot.Session.uid`
    that can be used to re-connect to a RADICAL-Pilot instance in the
    database.

    **Example**::

        s1 = radical.pilot.Session(database_url=DBURL)
        s2 = radical.pilot.Session(database_url=DBURL, uid=s1.uid)

        # s1 and s2 are pointing to the same session
        assert s1.uid == s2.uid
    """

    # NOTE(review): this file uses Python-2 syntax (`except Exception, ex`
    # below) -- it predates py3 support.

    #---------------------------------------------------------------------------
    #
    def __init__(self, database_url=None, database_name=None, name=None):
        """Creates a new session.

        If called without a uid, a new Session instance is created and
        stored in the database.  If uid is set, an existing session is
        retrieved from the database.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is
              given, RP uses the environment variable RADICAL_PILOT_DBURL.
              If that is not set, an error will be raises.
            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').  Deprecated -- encode the name in
              the URL path instead.
            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`
        """

        logger = ru.get_logger('radical.pilot')

        if database_name:
            logger.error("The 'database_name' parameter is deprecated - please specify an URL path")
        else:
            database_name = 'radicalpilot'

        # init the base class inits
        saga.Session.__init__(self)
        self._dh        = ru.DebugHelper()
        self._valid     = True
        self._terminate = threading.Event()
        self._terminate.clear()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = dict()
        self._unit_manager_objects  = dict()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        if not database_url:
            database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        if not self._dburl.path         or \
            self._dburl.path[0] != '/'  or \
            len(self._dburl.path) <= 1  :
            logger.error("incomplete URLs are deprecated -- missing database name!")
            self._dburl.path = database_name  # defaults to 'radicalpilot'

        logger.info("using database %s" % self._dburl)

        # ----------------------------------------------------------------------
        # create new session
        try:
            if name:
                # a user-supplied name doubles as the session uid
                self._name = name
                self._uid  = name
              # self._uid = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
            else:
                self._uid  = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            logger.report.info('<<create session %s' % self._uid)

            # creating the db session also connects to MongoDB
            self._dbs = dbSession(sid=self._uid, name=self._name,
                                  dburl=self._dburl)

            # the db layer may have normalized the url -- keep its version
            self._dburl = self._dbs._dburl

            logger.info("New Session created: %s." % str(self))

        except Exception, ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (self._dburl, ex))

        # initialize profiling
        self.prof = Profiler('%s' % self._uid)
        self.prof.prof('start session', uid=self._uid)

        # Loading all "default" resource configurations shipped with the
        # package (configs/resource_*.json next to this module)
        module_path  = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                logger.info("Load resource configurations from %s" % config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # a broken config file is skipped, not fatal
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # user-level configs can extend or overwrite the defaults
        user_cfgs    = "%s/.radical/pilot/configs/resource_*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/resource_aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        self.prof.prof('configs parsed', uid=self._uid)

        # optionally record this session (descriptions, dburl) for replay
        _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if _rec:
            self._rec = "%s/%s" % (_rec, self._uid)
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self._dburl)},
                          "%s/session.json" % self._rec)
            logger.info("recording session in %s" % self._rec)
        else:
            self._rec = None

        logger.report.ok('>>ok\n')
def generate_pipeline(cfg):
    """
    Build a single-stage EnTK pipeline which runs `n_masters` RAPTOR master
    tasks per campaign run found in the run file.  Returns the Pipeline, or
    None if there is nothing to run.
    """

    cfg_file = cfg['run_cfg_file']  # resource and workload config
    run_file = cfg['run_file']      # runs for this campaign

    # setup S1 workload
    # NOTE: the `cfg` parameter is rebound here to the parsed run config
    cfg  = ru.Config(cfg=ru.read_json(cfg_file))
    runs = check_runs(cfg_file, run_file)

    if not runs:
        print('S1: nothing to run, exiting.')
        return

    # for each run in the campaign:
    # - create cfg with requested receptor and smiles
    # - create a number of masters as EnTK tasks and add them to a pipeline
    # - submit configured number of masters with that cfg

    # setup EnTK pipeline
    p      = Pipeline()
    p.name = 'S1-RAPTOR'
    s      = Stage()

    # create cfg
    # NOTE(review): `subs` is populated only inside the disabled skip logic
    # below; it is currently unused.
    subs = dict()
    rurl = cfg.fs_url + cfg.workload.results
    d    = rs.filesystem.Directory(rurl)
    ls   = [str(u).split('/')[-1] for u in d.list()]

    workload = cfg.workload

    for receptor, smiles, nodes, runtime in runs:

        print('%30s %s' % (receptor, smiles))
        name = '%s_-_%s' % (receptor, smiles)
        tgt  = '%s.%s.gz' % (name, workload.output)

        # NOTE: a block of disabled skip/recompute logic lived here -- it
        #       checked `ls`/`subs` for existing result archives (and moved
        #       them to '.bak' when workload.recompute was set) to skip
        #       already-computed runs.  Currently every run is recomputed.

        cpn       = cfg.cpn
        gpn       = cfg.gpn
        n_masters = cfg.n_masters

        # specialize the config for this receptor/smiles pair
        cfg.workload.receptor = receptor
        cfg.workload.smiles   = smiles
        cfg.workload.name     = name
        cfg.nodes             = nodes
        cfg.runtime           = runtime
        # one node is reserved (presumably for the master) -- TODO confirm
        cfg.n_workers         = int(nodes / n_masters - 1)
        print('n_workers: %d' % cfg.n_workers)

        # per-run config consumed by the master tasks (staged as wf0.cfg)
        ru.write_json(cfg, 'configs/wf0.%s.cfg' % name)

        # one EnTK task per master
        for i in range(n_masters):
            t = Task()

            t.pre_exec    = ['. /gpfs/alpine/scratch/mturilli1/med110/radical.pilot.sandbox/s1.to/bin/activate']
            t.executable  = "python3"
            t.arguments   = ['wf0_master.py', i]
            t.cpu_threads = cpn

            t.upload_input_data    = ['wf0_master.py',
                                      'wf0_worker.py',
                                      'configs/wf0.%s.cfg > wf0.cfg' % name,
                                      'read_ligand_dict.py']
            t.link_input_data      = ['%s > input_dir' % workload.input_dir]
            t.download_output_data = ['%s.%s.gz > results/%s.%s.gz'
                                     % (name, workload.output,
                                        name, workload.output)]

            # NOTE: an equivalent RP-level staging spec (t.input_staging /
            #       t.output_staging with rp.TRANSFER / rp.LINK directives,
            #       incl. an `impress_md` link) was disabled here in favor
            #       of the EnTK staging attributes above.

            s.add_tasks(t)

    # all masters for all runs share the single stage
    p.add_stages(s)

    return p
def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
    """
    Create a new session.  A new Session instance is created and
    stored in the database.

    **Arguments:**
        * **dburl** (`string`): The MongoDB URL.  If none is given,
          RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
          not set, an error will be raised.

        * **uid** (`string`): Create a session with this UID.
          *Only use this when you know what you are doing!*

    **Returns:**
        * A new Session instance.

    **Raises:**
        * :class:`radical.pilot.DatabaseError`
    """

    if os.uname()[0] == 'Darwin':
        # on MacOS, we are running out of file descriptors soon.  The code
        # below attempts to increase the limit of open files - but any error
        # is silently ignored, so this is a best-effort, no guarantee.  We
        # leave responsibility for system limits with the user.
        try:
            import resource
            limits    = list(resource.getrlimit(resource.RLIMIT_NOFILE))
            limits[0] = 512
            resource.setrlimit(resource.RLIMIT_NOFILE, limits)
        except:
            pass

    self._dh          = ru.DebugHelper()
    self._valid       = True
    self._closed      = False
    self._valid_iter  = 0      # detect recursive calls of `is_valid()`

    # class state
    self._dbs         = None
    self._uid         = None
    self._dburl       = None
    self._reconnected = False

    self._cache       = dict()  # cache sandboxes etc.
    self._cache_lock  = threading.RLock()

    self._cache['resource_sandbox'] = dict()
    self._cache['session_sandbox']  = dict()
    self._cache['pilot_sandbox']    = dict()

    # before doing anything else, set up the debug helper for the lifetime
    # of the session.
    # NOTE(review): this duplicates `self._dh` above -- presumably one of
    #               the two DebugHelper instances is redundant; confirm
    #               before removing either.
    self._debug_helper = ru.DebugHelper()

    # Dictionaries holding all manager objects created during the session.
    # NOTE: should this also include agents?
    self._pmgrs      = dict()
    self._umgrs      = dict()
    self._bridges    = list()
    self._components = list()

    # FIXME: we work around some garbage collection issues we don't yet
    #        understand: instead of relying on the GC to eventually collect
    #        some stuff, we actively free those on `session.close()`, at
    #        least for the current process.  Usually, all resources get
    #        nicely collected on process termination - but not when we
    #        create many sessions (one after the other) in the same
    #        application instance (ie. the same process).  This workaround
    #        takes care of that use case.
    #        The clean solution would be to ensure clean termination
    #        sequence, something which I seem to be unable to implement...
    #        :/
    self._to_close   = list()
    self._to_stop    = list()
    self._to_destroy = list()

    # cache the client sandbox
    # FIXME: this needs to be overwritten if configured differently in the
    #        session config, as should be the case for any agent side
    #        session instance.
    self._client_sandbox = os.getcwd()

    # The resource configuration dictionary associated with the session.
    self._resource_configs = {}

    # if a config is given, use its values:
    if cfg:
        self._cfg = copy.deepcopy(cfg)
    else:
        # otherwise we need a config -- fall back to the named (or default)
        # session config shipped with the module
        self._cfg = ru.read_json("%s/configs/session_%s.json"
                   % (os.path.dirname(__file__),
                      os.environ.get('RADICAL_PILOT_SESSION_CFG', 'default')))

    # fall back to config data where possible
    # sanity check on parameters
    if not uid:
        uid = self._cfg.get('session_id')

    if uid:
        # a given (or configured) uid means we reconnect to that session
        self._uid         = uid
        self._reconnected = True
    else:
        # generate new uid, reset all other ID counters
        # FIXME: this will screw up counters for *concurrent* sessions,
        #        as the ID generation is managed in a process singleton.
        self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
        ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

    # make sure the config carries session id, owner and logdir
    if not self._cfg.get('session_id'): self._cfg['session_id'] = self._uid
    if not self._cfg.get('owner'):      self._cfg['owner']      = self._uid
    if not self._cfg.get('logdir'):     self._cfg['logdir']     = '%s/%s' \
                                               % (os.getcwd(), self._uid)

    self._logdir = self._cfg['logdir']
    self._prof   = self._get_profiler(name=self._cfg['owner'])
    self._rep    = self._get_reporter(name=self._cfg['owner'])
    self._log    = self._get_logger(name=self._cfg['owner'],
                                    level=self._cfg.get('debug'))

    if _connect:
        # we need a dburl to connect to: parameter, env, then config
        if not dburl: dburl = os.environ.get("RADICAL_PILOT_DBURL")
        if not dburl: dburl = self._cfg.get('default_dburl')
        if not dburl: dburl = self._cfg.get('dburl')
        if not dburl:
            # we forgive missing dburl on reconnect, but not otherwise
            raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

    # now we have config and uid - initialize base class (saga session)
    rs.Session.__init__(self, uid=self._uid)

    # ------------------------------------------------------------------
    # create new session
    if _connect:
        self._log.info("using database %s" % self._dburl)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        if not self._dburl.path       or \
           self._dburl.path[0] != '/' or \
           len(self._dburl.path) <= 1:
            if not uid:
                # we fake reconnnect if no DB is available -- but otherwise
                # we really really need a db connection...
                raise ValueError("incomplete DBURL '%s' no db name!"
                                % self._dburl)

    if not self._reconnected:
        # report the freshly created session to the user
        self._prof.prof('session_start', uid=self._uid)
        self._rep.info ('<<new session: ')
        self._rep.plain('[%s]' % self._uid)
        self._rep.info ('<<database   : ')
        self._rep.plain('[%s]' % self._dburl)

    self._load_resource_configs()

    # optional session recording, enabled via environment variable
    self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
    if self._rec:
        # NOTE: Session recording cannot handle reconnected sessions, yet.
        #       We thus turn it off here with a warning
        if self._reconnected:
            self._log.warn("no session recording on reconnected session")
        else:
            # append session ID to recording path
            self._rec = "%s/%s" % (self._rec, self._uid)

            # create recording path and record session
            os.system('mkdir -p %s' % self._rec)
            ru.write_json({'dburl': str(self.dburl)},
                          "%s/session.json" % self._rec)
            self._log.info("recording session in %s" % self._rec)

    # create/connect database handle
    try:
        self._dbs = DBSession(sid=self.uid, dburl=str(self._dburl),
                              cfg=self._cfg, logger=self._log,
                              connect=_connect)

        # from here on we should be able to close the session again
        self._log.info("New Session created: %s." % self.uid)

    # NOTE(review): Python 2 'except' syntax below -- this chunk predates
    #               the py3 port
    except Exception, ex:
        self._rep.error(">>err\n")
        self._log.exception('session create failed')
        raise RuntimeError("Couldn't create new session (database URL '%s' incorrect?): %s" \
                        % (dburl, ex))
def submit_units(self, unit_descriptions):
    """Submit one or more :class:`radical.pilot.ComputeUnit` instances to
    the unit manager.

    **Arguments:**
        * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
          or list of :class:`radical.pilot.ComputeUnitDescription`]: The
          description of the compute unit instance(s) to create.

    **Returns:**
        * A list of :class:`radical.pilot.ComputeUnit` objects, or a single
          object if a single description was passed in.

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """
    if not self._valid:
        raise RuntimeError("instance is already closed")

    # accept a single description as well as a list of descriptions, and
    # remember which form we got, so that we can return the matching form
    return_list_type = True
    if not isinstance(unit_descriptions, list):
        return_list_type  = False
        unit_descriptions = [unit_descriptions]

    if not unit_descriptions:
        raise ValueError('cannot submit no unit descriptions')

    # sanity check: 'cores' must be a positive integer value
    for ud in unit_descriptions:
        if float(ud.cores) != int(ud.cores):
            raise BadParameter("ComputeUnitDescription 'cores' must be integer.")
        if int(ud.cores) <= 0:
            raise BadParameter("ComputeUnitDescription 'cores' must be positive.")

    logger.report.info('<<submit %d unit(s)\n\t' % len(unit_descriptions))

    # create the units; if session recording is enabled, dump each
    # description to the recording directory as well
    units = list()
    for ud in unit_descriptions:

        u = ComputeUnit.create(unit_description=ud,
                               unit_manager_obj=self,
                               local_state=SCHEDULING)
        units.append(u)

        if self._session._rec:
            import radical.utils as ru
            ru.write_json(ud.as_dict(), "%s/%s.batch.%03d.json"
                          % (self._session._rec, u.uid, self._rec_id))
        logger.report.progress()

    if self._session._rec:
        self._rec_id += 1

    # announce the new units, then hand them to the scheduler.  The
    # scheduler returns a mapping {ud: pilot_id}; units it cannot place
    # get 'None' as pilot ID.
    self._worker.publish_compute_units(units=units)

    schedule = None
    try:
        schedule = self._scheduler.schedule(units=units)
    except Exception:
        logger.exception("Internal error - unit scheduler failed")
        raise

    self.handle_schedule(schedule)
    logger.report.ok('>>ok\n')

    if return_list_type:
        return units
    return units[0]
def generate_pipeline(cfg): cfg_file = cfg['run_cfg_file'] # resource and workload config run_file = cfg['run_file'] # runs for this campaign # setup S1 workload cfg = ru.Config(cfg=ru.read_json(cfg_file)) runs = check_runs(cfg_file, run_file) if not runs: print('S1: nothing to run, exiting.') return # for each run in the campaign: # - create cfg with requested receptor and smiles # - create a number of masters as EnTK tasks and add them to a pipeline # - submit configured number of masters with that cfg # setup EnTK pipeline p = Pipeline() p.name = 'S1.RAPTOR' s = Stage() # create cfg subs = dict() rurl = cfg.fs_url + cfg.workload.results d = rs.filesystem.Directory(rurl) ls = [str(u).split('/')[-1] for u in d.list()] workload = cfg.workload for receptor, smiles, n_workers, runtime in runs: print('%30s %s' % (receptor, smiles)) name = '%s_-_%s' % (receptor, smiles) tgt = '%s.%s.gz' % (name, workload.output) cpw = cfg.cpw gpw = cfg.gpw n_masters = cfg.n_masters cfg.workload.receptor = receptor cfg.workload.smiles = smiles cfg.workload.name = name cfg.runtime = runtime cfg.n_workers = n_workers print('n_workers: %d' % cfg.n_workers) ru.write_json(cfg, 'configs/wf0.%s.cfg' % name) for i in range(n_masters): t = Task() t.pre_exec = [ '. /gpfs/alpine/scratch/mturilli1/med110/radical.pilot.sandbox/s1.to/bin/activate' ] t.executable = "python3" t.arguments = ['wf0_master.py', i] t.cpu_reqs = { 'processes': 1, 'threads_per_process': 4, 'thread_type': None, 'process_type': None } t.upload_input_data = [ 'wf0_master.py', 'wf0_worker.py', 'configs/wf0.%s.cfg > wf0.cfg' % name, 'read_ligand_dict.py' ] t.link_input_data = ['%s > input_dir' % workload.input_dir] #t.download_output_data = ['%s.%s.gz > results/%s.%s.gz' % # (name, workload.output, name, workload.output)] s.add_tasks(t) p.add_stages(s) return p
if rec: print('recompute %d %s' % (rec, name)) else: print('compute 2 %s' % name) cpn = cfg.cpn gpn = cfg.gpn n_masters = cfg.n_masters cfg.workload.receptor = receptor cfg.workload.smiles = smiles cfg.workload.name = name cfg.nodes = nodes cfg.runtime = runtime cfg.n_workers = int(nodes / n_masters - 1) print('n_workers: %d' % cfg.n_workers) ru.write_json(cfg, 'configs/wf0.%s.cfg' % name) pd = rp.ComputePilotDescription(cfg.pilot_descr) pd.cores = nodes * cpn pd.gpus = nodes * gpn pd.runtime = runtime pilot = pmgr.submit_pilots(pd) pid = pilot.uid umgr.add_pilots(pilot) tds = list() for i in range(n_masters): td = rp.ComputeUnitDescription(cfg.master_descr)
def store_profile(profile, tags=None, url=None, mode=None): if not url: url = os.environ.get('RADICAL_SYNAPSE_DBURL') if not url: # print "warning: need dburl to store profiles" return None if not mode: raise ValueError("document needs mode (emulated | eecuted | profiled)") url = ru.Url(url) if not tags: tags = dict() elems = filter(None, os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(',')) for elem in elems: if ':' in elem: key, val = elem.split(':', 1) tags[key] = val else: tags[elem] = None command_idx = index_command(profile['cmd'], tags) print "index %s (%s) to %s" % (profile['cmd'], tags, command_idx) host = profile['sys'].get('hostname') if not host: host = os.environ.get('RADICAL_SYNAPSE_HOSTNAME', socket.gethostname()) profile['sys']['hostname'] = host doc = { 'type': 'synapse_profile', 'mode': mode, 'command_idx': command_idx, 'command': profile['cmd'], 'tags': tags, 'profile': profile } if url.schema == 'mongodb': print 'store profile in db %s' % url [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url) db_client = pymongo.MongoClient(host=dbhost, port=port) database = db_client[dbname] collection = database['profiles'] collection.insert(doc) elif url.schema == 'file': path = url.path if not os.path.isdir(path): os.system('mkdir -p "%s"' % path) name = command_idx.split()[0] # for key, val in tags.iteritems(): # if val != None: name += "_%s:%s" % (key, val) # else : name += "_%s" % (key) for tag in sorted(tags.keys()): if tags[tag] != None: name += "_%s" % tags[tag] else: name += "_%s" % tag idx = 0 while True: fname = "%s/synapse_profile_%s_%s_%s_%03d.json" % ( path, name, host, mode[0:3], idx) if not os.path.exists(fname): break idx += 1 print 'store profile in file %s' % fname os.system('mkdir -p "%s/"' % path) ru.write_json(doc, fname)