def _load_resource_configs(self):
    """Populate ``self._resource_configs`` and ``self._resource_aliases``.

    Default configurations are read from ``configs/resource_*.json`` next
    to this module; user configurations from
    ``$HOME/.radical/pilot/configs/resource_*.json`` are then layered on
    top (existing entries are merged with policy ``overwrite``, unknown
    entries are added as is).  The alias table is handled the same way:
    defaults first, then an optional user alias file merged over them.

    The whole operation is bracketed by ``config_parser_start`` /
    ``config_parser_stop`` profiler events.

    Raises:
        RuntimeError: if any resource config file fails to load.
    """
    self.is_valid()

    self._prof.prof('config_parser_start', uid=self._uid)

    # --- shipped ("default") resource configurations ---------------------
    module_path = os.path.dirname(os.path.abspath(__file__))

    for cfg_path in glob.glob("%s/configs/resource_*.json" % module_path):

        try:
            self._log.info("Load resource configurations from %s" % cfg_path)
            loaded = ResourceConfig.from_file(cfg_path)
        except Exception as e:
            self._log.exception("skip config file %s: %s" % (cfg_path, e))
            raise RuntimeError('config error (%s) - abort' % e)
        else:
            for label in loaded:
                self._log.info("Load resource configurations for %s" % label)
                self._resource_configs[label] = loaded[label].as_dict()

    # --- per-user resource configurations --------------------------------
    home = os.environ.get('HOME', '')

    for cfg_path in glob.glob("%s/.radical/pilot/configs/resource_*.json" % home):

        try:
            loaded = ResourceConfig.from_file(cfg_path)
        except Exception as e:
            self._log.exception("skip config file %s: %s" % (cfg_path, e))
            raise RuntimeError('config error (%s) - abort' % e)
        else:
            for label in loaded:
                self._log.info("Load resource configurations for %s" % label)
                user_entry = loaded[label].as_dict()

                if label not in self._resource_configs:
                    # new config -- add as is
                    self._resource_configs[label] = user_entry
                else:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[label],
                                  user_entry,
                                  policy='overwrite')

    # --- aliases: defaults, optionally overlaid by the user's aliases ----
    default_aliases = "%s/configs/resource_aliases.json" % module_path
    self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

    usr_aliases = '%s/.radical/pilot/configs/resource_aliases.json' % home
    if os.path.isfile(usr_aliases):
        user_alias_map = ru.read_json_str(usr_aliases).get('aliases', {})
        ru.dict_merge(self._resource_aliases, user_alias_map,
                      policy='overwrite')

    self._prof.prof('config_parser_stop', uid=self._uid)
def load_resource_config_file_json(self, config_file):
    """Load all usable resource entries from one JSON config file.

    An entry is kept only if it passes these checks:

    * it contains both a ``bundle_agent`` and an ``ssh`` key;
    * if ``bundle_agent`` is ``True``: it has a ``cluster_type`` that is
      listed in ``BundleAgent.supported_types``;
    * otherwise: its ``ssh`` section has ``login_server`` and ``username``.

    Entries failing a check are silently dropped.

    Args:
        config_file: path of the JSON file to read.

    Returns:
        dict: resource name -> config dict for every accepted entry.

    Raises:
        BadParameter: if a ``ValueError`` is raised while reading or
            filtering the file.
    """
    rcfgs = {}

    try:
        rcf_dict = ru.read_json_str(config_file)

        for res_name, cfg in rcf_dict.items():

            # both top-level sections are mandatory
            if "bundle_agent" not in cfg or "ssh" not in cfg:
                continue

            # NOTE(review): the original indentation was lost; the ssh
            # check is assumed to be the alternative to the bundle_agent
            # branch (outer `else`) -- confirm against upstream.
            if cfg["bundle_agent"] is True:
                if "cluster_type" not in cfg:
                    continue
                if cfg["cluster_type"] not in BundleAgent.supported_types:
                    continue
            else:
                ssh_cfg = cfg["ssh"]
                if "login_server" not in ssh_cfg or "username" not in ssh_cfg:
                    continue

            rcfgs[res_name] = cfg

    except ValueError as err:
        # the message previously referenced the undefined name `filename`,
        # which turned every parse error into a NameError
        raise BadParameter("Couldn't parse resource configuration file '%s': %s." % (config_file, str(err)))

    return rcfgs
def cfg(configs):
    """Return the ``rs.tests`` section of the config file ``./configs/<configs>``.

    Raises a plain ``Exception`` if the file does not exist, and asserts
    that the parsed document contains an ``rs.tests`` key.
    """
    path = './configs/%s' % configs

    if os.path.exists(path):
        document = ru.read_json_str(path)
        assert ('rs.tests' in document)
        return document['rs.tests']

    raise Exception('no such config [%s]' % path)
def run_bw_client(self, src, dst, port):
    """Run ``run-iperf-client.sh`` on endpoint ``src`` and read its result.

    The script is staged to ``/tmp/aimes.bundle/iperf/`` on the endpoint's
    login server, executed there via ``sh`` with ``src``, ``dst`` and
    ``port`` as arguments, and the file ``<src>-<dst>.json`` is copied
    back into the current working directory and parsed.

    Returns:
        tuple: ``(sent, received)`` ``bits_per_second`` values taken from
        the result's ``end`` section, or ``(None, None)`` if the result
        file cannot be read or lacks those fields.
    """
    endpoint = self._endpoints[src]
    login_server = endpoint["ssh"]["login_server"]

    # tmp solution
    job_endpoint = "ssh://" + login_server
    # TODO add session id
    # TODO reuse session
    remote_dir = "sftp://" + login_server + "/tmp/aimes.bundle/iperf/"

    # ssh context carrying the endpoint's user name, in a private session
    ssh_ctx = saga.Context("ssh")
    ssh_ctx.user_id = endpoint["ssh"]["username"]

    session = saga.Session(default=False)
    session.add_context(ssh_ctx)

    with endpoint["lock"]:
        # stage the client script (creates the remote directory if needed)
        workdir = saga.filesystem.Directory(remote_dir,
                                            saga.filesystem.CREATE_PARENTS,
                                            session=session)
        script_url = 'file://localhost/%s/run-iperf-client.sh' % os.path.dirname(__file__)
        script = saga.filesystem.File(script_url, session=session)
        script.copy(workdir.get_url())

        # run the script as a remote job and wait for it to finish
        service = saga.job.Service(job_endpoint, session=session)

        description = saga.job.Description()
        description.working_directory = workdir.get_url().path
        description.executable = 'sh'
        description.arguments = ["run-iperf-client.sh", src, dst, port]

        job = service.create_job(description)
        job.run()
        job.wait()

        print("Job State : %s" % (job.state))
        print("Exitcode : %s" % (job.exit_code))

        # fetch the result file into the local working directory
        result_file = "{}-{}.json".format(src, dst)
        workdir.copy(result_file, 'file://localhost/%s/' % os.getcwd())

    try:
        summary = ru.read_json_str(result_file)
        sent = summary["end"]["sum_sent"]["bits_per_second"]
        received = summary["end"]["sum_received"]["bits_per_second"]
        return sent, received
    except Exception:
        logging.exception("Parsing iperf result failed.")
        return None, None
def __init__(self, agent_name):
    """Start up a sub-agent (anything but ``agent_0``).

    Reads ``<cwd>/<agent_name>.cfg``, creates a session from a copy of
    that config (with ``owner`` set and ``components`` emptied, and with
    ``_connect=False``), merges the session's config back into the
    agent's config, and finally initializes the ``rpu.Worker`` base.

    Raises:
        AssertionError: if ``agent_name`` is ``'agent_0'``.
        RuntimeError: if the session reports being connected.
    """
    assert(agent_name != 'agent_0'), 'expect subagent, not agent_0'
    print("startup agent %s" % agent_name)

    # load config, create session and controller, init rpu.Worker
    cfg = ru.read_json_str("%s/%s.cfg" % (os.getcwd(), agent_name))

    self._uid = agent_name
    self._pid = cfg['pilot_id']
    self._sid = cfg['session_id']
    self._final_cause = None

    # The session gets its own copy of the config: it is expected to set
    # up the communication channels described there, but must not start
    # any components -- hence the empty 'components' entry.
    cfg_for_session = copy.deepcopy(cfg)
    cfg_for_session['owner'] = self._uid
    cfg_for_session['components'] = dict()

    session = rp_Session(cfg=cfg_for_session, _connect=False,
                         uid=self._sid)

    # pull bridge addresses (and whatever else the session added) back
    # into our own config
    ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
    pprint.pprint(cfg)

    # sub-agents must stay off the database
    if session.is_connected:
        raise RuntimeError('agent_n should not connect to mongodb')

    # session is up -- hand over to the worker base class
    rpu.Worker.__init__(self, cfg, session)
# it will thus *not* clean out the session's database record (as that is
# used for some statistics post-mortem), but will kill all remaining
# pilots.


#-------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # TODO: the json config should be converted into an mpi_test kernel, once
    # the application kernels become maintainable...

    # test.json is expected to live next to this script
    pwd = os.path.dirname(__file__)
    if not pwd:
        pwd = '.'

    configs = ru.read_json_str('%s/test.json' % pwd)
    targets = sys.argv[1:]
    failed = 0

    if not targets:
        # the usage line carries a %s placeholder for the program name,
        # which was previously printed verbatim
        print("\n\n\tusage: %s <target> [target] ...\n\n" % sys.argv[0])
        sys.exit(-1)

    for target in targets:

        # unknown targets are reported and skipped, not fatal
        if target not in configs:
            print('no config found for %s' % target)
            print('known targets: %s' % ', '.join(configs.keys()))
            continue

        cfg = configs[target]
def test_read_json():
    '''
    Exercise the json helpers: file read, str-converting read, write with
    swapped argument order, manual parsing, and rejection of invalid json.
    '''

    # --------------------------------------------------------------------------
    # default case: what we write is what we read, key for key
    expected = {'test_1': 1,
                'test_2': 'one',
                'test_3': [1, 'one']}

    filename = _write_json(json.dumps(expected))
    loaded = ru.read_json(filename)

    assert(loaded)

    for key in expected:
        assert(key in loaded)
        assert(expected[key] == loaded[key])

    for key in loaded:
        assert(key in expected)
        assert(expected[key] == loaded[key])

    # --------------------------------------------------------------------------
    # string read
    loaded = ru.read_json_str(filename)
    assert(isinstance(loaded['test_2'], str))

    # --------------------------------------------------------------------------
    # arg switching: both argument orders are written, then read back
    ru.write_json(filename, loaded)
    ru.write_json(loaded, filename)
    loaded = ru.read_json_str(filename)
    assert(len(loaded) == 3)

    os.unlink(filename)

    # --------------------------------------------------------------------------
    # manual parse
    raw = '''{ "test_1": 1, "test_2": "one", "test_3": [1, "one"] }'''
    parsed = ru.parse_json(raw, filter_comments=False)
    assert(len(parsed) == 3)
    assert(parsed['test_2'] == 'one')

    # --------------------------------------------------------------------------
    # forced str conversion on manual parse
    parsed = ru.parse_json_str(raw)
    assert(len(parsed) == 3)
    assert(isinstance(parsed['test_2'], str))

    # --------------------------------------------------------------------------
    # faulty json file
    filename = _write_raw(b'{"foo": [False]}')
    with pytest.raises(ValueError):
        ru.read_json(filename)
def __init__ (self, database_url=None, database_name="radicalpilot", uid=None, name=None): """Creates a new or reconnects to an exising session. If called without a uid, a new Session instance is created and stored in the database. If uid is set, an existing session is retrieved from the database. **Arguments:** * **database_url** (`string`): The MongoDB URL. If none is given, RP uses the environment variable RADICAL_PILOT_DBURL. If that is not set, an error will be raises. * **database_name** (`string`): An alternative database name (default: 'radicalpilot'). * **uid** (`string`): If uid is set, we try re-connect to an existing session instead of creating a new one. * **name** (`string`): An optional human readable name. **Returns:** * A new Session instance. **Raises:** * :class:`radical.pilot.DatabaseError` """ # init the base class inits saga.Session.__init__ (self) Object.__init__ (self) # before doing anything else, set up the debug helper for the lifetime # of the session. self._debug_helper = ru.DebugHelper () # Dictionaries holding all manager objects created during the session. self._pilot_manager_objects = list() self._unit_manager_objects = list() # Create a new process registry. All objects belonging to this # session will register their worker processes (if they have any) # in this registry. This makes it easier to shut down things in # a more coordinate fashion. self._process_registry = _ProcessRegistry() # The resource configuration dictionary associated with the session. 
self._resource_configs = {} self._database_url = database_url self._database_name = database_name if not self._database_url : self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None) if not self._database_url : raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)") logger.info("using database url %s" % self._database_url) # if the database url contains a path element, we interpret that as # database name (without the leading slash) tmp_url = ru.Url (self._database_url) if tmp_url.path and \ tmp_url.path[0] == '/' and \ len(tmp_url.path) > 1 : self._database_name = tmp_url.path[1:] logger.info("using database path %s" % self._database_name) else : logger.info("using database name %s" % self._database_name) # Loading all "default" resource configurations module_path = os.path.dirname(os.path.abspath(__file__)) default_cfgs = "%s/configs/*.json" % module_path config_files = glob.glob(default_cfgs) for config_file in config_files: try : rcs = ResourceConfig.from_file(config_file) except Exception as e : logger.error ("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) self._resource_configs[rc] = rcs[rc].as_dict() user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME') config_files = glob.glob(user_cfgs) for config_file in config_files: try : rcs = ResourceConfig.from_file(config_file) except Exception as e : logger.error ("skip config file %s: %s" % (config_file, e)) continue for rc in rcs: logger.info("Loaded resource configurations for %s" % rc) if rc in self._resource_configs : # config exists -- merge user config into it ru.dict_merge (self._resource_configs[rc], rcs[rc].as_dict(), policy='overwrite') else : # new config -- add as is self._resource_configs[rc] = rcs[rc].as_dict() default_aliases = "%s/configs/aliases.json" % module_path self._resource_aliases = ru.read_json_str (default_aliases)['aliases'] ########################## ## CREATE A NEW SESSION 
## ########################## if uid is None: try: self._connected = None if name : self._name = name self._uid = name # self._uid = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM) else : self._uid = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE) self._name = self._uid self._dbs, self._created, self._connection_info = \ dbSession.new(sid = self._uid, name = self._name, db_url = self._database_url, db_name = database_name) logger.info("New Session created%s." % str(self)) except Exception, ex: logger.exception ('session create failed') raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \ % (self._database_url, ex))
""") configs = os.listdir( "{0}/../../src/radical/pilot/configs/".format(script_dir)) for config in configs: if config.endswith(".json") is False: continue # skip all non-python files if config.startswith("aliases") is True: continue # skip alias files print " * %s" % config try: json_data = ru.read_json_str("../../src/radical/pilot/configs/%s" % config) except Exception, ex: print " * JSON PARSING ERROR: %s" % str(ex) continue resources_rst.write("{0}\n".format(config[:-5].upper())) resources_rst.write("{0}\n\n".format("=" * len(config[:-5]))) for host_key, resource_config in json_data.iteritems(): resource_key = "%s.%s" % (config[:-5], host_key) print " * %s" % resource_key try: default_queue = resource_config["default_queue"] except Exception, ex: default_queue = None
""") configs = os.listdir("{0}/../../src/radical/pilot/configs/".format(script_dir)) for config in configs: if config.endswith(".json") is False: continue # skip all non-python files if config.startswith("aliases") is True: continue # skip alias files print " * %s" % config try: json_data = ru.read_json_str("../../src/radical/pilot/configs/%s" % config) except Exception, ex: print " * JSON PARSING ERROR: %s" % str(ex) continue resources_rst.write("{0}\n".format(config[:-5].upper())) resources_rst.write("{0}\n\n".format("="*len(config[:-5]))) for host_key, resource_config in json_data.iteritems(): resource_key = "%s.%s" % (config[:-5], host_key) print " * %s" % resource_key try: default_queue = resource_config["default_queue"] except Exception, ex: default_queue = None
# # { # "scheduler" : "rp.SCHED_BACKFILLING", # "resources" : ["india.furturegrid.org", "sierra.futuregrid.org"], # "resource_cfg" : # { # "*.futuregrid.org" : # { # "username" : "merzky" # } # } # } USER_CONFIG_PATH = os.environ.get('HOME', '/tmp') + '/.my_app.cfg' # load the user config, and merge it with the default config user_config = ru.read_json_str(USER_CONFIG_PATH) # merge the user config into the app config, so that the user config keys are # applied where appropriate ru.dict_merge(app_config, user_config, policy='overwrite', wildcards=True) # lets see what we got pprint.pprint(app_config) # this should result in : # # { # 'log_level' : 0, # 'scheduler' : 'rp.SCHED_BACKFILLING', # 'resources' : ['india.furturegrid.org', 'sierra.futuregrid.org'], # 'resource_cfg':
import glob configs = glob.glob("%s/../../src/radical/pilot/configs/resource_*.json" % script_dir) for config in configs: if not config.endswith(".json"): continue # skip all non-python files if "/resource_aliases" in config: continue # skip alias files print " * %s" % config try: json_data = ru.read_json_str(config) except Exception, ex: print " * JSON PARSING ERROR: %s" % str(ex) continue config = config.split('/')[-1] resources_rst.write("{0}\n".format(config[:-5].upper())) resources_rst.write("{0}\n\n".format("=" * len(config[:-5]))) for host_key, resource_config in json_data.iteritems(): resource_key = "%s.%s" % (config[:-5], host_key) print " * %s" % resource_key try: default_queue = resource_config["default_queue"] except Exception, ex:
def __init__(self, agent_name):
    """Start up the master agent (``agent_0``).

    Reads ``<cwd>/agent_0.cfg``, validates it, creates a session from a
    copy of the config (with ``components`` emptied), merges the session's
    config back, initializes the ``rpu.Worker`` base, and creates the LRMS
    whose ``lrms_info`` is stored in the config.

    If the environment variable ``RADICAL_PILOT_DB_HOSTPORT`` is set
    (``host:port``), it overrides host and port of ``cfg['dburl']``.

    Raises:
        AssertionError: if ``agent_name`` is not ``'agent_0'``.
        ValueError: if a required config key is missing.
        RuntimeError: if the session is not connected.
    """
    assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
    print('startup agent %s' % agent_name)

    # load config, create session, init rpu.Worker
    agent_cfg = '%s/%s.cfg' % (os.getcwd(), agent_name)
    cfg = ru.read_json_str(agent_cfg)

    # Sanity check on config settings.  This has to happen *before* any of
    # the keys are read below: otherwise a missing 'pilot_id',
    # 'session_id' or 'runtime' surfaces as a bare KeyError and the
    # corresponding checks can never fire.
    required = (
        ('cores',              'Missing number of cores'),
        ('lrms',               'Missing LRMS'),
        ('dburl',              'Missing DBURL'),
        ('pilot_id',           'Missing pilot id'),
        ('runtime',            'Missing or zero agent runtime'),
        ('scheduler',          'Missing agent scheduler'),
        ('session_id',         'Missing session id'),
        ('spawner',            'Missing agent spawner'),
        ('task_launch_method', 'Missing unit launch method'),
    )
    for key, message in required:
        if key not in cfg:
            raise ValueError(message)

    cfg['agent_name'] = agent_name

    self._uid = agent_name
    self._pid = cfg['pilot_id']
    self._sid = cfg['session_id']
    self._runtime = cfg['runtime']
    self._starttime = time.time()
    self._final_cause = None
    self._lrms = None

    # this better be on a shared FS!
    cfg['workdir'] = os.getcwd()

    # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
    # the address of the tunnelized DB endpoint. If it exists, we
    # overrule the agent config with it.
    hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
    if hostport:
        dburl = ru.Url(cfg['dburl'])
        dburl.host, dburl.port = hostport.split(':')
        cfg['dburl'] = str(dburl)

    # Create a session.
    #
    # This session will connect to MongoDB, and will also create any
    # communication channels and components/workers specified in the
    # config -- we merge that information into our own config.
    # We don't want the session to start components though, so remove them
    # from the config copy.
    session_cfg = copy.deepcopy(cfg)
    session_cfg['components'] = dict()
    session = rp_Session(cfg=session_cfg, uid=self._sid)

    # we still want the bridge addresses known though, so make sure they are
    # merged into our own copy, along with any other additions done by the
    # session.
    ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
    pprint.pprint(cfg)

    if not session.is_connected:
        raise RuntimeError('agent_0 could not connect to mongodb')

    # at this point the session is up and connected, and it should have
    # brought up all communication bridges and the UpdateWorker.  We are
    # ready to rumble!
    rpu.Worker.__init__(self, cfg, session)

    # this is the earliest point to sync bootstrapper and agent profiles
    self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

    # Create LRMS which will give us the set of agent_nodes to use for
    # sub-agent startup.  Add the remaining LRMS information to the
    # config, for the benefit of the scheduler.
    self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                  session=self._session)

    # add the resource manager information to our own config
    self._cfg['lrms_info'] = self._lrms.lrms_info
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).


#------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # TODO: the json config should be converted into an mpi_test kernel, once
    # the application kernels become maintainable...

    print(__file__)

    # test.json is expected to live next to this script
    pwd = os.path.dirname(__file__)
    if not pwd:
        pwd = '.'

    configs = ru.read_json_str('%s/test.json' % pwd)
    targets = sys.argv[1:]
    failed = 0

    if not targets:
        # the usage line carries a %s placeholder for the program name,
        # which was previously printed verbatim
        print("\n\n\tusage: %s <target> [target] ...\n\n" % sys.argv[0])
        sys.exit(-1)

    for target in targets:

        # unknown targets are reported and skipped, not fatal
        if target not in configs:
            print('no config found for %s' % target)
            print('known targets: %s' % ', '.join(configs.keys()))
            continue
def bootstrap_3():
    """
    This method continues where the bootstrapper left off, but will quickly pass
    control to the Agent class which will spawn the functional components.

    Most of bootstrap_3 applies only to agent_0, in particular all mongodb
    interactions remain excluded for other sub-agent instances.

    The agent interprets a config file, which will specify in an agent_layout
    section:
      - what nodes should be used for sub-agent startup
      - what bridges should be started
      - what components should be started
      - what are the endpoints for bridges which are not started

    bootstrap_3 will create derived config files for all sub-agents.

    The agent master (agent_0) will collect information about the nodes required
    for all instances.  That is added to the config itself, for the benefit of
    the LRMS initialisation which is expected to block those nodes from the
    scheduler.

    The agent name is taken from sys.argv[1]; the config is read from
    '<cwd>/<agent_name>.cfg'.  The module globals `lrms`, `agent` and
    `bridges` are (re)assigned by this function and its exit handler.
    """

    global lrms, agent, bridges

    # find out what agent instance name we have
    if len(sys.argv) != 2:
        raise RuntimeError("invalid number of parameters (%s)" % sys.argv)
    agent_name = sys.argv[1]

    # load the agent config, and overload the config dicts
    agent_cfg = "%s/%s.cfg" % (os.getcwd(), agent_name)
    print "startup agent %s : %s" % (agent_name, agent_cfg)

    cfg = ru.read_json_str(agent_cfg)
    cfg["agent_name"] = agent_name
    pilot_id = cfg["pilot_id"]

    # set up a logger and profiler
    prof = ru.Profiler("%s.bootstrap_3" % agent_name)
    prof.prof("sync ref", msg="agent start", uid=pilot_id)
    log = ru.get_logger("%s.bootstrap_3" % agent_name, "%s.bootstrap_3.log" % agent_name, "DEBUG")  # FIXME?
    log.info("start")
    # NOTE(review): second "sync ref" event, this time without uid -- confirm
    # whether the duplicate is intended.
    prof.prof("sync ref", msg="agent start")

    # setting the process title is optional: any failure is only logged
    try:
        import setproctitle as spt
        spt.setproctitle("radical.pilot %s" % agent_name)
    except Exception as e:
        log.debug("no setproctitle: %s", e)

    log.setLevel(cfg.get("debug", "INFO"))

    print "Agent config (%s):\n%s\n\n" % (agent_cfg, pprint.pformat(cfg))

    # quickly set up a mongodb handle so that we can report errors.
    # FIXME: signal handlers need mongo_p, but we won't have that until later
    # NOTE: mongo_p is only bound for agent_0; all later uses are guarded by
    # the same agent_name check.
    if agent_name == "agent_0":

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold the
        # address of the tunnelized DB endpoint.
        # If it exists, we overrule the agent config with it.
        hostport = os.environ.get("RADICAL_PILOT_DB_HOSTPORT")
        if hostport:
            dburl = ru.Url(cfg["mongodb_url"])
            dburl.host, dburl.port = hostport.split(":")
            cfg["mongodb_url"] = str(dburl)

        _, mongo_db, _, _, _ = ru.mongodb_connect(cfg["mongodb_url"])
        mongo_p = mongo_db["%s.p" % cfg["session_id"]]

        if not mongo_p:
            raise RuntimeError("could not get a mongodb handle")

    # set up signal and exit handlers
    def exit_handler():
        # atexit hook: stop lrms, bridges and agent (whichever are set) and
        # reset the corresponding globals.
        # NOTE(review): bridges are stopped here via `b.stop()` on the
        # iterated items, while the finally clause below uses
        # `b["handle"].stop()` on the dict values -- confirm which is right.
        global lrms, agent, bridges

        print "atexit"
        if lrms:
            lrms.stop()
            lrms = None
        if bridges:
            for b in bridges:
                b.stop()
            bridges = dict()
        if agent:
            agent.stop()
            agent = None
        sys.exit(1)

    # NOTE(review): the three signal handlers call `pilot_FAILED(msg=...)`,
    # while the finally clause below calls
    # `pilot_FAILED(mongo_p, pilot_id, log, <msg>)` -- confirm against the
    # definition of pilot_FAILED.
    def sigint_handler(signum, frame):
        # SIGINT: agent_0 reports failure; every agent exits with code 2
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGINT. EXITING (%s)" % frame)
        print "sigint"
        prof.prof("stop", msg="sigint_handler", uid=pilot_id)
        prof.close()
        sys.exit(2)

    def sigterm_handler(signum, frame):
        # SIGTERM: agent_0 reports failure; every agent exits with code 3
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGTERM. EXITING (%s)" % frame)
        print "sigterm"
        prof.prof("stop", msg="sigterm_handler %s" % os.getpid(), uid=pilot_id)
        prof.close()
        sys.exit(3)

    def sigalarm_handler(signum, frame):
        # SIGALRM: agent_0 reports failure; every agent exits with code 4
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGALRM (Walltime limit?). EXITING (%s)" % frame)
        print "sigalrm"
        prof.prof("stop", msg="sigalarm_handler", uid=pilot_id)
        prof.close()
        sys.exit(4)

    import atexit
    atexit.register(exit_handler)
    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGALRM, sigalarm_handler)

    # if anything went wrong up to this point, we would have been unable to
    # report errors into mongodb.  From here on, any fatal error should result
    # in one of the above handlers or exit handlers being activated, thus
    # reporting the error dutifully.

    try:
        # ----------------------------------------------------------------------
        # des Pudels Kern (the heart of the matter): merge LRMS info into cfg
        # and get the agent started

        if agent_name == "agent_0":

            # only the master agent creates LRMS and sub-agent config files.
            # The LRMS which will give us the set of agent_nodes to use for
            # sub-agent startup.  Add the remaining LRMS information to the
            # config, for the benefit of the scheduler).
            lrms = rp.agent.RM.create(name=cfg["lrms"], cfg=cfg, logger=log)
            cfg["lrms_info"] = lrms.lrms_info

            # the master agent also is the only one which starts bridges.  It
            # has to do so before creating the Agent Worker instance, as that is
            # using the bridges already.
            bridges = start_bridges(cfg, log)

            # FIXME: make sure all communication channels are in place.  This could
            # be replaced with a proper barrier, but not sure if that is worth it...
            time.sleep(1)

            # after we started bridges, we'll add their in and out addresses
            # to the config, so that the communication channels can connect to
            # them.  At this point we also write configs for all sub-agents this
            # instance intends to spawn.
            #
            # FIXME: we should point the address to the node of the subagent
            #        which hosts the bridge, not the local IP.  Until this
            #        is fixed, bridges MUST run on agent_0 (which is what
            #        RM.hostip() below will point to).
            nodeip = rp.agent.RM.hostip(cfg.get("network_interface"), logger=log)
            write_sub_configs(cfg, bridges, nodeip, log)

            # Store some runtime information into the session
            mongo_p.update(
                {"_id": pilot_id},
                {"$set": {"lm_info": lrms.lm_info.get("version_info"), "lm_detail": lrms.lm_info.get("lm_detail")}},
            )

        # we now have correct bridge addresses added to the agent_0.cfg, and all
        # other agents will have picked that up from their config files -- we
        # can start the agent and all its components!
        agent = rp.worker.Agent(cfg)
        agent.start()

        log.debug("waiting for agent %s to join" % agent_name)
        agent.join()
        log.debug("agent %s joined" % agent_name)

        # ----------------------------------------------------------------------

    except SystemExit:
        # sys.exit() (e.g. from a signal handler): record the cause unless
        # the agent already set one
        log.exception("Exit running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "sys.exit"

    except Exception as e:
        log.exception("Error running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "error"

    finally:

        # in all cases, make sure we perform an orderly shutdown.  I hope python
        # does not mind doing all those things in a finally clause of
        # (essentially) main...
        if agent:
            agent.stop()
        log.debug("agent %s finalized" % agent_name)

        # agent.stop will not tear down bridges -- we do that here at last
        # NOTE(review): assumes the global `bridges` is a dict at this point,
        # also for sub-agents which never call start_bridges -- confirm its
        # module-level initial value.
        for name, b in bridges.items():
            try:
                log.info("closing bridge %s", b)
                b["handle"].stop()
            except Exception as e:
                log.exception("ignore failing bridge terminate (%s)", e)
        bridges = dict()

        # make sure the lrms release whatever it acquired
        if lrms:
            lrms.stop()
            lrms = None

        # agent_0 will also report final pilot state to the DB, chosen by
        # the agent's final_cause
        if agent_name == "agent_0":
            if agent and agent.final_cause == "timeout":
                pilot_DONE(mongo_p, pilot_id, log, "TIMEOUT received. Terminating.")
            elif agent and agent.final_cause == "cancel":
                pilot_CANCELED(mongo_p, pilot_id, log, "CANCEL received. Terminating.")
            elif agent and agent.final_cause == "sys.exit":
                pilot_CANCELED(mongo_p, pilot_id, log, "EXIT received. Terminating.")
            elif agent and agent.final_cause == "finalize":
                log.info("shutdown due to component finalization -- assuming error")
                pilot_FAILED(mongo_p, pilot_id, log, "FINALIZE received")
            elif agent:
                pilot_FAILED(mongo_p, pilot_id, log, "TERMINATE received")
            else:
                pilot_FAILED(mongo_p, pilot_id, log, "FAILED startup")

        log.info("stop")
        prof.prof("stop", msg="finally clause agent", uid=pilot_id)
        prof.close()
def __init__(self, agent_name):
    """Start up the master agent (``agent_0``).

    Reads ``<cwd>/agent_0.cfg``, validates it, creates a session from a
    copy of the config (with ``components`` emptied), merges the session's
    config back, initializes the ``rpu.Worker`` base, and creates the LRMS
    whose ``lrms_info`` is stored in the config.

    If the environment variable ``RADICAL_PILOT_DB_HOSTPORT`` is set
    (``host:port``), it overrides host and port of ``cfg['dburl']``.

    Raises:
        AssertionError: if ``agent_name`` is not ``'agent_0'``.
        ValueError: if a required config key is missing.
        RuntimeError: if the session is not connected.
    """
    assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
    print('startup agent %s' % agent_name)

    # load config, create session, init rpu.Worker
    agent_cfg = '%s/%s.cfg' % (os.getcwd(), agent_name)
    cfg = ru.read_json_str(agent_cfg)

    # Sanity check on config settings.  This has to happen *before* any of
    # the keys are read below: otherwise a missing 'pilot_id',
    # 'session_id' or 'runtime' surfaces as a bare KeyError and the
    # corresponding checks can never fire.
    required = (
        ('cores',              'Missing number of cores'),
        ('lrms',               'Missing LRMS'),
        ('dburl',              'Missing DBURL'),
        ('pilot_id',           'Missing pilot id'),
        ('runtime',            'Missing or zero agent runtime'),
        ('scheduler',          'Missing agent scheduler'),
        ('session_id',         'Missing session id'),
        ('spawner',            'Missing agent spawner'),
        ('task_launch_method', 'Missing unit launch method'),
    )
    for key, message in required:
        if key not in cfg:
            raise ValueError(message)

    cfg['agent_name'] = agent_name

    self._uid = agent_name
    self._pid = cfg['pilot_id']
    self._sid = cfg['session_id']
    self._runtime = cfg['runtime']
    self._starttime = time.time()
    self._final_cause = None
    self._lrms = None

    # this better be on a shared FS!
    cfg['workdir'] = os.getcwd()

    # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
    # the address of the tunnelized DB endpoint. If it exists, we
    # overrule the agent config with it.
    hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
    if hostport:
        dburl = ru.Url(cfg['dburl'])
        dburl.host, dburl.port = hostport.split(':')
        cfg['dburl'] = str(dburl)

    # Create a session.
    #
    # This session will connect to MongoDB, and will also create any
    # communication channels and components/workers specified in the
    # config -- we merge that information into our own config.
    # We don't want the session to start components though, so remove them
    # from the config copy.
    session_cfg = copy.deepcopy(cfg)
    session_cfg['components'] = dict()
    session = rp_Session(cfg=session_cfg, uid=self._sid)

    # we still want the bridge addresses known though, so make sure they are
    # merged into our own copy, along with any other additions done by the
    # session.
    ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
    pprint.pprint(cfg)

    if not session.is_connected:
        raise RuntimeError('agent_0 could not connect to mongodb')

    # at this point the session is up and connected, and it should have
    # brought up all communication bridges and the UpdateWorker.  We are
    # ready to rumble!
    rpu.Worker.__init__(self, cfg, session)

    # this is the earliest point to sync bootstrapper and agent profiles
    self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

    # Create LRMS which will give us the set of agent_nodes to use for
    # sub-agent startup.  Add the remaining LRMS information to the
    # config, for the benefit of the scheduler.
    self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                  session=self._session)

    # add the resource manager information to our own config
    self._cfg['lrms_info'] = self._lrms.lrms_info
def get_profiles(command, tags=None, url=None, mode=None):
    """Retrieve stored profiles for ``command``.

    Args:
        command: the command whose profiles are looked up.
        tags:    dict of tags to match.  If empty or not given, tags are
                 taken from the environment variable
                 ``RADICAL_SYNAPSE_TAGS`` (comma separated ``key:val`` or
                 bare ``key`` elements).
        url:     profile store URL (``mongodb`` or ``file`` schema).  If
                 not given, ``RADICAL_SYNAPSE_DBURL`` is used.
        mode:    a mode or list of modes to restrict the lookup to.

    Returns:
        list of profile documents, or ``None`` if no URL is available.

    Raises:
        RuntimeError: mongodb store holds no matching profile.
        ValueError:   file URL does not point to an existing directory,
                      or the URL schema is not supported.
        LookupError:  no profile file matches.
    """
    print(command)

    if not url:
        url = os.environ.get('RADICAL_SYNAPSE_DBURL')

    if not url:
        print("warning: need dburl to retrieve profiles")
        return None

    url = ru.Url(url)

    if mode and not isinstance(mode, list):
        mode = [mode]

    # NOTE(review): the original indentation was lost; the environment
    # tags are assumed to be a fallback for missing `tags` only -- confirm.
    if not tags:
        tags = dict()
        for elem in os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(','):
            if not elem:
                continue
            if ':' in elem:
                key, val = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None

    command_idx = index_command(command, tags)

    if url.schema == 'mongodb':

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url)

        db_client = pymongo.MongoClient(host=dbhost, port=port)
        collection = db_client[dbname]['profiles']

        # FIXME: eval partial tags
        query = {'type'        : 'synapse_profile',
                 'tags'        : tags,
                 'command_idx' : command_idx}
        if mode:
            query['mode'] = {'$in': mode}  # FIXME: check

        results = collection.find(query)

        if not results.count():
            raise RuntimeError("Could not get profile for %s at %s/profiles" % (command, url))

        ret = list(results)

    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir(path):
            raise ValueError("dburl (%s) must point to an existing dir" % url)

        # file name stem: first word of the command index, followed by the
        # tag values (or the tag name, for valueless tags) in sorted order
        name = command_idx.split()[0]
        for tag in sorted(tags.keys()):
            if tags[tag] is not None:
                name += "_%s" % tags[tag]
            else:
                name += "_%s" % tag

        base = "%s/synapse_profile_%s_*.json" % (path, name)

        ret = list()
        for fname in glob.glob(base):

            doc = ru.read_json_str(fname)
            docmode = doc['mode'][0:3]
            doc['fname'] = fname

            if doc['command'] != command:
                print("skip command %s" % command)
                print(" ! command %s" % doc['command'])
                continue

            if mode and docmode not in mode:
                print("skip: mode %s not in %s" % (docmode, mode))
                continue

            ret.append(doc)

        if not len(ret):
            raise LookupError("No matching profile at %s" % base)

    else:
        # previously this fell through and failed on the unbound `ret`
        raise ValueError("dburl (%s) has unsupported schema '%s'" % (url, url.schema))

    return ret
def __init__(self, database_url=None, database_name="radicalpilot",
             uid=None, name=None):
    """Creates a new session.

    If called without a uid, a new Session instance is created and stored
    in the database.

    NOTE(review): the handling of a given `uid` (reconnecting to an
    existing session) is not part of the code visible in this method body
    -- confirm where it lives.

    **Arguments:**
        * **database_url** (`string`): The MongoDB URL.  If none is given,
          RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
          not set, an error will be raised.  A path element in the URL is
          interpreted as database name.

        * **database_name** (`string`): An alternative database name
          (default: 'radicalpilot'), used if the URL carries no path.

        * **uid** (`string`): If `None`, a new session is created.

        * **name** (`string`): An optional human readable name.  If set,
          it is also used as session uid.

    **Returns:**
        * A new Session instance.

    **Raises:**
        * `PilotException`: no database URL is available, or the session
          could not be created.
    """

    # init the base class inits
    saga.Session.__init__(self)
    Object.__init__(self)

    # before doing anything else, set up the debug helper for the lifetime
    # of the session.
    self._debug_helper = ru.DebugHelper()

    # Lists holding all manager objects created during the session.
    self._pilot_manager_objects = list()
    self._unit_manager_objects = list()

    # Create a new process registry. All objects belonging to this
    # session will register their worker processes (if they have any)
    # in this registry. This makes it easier to shut down things in
    # a more coordinate fashion.
    self._process_registry = _ProcessRegistry()

    # The resource configuration dictionary associated with the session.
    self._resource_configs = {}

    self._database_url = database_url
    self._database_name = database_name

    if not self._database_url:
        self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

    if not self._database_url:
        raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

    logger.info("using database url  %s" % self._database_url)

    # if the database url contains a path element, we interpret that as
    # database name (without the leading slash)
    tmp_url = ru.Url(self._database_url)
    if tmp_url.path and tmp_url.path[0] == '/' and len(tmp_url.path) > 1:
        self._database_name = tmp_url.path[1:]
        logger.info("using database path %s" % self._database_name)
    else:
        logger.info("using database name %s" % self._database_name)

    # Loading all "default" resource configurations
    module_path = os.path.dirname(os.path.abspath(__file__))
    default_cfgs = "%s/configs/*.json" % module_path

    for config_file in glob.glob(default_cfgs):

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            # best effort: an unreadable config file is skipped
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)
            self._resource_configs[rc] = rcs[rc].as_dict()

    # user configs are layered on top of the default configs.  An unset
    # HOME must not end up as literal 'None' in the glob pattern.
    user_cfgs = "%s/.radical/pilot/configs/*.json" \
              % os.environ.get('HOME', '')

    for config_file in glob.glob(user_cfgs):

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)

            if rc in self._resource_configs:
                # config exists -- merge user config into it
                ru.dict_merge(self._resource_configs[rc],
                              rcs[rc].as_dict(),
                              policy='overwrite')
            else:
                # new config -- add as is
                self._resource_configs[rc] = rcs[rc].as_dict()

    default_aliases = "%s/configs/aliases.json" % module_path
    self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

    ##########################
    ## CREATE A NEW SESSION ##
    ##########################
    if uid is None:
        try:
            self._connected = None

            if name:
                self._name = name
                self._uid = name
            else:
                self._uid = ru.generate_id('rp.session',
                                           mode=ru.ID_PRIVATE)
                self._name = self._uid

            # pass on the *resolved* database name: it reflects the URL
            # path evaluated above, which the plain `database_name`
            # argument does not.
            self._dbs, self._created, self._connection_info = \
                dbSession.new(sid=self._uid,
                              name=self._name,
                              db_url=self._database_url,
                              db_name=self._database_name)

            logger.info("New Session created%s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s"
                                 % (self._database_url, ex))
class Session(saga.Session):
    """A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    for all other RADICAL-Pilot objects.

    A Session holds :class:`radical.pilot.PilotManager` and
    :class:`radical.pilot.UnitManager` instances which in turn hold
    :class:`radical.pilot.Pilot` and :class:`radical.pilot.ComputeUnit`
    instances.

    Each Session has a unique identifier :data:`radical.pilot.Session.uid`.
    The constructor always creates a new session; it accepts no `uid`.

    **Example**::

        session = radical.pilot.Session(database_url=DBURL)
    """

    #---------------------------------------------------------------------------
    #
    def __init__(self, database_url=None, database_name=None, name=None):
        """Creates a new session and stores it in the database.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is
              given, RP uses the environment variable RADICAL_PILOT_DBURL.
              If that is not set, an error will be raised.

            * **database_name** (`string`): Deprecated -- specify the
              database name as URL path instead.  It is only used if the
              URL has no path element (default: 'radicalpilot').

            * **name** (`string`): An optional human readable name.  If
              set, it is also used as session uid.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * `PilotException`: no database URL is available, or the
              session could not be created.
        """

        logger = ru.get_logger('radical.pilot')

        if database_name:
            logger.error("The 'database_name' parameter is deprecated - please specify an URL path")
        else:
            database_name = 'radicalpilot'

        # init the base class inits
        saga.Session.__init__(self)
        self._dh = ru.DebugHelper()
        self._valid = True
        self._terminate = threading.Event()
        self._terminate.clear()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        # NOTE(review): a second DebugHelper next to `self._dh` looks
        # redundant -- both attributes are kept, confirm which one is used.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = dict()
        self._unit_manager_objects = dict()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        if not database_url:
            database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        if not self._dburl.path or \
           self._dburl.path[0] != '/' or \
           len(self._dburl.path) <= 1:
            logger.error("incomplete URLs are deprecated -- missing database name!")
            self._dburl.path = database_name  # defaults to 'radicalpilot'

        logger.info("using database %s" % self._dburl)

        # ----------------------------------------------------------------------
        # create new session
        try:
            if name:
                self._name = name
                self._uid = name
            else:
                self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            logger.report.info('<<create session %s' % self._uid)

            self._dbs = dbSession(sid=self._uid,
                                  name=self._name,
                                  dburl=self._dburl)

            # from here on, use the URL as held by the database session
            self._dburl = self._dbs._dburl

            logger.info("New Session created: %s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s"
                                 % (self._dburl, ex))

        # initialize profiling
        self.prof = Profiler('%s' % self._uid)
        self.prof.prof('start session', uid=self._uid)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path

        for config_file in glob.glob(default_cfgs):

            try:
                logger.info("Load resource configurations from %s" % config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # best effort: an unreadable config file is skipped
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # user configs are layered on top of the default configs.  An unset
        # HOME must not end up as literal 'None' in the glob pattern.
        user_cfgs = "%s/.radical/pilot/configs/resource_*.json" \
                  % os.environ.get('HOME', '')

        for config_file in glob.glob(user_cfgs):

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/resource_aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        self.prof.prof('configs parsed', uid=self._uid)

        # optionally record the session in a per-session directory below
        # $RADICAL_PILOT_RECORD_SESSION
        _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if _rec:
            self._rec = "%s/%s" % (_rec, self._uid)

            # create the directory without going through a shell: the path
            # contains an environment value and a user supplied session name
            try:
                os.makedirs(self._rec)
            except OSError:
                # an already existing directory is fine ('mkdir -p' semantics)
                if not os.path.isdir(self._rec):
                    raise

            ru.write_json({'dburl': str(self._dburl)},
                          "%s/session.json" % self._rec)
            logger.info("recording session in %s" % self._rec)
        else:
            self._rec = None

        logger.report.ok('>>ok\n')
def get_profiles(command, tags=None, url=None, mode=None):
    """Retrieve the stored synapse profiles which match `command`.

    The profile store is named by `url`; if `url` is not given, the
    environment variable `RADICAL_SYNAPSE_DBURL` is used instead.  Two URL
    schemas are handled:

      * `mongodb`: documents from the `profiles` collection of the database
        named in the URL;
      * `file`   : `synapse_profile_<name>_*.json` files found in the
        directory the URL path points to.

    **Arguments:**
        * **command**: the command whose profiles are requested.
        * **tags**   : optional dict of tags.  If empty or `None`, tags are
          parsed from `RADICAL_SYNAPSE_TAGS`, a comma separated list of
          `key:value` or plain `key` elements.
        * **url**    : optional URL of the profile store.
        * **mode**   : optional mode, or list of modes, used to filter the
          profiles.

    **Returns:**
        * a list of profile documents, or `None` if no store URL is known.

    **Raises:**
        * `RuntimeError`: the mongodb store holds no matching profile.
        * `ValueError`  : a `file` URL does not point to an existing
          directory, or the URL schema is not supported.
        * `LookupError` : no matching profile file was found.
    """

    print(command)

    if not url:
        url = os.environ.get('RADICAL_SYNAPSE_DBURL')

    if not url:
        print("warning: need dburl to retrieve profiles")
        return None

    url = ru.Url(url)

    # a single mode is handled like a one-element list of modes
    if mode and not isinstance(mode, list):
        mode = [mode]

    # without explicit tags, fall back to the tags set in the environment
    if not tags:
        tags = dict()
        env_tags = os.environ.get('RADICAL_SYNAPSE_TAGS', '')
        for elem in filter(None, env_tags.split(',')):
            if ':' in elem:
                key, val = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None

    command_idx = index_command(command, tags)

    if url.schema == 'mongodb':

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url)

        db_client = pymongo.MongoClient(host=dbhost, port=port)
        collection = db_client[dbname]['profiles']

        # FIXME: eval partial tags
        query = {
            'type': 'synapse_profile',
            'tags': tags,
            'command_idx': command_idx,
        }
        if mode:
            query['mode'] = {'$in': mode}  # FIXME: check

        # materialize the cursor once: this avoids a separate count query
        # (and the `Cursor.count()` call, which newer pymongo lacks)
        ret = list(collection.find(query))

        if not ret:
            raise RuntimeError("Could not get profile for %s at %s/profiles" %
                               (command, url))

    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir(path):
            raise ValueError("dburl (%s) must point to an existing dir" % url)

        # the file name is built from the first word of the command index
        # plus one element per tag (the tag value if set, else the tag key)
        name = command_idx.split()[0]
        for tag in sorted(tags.keys()):
            if tags[tag] is not None:
                name += "_%s" % tags[tag]
            else:
                name += "_%s" % tag

        base = "%s/synapse_profile_%s_*.json" % (path, name)
        ret = list()

        for fname in glob.glob(base):

            doc = ru.read_json_str(fname)
            doc['fname'] = fname

            # only the first three characters of the stored mode are compared
            docmode = doc['mode'][0:3]

            if doc['command'] != command:
                print("skip command %s" % command)
                print("   ! command %s" % doc['command'])
                continue

            if not mode or docmode in mode:
                ret.append(doc)
            else:
                print("skip: mode %s not in %s" % (docmode, mode))

        if not ret:
            raise LookupError("No matching profile at %s" % base)

    else:
        # without this branch, `ret` would be unbound at the return below
        raise ValueError("unsupported dburl schema '%s' (%s)" %
                         (url.schema, url))

    return ret
""") import glob configs = glob.glob("%s/../../src/radical/pilot/configs/resource_*.json" % script_dir) for config in configs: if not config.endswith(".json"): continue # skip all non-python files if "/resource_aliases" in config: continue # skip alias files print " * %s" % config try: json_data = ru.read_json_str(config) except Exception, ex: print " * JSON PARSING ERROR: %s" % str(ex) continue config = config.split('/')[-1] resources_rst.write("{0}\n".format(config[:-5].upper())) resources_rst.write("{0}\n\n".format("="*len(config[:-5]))) for host_key, resource_config in json_data.iteritems(): resource_key = "%s.%s" % (config[:-5], host_key) print " * %s" % resource_key try: default_queue = resource_config["default_queue"] except Exception, ex:
# # { # "scheduler" : "rp.SCHED_BACKFILLING", # "resources" : ["india.furturegrid.org", "sierra.futuregrid.org"], # "resource_cfg" : # { # "*.futuregrid.org" : # { # "username" : "merzky" # } # } # } USER_CONFIG_PATH = os.environ.get ('HOME', '/tmp') + '/.my_app.cfg' # load the user config, and merge it with the default config user_config = ru.read_json_str (USER_CONFIG_PATH) # merge the user config into the app config, so that the user config keys are # applied where appropriate ru.dict_merge (app_config, user_config, policy='overwrite', wildcards=True) # lets see what we got pprint.pprint (app_config) # this should result in : # # { # 'log_level' : 0,